summaryrefslogtreecommitdiff
path: root/net/ipv4/tcp.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-11-02 06:20:58 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-11-02 06:20:58 -0700
commitfc02cb2b37fe2cbf1d3334b9f0f0eab9431766c4 (patch)
tree93b16bc48fdc3be4a1adccbf4c7de92a5e8440e1 /net/ipv4/tcp.c
parentbfc484fe6abba4b89ec9330e0e68778e2a9856b2 (diff)
parent84882cf72cd774cf16fd338bdbf00f69ac9f9194 (diff)
Merge tag 'net-next-for-5.16' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core: - Remove socket skb caches - Add a SO_RESERVE_MEM socket op to forward allocate buffer space and avoid memory accounting overhead on each message sent - Introduce managed neighbor entries - added by control plane and resolved by the kernel for use in acceleration paths (BPF / XDP right now, HW offload users will benefit as well) - Make neighbor eviction on link down controllable by userspace to work around WiFi networks with bad roaming implementations - vrf: Rework interaction with netfilter/conntrack - fq_codel: implement L4S style ce_threshold_ect1 marking - sch: Eliminate unnecessary RCU waits in mini_qdisc_pair_swap() BPF: - Add support for new btf kind BTF_KIND_TAG, arbitrary type tagging as implemented in LLVM14 - Introduce bpf_get_branch_snapshot() to capture Last Branch Records - Implement variadic trace_printk helper - Add a new Bloomfilter map type - Track <8-byte scalar spill and refill - Access hw timestamp through BPF's __sk_buff - Disallow unprivileged BPF by default - Document BPF licensing Netfilter: - Introduce egress hook for looking at raw outgoing packets - Allow matching on and modifying inner headers / payload data - Add NFT_META_IFTYPE to match on the interface type either from ingress or egress Protocols: - Multi-Path TCP: - increase default max additional subflows to 2 - rework forward memory allocation - add getsockopts: MPTCP_INFO, MPTCP_TCPINFO, MPTCP_SUBFLOW_ADDRS - MCTP flow support allowing lower layer drivers to configure msg muxing as needed - Automatic Multicast Tunneling (AMT) driver based on RFC7450 - HSR support the redbox supervision frames (IEC-62439-3:2018) - Support for the ip6ip6 encapsulation of IOAM - Netlink interface for CAN-FD's Transmitter Delay Compensation - Support SMC-Rv2 eliminating the current same-subnet restriction, by exploiting the UDP encapsulation feature of RoCE adapters - TLS: add SM4 GCM/CCM crypto support - Bluetooth: initial support for link quality and audio/codec offload Driver APIs: - Add a batched interface for RX buffer allocation in AF_XDP buffer pool - ethtool: Add ability to control transceiver modules' power mode - phy: Introduce supported interfaces bitmap to express MAC capabilities and simplify PHY code - Drop rtnl_lock from DSA .port_fdb_{add,del} callbacks New drivers: - WiFi driver for Realtek 8852AE 802.11ax devices (rtw89) - Ethernet driver for ASIX AX88796C SPI device (x88796c) Drivers: - Broadcom PHYs - support 72165, 7712 16nm PHYs - support IDDQ-SR for additional power savings - PHY support for QCA8081, QCA9561 PHYs - NXP DPAA2: support for IRQ coalescing - NXP Ethernet (enetc): support for software TCP segmentation - Renesas Ethernet (ravb) - support DMAC and EMAC blocks of Gigabit-capable IP found on RZ/G2L SoC - Intel 100G Ethernet - support for eswitch offload of TC/OvS flow API, including offload of GRE, VxLAN, Geneve tunneling - support application device queues - ability to assign Rx and Tx queues to application threads - PTP and PPS (pulse-per-second) extensions - Broadcom Ethernet (bnxt) - devlink health reporting and device reload extensions - Mellanox Ethernet (mlx5) - offload macvlan interfaces - support HW offload of TC rules involving OVS internal ports - support HW-GRO and header/data split - support application device queues - Marvell OcteonTx2: - add XDP support for PF - add PTP support for VF - Qualcomm Ethernet switch (qca8k): support for QCA8328 - Realtek Ethernet DSA switch (rtl8366rb) - support bridge offload - support STP, fast aging, disabling address learning - support for Realtek RTL8365MB-VC, a 4+1 port 10M/100M/1GE switch - Mellanox Ethernet/IB switch (mlxsw) - multi-level qdisc hierarchy offload (e.g. RED, prio and shaping) - offload root TBF qdisc as port shaper - support multiple routing interface MAC address prefixes - support for IP-in-IP with IPv6 underlay - MediaTek WiFi (mt76) - mt7921 - ASPM, 6GHz, SDIO and testmode support - mt7915 - LED and TWT support - Qualcomm WiFi (ath11k) - include channel rx and tx time in survey dump statistics - support for 80P80 and 160 MHz bandwidths - support channel 2 in 6 GHz band - spectral scan support for QCN9074 - support for rx decapsulation offload (data frames in 802.3 format) - Qualcomm phone SoC WiFi (wcn36xx) - enable Idle Mode Power Save (IMPS) to reduce power consumption during idle - Bluetooth driver support for MediaTek MT7922 and MT7921 - Enable support for AOSP Bluetooth extension in Qualcomm WCN399x and Realtek 8822C/8852A - Microsoft vNIC driver (mana) - support hibernation and kexec - Google vNIC driver (gve) - support for jumbo frames - implement Rx page reuse Refactor: - Make all writes to netdev->dev_addr go thru helpers, so that we can add this address to the address rbtree and handle the updates - Various TCP cleanups and optimizations including improvements to CPU cache use - Simplify the gnet_stats, Qdisc stats' handling and remove qdisc->running sequence counter - Driver changes and API updates to address devlink locking deficiencies" * tag 'net-next-for-5.16' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2122 commits) Revert "net: avoid double accounting for pure zerocopy skbs" selftests: net: add arp_ndisc_evict_nocarrier net: ndisc: introduce ndisc_evict_nocarrier sysctl parameter net: arp: introduce arp_evict_nocarrier sysctl parameter libbpf: Deprecate AF_XDP support kbuild: Unify options for BTF generation for vmlinux and modules selftests/bpf: Add a testcase for 64-bit bounds propagation issue. bpf: Fix propagation of signed bounds from 64-bit min/max into 32-bit. bpf: Fix propagation of bounds from 64-bit min/max into 32-bit and var_off. net: vmxnet3: remove multiple false checks in vmxnet3_ethtool.c net: avoid double accounting for pure zerocopy skbs tcp: rename sk_wmem_free_skb netdevsim: fix uninit value in nsim_drv_configure_vfs() selftests/bpf: Fix also no-alu32 strobemeta selftest bpf: Add missing map_delete_elem method to bloom filter map selftests/bpf: Add bloom map success test for userspace calls bpf: Add alignment padding for "map_extra" + consolidate holes bpf: Bloom filter map naming fixups selftests/bpf: Add test cases for struct_ops prog bpf: Add dummy BPF STRUCT_OPS for test purpose ...
Diffstat (limited to 'net/ipv4/tcp.c')
-rw-r--r--net/ipv4/tcp.c132
1 files changed, 58 insertions, 74 deletions
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f5c336f8b0c8..bc7f419184aa 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -287,8 +287,8 @@ enum {
TCP_CMSG_TS = 2
};
-struct percpu_counter tcp_orphan_count;
-EXPORT_SYMBOL_GPL(tcp_orphan_count);
+DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);
@@ -325,11 +325,6 @@ struct tcp_splice_state {
unsigned long tcp_memory_pressure __read_mostly;
EXPORT_SYMBOL_GPL(tcp_memory_pressure);
-DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key);
-EXPORT_SYMBOL(tcp_rx_skb_cache_key);
-
-DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key);
-
void tcp_enter_memory_pressure(struct sock *sk)
{
unsigned long val;
@@ -644,7 +639,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
}
EXPORT_SYMBOL(tcp_ioctl);
-static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
+void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
tp->pushed_seq = tp->write_seq;
@@ -655,15 +650,13 @@ static inline bool forced_push(const struct tcp_sock *tp)
return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}
-static void skb_entail(struct sock *sk, struct sk_buff *skb)
+void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
- skb->csum = 0;
tcb->seq = tcb->end_seq = tp->write_seq;
tcb->tcp_flags = TCPHDR_ACK;
- tcb->sacked = 0;
__skb_header_release(skb);
tcp_add_write_queue_tail(sk, skb);
sk_wmem_queued_add(sk, skb->truesize);
@@ -858,30 +851,15 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
}
EXPORT_SYMBOL(tcp_splice_read);
-struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
- bool force_schedule)
+struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
+ bool force_schedule)
{
struct sk_buff *skb;
- if (likely(!size)) {
- skb = sk->sk_tx_skb_cache;
- if (skb) {
- skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
- sk->sk_tx_skb_cache = NULL;
- pskb_trim(skb, 0);
- INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
- skb_shinfo(skb)->tx_flags = 0;
- memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
- return skb;
- }
- }
- /* The TCP header must be at least 32-bit aligned. */
- size = ALIGN(size, 4);
-
if (unlikely(tcp_under_memory_pressure(sk)))
sk_mem_reclaim_partial(sk);
- skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
+ skb = alloc_skb_fclone(size + MAX_TCP_HEADER, gfp);
if (likely(skb)) {
bool mem_scheduled;
@@ -892,12 +870,8 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
}
if (likely(mem_scheduled)) {
- skb_reserve(skb, sk->sk_prot->max_header);
- /*
- * Make sure that we have exactly size bytes
- * available to the caller, no more, no less.
- */
- skb->reserved_tailroom = skb->end - skb->tail - size;
+ skb_reserve(skb, MAX_TCP_HEADER);
+ skb->ip_summed = CHECKSUM_PARTIAL;
INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
return skb;
}
@@ -950,18 +924,20 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
* importantly be able to generate EPOLLOUT for Edge Trigger epoll()
* users.
*/
-void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
+void tcp_remove_empty_skb(struct sock *sk)
{
- if (skb && !skb->len) {
+ struct sk_buff *skb = tcp_write_queue_tail(sk);
+
+ if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
tcp_unlink_write_queue(skb, sk);
if (tcp_write_queue_empty(sk))
tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
- sk_wmem_free_skb(sk, skb);
+ tcp_wmem_free_skb(sk, skb);
}
}
-struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
- struct page *page, int offset, size_t *size)
+static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
+ struct page *page, int offset, size_t *size)
{
struct sk_buff *skb = tcp_write_queue_tail(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -974,15 +950,15 @@ new_segment:
if (!sk_stream_memory_free(sk))
return NULL;
- skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
- tcp_rtx_and_write_queues_empty(sk));
+ skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation,
+ tcp_rtx_and_write_queues_empty(sk));
if (!skb)
return NULL;
#ifdef CONFIG_TLS_DEVICE
skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
#endif
- skb_entail(sk, skb);
+ tcp_skb_entail(sk, skb);
copy = size_goal;
}
@@ -1013,7 +989,6 @@ new_segment:
skb->truesize += copy;
sk_wmem_queued_add(sk, copy);
sk_mem_charge(sk, copy);
- skb->ip_summed = CHECKSUM_PARTIAL;
WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
TCP_SKB_CB(skb)->end_seq += copy;
tcp_skb_pcount_set(skb, 0);
@@ -1104,7 +1079,7 @@ out:
return copied;
do_error:
- tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));
+ tcp_remove_empty_skb(sk);
if (copied)
goto out;
out_err:
@@ -1303,15 +1278,14 @@ new_segment:
goto restart;
}
first_skb = tcp_rtx_and_write_queues_empty(sk);
- skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
- first_skb);
+ skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation,
+ first_skb);
if (!skb)
goto wait_for_space;
process_backlog++;
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb_entail(sk, skb);
+ tcp_skb_entail(sk, skb);
copy = size_goal;
/* All packets are restored as if they have
@@ -1326,14 +1300,7 @@ new_segment:
if (copy > msg_data_left(msg))
copy = msg_data_left(msg);
- /* Where to copy to? */
- if (skb_availroom(skb) > 0 && !zc) {
- /* We have some space in skb head. Superb! */
- copy = min_t(int, copy, skb_availroom(skb));
- err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
- if (err)
- goto do_fault;
- } else if (!zc) {
+ if (!zc) {
bool merge = true;
int i = skb_shinfo(skb)->nr_frags;
struct page_frag *pfrag = sk_page_frag(sk);
@@ -1432,9 +1399,7 @@ out_nopush:
return copied + copied_syn;
do_error:
- skb = tcp_write_queue_tail(sk);
-do_fault:
- tcp_remove_empty_skb(sk, skb);
+ tcp_remove_empty_skb(sk);
if (copied + copied_syn)
goto out;
@@ -2687,11 +2652,36 @@ void tcp_shutdown(struct sock *sk, int how)
}
EXPORT_SYMBOL(tcp_shutdown);
+int tcp_orphan_count_sum(void)
+{
+ int i, total = 0;
+
+ for_each_possible_cpu(i)
+ total += per_cpu(tcp_orphan_count, i);
+
+ return max(total, 0);
+}
+
+static int tcp_orphan_cache;
+static struct timer_list tcp_orphan_timer;
+#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)
+
+static void tcp_orphan_update(struct timer_list *unused)
+{
+ WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
+ mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
+}
+
+static bool tcp_too_many_orphans(int shift)
+{
+ return READ_ONCE(tcp_orphan_cache) << shift > sysctl_tcp_max_orphans;
+}
+
bool tcp_check_oom(struct sock *sk, int shift)
{
bool too_many_orphans, out_of_socket_memory;
- too_many_orphans = tcp_too_many_orphans(sk, shift);
+ too_many_orphans = tcp_too_many_orphans(shift);
out_of_socket_memory = tcp_out_of_memory(sk);
if (too_many_orphans)
@@ -2800,7 +2790,7 @@ adjudge_to_death:
/* remove backlog if any, without releasing ownership. */
__release_sock(sk);
- percpu_counter_inc(sk->sk_prot->orphan_count);
+ this_cpu_inc(tcp_orphan_count);
/* Have we already been destroyed by a softirq or backlog? */
if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
@@ -2903,7 +2893,7 @@ static void tcp_rtx_queue_purge(struct sock *sk)
* list_del(&skb->tcp_tsorted_anchor)
*/
tcp_rtx_queue_unlink(skb, sk);
- sk_wmem_free_skb(sk, skb);
+ tcp_wmem_free_skb(sk, skb);
}
}
@@ -2914,14 +2904,9 @@ void tcp_write_queue_purge(struct sock *sk)
tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
tcp_skb_tsorted_anchor_cleanup(skb);
- sk_wmem_free_skb(sk, skb);
+ tcp_wmem_free_skb(sk, skb);
}
tcp_rtx_queue_purge(sk);
- skb = sk->sk_tx_skb_cache;
- if (skb) {
- __kfree_skb(skb);
- sk->sk_tx_skb_cache = NULL;
- }
INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
sk_mem_reclaim(sk);
tcp_clear_all_retrans_hints(tcp_sk(sk));
@@ -2958,10 +2943,6 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue);
- if (sk->sk_rx_skb_cache) {
- __kfree_skb(sk->sk_rx_skb_cache);
- sk->sk_rx_skb_cache = NULL;
- }
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
tp->urg_data = 0;
tcp_write_queue_purge(sk);
@@ -4502,7 +4483,10 @@ void __init tcp_init(void)
sizeof_field(struct sk_buff, cb));
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
- percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
+
+ timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
+ mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
+
inet_hashinfo_init(&tcp_hashinfo);
inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
thash_entries, 21, /* one slot per 2 MB*/