summaryrefslogtreecommitdiff
path: root/net/ipv4/udp.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-05-28 15:24:36 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-05-28 15:24:36 -0700
commit1b98f357dadd6ea613a435fbaef1a5dd7b35fd21 (patch)
tree32a7195aead30f4dcadf3c3f897df2b4611b88b8 /net/ipv4/udp.c
parent47cf96fbe393839b125a9b694a8cfdd3f4216baa (diff)
parentf6bd8faeb113c8ab783466bc5bc1a5442ae85176 (diff)
Merge tag 'net-next-6.16' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni: "Core: - Implement the Device Memory TCP transmit path, allowing zero-copy data transmission on top of TCP from e.g. GPU memory to the wire. - Move all the IPv6 routing tables management outside the RTNL scope, under its own lock and RCU. The route control path is now 3x times faster. - Convert queue related netlink ops to instance lock, reducing again the scope of the RTNL lock. This improves the control plane scalability. - Refactor the software crc32c implementation, removing unneeded abstraction layers and improving significantly the related micro-benchmarks. - Optimize the GRO engine for UDP-tunneled traffic, for a 10% performance improvement in related stream tests. - Cover more per-CPU storage with local nested BH locking; this is a prep work to remove the current per-CPU lock in local_bh_disable() on PREMPT_RT. - Introduce and use nlmsg_payload helper, combining buffer bounds verification with accessing payload carried by netlink messages. Netfilter: - Rewrite the procfs conntrack table implementation, improving considerably the dump performance. A lot of user-space tools still use this interface. - Implement support for wildcard netdevice in netdev basechain and flowtables. - Integrate conntrack information into nft trace infrastructure. - Export set count and backend name to userspace, for better introspection. BPF: - BPF qdisc support: BPF-qdisc can be implemented with BPF struct_ops programs and can be controlled in similar way to traditional qdiscs using the "tc qdisc" command. - Refactor the UDP socket iterator, addressing long standing issues WRT duplicate hits or missed sockets. Protocols: - Improve TCP receive buffer auto-tuning and increase the default upper bound for the receive buffer; overall this improves the single flow maximum thoughput on 200Gbs link by over 60%. - Add AFS GSSAPI security class to AF_RXRPC; it provides transport security for connections to the AFS fileserver and VL server. - Improve TCP multipath routing, so that the sources address always matches the nexthop device. - Introduce SO_PASSRIGHTS for AF_UNIX, to allow disabling SCM_RIGHTS, and thus preventing DoS caused by passing around problematic FDs. - Retire DCCP socket. DCCP only receives updates for bugs, and major distros disable it by default. Its removal allows for better organisation of TCP fields to reduce the number of cache lines hit in the fast path. - Extend TCP drop-reason support to cover PAWS checks. Driver API: - Reorganize PTP ioctl flag support to require an explicit opt-in for the drivers, avoiding the problem of drivers not rejecting new unsupported flags. - Converted several device drivers to timestamping APIs. - Introduce per-PHY ethtool dump helpers, improving the support for dump operations targeting PHYs. Tests and tooling: - Add support for classic netlink in user space C codegen, so that ynl-c can now read, create and modify links, routes addresses and qdisc layer configuration. - Add ynl sub-types for binary attributes, allowing ynl-c to output known struct instead of raw binary data, clarifying the classic netlink output. - Extend MPTCP selftests to improve the code-coverage. - Add tests for XDP tail adjustment in AF_XDP. New hardware / drivers: - OpenVPN virtual driver: offload OpenVPN data channels processing to the kernel-space, increasing the data transfer throughput WRT the user-space implementation. - Renesas glue driver for the gigabit ethernet RZ/V2H(P) SoC. - Broadcom asp-v3.0 ethernet driver. - AMD Renoir ethernet device. - ReakTek MT9888 2.5G ethernet PHY driver. - Aeonsemi 10G C45 PHYs driver. Drivers: - Ethernet high-speed NICs: - nVidia/Mellanox (mlx5): - refactor the steering table handling to significantly reduce the amount of memory used - add support for complex matches in H/W flow steering - improve flow streeing error handling - convert to netdev instance locking - Intel (100G, ice, igb, ixgbe, idpf): - ice: add switchdev support for LLDP traffic over VF - ixgbe: add firmware manipulation and regions devlink support - igb: introduce support for frame transmission premption - igb: adds persistent NAPI configuration - idpf: introduce RDMA support - idpf: add initial PTP support - Meta (fbnic): - extend hardware stats coverage - add devlink dev flash support - Broadcom (bnxt): - add support for RX-side device memory TCP - Wangxun (txgbe): - implement support for udp tunnel offload - complete PTP and SRIOV support for AML 25G/10G devices - Ethernet NICs embedded and virtual: - Google (gve): - add device memory TCP TX support - Amazon (ena): - support persistent per-NAPI config - Airoha: - add H/W support for L2 traffic offload - add per flow stats for flow offloading - RealTek (rtl8211): add support for WoL magic packet - Synopsys (stmmac): - dwmac-socfpga 1000BaseX support - add Loongson-2K3000 support - introduce support for hardware-accelerated VLAN stripping - Broadcom (bcmgenet): - expose more H/W stats - Freescale (enetc, dpaa2-eth): - enetc: add MAC filter, VLAN filter RSS and loopback support - dpaa2-eth: convert to H/W timestamping APIs - vxlan: convert FDB table to rhashtable, for better scalabilty - veth: apply qdisc backpressure on full ring to reduce TX drops - Ethernet switches: - Microchip (kzZ88x3): add ETS scheduler support - Ethernet PHYs: - RealTek (rtl8211): - add support for WoL magic packet - add support for PHY LEDs - CAN: - Adds RZ/G3E CANFD support to the rcar_canfd driver. - Preparatory work for CAN-XL support. - Add self-tests framework with support for CAN physical interfaces. - WiFi: - mac80211: - scan improvements with multi-link operation (MLO) - Qualcomm (ath12k): - enable AHB support for IPQ5332 - add monitor interface support to QCN9274 - add multi-link operation support to WCN7850 - add 802.11d scan offload support to WCN7850 - monitor mode for WCN7850, better 6 GHz regulatory - Qualcomm (ath11k): - restore hibernation support - MediaTek (mt76): - WiFi-7 improvements - implement support for mt7990 - Intel (iwlwifi): - enhanced multi-link single-radio (EMLSR) support on 5 GHz links - rework device configuration - RealTek (rtw88): - improve throughput for RTL8814AU - RealTek (rtw89): - add multi-link operation support - STA/P2P concurrency improvements - support different SAR configs by antenna - Bluetooth: - introduce HCI Driver protocol - btintel_pcie: do not generate coredump for diagnostic events - btusb: add HCI Drv commands for configuring altsetting - btusb: add RTL8851BE device 0x0bda:0xb850 - btusb: add new VID/PID 13d3/3584 for MT7922 - btusb: add new VID/PID 13d3/3630 and 13d3/3613 for MT7925 - btnxpuart: implement host-wakeup feature" * tag 'net-next-6.16' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1611 commits) selftests/bpf: Fix bpf selftest build warning selftests: netfilter: Fix skip of wildcard interface test net: phy: mscc: Stop clearing the the UDPv4 checksum for L2 frames net: openvswitch: Fix the dead loop of MPLS parse calipso: Don't call calipso functions for AF_INET sk. selftests/tc-testing: Add a test for HFSC eltree double add with reentrant enqueue behaviour on netem net_sched: hfsc: Address reentrant enqueue adding class to eltree twice octeontx2-pf: QOS: Refactor TC_HTB_LEAF_DEL_LAST callback octeontx2-pf: QOS: Perform cache sync on send queue teardown net: mana: Add support for Multi Vports on Bare metal net: devmem: ncdevmem: remove unused variable net: devmem: ksft: upgrade rx test to send 1K data net: devmem: ksft: add 5 tuple FS support net: devmem: ksft: add exit_wait to make rx test pass net: devmem: ksft: add ipv4 support net: devmem: preserve sockc_err page_pool: fix ugly page_pool formatting net: devmem: move list_add to net_devmem_bind_dmabuf. selftests: netfilter: nft_queue.sh: include file transfer duration in log message net: phy: mscc: Fix memory leak when using one step timestamping ...
Diffstat (limited to 'net/ipv4/udp.c')
-rw-r--r--net/ipv4/udp.c227
1 files changed, 161 insertions, 66 deletions
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2742cc7602bb..dde52b8050b8 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -93,6 +93,7 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
+#include <linux/sock_diag.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
@@ -119,6 +120,7 @@
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6_stubs.h>
#endif
+#include <net/rps.h>
struct udp_table udp_table __read_mostly;
@@ -1942,8 +1944,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
error = -EAGAIN;
do {
spin_lock_bh(&queue->lock);
- skb = __skb_try_recv_from_queue(sk, queue, flags, off,
- err, &last);
+ skb = __skb_try_recv_from_queue(queue, flags, off, err,
+ &last);
if (skb) {
if (!(flags & MSG_PEEK))
udp_skb_destructor(sk, skb);
@@ -1964,8 +1966,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
spin_lock(&sk_queue->lock);
skb_queue_splice_tail_init(sk_queue, queue);
- skb = __skb_try_recv_from_queue(sk, queue, flags, off,
- err, &last);
+ skb = __skb_try_recv_from_queue(queue, flags, off, err,
+ &last);
if (skb && !(flags & MSG_PEEK))
udp_skb_dtor_locked(sk, skb);
spin_unlock(&sk_queue->lock);
@@ -2199,6 +2201,7 @@ void udp_lib_unhash(struct sock *sk)
struct udp_table *udptable = udp_get_table_prot(sk);
struct udp_hslot *hslot, *hslot2;
+ sock_rps_delete_flow(sk);
hslot = udp_hashslot(udptable, sock_net(sk),
udp_sk(sk)->udp_port_hash);
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
@@ -2897,20 +2900,40 @@ void udp_destroy_sock(struct sock *sk)
if (encap_destroy)
encap_destroy(sk);
}
- if (udp_test_bit(ENCAP_ENABLED, sk))
+ if (udp_test_bit(ENCAP_ENABLED, sk)) {
static_branch_dec(&udp_encap_needed_key);
+ udp_tunnel_cleanup_gro(sk);
+ }
}
}
+typedef struct sk_buff *(*udp_gro_receive_t)(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb);
+
static void set_xfrm_gro_udp_encap_rcv(__u16 encap_type, unsigned short family,
struct sock *sk)
{
#ifdef CONFIG_XFRM
+ udp_gro_receive_t new_gro_receive;
+
if (udp_test_bit(GRO_ENABLED, sk) && encap_type == UDP_ENCAP_ESPINUDP) {
- if (family == AF_INET)
- WRITE_ONCE(udp_sk(sk)->gro_receive, xfrm4_gro_udp_encap_rcv);
- else if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6)
- WRITE_ONCE(udp_sk(sk)->gro_receive, ipv6_stub->xfrm6_gro_udp_encap_rcv);
+ if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6)
+ new_gro_receive = ipv6_stub->xfrm6_gro_udp_encap_rcv;
+ else
+ new_gro_receive = xfrm4_gro_udp_encap_rcv;
+
+ if (udp_sk(sk)->gro_receive != new_gro_receive) {
+ /*
+ * With IPV6_ADDRFORM the gro callback could change
+ * after being set, unregister the old one, if valid.
+ */
+ if (udp_sk(sk)->gro_receive)
+ udp_tunnel_update_gro_rcv(sk, false);
+
+ WRITE_ONCE(udp_sk(sk)->gro_receive, new_gro_receive);
+ udp_tunnel_update_gro_rcv(sk, true);
+ }
}
#endif
}
@@ -2960,6 +2983,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
break;
case UDP_ENCAP:
+ sockopt_lock_sock(sk);
switch (val) {
case 0:
#ifdef CONFIG_XFRM
@@ -2983,6 +3007,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
err = -ENOPROTOOPT;
break;
}
+ sockopt_release_sock(sk);
break;
case UDP_NO_CHECK6_TX:
@@ -3000,13 +3025,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
break;
case UDP_GRO:
-
+ sockopt_lock_sock(sk);
/* when enabling GRO, accept the related GSO packet type */
if (valbool)
udp_tunnel_encap_enable(sk);
udp_assign_bit(GRO_ENABLED, sk, valbool);
udp_assign_bit(ACCEPT_L4, sk, valbool);
set_xfrm_gro_udp_encap_rcv(up->encap_type, sk->sk_family, sk);
+ sockopt_release_sock(sk);
break;
/*
@@ -3390,34 +3416,55 @@ struct bpf_iter__udp {
int bucket __aligned(8);
};
+union bpf_udp_iter_batch_item {
+ struct sock *sk;
+ __u64 cookie;
+};
+
struct bpf_udp_iter_state {
struct udp_iter_state state;
unsigned int cur_sk;
unsigned int end_sk;
unsigned int max_sk;
- int offset;
- struct sock **batch;
- bool st_bucket_done;
+ union bpf_udp_iter_batch_item *batch;
};
static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
- unsigned int new_batch_sz);
+ unsigned int new_batch_sz, gfp_t flags);
+static struct sock *bpf_iter_udp_resume(struct sock *first_sk,
+ union bpf_udp_iter_batch_item *cookies,
+ int n_cookies)
+{
+ struct sock *sk = NULL;
+ int i;
+
+ for (i = 0; i < n_cookies; i++) {
+ sk = first_sk;
+ udp_portaddr_for_each_entry_from(sk)
+ if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
+ goto done;
+ }
+done:
+ return sk;
+}
+
static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
{
struct bpf_udp_iter_state *iter = seq->private;
struct udp_iter_state *state = &iter->state;
+ unsigned int find_cookie, end_cookie;
struct net *net = seq_file_net(seq);
- int resume_bucket, resume_offset;
struct udp_table *udptable;
unsigned int batch_sks = 0;
- bool resized = false;
+ int resume_bucket;
+ int resizes = 0;
struct sock *sk;
+ int err = 0;
resume_bucket = state->bucket;
- resume_offset = iter->offset;
/* The current batch is done, so advance the bucket. */
- if (iter->st_bucket_done)
+ if (iter->cur_sk == iter->end_sk)
state->bucket++;
udptable = udp_get_table_seq(seq, net);
@@ -3430,62 +3477,89 @@ again:
* before releasing the bucket lock. This allows BPF programs that are
* called in seq_show to acquire the bucket lock if needed.
*/
+ find_cookie = iter->cur_sk;
+ end_cookie = iter->end_sk;
iter->cur_sk = 0;
iter->end_sk = 0;
- iter->st_bucket_done = false;
batch_sks = 0;
for (; state->bucket <= udptable->mask; state->bucket++) {
struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot;
if (hlist_empty(&hslot2->head))
- continue;
+ goto next_bucket;
- iter->offset = 0;
spin_lock_bh(&hslot2->lock);
- udp_portaddr_for_each_entry(sk, &hslot2->head) {
+ sk = hlist_entry_safe(hslot2->head.first, struct sock,
+ __sk_common.skc_portaddr_node);
+ /* Resume from the first (in iteration order) unseen socket from
+ * the last batch that still exists in resume_bucket. Most of
+ * the time this will just be where the last iteration left off
+ * in resume_bucket unless that socket disappeared between
+ * reads.
+ */
+ if (state->bucket == resume_bucket)
+ sk = bpf_iter_udp_resume(sk, &iter->batch[find_cookie],
+ end_cookie - find_cookie);
+fill_batch:
+ udp_portaddr_for_each_entry_from(sk) {
if (seq_sk_match(seq, sk)) {
- /* Resume from the last iterated socket at the
- * offset in the bucket before iterator was stopped.
- */
- if (state->bucket == resume_bucket &&
- iter->offset < resume_offset) {
- ++iter->offset;
- continue;
- }
if (iter->end_sk < iter->max_sk) {
sock_hold(sk);
- iter->batch[iter->end_sk++] = sk;
+ iter->batch[iter->end_sk++].sk = sk;
}
batch_sks++;
}
}
+
+ /* Allocate a larger batch and try again. */
+ if (unlikely(resizes <= 1 && iter->end_sk &&
+ iter->end_sk != batch_sks)) {
+ resizes++;
+
+ /* First, try with GFP_USER to maximize the chances of
+ * grabbing more memory.
+ */
+ if (resizes == 1) {
+ spin_unlock_bh(&hslot2->lock);
+ err = bpf_iter_udp_realloc_batch(iter,
+ batch_sks * 3 / 2,
+ GFP_USER);
+ if (err)
+ return ERR_PTR(err);
+ /* Start over. */
+ goto again;
+ }
+
+ /* Next, hold onto the lock, so the bucket doesn't
+ * change while we get the rest of the sockets.
+ */
+ err = bpf_iter_udp_realloc_batch(iter, batch_sks,
+ GFP_NOWAIT);
+ if (err) {
+ spin_unlock_bh(&hslot2->lock);
+ return ERR_PTR(err);
+ }
+
+ /* Pick up where we left off. */
+ sk = iter->batch[iter->end_sk - 1].sk;
+ sk = hlist_entry_safe(sk->__sk_common.skc_portaddr_node.next,
+ struct sock,
+ __sk_common.skc_portaddr_node);
+ batch_sks = iter->end_sk;
+ goto fill_batch;
+ }
+
spin_unlock_bh(&hslot2->lock);
if (iter->end_sk)
break;
+next_bucket:
+ resizes = 0;
}
- /* All done: no batch made. */
- if (!iter->end_sk)
- return NULL;
-
- if (iter->end_sk == batch_sks) {
- /* Batching is done for the current bucket; return the first
- * socket to be iterated from the batch.
- */
- iter->st_bucket_done = true;
- goto done;
- }
- if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) {
- resized = true;
- /* After allocating a larger batch, retry one more time to grab
- * the whole bucket.
- */
- goto again;
- }
-done:
- return iter->batch[0];
+ WARN_ON_ONCE(iter->end_sk != batch_sks);
+ return iter->end_sk ? iter->batch[0].sk : NULL;
}
static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -3496,16 +3570,14 @@ static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
/* Whenever seq_next() is called, the iter->cur_sk is
* done with seq_show(), so unref the iter->cur_sk.
*/
- if (iter->cur_sk < iter->end_sk) {
- sock_put(iter->batch[iter->cur_sk++]);
- ++iter->offset;
- }
+ if (iter->cur_sk < iter->end_sk)
+ sock_put(iter->batch[iter->cur_sk++].sk);
/* After updating iter->cur_sk, check if there are more sockets
* available in the current bucket batch.
*/
if (iter->cur_sk < iter->end_sk)
- sk = iter->batch[iter->cur_sk];
+ sk = iter->batch[iter->cur_sk].sk;
else
/* Prepare a new batch. */
sk = bpf_iter_udp_batch(seq);
@@ -3569,8 +3641,19 @@ unlock:
static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter)
{
- while (iter->cur_sk < iter->end_sk)
- sock_put(iter->batch[iter->cur_sk++]);
+ union bpf_udp_iter_batch_item *item;
+ unsigned int cur_sk = iter->cur_sk;
+ __u64 cookie;
+
+ /* Remember the cookies of the sockets we haven't seen yet, so we can
+ * pick up where we left off next time around.
+ */
+ while (cur_sk < iter->end_sk) {
+ item = &iter->batch[cur_sk++];
+ cookie = sock_gen_cookie(item->sk);
+ sock_put(item->sk);
+ item->cookie = cookie;
+ }
}
static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
@@ -3586,10 +3669,8 @@ static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
(void)udp_prog_seq_show(prog, &meta, v, 0, 0);
}
- if (iter->cur_sk < iter->end_sk) {
+ if (iter->cur_sk < iter->end_sk)
bpf_iter_udp_put_batch(iter);
- iter->st_bucket_done = false;
- }
}
static const struct seq_operations bpf_iter_udp_seq_ops = {
@@ -3810,6 +3891,15 @@ fallback:
static int __net_init udp_pernet_init(struct net *net)
{
+#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
+ int i;
+
+ /* No tunnel is configured */
+ for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) {
+ INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list);
+ RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL);
+ }
+#endif
udp_sysctl_init(net);
udp_set_table(net);
@@ -3831,16 +3921,19 @@ DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
struct udp_sock *udp_sk, uid_t uid, int bucket)
static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
- unsigned int new_batch_sz)
+ unsigned int new_batch_sz, gfp_t flags)
{
- struct sock **new_batch;
+ union bpf_udp_iter_batch_item *new_batch;
new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
- GFP_USER | __GFP_NOWARN);
+ flags | __GFP_NOWARN);
if (!new_batch)
return -ENOMEM;
- bpf_iter_udp_put_batch(iter);
+ if (flags != GFP_NOWAIT)
+ bpf_iter_udp_put_batch(iter);
+
+ memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
kvfree(iter->batch);
iter->batch = new_batch;
iter->max_sk = new_batch_sz;
@@ -3859,10 +3952,12 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
if (ret)
return ret;
- ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ);
+ ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
if (ret)
bpf_iter_fini_seq_net(priv_data);
+ iter->state.bucket = -1;
+
return ret;
}