From 45e359be1ce88fb22e61fa3aa23b2e450a6cae03 Mon Sep 17 00:00:00 2001
From: Jason Xing
Date: Sat, 5 Jul 2025 00:01:38 +0800
Subject: net: xsk: introduce XDP_MAX_TX_SKB_BUDGET setsockopt

This patch provides a setsockopt interface that lets applications adjust
how many descs are handled at most in one send syscall. It mitigates the
situation where the default value (32) is too small and leads to a higher
frequency of send syscalls. Considering the variety/complexity of
applications, there is no single value that fits all cases, so keep 32 as
the default value like before.

The patch does the following things:
- Add the XDP_MAX_TX_SKB_BUDGET socket option.
- Set max_tx_budget to 32 by default in the initialization phase as a
  per-socket granular control.
- Set the range of max_tx_budget to [32, xs->tx->nentries].

The idea behind this comes out of real workloads in production. We use a
user-level stack with xsk support to accelerate sending packets and
minimize syscalls. When packets are aggregated, it is not hard to hit the
upper bound (namely, 32). The moment the user-space stack sees the -EAGAIN
error returned by sendto(), it loops to try again until all the expected
descs from the tx ring are sent out to the driver. Enlarging the
XDP_MAX_TX_SKB_BUDGET value leads to fewer sendto() calls and higher
throughput/PPS. Here is what I did in production, along with some numbers:

For one application I saw lately, I suggested using 128 as max_tx_budget
because I saw two limitations without changing any default configuration:
1) XDP_MAX_TX_SKB_BUDGET, 2) the socket sndbuf, which is 212992 as decided
by net.core.wmem_default. As to XDP_MAX_TX_SKB_BUDGET, I counted how many
descs are transmitted to the driver per sendto() call based on the [1]
patch and then calculated the probability of hitting the upper bound. I
finally chose 128 as a suitable value because 1) it covers most of the
cases, and 2) a higher number would not bring evident results.

After tuning the parameters, a stable improvement of around 4% for both
PPS and throughput, together with lower resource consumption, was observed
with strace -c -p xxx:
1) %time decreased by 7.8%
2) the error counter decreased from 18367 to 572

[1]: https://lore.kernel.org/all/20250619093641.70700-1-kerneljasonxing@gmail.com/

Signed-off-by: Jason Xing
Acked-by: Maciej Fijalkowski
Link: https://patch.msgid.link/20250704160138.48677-1-kerneljasonxing@gmail.com
Signed-off-by: Paolo Abeni
---
 tools/include/uapi/linux/if_xdp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools/include/uapi/linux')

diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h
index 44f2bb93e7e6..23a062781468 100644
--- a/tools/include/uapi/linux/if_xdp.h
+++ b/tools/include/uapi/linux/if_xdp.h
@@ -79,6 +79,7 @@ struct xdp_mmap_offsets {
 #define XDP_UMEM_COMPLETION_RING	6
 #define XDP_STATISTICS			7
 #define XDP_OPTIONS			8
+#define XDP_MAX_TX_SKB_BUDGET		9
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
-- cgit
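Usage illustration (not part of the patch): a minimal sketch of how an
application might raise the per-socket budget once the TX ring of an AF_XDP
socket has been configured. The descriptor xsk_fd, the helper name, the value
128 taken from the commit message, and the fallback SOL_XDP definition are
assumptions for the example.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

#ifndef SOL_XDP
#define SOL_XDP 283	/* fallback for libcs that do not define it */
#endif

/* Sketch: raise the TX budget of an existing AF_XDP socket.
 * Per the commit above, values outside [32, tx ring entries] are
 * expected to be rejected by the kernel.
 */
static int set_tx_budget(int xsk_fd, unsigned int budget)
{
	if (setsockopt(xsk_fd, SOL_XDP, XDP_MAX_TX_SKB_BUDGET,
		       &budget, sizeof(budget)) < 0) {
		fprintf(stderr, "XDP_MAX_TX_SKB_BUDGET: %s\n",
			strerror(errno));
		return -1;
	}
	return 0;
}

In the production scenario described above this would amount to a single
set_tx_budget(xsk_fd, 128) call during socket setup, before the send loop
starts.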
From 2677010e7793451c20d895c477c4dc76f6e6a10e Mon Sep 17 00:00:00 2001
From: Samiullah Khawaja
Date: Thu, 10 Jul 2025 21:12:03 +0000
Subject: Add support to set NAPI threaded for individual NAPI

A net device has a threaded sysctl that can be used to enable threaded
NAPI polling on all of the NAPI contexts under that device. Allow
enabling threaded NAPI polling at the individual NAPI level using
netlink.

Extend the netlink operation `napi-set` to allow setting the threaded
attribute of a NAPI. This enables threaded polling on that NAPI context.

Add a test in `nl_netdev.py` that verifies the various cases of threaded
NAPI being set at the NAPI and at the device level.

Tested:
./tools/testing/selftests/net/nl_netdev.py
TAP version 13
1..7
ok 1 nl_netdev.empty_check
ok 2 nl_netdev.lo_check
ok 3 nl_netdev.page_pool_check
ok 4 nl_netdev.napi_list_check
ok 5 nl_netdev.dev_set_threaded
ok 6 nl_netdev.napi_set_threaded
ok 7 nl_netdev.nsim_rxq_reset_down
# Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0

Signed-off-by: Samiullah Khawaja
Reviewed-by: Willem de Bruijn
Link: https://patch.msgid.link/20250710211203.3979655-1-skhawaja@google.com
Signed-off-by: Jakub Kicinski
---
 tools/include/uapi/linux/netdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'tools/include/uapi/linux')

diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 7eb9571786b8..1f3719a9a0eb 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -134,6 +134,7 @@ enum {
 	NETDEV_A_NAPI_DEFER_HARD_IRQS,
 	NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
 	NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
+	NETDEV_A_NAPI_THREADED,
 
 	__NETDEV_A_NAPI_MAX,
 	NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
-- cgit

From 8e7583a4f65f3dbf3e8deb4e60f3679c276bef62 Mon Sep 17 00:00:00 2001
From: Samiullah Khawaja
Date: Wed, 23 Jul 2025 01:30:31 +0000
Subject: net: define an enum for the napi threaded state

Instead of using '0' and '1' for the napi threaded state, use an enum
with 'disabled' and 'enabled' states.

Tested:
./tools/testing/selftests/net/nl_netdev.py
TAP version 13
1..7
ok 1 nl_netdev.empty_check
ok 2 nl_netdev.lo_check
ok 3 nl_netdev.page_pool_check
ok 4 nl_netdev.napi_list_check
ok 5 nl_netdev.dev_set_threaded
ok 6 nl_netdev.napi_set_threaded
ok 7 nl_netdev.nsim_rxq_reset_down
# Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0

Signed-off-by: Samiullah Khawaja
Link: https://patch.msgid.link/20250723013031.2911384-4-skhawaja@google.com
Signed-off-by: Jakub Kicinski
---
 tools/include/uapi/linux/netdev.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'tools/include/uapi/linux')

diff --git a/tools/include/uapi/linux/netdev.h b/tools/include/uapi/linux/netdev.h
index 1f3719a9a0eb..48eb49aa03d4 100644
--- a/tools/include/uapi/linux/netdev.h
+++ b/tools/include/uapi/linux/netdev.h
@@ -77,6 +77,11 @@ enum netdev_qstats_scope {
 	NETDEV_QSTATS_SCOPE_QUEUE = 1,
 };
 
+enum netdev_napi_threaded {
+	NETDEV_NAPI_THREADED_DISABLED,
+	NETDEV_NAPI_THREADED_ENABLED,
+};
+
 enum {
 	NETDEV_A_DEV_IFINDEX = 1,
 	NETDEV_A_DEV_PAD,
-- cgit
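Usage illustration (not part of either patch): a sketch of how a C program
could drive the extended `napi-set` operation over generic netlink to enable
threaded polling for a single NAPI instance, using the NETDEV_A_NAPI_THREADED
attribute and the netdev_napi_threaded enum added above. It assumes
libnl-3/libnl-genl-3, encodes the attribute as a u32, and trims error
reporting; the in-tree test exercises the same operation through the YNL
Python helpers in nl_netdev.py instead.

#include <errno.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>
#include <linux/netdev.h>

/* Sketch: set the threaded state of one NAPI instance by id.
 * The NAPI id would come from a prior napi-get dump.
 */
static int napi_set_threaded(unsigned int napi_id,
			     enum netdev_napi_threaded state)
{
	struct nl_sock *sk;
	struct nl_msg *msg;
	int family, err;

	sk = nl_socket_alloc();
	if (!sk)
		return -ENOMEM;

	err = genl_connect(sk);
	if (err < 0)
		goto out_sock;

	/* Resolve the "netdev" generic netlink family id. */
	family = genl_ctrl_resolve(sk, NETDEV_FAMILY_NAME);
	if (family < 0) {
		err = family;
		goto out_sock;
	}

	msg = nlmsg_alloc();
	if (!msg) {
		err = -ENOMEM;
		goto out_sock;
	}

	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    NETDEV_CMD_NAPI_SET, NETDEV_FAMILY_VERSION);
	nla_put_u32(msg, NETDEV_A_NAPI_ID, napi_id);
	nla_put_u32(msg, NETDEV_A_NAPI_THREADED, state);

	err = nl_send_auto(sk, msg);
	if (err >= 0)
		err = nl_wait_for_ack(sk);

	nlmsg_free(msg);
out_sock:
	nl_socket_free(sk);
	return err;
}

A call such as napi_set_threaded(id, NETDEV_NAPI_THREADED_ENABLED) would be
the C-side analogue of what the napi_set_threaded selftest case exercises,
and reading the attribute back via napi-get is expected to report the same
enum value.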