From 31557b3487b349464daf42bc4366153743c1e727 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Jun 2025 07:39:33 -0700 Subject: uapi: in6: restore visibility of most IPv6 socket options MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A decade ago commit 6d08acd2d32e ("in6: fix conflict with glibc") hid the definitions of IPV6 options, because GCC was complaining about duplicates. The commit did not list the warnings seen, but trying to recreate them now I think they are (building iproute2): In file included from ./include/uapi/rdma/rdma_user_cm.h:39, from rdma.h:16, from res.h:9, from res-ctx.c:7: ../include/uapi/linux/in6.h:171:9: warning: ‘IPV6_ADD_MEMBERSHIP’ redefined 171 | #define IPV6_ADD_MEMBERSHIP 20 | ^~~~~~~~~~~~~~~~~~~ In file included from /usr/include/netinet/in.h:37, from rdma.h:13: /usr/include/bits/in.h:233:10: note: this is the location of the previous definition 233 | # define IPV6_ADD_MEMBERSHIP IPV6_JOIN_GROUP | ^~~~~~~~~~~~~~~~~~~ ../include/uapi/linux/in6.h:172:9: warning: ‘IPV6_DROP_MEMBERSHIP’ redefined 172 | #define IPV6_DROP_MEMBERSHIP 21 | ^~~~~~~~~~~~~~~~~~~~ /usr/include/bits/in.h:234:10: note: this is the location of the previous definition 234 | # define IPV6_DROP_MEMBERSHIP IPV6_LEAVE_GROUP | ^~~~~~~~~~~~~~~~~~~~ Compilers don't complain about redefinition if the defines are identical, but here we have the kernel using the literal value, and glibc using an indirection (defining to a name of another define, with the same numerical value). Problem is, the commit in question hid all the IPV6 socket options, and glibc has a pretty sparse list. For instance it lacks Flow Label related options. Willem called this out in commit 3fb321fde22d ("selftests/net: ipv6 flowlabel"): /* uapi/glibc weirdness may leave this undefined */ #ifndef IPV6_FLOWINFO #define IPV6_FLOWINFO 11 #endif More interestingly some applications (socat) use a #ifdef IPV6_FLOWINFO to gate compilation of thier rudimentary flow label support. (For added confusion socat misspells it as IPV4_FLOWINFO in some places.) Hide only the two defines we know glibc has a problem with. If we discover more warnings we can hide more but we should avoid covering the entire block of defines for "IPV6 socket options". Link: https://patch.msgid.link/20250609143933.1654417-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/in6.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index ff8d21f9e95b..5a47339ef7d7 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -152,7 +152,6 @@ struct in6_flowlabel_req { /* * IPV6 socket options */ -#if __UAPI_DEF_IPV6_OPTIONS #define IPV6_ADDRFORM 1 #define IPV6_2292PKTINFO 2 #define IPV6_2292HOPOPTS 3 @@ -169,8 +168,10 @@ struct in6_flowlabel_req { #define IPV6_MULTICAST_IF 17 #define IPV6_MULTICAST_HOPS 18 #define IPV6_MULTICAST_LOOP 19 +#if __UAPI_DEF_IPV6_OPTIONS #define IPV6_ADD_MEMBERSHIP 20 #define IPV6_DROP_MEMBERSHIP 21 +#endif #define IPV6_ROUTER_ALERT 22 #define IPV6_MTU_DISCOVER 23 #define IPV6_MTU 24 @@ -203,7 +204,6 @@ struct in6_flowlabel_req { #define IPV6_IPSEC_POLICY 34 #define IPV6_XFRM_POLICY 35 #define IPV6_HDRINCL 36 -#endif /* * Multicast: -- cgit From c035e736038045b411cb368e63f07bc2f5dbc0e1 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Thu, 12 Jun 2025 17:28:33 +0200 Subject: dpll: add phase-offset-monitor feature to netlink spec Add enum dpll_feature_state for control over features. Add dpll device level attribute: DPLL_A_PHASE_OFFSET_MONITOR - to allow control over a phase offset monitor feature. Attribute is present and shall return current state of a feature (enum dpll_feature_state), if the device driver provides such capability, otherwie attribute shall not be present. Reviewed-by: Aleksandr Loktionov Reviewed-by: Milena Olech Reviewed-by: Jiri Pirko Signed-off-by: Arkadiusz Kubalewski Acked-by: Vadim Fedorenko Link: https://patch.msgid.link/20250612152835.1703397-2-arkadiusz.kubalewski@intel.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/dpll.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h index bf97d4b6d51f..349e1b3ca1ae 100644 --- a/include/uapi/linux/dpll.h +++ b/include/uapi/linux/dpll.h @@ -192,6 +192,17 @@ enum dpll_pin_capabilities { #define DPLL_PHASE_OFFSET_DIVIDER 1000 +/** + * enum dpll_feature_state - Allow control (enable/disable) and status checking + * over features. + * @DPLL_FEATURE_STATE_DISABLE: feature shall be disabled + * @DPLL_FEATURE_STATE_ENABLE: feature shall be enabled + */ +enum dpll_feature_state { + DPLL_FEATURE_STATE_DISABLE, + DPLL_FEATURE_STATE_ENABLE, +}; + enum dpll_a { DPLL_A_ID = 1, DPLL_A_MODULE_NAME, @@ -204,6 +215,7 @@ enum dpll_a { DPLL_A_TYPE, DPLL_A_LOCK_STATUS_ERROR, DPLL_A_CLOCK_QUALITY_LEVEL, + DPLL_A_PHASE_OFFSET_MONITOR, __DPLL_A_MAX, DPLL_A_MAX = (__DPLL_A_MAX - 1) -- cgit From f8337efa4ff5a27e6c1d4e384166413eecd21a65 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:19 +0200 Subject: vxlan: Support MC routing in the underlay Locally-generated MC packets have so far not been subject to MC routing. Instead an MC-enabled installation would maintain the MC routing tables, and separately from that the list of interfaces to send packets to as part of the VXLAN FDB and MDB. In a previous patch, a ip_mr_output() and ip6_mr_output() routines were added for IPv4 and IPv6. All locally generated MC traffic is now passed through these functions. For reasons of backward compatibility, an SKB (IPCB / IP6CB) flag guards the actual MC routing. This patch adds logic to set the flag, and the UAPI to enable the behavior. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/d899655bb7e9b2521ee8c793e67056b9fd02ba12.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 3ad2d5d98034..873c285996fe 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1398,6 +1398,7 @@ enum { IFLA_VXLAN_LOCALBYPASS, IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */ IFLA_VXLAN_RESERVED_BITS, + IFLA_VXLAN_MC_ROUTE, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) -- cgit From fc0e6db30941a66e284b8516b82356f97f31061d Mon Sep 17 00:00:00 2001 From: "Kory Maincent (Dent Project)" Date: Tue, 17 Jun 2025 14:12:01 +0200 Subject: net: pse-pd: Add support for reporting events Add support for devm_pse_irq_helper() to register PSE interrupts and report events such as over-current or over-temperature conditions. This follows a similar approach to the regulator API but also sends notifications using a dedicated PSE ethtool netlink socket. Signed-off-by: Kory Maincent (Dent Project) Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-2-78a1a645e2ee@bootlin.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink_generated.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 9a02f579de22..3864aa0de8c7 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -49,6 +49,16 @@ enum hwtstamp_source { HWTSTAMP_SOURCE_PHYLIB, }; +/** + * enum ethtool_pse_event - PSE event list for the PSE controller + * @ETHTOOL_PSE_EVENT_OVER_CURRENT: PSE output current is too high + * @ETHTOOL_PSE_EVENT_OVER_TEMP: PSE in over temperature state + */ +enum ethtool_pse_event { + ETHTOOL_PSE_EVENT_OVER_CURRENT = 1, + ETHTOOL_PSE_EVENT_OVER_TEMP = 2, +}; + enum { ETHTOOL_A_HEADER_UNSPEC, ETHTOOL_A_HEADER_DEV_INDEX, @@ -718,6 +728,14 @@ enum { ETHTOOL_A_TSCONFIG_MAX = (__ETHTOOL_A_TSCONFIG_CNT - 1) }; +enum { + ETHTOOL_A_PSE_NTF_HEADER = 1, + ETHTOOL_A_PSE_NTF_EVENTS, + + __ETHTOOL_A_PSE_NTF_CNT, + ETHTOOL_A_PSE_NTF_MAX = (__ETHTOOL_A_PSE_NTF_CNT - 1) +}; + enum { ETHTOOL_MSG_USER_NONE = 0, ETHTOOL_MSG_STRSET_GET = 1, @@ -822,6 +840,7 @@ enum { ETHTOOL_MSG_PHY_NTF, ETHTOOL_MSG_TSCONFIG_GET_REPLY, ETHTOOL_MSG_TSCONFIG_SET_REPLY, + ETHTOOL_MSG_PSE_NTF, __ETHTOOL_MSG_KERNEL_CNT, ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1) -- cgit From 1176978ed851952652ddea3685e2f71a0e5d61ff Mon Sep 17 00:00:00 2001 From: "Kory Maincent (Dent Project)" Date: Tue, 17 Jun 2025 14:12:04 +0200 Subject: net: ethtool: Add support for new power domains index description Report the index of the newly introduced PSE power domain to the user, enabling improved management of the power budget for PSE devices. Signed-off-by: Kory Maincent (Dent Project) Reviewed-by: Oleksij Rempel Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-5-78a1a645e2ee@bootlin.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink_generated.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 3864aa0de8c7..ed344c8533eb 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -652,6 +652,7 @@ enum { ETHTOOL_A_C33_PSE_EXT_SUBSTATE, ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT, ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES, + ETHTOOL_A_PSE_PW_D_ID, __ETHTOOL_A_PSE_CNT, ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) -- cgit From ffef61d6d27374542f1bce4452200d9bdd2e1edd Mon Sep 17 00:00:00 2001 From: "Kory Maincent (Dent Project)" Date: Tue, 17 Jun 2025 14:12:06 +0200 Subject: net: pse-pd: Add support for budget evaluation strategies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces the ability to configure the PSE PI budget evaluation strategies. Budget evaluation strategies is utilized by PSE controllers to determine which ports to turn off first in scenarios such as power budget exceedance. The pis_prio_max value is used to define the maximum priority level supported by the controller. Both the current priority and the maximum priority are exposed to the user through the pse_ethtool_get_status call. This patch add support for two mode of budget evaluation strategies. 1. Static Method: This method involves distributing power based on PD classification. It’s straightforward and stable, the PSE core keeping track of the budget and subtracting the power requested by each PD’s class. Advantages: Every PD gets its promised power at any time, which guarantees reliability. Disadvantages: PD classification steps are large, meaning devices request much more power than they actually need. As a result, the power supply may only operate at, say, 50% capacity, which is inefficient and wastes money. Priority max value is matching the number of PSE PIs within the PSE. 2. Dynamic Method: To address the inefficiencies of the static method, vendors like Microchip have introduced dynamic power budgeting, as seen in the PD692x0 firmware. This method monitors the current consumption per port and subtracts it from the available power budget. When the budget is exceeded, lower-priority ports are shut down. Advantages: This method optimizes resource utilization, saving costs. Disadvantages: Low-priority devices may experience instability. Priority max value is set by the PSE controller driver. For now, budget evaluation methods are not configurable and cannot be mixed. They are hardcoded in the PSE driver itself, as no current PSE controller supports both methods. Signed-off-by: Kory Maincent (Dent Project) Acked-by: Oleksij Rempel Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-7-78a1a645e2ee@bootlin.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink_generated.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index ed344c8533eb..c6a95224be25 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -53,10 +53,28 @@ enum hwtstamp_source { * enum ethtool_pse_event - PSE event list for the PSE controller * @ETHTOOL_PSE_EVENT_OVER_CURRENT: PSE output current is too high * @ETHTOOL_PSE_EVENT_OVER_TEMP: PSE in over temperature state + * @ETHTOOL_C33_PSE_EVENT_DETECTION: detection process occur on the PSE. IEEE + * 802.3-2022 33.2.5 and 145.2.6 PSE detection of PDs. IEEE 802.3-202 + * 30.9.1.1.5 aPSEPowerDetectionStatus + * @ETHTOOL_C33_PSE_EVENT_CLASSIFICATION: classification process occur on the + * PSE. IEEE 802.3-2022 33.2.6 and 145.2.8 classification of PDs mutual + * identification. IEEE 802.3-2022 30.9.1.1.8 aPSEPowerClassification. + * @ETHTOOL_C33_PSE_EVENT_DISCONNECTION: PD has been disconnected on the PSE. + * IEEE 802.3-2022 33.3.8 and 145.3.9 PD Maintain Power Signature. IEEE + * 802.3-2022 33.5.1.2.9 MPS Absent. IEEE 802.3-2022 30.9.1.1.20 + * aPSEMPSAbsentCounter. + * @ETHTOOL_PSE_EVENT_OVER_BUDGET: PSE turned off due to over budget situation + * @ETHTOOL_PSE_EVENT_SW_PW_CONTROL_ERROR: PSE faced an error managing the + * power control from software */ enum ethtool_pse_event { ETHTOOL_PSE_EVENT_OVER_CURRENT = 1, ETHTOOL_PSE_EVENT_OVER_TEMP = 2, + ETHTOOL_C33_PSE_EVENT_DETECTION = 4, + ETHTOOL_C33_PSE_EVENT_CLASSIFICATION = 8, + ETHTOOL_C33_PSE_EVENT_DISCONNECTION = 16, + ETHTOOL_PSE_EVENT_OVER_BUDGET = 32, + ETHTOOL_PSE_EVENT_SW_PW_CONTROL_ERROR = 64, }; enum { -- cgit From eeb0c8f72f49a21984981188404cfd3700edbaff Mon Sep 17 00:00:00 2001 From: "Kory Maincent (Dent Project)" Date: Tue, 17 Jun 2025 14:12:07 +0200 Subject: net: ethtool: Add PSE port priority support feature This patch expands the status information provided by ethtool for PSE c33 with current port priority and max port priority. It also adds a call to pse_ethtool_set_prio() to configure the PSE port priority. Signed-off-by: Kory Maincent (Dent Project) Reviewed-by: Oleksij Rempel Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-8-78a1a645e2ee@bootlin.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink_generated.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index c6a95224be25..8e5d067e7ddf 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -671,6 +671,8 @@ enum { ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT, ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES, ETHTOOL_A_PSE_PW_D_ID, + ETHTOOL_A_PSE_PRIO_MAX, + ETHTOOL_A_PSE_PRIO, __ETHTOOL_A_PSE_CNT, ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) -- cgit From 5ae1fc4069578f50798f3372f36a3c13ee565b66 Mon Sep 17 00:00:00 2001 From: Kavita Kavita Date: Wed, 4 Jun 2025 16:27:56 +0530 Subject: wifi: cfg80211: Improve the documentation for NL80211_CMD_ASSOC_MLO_RECONF The existing documentation for the NL80211_CMD_ASSOC_MLO_RECONF does not clearly explain handling of link reconfiguration request results from the driver. Add documentation to explain that the command is used as an event to notify userspace about added links information, and that the existing NL80211_CMD_LINKS_REMOVED command is used to notify userspace about removed links information. Signed-off-by: Kavita Kavita Link: https://patch.msgid.link/20250604105757.2542-2-quic_kkavita@quicinc.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e9ccf43fe3c6..e53840d009d1 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1330,7 +1330,11 @@ * TID to Link mapping for downlink/uplink traffic. * * @NL80211_CMD_ASSOC_MLO_RECONF: For a non-AP MLD station, request to - * add/remove links to/from the association. + * add/remove links to/from the association. To indicate link + * reconfiguration request results from the driver, this command is also + * used as an event to notify userspace about the added links information. + * For notifying the removed links information, the existing + * %NL80211_CMD_LINKS_REMOVED command is used. * * @NL80211_CMD_EPCS_CFG: EPCS configuration for a station. Used by userland to * control EPCS configuration. Used to notify userland on the current state -- cgit From 7c598c653ad465138ecc2fe64492633c541effef Mon Sep 17 00:00:00 2001 From: Kavita Kavita Date: Wed, 4 Jun 2025 16:27:57 +0530 Subject: wifi: cfg80211: Add support for link reconfiguration negotiation offload to driver In the case of SME-in-driver, the driver can internally choose to update the links based on the AP MLD recommendation and do link reconfiguration negotiation with AP MLD. (e.g., After the driver processing the BSS Transition Management request frame received from the AP MLD with Neighbor Report containing Multi-Link element with recommended links information chooses to do link reconfiguration negotiation with AP MLD). To support this, extend cfg80211_mlo_reconf_add_done() and NL80211_CMD_ASSOC_MLO_RECONF to indicate added links information for driver-initiated link reconfiguration requests. For removed links, the driver indicates links information using the NL80211_CMD_LINKS_REMOVED event for driver-initiated cases, the same as supplicant initiated cases. For the driver-initiated case, cfg80211 will receive link reconfiguration result asynchronously from driver so holding BSSes of the accepted add links is needed in the event path. Also, no need of unhold call for the rejected add link BSSes since there was no hold call happened previously. Once the supplicant receives the NL80211_CMD_ASSOC_MLO_RECONF event, it needs to process the information about newly added links and install per-link group keys (e.g., GTK/IGTK/BIGTK etc.). In case of the SME-in-driver, using a vendor interface etc. to notify the supplicant to initiate a link reconfiguration request and then supplicant sending command to the cfg80211 can lead to race conditions. The correct design to avoid this is that the driver indicates the cfg80211 directly with the results of the link reconfiguration negotiation. Signed-off-by: Kavita Kavita Link: https://patch.msgid.link/20250604105757.2542-3-quic_kkavita@quicinc.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e53840d009d1..a289014abe37 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1334,7 +1334,11 @@ * reconfiguration request results from the driver, this command is also * used as an event to notify userspace about the added links information. * For notifying the removed links information, the existing - * %NL80211_CMD_LINKS_REMOVED command is used. + * %NL80211_CMD_LINKS_REMOVED command is used. This command is also used to + * notify userspace about newly added links for the current connection in + * case of AP-initiated link recommendation requests, received via + * a BTM (BSS Transition Management) request or a link reconfig notify + * frame, where the driver handles the link recommendation offload. * * @NL80211_CMD_EPCS_CFG: EPCS configuration for a station. Used by userland to * control EPCS configuration. Used to notify userland on the current state -- cgit From b74947b4f6ff7c122a1bb6eb38bb7ecfbb1d3820 Mon Sep 17 00:00:00 2001 From: Roopni Devanathan Date: Sun, 15 Jun 2025 13:53:09 +0530 Subject: wifi: cfg80211/mac80211: Add support to get radio index Currently, per-radio attributes are set on per-phy basis, i.e., all the radios present in a wiphy will take attributes values sent from user. But each radio in a wiphy can get different values from userspace based on its requirement. To extend support to set per-radio attributes, add support to get radio index from userspace. Add an NL attribute - NL80211_ATTR_WIPHY_RADIO_INDEX, to get user specified radio index for which attributes should be changed. Pass this to individual drivers, so that the drivers can use this radio index to change per-radio attributes when necessary. Currently, per-radio attributes identified are: NL80211_ATTR_WIPHY_TX_POWER_LEVEL NL80211_ATTR_WIPHY_ANTENNA_TX NL80211_ATTR_WIPHY_ANTENNA_RX NL80211_ATTR_WIPHY_RETRY_SHORT NL80211_ATTR_WIPHY_RETRY_LONG NL80211_ATTR_WIPHY_FRAG_THRESHOLD NL80211_ATTR_WIPHY_RTS_THRESHOLD NL80211_ATTR_WIPHY_COVERAGE_CLASS NL80211_ATTR_TXQ_LIMIT NL80211_ATTR_TXQ_MEMORY_LIMIT NL80211_ATTR_TXQ_QUANTUM By default, the radio index is set to -1. This means the attribute should be treated as a global configuration. If the user has not specified any index, then the radio index passed to individual drivers would be -1. This would indicate that the attribute applies to all radios in that wiphy. Signed-off-by: Roopni Devanathan Link: https://patch.msgid.link/20250615082312.619639-2-quic_rdevanat@quicinc.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a289014abe37..2a71149c3065 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2907,6 +2907,14 @@ enum nl80211_commands { * APs Support". Drivers may set additional flags that they support * in the kernel or device. * + * @NL80211_ATTR_WIPHY_RADIO_INDEX: (int) Integer attribute denoting the index + * of the radio in interest. Internally a value of -1 is used to + * indicate that the radio id is not given in user-space. This means + * that all the attributes are applicable to all the radios. If there is + * a radio index provided in user-space, the attributes will be + * applicable to that specific radio only. If the radio id is greater + * thank the number of radios, error denoting invalid value is returned. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3464,6 +3472,8 @@ enum nl80211_attrs { NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS, + NL80211_ATTR_WIPHY_RADIO_INDEX, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, -- cgit From 89595190058c6e9ca4a8ca7d49be3fc8d2395e79 Mon Sep 17 00:00:00 2001 From: Roopni Devanathan Date: Sun, 15 Jun 2025 13:53:11 +0530 Subject: wifi: cfg80211: Report per-radio RTS threshold to userspace In case of multi-radio wiphys, with per-radio RTS threshold brought into use, RTS threshold for each radio in a wiphy can be recorded in wiphy parameter - wiphy_radio_cfg, as an array. Add a new attribute - NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD in nested parameter - NL80211_ATTR_WIPHY_RADIOS. When a request for getting RTS threshold for a particular radio is received, parse the radio id and get the required data. Add this data to the newly added nested attribute NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD. Add support to report this data to userspace. Signed-off-by: Roopni Devanathan Link: https://patch.msgid.link/20250615082312.619639-4-quic_rdevanat@quicinc.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 2a71149c3065..39460334dafb 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -8106,6 +8106,7 @@ enum nl80211_ap_settings_flags { * and contains attributes defined in &enum nl80211_if_combination_attrs. * @NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK: bitmask (u32) of antennas * connected to this radio. + * @NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD: RTS threshold (u32) of this radio. * * @__NL80211_WIPHY_RADIO_ATTR_LAST: Internal * @NL80211_WIPHY_RADIO_ATTR_MAX: Highest attribute @@ -8117,6 +8118,7 @@ enum nl80211_wiphy_radio_attrs { NL80211_WIPHY_RADIO_ATTR_FREQ_RANGE, NL80211_WIPHY_RADIO_ATTR_INTERFACE_COMBINATION, NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK, + NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD, /* keep last */ __NL80211_WIPHY_RADIO_ATTR_LAST, -- cgit From 826334359eacc1b70e9752ebc4954ed775dd40ca Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 23 Jun 2025 16:17:13 -0700 Subject: netlink: specs: add the multicast group name to spec Add the multicast group's name to the YAML spec. Without it YNL doesn't know how to subscribe to notifications. Reviewed-by: Donald Hunter Link: https://patch.msgid.link/20250623231720.3124717-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink.h | 2 -- include/uapi/linux/ethtool_netlink_generated.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index 09a75bdb6560..fa5d645140a4 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -208,6 +208,4 @@ enum { ETHTOOL_A_STATS_PHY_MAX = (__ETHTOOL_A_STATS_PHY_CNT - 1) }; -#define ETHTOOL_MCGRP_MONITOR_NAME "monitor" - #endif /* _UAPI_LINUX_ETHTOOL_NETLINK_H_ */ diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 4944badf9fba..859e28c8a91a 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -867,4 +867,6 @@ enum { ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1) }; +#define ETHTOOL_MCGRP_MONITOR_NAME "monitor" + #endif /* _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H */ -- cgit From 46837be5afc6ea70bc827ca4439410e069e2ee37 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 23 Jun 2025 16:17:18 -0700 Subject: net: ethtool: rss: add notifications In preparation for RSS_SET handling in ethnl introduce Netlink notifications for RSS. Only cover modifications, not creation and not removal of a context, because the latter may deserve a different notification type. We should cross that bridge when we add the support for context add / remove via Netlink. Link: https://patch.msgid.link/20250623231720.3124717-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink_generated.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 859e28c8a91a..8f30ffa1cd14 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -862,6 +862,7 @@ enum { ETHTOOL_MSG_TSCONFIG_GET_REPLY, ETHTOOL_MSG_TSCONFIG_SET_REPLY, ETHTOOL_MSG_PSE_NTF, + ETHTOOL_MSG_RSS_NTF, __ETHTOOL_MSG_KERNEL_CNT, ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1) -- cgit From 2855e43c6bb154a9b8e27abda8df364aed574b22 Mon Sep 17 00:00:00 2001 From: RubenKelevra Date: Tue, 24 Jun 2025 18:57:11 +0200 Subject: uapi: net_dropmon: drop unused is_drop_point_hw macro Commit 4ea7e38696c7 ("dropmon: add ability to detect when hardware drops rx packets") introduced is_drop_point_hw, but the symbol was never referenced anywhere in the kernel tree and is currently not used by dropwatch. I could not find, to the best of my abilities, a current out-of-tree user of this macro. The definition also contains a syntax error in its for-loop, so any project that tried to compile against it would fail. Removing the macro therefore eliminates dead code without breaking existing users. Signed-off-by: RubenKelevra Link: https://patch.msgid.link/20250624165711.1188691-1-rubenkelevra@gmail.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/net_dropmon.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h index 9dd41c2f58a6..87cbef48d4c7 100644 --- a/include/uapi/linux/net_dropmon.h +++ b/include/uapi/linux/net_dropmon.h @@ -10,13 +10,6 @@ struct net_dm_drop_point { __u32 count; }; -#define is_drop_point_hw(x) do {\ - int ____i, ____j;\ - for (____i = 0; ____i < 8; i ____i++)\ - ____j |= x[____i];\ - ____j;\ -} while (0) - #define NET_DM_CFG_VERSION 0 #define NET_DM_CFG_ALERT_COUNT 1 #define NET_DM_CFG_ALERT_DELAY 2 -- cgit From 7f15ee35972dd3dee37704bfd0f136290f6d63d9 Mon Sep 17 00:00:00 2001 From: Arkadiusz Kubalewski Date: Thu, 26 Jun 2025 15:52:17 +0200 Subject: dpll: add reference-sync netlink attribute Add new netlink attribute to allow user space configuration of reference sync pin pairs, where both pins are used to provide one clock signal consisting of both: base frequency and sync signal. Reviewed-by: Przemek Kitszel Reviewed-by: Milena Olech Reviewed-by: Jiri Pirko Signed-off-by: Arkadiusz Kubalewski Link: https://patch.msgid.link/20250626135219.1769350-2-arkadiusz.kubalewski@intel.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/dpll.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h index 349e1b3ca1ae..37b438ce8efc 100644 --- a/include/uapi/linux/dpll.h +++ b/include/uapi/linux/dpll.h @@ -249,6 +249,7 @@ enum dpll_a_pin { DPLL_A_PIN_ESYNC_FREQUENCY, DPLL_A_PIN_ESYNC_FREQUENCY_SUPPORTED, DPLL_A_PIN_ESYNC_PULSE, + DPLL_A_PIN_REFERENCE_SYNC, __DPLL_A_PIN_MAX, DPLL_A_PIN_MAX = (__DPLL_A_PIN_MAX - 1) -- cgit From 03dc03fa0432a9160c4fcbdb86f274e6b4587972 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 26 Jun 2025 10:31:10 +0300 Subject: neighbor: Add NTF_EXT_VALIDATED flag for externally validated entries tl;dr ===== Add a new neighbor flag ("extern_valid") that can be used to indicate to the kernel that a neighbor entry was learned and determined to be valid externally. The kernel will not try to remove or invalidate such an entry, leaving these decisions to the user space control plane. This is needed for EVPN multi-homing where a neighbor entry for a multi-homed host needs to be synced across all the VTEPs among which the host is multi-homed. Background ========== In a typical EVPN multi-homing setup each host is multi-homed using a set of links called ES (Ethernet Segment, i.e., LAG) to multiple leaf switches (VTEPs). VTEPs that are connected to the same ES are called ES peers. When a neighbor entry is learned on a VTEP, it is distributed to both ES peers and remote VTEPs using EVPN MAC/IP advertisement routes. ES peers use the neighbor entry when routing traffic towards the multi-homed host and remote VTEPs use it for ARP/NS suppression. Motivation ========== If the ES link between a host and the VTEP on which the neighbor entry was locally learned goes down, the EVPN MAC/IP advertisement route will be withdrawn and the neighbor entries will be removed from both ES peers and remote VTEPs. Routing towards the multi-homed host and ARP/NS suppression can fail until another ES peer locally learns the neighbor entry and distributes it via an EVPN MAC/IP advertisement route. "draft-rbickhart-evpn-ip-mac-proxy-adv-03" [1] suggests avoiding these intermittent failures by having the ES peers install the neighbor entries as before, but also injecting EVPN MAC/IP advertisement routes with a proxy indication. When the previously mentioned ES link goes down and the original EVPN MAC/IP advertisement route is withdrawn, the ES peers will not withdraw their neighbor entries, but instead start aging timers for the proxy indication. If an ES peer locally learns the neighbor entry (i.e., it becomes "reachable"), it will restart its aging timer for the entry and emit an EVPN MAC/IP advertisement route without a proxy indication. An ES peer will stop its aging timer for the proxy indication if it observes the removal of the proxy indication from at least one of the ES peers advertising the entry. In the event that the aging timer for the proxy indication expired, an ES peer will withdraw its EVPN MAC/IP advertisement route. If the timer expired on all ES peers and they all withdrew their proxy advertisements, the neighbor entry will be completely removed from the EVPN fabric. Implementation ============== In the above scheme, when the control plane (e.g., FRR) advertises a neighbor entry with a proxy indication, it expects the corresponding entry in the data plane (i.e., the kernel) to remain valid and not be removed due to garbage collection or loss of carrier. The control plane also expects the kernel to notify it if the entry was learned locally (i.e., became "reachable") so that it will remove the proxy indication from the EVPN MAC/IP advertisement route. That is why these entries cannot be programmed with dummy states such as "permanent" or "noarp". Instead, add a new neighbor flag ("extern_valid") which indicates that the entry was learned and determined to be valid externally and should not be removed or invalidated by the kernel. The kernel can probe the entry and notify user space when it becomes "reachable" (it is initially installed as "stale"). However, if the kernel does not receive a confirmation, have it return the entry to the "stale" state instead of the "failed" state. In other words, an entry marked with the "extern_valid" flag behaves like any other dynamically learned entry other than the fact that the kernel cannot remove or invalidate it. One can argue that the "extern_valid" flag should not prevent garbage collection and that instead a neighbor entry should be programmed with both the "extern_valid" and "extern_learn" flags. There are two reasons for not doing that: 1. Unclear why a control plane would like to program an entry that the kernel cannot invalidate but can completely remove. 2. The "extern_learn" flag is used by FRR for neighbor entries learned on remote VTEPs (for ARP/NS suppression) whereas here we are concerned with local entries. This distinction is currently irrelevant for the kernel, but might be relevant in the future. Given that the flag only makes sense when the neighbor has a valid state, reject attempts to add a neighbor with an invalid state and with this flag set. For example: # ip neigh add 192.0.2.1 nud none dev br0.10 extern_valid Error: Cannot create externally validated neighbor with an invalid state. # ip neigh add 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid # ip neigh replace 192.0.2.1 nud failed dev br0.10 extern_valid Error: Cannot mark neighbor as externally validated with an invalid state. The above means that a neighbor cannot be created with the "extern_valid" flag and flags such as "use" or "managed" as they result in a neighbor being created with an invalid state ("none") and immediately getting probed: # ip neigh add 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use Error: Cannot create externally validated neighbor with an invalid state. However, these flags can be used together with "extern_valid" after the neighbor was created with a valid state: # ip neigh add 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid # ip neigh replace 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use One consequence of preventing the kernel from invalidating a neighbor entry is that by default it will only try to determine reachability using unicast probes. This can be changed using the "mcast_resolicit" sysctl: # sysctl net.ipv4.neigh.br0/10.mcast_resolicit 0 # tcpdump -nn -e -i br0.10 -Q out arp & # ip neigh replace 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 # sysctl -wq net.ipv4.neigh.br0/10.mcast_resolicit=3 # ip neigh replace 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 iproute2 patches can be found here [2]. [1] https://datatracker.ietf.org/doc/html/draft-rbickhart-evpn-ip-mac-proxy-adv-03 [2] https://github.com/idosch/iproute2/tree/submit/extern_valid_v1 Signed-off-by: Ido Schimmel Acked-by: Daniel Borkmann Link: https://patch.msgid.link/20250626073111.244534-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/neighbour.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index b851c36ad25d..c34a81245f87 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -54,6 +54,7 @@ enum { /* Extended flags under NDA_FLAGS_EXT: */ #define NTF_EXT_MANAGED (1 << 0) #define NTF_EXT_LOCKED (1 << 1) +#define NTF_EXT_EXT_VALIDATED (1 << 2) /* * Neighbor Cache Entry States. @@ -92,6 +93,10 @@ enum { * bridge in response to a host trying to communicate via a locked bridge port * with MAB enabled. Their purpose is to notify user space that a host requires * authentication. + * + * NTF_EXT_EXT_VALIDATED flagged neighbor entries were externally validated by + * a user space control plane. The kernel will not remove or invalidate them, + * but it can probe them and notify user space when they become reachable. */ struct nda_cacheinfo { -- cgit From 566e8f108fc7847f2a8676ec6a101d37b7dd0fb4 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Sun, 29 Jun 2025 17:21:32 +0300 Subject: devlink: Extend devlink rate API with traffic classes bandwidth management Introduce support for specifying relative bandwidth shares between traffic classes (TC) in the devlink-rate API. This new option allows users to allocate bandwidth across multiple traffic classes in a single command. This feature provides a more granular control over traffic management, especially for scenarios requiring Enhanced Transmission Selection. Users can now define a relative bandwidth share for each traffic class. For example, assigning share values of 20 to TC0 (TCP/UDP) and 80 to TC5 (RoCE) will result in TC0 receiving 20% and TC5 receiving 80% of the total bandwidth. The actual percentage each class receives depends on the ratio of its share value to the sum of all shares. Example: DEV=pci/0000:08:00.0 $ devlink port function rate add $DEV/vfs_group tx_share 10Gbit \ tx_max 50Gbit tc-bw 0:20 1:0 2:0 3:0 4:0 5:80 6:0 7:0 $ devlink port function rate set $DEV/vfs_group \ tc-bw 0:20 1:0 2:0 3:0 4:0 5:20 6:60 7:0 Example usage with ynl: ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \ --do rate-set --json '{ "bus-name": "pci", "dev-name": "0000:08:00.0", "port-index": 1, "rate-tc-bws": [ {"rate-tc-index": 0, "rate-tc-bw": 50}, {"rate-tc-index": 1, "rate-tc-bw": 50}, {"rate-tc-index": 2, "rate-tc-bw": 0}, {"rate-tc-index": 3, "rate-tc-bw": 0}, {"rate-tc-index": 4, "rate-tc-bw": 0}, {"rate-tc-index": 5, "rate-tc-bw": 0}, {"rate-tc-index": 6, "rate-tc-bw": 0}, {"rate-tc-index": 7, "rate-tc-bw": 0} ] }' ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \ --do rate-get --json '{ "bus-name": "pci", "dev-name": "0000:08:00.0", "port-index": 1 }' output for rate-get: {'bus-name': 'pci', 'dev-name': '0000:08:00.0', 'port-index': 1, 'rate-tc-bws': [{'rate-tc-bw': 50, 'rate-tc-index': 0}, {'rate-tc-bw': 50, 'rate-tc-index': 1}, {'rate-tc-bw': 0, 'rate-tc-index': 2}, {'rate-tc-bw': 0, 'rate-tc-index': 3}, {'rate-tc-bw': 0, 'rate-tc-index': 4}, {'rate-tc-bw': 0, 'rate-tc-index': 5}, {'rate-tc-bw': 0, 'rate-tc-index': 6}, {'rate-tc-bw': 0, 'rate-tc-index': 7}], 'rate-tx-max': 0, 'rate-tx-priority': 0, 'rate-tx-share': 0, 'rate-tx-weight': 0, 'rate-type': 'leaf'} Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Reviewed-by: Jiri Pirko Signed-off-by: Tariq Toukan Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250629142138.361537-3-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/devlink.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index a5ee0f13740a..e72bcc239afd 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -221,6 +221,11 @@ enum devlink_port_flavour { */ }; +/* IEEE 802.1Qaz standard supported values. */ + +#define DEVLINK_RATE_TCS_MAX 8 +#define DEVLINK_RATE_TC_INDEX_MAX (DEVLINK_RATE_TCS_MAX - 1) + enum devlink_rate_type { DEVLINK_RATE_TYPE_LEAF, DEVLINK_RATE_TYPE_NODE, @@ -629,6 +634,10 @@ enum devlink_attr { DEVLINK_ATTR_REGION_DIRECT, /* flag */ + DEVLINK_ATTR_RATE_TC_BWS, /* nested */ + DEVLINK_ATTR_RATE_TC_INDEX, /* u8 */ + DEVLINK_ATTR_RATE_TC_BW, /* u32 */ + /* Add new attributes above here, update the spec in * Documentation/netlink/specs/devlink.yaml and re-generate * net/devlink/netlink_gen.c. -- cgit From 59f44c9ccc3bb68aa3b062b8e57ce0e1ee2fca75 Mon Sep 17 00:00:00 2001 From: Ilya Maximets Date: Wed, 2 Jul 2025 17:50:34 +0200 Subject: net: openvswitch: allow providing upcall pid for the 'execute' command When a packet enters OVS datapath and there is no flow to handle it, packet goes to userspace through a MISS upcall. With per-CPU upcall dispatch mechanism, we're using the current CPU id to select the Netlink PID on which to send this packet. This allows us to send packets from the same traffic flow through the same handler. The handler will process the packet, install required flow into the kernel and re-inject the original packet via OVS_PACKET_CMD_EXECUTE. While handling OVS_PACKET_CMD_EXECUTE, however, we may hit a recirculation action that will pass the (likely modified) packet through the flow lookup again. And if the flow is not found, the packet will be sent to userspace again through another MISS upcall. However, the handler thread in userspace is likely running on a different CPU core, and the OVS_PACKET_CMD_EXECUTE request is handled in the syscall context of that thread. So, when the time comes to send the packet through another upcall, the per-CPU dispatch will choose a different Netlink PID, and this packet will end up processed by a different handler thread on a different CPU. The process continues as long as there are new recirculations, each time the packet goes to a different handler thread before it is sent out of the OVS datapath to the destination port. In real setups the number of recirculations can go up to 4 or 5, sometimes more. There is always a chance to re-order packets while processing upcalls, because userspace will first install the flow and then re-inject the original packet. So, there is a race window when the flow is already installed and the second packet can match it and be forwarded to the destination before the first packet is re-injected. But the fact that packets are going through multiple upcalls handled by different userspace threads makes the reordering noticeably more likely, because we not only have a race between the kernel and a userspace handler (which is hard to avoid), but also between multiple userspace handlers. For example, let's assume that 10 packets got enqueued through a MISS upcall for handler-1, it will start processing them, will install the flow into the kernel and start re-injecting packets back, from where they will go through another MISS to handler-2. Handler-2 will install the flow into the kernel and start re-injecting the packets, while handler-1 continues to re-inject the last of the 10 packets, they will hit the flow installed by handler-2 and be forwarded without going to the handler-2, while handler-2 still re-injects the first of these 10 packets. Given multiple recirculations and misses, these 10 packets may end up completely mixed up on the output from the datapath. Let's allow userspace to specify on which Netlink PID the packets should be upcalled while processing OVS_PACKET_CMD_EXECUTE. This makes it possible to ensure that all the packets are processed by the same handler thread in the userspace even with them being upcalled multiple times in the process. Packets will remain in order since they will be enqueued to the same socket and re-injected in the same order. This doesn't eliminate re-ordering as stated above, since we still have a race between kernel and the userspace thread, but it allows to eliminate races between multiple userspace threads. Userspace knows the PID of the socket on which the original upcall is received, so there is no need to send it up from the kernel. Solution requires storing the value somewhere for the duration of the packet processing. There are two potential places for this: our skb extension or the per-CPU storage. It's not clear which is better, so just following currently used scheme of storing this kind of things along the skb. We still have a decent amount of space in the cb. Signed-off-by: Ilya Maximets Acked-by: Flavio Leitner Acked-by: Eelco Chaudron Acked-by: Aaron Conole Link: https://patch.msgid.link/20250702155043.2331772-1-i.maximets@ovn.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/openvswitch.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 3a701bd1f31b..3092c2c6f1d2 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -186,6 +186,11 @@ enum ovs_packet_cmd { * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment * size. * @OVS_PACKET_ATTR_HASH: Packet hash info (e.g. hash, sw_hash and l4_hash in skb). + * @OVS_PACKET_ATTR_UPCALL_PID: Netlink PID to use for upcalls while + * processing %OVS_PACKET_CMD_EXECUTE. Takes precedence over all other ways + * to determine the Netlink PID including %OVS_USERSPACE_ATTR_PID, + * %OVS_DP_ATTR_UPCALL_PID, %OVS_DP_ATTR_PER_CPU_PIDS and the + * %OVS_VPORT_ATTR_UPCALL_PID. * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_PACKET_* commands. @@ -205,6 +210,7 @@ enum ovs_packet_attr { OVS_PACKET_ATTR_MRU, /* Maximum received IP fragment size. */ OVS_PACKET_ATTR_LEN, /* Packet size before truncation. */ OVS_PACKET_ATTR_HASH, /* Packet hash. */ + OVS_PACKET_ATTR_UPCALL_PID, /* u32 Netlink PID. */ __OVS_PACKET_ATTR_MAX }; -- cgit From 3d98ee52659c3f1d3913ae5b97f7743c5247752c Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 27 Jun 2025 21:49:29 +0800 Subject: net: bonding: add broadcast_neighbor netlink option User can config or display the bonding broadcast_neighbor option via iproute2/netlink. Cc: Jay Vosburgh Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: Jonathan Corbet Cc: Andrew Lunn Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nikolay Aleksandrov Signed-off-by: Tonghao Zhang Signed-off-by: Zengbing Tu Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/76b90700ba5b98027dfb51a2f3c5cfea0440a21b.1751031306.git.tonghao@bamaicloud.com Signed-off-by: Paolo Abeni --- include/uapi/linux/if_link.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 873c285996fe..784ace3a519c 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1535,6 +1535,7 @@ enum { IFLA_BOND_MISSED_MAX, IFLA_BOND_NS_IP6_TARGET, IFLA_BOND_COUPLED_CONTROL, + IFLA_BOND_BROADCAST_NEIGH, __IFLA_BOND_MAX, }; -- cgit From ad39c12fcee34b8980a80ad5c803bca9906fbd4e Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 2 Jul 2025 14:20:13 +0800 Subject: net: mctp: add gateway routing support This change allows for gateway routing, where a route table entry may reference a routable endpoint (by network and EID), instead of routing directly to a netdevice. We add support for a RTM_GATEWAY attribute for netlink route updates, with an attribute format of: struct mctp_fq_addr { unsigned int net; mctp_eid_t eid; } - we need the net here to uniquely identify the target EID, as we no longer have the device reference directly (which would provide the net id in the case of direct routes). This makes route lookups recursive, as a route lookup that returns a gateway route must be resolved into a direct route (ie, to a device) eventually. We provide a limit to the route lookups, to prevent infinite loop routing. The route lookup populates a new 'nexthop' field in the dst structure, which now specifies the key for the neighbour table lookup on device output, rather than using the packet destination address directly. Signed-off-by: Jeremy Kerr Link: https://patch.msgid.link/20250702-dev-forwarding-v5-13-1468191da8a4@codeconstruct.com.au Signed-off-by: Paolo Abeni --- include/uapi/linux/mctp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h index e1db65df9359..19ad12a0cd4b 100644 --- a/include/uapi/linux/mctp.h +++ b/include/uapi/linux/mctp.h @@ -37,6 +37,14 @@ struct sockaddr_mctp_ext { __u8 smctp_haddr[MAX_ADDR_LEN]; }; +/* A "fully qualified" MCTP address, which includes the system-local network ID, + * required to uniquely resolve a routable EID. + */ +struct mctp_fq_addr { + unsigned int net; + mctp_eid_t eid; +}; + #define MCTP_NET_ANY 0x0 #define MCTP_ADDR_NULL 0x00 -- cgit From e22da4685013922ade8e7b5e727ac6804fe5b51e Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Tue, 1 Jul 2025 16:46:57 +0200 Subject: net/handshake: Add new parameter 'HANDSHAKE_A_ACCEPT_KEYRING' Add a new netlink parameter 'HANDSHAKE_A_ACCEPT_KEYRING' to provide the serial number of the keyring to use. Signed-off-by: Hannes Reinecke Reviewed-by: Chuck Lever Acked-by: Jakub Kicinski Link: https://patch.msgid.link/20250701144657.104401-1-hare@kernel.org Signed-off-by: Paolo Abeni --- include/uapi/linux/handshake.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/handshake.h b/include/uapi/linux/handshake.h index 3d7ea58778c9..662e7de46c54 100644 --- a/include/uapi/linux/handshake.h +++ b/include/uapi/linux/handshake.h @@ -45,6 +45,7 @@ enum { HANDSHAKE_A_ACCEPT_PEER_IDENTITY, HANDSHAKE_A_ACCEPT_CERTIFICATE, HANDSHAKE_A_ACCEPT_PEERNAME, + HANDSHAKE_A_ACCEPT_KEYRING, __HANDSHAKE_A_ACCEPT_MAX, HANDSHAKE_A_ACCEPT_MAX = (__HANDSHAKE_A_ACCEPT_MAX - 1) -- cgit From 333c515d189657c934470c9b0b8a8fedb016ce6f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 8 Jul 2025 17:54:54 +0200 Subject: vhost-net: allow configuring extended features Use the extended feature type for 'acked_features' and implement two new ioctls operation allowing the user-space to set/query an unbounded amount of features. The actual number of processed features is limited by VIRTIO_FEATURES_MAX and attempts to set features above such limit fail with EOPNOTSUPP. Note that: the legacy ioctls implicitly truncate the negotiated features to the lower 64 bits range and the 'acked_backend_features' field don't need conversion, as the only negotiated feature there is in the low 64 bit range. Acked-by: Jason Wang Signed-off-by: Paolo Abeni --- include/uapi/linux/vhost.h | 7 +++++++ include/uapi/linux/vhost_types.h | 5 +++++ 2 files changed, 12 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index d4b3e2ae1314..d6ad01fbb8d2 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -235,4 +235,11 @@ */ #define VHOST_VDPA_GET_VRING_SIZE _IOWR(VHOST_VIRTIO, 0x82, \ struct vhost_vring_state) + +/* Extended features manipulation */ +#define VHOST_GET_FEATURES_ARRAY _IOR(VHOST_VIRTIO, 0x83, \ + struct vhost_features_array) +#define VHOST_SET_FEATURES_ARRAY _IOW(VHOST_VIRTIO, 0x83, \ + struct vhost_features_array) + #endif diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index d7656908f730..1c39cc5f5a31 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -110,6 +110,11 @@ struct vhost_msg_v2 { }; }; +struct vhost_features_array { + __u64 count; /* number of entries present in features array */ + __u64 features[] __counted_by(count); +}; + struct vhost_memory_region { __u64 guest_phys_addr; __u64 memory_size; /* bytes */ -- cgit From a2fb4bc4e2a6a031683910d85b278c1d25ae5420 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 8 Jul 2025 17:54:59 +0200 Subject: net: implement virtio helpers to handle UDP GSO tunneling. The virtio specification are introducing support for GSO over UDP tunnel. This patch brings in the needed defines and the additional virtio hdr parsing/building helpers. The UDP tunnel support uses additional fields in the virtio hdr, and such fields location can change depending on other negotiated features - specifically VIRTIO_NET_F_HASH_REPORT. Try to be as conservative as possible with the new field validation. Existing implementation for plain GSO offloads allow for invalid/ self-contradictory values of such fields. With GSO over UDP tunnel we can be more strict, with no need to deal with legacy implementation. Since the checksum-related field validation is asymmetric in the driver and in the device, introduce a separate helper to implement the new checks (to be used only on the driver side). Note that while the feature space exceeds the 64-bit boundaries, the guest offload space is fixed by the specification of the VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET command to a 64-bit size. Prior to the UDP tunnel GSO support, each guest offload bit corresponded to the feature bit with the same value and vice versa. Due to the limited 'guest offload' space, relevant features in the high 64 bits are 'mapped' to free bits in the lower range. That is simpler than defining a new command (and associated features) to exchange an extended guest offloads set. As a consequence, the uAPIs also specify the mapped guest offload value corresponding to the UDP tunnel GSO features. Acked-by: Jason Wang Signed-off-by: Paolo Abeni -- v4 -> v5: - avoid lines above 80 chars v3 -> v4: - fixed offset for UDP GSO tunnel, update accordingly the helpers - tried to clarified vlan_hlen semantic - virtio_net_chk_data_valid() -> virtio_net_handle_csum_offload() v2 -> v3: - add definitions for possible vnet hdr layouts with tunnel support v1 -> v2: - 'relay' -> 'rely' typo - less unclear comment WRT enforced inner GSO checks - inner header fields are allowed only with 'modern' virtio, thus are always le - clarified in the commit message the need for 'mapped features' defines - assume little_endian is true when UDP GSO is enabled. - fix inner proto type value --- include/uapi/linux/virtio_net.h | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h index 963540deae66..8bf27ab8bcb4 100644 --- a/include/uapi/linux/virtio_net.h +++ b/include/uapi/linux/virtio_net.h @@ -70,6 +70,28 @@ * with the same MAC. */ #define VIRTIO_NET_F_SPEED_DUPLEX 63 /* Device set linkspeed and duplex */ +#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO 65 /* Driver can receive + * GSO-over-UDP-tunnel packets + */ +#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM 66 /* Driver handles + * GSO-over-UDP-tunnel + * packets with partial csum + * for the outer header + */ +#define VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO 67 /* Device can receive + * GSO-over-UDP-tunnel packets + */ +#define VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM 68 /* Device handles + * GSO-over-UDP-tunnel + * packets with partial csum + * for the outer header + */ + +/* Offloads bits corresponding to VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO{,_CSUM} + * features + */ +#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_MAPPED 46 +#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM_MAPPED 47 #ifndef VIRTIO_NET_NO_LEGACY #define VIRTIO_NET_F_GSO 6 /* Host handles pkts w/ any GSO type */ @@ -131,12 +153,17 @@ struct virtio_net_hdr_v1 { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start, csum_offset */ #define VIRTIO_NET_HDR_F_DATA_VALID 2 /* Csum is valid */ #define VIRTIO_NET_HDR_F_RSC_INFO 4 /* rsc info in csum_ fields */ +#define VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM 8 /* UDP tunnel csum offload */ __u8 flags; #define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */ #define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ #define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */ #define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */ #define VIRTIO_NET_HDR_GSO_UDP_L4 5 /* GSO frame, IPv4& IPv6 UDP (USO) */ +#define VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4 0x20 /* UDPv4 tunnel present */ +#define VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6 0x40 /* UDPv6 tunnel present */ +#define VIRTIO_NET_HDR_GSO_UDP_TUNNEL (VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4 | \ + VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6) #define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */ __u8 gso_type; __virtio16 hdr_len; /* Ethernet + IP + tcp/udp hdrs */ @@ -181,6 +208,12 @@ struct virtio_net_hdr_v1_hash { __le16 padding; }; +struct virtio_net_hdr_v1_hash_tunnel { + struct virtio_net_hdr_v1_hash hash_hdr; + __le16 outer_th_offset; + __le16 inner_nh_offset; +}; + #ifndef VIRTIO_NET_NO_LEGACY /* This header comes first in the scatter-gather list. * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated, it must -- cgit From 288f30435132d2f9e7a29ec9b9745a4f9dc7fd37 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 8 Jul 2025 17:55:30 +0200 Subject: tun: enable gso over UDP tunnel support. Add new tun features to represent the newly introduced virtio GSO over UDP tunnel offload. Allows detection and selection of such features via the existing TUNSETOFFLOAD ioctl and compute the expected virtio header size and tunnel header offset using the current netdev features, so that we can plug almost seamless the newly introduced virtio helpers to serialize the extended virtio header. Acked-by: Jason Wang Signed-off-by: Paolo Abeni --- v6 -> v7: - rebased v4 -> v5: - encapsulate the guest feature guessing in a tun helper - dropped irrelevant check on xdp buff headroom - do not remove unrelated black line - avoid line len > 80 char v3 -> v4: - virtio tnl-related fields are at fixed offset, cleanup the code accordingly. - use netdev features instead of flags bit to check for the configured offload - drop packet in case of enabled features/configured hdr size mismatch v2 -> v3: - cleaned-up uAPI comments - use explicit struct layout instead of raw buf. --- include/uapi/linux/if_tun.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 287cdc81c939..79d53c7a1ebd 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -93,6 +93,15 @@ #define TUN_F_USO4 0x20 /* I can handle USO for IPv4 packets */ #define TUN_F_USO6 0x40 /* I can handle USO for IPv6 packets */ +/* I can handle TSO/USO for UDP tunneled packets */ +#define TUN_F_UDP_TUNNEL_GSO 0x080 + +/* + * I can handle TSO/USO for UDP tunneled packets requiring csum offload for + * the outer header + */ +#define TUN_F_UDP_TUNNEL_GSO_CSUM 0x100 + /* Protocol info prepended to the packets (when IFF_NO_PI is not set) */ #define TUN_PKT_STRIP 0x0001 struct tun_pi { -- cgit From 45e359be1ce88fb22e61fa3aa23b2e450a6cae03 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 5 Jul 2025 00:01:38 +0800 Subject: net: xsk: introduce XDP_MAX_TX_SKB_BUDGET setsockopt This patch provides a setsockopt method to let applications leverage to adjust how many descs to be handled at most in one send syscall. It mitigates the situation where the default value (32) that is too small leads to higher frequency of triggering send syscall. Considering the prosperity/complexity the applications have, there is no absolutely ideal suggestion fitting all cases. So keep 32 as its default value like before. The patch does the following things: - Add XDP_MAX_TX_SKB_BUDGET socket option. - Set max_tx_budget to 32 by default in the initialization phase as a per-socket granular control. - Set the range of max_tx_budget as [32, xs->tx->nentries]. The idea behind this comes out of real workloads in production. We use a user-level stack with xsk support to accelerate sending packets and minimize triggering syscalls. When the packets are aggregated, it's not hard to hit the upper bound (namely, 32). The moment user-space stack fetches the -EAGAIN error number passed from sendto(), it will loop to try again until all the expected descs from tx ring are sent out to the driver. Enlarging the XDP_MAX_TX_SKB_BUDGET value contributes to less frequency of sendto() and higher throughput/PPS. Here is what I did in production, along with some numbers as follows: For one application I saw lately, I suggested using 128 as max_tx_budget because I saw two limitations without changing any default configuration: 1) XDP_MAX_TX_SKB_BUDGET, 2) socket sndbuf which is 212992 decided by net.core.wmem_default. As to XDP_MAX_TX_SKB_BUDGET, the scenario behind this was I counted how many descs are transmitted to the driver at one time of sendto() based on [1] patch and then I calculated the possibility of hitting the upper bound. Finally I chose 128 as a suitable value because 1) it covers most of the cases, 2) a higher number would not bring evident results. After twisting the parameters, a stable improvement of around 4% for both PPS and throughput and less resources consumption were found to be observed by strace -c -p xxx: 1) %time was decreased by 7.8% 2) error counter was decreased from 18367 to 572 [1]: https://lore.kernel.org/all/20250619093641.70700-1-kerneljasonxing@gmail.com/ Signed-off-by: Jason Xing Acked-by: Maciej Fijalkowski Link: https://patch.msgid.link/20250704160138.48677-1-kerneljasonxing@gmail.com Signed-off-by: Paolo Abeni --- include/uapi/linux/if_xdp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 44f2bb93e7e6..23a062781468 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -79,6 +79,7 @@ struct xdp_mmap_offsets { #define XDP_UMEM_COMPLETION_RING 6 #define XDP_STATISTICS 7 #define XDP_OPTIONS 8 +#define XDP_MAX_TX_SKB_BUDGET 9 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ -- cgit From d7974697de4d6fa1a1ed9ca43616a8500046f25a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Jul 2025 15:06:38 -0700 Subject: ethtool: mark ETHER_FLOW as usable for Rx hash Looks like some drivers (ena, enetc, fbnic.. there's probably more) consider ETHER_FLOW to be legitimate target for flow hashing. I'm not sure how intentional that is from the uAPI perspective vs just an effect of ethtool IOCTL doing minimal input validation. But Netlink will do strict validation, so we need to decide whether we allow this use case or not. I don't see a strong reason against it, and rejecting it would potentially regress a number of drivers. So update the comments and flow_type_hashable(). Link: https://patch.msgid.link/20250708220640.2738464-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 707c1844010c..9e9afdd1238a 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -2314,7 +2314,7 @@ enum { IPV6_USER_FLOW = 0x0e, /* spec only (usr_ip6_spec; nfc only) */ IPV4_FLOW = 0x10, /* hash only */ IPV6_FLOW = 0x11, /* hash only */ - ETHER_FLOW = 0x12, /* spec only (ether_spec) */ + ETHER_FLOW = 0x12, /* hash or spec (ether_spec) */ /* Used for GTP-U IPv4 and IPv6. * The format of GTP packets only includes @@ -2371,7 +2371,7 @@ enum { /* Flag to enable RSS spreading of traffic matching rule (nfc only) */ #define FLOW_RSS 0x20000000 -/* L3-L4 network traffic flow hash options */ +/* L2-L4 network traffic flow hash options */ #define RXH_L2DA (1 << 1) #define RXH_VLAN (1 << 2) #define RXH_L3_PROTO (1 << 3) -- cgit From 178331743ca860561f60d04a7797a2fce13f0784 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 8 Jul 2025 15:06:39 -0700 Subject: ethtool: rss: report which fields are configured for hashing Implement ETHTOOL_GRXFH over Netlink. The number of flow types is reasonable (around 20) so report all of them at once for simplicity. Do not maintain the flow ID mapping with ioctl at the uAPI level. This gives us a chance to clean up the confusion that come from RxNFC vs RxFH (flow direction vs hashing) in the ioctl. Try to align with the names used in ethtool CLI, they seem to have stood the test of time just fine. One annoyance is that we still call L4 ports the weird names, but I guess they also apply to IPSec (where they cover the SPI) so it is what it is. $ ynl --family ethtool --dump rss-get { "header": { "dev-index": 1, "dev-name": "enp1s0" }, "hfunc": 1, "hkey": b"...", "indir": [0, 1, ...], "flow-hash": { "ether": {"l2da"}, "ah-esp4": {"ip-src", "ip-dst"}, "ah-esp6": {"ip-src", "ip-dst"}, "ah4": {"ip-src", "ip-dst"}, "ah6": {"ip-src", "ip-dst"}, "esp4": {"ip-src", "ip-dst"}, "esp6": {"ip-src", "ip-dst"}, "ip4": {"ip-src", "ip-dst"}, "ip6": {"ip-src", "ip-dst"}, "sctp4": {"ip-src", "ip-dst"}, "sctp6": {"ip-src", "ip-dst"}, "udp4": {"ip-src", "ip-dst"}, "udp6": {"ip-src", "ip-dst"} "tcp4": {"l4-b-0-1", "l4-b-2-3", "ip-src", "ip-dst"}, "tcp6": {"l4-b-0-1", "l4-b-2-3", "ip-src", "ip-dst"}, }, } Link: https://patch.msgid.link/20250708220640.2738464-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink_generated.h | 34 ++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 8f30ffa1cd14..96027e26ffba 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -678,6 +678,39 @@ enum { ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1) }; +enum { + ETHTOOL_A_FLOW_ETHER = 1, + ETHTOOL_A_FLOW_IP4, + ETHTOOL_A_FLOW_IP6, + ETHTOOL_A_FLOW_TCP4, + ETHTOOL_A_FLOW_TCP6, + ETHTOOL_A_FLOW_UDP4, + ETHTOOL_A_FLOW_UDP6, + ETHTOOL_A_FLOW_SCTP4, + ETHTOOL_A_FLOW_SCTP6, + ETHTOOL_A_FLOW_AH4, + ETHTOOL_A_FLOW_AH6, + ETHTOOL_A_FLOW_ESP4, + ETHTOOL_A_FLOW_ESP6, + ETHTOOL_A_FLOW_AH_ESP4, + ETHTOOL_A_FLOW_AH_ESP6, + ETHTOOL_A_FLOW_GTPU4, + ETHTOOL_A_FLOW_GTPU6, + ETHTOOL_A_FLOW_GTPC4, + ETHTOOL_A_FLOW_GTPC6, + ETHTOOL_A_FLOW_GTPC_TEID4, + ETHTOOL_A_FLOW_GTPC_TEID6, + ETHTOOL_A_FLOW_GTPU_EH4, + ETHTOOL_A_FLOW_GTPU_EH6, + ETHTOOL_A_FLOW_GTPU_UL4, + ETHTOOL_A_FLOW_GTPU_UL6, + ETHTOOL_A_FLOW_GTPU_DL4, + ETHTOOL_A_FLOW_GTPU_DL6, + + __ETHTOOL_A_FLOW_CNT, + ETHTOOL_A_FLOW_MAX = (__ETHTOOL_A_FLOW_CNT - 1) +}; + enum { ETHTOOL_A_RSS_UNSPEC, ETHTOOL_A_RSS_HEADER, @@ -687,6 +720,7 @@ enum { ETHTOOL_A_RSS_HKEY, ETHTOOL_A_RSS_INPUT_XFRM, ETHTOOL_A_RSS_START_CONTEXT, + ETHTOOL_A_RSS_FLOW_HASH, __ETHTOOL_A_RSS_CNT, ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1) -- cgit From 2677010e7793451c20d895c477c4dc76f6e6a10e Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Thu, 10 Jul 2025 21:12:03 +0000 Subject: Add support to set NAPI threaded for individual NAPI A net device has a threaded sysctl that can be used to enable threaded NAPI polling on all of the NAPI contexts under that device. Allow enabling threaded NAPI polling at individual NAPI level using netlink. Extend the netlink operation `napi-set` and allow setting the threaded attribute of a NAPI. This will enable the threaded polling on a NAPI context. Add a test in `nl_netdev.py` that verifies various cases of threaded NAPI being set at NAPI and at device level. Tested ./tools/testing/selftests/net/nl_netdev.py TAP version 13 1..7 ok 1 nl_netdev.empty_check ok 2 nl_netdev.lo_check ok 3 nl_netdev.page_pool_check ok 4 nl_netdev.napi_list_check ok 5 nl_netdev.dev_set_threaded ok 6 nl_netdev.napi_set_threaded ok 7 nl_netdev.nsim_rxq_reset_down # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Samiullah Khawaja Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250710211203.3979655-1-skhawaja@google.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/netdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 7eb9571786b8..1f3719a9a0eb 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -134,6 +134,7 @@ enum { NETDEV_A_NAPI_DEFER_HARD_IRQS, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, + NETDEV_A_NAPI_THREADED, __NETDEV_A_NAPI_MAX, NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) -- cgit From 6c758062c64dfbd61862801fbde4e0702f4f3a23 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 11 Jul 2025 11:40:00 +0000 Subject: tcp: add LINUX_MIB_BEYOND_WINDOW Add a new SNMP MIB : LINUX_MIB_BEYOND_WINDOW Incremented when an incoming packet is received beyond the receiver window. nstat -az | grep TcpExtBeyondWindow Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250711114006.480026-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/snmp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 1d234d7e1892..49f5640092a0 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -186,6 +186,7 @@ enum LINUX_MIB_TIMEWAITKILLED, /* TimeWaitKilled */ LINUX_MIB_PAWSACTIVEREJECTED, /* PAWSActiveRejected */ LINUX_MIB_PAWSESTABREJECTED, /* PAWSEstabRejected */ + LINUX_MIB_BEYOND_WINDOW, /* BeyondWindow */ LINUX_MIB_TSECRREJECTED, /* TSEcrRejected */ LINUX_MIB_PAWS_OLD_ACK, /* PAWSOldAck */ LINUX_MIB_PAWS_TW_REJECTED, /* PAWSTimewait */ -- cgit From c0ae03588bbb95378758fe80e7436a9b4cfc71f6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 15 Jul 2025 17:03:21 -0700 Subject: ethtool: rss: initial RSS_SET (indirection table handling) Add initial support for RSS_SET, for now only operations on the indirection table are supported. Unlike the ioctl don't check if at least one parameter is being changed. This is how other ethtool-nl ops behave, so pick the ethtool-nl consistency vs copying ioctl behavior. There are two special cases here: 1) resetting the table to defaults; 2) support for tables of different size. For (1) I use an empty Netlink attribute (array of size 0). (2) may require some background. AFAICT a lot of modern devices allow allocating RSS tables of different sizes. mlx5 can upsize its tables, bnxt has some "table size calculation", and Intel folks asked about RSS table sizing in context of resource allocation in the past. The ethtool IOCTL API has a concept of table size, but right now the user is expected to provide a table exactly the size the device requests. Some drivers may change the table size at runtime (in response to queue count changes) but the user is not in control of this. What's not great is that all RSS contexts share the same table size. For example a device with 128 queues enabled, 16 RSS contexts 8 queues in each will likely have 256 entry tables for each of the 16 contexts, while 32 would be more than enough given each context only has 8 queues. To address this the Netlink API should avoid enforcing table size at the uAPI level, and should allow the user to express the min table size they expect. To fully solve (2) we will need more driver plumbing but at the uAPI level this patch allows the user to specify a table size smaller than what the device advertises. The device table size must be a multiple of the user requested table size. We then replicate the user-provided table to fill the full device size table. This addresses the "allow the user to express the min table size" objective, while not enforcing any fixed size. From Netlink perspective .get_rxfh_indir_size() is now de facto the "max" table size supported by the device. We may choose to support table replication in ethtool, too, when we actually plumb this thru the device APIs. Initially I was considering moving full pattern generation to the kernel (which queues to use, at which frequency and what min sequence length). I don't think this complexity would buy us much and most if not all devices have pow-2 table sizes, which simplifies the replication a lot. Reviewed-by: Gal Pressman Link: https://patch.msgid.link/20250716000331.1378807-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink_generated.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 96027e26ffba..130bdf5c3516 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -840,6 +840,7 @@ enum { ETHTOOL_MSG_PHY_GET, ETHTOOL_MSG_TSCONFIG_GET, ETHTOOL_MSG_TSCONFIG_SET, + ETHTOOL_MSG_RSS_SET, __ETHTOOL_MSG_USER_CNT, ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1) -- cgit From 6624a0af82a6e3a4d3609264ef591a8fa3467139 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Thu, 17 Jul 2025 17:42:02 +1000 Subject: wifi: cfg80211: support configuring an S1G short beaconing BSS S1G short beacons are an optional frame type used in an S1G BSS that contain a limited set of elements. While they are optional, they are a fundamental part of S1G that enables significant power saving. Expose 2 additional netlink attributes, NL80211_ATTR_S1G_LONG_BEACON_PERIOD which denotes the number of beacon intervals between each long beacon and NL80211_ATTR_S1G_SHORT_BEACON which is a nested attribute containing the short beacon tail and head. We split them as the long beacon period cannot be updated, and is only used when initialisng the interface, whereas the short beacon data can be used to both initialise and update the templates. This follows how things such as the beacon interval and DTIM period currently operate. During the initialisation path, we ensure we have the long beacon period if the short beacon data is being passed down, whereas the update path will simply update the template if its sent down. The short beacon data is validated using the same routines for regular beacons as they support correctly parsing the short beacon format while ensuring the frame is well-formed. Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250717074205.312577-2-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 39460334dafb..d1a14f2892d9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2915,6 +2915,19 @@ enum nl80211_commands { * applicable to that specific radio only. If the radio id is greater * thank the number of radios, error denoting invalid value is returned. * + * @NL80211_ATTR_S1G_LONG_BEACON_PERIOD: (u8) Integer attribute that represents + * the number of beacon intervals between each long beacon transmission + * for an S1G BSS with short beaconing enabled. This is a required + * attribute for initialising an S1G short beaconing BSS. When updating + * the short beacon data, this is not required. It has a minimum value of + * 2 (i.e 2 beacon intervals). + * + * @NL80211_ATTR_S1G_SHORT_BEACON: Nested attribute containing the short beacon + * head and tail used to set or update the short beacon templates. When + * bringing up a new interface, %NL80211_ATTR_S1G_LONG_BEACON_PERIOD is + * required alongside this attribute. Refer to + * @enum nl80211_s1g_short_beacon_attrs for the attribute definitions. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3474,6 +3487,9 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_RADIO_INDEX, + NL80211_ATTR_S1G_LONG_BEACON_PERIOD, + NL80211_ATTR_S1G_SHORT_BEACON, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -8148,4 +8164,27 @@ enum nl80211_wiphy_radio_freq_range { NL80211_WIPHY_RADIO_FREQ_ATTR_MAX = __NL80211_WIPHY_RADIO_FREQ_ATTR_LAST - 1, }; +/** + * enum nl80211_s1g_short_beacon_attrs - S1G short beacon data + * + * @__NL80211_S1G_SHORT_BEACON_ATTR_INVALID: Invalid + * + * @NL80211_S1G_SHORT_BEACON_ATTR_HEAD: Short beacon head (binary). + * @NL80211_S1G_SHORT_BEACON_ATTR_TAIL: Short beacon tail (binary). + * + * @__NL80211_S1G_SHORT_BEACON_ATTR_LAST: Internal + * @NL80211_S1G_SHORT_BEACON_ATTR_MAX: Highest attribute + */ +enum nl80211_s1g_short_beacon_attrs { + __NL80211_S1G_SHORT_BEACON_ATTR_INVALID, + + NL80211_S1G_SHORT_BEACON_ATTR_HEAD, + NL80211_S1G_SHORT_BEACON_ATTR_TAIL, + + /* keep last */ + __NL80211_S1G_SHORT_BEACON_ATTR_LAST, + NL80211_S1G_SHORT_BEACON_ATTR_MAX = + __NL80211_S1G_SHORT_BEACON_ATTR_LAST - 1 +}; + #endif /* __LINUX_NL80211_H */ -- cgit From a166ab7816c534973745b0fe7bce3c8cefc5426f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Jul 2025 16:43:41 -0700 Subject: ethtool: rss: support creating contexts via Netlink Support creating contexts via Netlink. Setting flow hashing fields on the new context is not supported at this stage, it can be added later. An empty indirection table is not supported. This is a carry over from the IOCTL interface where empty indirection table meant delete. We can repurpose empty indirection table in Netlink but for now to avoid confusion reject it using the policy. Support letting user choose the ID for the new context. This was not possible in IOCTL since the context ID field for the create action had to be set to the ETH_RXFH_CONTEXT_ALLOC magic value. Link: https://patch.msgid.link/20250717234343.2328602-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink_generated.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index 130bdf5c3516..dea77abd295f 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -841,6 +841,7 @@ enum { ETHTOOL_MSG_TSCONFIG_GET, ETHTOOL_MSG_TSCONFIG_SET, ETHTOOL_MSG_RSS_SET, + ETHTOOL_MSG_RSS_CREATE_ACT, __ETHTOOL_MSG_USER_CNT, ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1) @@ -898,6 +899,8 @@ enum { ETHTOOL_MSG_TSCONFIG_SET_REPLY, ETHTOOL_MSG_PSE_NTF, ETHTOOL_MSG_RSS_NTF, + ETHTOOL_MSG_RSS_CREATE_ACT_REPLY, + ETHTOOL_MSG_RSS_CREATE_NTF, __ETHTOOL_MSG_KERNEL_CNT, ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1) -- cgit From fbe09277fa6324b50cc4eedb4d99498cf7dad897 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Jul 2025 16:43:42 -0700 Subject: ethtool: rss: support removing contexts via Netlink Implement removing additional RSS contexts via Netlink. Technically it'd be possible to shoehorn the delete operation into ethnl_request_ops-compatible handler. The code ends up longer than open coded version, and I think we'll need a custom way of sending notifications at some stage (if we allow tying the context lifetime to the netlink socket, in the future). Link: https://patch.msgid.link/20250717234343.2328602-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/uapi/linux/ethtool_netlink_generated.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h index dea77abd295f..e3b8813465d7 100644 --- a/include/uapi/linux/ethtool_netlink_generated.h +++ b/include/uapi/linux/ethtool_netlink_generated.h @@ -842,6 +842,7 @@ enum { ETHTOOL_MSG_TSCONFIG_SET, ETHTOOL_MSG_RSS_SET, ETHTOOL_MSG_RSS_CREATE_ACT, + ETHTOOL_MSG_RSS_DELETE_ACT, __ETHTOOL_MSG_USER_CNT, ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1) @@ -901,6 +902,7 @@ enum { ETHTOOL_MSG_RSS_NTF, ETHTOOL_MSG_RSS_CREATE_ACT_REPLY, ETHTOOL_MSG_RSS_CREATE_NTF, + ETHTOOL_MSG_RSS_DELETE_NTF, __ETHTOOL_MSG_KERNEL_CNT, ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1) -- cgit From 1bbdb81a98363fd5cd0c2ac16ad5346bdf814dff Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Tue, 22 Jul 2025 12:13:29 +0300 Subject: devlink: Fix excessive stack usage in rate TC bandwidth parsing The devlink_nl_rate_tc_bw_parse function uses a large stack array for devlink attributes, which triggers a warning about excessive stack usage: net/devlink/rate.c: In function 'devlink_nl_rate_tc_bw_parse': net/devlink/rate.c:382:1: error: the frame size of 1648 bytes is larger than 1536 bytes [-Werror=frame-larger-than=] Introduce a separate attribute set specifically for rate TC bandwidth parsing that only contains the two attributes actually used: index and bandwidth. This reduces the stack array from DEVLINK_ATTR_MAX entries to just 2 entries, solving the stack usage issue. Update devlink selftest to use the new 'index' and 'bw' attribute names consistent with the YAML spec. Example usage with ynl with the new spec: ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \ --do rate-set --json '{ "bus-name": "pci", "dev-name": "0000:08:00.0", "port-index": 1, "rate-tc-bws": [ {"index": 0, "bw": 50}, {"index": 1, "bw": 50}, {"index": 2, "bw": 0}, {"index": 3, "bw": 0}, {"index": 4, "bw": 0}, {"index": 5, "bw": 0}, {"index": 6, "bw": 0}, {"index": 7, "bw": 0} ] }' ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \ --do rate-get --json '{ "bus-name": "pci", "dev-name": "0000:08:00.0", "port-index": 1 }' output for rate-get: {'bus-name': 'pci', 'dev-name': '0000:08:00.0', 'port-index': 1, 'rate-tc-bws': [{'bw': 50, 'index': 0}, {'bw': 50, 'index': 1}, {'bw': 0, 'index': 2}, {'bw': 0, 'index': 3}, {'bw': 0, 'index': 4}, {'bw': 0, 'index': 5}, {'bw': 0, 'index': 6}, {'bw': 0, 'index': 7}], 'rate-tx-max': 0, 'rate-tx-priority': 0, 'rate-tx-share': 0, 'rate-tx-weight': 0, 'rate-type': 'leaf'} Fixes: 566e8f108fc7 ("devlink: Extend devlink rate API with traffic classes bandwidth management") Reported-by: Arnd Bergmann Closes: https://lore.kernel.org/netdev/20250708160652.1810573-1-arnd@kernel.org/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202507171943.W7DJcs6Y-lkp@intel.com/ Suggested-by: Jakub Kicinski Signed-off-by: Carolina Jubran Tested-by: Carolina Jubran Signed-off-by: Tariq Toukan Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/1753175609-330621-1-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/devlink.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index e72bcc239afd..9fcb25a0f447 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -635,8 +635,6 @@ enum devlink_attr { DEVLINK_ATTR_REGION_DIRECT, /* flag */ DEVLINK_ATTR_RATE_TC_BWS, /* nested */ - DEVLINK_ATTR_RATE_TC_INDEX, /* u8 */ - DEVLINK_ATTR_RATE_TC_BW, /* u32 */ /* Add new attributes above here, update the spec in * Documentation/netlink/specs/devlink.yaml and re-generate @@ -647,6 +645,15 @@ enum devlink_attr { DEVLINK_ATTR_MAX = __DEVLINK_ATTR_MAX - 1 }; +enum devlink_rate_tc_attr { + DEVLINK_RATE_TC_ATTR_UNSPEC, + DEVLINK_RATE_TC_ATTR_INDEX, /* u8 */ + DEVLINK_RATE_TC_ATTR_BW, /* u32 */ + + __DEVLINK_RATE_TC_ATTR_MAX, + DEVLINK_RATE_TC_ATTR_MAX = __DEVLINK_RATE_TC_ATTR_MAX - 1 +}; + /* Mapping between internal resource described by the field and system * structure */ -- cgit From 320d031ad6e4d67e8e1ab08ac71efda02bc85683 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 22 Jul 2025 11:59:10 +0200 Subject: sched: Struct definition and parsing of dualpi2 qdisc DualPI2 is the reference implementation of IETF RFC9332 DualQ Coupled AQM (https://datatracker.ietf.org/doc/html/rfc9332) providing two queues called low latency (L-queue) and classic (C-queue). By default, it enqueues non-ECN and ECT(0) packets into the C-queue and ECT(1) and CE packets into the low latency queue (L-queue), as per IETF RFC9332 spec. This patch defines the dualpi2 Qdisc structure and parsing, and the following two patches include dumping and enqueue/dequeue for the DualPI2. Signed-off-by: Chia-Yu Chang Link: https://patch.msgid.link/20250722095915.24485-2-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_sched.h | 53 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 3e41349f3fa2..75d685ea8368 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1211,4 +1211,57 @@ enum { #define TCA_ETS_MAX (__TCA_ETS_MAX - 1) +/* DUALPI2 */ +enum tc_dualpi2_drop_overload { + TC_DUALPI2_DROP_OVERLOAD_OVERFLOW = 0, + TC_DUALPI2_DROP_OVERLOAD_DROP = 1, + __TCA_DUALPI2_DROP_OVERLOAD_MAX, +}; +#define TCA_DUALPI2_DROP_OVERLOAD_MAX (__TCA_DUALPI2_DROP_OVERLOAD_MAX - 1) + +enum tc_dualpi2_drop_early { + TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE = 0, + TC_DUALPI2_DROP_EARLY_DROP_ENQUEUE = 1, + __TCA_DUALPI2_DROP_EARLY_MAX, +}; +#define TCA_DUALPI2_DROP_EARLY_MAX (__TCA_DUALPI2_DROP_EARLY_MAX - 1) + +enum tc_dualpi2_ecn_mask { + TC_DUALPI2_ECN_MASK_L4S_ECT = 1, + TC_DUALPI2_ECN_MASK_CLA_ECT = 2, + TC_DUALPI2_ECN_MASK_ANY_ECT = 3, + __TCA_DUALPI2_ECN_MASK_MAX, +}; +#define TCA_DUALPI2_ECN_MASK_MAX (__TCA_DUALPI2_ECN_MASK_MAX - 1) + +enum tc_dualpi2_split_gso { + TC_DUALPI2_SPLIT_GSO_NO_SPLIT_GSO = 0, + TC_DUALPI2_SPLIT_GSO_SPLIT_GSO = 1, + __TCA_DUALPI2_SPLIT_GSO_MAX, +}; +#define TCA_DUALPI2_SPLIT_GSO_MAX (__TCA_DUALPI2_SPLIT_GSO_MAX - 1) + +enum { + TCA_DUALPI2_UNSPEC, + TCA_DUALPI2_LIMIT, /* Packets */ + TCA_DUALPI2_MEMORY_LIMIT, /* Bytes */ + TCA_DUALPI2_TARGET, /* us */ + TCA_DUALPI2_TUPDATE, /* us */ + TCA_DUALPI2_ALPHA, /* Hz scaled up by 256 */ + TCA_DUALPI2_BETA, /* Hz scaled up by 256 */ + TCA_DUALPI2_STEP_THRESH_PKTS, /* Step threshold in packets */ + TCA_DUALPI2_STEP_THRESH_US, /* Step threshold in microseconds */ + TCA_DUALPI2_MIN_QLEN_STEP, /* Minimum qlen to apply STEP_THRESH */ + TCA_DUALPI2_COUPLING, /* Coupling factor between queues */ + TCA_DUALPI2_DROP_OVERLOAD, /* Whether to drop on overload */ + TCA_DUALPI2_DROP_EARLY, /* Whether to drop on enqueue */ + TCA_DUALPI2_C_PROTECTION, /* Percentage */ + TCA_DUALPI2_ECN_MASK, /* L4S queue classification mask */ + TCA_DUALPI2_SPLIT_GSO, /* Split GSO packets at enqueue */ + TCA_DUALPI2_PAD, + __TCA_DUALPI2_MAX +}; + +#define TCA_DUALPI2_MAX (__TCA_DUALPI2_MAX - 1) + #endif -- cgit From d4de8bffbef4a7e4ad14b9fd2ff8e2d0e06b3fa5 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 22 Jul 2025 11:59:11 +0200 Subject: sched: Dump configuration and statistics of dualpi2 qdisc The configuration and statistics dump of the DualPI2 Qdisc provides information related to both queues, such as packet numbers and queuing delays in the L-queue and C-queue, as well as general information such as probability value, WRR credits, memory usage, packet marking counters, max queue size, etc. The following patch includes enqueue/dequeue for DualPI2. Signed-off-by: Chia-Yu Chang Link: https://patch.msgid.link/20250722095915.24485-3-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/pkt_sched.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 75d685ea8368..c2da76e78bad 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1264,4 +1264,19 @@ enum { #define TCA_DUALPI2_MAX (__TCA_DUALPI2_MAX - 1) +struct tc_dualpi2_xstats { + __u32 prob; /* current probability */ + __u32 delay_c; /* current delay in C queue */ + __u32 delay_l; /* current delay in L queue */ + __u32 packets_in_c; /* number of packets enqueued in C queue */ + __u32 packets_in_l; /* number of packets enqueued in L queue */ + __u32 maxq; /* maximum queue size */ + __u32 ecn_mark; /* packets marked with ecn*/ + __u32 step_marks; /* ECN marks due to the step AQM */ + __s32 credit; /* current c_protection credit */ + __u32 memory_used; /* Memory used by both queues */ + __u32 max_memory_used; /* Maximum used memory */ + __u32 memory_limit; /* Memory limit of both queues */ +}; + #endif -- cgit From 8e7583a4f65f3dbf3e8deb4e60f3679c276bef62 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Wed, 23 Jul 2025 01:30:31 +0000 Subject: net: define an enum for the napi threaded state Instead of using '0' and '1' for napi threaded state use an enum with 'disabled' and 'enabled' states. Tested: ./tools/testing/selftests/net/nl_netdev.py TAP version 13 1..7 ok 1 nl_netdev.empty_check ok 2 nl_netdev.lo_check ok 3 nl_netdev.page_pool_check ok 4 nl_netdev.napi_list_check ok 5 nl_netdev.dev_set_threaded ok 6 nl_netdev.napi_set_threaded ok 7 nl_netdev.nsim_rxq_reset_down # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0 Signed-off-by: Samiullah Khawaja Link: https://patch.msgid.link/20250723013031.2911384-4-skhawaja@google.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/netdev.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h index 1f3719a9a0eb..48eb49aa03d4 100644 --- a/include/uapi/linux/netdev.h +++ b/include/uapi/linux/netdev.h @@ -77,6 +77,11 @@ enum netdev_qstats_scope { NETDEV_QSTATS_SCOPE_QUEUE = 1, }; +enum netdev_napi_threaded { + NETDEV_NAPI_THREADED_DISABLED, + NETDEV_NAPI_THREADED_ENABLED, +}; + enum { NETDEV_A_DEV_IFINDEX = 1, NETDEV_A_DEV_PAD, -- cgit From bc8c43adfdc57c8253884fc1853cb6679cd5953d Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 8 Jul 2025 15:04:02 +0200 Subject: netfilter: nfnetlink_hook: Dump flowtable info Introduce NFNL_HOOK_TYPE_NFT_FLOWTABLE to distinguish flowtable hooks from base chain ones. Nested attributes are shared with the old NFTABLES hook info type since they fit apart from their misleading name. Old nftables in user space will ignore this new hook type and thus continue to print flowtable hooks just like before, e.g.: | family netdev { | hook ingress device test0 { | 0000000000 nf_flow_offload_ip_hook [nf_flow_table] | } | } With this patch in place and support for the new hook info type, output becomes more useful: | family netdev { | hook ingress device test0 { | 0000000000 flowtable ip mytable myft [nf_flow_table] | } | } Suggested-by: Florian Westphal Signed-off-by: Phil Sutter Reviewed-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nfnetlink_hook.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nfnetlink_hook.h b/include/uapi/linux/netfilter/nfnetlink_hook.h index 84a561a74b98..1a2c4d6424b5 100644 --- a/include/uapi/linux/netfilter/nfnetlink_hook.h +++ b/include/uapi/linux/netfilter/nfnetlink_hook.h @@ -61,10 +61,12 @@ enum nfnl_hook_chain_desc_attributes { * * @NFNL_HOOK_TYPE_NFTABLES: nf_tables base chain * @NFNL_HOOK_TYPE_BPF: bpf program + * @NFNL_HOOK_TYPE_NFT_FLOWTABLE: nf_tables flowtable */ enum nfnl_hook_chaintype { NFNL_HOOK_TYPE_NFTABLES = 0x1, NFNL_HOOK_TYPE_BPF, + NFNL_HOOK_TYPE_NFT_FLOWTABLE, }; /** -- cgit From f24987ef6959a7efaf79bffd265522c3df18d431 Mon Sep 17 00:00:00 2001 From: Gabriel Goller Date: Tue, 22 Jul 2025 10:18:45 +0200 Subject: ipv6: add `force_forwarding` sysctl to enable per-interface forwarding It is currently impossible to enable ipv6 forwarding on a per-interface basis like in ipv4. To enable forwarding on an ipv6 interface we need to enable it on all interfaces and disable it on the other interfaces using a netfilter rule. This is especially cumbersome if you have lots of interfaces and only want to enable forwarding on a few. According to the sysctl docs [0] the `net.ipv6.conf.all.forwarding` enables forwarding for all interfaces, while the interface-specific `net.ipv6.conf..forwarding` configures the interface Host/Router configuration. Introduce a new sysctl flag `force_forwarding`, which can be set on every interface. The ip6_forwarding function will then check if the global forwarding flag OR the force_forwarding flag is active and forward the packet. To preserve backwards-compatibility reset the flag (on all interfaces) to 0 if the net.ipv6.conf.all.forwarding flag is set to 0. Add a short selftest that checks if a packet gets forwarded with and without `force_forwarding`. [0]: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt Acked-by: Nicolas Dichtel Signed-off-by: Gabriel Goller Link: https://patch.msgid.link/20250722081847.132632-1-g.goller@proxmox.com Signed-off-by: Jakub Kicinski --- include/uapi/linux/ipv6.h | 1 + include/uapi/linux/netconf.h | 1 + include/uapi/linux/sysctl.h | 1 + 3 files changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index cf592d7b630f..d4d3ae774b26 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -199,6 +199,7 @@ enum { DEVCONF_NDISC_EVICT_NOCARRIER, DEVCONF_ACCEPT_UNTRACKED_NA, DEVCONF_ACCEPT_RA_MIN_LFT, + DEVCONF_FORCE_FORWARDING, DEVCONF_MAX }; diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h index fac4edd55379..1c8c84d65ae3 100644 --- a/include/uapi/linux/netconf.h +++ b/include/uapi/linux/netconf.h @@ -19,6 +19,7 @@ enum { NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, NETCONFA_INPUT, NETCONFA_BC_FORWARDING, + NETCONFA_FORCE_FORWARDING, __NETCONFA_MAX }; #define NETCONFA_MAX (__NETCONFA_MAX - 1) diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 8981f00204db..63d1464cb71c 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -573,6 +573,7 @@ enum { NET_IPV6_ACCEPT_RA_FROM_LOCAL=26, NET_IPV6_ACCEPT_RA_RT_INFO_MIN_PLEN=27, NET_IPV6_RA_DEFRTR_METRIC=28, + NET_IPV6_FORCE_FORWARDING=29, __NET_IPV6_MAX }; -- cgit