From 31557b3487b349464daf42bc4366153743c1e727 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 9 Jun 2025 07:39:33 -0700
Subject: uapi: in6: restore visibility of most IPv6 socket options
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A decade ago commit 6d08acd2d32e ("in6: fix conflict with glibc")
hid the definitions of IPV6 options, because GCC was complaining
about duplicates. The commit did not list the warnings seen, but
trying to recreate them now I think they are (building iproute2):

In file included from ./include/uapi/rdma/rdma_user_cm.h:39,
                 from rdma.h:16,
                 from res.h:9,
                 from res-ctx.c:7:
../include/uapi/linux/in6.h:171:9: warning: ‘IPV6_ADD_MEMBERSHIP’ redefined
  171 | #define IPV6_ADD_MEMBERSHIP     20
      |         ^~~~~~~~~~~~~~~~~~~
In file included from /usr/include/netinet/in.h:37,
                 from rdma.h:13:
/usr/include/bits/in.h:233:10: note: this is the location of the previous definition
  233 | # define IPV6_ADD_MEMBERSHIP    IPV6_JOIN_GROUP
      |          ^~~~~~~~~~~~~~~~~~~
../include/uapi/linux/in6.h:172:9: warning: ‘IPV6_DROP_MEMBERSHIP’ redefined
  172 | #define IPV6_DROP_MEMBERSHIP    21
      |         ^~~~~~~~~~~~~~~~~~~~
/usr/include/bits/in.h:234:10: note: this is the location of the previous definition
  234 | # define IPV6_DROP_MEMBERSHIP   IPV6_LEAVE_GROUP
      |          ^~~~~~~~~~~~~~~~~~~~

Compilers don't complain about redefinition if the defines
are identical, but here we have the kernel using the literal
value, and glibc using an indirection (defining to a name
of another define, with the same numerical value).

Problem is, the commit in question hid all the IPV6 socket
options, and glibc has a pretty sparse list. For instance
it lacks Flow Label related options. Willem called this out
in commit 3fb321fde22d ("selftests/net: ipv6 flowlabel"):

  /* uapi/glibc weirdness may leave this undefined */
  #ifndef IPV6_FLOWINFO
  #define IPV6_FLOWINFO 11
  #endif

More interestingly some applications (socat) use
a #ifdef IPV6_FLOWINFO to gate compilation of thier
rudimentary flow label support. (For added confusion
socat misspells it as IPV4_FLOWINFO in some places.)

Hide only the two defines we know glibc has a problem
with. If we discover more warnings we can hide more
but we should avoid covering the entire block of
defines for "IPV6 socket options".

Link: https://patch.msgid.link/20250609143933.1654417-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/in6.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h
index ff8d21f9e95b..5a47339ef7d7 100644
--- a/include/uapi/linux/in6.h
+++ b/include/uapi/linux/in6.h
@@ -152,7 +152,6 @@ struct in6_flowlabel_req {
 /*
  *	IPV6 socket options
  */
-#if __UAPI_DEF_IPV6_OPTIONS
 #define IPV6_ADDRFORM		1
 #define IPV6_2292PKTINFO	2
 #define IPV6_2292HOPOPTS	3
@@ -169,8 +168,10 @@ struct in6_flowlabel_req {
 #define IPV6_MULTICAST_IF	17
 #define IPV6_MULTICAST_HOPS	18
 #define IPV6_MULTICAST_LOOP	19
+#if __UAPI_DEF_IPV6_OPTIONS
 #define IPV6_ADD_MEMBERSHIP	20
 #define IPV6_DROP_MEMBERSHIP	21
+#endif
 #define IPV6_ROUTER_ALERT	22
 #define IPV6_MTU_DISCOVER	23
 #define IPV6_MTU		24
@@ -203,7 +204,6 @@ struct in6_flowlabel_req {
 #define IPV6_IPSEC_POLICY	34
 #define IPV6_XFRM_POLICY	35
 #define IPV6_HDRINCL		36
-#endif
 
 /*
  * Multicast:
-- 
cgit 


From c035e736038045b411cb368e63f07bc2f5dbc0e1 Mon Sep 17 00:00:00 2001
From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Date: Thu, 12 Jun 2025 17:28:33 +0200
Subject: dpll: add phase-offset-monitor feature to netlink spec

Add enum dpll_feature_state for control over features.

Add dpll device level attribute:
DPLL_A_PHASE_OFFSET_MONITOR - to allow control over a phase offset monitor
feature. Attribute is present and shall return current state of a feature
(enum dpll_feature_state), if the device driver provides such capability,
otherwie attribute shall not be present.

Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
Reviewed-by: Milena Olech <milena.olech@intel.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Acked-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>
Link: https://patch.msgid.link/20250612152835.1703397-2-arkadiusz.kubalewski@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/dpll.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h
index bf97d4b6d51f..349e1b3ca1ae 100644
--- a/include/uapi/linux/dpll.h
+++ b/include/uapi/linux/dpll.h
@@ -192,6 +192,17 @@ enum dpll_pin_capabilities {
 
 #define DPLL_PHASE_OFFSET_DIVIDER	1000
 
+/**
+ * enum dpll_feature_state - Allow control (enable/disable) and status checking
+ *   over features.
+ * @DPLL_FEATURE_STATE_DISABLE: feature shall be disabled
+ * @DPLL_FEATURE_STATE_ENABLE: feature shall be enabled
+ */
+enum dpll_feature_state {
+	DPLL_FEATURE_STATE_DISABLE,
+	DPLL_FEATURE_STATE_ENABLE,
+};
+
 enum dpll_a {
 	DPLL_A_ID = 1,
 	DPLL_A_MODULE_NAME,
@@ -204,6 +215,7 @@ enum dpll_a {
 	DPLL_A_TYPE,
 	DPLL_A_LOCK_STATUS_ERROR,
 	DPLL_A_CLOCK_QUALITY_LEVEL,
+	DPLL_A_PHASE_OFFSET_MONITOR,
 
 	__DPLL_A_MAX,
 	DPLL_A_MAX = (__DPLL_A_MAX - 1)
-- 
cgit 


From f8337efa4ff5a27e6c1d4e384166413eecd21a65 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@nvidia.com>
Date: Tue, 17 Jun 2025 00:44:19 +0200
Subject: vxlan: Support MC routing in the underlay

Locally-generated MC packets have so far not been subject to MC routing.
Instead an MC-enabled installation would maintain the MC routing tables,
and separately from that the list of interfaces to send packets to as part
of the VXLAN FDB and MDB.

In a previous patch, a ip_mr_output() and ip6_mr_output() routines were
added for IPv4 and IPv6. All locally generated MC traffic is now passed
through these functions. For reasons of backward compatibility, an SKB
(IPCB / IP6CB) flag guards the actual MC routing.

This patch adds logic to set the flag, and the UAPI to enable the behavior.

Signed-off-by: Petr Machata <petrm@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/d899655bb7e9b2521ee8c793e67056b9fd02ba12.1750113335.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 3ad2d5d98034..873c285996fe 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1398,6 +1398,7 @@ enum {
 	IFLA_VXLAN_LOCALBYPASS,
 	IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */
 	IFLA_VXLAN_RESERVED_BITS,
+	IFLA_VXLAN_MC_ROUTE,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
-- 
cgit 


From fc0e6db30941a66e284b8516b82356f97f31061d Mon Sep 17 00:00:00 2001
From: "Kory Maincent (Dent Project)" <kory.maincent@bootlin.com>
Date: Tue, 17 Jun 2025 14:12:01 +0200
Subject: net: pse-pd: Add support for reporting events

Add support for devm_pse_irq_helper() to register PSE interrupts and report
events such as over-current or over-temperature conditions. This follows a
similar approach to the regulator API but also sends notifications using a
dedicated PSE ethtool netlink socket.

Signed-off-by: Kory Maincent (Dent Project) <kory.maincent@bootlin.com>
Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-2-78a1a645e2ee@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool_netlink_generated.h | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 9a02f579de22..3864aa0de8c7 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -49,6 +49,16 @@ enum hwtstamp_source {
 	HWTSTAMP_SOURCE_PHYLIB,
 };
 
+/**
+ * enum ethtool_pse_event - PSE event list for the PSE controller
+ * @ETHTOOL_PSE_EVENT_OVER_CURRENT: PSE output current is too high
+ * @ETHTOOL_PSE_EVENT_OVER_TEMP: PSE in over temperature state
+ */
+enum ethtool_pse_event {
+	ETHTOOL_PSE_EVENT_OVER_CURRENT = 1,
+	ETHTOOL_PSE_EVENT_OVER_TEMP = 2,
+};
+
 enum {
 	ETHTOOL_A_HEADER_UNSPEC,
 	ETHTOOL_A_HEADER_DEV_INDEX,
@@ -718,6 +728,14 @@ enum {
 	ETHTOOL_A_TSCONFIG_MAX = (__ETHTOOL_A_TSCONFIG_CNT - 1)
 };
 
+enum {
+	ETHTOOL_A_PSE_NTF_HEADER = 1,
+	ETHTOOL_A_PSE_NTF_EVENTS,
+
+	__ETHTOOL_A_PSE_NTF_CNT,
+	ETHTOOL_A_PSE_NTF_MAX = (__ETHTOOL_A_PSE_NTF_CNT - 1)
+};
+
 enum {
 	ETHTOOL_MSG_USER_NONE = 0,
 	ETHTOOL_MSG_STRSET_GET = 1,
@@ -822,6 +840,7 @@ enum {
 	ETHTOOL_MSG_PHY_NTF,
 	ETHTOOL_MSG_TSCONFIG_GET_REPLY,
 	ETHTOOL_MSG_TSCONFIG_SET_REPLY,
+	ETHTOOL_MSG_PSE_NTF,
 
 	__ETHTOOL_MSG_KERNEL_CNT,
 	ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1)
-- 
cgit 


From 1176978ed851952652ddea3685e2f71a0e5d61ff Mon Sep 17 00:00:00 2001
From: "Kory Maincent (Dent Project)" <kory.maincent@bootlin.com>
Date: Tue, 17 Jun 2025 14:12:04 +0200
Subject: net: ethtool: Add support for new power domains index description

Report the index of the newly introduced PSE power domain to the user,
enabling improved management of the power budget for PSE devices.

Signed-off-by: Kory Maincent (Dent Project) <kory.maincent@bootlin.com>
Reviewed-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-5-78a1a645e2ee@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool_netlink_generated.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 3864aa0de8c7..ed344c8533eb 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -652,6 +652,7 @@ enum {
 	ETHTOOL_A_C33_PSE_EXT_SUBSTATE,
 	ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT,
 	ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES,
+	ETHTOOL_A_PSE_PW_D_ID,
 
 	__ETHTOOL_A_PSE_CNT,
 	ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1)
-- 
cgit 


From ffef61d6d27374542f1bce4452200d9bdd2e1edd Mon Sep 17 00:00:00 2001
From: "Kory Maincent (Dent Project)" <kory.maincent@bootlin.com>
Date: Tue, 17 Jun 2025 14:12:06 +0200
Subject: net: pse-pd: Add support for budget evaluation strategies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces the ability to configure the PSE PI budget evaluation
strategies. Budget evaluation strategies is utilized by PSE controllers to
determine which ports to turn off first in scenarios such as power budget
exceedance.

The pis_prio_max value is used to define the maximum priority level
supported by the controller. Both the current priority and the maximum
priority are exposed to the user through the pse_ethtool_get_status call.

This patch add support for two mode of budget evaluation strategies.
1. Static Method:

   This method involves distributing power based on PD classification.
   It’s straightforward and stable, the PSE core keeping track of the
   budget and subtracting the power requested by each PD’s class.

   Advantages: Every PD gets its promised power at any time, which
   guarantees reliability.

   Disadvantages: PD classification steps are large, meaning devices
   request much more power than they actually need. As a result, the power
   supply may only operate at, say, 50% capacity, which is inefficient and
   wastes money.

   Priority max value is matching the number of PSE PIs within the PSE.

2. Dynamic Method:

   To address the inefficiencies of the static method, vendors like
   Microchip have introduced dynamic power budgeting, as seen in the
   PD692x0 firmware. This method monitors the current consumption per port
   and subtracts it from the available power budget. When the budget is
   exceeded, lower-priority ports are shut down.

   Advantages: This method optimizes resource utilization, saving costs.

   Disadvantages: Low-priority devices may experience instability.

   Priority max value is set by the PSE controller driver.

For now, budget evaluation methods are not configurable and cannot be
mixed. They are hardcoded in the PSE driver itself, as no current PSE
controller supports both methods.

Signed-off-by: Kory Maincent (Dent Project) <kory.maincent@bootlin.com>
Acked-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-7-78a1a645e2ee@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool_netlink_generated.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index ed344c8533eb..c6a95224be25 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -53,10 +53,28 @@ enum hwtstamp_source {
  * enum ethtool_pse_event - PSE event list for the PSE controller
  * @ETHTOOL_PSE_EVENT_OVER_CURRENT: PSE output current is too high
  * @ETHTOOL_PSE_EVENT_OVER_TEMP: PSE in over temperature state
+ * @ETHTOOL_C33_PSE_EVENT_DETECTION: detection process occur on the PSE. IEEE
+ *   802.3-2022 33.2.5 and 145.2.6 PSE detection of PDs. IEEE 802.3-202
+ *   30.9.1.1.5 aPSEPowerDetectionStatus
+ * @ETHTOOL_C33_PSE_EVENT_CLASSIFICATION: classification process occur on the
+ *   PSE. IEEE 802.3-2022 33.2.6 and 145.2.8 classification of PDs mutual
+ *   identification. IEEE 802.3-2022 30.9.1.1.8 aPSEPowerClassification.
+ * @ETHTOOL_C33_PSE_EVENT_DISCONNECTION: PD has been disconnected on the PSE.
+ *   IEEE 802.3-2022 33.3.8 and 145.3.9 PD Maintain Power Signature. IEEE
+ *   802.3-2022 33.5.1.2.9 MPS Absent. IEEE 802.3-2022 30.9.1.1.20
+ *   aPSEMPSAbsentCounter.
+ * @ETHTOOL_PSE_EVENT_OVER_BUDGET: PSE turned off due to over budget situation
+ * @ETHTOOL_PSE_EVENT_SW_PW_CONTROL_ERROR: PSE faced an error managing the
+ *   power control from software
  */
 enum ethtool_pse_event {
 	ETHTOOL_PSE_EVENT_OVER_CURRENT = 1,
 	ETHTOOL_PSE_EVENT_OVER_TEMP = 2,
+	ETHTOOL_C33_PSE_EVENT_DETECTION = 4,
+	ETHTOOL_C33_PSE_EVENT_CLASSIFICATION = 8,
+	ETHTOOL_C33_PSE_EVENT_DISCONNECTION = 16,
+	ETHTOOL_PSE_EVENT_OVER_BUDGET = 32,
+	ETHTOOL_PSE_EVENT_SW_PW_CONTROL_ERROR = 64,
 };
 
 enum {
-- 
cgit 


From eeb0c8f72f49a21984981188404cfd3700edbaff Mon Sep 17 00:00:00 2001
From: "Kory Maincent (Dent Project)" <kory.maincent@bootlin.com>
Date: Tue, 17 Jun 2025 14:12:07 +0200
Subject: net: ethtool: Add PSE port priority support feature

This patch expands the status information provided by ethtool for PSE c33
with current port priority and max port priority. It also adds a call to
pse_ethtool_set_prio() to configure the PSE port priority.

Signed-off-by: Kory Maincent (Dent Project) <kory.maincent@bootlin.com>
Reviewed-by: Oleksij Rempel <o.rempel@pengutronix.de>
Link: https://patch.msgid.link/20250617-feature_poe_port_prio-v14-8-78a1a645e2ee@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool_netlink_generated.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index c6a95224be25..8e5d067e7ddf 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -671,6 +671,8 @@ enum {
 	ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT,
 	ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES,
 	ETHTOOL_A_PSE_PW_D_ID,
+	ETHTOOL_A_PSE_PRIO_MAX,
+	ETHTOOL_A_PSE_PRIO,
 
 	__ETHTOOL_A_PSE_CNT,
 	ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1)
-- 
cgit 


From 5ae1fc4069578f50798f3372f36a3c13ee565b66 Mon Sep 17 00:00:00 2001
From: Kavita Kavita <quic_kkavita@quicinc.com>
Date: Wed, 4 Jun 2025 16:27:56 +0530
Subject: wifi: cfg80211: Improve the documentation for
 NL80211_CMD_ASSOC_MLO_RECONF

The existing documentation for the NL80211_CMD_ASSOC_MLO_RECONF
does not clearly explain handling of link reconfiguration request
results from the driver.

Add documentation to explain that the command is used as an event to
notify userspace about added links information, and that the existing
NL80211_CMD_LINKS_REMOVED command is used to notify userspace about
removed links information.

Signed-off-by: Kavita Kavita <quic_kkavita@quicinc.com>
Link: https://patch.msgid.link/20250604105757.2542-2-quic_kkavita@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index e9ccf43fe3c6..e53840d009d1 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1330,7 +1330,11 @@
  *      TID to Link mapping for downlink/uplink traffic.
  *
  * @NL80211_CMD_ASSOC_MLO_RECONF: For a non-AP MLD station, request to
- *      add/remove links to/from the association.
+ *      add/remove links to/from the association. To indicate link
+ *      reconfiguration request results from the driver, this command is also
+ *      used as an event to notify userspace about the added links information.
+ *      For notifying the removed links information, the existing
+ *      %NL80211_CMD_LINKS_REMOVED command is used.
  *
  * @NL80211_CMD_EPCS_CFG: EPCS configuration for a station. Used by userland to
  *	control EPCS configuration. Used to notify userland on the current state
-- 
cgit 


From 7c598c653ad465138ecc2fe64492633c541effef Mon Sep 17 00:00:00 2001
From: Kavita Kavita <quic_kkavita@quicinc.com>
Date: Wed, 4 Jun 2025 16:27:57 +0530
Subject: wifi: cfg80211: Add support for link reconfiguration negotiation
 offload to driver

In the case of SME-in-driver, the driver can internally choose to
update the links based on the AP MLD recommendation and do link
reconfiguration negotiation with AP MLD.
(e.g., After the driver processing the BSS Transition Management request
frame received from the AP MLD with Neighbor Report containing
Multi-Link element with recommended links information chooses to do link
reconfiguration negotiation with AP MLD).

To support this, extend cfg80211_mlo_reconf_add_done() and
NL80211_CMD_ASSOC_MLO_RECONF to indicate added links information for
driver-initiated link reconfiguration requests. For removed links,
the driver indicates links information using the
NL80211_CMD_LINKS_REMOVED event for driver-initiated cases, the same as
supplicant initiated cases.

For the driver-initiated case, cfg80211 will receive link
reconfiguration result asynchronously from driver so holding BSSes of
the accepted add links is needed in the event path. Also, no need of
unhold call for the rejected add link BSSes since there was no hold call
happened previously.

Once the supplicant receives the NL80211_CMD_ASSOC_MLO_RECONF event,
it needs to process the information about newly added links and install
per-link group keys (e.g., GTK/IGTK/BIGTK etc.).

In case of the SME-in-driver, using a vendor interface etc. to notify
the supplicant to initiate a link reconfiguration request and then
supplicant sending command to the cfg80211 can lead to race conditions.
The correct design to avoid this is that the driver indicates the
cfg80211 directly with the results of the link reconfiguration
negotiation.

Signed-off-by: Kavita Kavita <quic_kkavita@quicinc.com>
Link: https://patch.msgid.link/20250604105757.2542-3-quic_kkavita@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index e53840d009d1..a289014abe37 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1334,7 +1334,11 @@
  *      reconfiguration request results from the driver, this command is also
  *      used as an event to notify userspace about the added links information.
  *      For notifying the removed links information, the existing
- *      %NL80211_CMD_LINKS_REMOVED command is used.
+ *      %NL80211_CMD_LINKS_REMOVED command is used. This command is also used to
+ *      notify userspace about newly added links for the current connection in
+ *      case of AP-initiated link recommendation requests, received via
+ *      a BTM (BSS Transition Management) request or a link reconfig notify
+ *      frame, where the driver handles the link recommendation offload.
  *
  * @NL80211_CMD_EPCS_CFG: EPCS configuration for a station. Used by userland to
  *	control EPCS configuration. Used to notify userland on the current state
-- 
cgit 


From b74947b4f6ff7c122a1bb6eb38bb7ecfbb1d3820 Mon Sep 17 00:00:00 2001
From: Roopni Devanathan <quic_rdevanat@quicinc.com>
Date: Sun, 15 Jun 2025 13:53:09 +0530
Subject: wifi: cfg80211/mac80211: Add support to get radio index

Currently, per-radio attributes are set on per-phy basis, i.e., all the
radios present in a wiphy will take attributes values sent from user. But
each radio in a wiphy can get different values from userspace based on
its requirement.

To extend support to set per-radio attributes, add support to get radio
index from userspace. Add an NL attribute - NL80211_ATTR_WIPHY_RADIO_INDEX,
to get user specified radio index for which attributes should be changed.
Pass this to individual drivers, so that the drivers can use this radio
index to change per-radio attributes when necessary. Currently, per-radio
attributes identified are:
NL80211_ATTR_WIPHY_TX_POWER_LEVEL
NL80211_ATTR_WIPHY_ANTENNA_TX
NL80211_ATTR_WIPHY_ANTENNA_RX
NL80211_ATTR_WIPHY_RETRY_SHORT
NL80211_ATTR_WIPHY_RETRY_LONG
NL80211_ATTR_WIPHY_FRAG_THRESHOLD
NL80211_ATTR_WIPHY_RTS_THRESHOLD
NL80211_ATTR_WIPHY_COVERAGE_CLASS
NL80211_ATTR_TXQ_LIMIT
NL80211_ATTR_TXQ_MEMORY_LIMIT
NL80211_ATTR_TXQ_QUANTUM

By default, the radio index is set to -1. This means the attribute should
be treated as a global configuration. If the user has not specified any
index, then the radio index passed to individual drivers would be -1. This
would indicate that the attribute applies to all radios in that wiphy.

Signed-off-by: Roopni Devanathan <quic_rdevanat@quicinc.com>
Link: https://patch.msgid.link/20250615082312.619639-2-quic_rdevanat@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index a289014abe37..2a71149c3065 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2907,6 +2907,14 @@ enum nl80211_commands {
  *	APs Support". Drivers may set additional flags that they support
  *	in the kernel or device.
  *
+ * @NL80211_ATTR_WIPHY_RADIO_INDEX: (int) Integer attribute denoting the index
+ *	of the radio in interest. Internally a value of -1 is used to
+ *	indicate that the radio id is not given in user-space. This means
+ *	that all the attributes are applicable to all the radios. If there is
+ *	a radio index provided in user-space, the attributes will be
+ *	applicable to that specific radio only. If the radio id is greater
+ *	thank the number of radios, error denoting invalid value is returned.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3464,6 +3472,8 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS,
 
+	NL80211_ATTR_WIPHY_RADIO_INDEX,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
-- 
cgit 


From 89595190058c6e9ca4a8ca7d49be3fc8d2395e79 Mon Sep 17 00:00:00 2001
From: Roopni Devanathan <quic_rdevanat@quicinc.com>
Date: Sun, 15 Jun 2025 13:53:11 +0530
Subject: wifi: cfg80211: Report per-radio RTS threshold to userspace

In case of multi-radio wiphys, with per-radio RTS threshold brought
into use, RTS threshold for each radio in a wiphy can be recorded in
wiphy parameter - wiphy_radio_cfg, as an array. Add a new attribute -
NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD in nested parameter -
NL80211_ATTR_WIPHY_RADIOS. When a request for getting RTS threshold
for a particular radio is received, parse the radio id and get the
required data. Add this data to the newly added nested attribute
NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD. Add support to report this
data to userspace.

Signed-off-by: Roopni Devanathan <quic_rdevanat@quicinc.com>
Link: https://patch.msgid.link/20250615082312.619639-4-quic_rdevanat@quicinc.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 2a71149c3065..39460334dafb 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -8106,6 +8106,7 @@ enum nl80211_ap_settings_flags {
  *	and contains attributes defined in &enum nl80211_if_combination_attrs.
  * @NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK: bitmask (u32) of antennas
  *	connected to this radio.
+ * @NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD: RTS threshold (u32) of this radio.
  *
  * @__NL80211_WIPHY_RADIO_ATTR_LAST: Internal
  * @NL80211_WIPHY_RADIO_ATTR_MAX: Highest attribute
@@ -8117,6 +8118,7 @@ enum nl80211_wiphy_radio_attrs {
 	NL80211_WIPHY_RADIO_ATTR_FREQ_RANGE,
 	NL80211_WIPHY_RADIO_ATTR_INTERFACE_COMBINATION,
 	NL80211_WIPHY_RADIO_ATTR_ANTENNA_MASK,
+	NL80211_WIPHY_RADIO_ATTR_RTS_THRESHOLD,
 
 	/* keep last */
 	__NL80211_WIPHY_RADIO_ATTR_LAST,
-- 
cgit 


From 826334359eacc1b70e9752ebc4954ed775dd40ca Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 23 Jun 2025 16:17:13 -0700
Subject: netlink: specs: add the multicast group name to spec

Add the multicast group's name to the YAML spec.
Without it YNL doesn't know how to subscribe to notifications.

Reviewed-by: Donald Hunter <donald.hunter@gmail.com>
Link: https://patch.msgid.link/20250623231720.3124717-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool_netlink.h           | 2 --
 include/uapi/linux/ethtool_netlink_generated.h | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h
index 09a75bdb6560..fa5d645140a4 100644
--- a/include/uapi/linux/ethtool_netlink.h
+++ b/include/uapi/linux/ethtool_netlink.h
@@ -208,6 +208,4 @@ enum {
 	ETHTOOL_A_STATS_PHY_MAX = (__ETHTOOL_A_STATS_PHY_CNT - 1)
 };
 
-#define ETHTOOL_MCGRP_MONITOR_NAME "monitor"
-
 #endif /* _UAPI_LINUX_ETHTOOL_NETLINK_H_ */
diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 4944badf9fba..859e28c8a91a 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -867,4 +867,6 @@ enum {
 	ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1)
 };
 
+#define ETHTOOL_MCGRP_MONITOR_NAME	"monitor"
+
 #endif /* _UAPI_LINUX_ETHTOOL_NETLINK_GENERATED_H */
-- 
cgit 


From 46837be5afc6ea70bc827ca4439410e069e2ee37 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Mon, 23 Jun 2025 16:17:18 -0700
Subject: net: ethtool: rss: add notifications

In preparation for RSS_SET handling in ethnl introduce Netlink
notifications for RSS. Only cover modifications, not creation
and not removal of a context, because the latter may deserve
a different notification type. We should cross that bridge
when we add the support for context add / remove via Netlink.

Link: https://patch.msgid.link/20250623231720.3124717-7-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool_netlink_generated.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 859e28c8a91a..8f30ffa1cd14 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -862,6 +862,7 @@ enum {
 	ETHTOOL_MSG_TSCONFIG_GET_REPLY,
 	ETHTOOL_MSG_TSCONFIG_SET_REPLY,
 	ETHTOOL_MSG_PSE_NTF,
+	ETHTOOL_MSG_RSS_NTF,
 
 	__ETHTOOL_MSG_KERNEL_CNT,
 	ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1)
-- 
cgit 


From 2855e43c6bb154a9b8e27abda8df364aed574b22 Mon Sep 17 00:00:00 2001
From: RubenKelevra <rubenkelevra@gmail.com>
Date: Tue, 24 Jun 2025 18:57:11 +0200
Subject: uapi: net_dropmon: drop unused is_drop_point_hw macro

Commit 4ea7e38696c7 ("dropmon: add ability to detect when hardware
drops rx packets") introduced is_drop_point_hw, but the symbol was
never referenced anywhere in the kernel tree and is currently not used
by dropwatch. I could not find, to the best of my abilities, a current
out-of-tree user of this macro.

The definition also contains a syntax error in its for-loop, so any
project that tried to compile against it would fail. Removing the
macro therefore eliminates dead code without breaking existing
users.

Signed-off-by: RubenKelevra <rubenkelevra@gmail.com>
Link: https://patch.msgid.link/20250624165711.1188691-1-rubenkelevra@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/net_dropmon.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h
index 9dd41c2f58a6..87cbef48d4c7 100644
--- a/include/uapi/linux/net_dropmon.h
+++ b/include/uapi/linux/net_dropmon.h
@@ -10,13 +10,6 @@ struct net_dm_drop_point {
 	__u32 count;
 };
 
-#define is_drop_point_hw(x) do {\
-	int ____i, ____j;\
-	for (____i = 0; ____i < 8; i ____i++)\
-		____j |= x[____i];\
-	____j;\
-} while (0)
-
 #define NET_DM_CFG_VERSION  0
 #define NET_DM_CFG_ALERT_COUNT  1
 #define NET_DM_CFG_ALERT_DELAY 2
-- 
cgit 


From 7f15ee35972dd3dee37704bfd0f136290f6d63d9 Mon Sep 17 00:00:00 2001
From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Date: Thu, 26 Jun 2025 15:52:17 +0200
Subject: dpll: add reference-sync netlink attribute

Add new netlink attribute to allow user space configuration of reference
sync pin pairs, where both pins are used to provide one clock signal
consisting of both: base frequency and sync signal.

Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Reviewed-by: Milena Olech <milena.olech@intel.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
Link: https://patch.msgid.link/20250626135219.1769350-2-arkadiusz.kubalewski@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/dpll.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/dpll.h b/include/uapi/linux/dpll.h
index 349e1b3ca1ae..37b438ce8efc 100644
--- a/include/uapi/linux/dpll.h
+++ b/include/uapi/linux/dpll.h
@@ -249,6 +249,7 @@ enum dpll_a_pin {
 	DPLL_A_PIN_ESYNC_FREQUENCY,
 	DPLL_A_PIN_ESYNC_FREQUENCY_SUPPORTED,
 	DPLL_A_PIN_ESYNC_PULSE,
+	DPLL_A_PIN_REFERENCE_SYNC,
 
 	__DPLL_A_PIN_MAX,
 	DPLL_A_PIN_MAX = (__DPLL_A_PIN_MAX - 1)
-- 
cgit 


From 03dc03fa0432a9160c4fcbdb86f274e6b4587972 Mon Sep 17 00:00:00 2001
From: Ido Schimmel <idosch@nvidia.com>
Date: Thu, 26 Jun 2025 10:31:10 +0300
Subject: neighbor: Add NTF_EXT_VALIDATED flag for externally validated entries

tl;dr
=====

Add a new neighbor flag ("extern_valid") that can be used to indicate to
the kernel that a neighbor entry was learned and determined to be valid
externally. The kernel will not try to remove or invalidate such an
entry, leaving these decisions to the user space control plane. This is
needed for EVPN multi-homing where a neighbor entry for a multi-homed
host needs to be synced across all the VTEPs among which the host is
multi-homed.

Background
==========

In a typical EVPN multi-homing setup each host is multi-homed using a
set of links called ES (Ethernet Segment, i.e., LAG) to multiple leaf
switches (VTEPs). VTEPs that are connected to the same ES are called ES
peers.

When a neighbor entry is learned on a VTEP, it is distributed to both ES
peers and remote VTEPs using EVPN MAC/IP advertisement routes. ES peers
use the neighbor entry when routing traffic towards the multi-homed host
and remote VTEPs use it for ARP/NS suppression.

Motivation
==========

If the ES link between a host and the VTEP on which the neighbor entry
was locally learned goes down, the EVPN MAC/IP advertisement route will
be withdrawn and the neighbor entries will be removed from both ES peers
and remote VTEPs. Routing towards the multi-homed host and ARP/NS
suppression can fail until another ES peer locally learns the neighbor
entry and distributes it via an EVPN MAC/IP advertisement route.

"draft-rbickhart-evpn-ip-mac-proxy-adv-03" [1] suggests avoiding these
intermittent failures by having the ES peers install the neighbor
entries as before, but also injecting EVPN MAC/IP advertisement routes
with a proxy indication. When the previously mentioned ES link goes down
and the original EVPN MAC/IP advertisement route is withdrawn, the ES
peers will not withdraw their neighbor entries, but instead start aging
timers for the proxy indication.

If an ES peer locally learns the neighbor entry (i.e., it becomes
"reachable"), it will restart its aging timer for the entry and emit an
EVPN MAC/IP advertisement route without a proxy indication. An ES peer
will stop its aging timer for the proxy indication if it observes the
removal of the proxy indication from at least one of the ES peers
advertising the entry.

In the event that the aging timer for the proxy indication expired, an
ES peer will withdraw its EVPN MAC/IP advertisement route. If the timer
expired on all ES peers and they all withdrew their proxy
advertisements, the neighbor entry will be completely removed from the
EVPN fabric.

Implementation
==============

In the above scheme, when the control plane (e.g., FRR) advertises a
neighbor entry with a proxy indication, it expects the corresponding
entry in the data plane (i.e., the kernel) to remain valid and not be
removed due to garbage collection or loss of carrier. The control plane
also expects the kernel to notify it if the entry was learned locally
(i.e., became "reachable") so that it will remove the proxy indication
from the EVPN MAC/IP advertisement route. That is why these entries
cannot be programmed with dummy states such as "permanent" or "noarp".

Instead, add a new neighbor flag ("extern_valid") which indicates that
the entry was learned and determined to be valid externally and should
not be removed or invalidated by the kernel. The kernel can probe the
entry and notify user space when it becomes "reachable" (it is initially
installed as "stale"). However, if the kernel does not receive a
confirmation, have it return the entry to the "stale" state instead of
the "failed" state.

In other words, an entry marked with the "extern_valid" flag behaves
like any other dynamically learned entry other than the fact that the
kernel cannot remove or invalidate it.

One can argue that the "extern_valid" flag should not prevent garbage
collection and that instead a neighbor entry should be programmed with
both the "extern_valid" and "extern_learn" flags. There are two reasons
for not doing that:

1. Unclear why a control plane would like to program an entry that the
   kernel cannot invalidate but can completely remove.

2. The "extern_learn" flag is used by FRR for neighbor entries learned
   on remote VTEPs (for ARP/NS suppression) whereas here we are
   concerned with local entries. This distinction is currently irrelevant
   for the kernel, but might be relevant in the future.

Given that the flag only makes sense when the neighbor has a valid
state, reject attempts to add a neighbor with an invalid state and with
this flag set. For example:

 # ip neigh add 192.0.2.1 nud none dev br0.10 extern_valid
 Error: Cannot create externally validated neighbor with an invalid state.
 # ip neigh add 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid
 # ip neigh replace 192.0.2.1 nud failed dev br0.10 extern_valid
 Error: Cannot mark neighbor as externally validated with an invalid state.

The above means that a neighbor cannot be created with the
"extern_valid" flag and flags such as "use" or "managed" as they result
in a neighbor being created with an invalid state ("none") and
immediately getting probed:

 # ip neigh add 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use
 Error: Cannot create externally validated neighbor with an invalid state.

However, these flags can be used together with "extern_valid" after the
neighbor was created with a valid state:

 # ip neigh add 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid
 # ip neigh replace 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use

One consequence of preventing the kernel from invalidating a neighbor
entry is that by default it will only try to determine reachability
using unicast probes. This can be changed using the "mcast_resolicit"
sysctl:

 # sysctl net.ipv4.neigh.br0/10.mcast_resolicit
 0
 # tcpdump -nn -e -i br0.10 -Q out arp &
 # ip neigh replace 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use
 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28
 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28
 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28
 # sysctl -wq net.ipv4.neigh.br0/10.mcast_resolicit=3
 # ip neigh replace 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use
 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28
 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28
 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28
 62:50:1d:11:93:6f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28
 62:50:1d:11:93:6f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28
 62:50:1d:11:93:6f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28

iproute2 patches can be found here [2].

[1] https://datatracker.ietf.org/doc/html/draft-rbickhart-evpn-ip-mac-proxy-adv-03
[2] https://github.com/idosch/iproute2/tree/submit/extern_valid_v1

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://patch.msgid.link/20250626073111.244534-2-idosch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/neighbour.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h
index b851c36ad25d..c34a81245f87 100644
--- a/include/uapi/linux/neighbour.h
+++ b/include/uapi/linux/neighbour.h
@@ -54,6 +54,7 @@ enum {
 /* Extended flags under NDA_FLAGS_EXT: */
 #define NTF_EXT_MANAGED		(1 << 0)
 #define NTF_EXT_LOCKED		(1 << 1)
+#define NTF_EXT_EXT_VALIDATED	(1 << 2)
 
 /*
  *	Neighbor Cache Entry States.
@@ -92,6 +93,10 @@ enum {
  * bridge in response to a host trying to communicate via a locked bridge port
  * with MAB enabled. Their purpose is to notify user space that a host requires
  * authentication.
+ *
+ * NTF_EXT_EXT_VALIDATED flagged neighbor entries were externally validated by
+ * a user space control plane. The kernel will not remove or invalidate them,
+ * but it can probe them and notify user space when they become reachable.
  */
 
 struct nda_cacheinfo {
-- 
cgit 


From 566e8f108fc7847f2a8676ec6a101d37b7dd0fb4 Mon Sep 17 00:00:00 2001
From: Carolina Jubran <cjubran@nvidia.com>
Date: Sun, 29 Jun 2025 17:21:32 +0300
Subject: devlink: Extend devlink rate API with traffic classes bandwidth
 management

Introduce support for specifying relative bandwidth shares between
traffic classes (TC) in the devlink-rate API. This new option allows
users to allocate bandwidth across multiple traffic classes in a
single command.

This feature provides a more granular control over traffic management,
especially for scenarios requiring Enhanced Transmission Selection.

Users can now define a relative bandwidth share for each traffic class.
For example, assigning share values of 20 to TC0 (TCP/UDP) and 80 to TC5
(RoCE) will result in TC0 receiving 20% and TC5 receiving 80% of the
total bandwidth. The actual percentage each class receives depends on
the ratio of its share value to the sum of all shares.

Example:
DEV=pci/0000:08:00.0

$ devlink port function rate add $DEV/vfs_group tx_share 10Gbit \
  tx_max 50Gbit tc-bw 0:20 1:0 2:0 3:0 4:0 5:80 6:0 7:0

$ devlink port function rate set $DEV/vfs_group \
  tc-bw 0:20 1:0 2:0 3:0 4:0 5:20 6:60 7:0

Example usage with ynl:

./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \
  --do rate-set --json '{
  "bus-name": "pci",
  "dev-name": "0000:08:00.0",
  "port-index": 1,
  "rate-tc-bws": [
    {"rate-tc-index": 0, "rate-tc-bw": 50},
    {"rate-tc-index": 1, "rate-tc-bw": 50},
    {"rate-tc-index": 2, "rate-tc-bw": 0},
    {"rate-tc-index": 3, "rate-tc-bw": 0},
    {"rate-tc-index": 4, "rate-tc-bw": 0},
    {"rate-tc-index": 5, "rate-tc-bw": 0},
    {"rate-tc-index": 6, "rate-tc-bw": 0},
    {"rate-tc-index": 7, "rate-tc-bw": 0}
  ]
}'

./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \
  --do rate-get --json '{
  "bus-name": "pci",
  "dev-name": "0000:08:00.0",
  "port-index": 1
}'

output for rate-get:
{'bus-name': 'pci',
 'dev-name': '0000:08:00.0',
 'port-index': 1,
 'rate-tc-bws': [{'rate-tc-bw': 50, 'rate-tc-index': 0},
                 {'rate-tc-bw': 50, 'rate-tc-index': 1},
                 {'rate-tc-bw': 0, 'rate-tc-index': 2},
                 {'rate-tc-bw': 0, 'rate-tc-index': 3},
                 {'rate-tc-bw': 0, 'rate-tc-index': 4},
                 {'rate-tc-bw': 0, 'rate-tc-index': 5},
                 {'rate-tc-bw': 0, 'rate-tc-index': 6},
                 {'rate-tc-bw': 0, 'rate-tc-index': 7}],
 'rate-tx-max': 0,
 'rate-tx-priority': 0,
 'rate-tx-share': 0,
 'rate-tx-weight': 0,
 'rate-type': 'leaf'}

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250629142138.361537-3-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/devlink.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index a5ee0f13740a..e72bcc239afd 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -221,6 +221,11 @@ enum devlink_port_flavour {
 				      */
 };
 
+/* IEEE 802.1Qaz standard supported values. */
+
+#define DEVLINK_RATE_TCS_MAX 8
+#define DEVLINK_RATE_TC_INDEX_MAX (DEVLINK_RATE_TCS_MAX - 1)
+
 enum devlink_rate_type {
 	DEVLINK_RATE_TYPE_LEAF,
 	DEVLINK_RATE_TYPE_NODE,
@@ -629,6 +634,10 @@ enum devlink_attr {
 
 	DEVLINK_ATTR_REGION_DIRECT,		/* flag */
 
+	DEVLINK_ATTR_RATE_TC_BWS,		/* nested */
+	DEVLINK_ATTR_RATE_TC_INDEX,		/* u8 */
+	DEVLINK_ATTR_RATE_TC_BW,		/* u32 */
+
 	/* Add new attributes above here, update the spec in
 	 * Documentation/netlink/specs/devlink.yaml and re-generate
 	 * net/devlink/netlink_gen.c.
-- 
cgit 


From 59f44c9ccc3bb68aa3b062b8e57ce0e1ee2fca75 Mon Sep 17 00:00:00 2001
From: Ilya Maximets <i.maximets@ovn.org>
Date: Wed, 2 Jul 2025 17:50:34 +0200
Subject: net: openvswitch: allow providing upcall pid for the 'execute'
 command

When a packet enters OVS datapath and there is no flow to handle it,
packet goes to userspace through a MISS upcall.  With per-CPU upcall
dispatch mechanism, we're using the current CPU id to select the
Netlink PID on which to send this packet.  This allows us to send
packets from the same traffic flow through the same handler.

The handler will process the packet, install required flow into the
kernel and re-inject the original packet via OVS_PACKET_CMD_EXECUTE.

While handling OVS_PACKET_CMD_EXECUTE, however, we may hit a
recirculation action that will pass the (likely modified) packet
through the flow lookup again.  And if the flow is not found, the
packet will be sent to userspace again through another MISS upcall.

However, the handler thread in userspace is likely running on a
different CPU core, and the OVS_PACKET_CMD_EXECUTE request is handled
in the syscall context of that thread.  So, when the time comes to
send the packet through another upcall, the per-CPU dispatch will
choose a different Netlink PID, and this packet will end up processed
by a different handler thread on a different CPU.

The process continues as long as there are new recirculations, each
time the packet goes to a different handler thread before it is sent
out of the OVS datapath to the destination port.  In real setups the
number of recirculations can go up to 4 or 5, sometimes more.

There is always a chance to re-order packets while processing upcalls,
because userspace will first install the flow and then re-inject the
original packet.  So, there is a race window when the flow is already
installed and the second packet can match it and be forwarded to the
destination before the first packet is re-injected.  But the fact that
packets are going through multiple upcalls handled by different
userspace threads makes the reordering noticeably more likely, because
we not only have a race between the kernel and a userspace handler
(which is hard to avoid), but also between multiple userspace handlers.

For example, let's assume that 10 packets got enqueued through a MISS
upcall for handler-1, it will start processing them, will install the
flow into the kernel and start re-injecting packets back, from where
they will go through another MISS to handler-2.  Handler-2 will install
the flow into the kernel and start re-injecting the packets, while
handler-1 continues to re-inject the last of the 10 packets, they will
hit the flow installed by handler-2 and be forwarded without going to
the handler-2, while handler-2 still re-injects the first of these 10
packets.  Given multiple recirculations and misses, these 10 packets
may end up completely mixed up on the output from the datapath.

Let's allow userspace to specify on which Netlink PID the packets
should be upcalled while processing OVS_PACKET_CMD_EXECUTE.
This makes it possible to ensure that all the packets are processed
by the same handler thread in the userspace even with them being
upcalled multiple times in the process.  Packets will remain in order
since they will be enqueued to the same socket and re-injected in the
same order.  This doesn't eliminate re-ordering as stated above, since
we still have a race between kernel and the userspace thread, but it
allows to eliminate races between multiple userspace threads.

Userspace knows the PID of the socket on which the original upcall is
received, so there is no need to send it up from the kernel.

Solution requires storing the value somewhere for the duration of the
packet processing.  There are two potential places for this: our skb
extension or the per-CPU storage.  It's not clear which is better,
so just following currently used scheme of storing this kind of things
along the skb.  We still have a decent amount of space in the cb.

Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Eelco Chaudron <echaudro@redhat.com>
Acked-by: Aaron Conole <aconole@redhat.com>
Link: https://patch.msgid.link/20250702155043.2331772-1-i.maximets@ovn.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/openvswitch.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index 3a701bd1f31b..3092c2c6f1d2 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -186,6 +186,11 @@ enum ovs_packet_cmd {
  * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment
  * size.
  * @OVS_PACKET_ATTR_HASH: Packet hash info (e.g. hash, sw_hash and l4_hash in skb).
+ * @OVS_PACKET_ATTR_UPCALL_PID: Netlink PID to use for upcalls while
+ * processing %OVS_PACKET_CMD_EXECUTE.  Takes precedence over all other ways
+ * to determine the Netlink PID including %OVS_USERSPACE_ATTR_PID,
+ * %OVS_DP_ATTR_UPCALL_PID, %OVS_DP_ATTR_PER_CPU_PIDS and the
+ * %OVS_VPORT_ATTR_UPCALL_PID.
  *
  * These attributes follow the &struct ovs_header within the Generic Netlink
  * payload for %OVS_PACKET_* commands.
@@ -205,6 +210,7 @@ enum ovs_packet_attr {
 	OVS_PACKET_ATTR_MRU,	    /* Maximum received IP fragment size. */
 	OVS_PACKET_ATTR_LEN,	    /* Packet size before truncation. */
 	OVS_PACKET_ATTR_HASH,	    /* Packet hash. */
+	OVS_PACKET_ATTR_UPCALL_PID, /* u32 Netlink PID. */
 	__OVS_PACKET_ATTR_MAX
 };
 
-- 
cgit 


From 3d98ee52659c3f1d3913ae5b97f7743c5247752c Mon Sep 17 00:00:00 2001
From: Tonghao Zhang <tonghao@bamaicloud.com>
Date: Fri, 27 Jun 2025 21:49:29 +0800
Subject: net: bonding: add broadcast_neighbor netlink option

User can config or display the bonding broadcast_neighbor option via
iproute2/netlink.

Cc: Jay Vosburgh <jv@jvosburgh.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Simon Horman <horms@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Andrew Lunn <andrew+netdev@lunn.ch>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: Tonghao Zhang <tonghao@bamaicloud.com>
Signed-off-by: Zengbing Tu <tuzengbing@didiglobal.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://patch.msgid.link/76b90700ba5b98027dfb51a2f3c5cfea0440a21b.1751031306.git.tonghao@bamaicloud.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 873c285996fe..784ace3a519c 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -1535,6 +1535,7 @@ enum {
 	IFLA_BOND_MISSED_MAX,
 	IFLA_BOND_NS_IP6_TARGET,
 	IFLA_BOND_COUPLED_CONTROL,
+	IFLA_BOND_BROADCAST_NEIGH,
 	__IFLA_BOND_MAX,
 };
 
-- 
cgit 


From ad39c12fcee34b8980a80ad5c803bca9906fbd4e Mon Sep 17 00:00:00 2001
From: Jeremy Kerr <jk@codeconstruct.com.au>
Date: Wed, 2 Jul 2025 14:20:13 +0800
Subject: net: mctp: add gateway routing support

This change allows for gateway routing, where a route table entry
may reference a routable endpoint (by network and EID), instead of
routing directly to a netdevice.

We add support for a RTM_GATEWAY attribute for netlink route updates,
with an attribute format of:

    struct mctp_fq_addr {
        unsigned int net;
        mctp_eid_t eid;
    }

- we need the net here to uniquely identify the target EID, as we no
longer have the device reference directly (which would provide the net
id in the case of direct routes).

This makes route lookups recursive, as a route lookup that returns a
gateway route must be resolved into a direct route (ie, to a device)
eventually. We provide a limit to the route lookups, to prevent infinite
loop routing.

The route lookup populates a new 'nexthop' field in the dst structure,
which now specifies the key for the neighbour table lookup on device
output, rather than using the packet destination address directly.

Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
Link: https://patch.msgid.link/20250702-dev-forwarding-v5-13-1468191da8a4@codeconstruct.com.au
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/uapi/linux/mctp.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h
index e1db65df9359..19ad12a0cd4b 100644
--- a/include/uapi/linux/mctp.h
+++ b/include/uapi/linux/mctp.h
@@ -37,6 +37,14 @@ struct sockaddr_mctp_ext {
 	__u8			smctp_haddr[MAX_ADDR_LEN];
 };
 
+/* A "fully qualified" MCTP address, which includes the system-local network ID,
+ * required to uniquely resolve a routable EID.
+ */
+struct mctp_fq_addr {
+	unsigned int	net;
+	mctp_eid_t	eid;
+};
+
 #define MCTP_NET_ANY		0x0
 
 #define MCTP_ADDR_NULL		0x00
-- 
cgit 


From e22da4685013922ade8e7b5e727ac6804fe5b51e Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@kernel.org>
Date: Tue, 1 Jul 2025 16:46:57 +0200
Subject: net/handshake: Add new parameter 'HANDSHAKE_A_ACCEPT_KEYRING'

Add a new netlink parameter 'HANDSHAKE_A_ACCEPT_KEYRING' to provide
the serial number of the keyring to use.

Signed-off-by: Hannes Reinecke <hare@kernel.org>
Reviewed-by: Chuck Lever <chuck.lever@oracle.com>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20250701144657.104401-1-hare@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/uapi/linux/handshake.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/handshake.h b/include/uapi/linux/handshake.h
index 3d7ea58778c9..662e7de46c54 100644
--- a/include/uapi/linux/handshake.h
+++ b/include/uapi/linux/handshake.h
@@ -45,6 +45,7 @@ enum {
 	HANDSHAKE_A_ACCEPT_PEER_IDENTITY,
 	HANDSHAKE_A_ACCEPT_CERTIFICATE,
 	HANDSHAKE_A_ACCEPT_PEERNAME,
+	HANDSHAKE_A_ACCEPT_KEYRING,
 
 	__HANDSHAKE_A_ACCEPT_MAX,
 	HANDSHAKE_A_ACCEPT_MAX = (__HANDSHAKE_A_ACCEPT_MAX - 1)
-- 
cgit 


From 333c515d189657c934470c9b0b8a8fedb016ce6f Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 8 Jul 2025 17:54:54 +0200
Subject: vhost-net: allow configuring extended features

Use the extended feature type for 'acked_features' and implement
two new ioctls operation allowing the user-space to set/query an
unbounded amount of features.

The actual number of processed features is limited by VIRTIO_FEATURES_MAX
and attempts to set features above such limit fail with
EOPNOTSUPP.

Note that: the legacy ioctls implicitly truncate the negotiated
features to the lower 64 bits range and the 'acked_backend_features'
field don't need conversion, as the only negotiated feature there
is in the low 64 bit range.

Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/uapi/linux/vhost.h       | 7 +++++++
 include/uapi/linux/vhost_types.h | 5 +++++
 2 files changed, 12 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index d4b3e2ae1314..d6ad01fbb8d2 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -235,4 +235,11 @@
  */
 #define VHOST_VDPA_GET_VRING_SIZE	_IOWR(VHOST_VIRTIO, 0x82,	\
 					      struct vhost_vring_state)
+
+/* Extended features manipulation */
+#define VHOST_GET_FEATURES_ARRAY _IOR(VHOST_VIRTIO, 0x83, \
+				       struct vhost_features_array)
+#define VHOST_SET_FEATURES_ARRAY _IOW(VHOST_VIRTIO, 0x83, \
+				       struct vhost_features_array)
+
 #endif
diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h
index d7656908f730..1c39cc5f5a31 100644
--- a/include/uapi/linux/vhost_types.h
+++ b/include/uapi/linux/vhost_types.h
@@ -110,6 +110,11 @@ struct vhost_msg_v2 {
 	};
 };
 
+struct vhost_features_array {
+	__u64 count; /* number of entries present in features array */
+	__u64 features[] __counted_by(count);
+};
+
 struct vhost_memory_region {
 	__u64 guest_phys_addr;
 	__u64 memory_size; /* bytes */
-- 
cgit 


From a2fb4bc4e2a6a031683910d85b278c1d25ae5420 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 8 Jul 2025 17:54:59 +0200
Subject: net: implement virtio helpers to handle UDP GSO tunneling.

The virtio specification are introducing support for GSO over UDP
tunnel.

This patch brings in the needed defines and the additional virtio hdr
parsing/building helpers.

The UDP tunnel support uses additional fields in the virtio hdr, and such
fields location can change depending on other negotiated features -
specifically VIRTIO_NET_F_HASH_REPORT.

Try to be as conservative as possible with the new field validation.

Existing implementation for plain GSO offloads allow for invalid/
self-contradictory values of such fields. With GSO over UDP tunnel we can
be more strict, with no need to deal with legacy implementation.

Since the checksum-related field validation is asymmetric in the driver
and in the device, introduce a separate helper to implement the new checks
(to be used only on the driver side).

Note that while the feature space exceeds the 64-bit boundaries, the
guest offload space is fixed by the specification of the
VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET command to a 64-bit size.

Prior to the UDP tunnel GSO support, each guest offload bit corresponded
to the feature bit with the same value and vice versa.

Due to the limited 'guest offload' space, relevant features in the high
64 bits are 'mapped' to free bits in the lower range. That is simpler
than defining a new command (and associated features) to exchange an
extended guest offloads set.

As a consequence, the uAPIs also specify the mapped guest offload value
corresponding to the UDP tunnel GSO features.

Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
--
v4 -> v5:
  - avoid lines above 80 chars

v3 -> v4:
  - fixed offset for UDP GSO tunnel, update accordingly the helpers
  - tried to clarified vlan_hlen semantic
  - virtio_net_chk_data_valid() -> virtio_net_handle_csum_offload()

v2 -> v3:
  - add definitions for possible vnet hdr layouts with tunnel support

v1 -> v2:
  - 'relay' -> 'rely' typo
  - less unclear comment WRT enforced inner GSO checks
  - inner header fields are allowed only with 'modern' virtio,
    thus are always le
  - clarified in the commit message the need for 'mapped features'
    defines
  - assume little_endian is true when UDP GSO is enabled.
  - fix inner proto type value
---
 include/uapi/linux/virtio_net.h | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/virtio_net.h b/include/uapi/linux/virtio_net.h
index 963540deae66..8bf27ab8bcb4 100644
--- a/include/uapi/linux/virtio_net.h
+++ b/include/uapi/linux/virtio_net.h
@@ -70,6 +70,28 @@
 					 * with the same MAC.
 					 */
 #define VIRTIO_NET_F_SPEED_DUPLEX 63	/* Device set linkspeed and duplex */
+#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO 65 /* Driver can receive
+					      * GSO-over-UDP-tunnel packets
+					      */
+#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM 66 /* Driver handles
+						   * GSO-over-UDP-tunnel
+						   * packets with partial csum
+						   * for the outer header
+						   */
+#define VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO 67 /* Device can receive
+					     * GSO-over-UDP-tunnel packets
+					     */
+#define VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO_CSUM 68 /* Device handles
+						  * GSO-over-UDP-tunnel
+						  * packets with partial csum
+						  * for the outer header
+						  */
+
+/* Offloads bits corresponding to VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO{,_CSUM}
+ * features
+ */
+#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_MAPPED	46
+#define VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO_CSUM_MAPPED	47
 
 #ifndef VIRTIO_NET_NO_LEGACY
 #define VIRTIO_NET_F_GSO	6	/* Host handles pkts w/ any GSO type */
@@ -131,12 +153,17 @@ struct virtio_net_hdr_v1 {
 #define VIRTIO_NET_HDR_F_NEEDS_CSUM	1	/* Use csum_start, csum_offset */
 #define VIRTIO_NET_HDR_F_DATA_VALID	2	/* Csum is valid */
 #define VIRTIO_NET_HDR_F_RSC_INFO	4	/* rsc info in csum_ fields */
+#define VIRTIO_NET_HDR_F_UDP_TUNNEL_CSUM 8	/* UDP tunnel csum offload */
 	__u8 flags;
 #define VIRTIO_NET_HDR_GSO_NONE		0	/* Not a GSO frame */
 #define VIRTIO_NET_HDR_GSO_TCPV4	1	/* GSO frame, IPv4 TCP (TSO) */
 #define VIRTIO_NET_HDR_GSO_UDP		3	/* GSO frame, IPv4 UDP (UFO) */
 #define VIRTIO_NET_HDR_GSO_TCPV6	4	/* GSO frame, IPv6 TCP */
 #define VIRTIO_NET_HDR_GSO_UDP_L4	5	/* GSO frame, IPv4& IPv6 UDP (USO) */
+#define VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4 0x20 /* UDPv4 tunnel present */
+#define VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6 0x40 /* UDPv6 tunnel present */
+#define VIRTIO_NET_HDR_GSO_UDP_TUNNEL (VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV4 | \
+				       VIRTIO_NET_HDR_GSO_UDP_TUNNEL_IPV6)
 #define VIRTIO_NET_HDR_GSO_ECN		0x80	/* TCP has ECN set */
 	__u8 gso_type;
 	__virtio16 hdr_len;	/* Ethernet + IP + tcp/udp hdrs */
@@ -181,6 +208,12 @@ struct virtio_net_hdr_v1_hash {
 	__le16 padding;
 };
 
+struct virtio_net_hdr_v1_hash_tunnel {
+	struct virtio_net_hdr_v1_hash hash_hdr;
+	__le16 outer_th_offset;
+	__le16 inner_nh_offset;
+};
+
 #ifndef VIRTIO_NET_NO_LEGACY
 /* This header comes first in the scatter-gather list.
  * For legacy virtio, if VIRTIO_F_ANY_LAYOUT is not negotiated, it must
-- 
cgit 


From 288f30435132d2f9e7a29ec9b9745a4f9dc7fd37 Mon Sep 17 00:00:00 2001
From: Paolo Abeni <pabeni@redhat.com>
Date: Tue, 8 Jul 2025 17:55:30 +0200
Subject: tun: enable gso over UDP tunnel support.

Add new tun features to represent the newly introduced virtio
GSO over UDP tunnel offload. Allows detection and selection of
such features via the existing TUNSETOFFLOAD ioctl and compute
the expected virtio header size and tunnel header offset using
the current netdev features, so that we can plug almost seamless
the newly introduced virtio helpers to serialize the extended
virtio header.

Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
v6 -> v7:
  - rebased

v4 -> v5:
  - encapsulate the guest feature guessing in a tun helper
  - dropped irrelevant check on xdp buff headroom
  - do not remove unrelated black line
  - avoid line len > 80 char

v3 -> v4:
  - virtio tnl-related fields are at fixed offset, cleanup
    the code accordingly.
  - use netdev features instead of flags bit to check for
    the configured offload
  - drop packet in case of enabled features/configured hdr
    size mismatch

v2 -> v3:
  - cleaned-up uAPI comments
  - use explicit struct layout instead of raw buf.
---
 include/uapi/linux/if_tun.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 287cdc81c939..79d53c7a1ebd 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -93,6 +93,15 @@
 #define TUN_F_USO4	0x20	/* I can handle USO for IPv4 packets */
 #define TUN_F_USO6	0x40	/* I can handle USO for IPv6 packets */
 
+/* I can handle TSO/USO for UDP tunneled packets */
+#define TUN_F_UDP_TUNNEL_GSO		0x080
+
+/*
+ * I can handle TSO/USO for UDP tunneled packets requiring csum offload for
+ * the outer header
+ */
+#define TUN_F_UDP_TUNNEL_GSO_CSUM	0x100
+
 /* Protocol info prepended to the packets (when IFF_NO_PI is not set) */
 #define TUN_PKT_STRIP	0x0001
 struct tun_pi {
-- 
cgit 


From 45e359be1ce88fb22e61fa3aa23b2e450a6cae03 Mon Sep 17 00:00:00 2001
From: Jason Xing <kernelxing@tencent.com>
Date: Sat, 5 Jul 2025 00:01:38 +0800
Subject: net: xsk: introduce XDP_MAX_TX_SKB_BUDGET setsockopt

This patch provides a setsockopt method to let applications leverage to
adjust how many descs to be handled at most in one send syscall. It
mitigates the situation where the default value (32) that is too small
leads to higher frequency of triggering send syscall.

Considering the prosperity/complexity the applications have, there is no
absolutely ideal suggestion fitting all cases. So keep 32 as its default
value like before.

The patch does the following things:
- Add XDP_MAX_TX_SKB_BUDGET socket option.
- Set max_tx_budget to 32 by default in the initialization phase as a
  per-socket granular control.
- Set the range of max_tx_budget as [32, xs->tx->nentries].

The idea behind this comes out of real workloads in production. We use a
user-level stack with xsk support to accelerate sending packets and
minimize triggering syscalls. When the packets are aggregated, it's not
hard to hit the upper bound (namely, 32). The moment user-space stack
fetches the -EAGAIN error number passed from sendto(), it will loop to try
again until all the expected descs from tx ring are sent out to the driver.
Enlarging the XDP_MAX_TX_SKB_BUDGET value contributes to less frequency of
sendto() and higher throughput/PPS.

Here is what I did in production, along with some numbers as follows:
For one application I saw lately, I suggested using 128 as max_tx_budget
because I saw two limitations without changing any default configuration:
1) XDP_MAX_TX_SKB_BUDGET, 2) socket sndbuf which is 212992 decided by
net.core.wmem_default. As to XDP_MAX_TX_SKB_BUDGET, the scenario behind
this was I counted how many descs are transmitted to the driver at one
time of sendto() based on [1] patch and then I calculated the
possibility of hitting the upper bound. Finally I chose 128 as a
suitable value because 1) it covers most of the cases, 2) a higher
number would not bring evident results. After twisting the parameters,
a stable improvement of around 4% for both PPS and throughput and less
resources consumption were found to be observed by strace -c -p xxx:
1) %time was decreased by 7.8%
2) error counter was decreased from 18367 to 572

[1]: https://lore.kernel.org/all/20250619093641.70700-1-kerneljasonxing@gmail.com/

Signed-off-by: Jason Xing <kernelxing@tencent.com>
Acked-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Link: https://patch.msgid.link/20250704160138.48677-1-kerneljasonxing@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/uapi/linux/if_xdp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h
index 44f2bb93e7e6..23a062781468 100644
--- a/include/uapi/linux/if_xdp.h
+++ b/include/uapi/linux/if_xdp.h
@@ -79,6 +79,7 @@ struct xdp_mmap_offsets {
 #define XDP_UMEM_COMPLETION_RING	6
 #define XDP_STATISTICS			7
 #define XDP_OPTIONS			8
+#define XDP_MAX_TX_SKB_BUDGET		9
 
 struct xdp_umem_reg {
 	__u64 addr; /* Start of packet data area */
-- 
cgit 


From d7974697de4d6fa1a1ed9ca43616a8500046f25a Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 8 Jul 2025 15:06:38 -0700
Subject: ethtool: mark ETHER_FLOW as usable for Rx hash

Looks like some drivers (ena, enetc, fbnic.. there's probably more)
consider ETHER_FLOW to be legitimate target for flow hashing.
I'm not sure how intentional that is from the uAPI perspective
vs just an effect of ethtool IOCTL doing minimal input validation.
But Netlink will do strict validation, so we need to decide whether
we allow this use case or not. I don't see a strong reason against
it, and rejecting it would potentially regress a number of drivers.
So update the comments and flow_type_hashable().

Link: https://patch.msgid.link/20250708220640.2738464-4-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 707c1844010c..9e9afdd1238a 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -2314,7 +2314,7 @@ enum {
 	IPV6_USER_FLOW	= 0x0e, /* spec only (usr_ip6_spec; nfc only) */
 	IPV4_FLOW	= 0x10, /* hash only */
 	IPV6_FLOW	= 0x11, /* hash only */
-	ETHER_FLOW	= 0x12, /* spec only (ether_spec) */
+	ETHER_FLOW	= 0x12, /* hash or spec (ether_spec) */
 
 	/* Used for GTP-U IPv4 and IPv6.
 	 * The format of GTP packets only includes
@@ -2371,7 +2371,7 @@ enum {
 /* Flag to enable RSS spreading of traffic matching rule (nfc only) */
 #define	FLOW_RSS	0x20000000
 
-/* L3-L4 network traffic flow hash options */
+/* L2-L4 network traffic flow hash options */
 #define	RXH_L2DA	(1 << 1)
 #define	RXH_VLAN	(1 << 2)
 #define	RXH_L3_PROTO	(1 << 3)
-- 
cgit 


From 178331743ca860561f60d04a7797a2fce13f0784 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 8 Jul 2025 15:06:39 -0700
Subject: ethtool: rss: report which fields are configured for hashing

Implement ETHTOOL_GRXFH over Netlink. The number of flow types is
reasonable (around 20) so report all of them at once for simplicity.

Do not maintain the flow ID mapping with ioctl at the uAPI level.
This gives us a chance to clean up the confusion that come from
RxNFC vs RxFH (flow direction vs hashing) in the ioctl.
Try to align with the names used in ethtool CLI, they seem to have
stood the test of time just fine. One annoyance is that we still
call L4 ports the weird names, but I guess they also apply to IPSec
(where they cover the SPI) so it is what it is.

 $ ynl --family ethtool --dump rss-get
 {
    "header": {
	"dev-index": 1,
	"dev-name": "enp1s0"
    },
    "hfunc": 1,
    "hkey": b"...",
    "indir": [0, 1, ...],
    "flow-hash": {
        "ether": {"l2da"},
	"ah-esp4": {"ip-src", "ip-dst"},
        "ah-esp6": {"ip-src", "ip-dst"},
        "ah4": {"ip-src", "ip-dst"},
        "ah6": {"ip-src", "ip-dst"},
        "esp4": {"ip-src", "ip-dst"},
        "esp6": {"ip-src", "ip-dst"},
        "ip4": {"ip-src", "ip-dst"},
        "ip6": {"ip-src", "ip-dst"},
        "sctp4": {"ip-src", "ip-dst"},
        "sctp6": {"ip-src", "ip-dst"},
        "udp4": {"ip-src", "ip-dst"},
        "udp6": {"ip-src", "ip-dst"}
        "tcp4": {"l4-b-0-1", "l4-b-2-3", "ip-src", "ip-dst"},
        "tcp6": {"l4-b-0-1", "l4-b-2-3", "ip-src", "ip-dst"},
    },
 }

Link: https://patch.msgid.link/20250708220640.2738464-5-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool_netlink_generated.h | 34 ++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 8f30ffa1cd14..96027e26ffba 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -678,6 +678,39 @@ enum {
 	ETHTOOL_A_PSE_MAX = (__ETHTOOL_A_PSE_CNT - 1)
 };
 
+enum {
+	ETHTOOL_A_FLOW_ETHER = 1,
+	ETHTOOL_A_FLOW_IP4,
+	ETHTOOL_A_FLOW_IP6,
+	ETHTOOL_A_FLOW_TCP4,
+	ETHTOOL_A_FLOW_TCP6,
+	ETHTOOL_A_FLOW_UDP4,
+	ETHTOOL_A_FLOW_UDP6,
+	ETHTOOL_A_FLOW_SCTP4,
+	ETHTOOL_A_FLOW_SCTP6,
+	ETHTOOL_A_FLOW_AH4,
+	ETHTOOL_A_FLOW_AH6,
+	ETHTOOL_A_FLOW_ESP4,
+	ETHTOOL_A_FLOW_ESP6,
+	ETHTOOL_A_FLOW_AH_ESP4,
+	ETHTOOL_A_FLOW_AH_ESP6,
+	ETHTOOL_A_FLOW_GTPU4,
+	ETHTOOL_A_FLOW_GTPU6,
+	ETHTOOL_A_FLOW_GTPC4,
+	ETHTOOL_A_FLOW_GTPC6,
+	ETHTOOL_A_FLOW_GTPC_TEID4,
+	ETHTOOL_A_FLOW_GTPC_TEID6,
+	ETHTOOL_A_FLOW_GTPU_EH4,
+	ETHTOOL_A_FLOW_GTPU_EH6,
+	ETHTOOL_A_FLOW_GTPU_UL4,
+	ETHTOOL_A_FLOW_GTPU_UL6,
+	ETHTOOL_A_FLOW_GTPU_DL4,
+	ETHTOOL_A_FLOW_GTPU_DL6,
+
+	__ETHTOOL_A_FLOW_CNT,
+	ETHTOOL_A_FLOW_MAX = (__ETHTOOL_A_FLOW_CNT - 1)
+};
+
 enum {
 	ETHTOOL_A_RSS_UNSPEC,
 	ETHTOOL_A_RSS_HEADER,
@@ -687,6 +720,7 @@ enum {
 	ETHTOOL_A_RSS_HKEY,
 	ETHTOOL_A_RSS_INPUT_XFRM,
 	ETHTOOL_A_RSS_START_CONTEXT,
+	ETHTOOL_A_RSS_FLOW_HASH,
 
 	__ETHTOOL_A_RSS_CNT,
 	ETHTOOL_A_RSS_MAX = (__ETHTOOL_A_RSS_CNT - 1)
-- 
cgit 


From 2677010e7793451c20d895c477c4dc76f6e6a10e Mon Sep 17 00:00:00 2001
From: Samiullah Khawaja <skhawaja@google.com>
Date: Thu, 10 Jul 2025 21:12:03 +0000
Subject: Add support to set NAPI threaded for individual NAPI

A net device has a threaded sysctl that can be used to enable threaded
NAPI polling on all of the NAPI contexts under that device. Allow
enabling threaded NAPI polling at individual NAPI level using netlink.

Extend the netlink operation `napi-set` and allow setting the threaded
attribute of a NAPI. This will enable the threaded polling on a NAPI
context.

Add a test in `nl_netdev.py` that verifies various cases of threaded
NAPI being set at NAPI and at device level.

Tested
 ./tools/testing/selftests/net/nl_netdev.py
 TAP version 13
 1..7
 ok 1 nl_netdev.empty_check
 ok 2 nl_netdev.lo_check
 ok 3 nl_netdev.page_pool_check
 ok 4 nl_netdev.napi_list_check
 ok 5 nl_netdev.dev_set_threaded
 ok 6 nl_netdev.napi_set_threaded
 ok 7 nl_netdev.nsim_rxq_reset_down
 # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0

Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20250710211203.3979655-1-skhawaja@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/netdev.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 7eb9571786b8..1f3719a9a0eb 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -134,6 +134,7 @@ enum {
 	NETDEV_A_NAPI_DEFER_HARD_IRQS,
 	NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
 	NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
+	NETDEV_A_NAPI_THREADED,
 
 	__NETDEV_A_NAPI_MAX,
 	NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1)
-- 
cgit 


From 6c758062c64dfbd61862801fbde4e0702f4f3a23 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Fri, 11 Jul 2025 11:40:00 +0000
Subject: tcp: add LINUX_MIB_BEYOND_WINDOW

Add a new SNMP MIB : LINUX_MIB_BEYOND_WINDOW

Incremented when an incoming packet is received beyond the
receiver window.

nstat -az | grep TcpExtBeyondWindow

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250711114006.480026-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/snmp.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index 1d234d7e1892..49f5640092a0 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -186,6 +186,7 @@ enum
 	LINUX_MIB_TIMEWAITKILLED,		/* TimeWaitKilled */
 	LINUX_MIB_PAWSACTIVEREJECTED,		/* PAWSActiveRejected */
 	LINUX_MIB_PAWSESTABREJECTED,		/* PAWSEstabRejected */
+	LINUX_MIB_BEYOND_WINDOW,		/* BeyondWindow */
 	LINUX_MIB_TSECRREJECTED,		/* TSEcrRejected */
 	LINUX_MIB_PAWS_OLD_ACK,			/* PAWSOldAck */
 	LINUX_MIB_PAWS_TW_REJECTED,		/* PAWSTimewait */
-- 
cgit 


From c0ae03588bbb95378758fe80e7436a9b4cfc71f6 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 15 Jul 2025 17:03:21 -0700
Subject: ethtool: rss: initial RSS_SET (indirection table handling)

Add initial support for RSS_SET, for now only operations on
the indirection table are supported.

Unlike the ioctl don't check if at least one parameter is
being changed. This is how other ethtool-nl ops behave,
so pick the ethtool-nl consistency vs copying ioctl behavior.

There are two special cases here:
 1) resetting the table to defaults;
 2) support for tables of different size.

For (1) I use an empty Netlink attribute (array of size 0).

(2) may require some background. AFAICT a lot of modern devices
allow allocating RSS tables of different sizes. mlx5 can upsize
its tables, bnxt has some "table size calculation", and Intel
folks asked about RSS table sizing in context of resource allocation
in the past. The ethtool IOCTL API has a concept of table size,
but right now the user is expected to provide a table exactly
the size the device requests. Some drivers may change the table
size at runtime (in response to queue count changes) but the
user is not in control of this. What's not great is that all
RSS contexts share the same table size. For example a device
with 128 queues enabled, 16 RSS contexts 8 queues in each will
likely have 256 entry tables for each of the 16 contexts,
while 32 would be more than enough given each context only has
8 queues. To address this the Netlink API should avoid enforcing
table size at the uAPI level, and should allow the user to express
the min table size they expect.

To fully solve (2) we will need more driver plumbing but
at the uAPI level this patch allows the user to specify
a table size smaller than what the device advertises. The device
table size must be a multiple of the user requested table size.
We then replicate the user-provided table to fill the full device
size table. This addresses the "allow the user to express the min
table size" objective, while not enforcing any fixed size.
From Netlink perspective .get_rxfh_indir_size() is now de facto
the "max" table size supported by the device.

We may choose to support table replication in ethtool, too,
when we actually plumb this thru the device APIs.

Initially I was considering moving full pattern generation
to the kernel (which queues to use, at which frequency and
what min sequence length). I don't think this complexity
would buy us much and most if not all devices have pow-2
table sizes, which simplifies the replication a lot.

Reviewed-by: Gal Pressman <gal@nvidia.com>
Link: https://patch.msgid.link/20250716000331.1378807-2-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool_netlink_generated.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 96027e26ffba..130bdf5c3516 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -840,6 +840,7 @@ enum {
 	ETHTOOL_MSG_PHY_GET,
 	ETHTOOL_MSG_TSCONFIG_GET,
 	ETHTOOL_MSG_TSCONFIG_SET,
+	ETHTOOL_MSG_RSS_SET,
 
 	__ETHTOOL_MSG_USER_CNT,
 	ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1)
-- 
cgit 


From 6624a0af82a6e3a4d3609264ef591a8fa3467139 Mon Sep 17 00:00:00 2001
From: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Date: Thu, 17 Jul 2025 17:42:02 +1000
Subject: wifi: cfg80211: support configuring an S1G short beaconing BSS

S1G short beacons are an optional frame type used in an S1G BSS
that contain a limited set of elements. While they are optional,
they are a fundamental part of S1G that enables significant
power saving.

Expose 2 additional netlink attributes,
NL80211_ATTR_S1G_LONG_BEACON_PERIOD which denotes the number of beacon
intervals between each long beacon and NL80211_ATTR_S1G_SHORT_BEACON
which is a nested attribute containing the short beacon tail and
head. We split them as the long beacon period cannot be updated,
and is only used when initialisng the interface, whereas the short
beacon data can be used to both initialise and update the templates.
This follows how things such as the beacon interval and DTIM period
currently operate.

During the initialisation path, we ensure we have the long beacon
period if the short beacon data is being passed down, whereas
the update path will simply update the template if its sent down.

The short beacon data is validated using the same routines for regular
beacons as they support correctly parsing the short beacon format
while ensuring the frame is well-formed.

Signed-off-by: Lachlan Hodges <lachlan.hodges@morsemicro.com>
Link: https://patch.msgid.link/20250717074205.312577-2-lachlan.hodges@morsemicro.com
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 39460334dafb..d1a14f2892d9 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -2915,6 +2915,19 @@ enum nl80211_commands {
  *	applicable to that specific radio only. If the radio id is greater
  *	thank the number of radios, error denoting invalid value is returned.
  *
+ * @NL80211_ATTR_S1G_LONG_BEACON_PERIOD: (u8) Integer attribute that represents
+ *	the number of beacon intervals between each long beacon transmission
+ *	for an S1G BSS with short beaconing enabled. This is a required
+ *	attribute for initialising an S1G short beaconing BSS. When updating
+ *	the short beacon data, this is not required. It has a minimum value of
+ *	2 (i.e 2 beacon intervals).
+ *
+ * @NL80211_ATTR_S1G_SHORT_BEACON: Nested attribute containing the short beacon
+ *	head and tail used to set or update the short beacon templates. When
+ *	bringing up a new interface, %NL80211_ATTR_S1G_LONG_BEACON_PERIOD is
+ *	required alongside this attribute. Refer to
+ *	@enum nl80211_s1g_short_beacon_attrs for the attribute definitions.
+ *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
  * @__NL80211_ATTR_AFTER_LAST: internal use
@@ -3474,6 +3487,9 @@ enum nl80211_attrs {
 
 	NL80211_ATTR_WIPHY_RADIO_INDEX,
 
+	NL80211_ATTR_S1G_LONG_BEACON_PERIOD,
+	NL80211_ATTR_S1G_SHORT_BEACON,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
@@ -8148,4 +8164,27 @@ enum nl80211_wiphy_radio_freq_range {
 	NL80211_WIPHY_RADIO_FREQ_ATTR_MAX = __NL80211_WIPHY_RADIO_FREQ_ATTR_LAST - 1,
 };
 
+/**
+ * enum nl80211_s1g_short_beacon_attrs - S1G short beacon data
+ *
+ * @__NL80211_S1G_SHORT_BEACON_ATTR_INVALID: Invalid
+ *
+ * @NL80211_S1G_SHORT_BEACON_ATTR_HEAD: Short beacon head (binary).
+ * @NL80211_S1G_SHORT_BEACON_ATTR_TAIL: Short beacon tail (binary).
+ *
+ * @__NL80211_S1G_SHORT_BEACON_ATTR_LAST: Internal
+ * @NL80211_S1G_SHORT_BEACON_ATTR_MAX: Highest attribute
+ */
+enum nl80211_s1g_short_beacon_attrs {
+	__NL80211_S1G_SHORT_BEACON_ATTR_INVALID,
+
+	NL80211_S1G_SHORT_BEACON_ATTR_HEAD,
+	NL80211_S1G_SHORT_BEACON_ATTR_TAIL,
+
+	/* keep last */
+	__NL80211_S1G_SHORT_BEACON_ATTR_LAST,
+	NL80211_S1G_SHORT_BEACON_ATTR_MAX =
+		__NL80211_S1G_SHORT_BEACON_ATTR_LAST - 1
+};
+
 #endif /* __LINUX_NL80211_H */
-- 
cgit 


From a166ab7816c534973745b0fe7bce3c8cefc5426f Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 17 Jul 2025 16:43:41 -0700
Subject: ethtool: rss: support creating contexts via Netlink

Support creating contexts via Netlink. Setting flow hashing
fields on the new context is not supported at this stage,
it can be added later.

An empty indirection table is not supported. This is a carry
over from the IOCTL interface where empty indirection table
meant delete. We can repurpose empty indirection table in
Netlink but for now to avoid confusion reject it using the
policy.

Support letting user choose the ID for the new context. This was
not possible in IOCTL since the context ID field for the create
action had to be set to the ETH_RXFH_CONTEXT_ALLOC magic value.

Link: https://patch.msgid.link/20250717234343.2328602-7-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool_netlink_generated.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index 130bdf5c3516..dea77abd295f 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -841,6 +841,7 @@ enum {
 	ETHTOOL_MSG_TSCONFIG_GET,
 	ETHTOOL_MSG_TSCONFIG_SET,
 	ETHTOOL_MSG_RSS_SET,
+	ETHTOOL_MSG_RSS_CREATE_ACT,
 
 	__ETHTOOL_MSG_USER_CNT,
 	ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1)
@@ -898,6 +899,8 @@ enum {
 	ETHTOOL_MSG_TSCONFIG_SET_REPLY,
 	ETHTOOL_MSG_PSE_NTF,
 	ETHTOOL_MSG_RSS_NTF,
+	ETHTOOL_MSG_RSS_CREATE_ACT_REPLY,
+	ETHTOOL_MSG_RSS_CREATE_NTF,
 
 	__ETHTOOL_MSG_KERNEL_CNT,
 	ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1)
-- 
cgit 


From fbe09277fa6324b50cc4eedb4d99498cf7dad897 Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Thu, 17 Jul 2025 16:43:42 -0700
Subject: ethtool: rss: support removing contexts via Netlink

Implement removing additional RSS contexts via Netlink.
Technically it'd be possible to shoehorn the delete operation
into ethnl_request_ops-compatible handler. The code ends
up longer than open coded version, and I think we'll need
a custom way of sending notifications at some stage (if we
allow tying the context lifetime to the netlink socket, in
the future).

Link: https://patch.msgid.link/20250717234343.2328602-8-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ethtool_netlink_generated.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool_netlink_generated.h b/include/uapi/linux/ethtool_netlink_generated.h
index dea77abd295f..e3b8813465d7 100644
--- a/include/uapi/linux/ethtool_netlink_generated.h
+++ b/include/uapi/linux/ethtool_netlink_generated.h
@@ -842,6 +842,7 @@ enum {
 	ETHTOOL_MSG_TSCONFIG_SET,
 	ETHTOOL_MSG_RSS_SET,
 	ETHTOOL_MSG_RSS_CREATE_ACT,
+	ETHTOOL_MSG_RSS_DELETE_ACT,
 
 	__ETHTOOL_MSG_USER_CNT,
 	ETHTOOL_MSG_USER_MAX = (__ETHTOOL_MSG_USER_CNT - 1)
@@ -901,6 +902,7 @@ enum {
 	ETHTOOL_MSG_RSS_NTF,
 	ETHTOOL_MSG_RSS_CREATE_ACT_REPLY,
 	ETHTOOL_MSG_RSS_CREATE_NTF,
+	ETHTOOL_MSG_RSS_DELETE_NTF,
 
 	__ETHTOOL_MSG_KERNEL_CNT,
 	ETHTOOL_MSG_KERNEL_MAX = (__ETHTOOL_MSG_KERNEL_CNT - 1)
-- 
cgit 


From 1bbdb81a98363fd5cd0c2ac16ad5346bdf814dff Mon Sep 17 00:00:00 2001
From: Carolina Jubran <cjubran@nvidia.com>
Date: Tue, 22 Jul 2025 12:13:29 +0300
Subject: devlink: Fix excessive stack usage in rate TC bandwidth parsing

The devlink_nl_rate_tc_bw_parse function uses a large stack array for
devlink attributes, which triggers a warning about excessive stack
usage:

net/devlink/rate.c: In function 'devlink_nl_rate_tc_bw_parse':
net/devlink/rate.c:382:1: error: the frame size of 1648 bytes is larger than 1536 bytes [-Werror=frame-larger-than=]

Introduce a separate attribute set specifically for rate TC bandwidth
parsing that only contains the two attributes actually used: index
and bandwidth. This reduces the stack array from DEVLINK_ATTR_MAX
entries to just 2 entries, solving the stack usage issue.

Update devlink selftest to use the new 'index' and 'bw' attribute names
consistent with the YAML spec.

Example usage with ynl with the new spec:

    ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \
      --do rate-set --json '{
      "bus-name": "pci",
      "dev-name": "0000:08:00.0",
      "port-index": 1,
      "rate-tc-bws": [
        {"index": 0, "bw": 50},
        {"index": 1, "bw": 50},
        {"index": 2, "bw": 0},
        {"index": 3, "bw": 0},
        {"index": 4, "bw": 0},
        {"index": 5, "bw": 0},
        {"index": 6, "bw": 0},
        {"index": 7, "bw": 0}
      ]
    }'

    ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \
      --do rate-get --json '{
      "bus-name": "pci",
      "dev-name": "0000:08:00.0",
      "port-index": 1
    }'

    output for rate-get:
    {'bus-name': 'pci',
     'dev-name': '0000:08:00.0',
     'port-index': 1,
     'rate-tc-bws': [{'bw': 50, 'index': 0},
                     {'bw': 50, 'index': 1},
                     {'bw': 0, 'index': 2},
                     {'bw': 0, 'index': 3},
                     {'bw': 0, 'index': 4},
                     {'bw': 0, 'index': 5},
                     {'bw': 0, 'index': 6},
                     {'bw': 0, 'index': 7}],
     'rate-tx-max': 0,
     'rate-tx-priority': 0,
     'rate-tx-share': 0,
     'rate-tx-weight': 0,
     'rate-type': 'leaf'}

Fixes: 566e8f108fc7 ("devlink: Extend devlink rate API with traffic classes bandwidth management")
Reported-by: Arnd Bergmann <arnd@arndb.de>
Closes: https://lore.kernel.org/netdev/20250708160652.1810573-1-arnd@kernel.org/
Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202507171943.W7DJcs6Y-lkp@intel.com/
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Tested-by: Carolina Jubran <cjubran@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Link: https://patch.msgid.link/1753175609-330621-1-git-send-email-tariqt@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/devlink.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
index e72bcc239afd..9fcb25a0f447 100644
--- a/include/uapi/linux/devlink.h
+++ b/include/uapi/linux/devlink.h
@@ -635,8 +635,6 @@ enum devlink_attr {
 	DEVLINK_ATTR_REGION_DIRECT,		/* flag */
 
 	DEVLINK_ATTR_RATE_TC_BWS,		/* nested */
-	DEVLINK_ATTR_RATE_TC_INDEX,		/* u8 */
-	DEVLINK_ATTR_RATE_TC_BW,		/* u32 */
 
 	/* Add new attributes above here, update the spec in
 	 * Documentation/netlink/specs/devlink.yaml and re-generate
@@ -647,6 +645,15 @@ enum devlink_attr {
 	DEVLINK_ATTR_MAX = __DEVLINK_ATTR_MAX - 1
 };
 
+enum devlink_rate_tc_attr {
+	DEVLINK_RATE_TC_ATTR_UNSPEC,
+	DEVLINK_RATE_TC_ATTR_INDEX,		/* u8 */
+	DEVLINK_RATE_TC_ATTR_BW,		/* u32 */
+
+	__DEVLINK_RATE_TC_ATTR_MAX,
+	DEVLINK_RATE_TC_ATTR_MAX = __DEVLINK_RATE_TC_ATTR_MAX - 1
+};
+
 /* Mapping between internal resource described by the field and system
  * structure
  */
-- 
cgit 


From 320d031ad6e4d67e8e1ab08ac71efda02bc85683 Mon Sep 17 00:00:00 2001
From: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Date: Tue, 22 Jul 2025 11:59:10 +0200
Subject: sched: Struct definition and parsing of dualpi2 qdisc

DualPI2 is the reference implementation of IETF RFC9332 DualQ Coupled
AQM (https://datatracker.ietf.org/doc/html/rfc9332) providing two
queues called low latency (L-queue) and classic (C-queue). By default,
it enqueues non-ECN and ECT(0) packets into the C-queue and ECT(1) and
CE packets into the low latency queue (L-queue), as per IETF RFC9332 spec.

This patch defines the dualpi2 Qdisc structure and parsing, and the
following two patches include dumping and enqueue/dequeue for the DualPI2.

Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Link: https://patch.msgid.link/20250722095915.24485-2-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/pkt_sched.h | 53 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 3e41349f3fa2..75d685ea8368 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1211,4 +1211,57 @@ enum {
 
 #define TCA_ETS_MAX (__TCA_ETS_MAX - 1)
 
+/* DUALPI2 */
+enum tc_dualpi2_drop_overload {
+	TC_DUALPI2_DROP_OVERLOAD_OVERFLOW = 0,
+	TC_DUALPI2_DROP_OVERLOAD_DROP = 1,
+	__TCA_DUALPI2_DROP_OVERLOAD_MAX,
+};
+#define TCA_DUALPI2_DROP_OVERLOAD_MAX (__TCA_DUALPI2_DROP_OVERLOAD_MAX - 1)
+
+enum tc_dualpi2_drop_early {
+	TC_DUALPI2_DROP_EARLY_DROP_DEQUEUE = 0,
+	TC_DUALPI2_DROP_EARLY_DROP_ENQUEUE = 1,
+	__TCA_DUALPI2_DROP_EARLY_MAX,
+};
+#define TCA_DUALPI2_DROP_EARLY_MAX (__TCA_DUALPI2_DROP_EARLY_MAX - 1)
+
+enum tc_dualpi2_ecn_mask {
+	TC_DUALPI2_ECN_MASK_L4S_ECT = 1,
+	TC_DUALPI2_ECN_MASK_CLA_ECT = 2,
+	TC_DUALPI2_ECN_MASK_ANY_ECT = 3,
+	__TCA_DUALPI2_ECN_MASK_MAX,
+};
+#define TCA_DUALPI2_ECN_MASK_MAX (__TCA_DUALPI2_ECN_MASK_MAX - 1)
+
+enum tc_dualpi2_split_gso {
+	TC_DUALPI2_SPLIT_GSO_NO_SPLIT_GSO = 0,
+	TC_DUALPI2_SPLIT_GSO_SPLIT_GSO = 1,
+	__TCA_DUALPI2_SPLIT_GSO_MAX,
+};
+#define TCA_DUALPI2_SPLIT_GSO_MAX (__TCA_DUALPI2_SPLIT_GSO_MAX - 1)
+
+enum {
+	TCA_DUALPI2_UNSPEC,
+	TCA_DUALPI2_LIMIT,		/* Packets */
+	TCA_DUALPI2_MEMORY_LIMIT,	/* Bytes */
+	TCA_DUALPI2_TARGET,		/* us */
+	TCA_DUALPI2_TUPDATE,		/* us */
+	TCA_DUALPI2_ALPHA,		/* Hz scaled up by 256 */
+	TCA_DUALPI2_BETA,		/* Hz scaled up by 256 */
+	TCA_DUALPI2_STEP_THRESH_PKTS,	/* Step threshold in packets */
+	TCA_DUALPI2_STEP_THRESH_US,	/* Step threshold in microseconds */
+	TCA_DUALPI2_MIN_QLEN_STEP,	/* Minimum qlen to apply STEP_THRESH */
+	TCA_DUALPI2_COUPLING,		/* Coupling factor between queues */
+	TCA_DUALPI2_DROP_OVERLOAD,	/* Whether to drop on overload */
+	TCA_DUALPI2_DROP_EARLY,		/* Whether to drop on enqueue */
+	TCA_DUALPI2_C_PROTECTION,	/* Percentage */
+	TCA_DUALPI2_ECN_MASK,		/* L4S queue classification mask */
+	TCA_DUALPI2_SPLIT_GSO,		/* Split GSO packets at enqueue */
+	TCA_DUALPI2_PAD,
+	__TCA_DUALPI2_MAX
+};
+
+#define TCA_DUALPI2_MAX   (__TCA_DUALPI2_MAX - 1)
+
 #endif
-- 
cgit 


From d4de8bffbef4a7e4ad14b9fd2ff8e2d0e06b3fa5 Mon Sep 17 00:00:00 2001
From: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Date: Tue, 22 Jul 2025 11:59:11 +0200
Subject: sched: Dump configuration and statistics of dualpi2 qdisc

The configuration and statistics dump of the DualPI2 Qdisc provides
information related to both queues, such as packet numbers and queuing
delays in the L-queue and C-queue, as well as general information such as
probability value, WRR credits, memory usage, packet marking counters, max
queue size, etc.

The following patch includes enqueue/dequeue for DualPI2.

Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Link: https://patch.msgid.link/20250722095915.24485-3-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/pkt_sched.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 75d685ea8368..c2da76e78bad 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -1264,4 +1264,19 @@ enum {
 
 #define TCA_DUALPI2_MAX   (__TCA_DUALPI2_MAX - 1)
 
+struct tc_dualpi2_xstats {
+	__u32 prob;		/* current probability */
+	__u32 delay_c;		/* current delay in C queue */
+	__u32 delay_l;		/* current delay in L queue */
+	__u32 packets_in_c;	/* number of packets enqueued in C queue */
+	__u32 packets_in_l;	/* number of packets enqueued in L queue */
+	__u32 maxq;		/* maximum queue size */
+	__u32 ecn_mark;		/* packets marked with ecn*/
+	__u32 step_marks;	/* ECN marks due to the step AQM */
+	__s32 credit;		/* current c_protection credit */
+	__u32 memory_used;	/* Memory used by both queues */
+	__u32 max_memory_used;	/* Maximum used memory */
+	__u32 memory_limit;	/* Memory limit of both queues */
+};
+
 #endif
-- 
cgit 


From 8e7583a4f65f3dbf3e8deb4e60f3679c276bef62 Mon Sep 17 00:00:00 2001
From: Samiullah Khawaja <skhawaja@google.com>
Date: Wed, 23 Jul 2025 01:30:31 +0000
Subject: net: define an enum for the napi threaded state

Instead of using '0' and '1' for napi threaded state use an enum with
'disabled' and 'enabled' states.

Tested:
 ./tools/testing/selftests/net/nl_netdev.py
 TAP version 13
 1..7
 ok 1 nl_netdev.empty_check
 ok 2 nl_netdev.lo_check
 ok 3 nl_netdev.page_pool_check
 ok 4 nl_netdev.napi_list_check
 ok 5 nl_netdev.dev_set_threaded
 ok 6 nl_netdev.napi_set_threaded
 ok 7 nl_netdev.nsim_rxq_reset_down
 # Totals: pass:7 fail:0 xfail:0 xpass:0 skip:0 error:0

Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Link: https://patch.msgid.link/20250723013031.2911384-4-skhawaja@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/netdev.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netdev.h b/include/uapi/linux/netdev.h
index 1f3719a9a0eb..48eb49aa03d4 100644
--- a/include/uapi/linux/netdev.h
+++ b/include/uapi/linux/netdev.h
@@ -77,6 +77,11 @@ enum netdev_qstats_scope {
 	NETDEV_QSTATS_SCOPE_QUEUE = 1,
 };
 
+enum netdev_napi_threaded {
+	NETDEV_NAPI_THREADED_DISABLED,
+	NETDEV_NAPI_THREADED_ENABLED,
+};
+
 enum {
 	NETDEV_A_DEV_IFINDEX = 1,
 	NETDEV_A_DEV_PAD,
-- 
cgit 


From bc8c43adfdc57c8253884fc1853cb6679cd5953d Mon Sep 17 00:00:00 2001
From: Phil Sutter <phil@nwl.cc>
Date: Tue, 8 Jul 2025 15:04:02 +0200
Subject: netfilter: nfnetlink_hook: Dump flowtable info

Introduce NFNL_HOOK_TYPE_NFT_FLOWTABLE to distinguish flowtable hooks
from base chain ones. Nested attributes are shared with the old NFTABLES
hook info type since they fit apart from their misleading name.

Old nftables in user space will ignore this new hook type and thus
continue to print flowtable hooks just like before, e.g.:

| family netdev {
| 	hook ingress device test0 {
| 		 0000000000 nf_flow_offload_ip_hook [nf_flow_table]
| 	}
| }

With this patch in place and support for the new hook info type, output
becomes more useful:

| family netdev {
| 	hook ingress device test0 {
| 		 0000000000 flowtable ip mytable myft [nf_flow_table]
| 	}
| }

Suggested-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Phil Sutter <phil@nwl.cc>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nfnetlink_hook.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nfnetlink_hook.h b/include/uapi/linux/netfilter/nfnetlink_hook.h
index 84a561a74b98..1a2c4d6424b5 100644
--- a/include/uapi/linux/netfilter/nfnetlink_hook.h
+++ b/include/uapi/linux/netfilter/nfnetlink_hook.h
@@ -61,10 +61,12 @@ enum nfnl_hook_chain_desc_attributes {
  *
  * @NFNL_HOOK_TYPE_NFTABLES: nf_tables base chain
  * @NFNL_HOOK_TYPE_BPF: bpf program
+ * @NFNL_HOOK_TYPE_NFT_FLOWTABLE: nf_tables flowtable
  */
 enum nfnl_hook_chaintype {
 	NFNL_HOOK_TYPE_NFTABLES = 0x1,
 	NFNL_HOOK_TYPE_BPF,
+	NFNL_HOOK_TYPE_NFT_FLOWTABLE,
 };
 
 /**
-- 
cgit 


From f24987ef6959a7efaf79bffd265522c3df18d431 Mon Sep 17 00:00:00 2001
From: Gabriel Goller <g.goller@proxmox.com>
Date: Tue, 22 Jul 2025 10:18:45 +0200
Subject: ipv6: add `force_forwarding` sysctl to enable per-interface
 forwarding

It is currently impossible to enable ipv6 forwarding on a per-interface
basis like in ipv4. To enable forwarding on an ipv6 interface we need to
enable it on all interfaces and disable it on the other interfaces using
a netfilter rule. This is especially cumbersome if you have lots of
interfaces and only want to enable forwarding on a few. According to the
sysctl docs [0] the `net.ipv6.conf.all.forwarding` enables forwarding
for all interfaces, while the interface-specific
`net.ipv6.conf.<interface>.forwarding` configures the interface
Host/Router configuration.

Introduce a new sysctl flag `force_forwarding`, which can be set on every
interface. The ip6_forwarding function will then check if the global
forwarding flag OR the force_forwarding flag is active and forward the
packet.

To preserve backwards-compatibility reset the flag (on all interfaces)
to 0 if the net.ipv6.conf.all.forwarding flag is set to 0.

Add a short selftest that checks if a packet gets forwarded with and
without `force_forwarding`.

[0]: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt

Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Gabriel Goller <g.goller@proxmox.com>
Link: https://patch.msgid.link/20250722081847.132632-1-g.goller@proxmox.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/uapi/linux/ipv6.h    | 1 +
 include/uapi/linux/netconf.h | 1 +
 include/uapi/linux/sysctl.h  | 1 +
 3 files changed, 3 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index cf592d7b630f..d4d3ae774b26 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -199,6 +199,7 @@ enum {
 	DEVCONF_NDISC_EVICT_NOCARRIER,
 	DEVCONF_ACCEPT_UNTRACKED_NA,
 	DEVCONF_ACCEPT_RA_MIN_LFT,
+	DEVCONF_FORCE_FORWARDING,
 	DEVCONF_MAX
 };
 
diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h
index fac4edd55379..1c8c84d65ae3 100644
--- a/include/uapi/linux/netconf.h
+++ b/include/uapi/linux/netconf.h
@@ -19,6 +19,7 @@ enum {
 	NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
 	NETCONFA_INPUT,
 	NETCONFA_BC_FORWARDING,
+	NETCONFA_FORCE_FORWARDING,
 	__NETCONFA_MAX
 };
 #define NETCONFA_MAX	(__NETCONFA_MAX - 1)
diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h
index 8981f00204db..63d1464cb71c 100644
--- a/include/uapi/linux/sysctl.h
+++ b/include/uapi/linux/sysctl.h
@@ -573,6 +573,7 @@ enum {
 	NET_IPV6_ACCEPT_RA_FROM_LOCAL=26,
 	NET_IPV6_ACCEPT_RA_RT_INFO_MIN_PLEN=27,
 	NET_IPV6_RA_DEFRTR_METRIC=28,
+	NET_IPV6_FORCE_FORWARDING=29,
 	__NET_IPV6_MAX
 };
 
-- 
cgit