From 7b6f08771bf6045c32a2a1e18201c1aeeb78d72a Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Wed, 7 Sep 2022 10:34:30 +0300 Subject: wifi: ieee80211: Support validating ML station profile length Add a function to validate EHT Multi-Link per station profile length. Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 79690938d9a2..bdf668f9dace 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4685,6 +4685,46 @@ struct ieee80211_mle_per_sta_profile { u8 variable[]; } __packed; +/** + * ieee80211_mle_sta_prof_size_ok - validate multi-link element sta profile size + * @data: pointer to the sub element data + * @len: length of the containing sub element + */ +static inline bool ieee80211_mle_sta_prof_size_ok(const u8 *data, size_t len) +{ + const struct ieee80211_mle_per_sta_profile *prof = (const void *)data; + u16 control; + u8 fixed = sizeof(*prof); + u8 info_len = 1; + + if (len < fixed) + return false; + + control = le16_to_cpu(prof->control); + + if (control & IEEE80211_MLE_STA_CONTROL_STA_MAC_ADDR_PRESENT) + info_len += 6; + if (control & IEEE80211_MLE_STA_CONTROL_BEACON_INT_PRESENT) + info_len += 2; + if (control & IEEE80211_MLE_STA_CONTROL_TSF_OFFS_PRESENT) + info_len += 8; + if (control & IEEE80211_MLE_STA_CONTROL_DTIM_INFO_PRESENT) + info_len += 2; + if (control & IEEE80211_MLE_STA_CONTROL_BSS_PARAM_CHANGE_CNT_PRESENT) + info_len += 1; + + if (control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE && + control & IEEE80211_MLE_STA_CONTROL_NSTR_LINK_PAIR_PRESENT) { + if (control & IEEE80211_MLE_STA_CONTROL_NSTR_BITMAP_SIZE) + info_len += 2; + else + info_len += 1; + } + + return prof->sta_info_len >= info_len && + fixed + prof->sta_info_len <= len; +} + #define for_each_mle_subelement(_elem, _data, _len) \ if (ieee80211_mle_size_ok(_data, _len)) \ for_each_element(_elem, \ -- cgit From 1403b109c9a5244dc6ab79154f04eecc209ef3d2 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Wed, 7 Sep 2022 17:23:09 +0300 Subject: wifi: cfg80211/mac80211: Fix ML element common size calculation The common size is part of the length in the data so don't add it again.
Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index bdf668f9dace..442b13333da8 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4573,18 +4573,17 @@ static inline u8 ieee80211_mle_common_size(const u8 *data) switch (u16_get_bits(control, IEEE80211_ML_CONTROL_TYPE)) { case IEEE80211_ML_CONTROL_TYPE_BASIC: - common += sizeof(struct ieee80211_mle_basic_common_info); - break; case IEEE80211_ML_CONTROL_TYPE_PREQ: - common += sizeof(struct ieee80211_mle_preq_common_info); + case IEEE80211_ML_CONTROL_TYPE_TDLS: + /* + * The length is the first octet pointed by mle->variable so no + * need to add anything + */ break; case IEEE80211_ML_CONTROL_TYPE_RECONF: if (control & IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR) common += ETH_ALEN; return common; - case IEEE80211_ML_CONTROL_TYPE_TDLS: - common += sizeof(struct ieee80211_mle_tdls_common_info); - break; case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: if (control & IEEE80211_MLC_PRIO_ACCESS_PRES_AP_MLD_MAC_ADDR) common += ETH_ALEN; -- cgit From fb99c7d4d6d0fb4fe5a953e0c5f6c37a5b796b98 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Sun, 11 Sep 2022 16:55:12 +0300 Subject: wifi: cfg80211/mac80211: Fix ML element common size validation The Multi-Link element can be fragmented, thus its size can exceed 254. Thus, modify ieee80211_mle_size_ok() to use 'size_t len' instead of 'u8 len'. Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 442b13333da8..b935a85b2f44 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4601,7 +4601,7 @@ static inline u8 ieee80211_mle_common_size(const u8 *data) * @data: pointer to the element data * @len: length of the containing element */ -static inline bool ieee80211_mle_size_ok(const u8 *data, u8 len) +static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) { const struct ieee80211_multi_link_elem *mle = (const void *)data; u8 fixed = sizeof(*mle); -- cgit From 45ebac4f059b92906e7e86dd1a780739f883857c Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Tue, 6 Sep 2022 11:48:56 +0300 Subject: wifi: mac80211: Parse station profile from association response When processing an association response frame for a Multi-Link connection, extract the per station profile for each additional link, and use it for parsing the link elements. As the Multi-Link element might be fragmented, add support for reassembling a fragmented element. To simplify memory management logic, extend 'struct ieee802_11_elems' to hold a scratch buffer, which is used for the defragmentation. Once an element is reconstructed in the scratch area, point the corresponding element pointer to it. Currently only defragmentation of Multi-Link element and the contained per-STA profile subelement is supported. 
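A minimal usage sketch (not part of the patches above; the walker function is made up for illustration) of how a consumer might combine these validators once any fragmented element has been reassembled; note for_each_mle_subelement() already performs the ieee80211_mle_size_ok() check before iterating:

	static void walk_mle_sta_profiles(const u8 *mle_data, size_t mle_len)
	{
		const struct element *sub;

		for_each_mle_subelement(sub, mle_data, mle_len) {
			if (sub->id != IEEE80211_MLE_SUBELEM_PER_STA_PROFILE)
				continue;

			/* reject truncated profiles before reading them */
			if (!ieee80211_mle_sta_prof_size_ok(sub->data,
							    sub->datalen))
				continue;

			/* the fixed fields and sta_info_len octets are now
			 * known to be within bounds
			 */
		}
	}
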
Signed-off-by: Ilan Peer Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index b935a85b2f44..f51e939f28f2 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4666,6 +4666,7 @@ static inline bool ieee80211_mle_size_ok(const u8 *data, size_t len) enum ieee80211_mle_subelems { IEEE80211_MLE_SUBELEM_PER_STA_PROFILE = 0, + IEEE80211_MLE_SUBELEM_FRAGMENT = 254, }; #define IEEE80211_MLE_STA_CONTROL_LINK_ID 0x000f -- cgit From 1177aaa7fe9373c762cd5bf5f5de8517bac989d5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 17 Sep 2022 03:14:53 +0200 Subject: wifi: fix multi-link element subelement iteration The subelements obviously start after the common data, including the common multi-link element structure definition itself. This bug was possibly just hidden by the higher bits of the control being set to 0, so the iteration just found one bogus element and most of the code could continue anyway. Fixes: 0f48b8b88aa9 ("wifi: ieee80211: add definitions for multi-link element") Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index f51e939f28f2..6252f02f38b7 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -4593,7 +4593,7 @@ static inline u8 ieee80211_mle_common_size(const u8 *data) return 0; } - return common + mle->variable[0]; + return sizeof(*mle) + common + mle->variable[0]; } /** -- cgit From a6a6163b9a934df5965792d60af1f53aa51fdd39 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 7 Oct 2022 10:53:03 +0200 Subject: mac802154: Introduce filtering levels The 802.15.4 specification details several filtering levels that the PHY and the MAC can operate in. The amount of filtering will vary if they are in promiscuous mode or in scanning mode. Otherwise they are expected to do some very basic checks, such as enforcing the frame validity. Either the PHY is able to do so, and the MAC has nothing to do, or the PHY has a lower filtering level than expected and the MAC should take over. For now we just define these levels in an enumeration. As a second step, we will add a per-PHY parameter showing the expected filtering level as well as a per-device current filtering level, and will initialize all these fields. As a third step, we will use them to apply more filtering by software when the PHY is limited. Indeed, if the drivers know they cannot reach the requested level of filtering, they will overwrite the "current filtering" parameter so that it reflects what they do. Then, in the core, the expected filtering level will be used to decide whether some additional software processing is needed or not.
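As an illustration of where the enumeration is headed, a sketch of the software-filtering decision described above; the helper name and the idea of comparing the two levels directly are assumptions here, since the expected/current fields only land in later patches of this series:

	/* hypothetical helper: filter in software whenever the PHY does
	 * less filtering than the MAC expects
	 */
	static bool mac802154_needs_sw_filtering(enum ieee802154_filtering_level expected,
						 enum ieee802154_filtering_level actual)
	{
		return actual < expected;
	}
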
Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20221007085310.503366-2-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/linux/ieee802154.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee802154.h b/include/linux/ieee802154.h index f1f9412b6ac6..0303eb84d596 100644 --- a/include/linux/ieee802154.h +++ b/include/linux/ieee802154.h @@ -276,6 +276,30 @@ enum { IEEE802154_SYSTEM_ERROR = 0xff, }; +/** + * enum ieee802154_filtering_level - Filtering levels applicable to a PHY + * + * @IEEE802154_FILTERING_NONE: No filtering at all, what is received is + * forwarded to the softMAC + * @IEEE802154_FILTERING_1_FCS: First filtering level, frames with an invalid + * FCS should be dropped + * @IEEE802154_FILTERING_2_PROMISCUOUS: Second filtering level, promiscuous + * mode as described in the spec, identical in terms of filtering to the + * level one on PHY side, but at the MAC level the frame should be + * forwarded to the upper layer directly + * @IEEE802154_FILTERING_3_SCAN: Third filtering level, scan related, where + * only beacons must be processed, all remaining traffic gets dropped + * @IEEE802154_FILTERING_4_FRAME_FIELDS: Fourth filtering level actually + * enforcing the validity of the content of the frame with various checks + */ +enum ieee802154_filtering_level { + IEEE802154_FILTERING_NONE, + IEEE802154_FILTERING_1_FCS, + IEEE802154_FILTERING_2_PROMISCUOUS, + IEEE802154_FILTERING_3_SCAN, + IEEE802154_FILTERING_4_FRAME_FIELDS, +}; + /* frame control handling */ #define IEEE802154_FCTL_FTYPE 0x0003 #define IEEE802154_FCTL_ACKREQ 0x0020 -- cgit From e6c86c513f440bec5f1046539c7e3c6c653842da Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 14 Oct 2022 19:39:43 +0800 Subject: rcu-tasks: Provide rcu_trace_implies_rcu_gp() As an accident of implementation, an RCU Tasks Trace grace period also acts as an RCU grace period. However, this could change at any time. This commit therefore creates an rcu_trace_implies_rcu_gp() that currently returns true to codify this accident. Code relying on this accident must call this function to verify that this accident is still happening. Reported-by: Hou Tao Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Martin KaFai Lau Link: https://lore.kernel.org/r/20221014113946.965131-2-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/rcupdate.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 08605ce7379d..8822f06e4b40 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -240,6 +240,18 @@ static inline void exit_tasks_rcu_start(void) { } static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ +/** + * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period? + * + * As an accident of implementation, an RCU Tasks Trace grace period also + * acts as an RCU grace period. However, this could change at any time. + * Code relying on this accident must call this function to verify that + * this accident is still happening. + * + * You have been warned! 
+ */ +static inline bool rcu_trace_implies_rcu_gp(void) { return true; } + /** * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU * -- cgit From a2fd08448f2b7591fc7ec8dec2e5e025a43f0cee Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 17 Oct 2022 14:18:26 +0200 Subject: net: remove smc911x driver This driver was used on Arm and SH machines until 2009, when the last platforms moved to the smsc911x driver for the same hardware. Time to retire this version. Link: https://lore.kernel.org/netdev/1232010482-3744-1-git-send-email-steve.glendinning@smsc.com/ Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20221017121900.3520108-1-arnd@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/smc911x.h | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 include/linux/smc911x.h (limited to 'include/linux') diff --git a/include/linux/smc911x.h b/include/linux/smc911x.h deleted file mode 100644 index 8cace8189e74..000000000000 --- a/include/linux/smc911x.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __SMC911X_H__ -#define __SMC911X_H__ - -#define SMC911X_USE_16BIT (1 << 0) -#define SMC911X_USE_32BIT (1 << 1) - -struct smc911x_platdata { - unsigned long flags; - unsigned long irq_flags; /* IRQF_... */ - int irq_polarity; -}; - -#endif /* __SMC911X_H__ */ -- cgit From f392a1846489720fc2e063d1210633b6cf4ec5a4 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Mon, 17 Oct 2022 16:22:35 -0400 Subject: net: phylink: provide phylink_validate_mask_caps() helper Provide a helper that restricts the link modes according to the phylink capabilities. Signed-off-by: Russell King (Oracle) [rebased on net-next/master and added documentation] Signed-off-by: Sean Anderson Signed-off-by: David S. Miller --- include/linux/phylink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 664dd409feb9..c29c3f174972 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -556,6 +556,9 @@ void phylink_caps_to_linkmodes(unsigned long *linkmodes, unsigned long caps); unsigned long phylink_get_capabilities(phy_interface_t interface, unsigned long mac_capabilities, int rate_matching); +void phylink_validate_mask_caps(unsigned long *supported, + struct phylink_link_state *state, + unsigned long caps); void phylink_generic_validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); -- cgit From 51c352bdbcd23d7ce46b06c1e64c82754dc44044 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Tue, 18 Oct 2022 15:37:27 +0100 Subject: netlink: add support for formatted extack messages Include an 80-byte buffer in struct netlink_ext_ack that can be used for scnprintf()ed messages. This does mean that the resulting string can't be enumerated, translated etc. in the way NL_SET_ERR_MSG() was designed to allow. 
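A hedged usage sketch of the formatted variant this patch adds; the driver name, helper and MTU limit below are invented for illustration:

	#define MYDRV_MAX_MTU	9000	/* made-up device limit */

	static int mydrv_validate_mtu(int new_mtu, struct netlink_ext_ack *extack)
	{
		if (new_mtu > MYDRV_MAX_MTU) {
			/* the offending value can now be reported inline */
			NL_SET_ERR_MSG_FMT_MOD(extack,
					       "MTU %d exceeds device limit %d",
					       new_mtu, MYDRV_MAX_MTU);
			return -EINVAL;
		}
		return 0;
	}
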
Signed-off-by: Edward Cree Reviewed-by: Jakub Kicinski Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index d51e041d2242..d81bde5a5844 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -64,6 +64,7 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) /* this can be increased when necessary - don't expose to userland */ #define NETLINK_MAX_COOKIE_LEN 20 +#define NETLINK_MAX_FMTMSG_LEN 80 /** * struct netlink_ext_ack - netlink extended ACK report struct @@ -75,6 +76,8 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) * @miss_nest: nest missing an attribute (%NULL if missing top level attr) * @cookie: cookie data to return to userspace (for success) * @cookie_len: actual cookie data length + * @_msg_buf: output buffer for formatted message strings - don't access + * directly, use %NL_SET_ERR_MSG_FMT */ struct netlink_ext_ack { const char *_msg; @@ -84,13 +87,13 @@ struct netlink_ext_ack { u16 miss_type; u8 cookie[NETLINK_MAX_COOKIE_LEN]; u8 cookie_len; + char _msg_buf[NETLINK_MAX_FMTMSG_LEN]; }; /* Always use this macro, this allows later putting the * message into a separate section or such for things * like translation or listing all possible messages. - * Currently string formatting is not supported (due - * to the lack of an output buffer.) + * If string formatting is needed use NL_SET_ERR_MSG_FMT. */ #define NL_SET_ERR_MSG(extack, msg) do { \ static const char __msg[] = msg; \ @@ -102,9 +105,31 @@ struct netlink_ext_ack { __extack->_msg = __msg; \ } while (0) +/* We splice fmt with %s at each end even in the snprintf so that both calls + * can use the same string constant, avoiding its duplication in .ro + */ +#define NL_SET_ERR_MSG_FMT(extack, fmt, args...) do { \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (!__extack) \ + break; \ + if (snprintf(__extack->_msg_buf, NETLINK_MAX_FMTMSG_LEN, \ + "%s" fmt "%s", "", ##args, "") >= \ + NETLINK_MAX_FMTMSG_LEN) \ + net_warn_ratelimited("%s" fmt "%s", "truncated extack: ", \ + ##args, "\n"); \ + \ + do_trace_netlink_extack(__extack->_msg_buf); \ + \ + __extack->_msg = __extack->_msg_buf; \ +} while (0) + #define NL_SET_ERR_MSG_MOD(extack, msg) \ NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) +#define NL_SET_ERR_MSG_FMT_MOD(extack, fmt, args...) \ + NL_SET_ERR_MSG_FMT((extack), KBUILD_MODNAME ": " fmt, ##args) + #define NL_SET_BAD_ATTR_POLICY(extack, attr, pol) do { \ if ((extack)) { \ (extack)->bad_attr = (attr); \ -- cgit From a5ef058dc4d9a3e60d1808a0700e18e0e37e408e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 20 Oct 2022 19:48:51 +0200 Subject: net: introduce and use custom sockopt socket flag We will soon introduce custom setsockopt for UDP sockets, too. Instead of doing even more complex arbitrary checks inside sock_use_custom_sol_socket(), add a new socket flag and set it for the relevant socket types (currently only MPTCP). Reviewed-by: Matthieu Baerts Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Signed-off-by: David S. 
Miller --- include/linux/net.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/net.h b/include/linux/net.h index 711c3593c3b8..59350fd85823 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -41,6 +41,7 @@ struct net; #define SOCK_NOSPACE 2 #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 +#define SOCK_CUSTOM_SOCKOPT 5 #ifndef ARCH_HAS_SOCKET_TYPES /** -- cgit From 8a3854c7b8e4532063b14bed34115079b7d0cb36 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 20 Oct 2022 19:48:52 +0200 Subject: udp: track the forward memory release threshold in a hot cacheline When the receiver process and the BH run on different cores, udp_rmem_release() experiences a cache miss while accessing sk_rcvbuf, as the latter shares the same cacheline with sk_forward_alloc, written by the BH. With this patch, UDP tracks the rcvbuf value and its update via custom SOL_SOCKET socket options, and copies the forward memory threshold value used by udp_rmem_release() in a different cacheline, already accessed by the above function and uncontended. Since the UDP socket init operation has grown a bit, factor out the common code between v4 and v6 in a shared helper. Overall, the above gives a 10% peak throughput increase under UDP flood. Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/udp.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index e96da4157d04..5cdba00a904a 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -87,6 +87,9 @@ struct udp_sock { /* This field is dirtied by udp_recvmsg() */ int forward_deficit; + + /* This field follows the rcvbuf value, and is touched by udp_recvmsg */ + int forward_threshold; }; #define UDP_MAX_SEGMENTS (1 << 6UL) -- cgit From 4727bab4e9bbeafeff6acdfcb077a7a548cbde30 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Fri, 21 Oct 2022 10:58:22 +0800 Subject: net: skb: move skb_pp_recycle() to skbuff.c skb_pp_recycle() is only used by skb_free_head() in skbuff.c, so move it to skbuff.c. Signed-off-by: Yunsheng Lin Acked-by: Ilias Apalodimas Signed-off-by: David S. Miller --- include/linux/skbuff.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 7be5bb4c94b6..59c9fd55699d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -5050,12 +5050,5 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb) } #endif -static inline bool skb_pp_recycle(struct sk_buff *skb, void *data) -{ - if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) - return false; - return page_pool_return_skb_page(virt_to_page(data)); -} - #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ -- cgit From 398900498485fa9465386454681763d81efaad25 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 21 Oct 2022 16:09:59 +0100 Subject: net: sfp: provide a definition for the power level select bit Provide a named definition for the power level select bit in the extended status register, rather than using BIT(0) in the code.
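For illustration, a sketch of selecting the module's high power level using the new definition; sfp_write_a2() is an assumed accessor standing in for the driver's real A2h EEPROM write helper:

	u8 val = SFP_EXT_STATUS_PWRLVL_SELECT;
	int err;

	/* request high power level operation instead of open-coding BIT(0) */
	err = sfp_write_a2(sfp, SFP_EXT_STATUS, &val, sizeof(val));
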
Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/sfp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index d1f343853b6c..01ae9f1dd2ad 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -489,6 +489,8 @@ enum { SFP_WARN1_RXPWR_LOW = BIT(6), SFP_EXT_STATUS = 0x76, + SFP_EXT_STATUS_PWRLVL_SELECT = BIT(0), + SFP_VSL = 0x78, SFP_PAGE = 0x7f, }; -- cgit From 73feb8d5fa3b755bb51077c0aabfb6aa556fd498 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 25 Oct 2022 15:41:41 +0200 Subject: kallsyms: Make module_kallsyms_on_each_symbol generally available Making module_kallsyms_on_each_symbol generally available, so it can be used outside CONFIG_LIVEPATCH option in following changes. Rather than adding another ifdef option let's make the function generally available (when CONFIG_KALLSYMS and CONFIG_MODULES options are defined). Cc: Christoph Hellwig Acked-by: Song Liu Signed-off-by: Jiri Olsa Link: https://lore.kernel.org/r/20221025134148.3300700-2-jolsa@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/module.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/module.h b/include/linux/module.h index ec61fb53979a..35876e89eb93 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -879,8 +879,17 @@ static inline bool module_sig_ok(struct module *module) } #endif /* CONFIG_MODULE_SIG */ +#if defined(CONFIG_MODULES) && defined(CONFIG_KALLSYMS) int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data); +#else +static inline int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, unsigned long), + void *data) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_MODULES && CONFIG_KALLSYMS */ #endif /* _LINUX_MODULE_H */ -- cgit From b5f0de6df6dce8d641ef58ef7012f3304dffb9a1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Oct 2022 02:56:03 -0700 Subject: net: dev: Convert sa_data to flexible array in struct sockaddr One of the worst offenders of "fake flexible arrays" is struct sockaddr, as it is the classic example of why GCC and Clang have been traditionally forced to treat all trailing arrays as fake flexible arrays: in the distant misty past, sa_data became too small, and code started just treating it as a flexible array, even though it was fixed-size. The special case by the compiler is specifically that sizeof(sa->sa_data) and FORTIFY_SOURCE (which uses __builtin_object_size(sa->sa_data, 1)) do not agree (14 and -1 respectively), which makes FORTIFY_SOURCE treat it as a flexible array. However, the coming -fstrict-flex-arrays compiler flag will remove these special cases so that FORTIFY_SOURCE can gain coverage over all the trailing arrays in the kernel that are _not_ supposed to be treated as a flexible array. To deal with this change, convert sa_data to a true flexible array. To keep the structure size the same, move sa_data into a union with a newly introduced sa_data_min with the original size. The result is that FORTIFY_SOURCE can continue to have no idea how large sa_data may actually be, but anything using sizeof(sa->sa_data) must switch to sizeof(sa->sa_data_min). 
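A small sketch of what callers look like after the conversion; both union members alias the same 14 bytes of storage, so only the sizeof() expression changes:

	struct sockaddr sa = { .sa_family = AF_UNSPEC };

	/* was: memset(sa.sa_data, 0, sizeof(sa.sa_data)); */
	memset(sa.sa_data, 0, sizeof(sa.sa_data_min));

	/* the fixed-size view keeps the historical layout */
	BUILD_BUG_ON(sizeof(sa.sa_data_min) != 14);
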
Cc: Jens Axboe Cc: Pavel Begunkov Cc: David Ahern Cc: Dylan Yudaken Cc: Yajun Deng Cc: Petr Machata Cc: Hangbin Liu Cc: Leon Romanovsky Cc: syzbot Cc: Willem de Bruijn Cc: Pablo Neira Ayuso Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221018095503.never.671-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index de3701a2a212..13c3a237b9c9 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -33,7 +33,10 @@ typedef __kernel_sa_family_t sa_family_t; struct sockaddr { sa_family_t sa_family; /* address family, AF_xxx */ - char sa_data[14]; /* 14 bytes of protocol address */ + union { + char sa_data_min[14]; /* Minimum 14 bytes of protocol address */ + DECLARE_FLEX_ARRAY(char, sa_data); + }; }; struct linger { -- cgit From 271de525e1d7f564e88a9d212c50998b49a54476 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Oct 2022 11:45:16 -0700 Subject: bpf: Remove prog->active check for bpf_lsm and bpf_iter The commit 64696c40d03c ("bpf: Add __bpf_prog_{enter,exit}_struct_ops for struct_ops trampoline") removed the prog->active check for struct_ops progs. The bpf_lsm and bpf_iter programs also use trampolines. Like struct_ops, the bpf_lsm and bpf_iter have fixed hooks for the prog to attach to. The kernel does not call the same hook in a recursive way. This patch also removes the prog->active check for bpf_lsm and bpf_iter. A later patch has a test to reproduce the recursion issue for a sleepable bpf_lsm program. This patch appends the '_recur' naming to the existing enter and exit functions that track the prog->active counter. New __bpf_prog_{enter,exit}[_sleepable] functions are added to skip the prog->active tracking. The '_struct_ops' version is also removed. It also moves the decision on picking the enter and exit function to the new bpf_trampoline_{enter,exit}(). It returns the '_recur' ones for all tracing progs to use. For bpf_lsm, bpf_iter, struct_ops (no prog->active tracking after 64696c40d03c), and bpf_lsm_cgroup (no prog->active tracking after 69fd337a975c7), it will return the functions that don't track the prog->active.
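A sketch of the selection logic this describes; only the _sleepable_recur pair is declared in the header hunk below, so the remaining helper names here are assumptions for illustration:

	bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog)
	{
		bool sleepable = prog->aux->sleepable;

		/* tracing progs keep the prog->active recursion counter */
		if (bpf_prog_check_recur(prog))
			return sleepable ? __bpf_prog_enter_sleepable_recur :
					   __bpf_prog_enter_recur;

		/* bpf_lsm, bpf_iter, struct_ops, bpf_lsm_cgroup */
		return sleepable ? __bpf_prog_enter_sleepable :
				   __bpf_prog_enter;
	}
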
Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221025184524.3526117-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 24 ++++++++++-------------- include/linux/bpf_verifier.h | 15 ++++++++++++++- 2 files changed, 24 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9e7d46d16032..1279e699dc98 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -854,22 +854,18 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *i const struct btf_func_model *m, u32 flags, struct bpf_tramp_links *tlinks, void *orig_call); -/* these two functions are called from generated trampoline */ -u64 notrace __bpf_prog_enter(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_run_ctx *run_ctx); -u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx); -void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx); -u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, - struct bpf_tramp_run_ctx *run_ctx); -void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx); -u64 notrace __bpf_prog_enter_struct_ops(struct bpf_prog *prog, - struct bpf_tramp_run_ctx *run_ctx); -void notrace __bpf_prog_exit_struct_ops(struct bpf_prog *prog, u64 start, - struct bpf_tramp_run_ctx *run_ctx); +u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx); +void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); +typedef u64 (*bpf_trampoline_enter_t)(struct bpf_prog *prog, + struct bpf_tramp_run_ctx *run_ctx); +typedef void (*bpf_trampoline_exit_t)(struct bpf_prog *prog, u64 start, + struct bpf_tramp_run_ctx *run_ctx); +bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog); +bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog); struct bpf_ksym { unsigned long start; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 9e1e6965f407..1a32baa78ce2 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -642,10 +642,23 @@ static inline u32 type_flag(u32 type) } /* only use after check_attach_btf_id() */ -static inline enum bpf_prog_type resolve_prog_type(struct bpf_prog *prog) +static inline enum bpf_prog_type resolve_prog_type(const struct bpf_prog *prog) { return prog->type == BPF_PROG_TYPE_EXT ? prog->aux->dst_prog->type : prog->type; } +static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) +{ + switch (resolve_prog_type(prog)) { + case BPF_PROG_TYPE_TRACING: + return prog->expected_attach_type != BPF_TRACE_ITER; + case BPF_PROG_TYPE_STRUCT_OPS: + case BPF_PROG_TYPE_LSM: + return false; + default: + return true; + } +} + #endif /* _LINUX_BPF_VERIFIER_H */ -- cgit From 0593dd34e53489557569d5e6d27371b49aa9b41f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Oct 2022 11:45:17 -0700 Subject: bpf: Append _recur naming to the bpf_task_storage helper proto This patch adds the "_recur" naming to the bpf_task_storage_{get,delete} proto. 
In a later patch, they will only be used by the tracing programs that require deadlock detection, because a tracing prog may use bpf_task_storage_{get,delete} recursively and cause a deadlock. Another following patch will add a different helper proto for the non-tracing programs, because they do not need the deadlock prevention. This patch does the rename to prepare for these future proto additions. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221025184524.3526117-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1279e699dc98..b04fe3f4342e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2519,8 +2519,8 @@ extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; -extern const struct bpf_func_proto bpf_task_storage_get_proto; -extern const struct bpf_func_proto bpf_task_storage_delete_proto; +extern const struct bpf_func_proto bpf_task_storage_get_recur_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; -- cgit From 4279adb094a17132423f1271c3d11b593fc2327e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Oct 2022 11:45:20 -0700 Subject: bpf: Add new bpf_task_storage_get proto with no deadlock detection The bpf_lsm and bpf_iter programs do not recur in a way that would cause a deadlock. The situation is similar to bpf_pid_task_storage_lookup_elem(), which is called from the syscall map_lookup_elem. It does not need deadlock detection. Otherwise, it will cause unnecessary failure when calling the bpf_task_storage_get() helper. This patch adds a bpf_task_storage_get proto that does not do deadlock detection. It will be used by bpf_lsm and bpf_iter programs. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221025184524.3526117-6-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b04fe3f4342e..ef3f98afa45d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2520,6 +2520,7 @@ extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; extern const struct bpf_func_proto bpf_task_storage_get_recur_proto; +extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; -- cgit From 8a7dac37f27a3dfbd814bf29a73d6417db2c81d9 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 25 Oct 2022 11:45:22 -0700 Subject: bpf: Add new bpf_task_storage_delete proto with no deadlock detection The bpf_lsm and bpf_iter programs do not recur in a way that would cause a deadlock. The situation is similar to bpf_pid_task_storage_delete_elem(), which is called from the syscall map_delete_elem.
It does not need deadlock detection. Otherwise, it will cause unnecessary failure when calling the bpf_task_storage_delete() helper. This patch adds a bpf_task_storage_delete proto that does not do deadlock detection. It will be used by bpf_lsm and bpf_iter programs. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20221025184524.3526117-8-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ef3f98afa45d..a5dbac8f5aba 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2522,6 +2522,7 @@ extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; extern const struct bpf_func_proto bpf_task_storage_get_recur_proto; extern const struct bpf_func_proto bpf_task_storage_get_proto; extern const struct bpf_func_proto bpf_task_storage_delete_recur_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_proto; extern const struct bpf_func_proto bpf_for_each_map_elem_proto; extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto; extern const struct bpf_func_proto bpf_sk_setsockopt_proto; -- cgit From 5e67b8ef125bb6e83bf0f0442ad7ffc09e7956f9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 25 Oct 2022 21:28:40 -0700 Subject: bpf: Make struct cgroup btf id global Make struct cgroup btf id global so a later patch can reuse the same btf id. Acked-by: David Vernet Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221026042840.672602-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 2aea877d644f..c9744efd202f 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -265,5 +265,6 @@ MAX_BTF_TRACING_TYPE, }; extern u32 btf_tracing_ids[]; +extern u32 bpf_cgroup_btf_id[]; #endif -- cgit From c83597fa5dc6b322e9bdf929e5f4136a3f4aa4db Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 25 Oct 2022 21:28:45 -0700 Subject: bpf: Refactor some inode/task/sk storage functions for reuse Refactor the code so that the inode/task/sk storage implementations can maximally share the same code. I also added some comments in the new function bpf_local_storage_unlink_nolock() to make the code easy to understand. There is no functionality change.
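To show the reuse this enables, a sketch of how one storage family's map ops can now delegate to the shared code; it mirrors the refactored prototypes in the header hunk below, with the sk flavour used as the example:

	DEFINE_BPF_STORAGE_CACHE(sk_cache);

	static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
	{
		/* the shared code picks a cache slot and sizes the map */
		return bpf_local_storage_map_alloc(attr, &sk_cache);
	}

	static void bpf_sk_storage_map_free(struct bpf_map *map)
	{
		bpf_local_storage_map_free(map, &sk_cache, NULL);
	}
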
Acked-by: David Vernet Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221026042845.672944-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 7ea18d4da84b..6d37a40cd90e 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -116,21 +116,22 @@ static struct bpf_local_storage_cache name = { \ .idx_lock = __SPIN_LOCK_UNLOCKED(name.idx_lock), \ } -u16 bpf_local_storage_cache_idx_get(struct bpf_local_storage_cache *cache); -void bpf_local_storage_cache_idx_free(struct bpf_local_storage_cache *cache, - u16 idx); - /* Helper functions for bpf_local_storage */ int bpf_local_storage_map_alloc_check(union bpf_attr *attr); -struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr); +struct bpf_map * +bpf_local_storage_map_alloc(union bpf_attr *attr, + struct bpf_local_storage_cache *cache); struct bpf_local_storage_data * bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool cacheit_lockit); +bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage); + +void bpf_local_storage_map_free(struct bpf_map *map, + struct bpf_local_storage_cache *cache, int __percpu *busy_counter); int bpf_local_storage_map_check_btf(const struct bpf_map *map, @@ -141,10 +142,6 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, struct bpf_local_storage_elem *selem); -bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage, - struct bpf_local_storage_elem *selem, - bool uncharge_omem, bool use_trace_rcu); - void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu); void bpf_selem_link_map(struct bpf_local_storage_map *smap, -- cgit From c4bcfb38a95edb1021a53f2d0356a78120ecfbe4 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 25 Oct 2022 21:28:50 -0700 Subject: bpf: Implement cgroup storage available to non-cgroup-attached bpf progs Similar to sk/inode/task storage, implement cgroup local storage. There already exists a local storage implementation for cgroup-attached bpf programs. See map type BPF_MAP_TYPE_CGROUP_STORAGE and helper bpf_get_local_storage(). But there are use cases where non-cgroup-attached bpf progs want to access cgroup local storage data. For example, tc egress prog has access to sk and cgroup. It is possible to use sk local storage to emulate cgroup local storage by storing data in the socket. But this is a waste as there could be lots of sockets belonging to a particular cgroup. Alternatively, a separate map can be created with cgroup id as the key. But this will introduce additional overhead to manipulate the new map. A cgroup local storage, similar to existing sk/inode/task storage, should help for this use case. The life-cycle of storage is managed with the life-cycle of the cgroup struct, i.e. the storage is destroyed along with the owning cgroup with a call to bpf_cgrp_storage_free() when the cgroup itself is deleted. The userspace map operations can be done by using a cgroup fd as a key passed to the lookup, update and delete operations.
Typically, the following code is used to get the current cgroup: struct task_struct *task = bpf_get_current_task_btf(); ... task->cgroups->dfl_cgrp ... and in structure task_struct definition: struct task_struct { .... struct css_set __rcu *cgroups; .... } With sleepable program, accessing task->cgroups is not protected by rcu_read_lock. So the current implementation only supports non-sleepable program and supporting sleepable program will be the next step together with adding rcu_read_lock protection for rcu tagged structures. Since map name BPF_MAP_TYPE_CGROUP_STORAGE has been used for old cgroup local storage support, the new map name BPF_MAP_TYPE_CGRP_STORAGE is used for cgroup storage available to non-cgroup-attached bpf programs. The old cgroup storage supports bpf_get_local_storage() helper to get the cgroup data. The new cgroup storage helper bpf_cgrp_storage_get() can provide similar functionality. While old cgroup storage pre-allocates storage memory, the new mechanism can also pre-allocate with a user space bpf_map_update_elem() call to avoid potential run-time memory allocation failure. Therefore, the new cgroup storage can provide all functionality w.r.t. the old one. So in uapi bpf.h, the old BPF_MAP_TYPE_CGROUP_STORAGE is alias to BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED to indicate the old cgroup storage can be deprecated since the new one can provide the same functionality. Acked-by: David Vernet Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221026042850.673791-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 +++++++ include/linux/bpf_types.h | 1 + include/linux/cgroup-defs.h | 4 ++++ 3 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a5dbac8f5aba..9fd68b0b3e9c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2041,6 +2041,7 @@ struct bpf_link *bpf_link_by_id(u32 id); const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); void bpf_task_storage_free(struct task_struct *task); +void bpf_cgrp_storage_free(struct cgroup *cgroup); bool bpf_prog_has_kfunc_call(const struct bpf_prog *prog); const struct btf_func_model * bpf_jit_find_kfunc_model(const struct bpf_prog *prog, @@ -2295,6 +2296,10 @@ static inline bool has_current_bpf_ctx(void) static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog) { } + +static inline void bpf_cgrp_storage_free(struct cgroup *cgroup) +{ +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, @@ -2535,6 +2540,8 @@ extern const struct bpf_func_proto bpf_copy_from_user_task_proto; extern const struct bpf_func_proto bpf_set_retval_proto; extern const struct bpf_func_proto bpf_get_retval_proto; extern const struct bpf_func_proto bpf_user_ringbuf_drain_proto; +extern const struct bpf_func_proto bpf_cgrp_storage_get_proto; +extern const struct bpf_func_proto bpf_cgrp_storage_delete_proto; const struct bpf_func_proto *tracing_prog_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2c6a4f2562a7..d4ee3ccd3753 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -86,6 +86,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_PROG_ARRAY, prog_array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops) #ifdef CONFIG_CGROUPS BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_CGRP_STORAGE, cgrp_storage_map_ops) #endif #ifdef 
CONFIG_CGROUP_BPF BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8f481d1b159a..c466fdc3a32a 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -504,6 +504,10 @@ struct cgroup { /* Used to store internal freezer state */ struct cgroup_freezer_state freezer; +#ifdef CONFIG_BPF_SYSCALL + struct bpf_local_storage __rcu *bpf_cgrp_storage; +#endif + /* All ancestors including self */ struct cgroup *ancestors[]; }; -- cgit From e753df8fbca592d36f539ed950fcdf412166c549 Mon Sep 17 00:00:00 2001 From: Michal Jaron Date: Tue, 25 Oct 2022 09:12:52 -0700 Subject: ice: Add support Flex RXD Add new VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC flag, opcode VIRTCHNL_OP_GET_SUPPORTED_RXDIDS and add member rxdid in struct virtchnl_rxq_info to support AVF Flex RXD extension. Add support to allow VF to query flexible descriptor RXDIDs supported by DDP package and configure Rx queues with selected RXDID for IAVF. Add code to allow VIRTCHNL_OP_GET_SUPPORTED_RXDIDS message to be processed. Add necessary macros for registers. Signed-off-by: Leyi Rong Signed-off-by: Xu Ting Signed-off-by: Michal Jaron Signed-off-by: Mateusz Palczewski Tested-by: Maxime Coquelin Tested-by: Konrad Jankowski Signed-off-by: Jacob Keller Link: https://lore.kernel.org/r/20221025161252.1952939-1-jacob.e.keller@intel.com Signed-off-by: Paolo Abeni --- include/linux/avf/virtchnl.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/avf/virtchnl.h b/include/linux/avf/virtchnl.h index 2ce27e8e4f19..d91af50ac58d 100644 --- a/include/linux/avf/virtchnl.h +++ b/include/linux/avf/virtchnl.h @@ -136,7 +136,8 @@ enum virtchnl_ops { VIRTCHNL_OP_DISABLE_CHANNELS = 31, VIRTCHNL_OP_ADD_CLOUD_FILTER = 32, VIRTCHNL_OP_DEL_CLOUD_FILTER = 33, - /* opcode 34 - 44 are reserved */ + /* opcode 34 - 43 are reserved */ + VIRTCHNL_OP_GET_SUPPORTED_RXDIDS = 44, VIRTCHNL_OP_ADD_RSS_CFG = 45, VIRTCHNL_OP_DEL_RSS_CFG = 46, VIRTCHNL_OP_ADD_FDIR_FILTER = 47, @@ -263,6 +264,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource); #define VIRTCHNL_VF_OFFLOAD_RX_ENCAP_CSUM BIT(22) #define VIRTCHNL_VF_OFFLOAD_ADQ BIT(23) #define VIRTCHNL_VF_OFFLOAD_USO BIT(25) +#define VIRTCHNL_VF_OFFLOAD_RX_FLEX_DESC BIT(26) #define VIRTCHNL_VF_OFFLOAD_ADV_RSS_PF BIT(27) #define VIRTCHNL_VF_OFFLOAD_FDIR_PF BIT(28) @@ -318,7 +320,9 @@ struct virtchnl_rxq_info { u16 splithdr_enabled; /* deprecated with AVF 1.0 */ u32 databuffer_size; u32 max_pkt_size; - u32 pad1; + u8 pad0; + u8 rxdid; + u8 pad1[2]; u64 dma_ring_addr; enum virtchnl_rx_hsplit rx_split_pos; /* deprecated with AVF 1.0 */ u32 pad2; @@ -970,6 +974,10 @@ struct virtchnl_filter { VIRTCHNL_CHECK_STRUCT_LEN(272, virtchnl_filter); +struct virtchnl_supported_rxdids { + u64 supported_rxdids; +}; + /* VIRTCHNL_OP_EVENT * PF sends this message to inform the VF driver of events that may affect it. 
* No direct response is expected from the VF, though it may generate other @@ -1499,6 +1507,8 @@ virtchnl_vc_validate_vf_msg(struct virtchnl_version_info *ver, u32 v_opcode, case VIRTCHNL_OP_DEL_CLOUD_FILTER: valid_len = sizeof(struct virtchnl_filter); break; + case VIRTCHNL_OP_GET_SUPPORTED_RXDIDS: + break; case VIRTCHNL_OP_ADD_RSS_CFG: case VIRTCHNL_OP_DEL_RSS_CFG: valid_len = sizeof(struct virtchnl_rss_cfg); -- cgit From 29c1c44646aec5d5134f2365259a84becc1ee7d3 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:14 +0000 Subject: tcp: add u32 counter in tcp_sock and an SNMP counter for PLB A u32 counter is added to tcp_sock for counting the number of PLB triggered rehashes for a TCP connection. An SNMP counter is also added to count overall PLB triggered rehash events for a host. These counters are hooked up to PLB implementation for DCTCP. TCP_NLA_REHASH is added to SCM_TIMESTAMPING_OPT_STATS that reports the rehash attempts triggered due to PLB or timeouts. This gives a historical view of sustained congestion or timeouts experienced by the TCP connection. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 41b1da621a45..ca7f05a130d2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -423,6 +423,7 @@ struct tcp_sock { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; + u32 plb_rehash; /* PLB-triggered rehash attempts */ u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG * while socket was owned by user. */ -- cgit From 9c5a170677c3c8facc83e931a57f4c99c0511ae0 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 27 Oct 2022 14:10:37 +0100 Subject: net: phylink: add phylink_get_link_timer_ns() helper Add a helper to convert the PHY interface mode to the required link timer setting as stated by the appropriate standard. Inappropriate interface modes return an error. Signed-off-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 63800bf4a7ac..1df3e5e316e8 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -616,6 +616,30 @@ int phylink_speed_up(struct phylink *pl); void phylink_set_port_modes(unsigned long *bits); +/** + * phylink_get_link_timer_ns - return the PCS link timer value + * @interface: link &typedef phy_interface_t mode + * + * Return the PCS link timer setting in nanoseconds for the PHY @interface + * mode, or -EINVAL if not appropriate. 
+ */ +static inline int phylink_get_link_timer_ns(phy_interface_t interface) +{ + switch (interface) { + case PHY_INTERFACE_MODE_SGMII: + case PHY_INTERFACE_MODE_QSGMII: + case PHY_INTERFACE_MODE_USXGMII: + return 1600000; + + case PHY_INTERFACE_MODE_1000BASEX: + case PHY_INTERFACE_MODE_2500BASEX: + return 10000000; + + default: + return -EINVAL; + } +} + void phylink_mii_c22_pcs_decode_state(struct phylink_link_state *state, u16 bmsr, u16 lpa); void phylink_mii_c22_pcs_get_state(struct mdio_device *pcs, -- cgit From 17dd361119e5bc4cfb85aed660674a846b613817 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 27 Oct 2022 14:21:16 +0100 Subject: net: sfp: convert register indexes from hex to decimal The register indexes in the standards are in decimal rather than hex, so lets specify them in decimal in the header file so we can easily cross-reference without converting between hex and decimal. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/sfp.h | 180 ++++++++++++++++++++++++++-------------------------- 1 file changed, 90 insertions(+), 90 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 01ae9f1dd2ad..4e2db155664d 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -332,37 +332,37 @@ enum { /* SFP EEPROM registers */ enum { - SFP_PHYS_ID = 0x00, - SFP_PHYS_EXT_ID = 0x01, - SFP_CONNECTOR = 0x02, - SFP_COMPLIANCE = 0x03, - SFP_ENCODING = 0x0b, - SFP_BR_NOMINAL = 0x0c, - SFP_RATE_ID = 0x0d, - SFP_LINK_LEN_SM_KM = 0x0e, - SFP_LINK_LEN_SM_100M = 0x0f, - SFP_LINK_LEN_50UM_OM2_10M = 0x10, - SFP_LINK_LEN_62_5UM_OM1_10M = 0x11, - SFP_LINK_LEN_COPPER_1M = 0x12, - SFP_LINK_LEN_50UM_OM4_10M = 0x12, - SFP_LINK_LEN_50UM_OM3_10M = 0x13, - SFP_VENDOR_NAME = 0x14, - SFP_VENDOR_OUI = 0x25, - SFP_VENDOR_PN = 0x28, - SFP_VENDOR_REV = 0x38, - SFP_OPTICAL_WAVELENGTH_MSB = 0x3c, - SFP_OPTICAL_WAVELENGTH_LSB = 0x3d, - SFP_CABLE_SPEC = 0x3c, - SFP_CC_BASE = 0x3f, - SFP_OPTIONS = 0x40, /* 2 bytes, MSB, LSB */ - SFP_BR_MAX = 0x42, - SFP_BR_MIN = 0x43, - SFP_VENDOR_SN = 0x44, - SFP_DATECODE = 0x54, - SFP_DIAGMON = 0x5c, - SFP_ENHOPTS = 0x5d, - SFP_SFF8472_COMPLIANCE = 0x5e, - SFP_CC_EXT = 0x5f, + SFP_PHYS_ID = 0, + SFP_PHYS_EXT_ID = 1, + SFP_CONNECTOR = 2, + SFP_COMPLIANCE = 3, + SFP_ENCODING = 11, + SFP_BR_NOMINAL = 12, + SFP_RATE_ID = 13, + SFP_LINK_LEN_SM_KM = 14, + SFP_LINK_LEN_SM_100M = 15, + SFP_LINK_LEN_50UM_OM2_10M = 16, + SFP_LINK_LEN_62_5UM_OM1_10M = 17, + SFP_LINK_LEN_COPPER_1M = 18, + SFP_LINK_LEN_50UM_OM4_10M = 18, + SFP_LINK_LEN_50UM_OM3_10M = 19, + SFP_VENDOR_NAME = 20, + SFP_VENDOR_OUI = 37, + SFP_VENDOR_PN = 40, + SFP_VENDOR_REV = 56, + SFP_OPTICAL_WAVELENGTH_MSB = 60, + SFP_OPTICAL_WAVELENGTH_LSB = 61, + SFP_CABLE_SPEC = 60, + SFP_CC_BASE = 63, + SFP_OPTIONS = 64, /* 2 bytes, MSB, LSB */ + SFP_BR_MAX = 66, + SFP_BR_MIN = 67, + SFP_VENDOR_SN = 68, + SFP_DATECODE = 84, + SFP_DIAGMON = 92, + SFP_ENHOPTS = 93, + SFP_SFF8472_COMPLIANCE = 94, + SFP_CC_EXT = 95, SFP_PHYS_EXT_ID_SFP = 0x04, SFP_OPTIONS_HIGH_POWER_LEVEL = BIT(13), @@ -404,63 +404,63 @@ enum { /* SFP Diagnostics */ enum { /* Alarm and warnings stored MSB at lower address then LSB */ - SFP_TEMP_HIGH_ALARM = 0x00, - SFP_TEMP_LOW_ALARM = 0x02, - SFP_TEMP_HIGH_WARN = 0x04, - SFP_TEMP_LOW_WARN = 0x06, - SFP_VOLT_HIGH_ALARM = 0x08, - SFP_VOLT_LOW_ALARM = 0x0a, - SFP_VOLT_HIGH_WARN = 0x0c, - SFP_VOLT_LOW_WARN = 0x0e, - SFP_BIAS_HIGH_ALARM = 0x10, - SFP_BIAS_LOW_ALARM = 0x12, - SFP_BIAS_HIGH_WARN = 0x14, - 
SFP_BIAS_LOW_WARN = 0x16, - SFP_TXPWR_HIGH_ALARM = 0x18, - SFP_TXPWR_LOW_ALARM = 0x1a, - SFP_TXPWR_HIGH_WARN = 0x1c, - SFP_TXPWR_LOW_WARN = 0x1e, - SFP_RXPWR_HIGH_ALARM = 0x20, - SFP_RXPWR_LOW_ALARM = 0x22, - SFP_RXPWR_HIGH_WARN = 0x24, - SFP_RXPWR_LOW_WARN = 0x26, - SFP_LASER_TEMP_HIGH_ALARM = 0x28, - SFP_LASER_TEMP_LOW_ALARM = 0x2a, - SFP_LASER_TEMP_HIGH_WARN = 0x2c, - SFP_LASER_TEMP_LOW_WARN = 0x2e, - SFP_TEC_CUR_HIGH_ALARM = 0x30, - SFP_TEC_CUR_LOW_ALARM = 0x32, - SFP_TEC_CUR_HIGH_WARN = 0x34, - SFP_TEC_CUR_LOW_WARN = 0x36, - SFP_CAL_RXPWR4 = 0x38, - SFP_CAL_RXPWR3 = 0x3c, - SFP_CAL_RXPWR2 = 0x40, - SFP_CAL_RXPWR1 = 0x44, - SFP_CAL_RXPWR0 = 0x48, - SFP_CAL_TXI_SLOPE = 0x4c, - SFP_CAL_TXI_OFFSET = 0x4e, - SFP_CAL_TXPWR_SLOPE = 0x50, - SFP_CAL_TXPWR_OFFSET = 0x52, - SFP_CAL_T_SLOPE = 0x54, - SFP_CAL_T_OFFSET = 0x56, - SFP_CAL_V_SLOPE = 0x58, - SFP_CAL_V_OFFSET = 0x5a, - SFP_CHKSUM = 0x5f, - - SFP_TEMP = 0x60, - SFP_VCC = 0x62, - SFP_TX_BIAS = 0x64, - SFP_TX_POWER = 0x66, - SFP_RX_POWER = 0x68, - SFP_LASER_TEMP = 0x6a, - SFP_TEC_CUR = 0x6c, - - SFP_STATUS = 0x6e, + SFP_TEMP_HIGH_ALARM = 0, + SFP_TEMP_LOW_ALARM = 2, + SFP_TEMP_HIGH_WARN = 4, + SFP_TEMP_LOW_WARN = 6, + SFP_VOLT_HIGH_ALARM = 8, + SFP_VOLT_LOW_ALARM = 10, + SFP_VOLT_HIGH_WARN = 12, + SFP_VOLT_LOW_WARN = 14, + SFP_BIAS_HIGH_ALARM = 16, + SFP_BIAS_LOW_ALARM = 18, + SFP_BIAS_HIGH_WARN = 20, + SFP_BIAS_LOW_WARN = 22, + SFP_TXPWR_HIGH_ALARM = 24, + SFP_TXPWR_LOW_ALARM = 26, + SFP_TXPWR_HIGH_WARN = 28, + SFP_TXPWR_LOW_WARN = 30, + SFP_RXPWR_HIGH_ALARM = 32, + SFP_RXPWR_LOW_ALARM = 34, + SFP_RXPWR_HIGH_WARN = 36, + SFP_RXPWR_LOW_WARN = 38, + SFP_LASER_TEMP_HIGH_ALARM = 40, + SFP_LASER_TEMP_LOW_ALARM = 42, + SFP_LASER_TEMP_HIGH_WARN = 44, + SFP_LASER_TEMP_LOW_WARN = 46, + SFP_TEC_CUR_HIGH_ALARM = 48, + SFP_TEC_CUR_LOW_ALARM = 50, + SFP_TEC_CUR_HIGH_WARN = 52, + SFP_TEC_CUR_LOW_WARN = 54, + SFP_CAL_RXPWR4 = 56, + SFP_CAL_RXPWR3 = 60, + SFP_CAL_RXPWR2 = 64, + SFP_CAL_RXPWR1 = 68, + SFP_CAL_RXPWR0 = 72, + SFP_CAL_TXI_SLOPE = 76, + SFP_CAL_TXI_OFFSET = 78, + SFP_CAL_TXPWR_SLOPE = 80, + SFP_CAL_TXPWR_OFFSET = 82, + SFP_CAL_T_SLOPE = 84, + SFP_CAL_T_OFFSET = 86, + SFP_CAL_V_SLOPE = 88, + SFP_CAL_V_OFFSET = 90, + SFP_CHKSUM = 95, + + SFP_TEMP = 96, + SFP_VCC = 98, + SFP_TX_BIAS = 100, + SFP_TX_POWER = 102, + SFP_RX_POWER = 104, + SFP_LASER_TEMP = 106, + SFP_TEC_CUR = 108, + + SFP_STATUS = 110, SFP_STATUS_TX_DISABLE = BIT(7), SFP_STATUS_TX_DISABLE_FORCE = BIT(6), SFP_STATUS_TX_FAULT = BIT(2), SFP_STATUS_RX_LOS = BIT(1), - SFP_ALARM0 = 0x70, + SFP_ALARM0 = 112, SFP_ALARM0_TEMP_HIGH = BIT(7), SFP_ALARM0_TEMP_LOW = BIT(6), SFP_ALARM0_VCC_HIGH = BIT(5), @@ -470,11 +470,11 @@ enum { SFP_ALARM0_TXPWR_HIGH = BIT(1), SFP_ALARM0_TXPWR_LOW = BIT(0), - SFP_ALARM1 = 0x71, + SFP_ALARM1 = 113, SFP_ALARM1_RXPWR_HIGH = BIT(7), SFP_ALARM1_RXPWR_LOW = BIT(6), - SFP_WARN0 = 0x74, + SFP_WARN0 = 116, SFP_WARN0_TEMP_HIGH = BIT(7), SFP_WARN0_TEMP_LOW = BIT(6), SFP_WARN0_VCC_HIGH = BIT(5), @@ -484,15 +484,15 @@ enum { SFP_WARN0_TXPWR_HIGH = BIT(1), SFP_WARN0_TXPWR_LOW = BIT(0), - SFP_WARN1 = 0x75, + SFP_WARN1 = 117, SFP_WARN1_RXPWR_HIGH = BIT(7), SFP_WARN1_RXPWR_LOW = BIT(6), - SFP_EXT_STATUS = 0x76, + SFP_EXT_STATUS = 118, SFP_EXT_STATUS_PWRLVL_SELECT = BIT(0), - SFP_VSL = 0x78, - SFP_PAGE = 0x7f, + SFP_VSL = 120, + SFP_PAGE = 127, }; struct fwnode_handle; -- cgit From d83845d224a059165902ecd0e1203015c5713a78 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Thu, 27 Oct 2022 14:21:21 +0100 Subject: net: sfp: move field definitions along side register 
index Just as we do for the A2h enum, arrange the A0h enum to have the field definitions next to their corresponding register index. Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/sfp.h | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sfp.h b/include/linux/sfp.h index 4e2db155664d..52b98f9666a2 100644 --- a/include/linux/sfp.h +++ b/include/linux/sfp.h @@ -333,7 +333,10 @@ enum { /* SFP EEPROM registers */ enum { SFP_PHYS_ID = 0, + SFP_PHYS_EXT_ID = 1, + SFP_PHYS_EXT_ID_SFP = 0x04, + SFP_CONNECTOR = 2, SFP_COMPLIANCE = 3, SFP_ENCODING = 11, @@ -354,17 +357,8 @@ enum { SFP_OPTICAL_WAVELENGTH_LSB = 61, SFP_CABLE_SPEC = 60, SFP_CC_BASE = 63, - SFP_OPTIONS = 64, /* 2 bytes, MSB, LSB */ - SFP_BR_MAX = 66, - SFP_BR_MIN = 67, - SFP_VENDOR_SN = 68, - SFP_DATECODE = 84, - SFP_DIAGMON = 92, - SFP_ENHOPTS = 93, - SFP_SFF8472_COMPLIANCE = 94, - SFP_CC_EXT = 95, - SFP_PHYS_EXT_ID_SFP = 0x04, + SFP_OPTIONS = 64, /* 2 bytes, MSB, LSB */ SFP_OPTIONS_HIGH_POWER_LEVEL = BIT(13), SFP_OPTIONS_PAGING_A2 = BIT(12), SFP_OPTIONS_RETIMER = BIT(11), @@ -378,11 +372,20 @@ enum { SFP_OPTIONS_TX_FAULT = BIT(3), SFP_OPTIONS_LOS_INVERTED = BIT(2), SFP_OPTIONS_LOS_NORMAL = BIT(1), + + SFP_BR_MAX = 66, + SFP_BR_MIN = 67, + SFP_VENDOR_SN = 68, + SFP_DATECODE = 84, + + SFP_DIAGMON = 92, SFP_DIAGMON_DDM = BIT(6), SFP_DIAGMON_INT_CAL = BIT(5), SFP_DIAGMON_EXT_CAL = BIT(4), SFP_DIAGMON_RXPWR_AVG = BIT(3), SFP_DIAGMON_ADDRMODE = BIT(2), + + SFP_ENHOPTS = 93, SFP_ENHOPTS_ALARMWARN = BIT(7), SFP_ENHOPTS_SOFT_TX_DISABLE = BIT(6), SFP_ENHOPTS_SOFT_TX_FAULT = BIT(5), @@ -390,6 +393,8 @@ enum { SFP_ENHOPTS_SOFT_RATE_SELECT = BIT(3), SFP_ENHOPTS_APP_SELECT_SFF8079 = BIT(2), SFP_ENHOPTS_SOFT_RATE_SFF8431 = BIT(1), + + SFP_SFF8472_COMPLIANCE = 94, SFP_SFF8472_COMPLIANCE_NONE = 0x00, SFP_SFF8472_COMPLIANCE_REV9_3 = 0x01, SFP_SFF8472_COMPLIANCE_REV9_5 = 0x02, @@ -399,6 +404,8 @@ enum { SFP_SFF8472_COMPLIANCE_REV11_3 = 0x06, SFP_SFF8472_COMPLIANCE_REV11_4 = 0x07, SFP_SFF8472_COMPLIANCE_REV12_0 = 0x08, + + SFP_CC_EXT = 95, }; /* SFP Diagnostics */ -- cgit From f3fb589aeb88c5bf7a7a804d36a8fa219b72e290 Mon Sep 17 00:00:00 2001 From: Juhee Kang Date: Fri, 28 Oct 2022 01:04:24 +0900 Subject: net: remove unused netdev_unregistering() Currently, dev->reg_state == NETREG_UNREGISTERING is used directly to check for the NETREG_UNREGISTERING state, rather than the netdev_unregistering() helper. The netdev_unregistering() helper in netdevice.h is therefore no longer used, so remove it from netdevice.h. Signed-off-by: Juhee Kang Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index eddf8ee270e7..99e58b773266 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -5101,11 +5101,6 @@ static inline const char *netdev_name(const struct net_device *dev) return dev->name; } -static inline bool netdev_unregistering(const struct net_device *dev) -{ - return dev->reg_state == NETREG_UNREGISTERING; -} - static inline const char *netdev_reg_state(const struct net_device *dev) { switch (dev->reg_state) { -- cgit From b9a61b97798ca6d0c856a217e61e2177bd8f6eae Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 28 Oct 2022 04:04:12 -0700 Subject: ptp: add missing documentation for parameters The ptp_find_pin_unlocked function and the ptp_system_timestamp structure didn't document their parameters and fields. Fix this. Signed-off-by: Jacob Keller Acked-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 92b44161408e..ad4aaadc2f7a 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -45,6 +45,8 @@ struct system_device_crosststamp; /** * struct ptp_system_timestamp - system time corresponding to a PHC timestamp + * @pre_ts: system timestamp before capturing PHC + * @post_ts: system timestamp after capturing PHC */ struct ptp_system_timestamp { struct timespec64 pre_ts; @@ -316,6 +318,11 @@ int ptp_find_pin(struct ptp_clock *ptp, * should most likely call ptp_find_pin() directly from their * ptp_clock_info::enable() method. * +* @ptp: The clock obtained from ptp_clock_register(). +* @func: One of the ptp_pin_function enumerated values. +* @chan: The particular functional channel to find. +* Return: Pin index in the range of zero to ptp_clock_caps.n_pins - 1, +* or -1 if the auxiliary function cannot be found. */ int ptp_find_pin_unlocked(struct ptp_clock *ptp, -- cgit From 1060707e380994e7c9b754b6eb74f25459b4a5b3 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Fri, 28 Oct 2022 04:04:13 -0700 Subject: ptp: introduce helpers to adjust by scaled parts per million Many drivers implement the .adjfreq or .adjfine PTP op function with the same basic logic: 1. Determine a base frequency value 2. Multiply this by the abs() of the requested adjustment, then divide by the appropriate divisor (1 billion, or 65,536 billion). 3. Add or subtract this difference from the base frequency to calculate a new adjustment. A few drivers need the difference and direction rather than the combined new increment value. I recently converted the Intel drivers to .adjfine and the scaled parts per million (65.536 parts per billion) logic. To avoid overflow with minimal loss of precision, mul_u64_u64_div_u64 was used. The basic logic used by all of these drivers is very similar, and leads to a lot of duplicate code to perform the same task. Rather than keep this duplicate code, introduce diff_by_scaled_ppm and adjust_by_scaled_ppm. These helper functions calculate the difference or adjustment necessary based on the scaled parts per million input. The diff_by_scaled_ppm function returns true if the difference should be subtracted, and false otherwise. Update the Intel drivers to use the new helper functions. 
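To make the arithmetic concrete, here is a minimal user-space sketch of the same calculation (illustrative only, not part of the patch; the base increment value is made up and plain 64-bit math stands in for mul_u64_u64_div_u64()):

#include <stdint.h>
#include <stdio.h>

/* Same math as adjust_by_scaled_ppm(): scale the base increment by
 * |scaled_ppm| / (10^6 * 2^16), then add or subtract the delta. */
static uint64_t sketch_adjust_by_scaled_ppm(uint64_t base, long scaled_ppm)
{
	int negative = scaled_ppm < 0;
	uint64_t diff;

	if (negative)
		scaled_ppm = -scaled_ppm;
	/* the kernel uses mul_u64_u64_div_u64() here to survive the
	 * 128-bit intermediate product; u64 math suffices for a demo */
	diff = base * (uint64_t)scaled_ppm / (1000000ULL << 16);
	return negative ? base - diff : base + diff;
}

int main(void)
{
	uint64_t base = 1ULL << 31;	/* hypothetical base increment */

	/* +1 ppm == +65536 scaled ppm -> base grows by ~2147 */
	printf("%llu\n",
	       (unsigned long long)sketch_adjust_by_scaled_ppm(base, 65536));
	return 0;
}

Here 65536 scaled ppm equals 1 ppm, so the 2^31 base increment grows by roughly 2147 units, matching 1e-6 * 2^31.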
Other vendor drivers will be converted to .adjfine and this helper function in the following changes. Signed-off-by: Jacob Keller Acked-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index ad4aaadc2f7a..f4781c5766d6 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -248,6 +248,52 @@ static inline long scaled_ppm_to_ppb(long ppm) return (long)ppb; } +/** + * diff_by_scaled_ppm - Calculate difference using scaled ppm + * @base: the base increment value to adjust + * @scaled_ppm: scaled parts per million to adjust by + * @diff: on return, the absolute value of calculated diff + * + * Calculate the difference to adjust the base increment using scaled parts + * per million. + * + * Use mul_u64_u64_div_u64 to perform the difference calculation to avoid + * possible overflow. + * + * Returns: true if scaled_ppm is negative, false otherwise + */ +static inline bool diff_by_scaled_ppm(u64 base, long scaled_ppm, u64 *diff) +{ + bool negative = false; + + if (scaled_ppm < 0) { + negative = true; + scaled_ppm = -scaled_ppm; + } + + *diff = mul_u64_u64_div_u64(base, (u64)scaled_ppm, 1000000ULL << 16); + + return negative; +} + +/** + * adjust_by_scaled_ppm - Adjust a base increment by scaled parts per million + * @base: the base increment value to adjust + * @scaled_ppm: scaled parts per million frequency adjustment + * + * Helper function which calculates a new increment value based on the + * requested scaled parts per million adjustment. + */ +static inline u64 adjust_by_scaled_ppm(u64 base, long scaled_ppm) +{ + u64 diff; + + if (diff_by_scaled_ppm(base, scaled_ppm, &diff)) + return base - diff; + + return base + diff; +} + #if IS_ENABLED(CONFIG_PTP_1588_CLOCK) /** -- cgit From 1d997f1013079c05b642c739901e3584a3ae558d Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 28 Oct 2022 04:42:21 -0400 Subject: rtnetlink: pass netlink message header and portid to rtnl_configure_link() This patch passes the netlink message header and portid to rtnl_configure_link(). All the functions in this call chain need to take the additional parameters so we can use them in the last call rtnl_notify(), and notify userspace about the new link info if the NLM_F_ECHO flag is set. - rtnl_configure_link() - __dev_notify_flags() - rtmsg_ifinfo() - rtmsg_ifinfo_event() - rtmsg_ifinfo_build_skb() - rtmsg_ifinfo_send() - rtnl_notify() Also move __dev_notify_flags() declaration to net/core/dev.h, as Jakub suggested.
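To illustrate the user-visible effect (an editorial sketch, not part of the patch): a process that sets NLM_F_ECHO on its request should now receive the resulting RTM_NEWLINK notification on the requesting socket, without subscribing to the RTNLGRP_LINK group. The ifindex below is hypothetical and error handling is omitted:

#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <net/if.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct ifinfomsg ifm;
	} req = {
		.nlh = {
			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
			.nlmsg_type = RTM_SETLINK,
			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_ECHO,
			.nlmsg_seq = 1,
		},
		.ifm = {
			.ifi_family = AF_UNSPEC,
			.ifi_index = 2,		/* hypothetical ifindex */
			.ifi_flags = IFF_UP,
			.ifi_change = IFF_UP,	/* only touch the UP bit */
		},
	};
	char buf[8192];
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	sendto(fd, &req, req.nlh.nlmsg_len, 0,
	       (struct sockaddr *)&kernel, sizeof(kernel));
	/* with NLM_F_ECHO the kernel should echo the RTM_NEWLINK
	 * notification for the changed link back to this socket,
	 * in addition to the usual ACK */
	recv(fd, buf, sizeof(buf), 0);
	close(fd);
	return 0;
}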
Signed-off-by: Hangbin Liu Reviewed-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 -- include/linux/rtnetlink.h | 9 +++++---- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 99e58b773266..4b5052db978f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3855,8 +3855,6 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); -void __dev_notify_flags(struct net_device *, unsigned int old_flags, - unsigned int gchanges); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int __dev_change_net_namespace(struct net_device *dev, struct net *net, diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index ae2c6a3cec5d..92ad75549e9c 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -12,21 +12,22 @@ extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, - u32 group, struct nlmsghdr *nlh, gfp_t flags); + u32 group, const struct nlmsghdr *nlh, gfp_t flags); extern void rtnl_set_sk_err(struct net *net, u32 group, int error); extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error); -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags, + u32 portid, const struct nlmsghdr *nlh); void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned change, u32 event, gfp_t flags, int *new_nsid, - int new_ifindex); + int new_ifindex, u32 portid, u32 seq); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, - gfp_t flags); + gfp_t flags, u32 portid, const struct nlmsghdr *nlh); /* RTNL is used as a global lock for all changes to network configuration */ -- cgit From d08b0f8f46e45a274fc8c9a5bc92cb9da70d9887 Mon Sep 17 00:00:00 2001 From: Shane Parslow Date: Sat, 29 Oct 2022 02:03:56 -0700 Subject: net: wwan: iosm: add rpc interface for xmm modems Add a new iosm wwan port that connects to the modem rpc interface. This interface provides a configuration channel, and in the case of the 7360, is the only way to configure the modem (as it does not support mbim). The new interface is compatible with existing software, such as open_xdatachannel.py from the xmm7360-pci project [1]. [1] https://github.com/xmm7360/xmm7360-pci Signed-off-by: Shane Parslow Reviewed-by: Loic Poulain Signed-off-by: David S. 
Miller --- include/linux/wwan.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/wwan.h b/include/linux/wwan.h index 5ce2acf444fb..24d76500b1cc 100644 --- a/include/linux/wwan.h +++ b/include/linux/wwan.h @@ -15,6 +15,7 @@ * @WWAN_PORT_QMI: Qcom modem/MSM interface for modem control * @WWAN_PORT_QCDM: Qcom Modem diagnostic interface * @WWAN_PORT_FIREHOSE: XML based command protocol + * @WWAN_PORT_XMMRPC: Control protocol for Intel XMM modems * * @WWAN_PORT_MAX: Highest supported port types * @WWAN_PORT_UNKNOWN: Special value to indicate an unknown port type @@ -26,6 +27,7 @@ enum wwan_port_type { WWAN_PORT_QMI, WWAN_PORT_QCDM, WWAN_PORT_FIREHOSE, + WWAN_PORT_XMMRPC, /* Add new port types above this line */ -- cgit From 23da464dd6b8935b66f4ee306ad8947fd32ccd75 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:51 +0530 Subject: bpf: Allow specifying volatile type modifier for kptrs This is useful in particular to mark the pointer as volatile, so that compiler treats each load and store to the field as a volatile access. The alternative is having to define and use READ_ONCE and WRITE_ONCE in the BPF program. Signed-off-by: Kumar Kartikeya Dwivedi Acked-by: David Vernet Link: https://lore.kernel.org/r/20221103191013.1236066-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index f9aababc5d78..86aad9b2ce02 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -288,6 +288,11 @@ static inline bool btf_type_is_typedef(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF; } +static inline bool btf_type_is_volatile(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_VOLATILE; +} + static inline bool btf_type_is_func(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC; -- cgit From a35ec8e38cdd1766f29924ca391a01de20163931 Mon Sep 17 00:00:00 2001 From: "Hans J. Schultz" Date: Tue, 1 Nov 2022 21:39:21 +0200 Subject: bridge: Add MAC Authentication Bypass (MAB) support Hosts that support 802.1X authentication are able to authenticate themselves by exchanging EAPOL frames with an authenticator (Ethernet bridge, in this case) and an authentication server. Access to the network is only granted by the authenticator to successfully authenticated hosts. The above is implemented in the bridge using the "locked" bridge port option. When enabled, link-local frames (e.g., EAPOL) can be locally received by the bridge, but all other frames are dropped unless the host is authenticated. That is, unless the user space control plane installed an FDB entry according to which the source address of the frame is located behind the locked ingress port. The entry can be dynamic, in which case learning needs to be enabled so that the entry will be refreshed by incoming traffic. There are deployments in which not all the devices connected to the authenticator (the bridge) support 802.1X. Such devices can include printers and cameras. One option to support such deployments is to unlock the bridge ports connecting these devices, but a slightly more secure option is to use MAB. When MAB is enabled, the MAC address of the connected device is used as the user name and password for the authentication. 
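As an aside, the authentication step described in the rest of this message, with user space replacing the locked entry, is what iproute2's "bridge fdb replace <mac> dev <port> master dynamic" does. A hedged raw-netlink sketch of the same operation, with hypothetical ifindex/MAC and no error handling:

#include <linux/if_ether.h>
#include <linux/neighbour.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct ndmsg ndm;
		char attrs[64];		/* room for the NDA_LLADDR attribute */
	} req;
	struct rtattr *rta;
	unsigned char mac[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.ndm));
	req.nlh.nlmsg_type = RTM_NEWNEIGH;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_REPLACE | NLM_F_ACK;
	req.ndm.ndm_family = AF_BRIDGE;
	req.ndm.ndm_ifindex = 5;		/* hypothetical bridge port */
	req.ndm.ndm_state = NUD_REACHABLE;	/* dynamic: ages like learned entries */
	req.ndm.ndm_flags = NTF_MASTER;		/* operate on the bridge FDB */

	/* append NDA_LLADDR carrying the now-authenticated MAC address */
	rta = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	rta->rta_type = NDA_LLADDR;
	rta->rta_len = RTA_LENGTH(ETH_ALEN);
	memcpy(RTA_DATA(rta), mac, ETH_ALEN);
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_ALIGN(rta->rta_len);

	/* replacing the entry clears the "locked" flag, authenticating the device */
	sendto(fd, &req, req.nlh.nlmsg_len, 0,
	       (struct sockaddr *)&kernel, sizeof(kernel));
	close(fd);
	return 0;
}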
For MAB to work, the user space control plane needs to be notified about MAC addresses that are trying to gain access so that they will be compared against an allow list. This can be implemented via the regular learning process with the sole difference that learned FDB entries are installed with a new "locked" flag indicating that the entry cannot be used to authenticate the device. The flag cannot be set by user space, but user space can clear the flag by replacing the entry, thereby authenticating the device. Locked FDB entries implement the following semantics with regard to roaming, aging and forwarding: 1. Roaming: Locked FDB entries can roam to unlocked (authorized) ports, in which case the "locked" flag is cleared. FDB entries cannot roam to locked ports regardless of whether MAB is enabled or not. Therefore, locked FDB entries are only created if an FDB entry with the given {MAC, VID} does not already exist. This behavior prevents unauthenticated devices from disrupting traffic destined to already authenticated devices. 2. Aging: Locked FDB entries age out and are refreshed by incoming traffic like regular entries. 3. Forwarding: Locked FDB entries forward traffic like regular entries. If user space detects an unauthorized MAC behind a locked port and wishes to prevent traffic with this MAC DA from reaching the host, it can do so using tc or a different mechanism. Enable the above behavior using a new bridge port option called "mab". It can only be enabled on a bridge port that is both locked and has learning enabled. Locked FDB entries are flushed from the port once MAB is disabled. A new option is added because there are pure 802.1X deployments that are not interested in notifications about locked FDB entries. Signed-off-by: Hans J. Schultz Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- include/linux/if_bridge.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index d62ef428e3aa..1668ac4d7adc 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -59,6 +59,7 @@ struct br_ip_list { #define BR_MRP_LOST_IN_CONT BIT(19) #define BR_TX_FWD_OFFLOAD BIT(20) #define BR_PORT_LOCKED BIT(21) +#define BR_PORT_MAB BIT(22) #define BR_DEFAULT_AGEING_TIME (300 * HZ) -- cgit From 02a68a47eadedf95748facfca6ced31fb0181d52 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:03 +0100 Subject: net: devlink: track netdev with devlink_port assigned Currently, ethernet drivers are using devlink_port_type_eth_set() and devlink_port_type_clear() to set devlink port type and link to related netdev. Instead of calling them directly, let the driver use the SET_NETDEV_DEVLINK_PORT macro to assign the devlink_port pointer and let devlink track it. Note that the devlink port pointer is static for as long as the netdevice is registered. In the devlink code, use a per-namespace netdev notifier to track the netdevices with devlink_port assigned and change the internal devlink_port type and related type pointer accordingly.
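An illustrative driver-side sketch of the new contract (the driver structure is hypothetical, not taken from any real driver): the assignment must happen before register_netdev(), while reg_state is still NETREG_UNINITIALIZED, which the macro's WARN_ON() enforces.

#include <linux/netdevice.h>
#include <net/devlink.h>

struct foo_priv {			/* hypothetical driver private data */
	struct devlink_port dl_port;
};

static int foo_register_netdev(struct net_device *netdev,
			       struct foo_priv *priv)
{
	/* assign the devlink_port while the netdev is still unregistered;
	 * devlink's netdev notifier then tracks type changes by itself */
	SET_NETDEV_DEVLINK_PORT(netdev, &priv->dl_port);

	return register_netdev(netdev);
}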
Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4b5052db978f..f048a30ea10b 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1999,6 +1999,11 @@ enum netdev_ml_priv_type { * registered * @offload_xstats_l3: L3 HW stats for this netdevice. * + * @devlink_port: Pointer to related devlink port structure. + * Assigned by a driver before netdev registration using + * SET_NETDEV_DEVLINK_PORT macro. This pointer is static + * during the time netdevice is registered. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -2349,9 +2354,22 @@ struct net_device { netdevice_tracker watchdog_dev_tracker; netdevice_tracker dev_registered_tracker; struct rtnl_hw_stats64 *offload_xstats_l3; + + struct devlink_port *devlink_port; }; #define to_net_dev(d) container_of(d, struct net_device, dev) +/* + * Driver should use this to assign devlink port instance to a netdevice + * before it registers the netdevice. Therefore devlink_port is static + * during the netdev lifetime after it is registered. + */ +#define SET_NETDEV_DEVLINK_PORT(dev, port) \ +({ \ + WARN_ON((dev)->reg_state != NETREG_UNINITIALIZED); \ + ((dev)->devlink_port = (port)); \ +}) + static inline bool netif_elide_gro(const struct net_device *dev) { if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog) @@ -2785,6 +2803,7 @@ enum netdev_cmd { NETDEV_PRE_TYPE_CHANGE, NETDEV_POST_TYPE_CHANGE, NETDEV_POST_INIT, + NETDEV_PRE_UNINIT, NETDEV_RELEASE, NETDEV_NOTIFY_PEERS, NETDEV_JOIN, -- cgit From 77df1db80da384c565106321f5934967690da7dd Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:10 +0100 Subject: net: remove unused ndo_get_devlink_port Remove ndo_get_devlink_port which is no longer used alongside with the implementations in drivers. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index f048a30ea10b..d45713a06568 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1366,10 +1366,6 @@ struct netdev_net_notifier { * queue id bound to an AF_XDP socket. The flags field specifies if * only RX, only Tx, or both should be woken up using the flags * XDP_WAKEUP_RX and XDP_WAKEUP_TX. - * struct devlink_port *(*ndo_get_devlink_port)(struct net_device *dev); - * Get devlink port instance associated with a given netdev. - * Called with a reference on the netdevice and devlink locks only, - * rtnl_lock is not held. * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, * int cmd); * Add, change, delete or get information on an IPv4 tunnel. 
@@ -1600,7 +1596,6 @@ struct net_device_ops { struct xdp_buff *xdp); int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags); - struct devlink_port * (*ndo_get_devlink_port)(struct net_device *dev); int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p, int cmd); struct net_device * (*ndo_get_peer_dev)(struct net_device *dev); -- cgit From aa3496accc412b3d975e4ee5d06076d73394d8b5 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:55 +0530 Subject: bpf: Refactor kptr_off_tab into btf_record To prepare the BPF verifier to handle special fields in both map values and program allocated types coming from program BTF, we need to refactor the kptr_off_tab handling code into something more generic and reusable across both cases to avoid code duplication. Later patches also require passing this data to helpers at runtime, so that they can work on user defined types, initialize them, destruct them, etc. The main observation is that both map values and such allocated types point to a type in program BTF, hence they can be handled similarly. We can prepare a field metadata table for both cases and store them in struct bpf_map or struct btf depending on the use case. Hence, refactor the code into generic btf_record and btf_field member structs. The btf_record represents the fields of a specific btf_type in user BTF. The cnt indicates the number of special fields we successfully recognized, and field_mask is a bitmask of fields that were found, to enable quick determination of availability of a certain field. Subsequently, refactor the rest of the code to work with these generic types, remove assumptions about kptr and kptr_off_tab, rename variables to more meaningful names, etc. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 125 ++++++++++++++++++++++++++++++++++------------------ include/linux/btf.h | 3 +- 2 files changed, 82 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8d948bfcb984..5f2a42033a37 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -165,35 +165,41 @@ struct bpf_map_ops { }; enum { - /* Support at most 8 pointers in a BPF map value */ - BPF_MAP_VALUE_OFF_MAX = 8, - BPF_MAP_OFF_ARR_MAX = BPF_MAP_VALUE_OFF_MAX + + /* Support at most 8 pointers in a BTF type */ + BTF_FIELDS_MAX = 8, + BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX + 1 + /* for bpf_spin_lock */ 1, /* for bpf_timer */ }; -enum bpf_kptr_type { - BPF_KPTR_UNREF, - BPF_KPTR_REF, +enum btf_field_type { + BPF_KPTR_UNREF = (1 << 2), + BPF_KPTR_REF = (1 << 3), + BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, }; -struct bpf_map_value_off_desc { +struct btf_field_kptr { + struct btf *btf; + struct module *module; + btf_dtor_kfunc_t dtor; + u32 btf_id; +}; + +struct btf_field { u32 offset; - enum bpf_kptr_type type; - struct { - struct btf *btf; - struct module *module; - btf_dtor_kfunc_t dtor; - u32 btf_id; - } kptr; + enum btf_field_type type; + union { + struct btf_field_kptr kptr; + }; }; -struct bpf_map_value_off { - u32 nr_off; - struct bpf_map_value_off_desc off[]; +struct btf_record { + u32 cnt; + u32 field_mask; + struct btf_field fields[]; }; -struct bpf_map_off_arr { +struct btf_field_offs { u32 cnt; u32 field_off[BPF_MAP_OFF_ARR_MAX]; u8 field_sz[BPF_MAP_OFF_ARR_MAX]; @@ -215,7 +221,7 @@ struct bpf_map { u64 map_extra; /* any per-map-type extra fields */ u32 
map_flags; int spin_lock_off; /* >=0 valid offset, <0 error */ - struct bpf_map_value_off *kptr_off_tab; + struct btf_record *record; int timer_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; @@ -227,7 +233,7 @@ struct bpf_map { struct obj_cgroup *objcg; #endif char name[BPF_OBJ_NAME_LEN]; - struct bpf_map_off_arr *off_arr; + struct btf_field_offs *field_offs; /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. */ @@ -251,6 +257,37 @@ struct bpf_map { bool frozen; /* write-once; write-protected by freeze_mutex */ }; +static inline u32 btf_field_type_size(enum btf_field_type type) +{ + switch (type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + return sizeof(u64); + default: + WARN_ON_ONCE(1); + return 0; + } +} + +static inline u32 btf_field_type_align(enum btf_field_type type) +{ + switch (type) { + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + return __alignof__(u64); + default: + WARN_ON_ONCE(1); + return 0; + } +} + +static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_field_type type) +{ + if (IS_ERR_OR_NULL(rec)) + return false; + return rec->field_mask & type; +} + static inline bool map_value_has_spin_lock(const struct bpf_map *map) { return map->spin_lock_off >= 0; @@ -261,23 +298,19 @@ static inline bool map_value_has_timer(const struct bpf_map *map) return map->timer_off >= 0; } -static inline bool map_value_has_kptrs(const struct bpf_map *map) -{ - return !IS_ERR_OR_NULL(map->kptr_off_tab); -} - static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { if (unlikely(map_value_has_spin_lock(map))) memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); if (unlikely(map_value_has_timer(map))) memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); - if (unlikely(map_value_has_kptrs(map))) { - struct bpf_map_value_off *tab = map->kptr_off_tab; + if (!IS_ERR_OR_NULL(map->record)) { + struct btf_field *fields = map->record->fields; + u32 cnt = map->record->cnt; int i; - for (i = 0; i < tab->nr_off; i++) - *(u64 *)(dst + tab->off[i].offset) = 0; + for (i = 0; i < cnt; i++) + memset(dst + fields[i].offset, 0, btf_field_type_size(fields[i].type)); } } @@ -303,7 +336,7 @@ static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, b u32 curr_off = 0; int i; - if (likely(!map->off_arr)) { + if (likely(!map->field_offs)) { if (long_memcpy) bpf_long_memcpy(dst, src, round_up(map->value_size, 8)); else @@ -311,11 +344,12 @@ static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, b return; } - for (i = 0; i < map->off_arr->cnt; i++) { - u32 next_off = map->off_arr->field_off[i]; + for (i = 0; i < map->field_offs->cnt; i++) { + u32 next_off = map->field_offs->field_off[i]; + u32 sz = next_off - curr_off; - memcpy(dst + curr_off, src + curr_off, next_off - curr_off); - curr_off += map->off_arr->field_sz[i]; + memcpy(dst + curr_off, src + curr_off, sz); + curr_off += map->field_offs->field_sz[i]; } memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); } @@ -335,16 +369,17 @@ static inline void zero_map_value(struct bpf_map *map, void *dst) u32 curr_off = 0; int i; - if (likely(!map->off_arr)) { + if (likely(!map->field_offs)) { memset(dst, 0, map->value_size); return; } - for (i = 0; i < map->off_arr->cnt; i++) { - u32 next_off = map->off_arr->field_off[i]; + for (i = 0; i < map->field_offs->cnt; i++) { + u32 next_off = map->field_offs->field_off[i]; + u32 sz = next_off - curr_off; - memset(dst + curr_off, 0, next_off 
- curr_off); - curr_off += map->off_arr->field_sz[i]; + memset(dst + curr_off, 0, sz); + curr_off += map->field_offs->field_sz[i]; } memset(dst + curr_off, 0, map->value_size - curr_off); } @@ -1699,11 +1734,13 @@ void bpf_prog_put(struct bpf_prog *prog); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); -struct bpf_map_value_off_desc *bpf_map_kptr_off_contains(struct bpf_map *map, u32 offset); -void bpf_map_free_kptr_off_tab(struct bpf_map *map); -struct bpf_map_value_off *bpf_map_copy_kptr_off_tab(const struct bpf_map *map); -bool bpf_map_equal_kptr_off_tab(const struct bpf_map *map_a, const struct bpf_map *map_b); -void bpf_map_free_kptrs(struct bpf_map *map, void *map_value); +struct btf_field *btf_record_find(const struct btf_record *rec, + u32 offset, enum btf_field_type type); +void btf_record_free(struct btf_record *rec); +void bpf_map_free_record(struct bpf_map *map); +struct btf_record *btf_record_dup(const struct btf_record *rec); +bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); +void bpf_obj_free_fields(const struct btf_record *rec, void *obj); struct bpf_map *bpf_map_get(u32 ufd); struct bpf_map *bpf_map_get_with_uref(u32 ufd); diff --git a/include/linux/btf.h b/include/linux/btf.h index 86aad9b2ce02..9e62717cdc7a 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -163,8 +163,7 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); -struct bpf_map_value_off *btf_parse_kptrs(const struct btf *btf, - const struct btf_type *t); +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, -- cgit From db559117828d2448fe81ada051c60bcf39f822e9 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:56 +0530 Subject: bpf: Consolidate spin_lock, timer management into btf_record Now that kptr_off_tab has been refactored into btf_record, and can hold more than one specific field type, accommodate bpf_spin_lock and bpf_timer as well. While they don't require any more metadata than offset, having all special fields in one place allows us to share the same code for allocated user defined types and handle both map values and these allocated objects in a similar fashion. As an optimization, we still keep spin_lock_off and timer_off offsets in the btf_record structure, just to avoid having to find the btf_field struct each time their offset is needed. This is mostly needed to manipulate such objects in a map value at runtime. It's ok to hardcode just one offset as more than one field is disallowed.
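To make the final shape concrete, a small user-space model of the idea (illustrative only; field values and the 8-entry bound are made up, and the real kernel structs carry more state): one btf_record-like table describes all special fields of a type, and the field_mask bitmask answers "does this type contain field X" in O(1), mirroring btf_record_has_field().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum field_type {			/* mirrors the BPF_* field flags */
	FLD_SPIN_LOCK = 1 << 0,
	FLD_TIMER     = 1 << 1,
	FLD_KPTR_REF  = 1 << 3,
};

struct field {
	uint32_t offset;		/* byte offset inside the value */
	enum field_type type;
};

struct record {
	uint32_t cnt;			/* number of recognized fields */
	uint32_t field_mask;		/* OR of all field types present */
	struct field fields[8];
};

static bool record_has_field(const struct record *rec, enum field_type type)
{
	return rec->field_mask & type;	/* O(1), no table walk */
}

int main(void)
{
	struct record rec = {
		.cnt = 2,
		.field_mask = FLD_SPIN_LOCK | FLD_TIMER,
		.fields = { { 0, FLD_SPIN_LOCK }, { 8, FLD_TIMER } },
	};

	printf("timer: %d, kptr: %d\n",
	       record_has_field(&rec, FLD_TIMER),
	       record_has_field(&rec, FLD_KPTR_REF));
	return 0;
}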
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 53 ++++++++++++++++++++++++++++++++--------------------- include/linux/btf.h | 3 ++- 2 files changed, 34 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5f2a42033a37..aae85019abde 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -166,13 +166,13 @@ struct bpf_map_ops { enum { /* Support at most 8 pointers in a BTF type */ - BTF_FIELDS_MAX = 8, - BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX + - 1 + /* for bpf_spin_lock */ - 1, /* for bpf_timer */ + BTF_FIELDS_MAX = 10, + BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX, }; enum btf_field_type { + BPF_SPIN_LOCK = (1 << 0), + BPF_TIMER = (1 << 1), BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_REF = (1 << 3), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, @@ -196,6 +196,8 @@ struct btf_field { struct btf_record { u32 cnt; u32 field_mask; + int spin_lock_off; + int timer_off; struct btf_field fields[]; }; @@ -220,10 +222,8 @@ struct bpf_map { u32 max_entries; u64 map_extra; /* any per-map-type extra fields */ u32 map_flags; - int spin_lock_off; /* >=0 valid offset, <0 error */ - struct btf_record *record; - int timer_off; /* >=0 valid offset, <0 error */ u32 id; + struct btf_record *record; int numa_node; u32 btf_key_type_id; u32 btf_value_type_id; @@ -257,9 +257,29 @@ struct bpf_map { bool frozen; /* write-once; write-protected by freeze_mutex */ }; +static inline const char *btf_field_type_name(enum btf_field_type type) +{ + switch (type) { + case BPF_SPIN_LOCK: + return "bpf_spin_lock"; + case BPF_TIMER: + return "bpf_timer"; + case BPF_KPTR_UNREF: + case BPF_KPTR_REF: + return "kptr"; + default: + WARN_ON_ONCE(1); + return "unknown"; + } +} + static inline u32 btf_field_type_size(enum btf_field_type type) { switch (type) { + case BPF_SPIN_LOCK: + return sizeof(struct bpf_spin_lock); + case BPF_TIMER: + return sizeof(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: return sizeof(u64); @@ -272,6 +292,10 @@ static inline u32 btf_field_type_size(enum btf_field_type type) static inline u32 btf_field_type_align(enum btf_field_type type) { switch (type) { + case BPF_SPIN_LOCK: + return __alignof__(struct bpf_spin_lock); + case BPF_TIMER: + return __alignof__(struct bpf_timer); case BPF_KPTR_UNREF: case BPF_KPTR_REF: return __alignof__(u64); @@ -288,22 +312,8 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f return rec->field_mask & type; } -static inline bool map_value_has_spin_lock(const struct bpf_map *map) -{ - return map->spin_lock_off >= 0; -} - -static inline bool map_value_has_timer(const struct bpf_map *map) -{ - return map->timer_off >= 0; -} - static inline void check_and_init_map_value(struct bpf_map *map, void *dst) { - if (unlikely(map_value_has_spin_lock(map))) - memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock)); - if (unlikely(map_value_has_timer(map))) - memset(dst + map->timer_off, 0, sizeof(struct bpf_timer)); if (!IS_ERR_OR_NULL(map->record)) { struct btf_field *fields = map->record->fields; u32 cnt = map->record->cnt; @@ -1740,6 +1750,7 @@ void btf_record_free(struct btf_record *rec); void bpf_map_free_record(struct bpf_map *map); struct btf_record *btf_record_dup(const struct btf_record *rec); bool btf_record_equal(const struct btf_record *rec_a, const struct btf_record *rec_b); +void bpf_obj_free_timer(const struct btf_record *rec, void *obj); void 
bpf_obj_free_fields(const struct btf_record *rec, void *obj); struct bpf_map *bpf_map_get(u32 ufd); diff --git a/include/linux/btf.h b/include/linux/btf.h index 9e62717cdc7a..282006abd062 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -163,7 +163,8 @@ bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, u32 expected_offset, u32 expected_size); int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); -struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t); +struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, + u32 field_mask, u32 value_size); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, -- cgit From f71b2f64177a199d5b1d2047e155d45fd98f564a Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 4 Nov 2022 00:39:57 +0530 Subject: bpf: Refactor map->off_arr handling Refactor map->off_arr handling into generic functions that can work on their own without hardcoding map specific code. The btf_field_offs structure is now returned from btf_parse_field_offs, which can be reused later for types in program BTF. All functions like copy_map_value, zero_map_value call generic underlying functions so that they can also be reused later for copying to values allocated in programs which encode specific fields. Later, some helper functions will also require access to this btf_field_offs structure to be able to skip over special fields at runtime. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221103191013.1236066-9-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 41 ++++++++++++++++++----------------------- include/linux/btf.h | 1 + 2 files changed, 25 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index aae85019abde..798aec816970 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -341,57 +341,64 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) } /* copy everything but bpf_spin_lock, bpf_timer, and kptrs. There could be one of each.
*/ -static inline void __copy_map_value(struct bpf_map *map, void *dst, void *src, bool long_memcpy) +static inline void bpf_obj_memcpy(struct btf_field_offs *foffs, + void *dst, void *src, u32 size, + bool long_memcpy) { u32 curr_off = 0; int i; - if (likely(!map->field_offs)) { + if (likely(!foffs)) { if (long_memcpy) - bpf_long_memcpy(dst, src, round_up(map->value_size, 8)); + bpf_long_memcpy(dst, src, round_up(size, 8)); else - memcpy(dst, src, map->value_size); + memcpy(dst, src, size); return; } - for (i = 0; i < map->field_offs->cnt; i++) { - u32 next_off = map->field_offs->field_off[i]; + for (i = 0; i < foffs->cnt; i++) { + u32 next_off = foffs->field_off[i]; u32 sz = next_off - curr_off; memcpy(dst + curr_off, src + curr_off, sz); - curr_off += map->field_offs->field_sz[i]; + curr_off += foffs->field_sz[i]; } - memcpy(dst + curr_off, src + curr_off, map->value_size - curr_off); + memcpy(dst + curr_off, src + curr_off, size - curr_off); } static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) { - __copy_map_value(map, dst, src, false); + bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, false); } static inline void copy_map_value_long(struct bpf_map *map, void *dst, void *src) { - __copy_map_value(map, dst, src, true); + bpf_obj_memcpy(map->field_offs, dst, src, map->value_size, true); } -static inline void zero_map_value(struct bpf_map *map, void *dst) +static inline void bpf_obj_memzero(struct btf_field_offs *foffs, void *dst, u32 size) { u32 curr_off = 0; int i; - if (likely(!map->field_offs)) { - memset(dst, 0, map->value_size); + if (likely(!foffs)) { + memset(dst, 0, size); return; } - for (i = 0; i < map->field_offs->cnt; i++) { - u32 next_off = map->field_offs->field_off[i]; + for (i = 0; i < foffs->cnt; i++) { + u32 next_off = foffs->field_off[i]; u32 sz = next_off - curr_off; memset(dst + curr_off, 0, sz); - curr_off += map->field_offs->field_sz[i]; + curr_off += foffs->field_sz[i]; } - memset(dst + curr_off, 0, map->value_size - curr_off); + memset(dst + curr_off, 0, size - curr_off); +} + +static inline void zero_map_value(struct bpf_map *map, void *dst) +{ + bpf_obj_memzero(map->field_offs, dst, map->value_size); } void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, diff --git a/include/linux/btf.h b/include/linux/btf.h index 282006abd062..d80345fa566b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -165,6 +165,7 @@ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size); +struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); const struct btf_type *btf_type_skip_modifiers(const struct btf *btf, -- cgit From 2b6c0e152868c9c5939a5c5094d5b2be61cf48e6 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 28 Oct 2022 11:23:32 +0200 Subject: bcma: Use the proper gpio include The <linux/bcma/bcma_driver_chipcommon.h> header includes the legacy header <linux/gpio.h> to obtain struct gpio_chip. Instead, include <linux/gpio/driver.h>, where this struct is defined. It turns out that the brcm80211 brcmsmac driver depends on this to bring in the symbol gpio_is_valid().
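A stand-alone illustration of the point completed in the next sentences (stand-in types, not the real bcma structures): a pointer formed by taking the address of an embedded struct member can never be NULL, so a validity check on it is dead code.

#include <stdio.h>

struct gpio_chip_stub { int base; };	/* stand-in for struct gpio_chip */
struct chipcommon_stub { struct gpio_chip_stub gpio; };

int main(void)
{
	struct chipcommon_stub cc;
	/* address of an embedded member: cannot be NULL */
	struct gpio_chip_stub *bcma_gpio = &cc.gpio;

	printf("%d\n", bcma_gpio != NULL);	/* always prints 1 */
	return 0;
}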
The driver looks up the BCMA parent GPIO driver and checks that this succeeds, but then it goes on to use the deprecated GPIO call gpio_is_valid() to check the consistency of the .base member of the BCMA GPIO struct. The whole check can be dropped because bcma_gpio is initialized in the declarations: struct gpio_chip *bcma_gpio = &cc_drv->gpio; And this can never be NULL. Cc: Jonas Gorski Acked-by: Arend van Spriel Signed-off-by: Linus Walleij Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20221028092332.238728-1-linus.walleij@linaro.org --- include/linux/bcma/bcma_driver_chipcommon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index 2d94c30ed439..0cb6638b55e5 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -4,7 +4,7 @@ #include <linux/platform_device.h> #include <linux/platform_data/brcmfmac.h> -#include <linux/gpio.h> +#include <linux/gpio/driver.h> /** ChipCommon core registers. **/ #define BCMA_CC_ID 0x0000 -- cgit From d08cb25556773c65efb21761b75f92c804ad0975 Mon Sep 17 00:00:00 2001 From: David Yang Date: Fri, 28 Oct 2022 10:01:01 +0800 Subject: net: mv643xx_eth: support MII/GMII/RGMII modes for Kirkwood Support mode switching properly, which was not available before. If the SoC has two Ethernet controllers, by setting both of them into MII mode, the first controller enters GMII mode, while the second controller is effectively disabled. This requires configuring (and maybe enabling) the second controller in the device tree, even though it cannot be used. Signed-off-by: David Yang Signed-off-by: David S. Miller --- include/linux/mv643xx_eth.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mv643xx_eth.h b/include/linux/mv643xx_eth.h index 3682ae75c7aa..145169be2ed8 100644 --- a/include/linux/mv643xx_eth.h +++ b/include/linux/mv643xx_eth.h @@ -8,6 +8,7 @@ #include <linux/mbus.h> #include <linux/if_ether.h> +#include <linux/phy.h> #define MV643XX_ETH_SHARED_NAME "mv643xx_eth" #define MV643XX_ETH_NAME "mv643xx_eth_port" @@ -59,6 +60,7 @@ struct mv643xx_eth_platform_data { */ int speed; int duplex; + phy_interface_t interface; /* * How many RX/TX queues to use. -- cgit From e1f4ecab19338ec7079830e8700e4869f991fd45 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 4 Nov 2022 17:13:01 +0000 Subject: net: remove explicit phylink_generic_validate() references Virtually all conventional network drivers are now converted to use phylink_generic_validate() - only DSA drivers and fman_memac remain, so let's remove the necessity for network drivers to explicitly set this member, and default to phylink_generic_validate() when unset. This is possible as .validate must currently be set. Any remaining instances that have not been addressed by this patch can be fixed up later. Signed-off-by: Russell King (Oracle) Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/E1or0FZ-001tRa-DI@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 1df3e5e316e8..c492c26202b5 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -207,6 +207,11 @@ struct phylink_mac_ops { * * If the @state->interface mode is not supported, then the @supported * mask must be cleared.
+ * + * This member is optional; if not set, the generic validator will be + * used making use of @config->mac_capabilities and + * @config->supported_interfaces to determine which link modes are + * supported. */ void validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state); -- cgit From 9a0f830f80265bd1ef816e1541ac24bee80e9a3c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:01:25 -0700 Subject: ethtool: linkstate: add a statistic for PHY down events The previous attempt to augment carrier_down (see Link) was not met with much enthusiasm so let's do the simple thing of exposing what some devices already maintain. Add a common ethtool statistic for link going down. Currently users have to maintain per-driver mapping to extract the right stat from the vendor-specific ethtool -S stats. carrier_down does not fit the bill because it counts a lot of software related false positives. Add the statistic to the extended link state API to steer vendors towards implementing all of it. Implement for bnxt and all Linux-controlled PHYs. mlx5 and (possibly) enic also have a counter for this but I leave the implementation to their maintainers. Link: https://lore.kernel.org/r/20220520004500.2250674-1-kuba@kernel.org Reviewed-by: Florian Fainelli Reviewed-by: Michael Chan Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski Link: https://lore.kernel.org/r/20221104190125.684910-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- include/linux/ethtool.h | 17 +++++++++++++++++ include/linux/phy.h | 3 +++ 2 files changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 99dc7bfbcd3c..5c51c7fda32a 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -125,6 +125,20 @@ struct ethtool_link_ext_state_info { }; }; +struct ethtool_link_ext_stats { + /* Custom Linux statistic for PHY level link down events. + * In a simpler world it should be equal to netdev->carrier_down_count + * unfortunately netdev also counts local reconfigurations which don't + * actually take the physical link down, not to mention NC-SI which, + * if present, keeps the link up regardless of host state. + * This statistic counts when PHY _actually_ went down, or lost link. + * + * Note that we need u64 for ethtool_stats_init() and comparisons + * to ETHTOOL_STAT_NOT_SET, but only u32 is exposed to the user. + */ + u64 link_down_events; +}; + /** * ethtool_rxfh_indir_default - get default value for RX flow hash indirection * @index: Index in RX flow hash indirection table @@ -481,6 +495,7 @@ struct ethtool_module_power_mode_params { * do not attach ext_substate attribute to netlink message). If link_ext_state * and link_ext_substate are unknown, return -ENODATA. If not implemented, * link_ext_state and link_ext_substate will not be sent to userspace. + * @get_link_ext_stats: Read extra link-related counters. * @get_eeprom_len: Read range of EEPROM addresses for validation of * @get_eeprom and @set_eeprom requests. * Returns 0 if device does not support EEPROM access. 
@@ -652,6 +667,8 @@ struct ethtool_ops { u32 (*get_link)(struct net_device *); int (*get_link_ext_state)(struct net_device *, struct ethtool_link_ext_state_info *); + void (*get_link_ext_stats)(struct net_device *dev, + struct ethtool_link_ext_stats *stats); int (*get_eeprom_len)(struct net_device *); int (*get_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); diff --git a/include/linux/phy.h b/include/linux/phy.h index ddf66198f751..9a3752c0c444 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -600,6 +600,7 @@ struct macsec_ops; * @psec: Pointer to Power Sourcing Equipment control struct * @lock: Mutex for serialization access to PHY * @state_queue: Work queue for state machine + * @link_down_events: Number of times link was lost * @shared: Pointer to private data shared by phys in one package * @priv: Pointer to driver private data * @@ -723,6 +724,8 @@ struct phy_device { int pma_extable; + unsigned int link_down_events; + void (*phy_link_change)(struct phy_device *phydev, bool up); void (*adjust_link)(struct net_device *dev); -- cgit From c3d96f690a790074b508fe183a41e36a00cd7ddd Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 3 Oct 2022 07:34:21 +0100 Subject: net, proc: Provide PROC_FS=n fallback for proc_create_net_single_write() Provide a CONFIG_PROC_FS=n fallback for proc_create_net_single_write(). Also provide a fallback for proc_create_net_data_write(). Fixes: 564def71765c ("proc: Add a way to make network proc files writable") Reported-by: kernel test robot Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org --- include/linux/proc_fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 81d6e4ec2294..0260f5ea98fe 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -208,8 +208,10 @@ static inline void proc_remove(struct proc_dir_entry *de) {} static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) { return 0; } #define proc_create_net_data(name, mode, parent, ops, state_size, data) ({NULL;}) +#define proc_create_net_data_write(name, mode, parent, ops, write, state_size, data) ({NULL;}) #define proc_create_net(name, mode, parent, state_size, ops) ({NULL;}) #define proc_create_net_single(name, mode, parent, show, data) ({NULL;}) +#define proc_create_net_single_write(name, mode, parent, show, write, data) ({NULL;}) static inline struct pid *tgid_pidfd_to_pid(const struct file *file) { -- cgit From 42fb06b391ace2aec5cdb1ebb8ff668f0a34332f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Oct 2022 08:49:29 +0100 Subject: net: Change the udp encap_err_rcv to allow use of {ip,ipv6}_icmp_error() Change the udp encap_err_rcv signature to match ip_icmp_error() and ipv6_icmp_error() so that those can be used from the called function and export them. Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org --- include/linux/udp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index 5cdba00a904a..dea57aa37df6 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -70,7 +70,8 @@ struct udp_sock { * For encapsulation sockets. 
*/ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); - void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); + void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload); int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb); void (*encap_destroy)(struct sock *sk); -- cgit From bd039b5ea2a91ea707ee8539df26456bd5be80af Mon Sep 17 00:00:00 2001 From: Andy Ren Date: Mon, 7 Nov 2022 09:42:42 -0800 Subject: net/core: Allow live renaming when an interface is up Allow a network interface to be renamed when the interface is up. As described in the netconsole documentation [1], when netconsole is used as a built-in, it will bring up the specified interface as soon as possible. As a result, user space will not be able to rename the interface since the kernel disallows renaming of interfaces that are administratively up unless the 'IFF_LIVE_RENAME_OK' private flag was set by the kernel. The original solution [2] to this problem was to add a new parameter to the netconsole configuration parameters that allows renaming of the interface used by netconsole while it is administratively up. However, during the discussion that followed, it became apparent that we have no reason to keep the current restriction and instead we should allow user space to rename interfaces regardless of their administrative state: 1. The restriction was put in place over 20 years ago when renaming was only possible via IOCTL and before rtnetlink started notifying user space about such changes like it does today. 2. The 'IFF_LIVE_RENAME_OK' flag was added over 3 years ago in version 5.2 and no regressions were reported. 3. In-kernel listeners to 'NETDEV_CHANGENAME' do not seem to care about the administrative state of interface. Therefore, allow user space to rename running interfaces by removing the restriction and the associated 'IFF_LIVE_RENAME_OK' flag. Help in possible triage by emitting a message to the kernel log that an interface was renamed while UP. [1] https://www.kernel.org/doc/Documentation/networking/netconsole.rst [2] https://lore.kernel.org/netdev/20221102002420.2613004-1-andy.ren@getcruise.com/ Signed-off-by: Andy Ren Reviewed-by: Ido Schimmel Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d45713a06568..4be87b89e481 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1650,7 +1650,6 @@ struct net_device_ops { * @IFF_FAILOVER: device is a failover master device * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device - * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN @@ -1686,7 +1685,7 @@ enum netdev_priv_flags { IFF_FAILOVER = 1<<27, IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, - IFF_LIVE_RENAME_OK = 1<<30, + /* was IFF_LIVE_RENAME_OK */ IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), }; @@ -1721,7 +1720,6 @@ enum netdev_priv_flags { #define IFF_FAILOVER IFF_FAILOVER #define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE #define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER -#define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK #define IFF_TX_SKB_NO_LINEAR IFF_TX_SKB_NO_LINEAR /* Specifies the type of the struct net_device::ml_priv pointer */ -- cgit From 3e52fba03a20234abc65a656cef063a1045d9723 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 8 Nov 2022 14:22:06 +0100 Subject: net: introduce a helper to move notifier block to different namespace Currently, the per-netdev variant of the per-net notifier follows its netdev from namespace to namespace. This is implemented by the move_netdevice_notifiers_dev_net() helper. Devlink needs to re-register its per-net notifier during devlink reload. Introduce a new helper called move_netdevice_notifier_net() and share the unregister/register code with the existing move_netdevice_notifiers_dev_net() helper.
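An illustrative caller-side sketch (hypothetical driver context; only move_netdevice_notifier_net() itself comes from this patch):

#include <linux/netdevice.h>
#include <linux/notifier.h>

static int foo_netdev_event(struct notifier_block *nb,
			    unsigned long event, void *ptr)
{
	/* handle netdev events in whichever namespace we are registered */
	return NOTIFY_DONE;
}

static struct notifier_block foo_nb = { .notifier_call = foo_netdev_event };

static void foo_reload_netns(struct net *src_net, struct net *dst_net)
{
	/* instead of unregister_netdevice_notifier_net(src_net, &foo_nb)
	 * followed by register_netdevice_notifier_net(dst_net, &foo_nb),
	 * move the block in one step: */
	move_netdevice_notifier_net(src_net, dst_net, &foo_nb);
}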
Signed-off-by: Jiri Pirko Reviewed-by: Ido Schimmel Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4be87b89e481..02a2318da7c7 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2826,6 +2826,8 @@ int unregister_netdevice_notifier(struct notifier_block *nb); int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb); int unregister_netdevice_notifier_net(struct net *net, struct notifier_block *nb); +void move_netdevice_notifier_net(struct net *src_net, struct net *dst_net, + struct notifier_block *nb); int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn); -- cgit From f6479ea4e5990e17b9690d66474198dcdba1b2c1 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 8 Nov 2022 14:25:56 +0000 Subject: net: mdio: add mdiodev_c45_(read|write) Signed-off-by: Russell King (Oracle) Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- include/linux/mdio.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mdio.h b/include/linux/mdio.h index 00177567cfef..f7fbbf3069e7 100644 --- a/include/linux/mdio.h +++ b/include/linux/mdio.h @@ -488,6 +488,19 @@ static inline int mdiobus_c45_write(struct mii_bus *bus, int prtad, int devad, return mdiobus_write(bus, prtad, mdiobus_c45_addr(devad, regnum), val); } +static inline int mdiodev_c45_read(struct mdio_device *mdiodev, int devad, + u16 regnum) +{ + return mdiobus_c45_read(mdiodev->bus, mdiodev->addr, devad, regnum); +} + +static inline int mdiodev_c45_write(struct mdio_device *mdiodev, u32 devad, + u16 regnum, u16 val) +{ + return mdiobus_c45_write(mdiodev->bus, mdiodev->addr, devad, regnum, + val); +} + int mdiobus_register_device(struct mdio_device *mdiodev); int mdiobus_unregister_device(struct mdio_device *mdiodev); bool mdiobus_is_registered_device(struct mii_bus *bus, int addr); -- cgit From cc514101a97e3fb48f8617c8f291db798d10d831 Mon Sep 17 00:00:00 2001 From: Sujuan Chen Date: Sat, 5 Nov 2022 23:36:18 +0100 Subject: net: ethernet: mtk_wed: introduce wed mcu support Introduce WED mcu support used to configure WED WO chip. This is a preliminary patch in order to add RX Wireless Ethernet Dispatch available on MT7986 SoC. Tested-by: Daniel Golle Co-developed-by: Lorenzo Bianconi Signed-off-by: Lorenzo Bianconi Signed-off-by: Sujuan Chen Signed-off-by: David S. 
Miller --- include/linux/soc/mediatek/mtk_wed.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 4450c8b7a1cb..2cc2f1e43ba9 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -11,6 +11,35 @@ struct mtk_wed_hw; struct mtk_wdma_desc; +enum mtk_wed_wo_cmd { + MTK_WED_WO_CMD_WED_CFG, + MTK_WED_WO_CMD_WED_RX_STAT, + MTK_WED_WO_CMD_RRO_SER, + MTK_WED_WO_CMD_DBG_INFO, + MTK_WED_WO_CMD_DEV_INFO, + MTK_WED_WO_CMD_BSS_INFO, + MTK_WED_WO_CMD_STA_REC, + MTK_WED_WO_CMD_DEV_INFO_DUMP, + MTK_WED_WO_CMD_BSS_INFO_DUMP, + MTK_WED_WO_CMD_STA_REC_DUMP, + MTK_WED_WO_CMD_BA_INFO_DUMP, + MTK_WED_WO_CMD_FBCMD_Q_DUMP, + MTK_WED_WO_CMD_FW_LOG_CTRL, + MTK_WED_WO_CMD_LOG_FLUSH, + MTK_WED_WO_CMD_CHANGE_STATE, + MTK_WED_WO_CMD_CPU_STATS_ENABLE, + MTK_WED_WO_CMD_CPU_STATS_DUMP, + MTK_WED_WO_CMD_EXCEPTION_INIT, + MTK_WED_WO_CMD_PROF_CTRL, + MTK_WED_WO_CMD_STA_BA_DUMP, + MTK_WED_WO_CMD_BA_CTRL_DUMP, + MTK_WED_WO_CMD_RXCNT_CTRL, + MTK_WED_WO_CMD_RXCNT_INFO, + MTK_WED_WO_CMD_SET_CAP, + MTK_WED_WO_CMD_CCIF_RING_DUMP, + MTK_WED_WO_CMD_WED_END +}; + enum mtk_wed_bus_tye { MTK_WED_BUS_PCIE, MTK_WED_BUS_AXI, -- cgit From 084d60ce0c6cef96024d53a58b92d7ff2d8b9318 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 5 Nov 2022 23:36:20 +0100 Subject: net: ethernet: mtk_wed: rename tx_wdma array in rx_wdma Rename tx_wdma queue array in rx_wdma since this is rx side of wdma soc. Moreover rename mtk_wed_wdma_ring_setup routine in mtk_wed_wdma_rx_ring_setup() Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- include/linux/soc/mediatek/mtk_wed.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 2cc2f1e43ba9..956978320f8b 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -7,6 +7,7 @@ #include #define MTK_WED_TX_QUEUES 2 +#define MTK_WED_RX_QUEUES 2 struct mtk_wed_hw; struct mtk_wdma_desc; @@ -66,7 +67,7 @@ struct mtk_wed_device { struct mtk_wed_ring tx_ring[MTK_WED_TX_QUEUES]; struct mtk_wed_ring txfree_ring; - struct mtk_wed_ring tx_wdma[MTK_WED_TX_QUEUES]; + struct mtk_wed_ring rx_wdma[MTK_WED_RX_QUEUES]; struct { int size; -- cgit From 4c5de09eb0d05fc9e73ff8f0e052622f1f3df6d8 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 5 Nov 2022 23:36:21 +0100 Subject: net: ethernet: mtk_wed: add configure wed wo support Enable RX Wireless Ethernet Dispatch available on MT7986 Soc. Tested-by: Daniel Golle Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. 
Miller --- include/linux/soc/mediatek/mtk_wed.h | 76 +++++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 956978320f8b..8294978f4bca 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -5,10 +5,13 @@ #include #include #include +#include #define MTK_WED_TX_QUEUES 2 #define MTK_WED_RX_QUEUES 2 +#define WED_WO_STA_REC 0x6 + struct mtk_wed_hw; struct mtk_wdma_desc; @@ -41,21 +44,37 @@ enum mtk_wed_wo_cmd { MTK_WED_WO_CMD_WED_END }; +struct mtk_rxbm_desc { + __le32 buf0; + __le32 token; +} __packed __aligned(4); + enum mtk_wed_bus_tye { MTK_WED_BUS_PCIE, MTK_WED_BUS_AXI, }; +#define MTK_WED_RING_CONFIGURED BIT(0) struct mtk_wed_ring { struct mtk_wdma_desc *desc; dma_addr_t desc_phys; u32 desc_size; int size; + u32 flags; u32 reg_base; void __iomem *wpdma; }; +struct mtk_wed_wo_rx_stats { + __le16 wlan_idx; + __le16 tid; + __le32 rx_pkt_cnt; + __le32 rx_byte_cnt; + __le32 rx_err_cnt; + __le32 rx_drop_cnt; +}; + struct mtk_wed_device { #ifdef CONFIG_NET_MEDIATEK_SOC_WED const struct mtk_wed_ops *ops; @@ -64,9 +83,12 @@ struct mtk_wed_device { bool init_done, running; int wdma_idx; int irq; + u8 version; struct mtk_wed_ring tx_ring[MTK_WED_TX_QUEUES]; + struct mtk_wed_ring rx_ring[MTK_WED_RX_QUEUES]; struct mtk_wed_ring txfree_ring; + struct mtk_wed_ring tx_wdma[MTK_WED_TX_QUEUES]; struct mtk_wed_ring rx_wdma[MTK_WED_RX_QUEUES]; struct { @@ -74,7 +96,20 @@ struct mtk_wed_device { void **pages; struct mtk_wdma_desc *desc; dma_addr_t desc_phys; - } buf_ring; + } tx_buf_ring; + + struct { + int size; + struct page_frag_cache rx_page; + struct mtk_rxbm_desc *desc; + dma_addr_t desc_phys; + } rx_buf_ring; + + struct { + struct mtk_wed_ring ring; + dma_addr_t miod_phys; + dma_addr_t fdbk_phys; + } rro; /* filled by driver: */ struct { @@ -83,22 +118,36 @@ struct mtk_wed_device { struct pci_dev *pci_dev; }; enum mtk_wed_bus_tye bus_type; + void __iomem *base; + u32 phy_base; u32 wpdma_phys; u32 wpdma_int; u32 wpdma_mask; u32 wpdma_tx; u32 wpdma_txfree; + u32 wpdma_rx_glo; + u32 wpdma_rx; + + bool wcid_512; u16 token_start; unsigned int nbuf; + unsigned int rx_nbuf; + unsigned int rx_npkt; + unsigned int rx_size; u8 tx_tbit[MTK_WED_TX_QUEUES]; + u8 rx_tbit[MTK_WED_RX_QUEUES]; u8 txfree_tbit; u32 (*init_buf)(void *ptr, dma_addr_t phys, int token_id); int (*offload_enable)(struct mtk_wed_device *wed); void (*offload_disable)(struct mtk_wed_device *wed); + u32 (*init_rx_buf)(struct mtk_wed_device *wed, int size); + void (*release_rx_buf)(struct mtk_wed_device *wed); + void (*update_wo_rx_stats)(struct mtk_wed_device *wed, + struct mtk_wed_wo_rx_stats *stats); } wlan; #endif }; @@ -107,9 +156,15 @@ struct mtk_wed_ops { int (*attach)(struct mtk_wed_device *dev); int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring, void __iomem *regs); + int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring, + void __iomem *regs); int (*txfree_ring_setup)(struct mtk_wed_device *dev, void __iomem *regs); + int (*msg_update)(struct mtk_wed_device *dev, int cmd_id, + void *data, int len); void (*detach)(struct mtk_wed_device *dev); + void (*ppe_check)(struct mtk_wed_device *dev, struct sk_buff *skb, + u32 reason, u32 hash); void (*stop)(struct mtk_wed_device *dev); void (*start)(struct mtk_wed_device *dev, u32 irq_mask); @@ -144,6 +199,16 @@ mtk_wed_device_attach(struct mtk_wed_device *dev) return ret; } +static inline bool 
+mtk_wed_get_rx_capa(struct mtk_wed_device *dev) +{ +#ifdef CONFIG_NET_MEDIATEK_SOC_WED + return dev->version != 1; +#else + return false; +#endif +} + #ifdef CONFIG_NET_MEDIATEK_SOC_WED #define mtk_wed_device_active(_dev) !!(_dev)->ops #define mtk_wed_device_detach(_dev) (_dev)->ops->detach(_dev) @@ -160,6 +225,12 @@ mtk_wed_device_attach(struct mtk_wed_device *dev) (_dev)->ops->irq_get(_dev, _mask) #define mtk_wed_device_irq_set_mask(_dev, _mask) \ (_dev)->ops->irq_set_mask(_dev, _mask) +#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) \ + (_dev)->ops->rx_ring_setup(_dev, _ring, _regs) +#define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) \ + (_dev)->ops->ppe_check(_dev, _skb, _reason, _hash) +#define mtk_wed_device_update_msg(_dev, _id, _msg, _len) \ + (_dev)->ops->msg_update(_dev, _id, _msg, _len) #else static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) { @@ -173,6 +244,9 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) #define mtk_wed_device_reg_write(_dev, _reg, _val) do {} while (0) #define mtk_wed_device_irq_get(_dev, _mask) 0 #define mtk_wed_device_irq_set_mask(_dev, _mask) do {} while (0) +#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) -ENODEV +#define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) do {} while (0) +#define mtk_wed_device_update_msg(_dev, _id, _msg, _len) -ENODEV #endif #endif -- cgit From 75ab70ec5cefade915a999886e4863fe6a38f8b5 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Wed, 9 Nov 2022 15:09:45 -0800 Subject: ptp: remove the .adjfreq interface function Now that all drivers have been converted to .adjfine, we can remove the .adjfreq from the interface structure. Signed-off-by: Jacob Keller Cc: Richard Cochran Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index f4781c5766d6..fdffa6a98d79 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -77,12 +77,6 @@ struct ptp_system_timestamp { * nominal frequency in parts per million, but with a * 16 bit binary fractional field. * - * @adjfreq: Adjusts the frequency of the hardware clock. - * This method is deprecated. New drivers should implement - * the @adjfine method instead. - * parameter delta: Desired frequency offset from nominal frequency - * in parts per billion - * * @adjphase: Adjusts the phase offset of the hardware clock. * parameter delta: Desired change in nanoseconds. * @@ -174,7 +168,6 @@ struct ptp_clock_info { int pps; struct ptp_pin_desc *pin_config; int (*adjfine)(struct ptp_clock_info *ptp, long scaled_ppm); - int (*adjfreq)(struct ptp_clock_info *ptp, s32 delta); int (*adjphase)(struct ptp_clock_info *ptp, s32 phase); int (*adjtime)(struct ptp_clock_info *ptp, s64 delta); int (*gettime64)(struct ptp_clock_info *ptp, struct timespec64 *ts); -- cgit From 354259fa73e2aac92ae5e19522adb69a92c15b49 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Nov 2022 09:57:58 +0000 Subject: net: remove skb->vlan_present skb->vlan_present seems redundant. We can instead derive it from this boolean expression: vlan_present = skb->vlan_proto != 0 || skb->vlan_tci != 0 Add a new union, to access both fields in a single load/store when possible. union { u32 vlan_all; struct { __be16 vlan_proto; __u16 vlan_tci; }; }; This allows following patch to remove a conditional test in GRO stack. 
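For illustration, the win from the union can be modeled in plain C as follows (a minimal user-space sketch, not the kernel code itself; kernel field types and endianness details are simplified):

  #include <stdint.h>

  union vlan_fields {
          uint32_t vlan_all;
          struct {
                  uint16_t vlan_proto;    /* __be16 in the kernel */
                  uint16_t vlan_tci;
          };
  };

  /* proto != 0 || tci != 0 collapses into one 32-bit test */
  static int tag_present(const union vlan_fields *v)
  {
          return v->vlan_all != 0;
  }

Copying the tag likewise becomes a single 32-bit store (dst->vlan_all = src->vlan_all), which is exactly what the reworked __vlan_hwaccel_copy_tag() in the hunk below does.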
Note: We move remcsum_offload to keep TC_AT_INGRESS_MASK and SKB_MONO_DELIVERY_TIME_MASK unchanged. Signed-off-by: Eric Dumazet Acked-by: Yonghong Song Acked-by: Martin KaFai Lau Signed-off-by: Jakub Kicinski --- include/linux/if_vlan.h | 9 +++------ include/linux/skbuff.h | 18 ++++++++++-------- 2 files changed, 13 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index e00c4ee81ff7..6864b89ef868 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -76,7 +76,7 @@ static inline bool is_vlan_dev(const struct net_device *dev) return dev->priv_flags & IFF_802_1Q_VLAN; } -#define skb_vlan_tag_present(__skb) ((__skb)->vlan_present) +#define skb_vlan_tag_present(__skb) (!!(__skb)->vlan_all) #define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci) #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) #define skb_vlan_tag_get_cfi(__skb) (!!((__skb)->vlan_tci & VLAN_CFI_MASK)) @@ -471,7 +471,7 @@ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, */ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) { - skb->vlan_present = 0; + skb->vlan_all = 0; } /** @@ -483,9 +483,7 @@ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) */ static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src) { - dst->vlan_present = src->vlan_present; - dst->vlan_proto = src->vlan_proto; - dst->vlan_tci = src->vlan_tci; + dst->vlan_all = src->vlan_all; } /* @@ -519,7 +517,6 @@ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, { skb->vlan_proto = vlan_proto; skb->vlan_tci = vlan_tci; - skb->vlan_present = 1; } /** diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 59c9fd55699d..4e464a27adaf 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -818,7 +818,7 @@ typedef unsigned char *sk_buff_data_t; * @mark: Generic packet mark * @reserved_tailroom: (aka @mark) number of bytes of free space available * at the tail of an sk_buff - * @vlan_present: VLAN tag is present + * @vlan_all: vlan fields (proto & tci) * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information * @inner_protocol: Protocol (encapsulation) @@ -951,7 +951,7 @@ struct sk_buff { /* private: */ __u8 __pkt_vlan_present_offset[0]; /* public: */ - __u8 vlan_present:1; /* See PKT_VLAN_PRESENT_BIT */ + __u8 remcsum_offload:1; __u8 csum_complete_sw:1; __u8 csum_level:2; __u8 dst_pending_confirm:1; @@ -966,7 +966,6 @@ struct sk_buff { __u8 ipvs_property:1; __u8 inner_protocol_type:1; - __u8 remcsum_offload:1; #ifdef CONFIG_NET_SWITCHDEV __u8 offload_fwd_mark:1; __u8 offload_l3_fwd_mark:1; @@ -999,8 +998,13 @@ struct sk_buff { __u32 priority; int skb_iif; __u32 hash; - __be16 vlan_proto; - __u16 vlan_tci; + union { + u32 vlan_all; + struct { + __be16 vlan_proto; + __u16 vlan_tci; + }; + }; #if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS) union { unsigned int napi_id; @@ -1059,15 +1063,13 @@ struct sk_buff { #endif #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) -/* if you move pkt_vlan_present, tc_at_ingress, or mono_delivery_time +/* if you move tc_at_ingress or mono_delivery_time * around, you also must adapt these constants. 
*/ #ifdef __BIG_ENDIAN_BITFIELD -#define PKT_VLAN_PRESENT_BIT 7 #define TC_AT_INGRESS_MASK (1 << 0) #define SKB_MONO_DELIVERY_TIME_MASK (1 << 2) #else -#define PKT_VLAN_PRESENT_BIT 0 #define TC_AT_INGRESS_MASK (1 << 7) #define SKB_MONO_DELIVERY_TIME_MASK (1 << 5) #endif -- cgit From 2c925db0a7d69b404d6bfe4c037935c2d367913d Mon Sep 17 00:00:00 2001 From: Ofer Levi Date: Tue, 9 Feb 2021 17:48:11 +0200 Subject: net/mlx5e: Support enhanced CQE compression The CQE compression feature improves performance by reducing the PCI bandwidth bottleneck on CQE writes. Enhanced CQE compression, introduced in ConnectX-6, aims to reduce the CPU utilization of SW-side packet decompression: the ownership bit, whose rewrite is likely to cost a cache miss, is replaced by a validity byte handled solely by HW. Another advantage of the enhanced feature is that session packets are available to SW as soon as a single CQE slot is filled, instead of waiting for the session to close; this improves packet latency from NIC to host. Performance: Following are the tested scenarios and results comparing basic and enhanced CQE compression.

Setup: IXIA 100GbE connected directly to port 0 and port 1 of a ConnectX-6 Dx 100GbE dual port.

Case #1, RX only, single flow goes to single queue: IRQ rate reduced by ~30%, CPU utilization improved by 2%.

Case #2, IP forwarding from port 1 to port 0, single flow goes to single queue: Avg latency improved from 60us to 21us, frame loss improved from 0.5% to 0.0%.

Case #3, IP forwarding from port 1 to port 0, max throughput, IXIA sends 100%, 8192 UDP flows, goes to 24 queues: Enhanced is equal or slightly better than basic.

Testing the basic compression feature with this patch shows no performance degradation of the basic compression feature. Signed-off-by: Ofer Levi Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 1ff91cb79ded..eb3fac30488b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -882,6 +882,12 @@ static inline u8 get_cqe_opcode(struct mlx5_cqe64 *cqe) return cqe->op_own >> 4; } +static inline u8 get_cqe_enhanced_num_mini_cqes(struct mlx5_cqe64 *cqe) +{ + /* num_of_mini_cqes is zero based */ + return get_cqe_opcode(cqe) + 1; +} + static inline u8 get_cqe_lro_tcppsh(struct mlx5_cqe64 *cqe) { return (cqe->lro.tcppsh_abort_dupack >> 6) & 1; -- cgit From 92125c3a602454824d70edb1b2abb382811cab4f Mon Sep 17 00:00:00 2001 From: Nick Child Date: Thu, 10 Nov 2022 15:32:17 -0600 Subject: ibmvnic: Add hotpluggable CPU callbacks to reassign affinity hints When CPUs are added and removed, ibmvnic devices will reassign hint values. Introduce a new cpu hotplug state CPUHP_IBMVNIC_DEAD to signal to ibmvnic devices that the CPU has been removed and it is time to reset affinity hint assignments. On the other hand, when CPUs are being added, add a state instance to CPUHP_AP_ONLINE_DYN which will trigger a reassignment of affinity hints once the new CPUs are online. This implementation is based on the virtio_net driver. Signed-off-by: Thomas Falcon Signed-off-by: Dany Madden Signed-off-by: Nick Child Reviewed-by: Rick Lindsley Reviewed-by: Haren Myneni Signed-off-by: David S.
Miller --- include/linux/cpuhotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index f61447913db9..c8bc85a87b1e 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -69,6 +69,7 @@ enum cpuhp_state { CPUHP_X86_APB_DEAD, CPUHP_X86_MCE_DEAD, CPUHP_VIRT_NET_DEAD, + CPUHP_IBMVNIC_DEAD, CPUHP_SLUB_DEAD, CPUHP_DEBUG_OBJ_DEAD, CPUHP_MM_WRITEBACK_DEAD, -- cgit From 2d577252579b3efb9e934b68948a2edfa9920110 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:23 +0530 Subject: bpf: Remove BPF_MAP_OFF_ARR_MAX In f71b2f64177a ("bpf: Refactor map->off_arr handling"), map->off_arr was refactored to be btf_field_offs. The number of field offsets is equal to the maximum number of possible fields, limited by BTF_FIELDS_MAX. Hence, reuse BTF_FIELDS_MAX, as spin_lock and timer no longer need to be handled specially for offset sorting; fix the comment, and remove the incorrect WARN_ON, as rec->cnt can never exceed this value. The reason to keep a separate constant was that it was always 2 more than the total number of kptrs. This is no longer the case. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 798aec816970..1a66a1df1af1 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -165,9 +165,8 @@ struct bpf_map_ops { }; enum { - /* Support at most 8 pointers in a BTF type */ - BTF_FIELDS_MAX = 10, - BPF_MAP_OFF_ARR_MAX = BTF_FIELDS_MAX, + /* Support at most 10 fields in a BTF type */ + BTF_FIELDS_MAX = 10, }; enum btf_field_type { @@ -203,8 +202,8 @@ struct btf_record { struct btf_field_offs { u32 cnt; - u32 field_off[BPF_MAP_OFF_ARR_MAX]; - u8 field_sz[BPF_MAP_OFF_ARR_MAX]; + u32 field_off[BTF_FIELDS_MAX]; + u8 field_sz[BTF_FIELDS_MAX]; }; struct bpf_map { -- cgit From e5feed0f64f73e167ef70755d3dc2db959d8fd5c Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:24 +0530 Subject: bpf: Fix copy_map_value, zero_map_value The current offset needs to also skip over the already copied region in addition to the size of the next field. This case manifests where there are gaps between adjacent special fields.
It was observed that for a map value with size 48, having fields at:

  off:  0, 16, 32
  size: 4, 16, 16

The current code does:

  memcpy(dst + 0, src + 0, 0)
  memcpy(dst + 4, src + 4, 12)
  memcpy(dst + 20, src + 20, 12)
  memcpy(dst + 36, src + 36, 12)

With the fix, it is done correctly as:

  memcpy(dst + 0, src + 0, 0)
  memcpy(dst + 4, src + 4, 12)
  memcpy(dst + 32, src + 32, 0)
  memcpy(dst + 48, src + 48, 0)

Fixes: 4d7d7f69f4b1 ("bpf: Adapt copy_map_value for multiple offset case") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1a66a1df1af1..f08eb2d27de0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -360,7 +360,7 @@ static inline void bpf_obj_memcpy(struct btf_field_offs *foffs, u32 sz = next_off - curr_off; memcpy(dst + curr_off, src + curr_off, sz); - curr_off += foffs->field_sz[i]; + curr_off += foffs->field_sz[i] + sz; } memcpy(dst + curr_off, src + curr_off, size - curr_off); } @@ -390,7 +390,7 @@ static inline void bpf_obj_memzero(struct btf_field_offs *foffs, void *dst, u32 u32 sz = next_off - curr_off; memset(dst + curr_off, 0, sz); - curr_off += foffs->field_sz[i]; + curr_off += foffs->field_sz[i] + sz; } memset(dst + curr_off, 0, size - curr_off); } -- cgit From f0c5941ff5b255413d31425bb327c2aec3625673 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:25 +0530 Subject: bpf: Support bpf_list_head in map values Add the support on the map side to parse, recognize, verify, and build a metadata table for a new special field of the type struct bpf_list_head. To parameterize the bpf_list_head for a certain value type and the list_node member it will accept in that value type, we use BTF declaration tags. The definition of bpf_list_head in a map value will be done as follows:

  struct foo {
          struct bpf_list_node node;
          int data;
  };

  struct map_value {
          struct bpf_list_head head __contains(foo, node);
  };

Then, the bpf_list_head only allows adding to the list 'head' using the bpf_list_node 'node' for the type struct foo. The 'contains' annotation is a BTF declaration tag of the form "contains:name:node", where the name is used to look up the type in the map BTF, with its kind hardcoded to BTF_KIND_STRUCT during the lookup. The node part defines the name of the member in this type that has the type struct bpf_list_node, which is actually used for linking into the linked list. For now, the 'kind' part is hardcoded as struct. This allows building intrusive linked lists in BPF, using container_of to obtain a pointer to the entry, while being completely type safe from the perspective of the verifier. The verifier knows exactly the type of the nodes, and knows that list helpers return that type at some fixed offset where the bpf_list_node member used for this list exists. The verifier also uses this information to disallow adding types that are not accepted by a certain list. For now, no elements can be added to such lists. Support for that is coming in future patches, hence draining and freeing items is done with a TODO that will be resolved in a future patch. Note that the bpf_list_head_free function moves the list out to a local variable under the lock and releases it, doing the actual draining of the list items outside the lock.
While this helps with not holding the lock for too long pessimizing other concurrent list operations, it is also necessary for deadlock prevention: unless every function called in the critical section would be notrace, a fentry/fexit program could attach and call bpf_map_update_elem again on the map, leading to the same lock being acquired if the key matches and lead to a deadlock. While this requires some special effort on part of the BPF programmer to trigger and is highly unlikely to occur in practice, it is always better if we can avoid such a condition. While notrace would prevent this, doing the draining outside the lock has advantages of its own, hence it is used to also fix the deadlock related problem. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-5-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f08eb2d27de0..05f98e9e5c48 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -175,6 +175,7 @@ enum btf_field_type { BPF_KPTR_UNREF = (1 << 2), BPF_KPTR_REF = (1 << 3), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, + BPF_LIST_HEAD = (1 << 4), }; struct btf_field_kptr { @@ -184,11 +185,18 @@ struct btf_field_kptr { u32 btf_id; }; +struct btf_field_list_head { + struct btf *btf; + u32 value_btf_id; + u32 node_offset; +}; + struct btf_field { u32 offset; enum btf_field_type type; union { struct btf_field_kptr kptr; + struct btf_field_list_head list_head; }; }; @@ -266,6 +274,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: return "kptr"; + case BPF_LIST_HEAD: + return "bpf_list_head"; default: WARN_ON_ONCE(1); return "unknown"; @@ -282,6 +292,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: return sizeof(u64); + case BPF_LIST_HEAD: + return sizeof(struct bpf_list_head); default: WARN_ON_ONCE(1); return 0; @@ -298,6 +310,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) case BPF_KPTR_UNREF: case BPF_KPTR_REF: return __alignof__(u64); + case BPF_LIST_HEAD: + return __alignof__(struct bpf_list_head); default: WARN_ON_ONCE(1); return 0; @@ -403,6 +417,9 @@ static inline void zero_map_value(struct bpf_map *map, void *dst) void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, bool lock_src); void bpf_timer_cancel_and_free(void *timer); +void bpf_list_head_free(const struct btf_field *field, void *list_head, + struct bpf_spin_lock *spin_lock); + int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size); struct bpf_offload_dev; -- cgit From 2de2669b4e52b2ae2f118bfc310004f50b47f0f5 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:26 +0530 Subject: bpf: Rename RET_PTR_TO_ALLOC_MEM Currently, the verifier has two return types, RET_PTR_TO_ALLOC_MEM, and RET_PTR_TO_ALLOC_MEM_OR_NULL, however the former is confusingly named to imply that it carries MEM_ALLOC, while only the latter does. This causes confusion during code review leading to conclusions like that the return value of RET_PTR_TO_DYNPTR_MEM_OR_NULL (which is RET_PTR_TO_ALLOC_MEM | PTR_MAYBE_NULL) may be consumable by bpf_ringbuf_{submit,commit}. Rename it to make it clear MEM_ALLOC needs to be tacked on top of RET_PTR_TO_MEM. 
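For readers tracking the flag algebra, the composed value decomposes as shown below (sketch; base_type()/type_flag() are the masking helpers the verifier already uses for such composed types, assumed here from bpf_verifier.h):

  u32 ret = RET_PTR_TO_ALLOC_MEM_OR_NULL;
  /* == PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_MEM after this patch */

  base_type(ret);                    /* RET_PTR_TO_MEM */
  type_flag(ret) & MEM_ALLOC;        /* set: memory was "allocated" by a helper */
  type_flag(ret) & PTR_MAYBE_NULL;   /* set: caller must NULL-check */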
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 05f98e9e5c48..2fe3ec620d54 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -607,7 +607,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET, /* returns a pointer to a socket */ RET_PTR_TO_TCP_SOCK, /* returns a pointer to a tcp_sock */ RET_PTR_TO_SOCK_COMMON, /* returns a pointer to a sock_common */ - RET_PTR_TO_ALLOC_MEM, /* returns a pointer to dynamically allocated memory */ + RET_PTR_TO_MEM, /* returns a pointer to memory */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ __BPF_RET_TYPE_MAX, @@ -617,8 +617,8 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, - RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_ALLOC_MEM, - RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM, + RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_MEM, + RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is -- cgit From 894f2a8b1673a355a1a7507a4dfa6a3c836d07c1 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:27 +0530 Subject: bpf: Rename MEM_ALLOC to MEM_RINGBUF Currently, verifier uses MEM_ALLOC type tag to specially tag memory returned from bpf_ringbuf_reserve helper. However, this is currently only used for this purpose and there is an implicit assumption that it only refers to ringbuf memory (e.g. the check for ARG_PTR_TO_ALLOC_MEM in check_func_arg_reg_off). Hence, rename MEM_ALLOC to MEM_RINGBUF to indicate this special relationship and instead open the use of MEM_ALLOC for more generic allocations made for user types. Also, since ARG_PTR_TO_ALLOC_MEM_OR_NULL is unused, simply drop it. Finally, update selftests using 'alloc_' verifier string to 'ringbuf_'. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2fe3ec620d54..afc1c51b59ff 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -488,10 +488,8 @@ enum bpf_type_flag { */ MEM_RDONLY = BIT(1 + BPF_BASE_TYPE_BITS), - /* MEM was "allocated" from a different helper, and cannot be mixed - * with regular non-MEM_ALLOC'ed MEM types. - */ - MEM_ALLOC = BIT(2 + BPF_BASE_TYPE_BITS), + /* MEM points to BPF ring buffer reservation. */ + MEM_RINGBUF = BIT(2 + BPF_BASE_TYPE_BITS), /* MEM is in user address space. 
*/ MEM_USER = BIT(3 + BPF_BASE_TYPE_BITS), @@ -565,7 +563,7 @@ enum bpf_arg_type { ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ - ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */ + ARG_PTR_TO_RINGBUF_MEM, /* pointer to dynamically reserved ringbuf memory */ ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */ ARG_PTR_TO_BTF_ID_SOCK_COMMON, /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */ ARG_PTR_TO_PERCPU_BTF_ID, /* pointer to in-kernel percpu type */ @@ -582,7 +580,6 @@ enum bpf_arg_type { ARG_PTR_TO_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_MEM, ARG_PTR_TO_CTX_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_CTX, ARG_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET, - ARG_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM, ARG_PTR_TO_STACK_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_STACK, ARG_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | ARG_PTR_TO_BTF_ID, /* pointer to memory does not need to be initialized, helper function must fill @@ -617,7 +614,7 @@ enum bpf_return_type { RET_PTR_TO_SOCKET_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET, RET_PTR_TO_TCP_SOCK_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK, RET_PTR_TO_SOCK_COMMON_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON, - RET_PTR_TO_ALLOC_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_ALLOC | RET_PTR_TO_MEM, + RET_PTR_TO_RINGBUF_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_RINGBUF | RET_PTR_TO_MEM, RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, -- cgit From 6728aea7216c0c06c98e2e58d753a5e8b2ae1c6f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Tue, 15 Nov 2022 00:45:28 +0530 Subject: bpf: Refactor btf_struct_access Instead of having to pass multiple arguments that describe the register, pass the bpf_reg_state into the btf_struct_access callback. Currently, all call sites simply reuse the btf and btf_id of the reg they want to check the access of. The only exception to this pattern is the callsite in check_ptr_to_map_access, hence for that case create a dummy reg to simulate PTR_TO_BTF_ID access. 
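The dummy-reg construction mentioned above amounts to roughly the following (simplified sketch, not the exact verifier code; the callback signature matches the hunks below):

  struct bpf_reg_state map_reg;

  memset(&map_reg, 0, sizeof(map_reg));
  map_reg.type = PTR_TO_BTF_ID;       /* simulate a BTF pointer access */
  map_reg.btf = btf;                  /* BTF containing the map's value type */
  map_reg.btf_id = btf_id;
  ret = btf_struct_access(&env->log, &map_reg, off, size, atype,
                          &next_btf_id, &flag);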
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221114191547.1694267-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 17 ++++++++--------- include/linux/filter.h | 8 ++++---- 2 files changed, 12 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index afc1c51b59ff..49f9d2bec401 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -771,6 +771,7 @@ struct bpf_prog_ops { union bpf_attr __user *uattr); }; +struct bpf_reg_state; struct bpf_verifier_ops { /* return eBPF function prototype for verification */ const struct bpf_func_proto * @@ -792,9 +793,8 @@ struct bpf_verifier_ops { struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); int (*btf_struct_access)(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag); }; @@ -2080,9 +2080,9 @@ static inline bool bpf_tracing_btf_ctx_access(int off, int size, return btf_ctx_access(off, size, type, prog, info); } -int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, +int btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag); bool btf_struct_ids_match(struct bpf_verifier_log *log, const struct btf *btf, u32 id, int off, @@ -2333,9 +2333,8 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) } static inline int btf_struct_access(struct bpf_verifier_log *log, - const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, u32 *next_btf_id, enum bpf_type_flag *flag) { return -EACCES; diff --git a/include/linux/filter.h b/include/linux/filter.h index efc42a6e3aed..787d35dbf5b0 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -568,10 +568,10 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); extern struct mutex nf_conn_btf_access_lock; -extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, - const struct btf_type *t, int off, int size, - enum bpf_access_type atype, u32 *next_btf_id, - enum bpf_type_flag *flag); +extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size, enum bpf_access_type atype, + u32 *next_btf_id, enum bpf_type_flag *flag); typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, -- cgit From 14d898f3c1b3bf9c4375ee3255ec9e9b89a35578 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 8 Nov 2022 15:05:59 +0100 Subject: dev: Move received_rps counter next to RPS members in softnet data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the received_rps counter value next to the other RPS-related members in softnet_data. This closes two four-byte holes in the structure, making room for another pointer in the first two cache lines without bumping the xmit struct to its own line. 
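A convenient way to verify this sort of layout claim is pahole against the built kernel (illustrative command; exact offsets depend on the kernel config):

  $ pahole -C softnet_data vmlinux | grep -E 'hole|cacheline'

Run before and after the change, the two 4-byte holes in the first cache lines should disappear.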
Acked-by: Song Liu Reviewed-by: Stanislav Fomichev Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221108140601.149971-2-toke@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/netdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 02a2318da7c7..ddc59ef98500 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3128,7 +3128,6 @@ struct softnet_data { /* stats */ unsigned int processed; unsigned int time_squeeze; - unsigned int received_rps; #ifdef CONFIG_RPS struct softnet_data *rps_ipi_list; #endif @@ -3161,6 +3160,7 @@ struct softnet_data { unsigned int cpu; unsigned int input_queue_tail; #endif + unsigned int received_rps; unsigned int dropped; struct sk_buff_head input_pkt_queue; struct napi_struct backlog; -- cgit From 32637e33003f36e75e9147788cc0e2f21706ef99 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 8 Nov 2022 15:06:00 +0100 Subject: bpf: Expand map key argument of bpf_redirect_map to u64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For queueing packets in XDP we want to add a new redirect map type with support for 64-bit indexes. To prepare for this, expand the width of the 'key' argument to the bpf_redirect_map() helper. Since BPF registers are always 64-bit, this should be safe to do after the fact. Acked-by: Song Liu Reviewed-by: Stanislav Fomichev Signed-off-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/20221108140601.149971-3-toke@redhat.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 +- include/linux/filter.h | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 49f9d2bec401..54462dd28824 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -135,7 +135,7 @@ struct bpf_map_ops { struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner); /* Misc helpers.*/ - int (*map_redirect)(struct bpf_map *map, u32 ifindex, u64 flags); + int (*map_redirect)(struct bpf_map *map, u64 key, u64 flags); /* map_meta_equal must be implemented for maps that can be * used as an inner map.
It is a runtime check to ensure diff --git a/include/linux/filter.h b/include/linux/filter.h index 787d35dbf5b0..bf701976056e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -643,13 +643,13 @@ struct bpf_nh_params { }; struct bpf_redirect_info { - u32 flags; - u32 tgt_index; + u64 tgt_index; void *tgt_value; struct bpf_map *map; + u32 flags; + u32 kern_flags; u32 map_id; enum bpf_map_type map_type; - u32 kern_flags; struct bpf_nh_params nh; }; @@ -1504,7 +1504,7 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, } #endif /* IS_ENABLED(CONFIG_IPV6) */ -static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifindex, +static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index, u64 flags, const u64 flag_mask, void *lookup_elem(struct bpf_map *map, u32 key)) { @@ -1515,7 +1515,7 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind if (unlikely(flags & ~(action_mask | flag_mask))) return XDP_ABORTED; - ri->tgt_value = lookup_elem(map, ifindex); + ri->tgt_value = lookup_elem(map, index); if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) { /* If the lookup fails we want to clear out the state in the * redirect_info struct completely, so that if an eBPF program @@ -1527,7 +1527,7 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u32 ifind return flags & action_mask; } - ri->tgt_index = ifindex; + ri->tgt_index = index; ri->map_id = map->id; ri->map_type = map->map_type; -- cgit From 06463f6e98df34908c26aa8e7a31a279646b1f51 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 9 Nov 2022 14:42:49 -0800 Subject: wifi: wl1251: drop support for platform data Remove support for configuring the device via platform data because there are no users of wl1251_platform_data left in the mainline kernel. Signed-off-by: Dmitry Torokhov Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/20221109224250.2885119-2-dmitry.torokhov@gmail.com --- include/linux/wl12xx.h | 44 -------------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 include/linux/wl12xx.h (limited to 'include/linux') diff --git a/include/linux/wl12xx.h b/include/linux/wl12xx.h deleted file mode 100644 index 03d61f1d23ab..000000000000 --- a/include/linux/wl12xx.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * This file is part of wl12xx - * - * Copyright (C) 2009 Nokia Corporation - * - * Contact: Luciano Coelho - */ - -#ifndef _LINUX_WL12XX_H -#define _LINUX_WL12XX_H - -#include - -struct wl1251_platform_data { - int power_gpio; - /* SDIO only: IRQ number if WLAN_IRQ line is used, 0 for SDIO IRQs */ - int irq; - bool use_eeprom; -}; - -#ifdef CONFIG_WILINK_PLATFORM_DATA - -int wl1251_set_platform_data(const struct wl1251_platform_data *data); - -struct wl1251_platform_data *wl1251_get_platform_data(void); - -#else - -static inline -int wl1251_set_platform_data(const struct wl1251_platform_data *data) -{ - return -ENOSYS; -} - -static inline -struct wl1251_platform_data *wl1251_get_platform_data(void) -{ - return ERR_PTR(-ENODATA); -} - -#endif - -#endif -- cgit From 9804985bf27f8fbcf0d96c7435b5ad94a2a6ea20 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Nov 2022 13:57:57 -0800 Subject: udp: Introduce optional per-netns hash table. The maximum hash table size is 64K due to the nature of the protocol. [0] It's smaller than TCP, and fewer sockets can cause a performance drop. 
On an EC2 c5.24xlarge instance (192 GiB memory), after running iperf3 in different netns, creating 32Mi sockets without data transfer in the root netns causes a regression for iperf3's connection.

  uhash_entries  sockets  length  Gbps
            64K        1       1  5.69
                     1Mi      16  5.27
                     2Mi      32  4.90
                     4Mi      64  4.09
                     8Mi     128  2.96
                    16Mi     256  2.06
                    32Mi     512  1.12

The per-netns hash table breaks the lengthy lists into shorter ones. It is useful on a multi-tenant system with thousands of netns. With smaller hash tables, we can look up sockets faster, isolate noisy neighbours, and reduce lock contention. The max size of the per-netns table is 64K as well. This is because the possible hash range by udp_hashfn() always fits in 64K within the same netns, and we cannot make full use of buckets beyond 64K:

  /* 0 < num < 64K -> X < hash < X + 64K */
  (num + net_hash_mix(net)) & mask;

Also, the min size is 128. We use a bitmap to search for an available port in udp_lib_get_port(). To keep the bitmap on the stack and not fire the CONFIG_FRAME_WARN error at build time, we round up the table size to 128. The sysctl usage is the same as with TCP:

  $ dmesg | cut -d ' ' -f 6- | grep "UDP hash"
  UDP hash table entries: 65536 (order: 9, 2097152 bytes, vmalloc)

  # sysctl net.ipv4.udp_hash_entries
  net.ipv4.udp_hash_entries = 65536  # can be changed by uhash_entries

  # sysctl net.ipv4.udp_child_hash_entries
  net.ipv4.udp_child_hash_entries = 0  # disabled by default

  # ip netns add test1
  # ip netns exec test1 sysctl net.ipv4.udp_hash_entries
  net.ipv4.udp_hash_entries = -65536  # share the global table

  # sysctl -w net.ipv4.udp_child_hash_entries=100
  net.ipv4.udp_child_hash_entries = 100

  # ip netns add test2
  # ip netns exec test2 sysctl net.ipv4.udp_hash_entries
  net.ipv4.udp_hash_entries = 128  # own a per-netns table with 2^n buckets

We could optimise the hash table lookup/iteration further by removing the netns comparison for the per-netns one in the future. Also, we could optimise the sparse udp_hslot layout by putting it in udp_table. [0]: https://lore.kernel.org/netdev/4ACC2815.7010101@gmail.com/ Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/linux/udp.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/udp.h b/include/linux/udp.h index dea57aa37df6..a2892e151644 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -23,7 +23,9 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb) return (struct udphdr *)skb_transport_header(skb); } +#define UDP_HTABLE_SIZE_MIN_PERNET 128 #define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ? 128 : 256) +#define UDP_HTABLE_SIZE_MAX 65536 static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) { -- cgit From 6c1c5097781f563b70a81683ea6fdac21637573b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2022 08:53:55 +0000 Subject: net: add atomic_long_t to net_device_stats fields Long standing KCSAN issues are caused by data races around some dev->stats changes. Most performance critical paths already use per-cpu variables, or per-queue ones. It is reasonable (and more correct) to use atomic operations for the slow paths. This patch adds a union for each field of net_device_stats, so that we can convert paths that are not yet protected by a spinlock or a mutex. netdev_stats_to_stats64() no longer has an #if BITS_PER_LONG==64. Note that the memcpy() we were using on 64bit arches had no provision to avoid load-tearing, while atomic_long_read() provides the needed protection at no cost.
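Slow-path users then update counters via the new macros (sketch of a driver call site; DEV_STATS_INC/DEV_STATS_ADD are added at the end of the hunk below):

  /* instead of dev->stats.rx_dropped++, which KCSAN flags as a data race: */
  DEV_STATS_INC(dev, rx_dropped);
  DEV_STATS_ADD(dev, rx_bytes, skb->len);

Readers such as netdev_stats_to_stats64() then use atomic_long_read() on the __-prefixed union member, so both sides of the former race are covered.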
Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 58 ++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 02a2318da7c7..23b3903b0678 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -171,31 +171,38 @@ static inline bool dev_xmit_complete(int rc) * (unsigned long) so they can be read and written atomically. */ +#define NET_DEV_STAT(FIELD) \ + union { \ + unsigned long FIELD; \ + atomic_long_t __##FIELD; \ + } + struct net_device_stats { - unsigned long rx_packets; - unsigned long tx_packets; - unsigned long rx_bytes; - unsigned long tx_bytes; - unsigned long rx_errors; - unsigned long tx_errors; - unsigned long rx_dropped; - unsigned long tx_dropped; - unsigned long multicast; - unsigned long collisions; - unsigned long rx_length_errors; - unsigned long rx_over_errors; - unsigned long rx_crc_errors; - unsigned long rx_frame_errors; - unsigned long rx_fifo_errors; - unsigned long rx_missed_errors; - unsigned long tx_aborted_errors; - unsigned long tx_carrier_errors; - unsigned long tx_fifo_errors; - unsigned long tx_heartbeat_errors; - unsigned long tx_window_errors; - unsigned long rx_compressed; - unsigned long tx_compressed; + NET_DEV_STAT(rx_packets); + NET_DEV_STAT(tx_packets); + NET_DEV_STAT(rx_bytes); + NET_DEV_STAT(tx_bytes); + NET_DEV_STAT(rx_errors); + NET_DEV_STAT(tx_errors); + NET_DEV_STAT(rx_dropped); + NET_DEV_STAT(tx_dropped); + NET_DEV_STAT(multicast); + NET_DEV_STAT(collisions); + NET_DEV_STAT(rx_length_errors); + NET_DEV_STAT(rx_over_errors); + NET_DEV_STAT(rx_crc_errors); + NET_DEV_STAT(rx_frame_errors); + NET_DEV_STAT(rx_fifo_errors); + NET_DEV_STAT(rx_missed_errors); + NET_DEV_STAT(tx_aborted_errors); + NET_DEV_STAT(tx_carrier_errors); + NET_DEV_STAT(tx_fifo_errors); + NET_DEV_STAT(tx_heartbeat_errors); + NET_DEV_STAT(tx_window_errors); + NET_DEV_STAT(rx_compressed); + NET_DEV_STAT(tx_compressed); }; +#undef NET_DEV_STAT /* per-cpu stats, allocated on demand. * Try to fit them in a single cache line, for dev_get_stats() sake. @@ -5171,4 +5178,9 @@ extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; extern struct net_device *blackhole_netdev; +/* Note: Avoid these macros in fast path, prefer per-cpu or per-queue counters. */ +#define DEV_STATS_INC(DEV, FIELD) atomic_long_inc(&(DEV)->stats.__##FIELD) +#define DEV_STATS_ADD(DEV, FIELD, VAL) \ + atomic_long_add((VAL), &(DEV)->stats.__##FIELD) + #endif /* _LINUX_NETDEVICE_H */ -- cgit From 3af43ba4c6019b29c048921eb8147eb010165329 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Wed, 16 Nov 2022 15:50:58 +0800 Subject: bpf: Pass map file to .map_update_batch directly Currently bpf_map_do_batch() first invokes fdget(batch.map_fd) to get the target map file, then it invokes generic_map_update_batch() to do batch update. generic_map_update_batch() will get the target map file by using fdget(batch.map_fd) again and pass it to bpf_map_update_value(). The problem is that the map file returned by the second fdget() may be NULL or a totally different file compared with the map file in bpf_map_do_batch(). The reason is that the first fdget() only guarantees the liveness of the struct file instead of the file descriptor, and the file descriptor may be released by a concurrent close() through pick_file(). It doesn't incur any problem for now, because maps with batch update support don't use the map file in their .map_fd_get_ptr() ops. But it is better to fix the potential access of an invalid map file. Using __bpf_map_get() again in generic_map_update_batch() cannot fix the problem, because batch.map_fd may be closed and reopened, and the returned map file may be different from the map file obtained in bpf_map_do_batch(), so just pass the map file directly to .map_update_batch() in bpf_map_do_batch(). Signed-off-by: Hou Tao Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221116075059.1551277-1-houtao@huaweicloud.com --- include/linux/bpf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 54462dd28824..e60a5c052473 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -85,7 +85,8 @@ struct bpf_map_ops { int (*map_lookup_and_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); - int (*map_update_batch)(struct bpf_map *map, const union bpf_attr *attr, + int (*map_update_batch)(struct bpf_map *map, struct file *map_file, + const union bpf_attr *attr, union bpf_attr __user *uattr); int (*map_delete_batch)(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); @@ -1789,7 +1790,7 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); int generic_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr, union bpf_attr __user *uattr); -int generic_map_update_batch(struct bpf_map *map, +int generic_map_update_batch(struct bpf_map *map, struct file *map_file, const union bpf_attr *attr, union bpf_attr __user *uattr); int generic_map_delete_batch(struct bpf_map *map, -- cgit From 282de143ead96a5d53331e946f31c977b4610a74 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:25:55 +0530 Subject: bpf: Introduce allocated objects support Introduce support for representing pointers to objects allocated by the BPF program, i.e. PTR_TO_BTF_ID that point to a type in program BTF. This is indicated by the presence of the MEM_ALLOC type flag in reg->type, to avoid having to check btf_is_kernel when trying to match argument types in helpers. Whenever walking such types, any pointers being walked will always yield a SCALAR instead of a pointer. In the future we might permit kptr inside such allocated objects (either kernel or program allocated), and it will then form a PTR_TO_BTF_ID of the respective type. For now, such allocated objects will always be referenced in verifier context, hence ref_obj_id == 0 for them is a bug. It is allowed to write to such objects, as long as fields that are special are not touched (support for which will be added in subsequent patches). Note that once such a pointer is marked PTR_UNTRUSTED, it is no longer allowed to write to it. No PROBE_MEM handling is therefore done for loads into this type unless PTR_UNTRUSTED is part of the register type, since they can never be in an undefined state, and their lifetime will always be valid. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-6-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e60a5c052473..7440c20c4192 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -525,6 +525,11 @@ enum bpf_type_flag { /* Size is known at compile time.
*/ MEM_FIXED_SIZE = BIT(10 + BPF_BASE_TYPE_BITS), + /* MEM is of an allocated object of type in program BTF. This is used to + * tag PTR_TO_BTF_ID allocated using bpf_obj_new. + */ + MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; @@ -2792,4 +2797,10 @@ struct bpf_key { bool has_ref; }; #endif /* CONFIG_KEYS */ + +static inline bool type_is_alloc(u32 type) +{ + return type & MEM_ALLOC; +} + #endif /* _LINUX_BPF_H */ -- cgit From 8ffa5cc142137a59d6a10eb5273fa2ba5dcd4947 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:25:56 +0530 Subject: bpf: Recognize lock and list fields in allocated objects Allow specifying bpf_spin_lock, bpf_list_head, bpf_list_node fields in an allocated object. Also update btf_struct_access to reject direct access to these special fields. A bpf_list_head allows implementing map-in-map style use cases, where an allocated object with a bpf_list_head is linked into a list in a map value. This would require embedding a bpf_list_node, support for which is also included. The bpf_spin_lock is used to protect the bpf_list_head and other data. While we strictly don't require holding a bpf_spin_lock while touching the bpf_list_head in such objects (as when we have access to it, we have complete ownership of the object), the locking constraint is still kept and may be conditionally lifted in the future. Note that the specification of such types can be done just like map values, e.g.:

  struct bar {
          struct bpf_list_node node;
  };

  struct foo {
          struct bpf_spin_lock lock;
          struct bpf_list_head head __contains(bar, node);
          struct bpf_list_node node;
  };

  struct map_value {
          struct bpf_spin_lock lock;
          struct bpf_list_head head __contains(foo, node);
  };

To recognize such types in user BTF, we build a btf_struct_metas array of metadata items corresponding to each BTF ID. This is done once during the btf_parse stage to avoid having to redo it each time the verification process needs to inspect the metadata. Moreover, the computed metadata needs to be passed to some helpers in future patches, which requires allocating them and storing them in the BTF that is pinned by the program itself, so that valid access can be assumed to such data during program runtime. A key thing to note is that once a btf_struct_meta is available for a type, both the btf_record and btf_field_offs should be available. It is critical that btf_field_offs is available in case special fields are present, as we extensively rely on special fields being zeroed out in map values and allocated objects in later patches. The code ensures that by bailing out in case of errors and ensuring both are available together. If the record is not available, the special fields won't be recognized, so not having both is also fine (in terms of being a verification error and not a runtime bug).
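A later consumer fetches this metadata by BTF ID along these lines (sketch; btf_find_struct_meta() is declared in the hunk below):

  struct btf_struct_meta *meta;

  meta = btf_find_struct_meta(btf, btf_id);
  if (!meta) {
          /* type has no special fields */
  }
  /* meta->record lists the special fields (bpf_spin_lock, bpf_list_head, ...)
   * and meta->field_offs describes which byte ranges may be copied/zeroed.
   */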
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-7-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 7 +++++++ include/linux/btf.h | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7440c20c4192..eb6ea53fa5a2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -177,6 +177,7 @@ enum btf_field_type { BPF_KPTR_REF = (1 << 3), BPF_KPTR = BPF_KPTR_UNREF | BPF_KPTR_REF, BPF_LIST_HEAD = (1 << 4), + BPF_LIST_NODE = (1 << 5), }; struct btf_field_kptr { @@ -277,6 +278,8 @@ static inline const char *btf_field_type_name(enum btf_field_type type) return "kptr"; case BPF_LIST_HEAD: return "bpf_list_head"; + case BPF_LIST_NODE: + return "bpf_list_node"; default: WARN_ON_ONCE(1); return "unknown"; @@ -295,6 +298,8 @@ static inline u32 btf_field_type_size(enum btf_field_type type) return sizeof(u64); case BPF_LIST_HEAD: return sizeof(struct bpf_list_head); + case BPF_LIST_NODE: + return sizeof(struct bpf_list_node); default: WARN_ON_ONCE(1); return 0; @@ -313,6 +318,8 @@ static inline u32 btf_field_type_align(enum btf_field_type type) return __alignof__(u64); case BPF_LIST_HEAD: return __alignof__(struct bpf_list_head); + case BPF_LIST_NODE: + return __alignof__(struct bpf_list_node); default: WARN_ON_ONCE(1); return 0; diff --git a/include/linux/btf.h b/include/linux/btf.h index d80345fa566b..a01a8da20021 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -6,6 +6,8 @@ #include #include +#include +#include #include #include @@ -78,6 +80,17 @@ struct btf_id_dtor_kfunc { u32 kfunc_btf_id; }; +struct btf_struct_meta { + u32 btf_id; + struct btf_record *record; + struct btf_field_offs *field_offs; +}; + +struct btf_struct_metas { + u32 cnt; + struct btf_struct_meta types[]; +}; + typedef void (*btf_dtor_kfunc_t)(void *); extern const struct file_operations btf_fops; @@ -408,6 +421,23 @@ static inline struct btf_param *btf_params(const struct btf_type *t) return (struct btf_param *)(t + 1); } +static inline int btf_id_cmp_func(const void *a, const void *b) +{ + const int *pa = a, *pb = b; + + return *pa - *pb; +} + +static inline bool btf_id_set_contains(const struct btf_id_set *set, u32 id) +{ + return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; +} + +static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) +{ + return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); +} + #ifdef CONFIG_BPF_SYSCALL struct bpf_prog; @@ -423,6 +453,7 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); +struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -454,6 +485,10 @@ static inline int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dt { return 0; } +static inline struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id) +{ + return NULL; +} #endif static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) -- cgit From 865ce09a49d79d2b2c1d980f4c05ffc0b3517bdc Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:25:57 +0530 Subject: bpf: Verify ownership relationships 
for user BTF types Ensure that there can be no ownership cycles among different types by way of having owning objects that can hold some other type as their element. For instance, a map value can only hold allocated objects, but these are allowed to have another bpf_list_head. To prevent unbounded recursion while freeing resources, elements of bpf_list_head in local kptrs can never have a bpf_list_head which are part of list in a map value. Later patches will verify this by having dedicated BTF selftests. Also, to make runtime destruction easier, once btf_struct_metas is fully populated, we can stash the metadata of the value type directly in the metadata of the list_head fields, as that allows easier access to the value type's layout to destruct it at runtime from the btf_field entry of the list head itself. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-8-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/btf.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index eb6ea53fa5a2..323985a39ece 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -191,6 +191,7 @@ struct btf_field_list_head { struct btf *btf; u32 value_btf_id; u32 node_offset; + struct btf_record *value_rec; }; struct btf_field { diff --git a/include/linux/btf.h b/include/linux/btf.h index a01a8da20021..42d8f3730a8d 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -178,6 +178,7 @@ int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); int btf_find_timer(const struct btf *btf, const struct btf_type *t); struct btf_record *btf_parse_fields(const struct btf *btf, const struct btf_type *t, u32 field_mask, u32 value_size); +int btf_check_and_fixup_fields(const struct btf *btf, struct btf_record *rec); struct btf_field_offs *btf_parse_field_offs(struct btf_record *rec); bool btf_type_is_void(const struct btf_type *t); s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind); -- cgit From d0d78c1df9b1dbfb5e172de473561ce09d5e9d39 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:25:59 +0530 Subject: bpf: Allow locking bpf_spin_lock global variables Global variables reside in maps accessible using direct_value_addr callbacks, so giving each load instruction's rewrite a unique reg->id disallows us from holding locks which are global. The reason for preserving reg->id as a unique value for registers that may point to spin lock is that two separate lookups are treated as two separate memory regions, and any possible aliasing is ignored for the purposes of spin lock correctness. This is not great especially for the global variable case, which are served from maps that have max_entries == 1, i.e. they always lead to map values pointing into the same map value. So refactor the active_spin_lock into a 'active_lock' structure which represents the lock identity, and instead of the reg->id, remember two fields, a pointer and the reg->id. The pointer will store reg->map_ptr or reg->btf. It's only necessary to distinguish for the id == 0 case of global variables, but always setting the pointer to a non-NULL value and using the pointer to check whether the lock is held simplifies code in the verifier. This is generic enough to allow it for global variables, map lookups, and allocated objects at the same time. 
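In sketch form, the lock-identity checks then reduce to comparisons on the new pair (illustrative, not the verifier code verbatim):

  void *ptr = (reg->type == PTR_TO_MAP_VALUE) ? (void *)reg->map_ptr
                                              : (void *)reg->btf;

  if (cur->active_lock.ptr != ptr || cur->active_lock.id != reg->id) {
          /* bpf_spin_unlock of a different lock than the one held */
  }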
Note that while whether a lock is held can be answered by just comparing active_lock.ptr to NULL, to determine whether the register is pointing to the same held lock requires comparing _both_ ptr and id. Finally, as a result of this refactoring, pseudo load instructions are not given a unique reg->id, as they are doing a lookup for the same map value (max_entries is never greater than 1). Essentially, we consider that the tuple of (ptr, id) will always be unique for any kind of argument to bpf_spin_{lock,unlock}. Note that this can be extended in the future to also remember the offset used for locking, so that we can introduce multiple bpf_spin_lock fields in the same allocation. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-10-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1a32baa78ce2..1db2b4dc7009 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -323,7 +323,21 @@ struct bpf_verifier_state { u32 branches; u32 insn_idx; u32 curframe; - u32 active_spin_lock; + /* For every reg representing a map value or allocated object pointer, + * we consider the tuple of (ptr, id) for them to be unique in verifier + * context and consider them to not alias each other for the purposes of + * tracking lock state. + */ + struct { + /* This can either be reg->map_ptr or reg->btf. If ptr is NULL, + * there's no active lock held, and other fields have no + * meaning. If non-NULL, it indicates that a lock is held and + * id member has the reg->id of the register which can be >= 0. + */ + void *ptr; + /* This will be reg->id */ + u32 id; + } active_lock; bool speculative; /* first and last insn idx of this verifier state */ -- cgit From 00b85860feb809852af9a88cb4ca8766d7dff6a3 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:26:01 +0530 Subject: bpf: Rewrite kfunc argument handling As we continue to add more features, argument types, kfunc flags, and different extensions to kfuncs, the code to verify the correctness of the kfunc prototype w.r.t. the passed-in registers has become ad-hoc and ugly to read. To make life easier, and make a very clear split between different stages of argument processing, move all the code into verifier.c and refactor it into easier-to-read helpers and functions. This also makes it easier for kfunc argument processing to share code within the verifier. This will be more and more useful in later patches as we are now moving to implement very core BPF helpers as kfuncs, to keep them experimental before baking them into UAPI. Remove all kfunc-related bits now from btf_check_func_arg_match, as users have been converted away to refactored kfunc argument handling.
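The shape of what the refactored path verifies, as a hedged sketch (the kfunc and its flags are illustrative, not part of this patch; registration via register_btf_kfunc_id_set() is omitted): the verifier matches each argument register against the kfunc's BTF prototype plus the flags attached at registration time, e.g.

    /* kernel-side prototype; each register the program passes is
     * type-checked against these parameter types
     */
    struct task_struct *bpf_task_acquire(struct task_struct *p);

    BTF_SET8_START(example_kfunc_ids)
    BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
    BTF_SET8_END(example_kfunc_ids)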
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-12-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 11 ----------- include/linux/bpf_verifier.h | 2 -- include/linux/btf.h | 31 ++++++++++++++++++++++++++++++- 3 files changed, 30 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 323985a39ece..0a74df731eb8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2109,22 +2109,11 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, const char *func_name, struct btf_func_model *m); -struct bpf_kfunc_arg_meta { - u64 r0_size; - bool r0_rdonly; - int ref_obj_id; - u32 flags; -}; - struct bpf_reg_state; int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); int btf_check_subprog_call(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *regs); -int btf_check_kfunc_arg_match(struct bpf_verifier_env *env, - const struct btf *btf, u32 func_id, - struct bpf_reg_state *regs, - struct bpf_kfunc_arg_meta *meta); int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, struct bpf_reg_state *reg); int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *prog, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1db2b4dc7009..fb146b0ce006 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -603,8 +603,6 @@ int check_ptr_off_reg(struct bpf_verifier_env *env, int check_func_arg_reg_off(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno, enum bpf_arg_type arg_type); -int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, - u32 regno); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, diff --git a/include/linux/btf.h b/include/linux/btf.h index 42d8f3730a8d..d5b26380a60f 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -338,6 +338,16 @@ static inline bool btf_type_is_struct(const struct btf_type *t) return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; } +static inline bool __btf_type_is_struct(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; +} + +static inline bool btf_type_is_array(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; +} + static inline u16 btf_type_vlen(const struct btf_type *t) { return BTF_INFO_VLEN(t->info); @@ -439,9 +449,10 @@ static inline void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id) return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func); } -#ifdef CONFIG_BPF_SYSCALL struct bpf_prog; +struct bpf_verifier_log; +#ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); @@ -455,6 +466,12 @@ s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf, u32 btf_id); +const struct btf_member * +btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg); +bool btf_types_are_same(const struct btf *btf1, u32 id1, + const 
struct btf *btf2, u32 id2); #else static inline const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) @@ -490,6 +507,18 @@ static inline struct btf_struct_meta *btf_find_struct_meta(const struct btf *btf { return NULL; } +static inline const struct btf_member * +btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, enum bpf_prog_type prog_type, + int arg) +{ + return NULL; +} +static inline bool btf_types_are_same(const struct btf *btf1, u32 id1, + const struct btf *btf2, u32 id2) +{ + return false; +} #endif static inline bool btf_type_is_struct_ptr(struct btf *btf, const struct btf_type *t) -- cgit From 958cf2e273f0929c66169e0788031310e8118722 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:26:03 +0530 Subject: bpf: Introduce bpf_obj_new Introduce the type-safe memory allocator bpf_obj_new for BPF programs. The kernel side kfunc is named bpf_obj_new_impl, as passing hidden arguments to kfuncs still requires having them in the prototype, unlike BPF helpers which always take 5 arguments and have them checked using bpf_func_proto in the verifier, ignoring unset argument types. Introduce the __ign suffix to ignore a specific kfunc argument during type checks, then use this to introduce support for passing type metadata to the bpf_obj_new_impl kfunc. The user passes the BTF ID of the type it wants to allocate in program BTF, the verifier then rewrites the first argument as the size of this type, after performing some sanity checks (to ensure it exists and it is a struct type). The second argument is also fixed up and passed by the verifier. This is the btf_struct_meta for the type being allocated. It would be needed mostly for the offset array which is required for zero-initializing special fields while leaving the rest of the storage uninitialized. It would also be needed in the next patch to perform proper destruction of the object's special fields. Under the hood, bpf_obj_new will call bpf_mem_alloc and bpf_mem_free, using the any-context BPF memory allocator introduced recently. To this end, a global instance of the BPF memory allocator is initialized on boot to be used for this purpose. This 'bpf_global_ma' serves all allocations for bpf_obj_new. In the future, bpf_obj_new variants will allow specifying a custom allocator. Note that now that bpf_obj_new can be used to allocate objects that can be linked to a BPF linked list (when future linked list helpers are available), we need to also free the elements using bpf_mem_free. However, since the draining of elements is done outside the bpf_spin_lock, we need to do migrate_disable around the call since bpf_list_head_free can be called from the map free path where migration is enabled. Otherwise, when called from BPF programs migration is already disabled. A convenience macro is included in the bpf_experimental.h header to hide the ugly details of the implementation, leading to user code looking similar to a language-level extension which allocates and constructs fields of a user type. struct bar { struct bpf_list_node node; }; struct foo { struct bpf_spin_lock lock; struct bpf_list_head head __contains(bar, node); }; void prog(void) { struct foo *f; f = bpf_obj_new(typeof(*f)); if (!f) return; ... } A key piece of this story is still missing, i.e. the free function, which will come in the next patch.
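For reference, the pieces behind the macro look roughly like this (a sketch following the __k/__ign conventions described above; consult bpf_experimental.h for the authoritative definition):

    /* kernel kfunc: the verifier rewrites the first argument to the
     * type's size and supplies the btf_struct_meta as the hidden,
     * program-invisible second argument
     */
    void *bpf_obj_new_impl(u64 local_type_id__k, void *meta__ign);

    /* user-facing convenience wrapper */
    #define bpf_obj_new(type) \
        ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL))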
Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-14-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 21 +++++++++++++-------- include/linux/bpf_verifier.h | 2 ++ 2 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0a74df731eb8..8b32376ce746 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -54,6 +54,8 @@ struct cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; extern struct kobject *btf_kobj; +extern struct bpf_mem_alloc bpf_global_ma; +extern bool bpf_global_ma_set; typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64); typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, @@ -334,16 +336,19 @@ static inline bool btf_record_has_field(const struct btf_record *rec, enum btf_f return rec->field_mask & type; } -static inline void check_and_init_map_value(struct bpf_map *map, void *dst) +static inline void bpf_obj_init(const struct btf_field_offs *foffs, void *obj) { - if (!IS_ERR_OR_NULL(map->record)) { - struct btf_field *fields = map->record->fields; - u32 cnt = map->record->cnt; - int i; + int i; - for (i = 0; i < cnt; i++) - memset(dst + fields[i].offset, 0, btf_field_type_size(fields[i].type)); - } + if (!foffs) + return; + for (i = 0; i < foffs->cnt; i++) + memset(obj + foffs->field_off[i], 0, foffs->field_sz[i]); +} + +static inline void check_and_init_map_value(struct bpf_map *map, void *dst) +{ + bpf_obj_init(map->field_offs, dst); } /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index fb146b0ce006..3dc72d396dfc 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -433,6 +433,8 @@ struct bpf_insn_aux_data { */ struct bpf_loop_inline_state loop_inline_state; }; + u64 obj_new_size; /* remember the size of type passed to bpf_obj_new to rewrite R1 */ + struct btf_struct_meta *kptr_struct_meta; u64 map_key_state; /* constant (32 bit) key tracking for maps */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ -- cgit From 534e86bc6c66e1e0c798a1c0a6a680bb231c08db Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 18 Nov 2022 07:26:07 +0530 Subject: bpf: Add 'release on unlock' logic for bpf_list_push_{front,back} This commit implements the delayed release logic for bpf_list_push_front and bpf_list_push_back. Once a node has been added to the list, its pointer changes to PTR_UNTRUSTED. However, it is only released once the lock protecting the list is unlocked. For such PTR_TO_BTF_ID | MEM_ALLOC pointers with PTR_UNTRUSTED set but an active ref_obj_id, it is still permitted to read them as long as the lock is held. Writing to them is not allowed. This allows having read access to pushed items we no longer own until we release the lock guarding the list, allowing a little more flexibility when working with these APIs. Note that enabling write support has fairly tricky interactions with what happens inside the critical section. Just as an example, currently, bpf_obj_drop is not permitted, but if it were, being able to write to the PTR_UNTRUSTED pointer while the object gets released back to the memory allocator would violate safety properties we wish to guarantee (i.e. not crashing the kernel).
The memory could be reused for a different type in the BPF program or even in the kernel as it eventually gets kfree'd. Not enabling bpf_obj_drop inside the critical section would appear to prevent all of the above, but that is more of an artificial limitation right now. Since the write support is tangled with how we handle potential aliasing of nodes inside the critical section that may or may not be part of the list anymore, it has been deferred to a future patch. Acked-by: Dave Marchevsky Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221118015614.2013203-18-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 3dc72d396dfc..23f30c685f28 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -223,6 +223,11 @@ struct bpf_reference_state { * exiting a callback function. */ int callback_ref; + /* Mark the reference state to release the registers sharing the same id + * on bpf_spin_unlock (for nodes that we will lose ownership to but are + * safe to access inside the critical section). + */ + bool release_on_unlock; }; /* state of the program: -- cgit From f20a0a0519f35a3a34236ed2d38b067c5cb22b9f Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 17 Nov 2022 02:18:28 +0900 Subject: ethtool: doc: clarify what drivers can implement in their get_drvinfo() Many of the drivers which implement ethtool_ops::get_drvinfo() fill in the .driver, .version or .bus_info fields of struct ethtool_drvinfo. To get a glance of the current state, run: $ git grep -W "get_drvinfo(struct" Filling in those three fields is useless because: - since [1], the driver version should be the kernel version (at least for upstream drivers). Arguably, out-of-tree drivers might still want to set a custom version, but out-of-tree is not our focus. - since [2], the core is able to provide default values for .driver and .bus_info. In summary, drivers may provide .fw_version and .erom_version, the rest is expected to be done by the core. In the struct ethtool_ops doc from linux/ethtool.h: rephrase the get_drvinfo() field doc to discourage developers from implementing this callback. In the struct ethtool_drvinfo doc from uapi/linux/ethtool.h: remove the paragraph mentioning what drivers should do. Rationale: no need to repeat what is already written in the struct ethtool_ops doc. But add a note that .fw_version and .erom_version are driver-defined. Also update the dummy driver and simply remove the callback in order not to confuse newcomers: most of the drivers will not need this callback function any more.
[1] commit 6a7e25c7fb48 ("net/core: Replace driver version to be kernel version") Link: https://git.kernel.org/torvalds/linux/c/6a7e25c7fb48 [2] commit edaf5df22cb8 ("ethtool: ethtool_get_drvinfo: populate drvinfo fields even if callback exits") Link: https://git.kernel.org/netdev/net-next/c/edaf5df22cb8 Reviewed-by: Leon Romanovsky Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/r/20221116171828.4093-1-mailhol.vincent@wanadoo.fr Signed-off-by: Jakub Kicinski --- include/linux/ethtool.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 5c51c7fda32a..9e0a76fc7de9 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -473,10 +473,10 @@ struct ethtool_module_power_mode_params { * parameter. * @supported_coalesce_params: supported types of interrupt coalescing. * @supported_ring_params: supported ring params. - * @get_drvinfo: Report driver/device information. Should only set the - * @driver, @version, @fw_version and @bus_info fields. If not - * implemented, the @driver and @bus_info fields will be filled in - * according to the netdev's parent device. + * @get_drvinfo: Report driver/device information. Modern drivers no + * longer have to implement this callback. Most fields are + * correctly filled in by the core using system information, or + * populated using other driver operations. * @get_regs_len: Get buffer length required for @get_regs * @get_regs: Get device registers * @get_wol: Report whether Wake-on-Lan is enabled -- cgit From 647541ea06a7bbbcf941e501c726b3e26328c102 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 15 Nov 2022 10:40:21 -0500 Subject: sctp: move SCTP_PAD4 and SCTP_TRUNC4 to linux/sctp.h Move these two macros from net/sctp/sctp.h to linux/sctp.h, so that it will be enough to include only linux/sctp.h in nft_exthdr.c and xt_sctp.c. It should not include "net/sctp/sctp.h" if a module does not have a dependence on SCTP module. Signed-off-by: Xin Long Reviewed-by: Saeed Mahameed Link: https://lore.kernel.org/r/ef6468a687f36da06f575c2131cd4612f6b7be88.1668526821.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/sctp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index a86e852507b3..358dc08e0831 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -820,4 +820,9 @@ struct sctp_new_encap_port_hdr { __be16 new_port; }; +/* Round an int up to the next multiple of 4. */ +#define SCTP_PAD4(s) (((s)+3)&~3) +/* Truncate to the previous multiple of 4. */ +#define SCTP_TRUNC4(s) ((s)&~3) + #endif /* __LINUX_SCTP_H__ */ -- cgit From ef66c5475d7fb864c2418d3bdd19dee46324624b Mon Sep 17 00:00:00 2001 From: David Vernet Date: Sat, 19 Nov 2022 23:10:01 -0600 Subject: bpf: Allow multiple modifiers in reg_type_str() prefix reg_type_str() in the verifier currently only allows a single register type modifier to be present in the 'prefix' string which is eventually stored in the env type_str_buf. This currently works fine because there are no overlapping type modifiers, but once PTR_TRUSTED is added, that will no longer be the case. This patch updates reg_type_str() to support having multiple modifiers in the prefix string, and updates the size of type_str_buf to be 128 bytes. 
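Conceptually, the prefix is now built by appending one token per modifier bit that is set, rather than picking a single one, along these lines (a loose sketch, not the literal implementation):

    char prefix[TYPE_STR_BUF_LEN] = "";

    if (type & MEM_RDONLY)
        strncat(prefix, "rdonly_", sizeof(prefix) - strlen(prefix) - 1);
    if (type & MEM_ALLOC)
        strncat(prefix, "alloc_", sizeof(prefix) - strlen(prefix) - 1);
    /* ...one append per modifier, including the upcoming PTR_TRUSTED... */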
Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20221120051004.3605026-2-void@manifault.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 23f30c685f28..608dde740fef 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -19,7 +19,7 @@ */ #define BPF_MAX_VAR_SIZ (1 << 29) /* size of type_str_buf in bpf_verifier. */ -#define TYPE_STR_BUF_LEN 64 +#define TYPE_STR_BUF_LEN 128 /* Liveness marks, used for registers and spilled-regs (in stack slots). * Read marks propagate upwards until they find a write mark; they record that -- cgit From 3f00c52393445ed49aadc1a567aa502c6333b1a1 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Sat, 19 Nov 2022 23:10:02 -0600 Subject: bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs Kfuncs currently support specifying the KF_TRUSTED_ARGS flag to signal to the verifier that it should enforce that a BPF program passes it a "safe", trusted pointer. Currently, "safe" means that the pointer is either PTR_TO_CTX, or is refcounted. There may be cases, however, where the kernel passes a BPF program a safe / trusted pointer to an object that the BPF program wishes to use as a kptr, but because the object does not yet have a ref_obj_id from the perspective of the verifier, the program would be unable to pass it to a KF_ACQUIRE | KF_TRUSTED_ARGS kfunc. The solution is to expand the set of pointers that are considered trusted according to KF_TRUSTED_ARGS, so that programs can invoke kfuncs with these pointers without getting rejected by the verifier. There is already a PTR_UNTRUSTED flag that is set in some scenarios, such as when a BPF program reads a kptr directly from a map without performing a bpf_kptr_xchg() call. These pointers of course can and should be rejected by the verifier. Unfortunately, however, PTR_UNTRUSTED does not cover all the cases for safety that need to be addressed to adequately protect kfuncs. Specifically, pointers obtained by a BPF program "walking" a struct are _not_ considered PTR_UNTRUSTED according to BPF. For example, say that we were to add a kfunc called bpf_task_acquire(), with KF_ACQUIRE | KF_TRUSTED_ARGS, to acquire a struct task_struct *. If we only used PTR_UNTRUSTED to signal that a task was unsafe to pass to a kfunc, the verifier would mistakenly allow the following unsafe BPF program to be loaded: SEC("tp_btf/task_newtask") int BPF_PROG(unsafe_acquire_task, struct task_struct *task, u64 clone_flags) { struct task_struct *acquired, *nested; nested = task->last_wakee; /* Would not be rejected by the verifier. */ acquired = bpf_task_acquire(nested); if (!acquired) return 0; bpf_task_release(acquired); return 0; } To address this, this patch defines a new type flag called PTR_TRUSTED which tracks whether a PTR_TO_BTF_ID pointer is safe to pass to a KF_TRUSTED_ARGS kfunc or a BPF helper function. PTR_TRUSTED pointers are passed directly from the kernel as a tracepoint or struct_ops callback argument. Any nested pointer that is obtained from walking a PTR_TRUSTED pointer is no longer PTR_TRUSTED. From the example above, the struct task_struct *task argument is PTR_TRUSTED, but the 'nested' pointer obtained from 'task->last_wakee' is not PTR_TRUSTED. A subsequent patch will add kfuncs for storing a task kfunc as a kptr, and then another patch will add selftests to validate. 
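For contrast with the rejected program above, passing the unwalked, trusted argument is accepted (a sketch assuming the bpf_task_acquire()/bpf_task_release() kfuncs that later patches in this series add):

    SEC("tp_btf/task_newtask")
    int BPF_PROG(safe_acquire_task, struct task_struct *task, u64 clone_flags)
    {
        struct task_struct *acquired;

        /* task is PTR_TRUSTED: passed directly by the kernel */
        acquired = bpf_task_acquire(task);
        if (!acquired)
            return 0;

        bpf_task_release(acquired);
        return 0;
    }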
Signed-off-by: David Vernet Link: https://lore.kernel.org/r/20221120051004.3605026-3-void@manifault.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 30 ++++++++++++++++++++ include/linux/bpf_verifier.h | 7 +++++ include/linux/btf.h | 65 ++++++++++++++++++++++++++++---------------- 3 files changed, 78 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8b32376ce746..c9eafa67f2a2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -543,6 +543,35 @@ enum bpf_type_flag { */ MEM_ALLOC = BIT(11 + BPF_BASE_TYPE_BITS), + /* PTR was passed from the kernel in a trusted context, and may be + * passed to KF_TRUSTED_ARGS kfuncs or BPF helper functions. + * Confusingly, this is _not_ the opposite of PTR_UNTRUSTED above. + * PTR_UNTRUSTED refers to a kptr that was read directly from a map + * without invoking bpf_kptr_xchg(). What we really need to know is + * whether a pointer is safe to pass to a kfunc or BPF helper function. + * While PTR_UNTRUSTED pointers are unsafe to pass to kfuncs and BPF + * helpers, they do not cover all possible instances of unsafe + * pointers. For example, a pointer that was obtained from walking a + * struct will _not_ get the PTR_UNTRUSTED type modifier, despite the + * fact that it may be NULL, invalid, etc. This is due to backwards + * compatibility requirements, as this was the behavior that was first + * introduced when kptrs were added. The behavior is now considered + * deprecated, and PTR_UNTRUSTED will eventually be removed. + * + * PTR_TRUSTED, on the other hand, is a pointer that the kernel + * guarantees to be valid and safe to pass to kfuncs and BPF helpers. + * For example, pointers passed to tracepoint arguments are considered + * PTR_TRUSTED, as are pointers that are passed to struct_ops + * callbacks. As alluded to above, pointers that are obtained from + * walking PTR_TRUSTED pointers are _not_ trusted. For example, if a + * struct task_struct *task is PTR_TRUSTED, then accessing + * task->last_wakee will lose the PTR_TRUSTED modifier when it's stored + * in a BPF register. Similarly, pointers passed to certain programs + * types such as kretprobes are not guaranteed to be valid, as they may + * for example contain an object that was recently freed. + */ + PTR_TRUSTED = BIT(12 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; @@ -636,6 +665,7 @@ enum bpf_return_type { RET_PTR_TO_RINGBUF_MEM_OR_NULL = PTR_MAYBE_NULL | MEM_RINGBUF | RET_PTR_TO_MEM, RET_PTR_TO_DYNPTR_MEM_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_MEM, RET_PTR_TO_BTF_ID_OR_NULL = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID, + RET_PTR_TO_BTF_ID_TRUSTED = PTR_TRUSTED | RET_PTR_TO_BTF_ID, /* This must be the last entry. Its purpose is to ensure the enum is * wide enough to hold the higher bits reserved for bpf_type_flag. 
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 608dde740fef..545152ac136c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -680,4 +680,11 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) } } +#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED) + +static inline bool bpf_type_has_unsafe_modifiers(u32 type) +{ + return type_flag(type) & ~BPF_REG_TRUSTED_MODIFIERS; +} + #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/include/linux/btf.h b/include/linux/btf.h index d5b26380a60f..d38aa4251c28 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -19,36 +19,53 @@ #define KF_RELEASE (1 << 1) /* kfunc is a release function */ #define KF_RET_NULL (1 << 2) /* kfunc returns a pointer that may be NULL */ #define KF_KPTR_GET (1 << 3) /* kfunc returns reference to a kptr */ -/* Trusted arguments are those which are meant to be referenced arguments with - * unchanged offset. It is used to enforce that pointers obtained from acquire - * kfuncs remain unmodified when being passed to helpers taking trusted args. +/* Trusted arguments are those which are guaranteed to be valid when passed to + * the kfunc. It is used to enforce that pointers obtained from either acquire + * kfuncs, or from the main kernel on a tracepoint or struct_ops callback + * invocation, remain unmodified when being passed to helpers taking trusted + * args. * - * Consider - * struct foo { - * int data; - * struct foo *next; - * }; + * Consider, for example, the following new task tracepoint: * - * struct bar { - * int data; - * struct foo f; - * }; + * SEC("tp_btf/task_newtask") + * int BPF_PROG(new_task_tp, struct task_struct *task, u64 clone_flags) + * { + * ... + * } * - * struct foo *f = alloc_foo(); // Acquire kfunc - * struct bar *b = alloc_bar(); // Acquire kfunc + * And the following kfunc: * - * If a kfunc set_foo_data() wants to operate only on the allocated object, it - * will set the KF_TRUSTED_ARGS flag, which will prevent unsafe usage like: + * BTF_ID_FLAGS(func, bpf_task_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) * - * set_foo_data(f, 42); // Allowed - * set_foo_data(f->next, 42); // Rejected, non-referenced pointer - * set_foo_data(&f->next, 42);// Rejected, referenced, but wrong type - * set_foo_data(&b->f, 42); // Rejected, referenced, but bad offset + * All invocations to the kfunc must pass the unmodified, unwalked task: * - * In the final case, usually for the purposes of type matching, it is deduced - * by looking at the type of the member at the offset, but due to the - * requirement of trusted argument, this deduction will be strict and not done - * for this case. + * bpf_task_acquire(task); // Allowed + * bpf_task_acquire(task->last_wakee); // Rejected, walked task + * + * Programs may also pass referenced tasks directly to the kfunc: + * + * struct task_struct *acquired; + * + * acquired = bpf_task_acquire(task); // Allowed, same as above + * bpf_task_acquire(acquired); // Allowed + * bpf_task_acquire(task); // Allowed + * bpf_task_acquire(acquired->last_wakee); // Rejected, walked task + * + * Programs may _not_, however, pass a task from an arbitrary fentry/fexit, or + * kprobe/kretprobe to the kfunc, as BPF cannot guarantee that all of these + * pointers are guaranteed to be safe. 
For example, the following BPF program + * would be rejected: + * + * SEC("kretprobe/free_task") + * int BPF_PROG(free_task_probe, struct task_struct *tsk) + * { + * struct task_struct *acquired; + * + * acquired = bpf_task_acquire(acquired); // Rejected, not a trusted pointer + * bpf_task_release(acquired); + * + * return 0; + * } */ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ -- cgit From fd264ca020948a743e4c36731dfdecc4a812153c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sun, 20 Nov 2022 11:54:32 -0800 Subject: bpf: Add a kfunc to type cast from bpf uapi ctx to kernel ctx Implement bpf_cast_to_kern_ctx() kfunc which does a type cast of a uapi ctx object to the corresponding kernel ctx. Previously if users want to access some data available in kctx but not in uapi ctx, bpf_probe_read_kernel() helper is needed. The introduction of bpf_cast_to_kern_ctx() allows direct memory access which makes code simpler and easier to understand. Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221120195432.3113982-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/btf.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index d38aa4251c28..9ed00077db6e 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -487,6 +487,7 @@ const struct btf_member * btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, enum bpf_prog_type prog_type, int arg); +int get_kern_ctx_btf_id(struct bpf_verifier_log *log, enum bpf_prog_type prog_type); bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2); #else @@ -531,6 +532,10 @@ btf_get_prog_ctx_type(struct bpf_verifier_log *log, const struct btf *btf, { return NULL; } +static inline int get_kern_ctx_btf_id(struct bpf_verifier_log *log, + enum bpf_prog_type prog_type) { + return -EINVAL; +} static inline bool btf_types_are_same(const struct btf *btf1, u32 id1, const struct btf *btf2, u32 id2) { -- cgit From 19d05ea712ecbbb67d302664da5ec58b37b9aece Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:54 +0200 Subject: net: dsa: move tag_8021q headers to their proper place tag_8021q definitions are all over the place. Some are exported to linux/dsa/8021q.h (visible by DSA core, taggers, switch drivers and everyone else), and some are in dsa_priv.h. Move the structures that don't need external visibility into tag_8021q.c, and the ones which don't need the world or switch drivers to see them into tag_8021q.h. We also have the tag_8021q.h inclusion from switch.c, which is basically the entire reason why tag_8021q.c was built into DSA in commit 8b6e638b4be2 ("net: dsa: build tag_8021q.c as part of DSA core"). I still don't know how to better deal with that, so leave it alone. 
Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/linux/dsa/8021q.h | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 3ed117e299ec..f3664ee12170 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -5,28 +5,8 @@ #ifndef _NET_DSA_8021Q_H #define _NET_DSA_8021Q_H -#include -#include #include - -struct dsa_switch; -struct dsa_port; -struct sk_buff; -struct net_device; -struct dsa_tag_8021q_vlan { - struct list_head list; - int port; - u16 vid; - refcount_t refcount; -}; -struct dsa_8021q_context { - struct dsa_switch *ds; - struct list_head vlans; - /* EtherType of RX VID, used for filtering on master interface */ - __be16 proto; -}; +#include int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto); @@ -38,15 +18,6 @@ int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, int port, void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge); -struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, - u16 tpid, u16 tci); - -void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, - int *vbid); - -struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master, - int vbid); - u16 dsa_tag_8021q_bridge_vid(unsigned int bridge_num); u16 dsa_tag_8021q_standalone_vid(const struct dsa_port *dp); -- cgit From beb3d47d1d3d7185bb401af628ad32ee204a9526 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 23 Nov 2022 07:57:59 -0800 Subject: bpf: Fix a BTF_ID_LIST bug with CONFIG_DEBUG_INFO_BTF not set With CONFIG_DEBUG_INFO_BTF not set, we hit the following compilation error, /.../kernel/bpf/verifier.c:8196:23: error: array index 6 is past the end of the array (that has type 'u32[5]' (aka 'unsigned int[5]')) [-Werror,-Warray-bounds] if (meta->func_id == special_kfunc_list[KF_bpf_cast_to_kern_ctx]) ^ ~~~~~~~~~~~~~~~~~~~~~~~ /.../kernel/bpf/verifier.c:8174:1: note: array 'special_kfunc_list' declared here BTF_ID_LIST(special_kfunc_list) ^ /.../include/linux/btf_ids.h:207:27: note: expanded from macro 'BTF_ID_LIST' #define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; ^ /.../kernel/bpf/verifier.c:8443:19: error: array index 5 is past the end of the array (that has type 'u32[5]' (aka 'unsigned int[5]')) [-Werror,-Warray-bounds] btf_id == special_kfunc_list[KF_bpf_list_pop_back]; ^ ~~~~~~~~~~~~~~~~~~~~ /.../kernel/bpf/verifier.c:8174:1: note: array 'special_kfunc_list' declared here BTF_ID_LIST(special_kfunc_list) ^ /.../include/linux/btf_ids.h:207:27: note: expanded from macro 'BTF_ID_LIST' #define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; ... Fix the problem by increasing the size of BTF_ID_LIST to 16 to avoid the compilation error and also prevent potentially unintended issues due to out-of-bounds access.
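The failure mode is easy to see from the two expansions of the macro: with CONFIG_DEBUG_INFO_BTF=y each BTF_ID() emits a real entry and the list grows accordingly, while without it BTF_ID() is a no-op and the list is just the fixed-size array. A sketch of the list in verifier.c at the time (entry order reconstructed from the indices in the error above):

    BTF_ID_LIST(special_kfunc_list)
    BTF_ID(func, bpf_obj_new_impl)     /* index 0 */
    BTF_ID(func, bpf_obj_drop_impl)    /* index 1 */
    BTF_ID(func, bpf_list_push_front)  /* index 2 */
    BTF_ID(func, bpf_list_push_back)   /* index 3 */
    BTF_ID(func, bpf_list_pop_front)   /* index 4 */
    BTF_ID(func, bpf_list_pop_back)    /* index 5, past the end of u32[5] */
    BTF_ID(func, bpf_cast_to_kern_ctx) /* index 6, ditto */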
Reported-by: kernel test robot Reported-by: Dan Carpenter Reported-by: Nathan Chancellor Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221123155759.2669749-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index c9744efd202f..93397711a68c 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -204,7 +204,7 @@ extern struct btf_id_set8 name; #else -#define BTF_ID_LIST(name) static u32 __maybe_unused name[5]; +#define BTF_ID_LIST(name) static u32 __maybe_unused name[16]; #define BTF_ID(prefix, name) #define BTF_ID_FLAGS(prefix, name, ...) #define BTF_ID_UNUSED -- cgit From 5a0f663f0189cf9e031e444f97c029717a99548d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 23 Nov 2022 21:32:06 -0800 Subject: compiler_types: Define __rcu as __attribute__((btf_type_tag("rcu"))) Currently, without rcu attribute info in BTF, the verifier treats an rcu-tagged pointer as a normal pointer. This might be a problem for sleepable programs, where rcu_read_lock()/unlock() is not available. For example, for a sleepable fentry program, if an rcu-protected memory access is interleaved with a sleepable helper/kfunc, it is possible the memory access after the sleepable helper/kfunc might be invalid since the object might have been freed by then. To prevent such cases, introducing rcu tagging for memory accesses in the verifier can help reject such programs. To enable rcu tagging in BTF, during kernel compilation, define __rcu as attribute btf_type_tag("rcu") so __rcu information can be preserved in DWARF and BTF, and can later be used for BPF program verification. Acked-by: KP Singh Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221124053206.2373141-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/compiler_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index eb0466236661..7c1afe0f4129 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -49,7 +49,8 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { } # endif # define __iomem # define __percpu BTF_TYPE_TAG(percpu) -# define __rcu +# define __rcu BTF_TYPE_TAG(rcu) + # define __chk_user_ptr(x) (void)0 # define __chk_io_ptr(x) (void)0 /* context/locking */ -- cgit From 01685c5bddaa6df3d662c8afed5e5289fcc68e5a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 23 Nov 2022 21:32:11 -0800 Subject: bpf: Introduce might_sleep field in bpf_func_proto Introduce bpf_func_proto->might_sleep to indicate that a particular helper might sleep. This will make it easier to later check whether a helper might sleep.
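A proto that would set the new field, as a hedged example (bpf_copy_from_user() is a known sleepable helper; the actual conversion of individual protos happens in later patches):

    const struct bpf_func_proto bpf_copy_from_user_proto = {
        .func        = bpf_copy_from_user,
        .gpl_only    = false,
        .might_sleep = true, /* copy_from_user() may fault and sleep */
        .ret_type    = RET_INTEGER,
        .arg1_type   = ARG_PTR_TO_UNINIT_MEM,
        .arg2_type   = ARG_CONST_SIZE_OR_ZERO,
        .arg3_type   = ARG_ANYTHING,
    };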
Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221124053211.2373553-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c9eafa67f2a2..43fd7eeeeabb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -682,6 +682,7 @@ struct bpf_func_proto { u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); bool gpl_only; bool pkt_access; + bool might_sleep; enum bpf_return_type ret_type; union { struct { -- cgit From 9bb00b2895cbfe0ad410457b605d0a72524168c1 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 23 Nov 2022 21:32:17 -0800 Subject: bpf: Add kfunc bpf_rcu_read_lock/unlock() Add two kfunc's bpf_rcu_read_lock() and bpf_rcu_read_unlock(). These two kfunc's can be used for all program types. The following is an example about how rcu pointer are used w.r.t. bpf_rcu_read_lock()/bpf_rcu_read_unlock(). struct task_struct { ... struct task_struct *last_wakee; struct task_struct __rcu *real_parent; ... }; Let us say prog does 'task = bpf_get_current_task_btf()' to get a 'task' pointer. The basic rules are: - 'real_parent = task->real_parent' should be inside bpf_rcu_read_lock region. This is to simulate rcu_dereference() operation. The 'real_parent' is marked as MEM_RCU only if (1). task->real_parent is inside bpf_rcu_read_lock region, and (2). task is a trusted ptr. So MEM_RCU marked ptr can be 'trusted' inside the bpf_rcu_read_lock region. - 'last_wakee = real_parent->last_wakee' should be inside bpf_rcu_read_lock region since it tries to access rcu protected memory. - the ptr 'last_wakee' will be marked as PTR_UNTRUSTED since in general it is not clear whether the object pointed by 'last_wakee' is valid or not even inside bpf_rcu_read_lock region. The verifier will reset all rcu pointer register states to untrusted at bpf_rcu_read_unlock() kfunc call site, so any such rcu pointer won't be trusted any more outside the bpf_rcu_read_lock() region. The current implementation does not support nested rcu read lock region in the prog. Acked-by: Martin KaFai Lau Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221124053217.2373910-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 3 +++ include/linux/bpf_verifier.h | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 43fd7eeeeabb..c6aa6912ea16 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -572,6 +572,9 @@ enum bpf_type_flag { */ PTR_TRUSTED = BIT(12 + BPF_BASE_TYPE_BITS), + /* MEM is tagged with rcu and memory access needs rcu_read_lock protection. 
*/ + MEM_RCU = BIT(13 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 545152ac136c..c05aa6e1f6f5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -344,6 +344,7 @@ struct bpf_verifier_state { u32 id; } active_lock; bool speculative; + bool active_rcu_lock; /* first and last insn idx of this verifier state */ u32 first_insn_idx; @@ -445,6 +446,7 @@ struct bpf_insn_aux_data { u32 seen; /* this insn was processed by the verifier at env->pass_cnt */ bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ + bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ u8 alu_state; /* used in combination with alu_limit */ /* below fields are initialized once */ @@ -534,6 +536,7 @@ struct bpf_verifier_env { bool bypass_spec_v1; bool bypass_spec_v4; bool seen_direct_write; + bool rcu_tag_supported; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; @@ -680,7 +683,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) } } -#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED) +#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | MEM_RCU | PTR_TRUSTED) static inline bool bpf_type_has_unsafe_modifiers(u32 type) { -- cgit From a66d79ee0bd5140a64b72cde588f8c83a55a1eb9 Mon Sep 17 00:00:00 2001 From: Sujuan Chen Date: Thu, 24 Nov 2022 11:18:14 +0800 Subject: net: ethernet: mtk_wed: add wcid overwritten support for wed v1 All wed versions should enable the wcid overwritten feature, since the wcid size is controlled by the wlan driver. Tested-by: Sujuan Chen Co-developed-by: Bo Jiao Signed-off-by: Bo Jiao Signed-off-by: Sujuan Chen Signed-off-by: David S. Miller --- include/linux/soc/mediatek/mtk_wed.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 8294978f4bca..1b1ef57609f7 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -85,6 +85,9 @@ struct mtk_wed_device { int irq; u8 version; + /* used by wlan driver */ + u32 rev_id; + struct mtk_wed_ring tx_ring[MTK_WED_TX_QUEUES]; struct mtk_wed_ring rx_ring[MTK_WED_RX_QUEUES]; struct mtk_wed_ring txfree_ring; -- cgit From 4c47867bc789bdc722f3bb760355c2c246fbe9af Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Thu, 24 Nov 2022 12:15:54 +0100 Subject: of: net: export of_get_mac_address_nvmem() Export of_get_mac_addr_nvmem() and rename it to of_get_mac_address_nvmem() in order to fit the convention followed by the existing exported helpers of the same kind. This way, OF compatible drivers using eg. fwnode_get_mac_address() can do a direct call to it instead of calling of_get_mac_address() just for the nvmem step, avoiding to repeat an expensive DT lookup which has already been done once. Eventually, fwnode_get_mac_address() should probably be updated to perform the nvmem lookup directly, but as of today, nvmem cells seem not to be supported by ACPI yet which would defeat this kind of extension. 
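A driver-side sketch of the intended call pattern (illustrative; the foo_ name is made up and argument types are simplified):

    static int foo_get_mac(struct device *dev, u8 *mac)
    {
        /* fwnode (ACPI/OF agnostic) lookup first... */
        if (!fwnode_get_mac_address(dev_fwnode(dev), mac))
            return 0;

        /* ...then only the nvmem step, without redoing the whole
         * DT lookup that of_get_mac_address() would perform
         */
        return of_get_mac_address_nvmem(dev->of_node, mac);
    }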
Suggested-by: Marcin Wojtas Signed-off-by: Miquel Raynal Signed-off-by: Paolo Abeni --- include/linux/of_net.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_net.h b/include/linux/of_net.h index 0484b613ca64..d88715a0b3a5 100644 --- a/include/linux/of_net.h +++ b/include/linux/of_net.h @@ -14,6 +14,7 @@ struct net_device; extern int of_get_phy_mode(struct device_node *np, phy_interface_t *interface); extern int of_get_mac_address(struct device_node *np, u8 *mac); +extern int of_get_mac_address_nvmem(struct device_node *np, u8 *mac); int of_get_ethdev_address(struct device_node *np, struct net_device *dev); extern struct net_device *of_find_net_device_by_node(struct device_node *np); #else @@ -28,6 +29,11 @@ static inline int of_get_mac_address(struct device_node *np, u8 *mac) return -ENODEV; } +static inline int of_get_mac_address_nvmem(struct device_node *np, u8 *mac) +{ + return -ENODEV; +} + static inline int of_get_ethdev_address(struct device_node *np, struct net_device *dev) { return -ENODEV; -- cgit From f78cd9c783e09a0fe454b0fc8b39c22025d7869e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 24 Nov 2022 16:22:53 +0100 Subject: net: ethernet: mtk_wed: update mtk_wed_stop Update mtk_wed_stop routine and rename old mtk_wed_stop() to mtk_wed_deinit(). This is a preliminary patch to add Wireless Ethernet Dispatcher reset support. Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: Paolo Abeni --- include/linux/soc/mediatek/mtk_wed.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index 1b1ef57609f7..c43510e541df 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -234,6 +234,8 @@ mtk_wed_get_rx_capa(struct mtk_wed_device *dev) (_dev)->ops->ppe_check(_dev, _skb, _reason, _hash) #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) \ (_dev)->ops->msg_update(_dev, _id, _msg, _len) +#define mtk_wed_device_stop(_dev) (_dev)->ops->stop(_dev) +#define mtk_wed_device_dma_reset(_dev) (_dev)->ops->reset_dma(_dev) #else static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) { @@ -250,6 +252,8 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) #define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) -ENODEV #define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) do {} while (0) #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) -ENODEV +#define mtk_wed_device_stop(_dev) do {} while (0) +#define mtk_wed_device_dma_reset(_dev) do {} while (0) #endif #endif -- cgit From 23dca7a90017ff2512c501f7da4c7ca7a95c2d6e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 24 Nov 2022 16:22:55 +0100 Subject: net: ethernet: mtk_wed: add reset to tx_ring_setup callback Introduce reset parameter to mtk_wed_tx_ring_setup signature. This is a preliminary patch to add Wireless Ethernet Dispatcher reset support. 
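Call sites simply gain the extra flag, e.g. (sketch; the wlan-driver context and variable names are illustrative):

    /* normal bring-up: full ring initialization */
    mtk_wed_device_tx_ring_setup(wed, ring_idx, regs, false);

    /* reset path added by the following patches: re-program an
     * already allocated ring
     */
    mtk_wed_device_tx_ring_setup(wed, ring_idx, regs, true);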
Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Signed-off-by: Paolo Abeni --- include/linux/soc/mediatek/mtk_wed.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index c43510e541df..beb190449704 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -158,7 +158,7 @@ struct mtk_wed_device { struct mtk_wed_ops { int (*attach)(struct mtk_wed_device *dev); int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring, - void __iomem *regs); + void __iomem *regs, bool reset); int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring, void __iomem *regs); int (*txfree_ring_setup)(struct mtk_wed_device *dev, @@ -216,8 +216,8 @@ mtk_wed_get_rx_capa(struct mtk_wed_device *dev) #define mtk_wed_device_active(_dev) !!(_dev)->ops #define mtk_wed_device_detach(_dev) (_dev)->ops->detach(_dev) #define mtk_wed_device_start(_dev, _mask) (_dev)->ops->start(_dev, _mask) -#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs) \ - (_dev)->ops->tx_ring_setup(_dev, _ring, _regs) +#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs, _reset) \ + (_dev)->ops->tx_ring_setup(_dev, _ring, _regs, _reset) #define mtk_wed_device_txfree_ring_setup(_dev, _regs) \ (_dev)->ops->txfree_ring_setup(_dev, _regs) #define mtk_wed_device_reg_read(_dev, _reg) \ @@ -243,7 +243,7 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) } #define mtk_wed_device_detach(_dev) do {} while (0) #define mtk_wed_device_start(_dev, _mask) do {} while (0) -#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs) -ENODEV +#define mtk_wed_device_tx_ring_setup(_dev, _ring, _regs, _reset) -ENODEV #define mtk_wed_device_txfree_ring_setup(_dev, _ring, _regs) -ENODEV #define mtk_wed_device_reg_read(_dev, _reg) 0 #define mtk_wed_device_reg_write(_dev, _reg, _val) do {} while (0) -- cgit From 12eb0f84a601cec8920b56d7ffd4182a6bfd6521 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Thu, 29 Sep 2022 13:45:08 +0200 Subject: net/mlx5: Remove unused ctx variables Remove mlx5_priv.ctx_list and ctx_lock which are no longer used after commit 601c10c89cbb ("net/mlx5: Delete custom device management logic"). Signed-off-by: Petr Pavlu Signed-off-by: Saeed Mahameed --- include/linux/mlx5/driver.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 06cbad166225..d476255c9a3f 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -606,8 +606,6 @@ struct mlx5_priv { struct list_head pgdir_list; /* end: alloc staff */ - struct list_head ctx_list; - spinlock_t ctx_lock; struct mlx5_adev **adev; int adev_idx; int sw_vhca_id; -- cgit From b146658f2ed90768b769222ca418617274242b32 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 31 Oct 2022 14:18:22 +0200 Subject: net/mlx5e: Add padding when needed in UMR WQEs Per the device spec, MTTs/KLMs list in a UMR WQE must be aligned to 64B. Per our SW design, the MTT/KLMs list would need alignment only if it's too small, for example on PPC when PAGE_SIZE is 64KB, and only 4 pages are needed to cover a MPWQE of size 256KB. Padding, if needed, is taken into account when calculating the UMR WQE fields (ds_cnt and xlt_octowords), however no entries are provided, instead garbage is passed. No real harm though, as these parts act as gaps between the RX MPWQEs and not used by any of them. 
Hence, in practice, device does not try to write any incoming packet to them. Still, prefer providing clean padding marking the end of the list, and do not map garbage into the RQ memory region. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index eb3fac30488b..97275965f156 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -294,6 +294,7 @@ enum { #define MLX5_UMR_MTT_ALIGNMENT 0x40 #define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) #define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT +#define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) #define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) -- cgit From 683d78a0d46235edfd3c50ddbc4af1065b422670 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 31 Oct 2022 14:30:23 +0200 Subject: net/mlx5: Remove unused UMR MTT definitions Defines MLX5_UMR_MTT_MASK and MLX5_UMR_MTT_MIN_CHUNK_SIZE are not in use. Remove them. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 97275965f156..ecf840e2989b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -292,8 +292,6 @@ enum { #define MLX5_UMR_KLM_ALIGNMENT 4 #define MLX5_UMR_MTT_ALIGNMENT 0x40 -#define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) -#define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT #define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) #define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) -- cgit From 02648b4b09d506bd4df2e17bf109c229fc728640 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 31 Oct 2022 14:44:57 +0200 Subject: net/mlx5: Generalize name of UMR alignment definition Per the device spec, MLX5_UMR_MTT_ALIGNMENT is good not only for UMR MTT entries, but for all other entries as well, like KLMs and KSMs. Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index ecf840e2989b..a02f779f5c5b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -291,8 +291,8 @@ enum { }; #define MLX5_UMR_KLM_ALIGNMENT 4 -#define MLX5_UMR_MTT_ALIGNMENT 0x40 -#define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) +#define MLX5_UMR_FLEX_ALIGNMENT 0x40 +#define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_mtt)) #define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) -- cgit From daab2e9c54a52dea8dbd1af516e6f986eeed2556 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Mon, 31 Oct 2022 14:24:02 +0200 Subject: net/mlx5: Use generic definition for UMR KLM alignment MLX5_UMR_KLM_ALIGNMENT is in units of number of entries, while MLX5_UMR_MTT_ALIGNMENT (generalized and renamed to MLX5_UMR_FLEX_ALIGNMENT) is in byte units. This is misleading and confusing. Replace this KLM definition with one based on the generic definition. 
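With both definitions expressed in entries, call sites read uniformly, e.g. (sketch):

    /* round entry counts up to the 64B flex alignment */
    u16 mtt_entries = ALIGN(npages, MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT);
    u16 klm_entries = ALIGN(nklms, MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT);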
Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Signed-off-by: Saeed Mahameed --- include/linux/mlx5/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index a02f779f5c5b..5fe5d198b57a 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -290,9 +290,9 @@ enum { MLX5_UMR_INLINE = (1 << 7), }; -#define MLX5_UMR_KLM_ALIGNMENT 4 #define MLX5_UMR_FLEX_ALIGNMENT 0x40 #define MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_mtt)) +#define MLX5_UMR_KLM_NUM_ENTRIES_ALIGNMENT (MLX5_UMR_FLEX_ALIGNMENT / sizeof(struct mlx5_klm)) #define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) -- cgit From e9374524950512a1769f610a868fcdf89ea59b8e Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Tue, 22 Nov 2022 20:30:57 +0100 Subject: netfilter: ipset: Add support for new bitmask parameter Add a new parameter to complement the existing 'netmask' option. The main difference between netmask and bitmask is that bitmask takes any arbitrary ip address as input, it does not have to be a valid netmask. The name of the new parameter is 'bitmask'. This lets us mask out arbitrary bits in the ip address, for example: ipset create set1 hash:ip bitmask 255.128.255.0 ipset create set2 hash:ip,port family inet6 bitmask ffff::ff80 Signed-off-by: Vishwanath Pai Signed-off-by: Joshua Hunt Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/ipset/ip_set.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index ada1296c87d5..ab934ad951a8 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -515,6 +515,16 @@ ip_set_init_skbinfo(struct ip_set_skbinfo *skbinfo, *skbinfo = ext->skbinfo; } +static inline void +nf_inet_addr_mask_inplace(union nf_inet_addr *a1, + const union nf_inet_addr *mask) +{ + a1->all[0] &= mask->all[0]; + a1->all[1] &= mask->all[1]; + a1->all[2] &= mask->all[2]; + a1->all[3] &= mask->all[3]; +} + #define IP_SET_INIT_KEXT(skb, opt, set) \ { .bytes = (skb)->len, .packets = 1, .target = true,\ .timeout = ip_set_adt_opt_timeout(opt, set) } -- cgit From c67cae551f0df80421b5703ee56ff5e2fe9c4de6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 25 Nov 2022 14:06:17 -0800 Subject: bpf: Tighten ptr_to_btf_id checks. The networking programs typically don't require CAP_PERFMON, but through kfuncs like bpf_cast_to_kern_ctx() they can access memory through PTR_TO_BTF_ID. In such case enforce CAP_PERFMON. Also make sure that only GPL programs can access kernel data structures. All kfuncs require GPL already. Also remove allow_ptr_to_map_access. It's the same as allow_ptr_leaks and different name for the same check only causes confusion. 
Fixes: fd264ca02094 ("bpf: Add a kfunc to type cast from bpf uapi ctx to kernel ctx") Fixes: 50c6b8a9aea2 ("selftests/bpf: Add a test for btf_type_tag "percpu"") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20221125220617.26846-1-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 5 ----- include/linux/bpf_verifier.h | 1 - 2 files changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 67452103bb86..4920ac252754 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1909,11 +1909,6 @@ static inline bool bpf_allow_uninit_stack(void) return perfmon_capable(); } -static inline bool bpf_allow_ptr_to_map_access(void) -{ - return perfmon_capable(); -} - static inline bool bpf_bypass_spec_v1(void) { return perfmon_capable(); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c05aa6e1f6f5..b5090e89cb3f 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -531,7 +531,6 @@ struct bpf_verifier_env { bool explore_alu_limits; bool allow_ptr_leaks; bool allow_uninit_stack; - bool allow_ptr_to_map_access; bool bpf_capable; bool bypass_spec_v1; bool bypass_spec_v4; -- cgit From a351d6087bf7d3d8440d58d3bf244ec64b89394a Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Tue, 29 Nov 2022 18:40:39 +0800 Subject: bpf, sockmap: Fix missing BPF_F_INGRESS flag when using apply_bytes When redirecting, we use sk_msg_to_ingress() to get the BPF_F_INGRESS flag from the msg->flags. If apply_bytes is used and it is larger than the current data being processed, sk_psock_msg_verdict() will not be called when sendmsg() is called again. At this time, msg->flags is 0, and the BPF_F_INGRESS flag is lost. So we need to save the BPF_F_INGRESS flag in sk_psock and use it when redirecting. Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support") Signed-off-by: Pengcheng Yang Signed-off-by: Daniel Borkmann Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1669718441-2654-3-git-send-email-yangpc@wangsu.com --- include/linux/skmsg.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 70d6cb94e580..84f787416a54 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -82,6 +82,7 @@ struct sk_psock { u32 apply_bytes; u32 cork_bytes; u32 eval; + bool redir_ingress; /* undefined if sk_redir is null */ struct sk_msg *cork; struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) -- cgit From 3144bfa5078e0df7507a4de72061501e6a0e56be Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 29 Nov 2022 21:21:47 -0800 Subject: bpf: Fix a compilation failure with clang lto build When building the kernel with clang lto (CONFIG_LTO_CLANG_FULL=y), the following compilation error will appear: $ make LLVM=1 LLVM_IAS=1 -j ... ld.lld: error: ld-temp.o :26889:1: symbol 'cgroup_storage_map_btf_ids' is already defined cgroup_storage_map_btf_ids:; ^ make[1]: *** [/.../bpf-next/scripts/Makefile.vmlinux_o:61: vmlinux.o] Error 1 In local_storage.c, we have BTF_ID_LIST_SINGLE(cgroup_storage_map_btf_ids, struct, bpf_local_storage_map) Commit c4bcfb38a95e ("bpf: Implement cgroup storage available to non-cgroup-attached bpf progs") added the above identical BTF_ID_LIST_SINGLE definition in bpf_cgrp_storage.c. With duplicated definitions, the llvm linker complains with lto build.
Also, the btf_id of 'struct bpf_local_storage_map' is extracted four times, for the sk, inode, task and cgrp local storages. Let us define a single global one with a different name than cgroup_storage_map_btf_ids, which also fixes the lto compilation error. Fixes: c4bcfb38a95e ("bpf: Implement cgroup storage available to non-cgroup-attached bpf progs") Signed-off-by: Yonghong Song Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20221130052147.1591625-1-yhs@fb.com --- include/linux/btf_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 93397711a68c..3a4f7cd882ca 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -266,5 +266,6 @@ MAX_BTF_TRACING_TYPE, extern u32 btf_tracing_ids[]; extern u32 bpf_cgroup_btf_id[]; +extern u32 bpf_local_storage_map_btf_id[]; #endif -- cgit From bc66fa87d4fda9053a8145e5718fc278c2b88253 Mon Sep 17 00:00:00 2001 From: Xiaolei Wang Date: Wed, 30 Nov 2022 10:12:16 +0800 Subject: net: phy: Add link between phy dev and mac dev If the external phy used by the current mac interface is managed by another mac interface, this network port cannot work independently. In particular, when the system suspends and resumes, the following trace may appear, so we should create a device link between the phy dev and the mac dev.

WARNING: CPU: 0 PID: 24 at drivers/net/phy/phy.c:983 phy_error+0x20/0x68
Modules linked in:
CPU: 0 PID: 24 Comm: kworker/0:2 Not tainted 6.1.0-rc3-00011-g5aaef24b5c6d-dirty #34
Hardware name: Freescale i.MX6 SoloX (Device Tree)
Workqueue: events_power_efficient phy_state_machine
 unwind_backtrace from show_stack+0x10/0x14
 show_stack from dump_stack_lvl+0x68/0x90
 dump_stack_lvl from __warn+0xb4/0x24c
 __warn from warn_slowpath_fmt+0x5c/0xd8
 warn_slowpath_fmt from phy_error+0x20/0x68
 phy_error from phy_state_machine+0x22c/0x23c
 phy_state_machine from process_one_work+0x288/0x744
 process_one_work from worker_thread+0x3c/0x500
 worker_thread from kthread+0xf0/0x114
 kthread from ret_from_fork+0x14/0x28
Exception stack(0xf0951fb0 to 0xf0951ff8)

Signed-off-by: Xiaolei Wang Tested-by: Florian Fainelli Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20221130021216.1052230-1-xiaolei.wang@windriver.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 9a3752c0c444..71eeb4e3b1fd 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -529,6 +529,8 @@ struct macsec_ops; * * @mdio: MDIO bus this PHY is on * @drv: Pointer to the driver for this PHY instance + * @devlink: Create a link between phy dev and mac dev, if the external phy + * used by current mac interface is managed by another mac interface. * @phy_id: UID for this device found during discovery * @c45_ids: 802.3-c45 Device Identifiers if is_c45. * @is_c45: Set to true if this PHY uses clause 45 addressing.
@@ -618,6 +620,8 @@ struct phy_device { /* And management functions */ struct phy_driver *drv; + struct device_link *devlink; + u32 phy_id; struct phy_c45_device_ids c45_ids; -- cgit From 278ab9793116a8531ffaa52b4e61644971131e35 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 30 Nov 2022 13:26:45 -0800 Subject: wifi: ieee80211: Do not open-code qos address offsets When building with -Wstringop-overflow, GCC's KASAN implementation does not correctly perform bounds checking within some complex structures when faced with literal offsets, and can get very confused. For example, this warning is seen due to literal offsets into struct ieee80211_hdr that may or may not be large enough:

drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c: In function 'iwl_mvm_rx_mpdu_mq':
drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c:2022:29: warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=]
 2022 | *qc &= ~IEEE80211_QOS_CTL_A_MSDU_PRESENT;
In file included from drivers/net/wireless/intel/iwlwifi/mvm/fw-api.h:32, from drivers/net/wireless/intel/iwlwifi/mvm/sta.h:15, from drivers/net/wireless/intel/iwlwifi/mvm/mvm.h:27, from drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c:10:
drivers/net/wireless/intel/iwlwifi/mvm/../fw/api/rx.h:559:16: note: at offset [78, 166] into destination object 'mpdu_len' of size 2
 559 | __le16 mpdu_len;
 | ^~~~~~~~

Refactor ieee80211_get_qos_ctl() to avoid using literal offsets, requiring the creation of the actual structure that is described in the comments. Explicitly choose the desired offset, making the code more human-readable too. This is one of the last remaining warnings to fix before enabling -Wstringop-overflow globally. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97490 Link: https://github.com/KSPP/linux/issues/181 Cc: Johannes Berg Cc: Kalle Valo Cc: Gregory Greenman Cc: "Gustavo A. R. Silva" Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221130212641.never.627-kees@kernel.org Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 6252f02f38b7..80d6308dea06 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -338,6 +338,17 @@ struct ieee80211_qos_hdr { __le16 qos_ctrl; } __packed __aligned(2); +struct ieee80211_qos_hdr_4addr { + __le16 frame_control; + __le16 duration_id; + u8 addr1[ETH_ALEN]; + u8 addr2[ETH_ALEN]; + u8 addr3[ETH_ALEN]; + __le16 seq_ctrl; + u8 addr4[ETH_ALEN]; + __le16 qos_ctrl; +} __packed __aligned(2); + struct ieee80211_trigger { __le16 frame_control; __le16 duration; @@ -4060,16 +4071,21 @@ struct ieee80211_he_6ghz_capa { * @hdr: the frame * * The qos ctrl bytes come after the frame_control, duration, seq_num - * and 3 or 4 addresses of length ETH_ALEN. - * 3 addr: 2 + 2 + 2 + 3*6 = 24 - * 4 addr: 2 + 2 + 2 + 4*6 = 30 + * and 3 or 4 addresses of length ETH_ALEN. Checks frame_control to choose + * between struct ieee80211_qos_hdr_4addr and struct ieee80211_qos_hdr.
*/ static inline u8 *ieee80211_get_qos_ctl(struct ieee80211_hdr *hdr) { - if (ieee80211_has_a4(hdr->frame_control)) - return (u8 *)hdr + 30; + union { + struct ieee80211_qos_hdr addr3; + struct ieee80211_qos_hdr_4addr addr4; + } *qos; + + qos = (void *)hdr; + if (ieee80211_has_a4(qos->addr3.frame_control)) + return (u8 *)&qos->addr4.qos_ctrl; else - return (u8 *)hdr + 24; + return (u8 *)&qos->addr3.qos_ctrl; } /** -- cgit From eb8c507296f6038d46010396d91b42a05c3b64d9 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 23 Nov 2022 17:38:55 +0000 Subject: jump_label: Prevent key->enabled int overflow 1. With CONFIG_JUMP_LABEL=n static_key_slow_inc() doesn't have any protection against key->enabled refcounter overflow. 2. With CONFIG_JUMP_LABEL=y static_key_slow_inc_cpuslocked() still may turn the refcounter negative as (v + 1) may overflow. key->enabled is indeed a ref-counter as it's documented in multiple places: top comment in jump_label.h, Documentation/staging/static-keys.rst, etc. As -1 is reserved for a static key that's in the process of being enabled, functions would break with a negative key->enabled refcount: - for CONFIG_JUMP_LABEL=n negative return of static_key_count() breaks static_key_false(), static_key_true() - the ref counter may become 0 from the negative side by too many static_key_slow_inc() calls and lead to use-after-free issues. These flaws mean that some users have to introduce an additional mutex and prevent the reference counter from overflowing themselves, see bpf_enable_runtime_stats() checking the counter against INT_MAX / 2. Prevent the reference counter overflow by checking if (v + 1) > 0. Change the functions' API to return whether the increment was successful. Signed-off-by: Dmitry Safonov Acked-by: Jakub Kicinski Acked-by: Peter Zijlstra (Intel) Signed-off-by: Jakub Kicinski --- include/linux/jump_label.h | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 570831ca9951..4e968ebadce6 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -224,9 +224,10 @@ extern bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type); extern void arch_jump_label_transform_apply(void); extern int jump_label_text_reserved(void *start, void *end); -extern void static_key_slow_inc(struct static_key *key); +extern bool static_key_slow_inc(struct static_key *key); +extern bool static_key_fast_inc_not_disabled(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); -extern void static_key_slow_inc_cpuslocked(struct static_key *key); +extern bool static_key_slow_inc_cpuslocked(struct static_key *key); extern void static_key_slow_dec_cpuslocked(struct static_key *key); extern int static_key_count(struct static_key *key); extern void static_key_enable(struct static_key *key); @@ -278,11 +279,23 @@ static __always_inline bool static_key_true(struct static_key *key) return false; } -static inline void static_key_slow_inc(struct static_key *key) +static inline bool static_key_fast_inc_not_disabled(struct static_key *key) { + int v; + STATIC_KEY_CHECK_USE(key); - atomic_inc(&key->enabled); + /* + * Prevent key->enabled getting negative to follow the same semantics + * as for CONFIG_JUMP_LABEL=y, see kernel/jump_label.c comment.
+ */ + v = atomic_read(&key->enabled); + do { + if (v < 0 || (v + 1) < 0) + return false; + } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); + return true; } +#define static_key_slow_inc(key) static_key_fast_inc_not_disabled(key) static inline void static_key_slow_dec(struct static_key *key) { -- cgit From d93607082e982223cf92750f2d9039ff365b9d24 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 30 Nov 2022 23:28:26 +0100 Subject: net: add netdev_sw_irq_coalesce_default_on() Add a helper for drivers wanting to set SW IRQ coalescing by default. The related sysfs attributes can be used to override the default values. Follow Jakub's suggestion and put this functionality into net core so that drivers wanting to use software interrupt coalescing per default don't have to open-code it. Note that this function needs to be called before the netdevice is registered. Suggested-by: Jakub Kicinski Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5aa35c58c342..f78db610ada5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -78,6 +78,7 @@ struct xdp_buff; void synchronize_net(void); void netdev_set_default_ethtool_ops(struct net_device *dev, const struct ethtool_ops *ops); +void netdev_sw_irq_coalesce_default_on(struct net_device *dev); /* Backlog congestion levels */ #define NET_RX_SUCCESS 0 /* keep 'em coming, baby */ -- cgit From fca1aa75518c03b04c3c249e9a9134faf9ca18c5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 3 Dec 2022 10:46:02 -0800 Subject: bpf: Handle MEM_RCU type properly Commit 9bb00b2895cb ("bpf: Add kfunc bpf_rcu_read_lock/unlock()") introduced MEM_RCU and bpf_rcu_read_lock/unlock() support. In that commit, an rcu pointer is tagged with both MEM_RCU and PTR_TRUSTED so that it can be passed into kfuncs or helpers as an argument. Martin raised a good question in [1]: the rcu pointer, while still allowing access to the object, might have a reference count of 0. This might cause a problem if the rcu pointer is passed to a kfunc which expects trusted arguments where the ref count should be greater than 0. This patch makes the following changes related to the MEM_RCU pointer:

- MEM_RCU pointer might be NULL (PTR_MAYBE_NULL).
- Introduce KF_RCU so a MEM_RCU ptr can be acquired with a KF_RCU tagged kfunc, which assumes the ref count of the rcu ptr could be zero.
- For mem access 'b = ptr->a', say 'ptr' is a MEM_RCU ptr, and 'a' is tagged with __rcu as well. Let us mark 'b' as MEM_RCU | PTR_MAYBE_NULL.
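A sketch of how a kfunc would opt in to such rcu pointers under this scheme, using the KF_RCU flag added in the diff below (the kfunc name is hypothetical):

	/* Sketch only: my_task_kfunc is illustrative, not a real kfunc. */
	BTF_ID_FLAGS(func, my_task_kfunc, KF_RCU | KF_RET_NULL)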
[1] https://lore.kernel.org/bpf/ac70f574-4023-664e-b711-e0d3b18117fd@linux.dev/ Fixes: 9bb00b2895cb ("bpf: Add kfunc bpf_rcu_read_lock/unlock()") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221203184602.477272-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 +- include/linux/btf.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b5090e89cb3f..c07b351a5bc7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -682,7 +682,7 @@ static inline bool bpf_prog_check_recur(const struct bpf_prog *prog) } } -#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | MEM_RCU | PTR_TRUSTED) +#define BPF_REG_TRUSTED_MODIFIERS (MEM_ALLOC | PTR_TRUSTED) static inline bool bpf_type_has_unsafe_modifiers(u32 type) { diff --git a/include/linux/btf.h b/include/linux/btf.h index 9ed00077db6e..cbd6e4096f8c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -70,6 +70,7 @@ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ +#define KF_RCU (1 << 7) /* kfunc only takes rcu pointer arguments */ /* * Return the name of the passed struct, if exists, or halt the build if for -- cgit From c0c852dd1876dc1db4600ce951a92aadd3073b1c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Sat, 3 Dec 2022 12:49:54 -0800 Subject: bpf: Do not mark certain LSM hook arguments as trusted Martin mentioned that the verifier cannot assume that arguments from the LSM hook sk_alloc_security are trusted, since after the hook is called, the sk ref_count is set to 1. This will overwrite the ref_count changed by the bpf program and may cause ref_count underflow later on. I then further checked some other hooks. For example, for the bpf_lsm_file_alloc() hook in fs/file_table.c,

	f->f_cred = get_cred(cred);
	error = security_file_alloc(f);
	if (unlikely(error)) {
		file_free_rcu(&f->f_rcuhead);
		return ERR_PTR(error);
	}
	atomic_long_set(&f->f_count, 1);

The input parameter 'f' to security_file_alloc() cannot be trusted as well. Specifically, I investigated bpf_map/bpf_prog/file/sk/task alloc/free lsm hooks. Except for bpf_map_alloc and task_alloc, arguments for all other hooks should not be considered trusted. This may not be a complete list, but it covers common usage for sk and task.
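One plausible shape for the bpf_lsm_is_trusted() helper declared in the diff below (a sketch; the actual hook list lives in kernel/bpf/bpf_lsm.c and may differ):

	/* Sketch only: deny trust for hooks whose arguments are not yet
	 * fully initialized when the hook runs.
	 */
	BTF_SET_START(untrusted_lsm_hooks)
	BTF_ID(func, bpf_lsm_sk_alloc_security)
	BTF_ID(func, bpf_lsm_file_alloc_security)
	BTF_SET_END(untrusted_lsm_hooks)

	bool bpf_lsm_is_trusted(const struct bpf_prog *prog)
	{
		return !btf_id_set_contains(&untrusted_lsm_hooks,
					    prog->aux->attach_btf_id);
	}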
Fixes: 3f00c5239344 ("bpf: Allow trusted pointers to be passed to KF_TRUSTED_ARGS kfuncs") Signed-off-by: Yonghong Song Link: https://lore.kernel.org/r/20221203204954.2043348-1-yhs@fb.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_lsm.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 4bcf76a9bb06..1de7ece5d36d 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -28,6 +28,7 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); bool bpf_lsm_is_sleepable_hook(u32 btf_id); +bool bpf_lsm_is_trusted(const struct bpf_prog *prog); static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) @@ -51,6 +52,11 @@ static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) return false; } +static inline bool bpf_lsm_is_trusted(const struct bpf_prog *prog) +{ + return false; +} + static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { -- cgit From 919e43fad5163a8ceb39826ecdee897a9f799351 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:29 +0200 Subject: xfrm: add an interface to offload policy Extend the netlink interface to add and delete XFRM policy from the device. This functionality is a first step toward implementing a packet IPsec offload solution. Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5aa35c58c342..4096e3fe8e4a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1040,6 +1040,9 @@ struct xfrmdev_ops { bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); + int (*xdo_dev_policy_add) (struct xfrm_policy *x); + void (*xdo_dev_policy_delete) (struct xfrm_policy *x); + void (*xdo_dev_policy_free) (struct xfrm_policy *x); }; #endif -- cgit From f3da86dc2c8c9004445cfbb15ac086773622d853 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:33 +0200 Subject: xfrm: add support to HW update soft and hard limits Both in RX and TX, the traffic that performs the IPsec packet offload transformation is accounted for by the HW. This is needed to properly handle hard limits that require dropping the packet. It means that the XFRM core needs to update its internal counters with the ones accounted by the HW, so new callbacks are introduced in this patch. In case a soft or hard limit occurs, the driver should call xfrm_state_check_expire(), which will perform key rekeying exactly as done by the XFRM core.
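For illustration, a driver-side sketch of the new lifetime callback; everything except the xfrm names shown in this series is hypothetical:

	/* Sketch only: push HW-accounted traffic counters into the XFRM
	 * state; on a soft/hard limit event the driver would then call
	 * xfrm_state_check_expire() as described above.
	 */
	static void my_xdo_dev_state_update_curlft(struct xfrm_state *x)
	{
		u64 packets, bytes;

		my_hw_read_sa_counters(x, &packets, &bytes); /* hypothetical */
		x->curlft.packets += packets;
		x->curlft.bytes += bytes;
	}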
Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4096e3fe8e4a..29ae964e3b89 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1040,6 +1040,7 @@ struct xfrmdev_ops { bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); + void (*xdo_dev_state_update_curlft) (struct xfrm_state *x); int (*xdo_dev_policy_add) (struct xfrm_policy *x); void (*xdo_dev_policy_delete) (struct xfrm_policy *x); void (*xdo_dev_policy_free) (struct xfrm_policy *x); -- cgit From a46e9010124256f5bf5fc2c241a45cf1944b768e Mon Sep 17 00:00:00 2001 From: Revanth Kumar Uppala Date: Thu, 1 Dec 2022 15:58:43 +0000 Subject: net: stmmac: Power up SERDES after the PHY link The Tegra MGBE ethernet controller requires that the SERDES link is powered-up after the PHY link is up, otherwise the link fails to become ready following a resume from suspend. Add a variable to indicate that the SERDES link must be powered-up after the PHY link. Signed-off-by: Revanth Kumar Uppala Signed-off-by: Jon Hunter Signed-off-by: David S. Miller --- include/linux/stmmac.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index fb2e88614f5d..83ca2e8eb6b5 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -271,5 +271,6 @@ struct plat_stmmacenet_data { int msi_tx_base_vec; bool use_phy_wol; bool sph_disable; + bool serdes_up_after_phy_linkup; }; #endif -- cgit From 3afee4ed336ed7a6cff78d23339879ffb654e02e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 22:10:23 +0200 Subject: net/mlx5: Add HW definitions for IPsec packet offload Add all needed bits to support IPsec packet offload mode. 
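As a rough illustration, the new ASO layout in the diff below is driven with the usual mlx5 MLX5_SET accessors; the field choice here is arbitrary:

	/* Sketch only: arm replay protection and HW flow removal in an
	 * ipsec_aso context (fields per the mlx5_ifc diff below).
	 */
	MLX5_SET(ipsec_aso, aso_ctx, valid, 1);
	MLX5_SET(ipsec_aso, aso_ctx, mode, MLX5_IPSEC_ASO_REPLAY_PROTECTION);
	MLX5_SET(ipsec_aso, aso_ctx, remove_flow_enable, 1);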
Reviewed-by: Raed Salem Reviewed-by: Saeed Mahameed Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/linux/mlx5/mlx5_ifc.h | 53 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 50 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5a4e914e2a6f..300b56ea5ff4 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -445,7 +445,10 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 max_modify_header_actions[0x8]; u8 max_ft_level[0x8]; - u8 reserved_at_40[0x6]; + u8 reformat_add_esp_trasport[0x1]; + u8 reserved_at_41[0x2]; + u8 reformat_del_esp_trasport[0x1]; + u8 reserved_at_44[0x2]; u8 execute_aso[0x1]; u8 reserved_at_47[0x19]; @@ -638,8 +641,10 @@ struct mlx5_ifc_fte_match_set_misc2_bits { u8 reserved_at_1a0[0x8]; u8 macsec_syndrome[0x8]; + u8 ipsec_syndrome[0x8]; + u8 reserved_at_1b8[0x8]; - u8 reserved_at_1b0[0x50]; + u8 reserved_at_1c0[0x40]; }; struct mlx5_ifc_fte_match_set_misc3_bits { @@ -6384,6 +6389,9 @@ enum mlx5_reformat_ctx_type { MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL = 0x2, MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2 = 0x3, MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x4, + MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV4 = 0x5, + MLX5_REFORMAT_TYPE_DEL_ESP_TRANSPORT = 0x8, + MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_OVER_IPV6 = 0xb, MLX5_REFORMAT_TYPE_INSERT_HDR = 0xf, MLX5_REFORMAT_TYPE_REMOVE_HDR = 0x10, MLX5_REFORMAT_TYPE_ADD_MACSEC = 0x11, @@ -11563,6 +11571,41 @@ enum { MLX5_IPSEC_OBJECT_ICV_LEN_16B, }; +enum { + MLX5_IPSEC_ASO_REG_C_0_1 = 0x0, + MLX5_IPSEC_ASO_REG_C_2_3 = 0x1, + MLX5_IPSEC_ASO_REG_C_4_5 = 0x2, + MLX5_IPSEC_ASO_REG_C_6_7 = 0x3, +}; + +enum { + MLX5_IPSEC_ASO_MODE = 0x0, + MLX5_IPSEC_ASO_REPLAY_PROTECTION = 0x1, + MLX5_IPSEC_ASO_INC_SN = 0x2, +}; + +struct mlx5_ifc_ipsec_aso_bits { + u8 valid[0x1]; + u8 reserved_at_201[0x1]; + u8 mode[0x2]; + u8 window_sz[0x2]; + u8 soft_lft_arm[0x1]; + u8 hard_lft_arm[0x1]; + u8 remove_flow_enable[0x1]; + u8 esn_event_arm[0x1]; + u8 reserved_at_20a[0x16]; + + u8 remove_flow_pkt_cnt[0x20]; + + u8 remove_flow_soft_lft[0x20]; + + u8 reserved_at_260[0x80]; + + u8 mode_parameter[0x20]; + + u8 replay_protection_window[0x100]; +}; + struct mlx5_ifc_ipsec_obj_bits { u8 modify_field_select[0x40]; u8 full_offload[0x1]; @@ -11584,7 +11627,11 @@ struct mlx5_ifc_ipsec_obj_bits { u8 implicit_iv[0x40]; - u8 reserved_at_100[0x700]; + u8 reserved_at_100[0x8]; + u8 ipsec_aso_access_pd[0x18]; + u8 reserved_at_120[0xe0]; + + struct mlx5_ifc_ipsec_aso_bits ipsec_aso; }; struct mlx5_ifc_create_ipsec_obj_in_bits { -- cgit From bffdeaa8a5af7200b0e74c9d5a41167f86626a36 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 6 Dec 2022 15:33:43 -0800 Subject: bpf: decouple prune and jump points BPF verifier marks some instructions as prune points. Currently these prune points serve two purposes. It's a point where verifier tries to find previously verified state and check current state's equivalence to short circuit verification for current code path. But also currently it's a point where jump history, used for precision backtracking, is updated. This is done so that non-linear flow of execution could be properly backtracked. Such coupling is coincidental and unnecessary. Some prune points are not part of some non-linear jump path, so don't need update of jump history. On the other hand, not all instructions which have to be recorded in jump history necessarily are good prune points. 
This patch splits prune and jump points into independent flags. Currently all prune points are marked as jump points to minimize amount of changes in this patch, but next patch will perform some optimization of prune vs jmp point placement. No functional changes are intended. Acked-by: John Fastabend Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20221206233345.438540-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c07b351a5bc7..70d06a99f0b8 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -452,6 +452,7 @@ struct bpf_insn_aux_data { /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ bool prune_point; + bool jmp_point; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ -- cgit From ed883bec679b027b198d57a336715f8298fb88b4 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Mon, 5 Dec 2022 12:34:42 +0100 Subject: net: ethernet: mtk_wed: add reset to rx_ring_setup callback This patch adds reset parameter to mtk_wed_rx_ring_setup signature in order to align rx_ring_setup callback to tx_ring_setup one introduced in 'commit 23dca7a90017 ("net: ethernet: mtk_wed: add reset to tx_ring_setup callback")' Co-developed-by: Sujuan Chen Signed-off-by: Sujuan Chen Signed-off-by: Lorenzo Bianconi Reviewed-by: Leon Romanovsky Link: https://lore.kernel.org/r/29c6e7a5469e784406cf3e2920351d1207713d05.1670239984.git.lorenzo@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/soc/mediatek/mtk_wed.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/soc/mediatek/mtk_wed.h b/include/linux/soc/mediatek/mtk_wed.h index beb190449704..a0746d4aec20 100644 --- a/include/linux/soc/mediatek/mtk_wed.h +++ b/include/linux/soc/mediatek/mtk_wed.h @@ -160,7 +160,7 @@ struct mtk_wed_ops { int (*tx_ring_setup)(struct mtk_wed_device *dev, int ring, void __iomem *regs, bool reset); int (*rx_ring_setup)(struct mtk_wed_device *dev, int ring, - void __iomem *regs); + void __iomem *regs, bool reset); int (*txfree_ring_setup)(struct mtk_wed_device *dev, void __iomem *regs); int (*msg_update)(struct mtk_wed_device *dev, int cmd_id, @@ -228,8 +228,8 @@ mtk_wed_get_rx_capa(struct mtk_wed_device *dev) (_dev)->ops->irq_get(_dev, _mask) #define mtk_wed_device_irq_set_mask(_dev, _mask) \ (_dev)->ops->irq_set_mask(_dev, _mask) -#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) \ - (_dev)->ops->rx_ring_setup(_dev, _ring, _regs) +#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs, _reset) \ + (_dev)->ops->rx_ring_setup(_dev, _ring, _regs, _reset) #define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) \ (_dev)->ops->ppe_check(_dev, _skb, _reason, _hash) #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) \ @@ -249,7 +249,7 @@ static inline bool mtk_wed_device_active(struct mtk_wed_device *dev) #define mtk_wed_device_reg_write(_dev, _reg, _val) do {} while (0) #define mtk_wed_device_irq_get(_dev, _mask) 0 #define mtk_wed_device_irq_set_mask(_dev, _mask) do {} while (0) -#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs) -ENODEV +#define mtk_wed_device_rx_ring_setup(_dev, _ring, _regs, _reset) -ENODEV #define mtk_wed_device_ppe_check(_dev, _skb, _reason, _hash) do {} while (0) #define mtk_wed_device_update_msg(_dev, _id, _msg, _len) -ENODEV 
#define mtk_wed_device_stop(_dev) do {} while (0) -- cgit From 5b481acab4ce017fda8166fa9428511da41109e5 Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Tue, 6 Dec 2022 15:59:32 +0100 Subject: bpf: do not rely on ALLOW_ERROR_INJECTION for fmod_ret The current way of expressing that a non-bpf kernel component is willing to accept that bpf programs can be attached to it and that they can change the return value is to abuse ALLOW_ERROR_INJECTION. This is debated in the link below, and the result is that it is not a reasonable thing to do. Reuse the kfunc declaration structure to also tag the kernel functions we want to be fmodret. This way we can control from any subsystem which functions are being modified by bpf without touching the verifier. Link: https://lore.kernel.org/all/20221121104403.1545f9b5@gandalf.local.home/ Suggested-by: Alexei Starovoitov Signed-off-by: Benjamin Tissoires Acked-by: Alexei Starovoitov Link: https://lore.kernel.org/r/20221206145936.922196-2-benjamin.tissoires@redhat.com --- include/linux/btf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index f9aababc5d78..6a0808e7845c 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -412,8 +412,10 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog); u32 *btf_kfunc_id_set_contains(const struct btf *btf, enum bpf_prog_type prog_type, u32 kfunc_btf_id); +u32 *btf_kfunc_is_modify_return(const struct btf *btf, u32 kfunc_btf_id); int register_btf_kfunc_id_set(enum bpf_prog_type prog_type, const struct btf_kfunc_id_set *s); +int register_btf_fmodret_id_set(const struct btf_kfunc_id_set *kset); s32 btf_find_dtor_kfunc(struct btf *btf, u32 btf_id); int register_btf_id_dtor_kfuncs(const struct btf_id_dtor_kfunc *dtors, u32 add_cnt, struct module *owner); -- cgit From df268f6ca7da1088d7ecff4250d1478d3fa2406b Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 6 Dec 2022 20:51:12 +0200 Subject: net/mlx5: Introduce IFC bits for migratable Introduce IFC related capabilities to enable setting VF to be able to perform live migration. e.g.: to be migratable. Signed-off-by: Yishai Hadas Reviewed-by: Mark Bloch Acked-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/mlx5_ifc.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 5a4e914e2a6f..2093131483c7 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -68,6 +68,7 @@ enum { MLX5_SET_HCA_CAP_OP_MOD_ODP = 0x2, MLX5_SET_HCA_CAP_OP_MOD_ATOMIC = 0x3, MLX5_SET_HCA_CAP_OP_MOD_ROCE = 0x4, + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE2 = 0x20, MLX5_SET_HCA_CAP_OP_MODE_PORT_SELECTION = 0x25, }; @@ -1875,7 +1876,10 @@ struct mlx5_ifc_cmd_hca_cap_bits { }; struct mlx5_ifc_cmd_hca_cap_2_bits { - u8 reserved_at_0[0xa0]; + u8 reserved_at_0[0x80]; + + u8 migratable[0x1]; + u8 reserved_at_81[0x1f]; u8 max_reformat_insert_size[0x8]; u8 max_reformat_insert_offset[0x8]; -- cgit From 47d0c500d76c7b1d220c47cd875763fef04eba2e Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 20:51:16 +0200 Subject: net/mlx5: Add generic getters for other functions caps Downstream patch requires to get other function GENERAL2 caps while mlx5_vport_get_other_func_cap() gets only one type of caps (general). Rename it to represent this and introduce a generic implementation of mlx5_vport_get_other_func_cap(). 
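For illustration, a caller of the generalized getter might look like this (buffer sizing follows the usual query_hca_cap convention; the opmod choice is a sketch, not verbatim driver code):

	/* Sketch only: query another function's GENERAL2 caps. */
	void *out = kzalloc(MLX5_ST_SZ_BYTES(query_hca_cap_out), GFP_KERNEL);

	if (!out)
		return -ENOMEM;
	err = mlx5_vport_get_other_func_cap(dev, function_id, out,
			MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE2);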
Signed-off-by: Shay Drory Reviewed-by: Mark Bloch Acked-by: Saeed Mahameed Signed-off-by: Jakub Kicinski --- include/linux/mlx5/vport.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index aad53cb72f17..7f31432f44c2 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -132,4 +132,6 @@ int mlx5_nic_vport_affiliate_multiport(struct mlx5_core_dev *master_mdev, int mlx5_nic_vport_unaffiliate_multiport(struct mlx5_core_dev *port_mdev); u64 mlx5_query_nic_system_image_guid(struct mlx5_core_dev *mdev); +int mlx5_vport_get_other_func_cap(struct mlx5_core_dev *dev, u16 function_id, void *out, + u16 opmod); #endif /* __MLX5_VPORT_H__ */ -- cgit From f1543c7abab25d93bc8e9fae79b4cb3153ed6669 Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Tue, 31 May 2022 13:55:28 +0300 Subject: net/mlx5: mlx5_ifc updates for MATCH_DEFINER general object Update full structure of match definer and add an ID of the SELECT match definer type. Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Signed-off-by: Saeed Mahameed --- include/linux/mlx5/mlx5_ifc.h | 68 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 66 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 2093131483c7..294cfe175c4b 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -6108,6 +6108,38 @@ struct mlx5_ifc_match_definer_format_32_bits { u8 inner_dmac_15_0[0x10]; }; +enum { + MLX5_IFC_DEFINER_FORMAT_ID_SELECT = 61, +}; + +#define MLX5_IFC_DEFINER_FORMAT_OFFSET_UNUSED 0x0 +#define MLX5_IFC_DEFINER_FORMAT_OFFSET_OUTER_ETH_PKT_LEN 0x48 +#define MLX5_IFC_DEFINER_DW_SELECTORS_NUM 9 +#define MLX5_IFC_DEFINER_BYTE_SELECTORS_NUM 8 + +struct mlx5_ifc_match_definer_match_mask_bits { + u8 reserved_at_1c0[5][0x20]; + u8 match_dw_8[0x20]; + u8 match_dw_7[0x20]; + u8 match_dw_6[0x20]; + u8 match_dw_5[0x20]; + u8 match_dw_4[0x20]; + u8 match_dw_3[0x20]; + u8 match_dw_2[0x20]; + u8 match_dw_1[0x20]; + u8 match_dw_0[0x20]; + + u8 match_byte_7[0x8]; + u8 match_byte_6[0x8]; + u8 match_byte_5[0x8]; + u8 match_byte_4[0x8]; + + u8 match_byte_3[0x8]; + u8 match_byte_2[0x8]; + u8 match_byte_1[0x8]; + u8 match_byte_0[0x8]; +}; + struct mlx5_ifc_match_definer_bits { u8 modify_field_select[0x40]; @@ -6116,9 +6148,41 @@ struct mlx5_ifc_match_definer_bits { u8 reserved_at_80[0x10]; u8 format_id[0x10]; - u8 reserved_at_a0[0x160]; + u8 reserved_at_a0[0x60]; - u8 match_mask[16][0x20]; + u8 format_select_dw3[0x8]; + u8 format_select_dw2[0x8]; + u8 format_select_dw1[0x8]; + u8 format_select_dw0[0x8]; + + u8 format_select_dw7[0x8]; + u8 format_select_dw6[0x8]; + u8 format_select_dw5[0x8]; + u8 format_select_dw4[0x8]; + + u8 reserved_at_100[0x18]; + u8 format_select_dw8[0x8]; + + u8 reserved_at_120[0x20]; + + u8 format_select_byte3[0x8]; + u8 format_select_byte2[0x8]; + u8 format_select_byte1[0x8]; + u8 format_select_byte0[0x8]; + + u8 format_select_byte7[0x8]; + u8 format_select_byte6[0x8]; + u8 format_select_byte5[0x8]; + u8 format_select_byte4[0x8]; + + u8 reserved_at_180[0x40]; + + union { + struct { + u8 match_mask[16][0x20]; + }; + struct mlx5_ifc_match_definer_match_mask_bits match_mask_format; + }; }; struct mlx5_ifc_general_obj_in_cmd_hdr_bits { -- cgit From 38bf24c38d19dc4ba5ec684ac2afa2f8e256db1e Mon Sep 17 00:00:00 2001 From: Yevgeny Kliteynik Date: Mon, 15 Aug 2022 12:45:28 +0300 Subject: net/mlx5: fs, add match on ranges 
API Range is a new flow destination type which allows matching on a range of values instead of matching on a specific value. Range flow destination has the following fields: - hit_ft: flow table to forward the traffic in case of hit - miss_ft: flow table to forward the traffic in case of miss - field: which packet characteristic to match on - min: minimal value for the selected field - max: maximal value for the selected field Note: - In order to match, the value in the packet should meet the following criteria: min <= value < max - Currently, the only supported field type is L2 packet length Signed-off-by: Yevgeny Kliteynik Reviewed-by: Alex Vesker Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- include/linux/mlx5/fs.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index c7a91981cd5a..ba6958b49a8e 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -50,6 +50,7 @@ enum mlx5_flow_destination_type { MLX5_FLOW_DESTINATION_TYPE_PORT, MLX5_FLOW_DESTINATION_TYPE_COUNTER, MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM, + MLX5_FLOW_DESTINATION_TYPE_RANGE, }; enum { @@ -143,6 +144,10 @@ enum { MLX5_FLOW_DEST_VPORT_REFORMAT_ID = BIT(1), }; +enum mlx5_flow_dest_range_field { + MLX5_FLOW_DEST_RANGE_FIELD_PKT_LEN = 0, +}; + struct mlx5_flow_destination { enum mlx5_flow_destination_type type; union { @@ -156,6 +161,13 @@ struct mlx5_flow_destination { struct mlx5_pkt_reformat *pkt_reformat; u8 flags; } vport; + struct { + struct mlx5_flow_table *hit_ft; + struct mlx5_flow_table *miss_ft; + enum mlx5_flow_dest_range_field field; + u32 min; + u32 max; + } range; u32 sampler_id; }; }; -- cgit From 6b75bd3d036745b9be30917909f03602099adbdb Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:35 +0530 Subject: bpf: Refactor ARG_PTR_TO_DYNPTR checks into process_dynptr_func ARG_PTR_TO_DYNPTR is akin to ARG_PTR_TO_TIMER, ARG_PTR_TO_KPTR, where the underlying register type is subjected to more special checks to determine the type of object represented by the pointer and its state consistency. Move dynptr checks to their own 'process_dynptr_func' function so that is consistent and in-line with existing code. This also makes it easier to reuse this code for kfunc handling. Then, reuse this consolidated function in kfunc dynptr handling too. Note that for kfuncs, the arg_type constraint of DYNPTR_TYPE_LOCAL has been lifted. 
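A condensed sketch of the consolidated call path, using the signature declared in the diff above (illustrative, not verbatim verifier code):

	/* Sketch only: helper-argument checking funnels dynptr args into
	 * one place, which kfunc handling can then reuse.
	 */
	case ARG_PTR_TO_DYNPTR:
		err = process_dynptr_func(env, regno, arg_type, meta);
		if (err)
			return err;
		break;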
Acked-by: David Vernet Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 70d06a99f0b8..df0cb825e0e3 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -615,11 +615,9 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, enum bpf_arg_type arg_type); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); -bool is_dynptr_reg_valid_init(struct bpf_verifier_env *env, - struct bpf_reg_state *reg); -bool is_dynptr_type_expected(struct bpf_verifier_env *env, - struct bpf_reg_state *reg, - enum bpf_arg_type arg_type); +struct bpf_call_arg_meta; +int process_dynptr_func(struct bpf_verifier_env *env, int regno, + enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, -- cgit From 270605317366e4535d8d9fc3d9da1ad0fb3c9d45 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Thu, 8 Dec 2022 02:11:37 +0530 Subject: bpf: Rework process_dynptr_func Recently, user ringbuf support introduced a PTR_TO_DYNPTR register type for use in callback state, because in case of user ringbuf helpers, there is no dynptr on the stack that is passed into the callback. To reflect such a state, a special register type was created. However, some checks have been bypassed incorrectly during the addition of this feature. First, for arg_type with MEM_UNINIT flag which initialize a dynptr, they must be rejected for such register type. Secondly, in the future, there are plans to add dynptr helpers that operate on the dynptr itself and may change its offset and other properties. In all of these cases, PTR_TO_DYNPTR shouldn't be allowed to be passed to such helpers, however the current code simply returns 0. The rejection for helpers that release the dynptr is already handled. For fixing this, we take a step back and rework existing code in a way that will allow fitting in all classes of helpers and have a coherent model for dealing with the variety of use cases in which dynptr is used. First, for ARG_PTR_TO_DYNPTR, it can either be set alone or together with a DYNPTR_TYPE_* constant that denotes the only type it accepts. Next, helpers which initialize a dynptr use MEM_UNINIT to indicate this fact. To make the distinction clear, use MEM_RDONLY flag to indicate that the helper only operates on the memory pointed to by the dynptr, not the dynptr itself. In C parlance, it would be equivalent to taking the dynptr as a point to const argument. When either of these flags are not present, the helper is allowed to mutate both the dynptr itself and also the memory it points to. Currently, the read only status of the memory is not tracked in the dynptr, but it would be trivial to add this support inside dynptr state of the register. With these changes and renaming PTR_TO_DYNPTR to CONST_PTR_TO_DYNPTR to better reflect its usage, it can no longer be passed to helpers that initialize a dynptr, i.e. bpf_dynptr_from_mem, bpf_ringbuf_reserve_dynptr. 
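To make the flag semantics concrete, a sketch of how helper prototypes can encode them (arg layout illustrative):

	/* Sketch only: helper initializes a local dynptr. */
	.arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT,
	/* Sketch only: dynptr taken as const; the helper may use the
	 * memory it points to but cannot mutate the dynptr itself.
	 */
	.arg2_type = ARG_PTR_TO_DYNPTR | MEM_RDONLY,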
A note to reviewers is that in code that does mark_stack_slots_dynptr, and unmark_stack_slots_dynptr, we implicitly rely on the fact that PTR_TO_STACK reg is the only case that can reach that code path, as one cannot pass CONST_PTR_TO_DYNPTR to helpers that don't set MEM_RDONLY. In both cases such helpers won't be setting that flag. The next patch will add a couple of selftest cases to make sure this doesn't break. Fixes: 205715673844 ("bpf: Add bpf_user_ringbuf_drain() helper") Acked-by: Joanne Koong Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20221207204141.308952-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4920ac252754..3de24cfb7a3d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -775,7 +775,7 @@ enum bpf_reg_type { PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_BUF, /* reg points to a read/write buffer */ PTR_TO_FUNC, /* reg points to a bpf program function */ - PTR_TO_DYNPTR, /* reg points to a dynptr */ + CONST_PTR_TO_DYNPTR, /* reg points to a const struct bpf_dynptr */ __BPF_REG_TYPE_MAX, /* Extended reg_types. */ @@ -2828,7 +2828,7 @@ void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, enum bpf_dynptr_type type, u32 offset, u32 size); void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); int bpf_dynptr_check_size(u32 size); -u32 bpf_dynptr_get_size(struct bpf_dynptr_kern *ptr); +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr); #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); -- cgit From e47877c7aa821c413b45e05f804819579bdfa1a3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 6 Dec 2022 11:36:32 -1000 Subject: rhashtable: Allow rhashtable to be used from irq-safe contexts rhashtable currently only does bh-safe synchronization making it impossible to use from irq-safe contexts. Switch it to use irq-safe synchronization to remove the restriction. v2: Update the lock functions to return the ulong flags value and unlock functions to take the value directly instead of passing around the pointer. Suggested by Linus. Signed-off-by: Tejun Heo Reviewed-by: David Vernet Acked-by: Josh Don Acked-by: Hao Luo Acked-by: Barret Rhoden Cc: Linus Torvalds Signed-off-by: David S. Miller --- include/linux/rhashtable.h | 61 +++++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 68dab3e08aad..5b5357c0bd8c 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -323,29 +323,36 @@ static inline struct rhash_lock_head __rcu **rht_bucket_insert( * When we write to a bucket without unlocking, we use rht_assign_locked(). 
*/ -static inline void rht_lock(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bkt) +static inline unsigned long rht_lock(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bkt) { - local_bh_disable(); + unsigned long flags; + + local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bkt); lock_map_acquire(&tbl->dep_map); + return flags; } -static inline void rht_lock_nested(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bucket, - unsigned int subclass) +static inline unsigned long rht_lock_nested(struct bucket_table *tbl, + struct rhash_lock_head __rcu **bucket, + unsigned int subclass) { - local_bh_disable(); + unsigned long flags; + + local_irq_save(flags); bit_spin_lock(0, (unsigned long *)bucket); lock_acquire_exclusive(&tbl->dep_map, subclass, 0, NULL, _THIS_IP_); + return flags; } static inline void rht_unlock(struct bucket_table *tbl, - struct rhash_lock_head __rcu **bkt) + struct rhash_lock_head __rcu **bkt, + unsigned long flags) { lock_map_release(&tbl->dep_map); bit_spin_unlock(0, (unsigned long *)bkt); - local_bh_enable(); + local_irq_restore(flags); } static inline struct rhash_head *__rht_ptr( @@ -393,7 +400,8 @@ static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, static inline void rht_assign_unlock(struct bucket_table *tbl, struct rhash_lock_head __rcu **bkt, - struct rhash_head *obj) + struct rhash_head *obj, + unsigned long flags) { if (rht_is_a_nulls(obj)) obj = NULL; @@ -401,7 +409,7 @@ static inline void rht_assign_unlock(struct bucket_table *tbl, rcu_assign_pointer(*bkt, (void *)obj); preempt_enable(); __release(bitlock); - local_bh_enable(); + local_irq_restore(flags); } /** @@ -706,6 +714,7 @@ static inline void *__rhashtable_insert_fast( struct rhash_head __rcu **pprev; struct bucket_table *tbl; struct rhash_head *head; + unsigned long flags; unsigned int hash; int elasticity; void *data; @@ -720,11 +729,11 @@ static inline void *__rhashtable_insert_fast( if (!bkt) goto out; pprev = NULL; - rht_lock(tbl, bkt); + flags = rht_lock(tbl, bkt); if (unlikely(rcu_access_pointer(tbl->future_tbl))) { slow_path: - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); rcu_read_unlock(); return rhashtable_insert_slow(ht, key, obj); } @@ -756,9 +765,9 @@ slow_path: RCU_INIT_POINTER(list->rhead.next, head); if (pprev) { rcu_assign_pointer(*pprev, obj); - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); } else - rht_assign_unlock(tbl, bkt, obj); + rht_assign_unlock(tbl, bkt, obj, flags); data = NULL; goto out; } @@ -785,7 +794,7 @@ slow_path: } atomic_inc(&ht->nelems); - rht_assign_unlock(tbl, bkt, obj); + rht_assign_unlock(tbl, bkt, obj, flags); if (rht_grow_above_75(ht, tbl)) schedule_work(&ht->run_work); @@ -797,7 +806,7 @@ out: return data; out_unlock: - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); goto out; } @@ -991,6 +1000,7 @@ static inline int __rhashtable_remove_fast_one( struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; + unsigned long flags; unsigned int hash; int err = -ENOENT; @@ -999,7 +1009,7 @@ static inline int __rhashtable_remove_fast_one( if (!bkt) return -ENOENT; pprev = NULL; - rht_lock(tbl, bkt); + flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { struct rhlist_head *list; @@ -1043,14 +1053,14 @@ static inline int __rhashtable_remove_fast_one( if (pprev) { rcu_assign_pointer(*pprev, obj); - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); } else { - rht_assign_unlock(tbl, bkt, obj); + rht_assign_unlock(tbl, 
bkt, obj, flags); } goto unlocked; } - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); unlocked: if (err > 0) { atomic_dec(&ht->nelems); @@ -1143,6 +1153,7 @@ static inline int __rhashtable_replace_fast( struct rhash_lock_head __rcu **bkt; struct rhash_head __rcu **pprev; struct rhash_head *he; + unsigned long flags; unsigned int hash; int err = -ENOENT; @@ -1158,7 +1169,7 @@ static inline int __rhashtable_replace_fast( return -ENOENT; pprev = NULL; - rht_lock(tbl, bkt); + flags = rht_lock(tbl, bkt); rht_for_each_from(he, rht_ptr(bkt, tbl, hash), tbl, hash) { if (he != obj_old) { @@ -1169,15 +1180,15 @@ static inline int __rhashtable_replace_fast( rcu_assign_pointer(obj_new->next, obj_old->next); if (pprev) { rcu_assign_pointer(*pprev, obj_new); - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); } else { - rht_assign_unlock(tbl, bkt, obj_new); + rht_assign_unlock(tbl, bkt, obj_new, flags); } err = 0; goto unlocked; } - rht_unlock(tbl, bkt); + rht_unlock(tbl, bkt, flags); unlocked: return err; -- cgit From ce098da1497c6dee9589fce2c61d1910f4fcf0e7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 7 Dec 2022 22:02:59 -0800 Subject: skbuff: Introduce slab_build_skb() syzkaller reported: BUG: KASAN: slab-out-of-bounds in __build_skb_around+0x235/0x340 net/core/skbuff.c:294 Write of size 32 at addr ffff88802aa172c0 by task syz-executor413/5295 For bpf_prog_test_run_skb(), which uses a kmalloc()ed buffer passed to build_skb(). When build_skb() is passed a frag_size of 0, it means the buffer came from kmalloc. In these cases, ksize() is used to find its actual size, but since the allocation may not have been made to that size, actually perform the krealloc() call so that all the associated buffer size checking will be correctly notified (and use the "new" pointer so that compiler hinting works correctly). Split this logic out into a new interface, slab_build_skb(), but leave the original 0 checking for now to catch any stragglers. Reported-by: syzbot+fda18eaa8c12534ccb3b@syzkaller.appspotmail.com Link: https://groups.google.com/g/syzkaller-bugs/c/UnIKxTtU5-0/m/-wbXinkgAQAJ Fixes: 38931d8989b5 ("mm: Make ksize() a reporting-only function") Cc: Pavel Begunkov Cc: pepsipu Cc: syzbot+fda18eaa8c12534ccb3b@syzkaller.appspotmail.com Cc: Vlastimil Babka Cc: kasan-dev Cc: Andrii Nakryiko Cc: ast@kernel.org Cc: Daniel Borkmann Cc: Hao Luo Cc: Jesper Dangaard Brouer Cc: John Fastabend Cc: jolsa@kernel.org Cc: KP Singh Cc: martin.lau@linux.dev Cc: Stanislav Fomichev Cc: song@kernel.org Cc: Yonghong Song Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20221208060256.give.994-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 4e464a27adaf..4c8492401a10 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1255,6 +1255,7 @@ struct sk_buff *build_skb_around(struct sk_buff *skb, void skb_attempt_defer_free(struct sk_buff *skb); struct sk_buff *napi_build_skb(void *data, unsigned int frag_size); +struct sk_buff *slab_build_skb(void *data); /** * alloc_skb - allocate a network buffer -- cgit From 5dd9cdbc9dec3e99b19e483767e247d15ca8cc0d Mon Sep 17 00:00:00 2001 From: Eduard Zingerman Date: Fri, 9 Dec 2022 15:57:29 +0200 Subject: bpf: states_equal() must build idmap for all function frames verifier.c:states_equal() must maintain register ID mapping across all function frames. 
Otherwise the following example might be erroneously marked as safe:

main:
  fp[-24] = map_lookup_elem(...) ; frame[0].fp[-24].id == 1
  fp[-32] = map_lookup_elem(...) ; frame[0].fp[-32].id == 2
  r1 = &fp[-24]
  r2 = &fp[-32]
  call foo()
  r0 = 0
  exit

foo:
 0: r9 = r1
 1: r8 = r2
 2: r7 = ktime_get_ns()
 3: r6 = ktime_get_ns()
 4: if (r6 > r7) goto skip_assign
 5: r9 = r8
skip_assign: ; <--- checkpoint
 6: r9 = *r9 ; (a) frame[1].r9.id == 2
             ; (b) frame[1].r9.id == 1
 7: if r9 == 0 goto exit: ; mark_ptr_or_null_regs() transfers != 0 info
                          ; for all regs sharing ID:
                          ; (a) r9 != 0 => &frame[0].fp[-32] != 0
                          ; (b) r9 != 0 => &frame[0].fp[-24] != 0
 8: r8 = *r8 ; (a) r8 == &frame[0].fp[-32]
             ; (b) r8 == &frame[0].fp[-32]
 9: r0 = *r8 ; (a) safe
             ; (b) unsafe
exit:
10: exit

While processing the call to foo() the verifier considers the following execution paths: (a) 0-10 (b) 0-4,6-10 (There is also path 0-7,10 but it is not interesting for the issue at hand; (a) is verified first.) Suppose that a checkpoint is created at (6) when path (a) is verified, and next path (b) is verified and (6) is reached. If states_equal() maintains a separate 'idmap' for each frame, the mapping at (6) for frame[1] would be empty and regsafe(r9)::check_ids() would add a pair 2->1 and return true, which is an error. If states_equal() maintains a single 'idmap' for all frames, the mapping at (6) would be { 1->1, 2->2 } and regsafe(r9)::check_ids() would return false when trying to add a pair 2->1. This issue was suggested in the following discussion: https://lore.kernel.org/bpf/CAEf4BzbFB5g4oUfyxk9rHy-PJSLQ3h8q9mV=rVoXfr_JVm8+1Q@mail.gmail.com/ Suggested-by: Andrii Nakryiko Signed-off-by: Eduard Zingerman Link: https://lore.kernel.org/r/20221209135733.28851-4-eddyz87@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index df0cb825e0e3..53d175cbaa02 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -273,9 +273,9 @@ struct bpf_id_pair { u32 cur; }; -/* Maximum number of register states that can exist at once */ -#define BPF_ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) #define MAX_CALL_FRAMES 8 +/* Maximum number of register states that can exist at once */ +#define BPF_ID_MAP_SIZE ((MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) * MAX_CALL_FRAMES) struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; -- cgit From 860b7f27b8f78564ca5a2f607e0820b2d352a562 Mon Sep 17 00:00:00 2001 From: Andrew Melnychenko Date: Wed, 7 Dec 2022 13:35:57 +0200 Subject: linux/virtio_net.h: Support USO offload in vnet header. Now, it's possible to convert USO vnet packets from/to skb. Added support for GSO_UDP_L4 offload. Signed-off-by: Andrew Melnychenko Signed-off-by: David S.
Miller --- include/linux/virtio_net.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index a960de68ac69..bdf8de2cdd93 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -15,6 +15,7 @@ static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type) case VIRTIO_NET_HDR_GSO_TCPV6: return protocol == cpu_to_be16(ETH_P_IPV6); case VIRTIO_NET_HDR_GSO_UDP: + case VIRTIO_NET_HDR_GSO_UDP_L4: return protocol == cpu_to_be16(ETH_P_IP) || protocol == cpu_to_be16(ETH_P_IPV6); default: @@ -31,6 +32,7 @@ static inline int virtio_net_hdr_set_proto(struct sk_buff *skb, switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: case VIRTIO_NET_HDR_GSO_UDP: + case VIRTIO_NET_HDR_GSO_UDP_L4: skb->protocol = cpu_to_be16(ETH_P_IP); break; case VIRTIO_NET_HDR_GSO_TCPV6: @@ -69,6 +71,11 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, ip_proto = IPPROTO_UDP; thlen = sizeof(struct udphdr); break; + case VIRTIO_NET_HDR_GSO_UDP_L4: + gso_type = SKB_GSO_UDP_L4; + ip_proto = IPPROTO_UDP; + thlen = sizeof(struct udphdr); + break; default: return -EINVAL; } @@ -182,6 +189,8 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb, hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (sinfo->gso_type & SKB_GSO_TCPV6) hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + else if (sinfo->gso_type & SKB_GSO_UDP_L4) + hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP_L4; else return -EINVAL; if (sinfo->gso_type & SKB_GSO_TCP_ECN) -- cgit From 983055bf839745ec2812fc55cacd96888aa0d7a6 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 1 Dec 2022 02:46:54 +0900 Subject: USB: core: export usb_cache_string() usb_cache_string() can also be useful for the drivers so export it. Signed-off-by: Vincent Mailhol Acked-by: Greg Kroah-Hartman Link: https://lore.kernel.org/all/20221130174658.29282-4-mailhol.vincent@wanadoo.fr Signed-off-by: Marc Kleine-Budde --- include/linux/usb.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 9ff1ad4dfad1..d2d2f41052c0 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -1829,6 +1829,7 @@ static inline int usb_get_ptm_status(struct usb_device *dev, void *data) extern int usb_string(struct usb_device *dev, int index, char *buf, size_t size); +extern char *usb_cache_string(struct usb_device *udev, int index); /* wrappers that also update important state inside usbcore */ extern int usb_clear_halt(struct usb_device *dev, int pipe); -- cgit From 8a321cf7becc6c065ae595b837b826a2a81036b9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 9 Dec 2022 10:21:38 -0500 Subject: net: add IFF_NO_ADDRCONF and use it in bonding to prevent ipv6 addrconf Currently, in bonding it reused the IFF_SLAVE flag and checked it in ipv6 addrconf to prevent ipv6 addrconf. However, it is not a proper flag to use for no ipv6 addrconf, for bonding it has to move IFF_SLAVE flag setting ahead of dev_open() in bond_enslave(). Also, IFF_MASTER/SLAVE are historical flags used in bonding and eql, as Jiri mentioned, the new devices like Team, Failover do not use this flag. So as Jiri suggested, this patch adds IFF_NO_ADDRCONF in priv_flags of the device to indicate no ipv6 addconf, and uses it in bonding and moves IFF_SLAVE flag setting back to its original place. 
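A minimal sketch of the driver-side usage described here (placement inside bond_enslave() is illustrative):

	/* Sketch only: mark the lower device before dev_open() so ipv6
	 * addrconf leaves it alone.
	 */
	slave_dev->priv_flags |= IFF_NO_ADDRCONF;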
Signed-off-by: Xin Long Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 2287cb8eb9e4..aad12a179e54 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1662,6 +1662,7 @@ struct net_device_ops { * @IFF_FAILOVER: device is a failover master device * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device + * @IFF_NO_ADDRCONF: prevent ipv6 addrconf * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with * skb_headlen(skb) == 0 (data starts from frag0) * @IFF_CHANGE_PROTO_DOWN: device supports setting carrier via IFLA_PROTO_DOWN @@ -1697,7 +1698,7 @@ enum netdev_priv_flags { IFF_FAILOVER = 1<<27, IFF_FAILOVER_SLAVE = 1<<28, IFF_L3MDEV_RX_HANDLER = 1<<29, - /* was IFF_LIVE_RENAME_OK */ + IFF_NO_ADDRCONF = BIT_ULL(30), IFF_TX_SKB_NO_LINEAR = BIT_ULL(31), IFF_CHANGE_PROTO_DOWN = BIT_ULL(32), }; -- cgit
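For completeness, the consumer side of the flag; the actual check lives in net/ipv6/addrconf.c, outside this include/linux diff (sketch only):

	if (dev->priv_flags & IFF_NO_ADDRCONF)
		return;	/* skip IPv6 address autoconfiguration */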