summaryrefslogtreecommitdiff
path: root/net/core/dev.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-11-02 06:20:58 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-11-02 06:20:58 -0700
commitfc02cb2b37fe2cbf1d3334b9f0f0eab9431766c4 (patch)
tree93b16bc48fdc3be4a1adccbf4c7de92a5e8440e1 /net/core/dev.c
parentbfc484fe6abba4b89ec9330e0e68778e2a9856b2 (diff)
parent84882cf72cd774cf16fd338bdbf00f69ac9f9194 (diff)
Merge tag 'net-next-for-5.16' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core: - Remove socket skb caches - Add a SO_RESERVE_MEM socket op to forward allocate buffer space and avoid memory accounting overhead on each message sent - Introduce managed neighbor entries - added by control plane and resolved by the kernel for use in acceleration paths (BPF / XDP right now, HW offload users will benefit as well) - Make neighbor eviction on link down controllable by userspace to work around WiFi networks with bad roaming implementations - vrf: Rework interaction with netfilter/conntrack - fq_codel: implement L4S style ce_threshold_ect1 marking - sch: Eliminate unnecessary RCU waits in mini_qdisc_pair_swap() BPF: - Add support for new btf kind BTF_KIND_TAG, arbitrary type tagging as implemented in LLVM14 - Introduce bpf_get_branch_snapshot() to capture Last Branch Records - Implement variadic trace_printk helper - Add a new Bloomfilter map type - Track <8-byte scalar spill and refill - Access hw timestamp through BPF's __sk_buff - Disallow unprivileged BPF by default - Document BPF licensing Netfilter: - Introduce egress hook for looking at raw outgoing packets - Allow matching on and modifying inner headers / payload data - Add NFT_META_IFTYPE to match on the interface type either from ingress or egress Protocols: - Multi-Path TCP: - increase default max additional subflows to 2 - rework forward memory allocation - add getsockopts: MPTCP_INFO, MPTCP_TCPINFO, MPTCP_SUBFLOW_ADDRS - MCTP flow support allowing lower layer drivers to configure msg muxing as needed - Automatic Multicast Tunneling (AMT) driver based on RFC7450 - HSR support the redbox supervision frames (IEC-62439-3:2018) - Support for the ip6ip6 encapsulation of IOAM - Netlink interface for CAN-FD's Transmitter Delay Compensation - Support SMC-Rv2 eliminating the current same-subnet restriction, by exploiting the UDP encapsulation feature of RoCE adapters - TLS: add SM4 GCM/CCM crypto support - Bluetooth: initial support for link quality and audio/codec offload Driver APIs: - Add a batched interface for RX buffer allocation in AF_XDP buffer pool - ethtool: Add ability to control transceiver modules' power mode - phy: Introduce supported interfaces bitmap to express MAC capabilities and simplify PHY code - Drop rtnl_lock from DSA .port_fdb_{add,del} callbacks New drivers: - WiFi driver for Realtek 8852AE 802.11ax devices (rtw89) - Ethernet driver for ASIX AX88796C SPI device (x88796c) Drivers: - Broadcom PHYs - support 72165, 7712 16nm PHYs - support IDDQ-SR for additional power savings - PHY support for QCA8081, QCA9561 PHYs - NXP DPAA2: support for IRQ coalescing - NXP Ethernet (enetc): support for software TCP segmentation - Renesas Ethernet (ravb) - support DMAC and EMAC blocks of Gigabit-capable IP found on RZ/G2L SoC - Intel 100G Ethernet - support for eswitch offload of TC/OvS flow API, including offload of GRE, VxLAN, Geneve tunneling - support application device queues - ability to assign Rx and Tx queues to application threads - PTP and PPS (pulse-per-second) extensions - Broadcom Ethernet (bnxt) - devlink health reporting and device reload extensions - Mellanox Ethernet (mlx5) - offload macvlan interfaces - support HW offload of TC rules involving OVS internal ports - support HW-GRO and header/data split - support application device queues - Marvell OcteonTx2: - add XDP support for PF - add PTP support for VF - Qualcomm Ethernet switch (qca8k): support for QCA8328 - Realtek Ethernet DSA switch (rtl8366rb) - support bridge offload - support STP, fast aging, disabling address learning - support for Realtek RTL8365MB-VC, a 4+1 port 10M/100M/1GE switch - Mellanox Ethernet/IB switch (mlxsw) - multi-level qdisc hierarchy offload (e.g. RED, prio and shaping) - offload root TBF qdisc as port shaper - support multiple routing interface MAC address prefixes - support for IP-in-IP with IPv6 underlay - MediaTek WiFi (mt76) - mt7921 - ASPM, 6GHz, SDIO and testmode support - mt7915 - LED and TWT support - Qualcomm WiFi (ath11k) - include channel rx and tx time in survey dump statistics - support for 80P80 and 160 MHz bandwidths - support channel 2 in 6 GHz band - spectral scan support for QCN9074 - support for rx decapsulation offload (data frames in 802.3 format) - Qualcomm phone SoC WiFi (wcn36xx) - enable Idle Mode Power Save (IMPS) to reduce power consumption during idle - Bluetooth driver support for MediaTek MT7922 and MT7921 - Enable support for AOSP Bluetooth extension in Qualcomm WCN399x and Realtek 8822C/8852A - Microsoft vNIC driver (mana) - support hibernation and kexec - Google vNIC driver (gve) - support for jumbo frames - implement Rx page reuse Refactor: - Make all writes to netdev->dev_addr go thru helpers, so that we can add this address to the address rbtree and handle the updates - Various TCP cleanups and optimizations including improvements to CPU cache use - Simplify the gnet_stats, Qdisc stats' handling and remove qdisc->running sequence counter - Driver changes and API updates to address devlink locking deficiencies" * tag 'net-next-for-5.16' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2122 commits) Revert "net: avoid double accounting for pure zerocopy skbs" selftests: net: add arp_ndisc_evict_nocarrier net: ndisc: introduce ndisc_evict_nocarrier sysctl parameter net: arp: introduce arp_evict_nocarrier sysctl parameter libbpf: Deprecate AF_XDP support kbuild: Unify options for BTF generation for vmlinux and modules selftests/bpf: Add a testcase for 64-bit bounds propagation issue. bpf: Fix propagation of signed bounds from 64-bit min/max into 32-bit. bpf: Fix propagation of bounds from 64-bit min/max into 32-bit and var_off. net: vmxnet3: remove multiple false checks in vmxnet3_ethtool.c net: avoid double accounting for pure zerocopy skbs tcp: rename sk_wmem_free_skb netdevsim: fix uninit value in nsim_drv_configure_vfs() selftests/bpf: Fix also no-alu32 strobemeta selftest bpf: Add missing map_delete_elem method to bloom filter map selftests/bpf: Add bloom map success test for userspace calls bpf: Add alignment padding for "map_extra" + consolidate holes bpf: Bloom filter map naming fixups selftests/bpf: Add test cases for struct_ops prog bpf: Add dummy BPF STRUCT_OPS for test purpose ...
Diffstat (limited to 'net/core/dev.c')
-rw-r--r--net/core/dev.c92
1 files changed, 58 insertions, 34 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index eb3a366bf212..edeb811c454e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -140,7 +140,7 @@
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
-#include <linux/netfilter_ingress.h>
+#include <linux/netfilter_netdev.h>
#include <linux/crash_dump.h>
#include <linux/sctp.h>
#include <net/udp_tunnel.h>
@@ -303,6 +303,12 @@ static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
return NULL;
}
+bool netdev_name_in_use(struct net *net, const char *name)
+{
+ return netdev_name_node_lookup(net, name);
+}
+EXPORT_SYMBOL(netdev_name_in_use);
+
int netdev_name_node_alt_create(struct net_device *dev, const char *name)
{
struct netdev_name_node *name_node;
@@ -1133,7 +1139,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
}
snprintf(buf, IFNAMSIZ, name, i);
- if (!__dev_get_by_name(net, buf))
+ if (!netdev_name_in_use(net, buf))
return i;
/* It is possible to run out of possible slots
@@ -1187,7 +1193,7 @@ static int dev_get_valid_name(struct net *net, struct net_device *dev,
if (strchr(name, '%'))
return dev_alloc_name_ns(net, dev, name);
- else if (__dev_get_by_name(net, name))
+ else if (netdev_name_in_use(net, name))
return -EEXIST;
else if (dev->name != name)
strlcpy(dev->name, name, IFNAMSIZ);
@@ -1290,8 +1296,8 @@ rollback:
old_assign_type = NET_NAME_RENAMED;
goto rollback;
} else {
- pr_err("%s: name change rollback failed: %d\n",
- dev->name, ret);
+ netdev_err(dev, "name change rollback failed: %d\n",
+ ret);
}
}
@@ -2345,7 +2351,7 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq)
/* If TC0 is invalidated disable TC mapping */
if (tc->offset + tc->count > txq) {
- pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
+ netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
dev->num_tc = 0;
return;
}
@@ -2356,8 +2362,8 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq)
tc = &dev->tc_to_txq[q];
if (tc->offset + tc->count > txq) {
- pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
- i, q);
+ netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
+ i, q);
netdev_set_prio_tc_map(dev, i, 0);
}
}
@@ -2921,6 +2927,8 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (dev->num_tc)
netif_setup_tc(dev, txq);
+ dev_qdisc_change_real_num_tx(dev, txq);
+
dev->real_num_tx_queues = txq;
if (disabling) {
@@ -3414,7 +3422,7 @@ EXPORT_SYMBOL(__skb_gso_segment);
#ifdef CONFIG_BUG
static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
- pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
+ netdev_err(dev, "hw csum failure\n");
skb_dump(KERN_ERR, skb, true);
dump_stack();
}
@@ -3925,6 +3933,7 @@ EXPORT_SYMBOL(dev_loopback_xmit);
static struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
+#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
struct tcf_result cl_res;
@@ -3960,6 +3969,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
default:
break;
}
+#endif /* CONFIG_NET_CLS_ACT */
return skb;
}
@@ -4153,13 +4163,20 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
qdisc_pkt_len_init(skb);
#ifdef CONFIG_NET_CLS_ACT
skb->tc_at_ingress = 0;
-# ifdef CONFIG_NET_EGRESS
+#endif
+#ifdef CONFIG_NET_EGRESS
if (static_branch_unlikely(&egress_needed_key)) {
+ if (nf_hook_egress_active()) {
+ skb = nf_hook_egress(skb, &rc, dev);
+ if (!skb)
+ goto out;
+ }
+ nf_skip_egress(skb, true);
skb = sch_handle_egress(skb, &rc, dev);
if (!skb)
goto out;
+ nf_skip_egress(skb, false);
}
-# endif
#endif
/* If device/qdisc don't need skb->dst, release it right now while
* its hot in this cpu cache.
@@ -5301,6 +5318,7 @@ skip_taps:
if (static_branch_unlikely(&ingress_needed_key)) {
bool another = false;
+ nf_skip_egress(skb, true);
skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
&another);
if (another)
@@ -5308,6 +5326,7 @@ skip_taps:
if (!skb)
goto out;
+ nf_skip_egress(skb, false);
if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
goto out;
}
@@ -5844,7 +5863,7 @@ static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int se
gro_normal_list(napi);
}
-static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
+static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
{
struct packet_offload *ptype;
__be16 type = skb->protocol;
@@ -5873,12 +5892,11 @@ static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
if (err) {
WARN_ON(&ptype->list == head);
kfree_skb(skb);
- return NET_RX_SUCCESS;
+ return;
}
out:
gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
- return NET_RX_SUCCESS;
}
static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
@@ -6905,19 +6923,25 @@ EXPORT_SYMBOL(netif_napi_add);
void napi_disable(struct napi_struct *n)
{
+ unsigned long val, new;
+
might_sleep();
set_bit(NAPI_STATE_DISABLE, &n->state);
- while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
- msleep(1);
- while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
- msleep(1);
+ do {
+ val = READ_ONCE(n->state);
+ if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
+ usleep_range(20, 200);
+ continue;
+ }
+
+ new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
+ new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
+ } while (cmpxchg(&n->state, val, new) != val);
hrtimer_cancel(&n->timer);
- clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
clear_bit(NAPI_STATE_DISABLE, &n->state);
- clear_bit(NAPI_STATE_THREADED, &n->state);
}
EXPORT_SYMBOL(napi_disable);
@@ -6995,8 +7019,8 @@ static int __napi_poll(struct napi_struct *n, bool *repoll)
}
if (unlikely(work > weight))
- pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
- n->poll, work, weight);
+ netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
+ n->poll, work, weight);
if (likely(work < weight))
return work;
@@ -8550,8 +8574,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
dev->flags &= ~IFF_PROMISC;
else {
dev->promiscuity -= inc;
- pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
- dev->name);
+ netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
return -EOVERFLOW;
}
}
@@ -8621,8 +8644,7 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
dev->flags &= ~IFF_ALLMULTI;
else {
dev->allmulti -= inc;
- pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
- dev->name);
+ netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
return -EOVERFLOW;
}
}
@@ -9159,14 +9181,11 @@ int dev_get_port_parent_id(struct net_device *dev,
}
err = devlink_compat_switch_id_get(dev, ppid);
- if (!err || err != -EOPNOTSUPP)
+ if (!recurse || err != -EOPNOTSUPP)
return err;
- if (!recurse)
- return -EOPNOTSUPP;
-
netdev_for_each_lower_dev(dev, lower_dev, iter) {
- err = dev_get_port_parent_id(lower_dev, ppid, recurse);
+ err = dev_get_port_parent_id(lower_dev, ppid, true);
if (err)
break;
if (!first.id_len)
@@ -9910,6 +9929,11 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
}
}
+ if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
+ netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
+ features &= ~NETIF_F_LRO;
+ }
+
if (features & NETIF_F_HW_TLS_TX) {
bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
@@ -10867,7 +10891,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
if (!dev->ethtool_ops)
dev->ethtool_ops = &default_ethtool_ops;
- nf_hook_ingress_init(dev);
+ nf_hook_netdev_init(dev);
return dev;
@@ -11153,7 +11177,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
* we can use it in the destination network namespace.
*/
err = -EEXIST;
- if (__dev_get_by_name(net, dev->name)) {
+ if (netdev_name_in_use(net, dev->name)) {
/* We get here if we can't use the current device name */
if (!pat)
goto out;
@@ -11506,7 +11530,7 @@ static void __net_exit default_device_exit(struct net *net)
/* Push remaining network devices to init_net */
snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
- if (__dev_get_by_name(&init_net, fb_name))
+ if (netdev_name_in_use(&init_net, fb_name))
snprintf(fb_name, IFNAMSIZ, "dev%%d");
err = dev_change_net_namespace(dev, &init_net, fb_name);
if (err) {