summaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/Makefile4
-rw-r--r--net/core/bpf_sk_storage.c24
-rw-r--r--net/core/datagram.c90
-rw-r--r--net/core/dev.c1473
-rw-r--r--net/core/dev.h81
-rw-r--r--net/core/dev_addr_lists.c7
-rw-r--r--net/core/dev_api.c369
-rw-r--r--net/core/dev_ioctl.c156
-rw-r--r--net/core/devmem.c268
-rw-r--r--net/core/devmem.h138
-rw-r--r--net/core/drop_monitor.c37
-rw-r--r--net/core/dst.c14
-rw-r--r--net/core/dst_cache.c30
-rw-r--r--net/core/fib_rules.c301
-rw-r--r--net/core/filter.c279
-rw-r--r--net/core/flow_dissector.c78
-rw-r--r--net/core/gen_estimator.c2
-rw-r--r--net/core/gro.c107
-rw-r--r--net/core/hotdata.c1
-rw-r--r--net/core/link_watch.c28
-rw-r--r--net/core/lock_debug.c (renamed from net/core/rtnl_net_debug.c)37
-rw-r--r--net/core/lwtunnel.c105
-rw-r--r--net/core/neighbour.c48
-rw-r--r--net/core/net-procfs.c37
-rw-r--r--net/core/net-sysfs.c449
-rw-r--r--net/core/net_namespace.c186
-rw-r--r--net/core/netdev-genl-gen.c17
-rw-r--r--net/core/netdev-genl-gen.h7
-rw-r--r--net/core/netdev-genl.c382
-rw-r--r--net/core/netdev_rx_queue.c142
-rw-r--r--net/core/netmem_priv.h33
-rw-r--r--net/core/netpoll.c86
-rw-r--r--net/core/page_pool.c337
-rw-r--r--net/core/page_pool_priv.h2
-rw-r--r--net/core/page_pool_user.c22
-rw-r--r--net/core/pktgen.c454
-rw-r--r--net/core/rtnetlink.c300
-rw-r--r--net/core/scm.c132
-rw-r--r--net/core/secure_seq.c44
-rw-r--r--net/core/selftests.c22
-rw-r--r--net/core/skbuff.c445
-rw-r--r--net/core/skmsg.c63
-rw-r--r--net/core/sock.c227
-rw-r--r--net/core/sock_diag.c2
-rw-r--r--net/core/sock_map.c8
-rw-r--r--net/core/sysctl_net_core.c14
-rw-r--r--net/core/timestamping.c52
-rw-r--r--net/core/utils.c12
-rw-r--r--net/core/xdp.c375
49 files changed, 5268 insertions, 2259 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index d9326600e289..b2a76ce33932 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -9,7 +9,7 @@ obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \
obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
-obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
+obj-y += dev.o dev_api.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
fib_notifier.o xdp.o flow_offload.o gro.o \
@@ -45,5 +45,5 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
obj-$(CONFIG_OF) += of_net.o
obj-$(CONFIG_NET_TEST) += net_test.o
obj-$(CONFIG_NET_DEVMEM) += devmem.o
-obj-$(CONFIG_DEBUG_NET_SMALL_RTNL) += rtnl_net_debug.o
+obj-$(CONFIG_DEBUG_NET) += lock_debug.o
obj-$(CONFIG_FAIL_SKB_REALLOC) += skb_fault_injection.o
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 2f4ed83a75ae..2e538399757f 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -50,15 +50,16 @@ void bpf_sk_storage_free(struct sock *sk)
{
struct bpf_local_storage *sk_storage;
+ migrate_disable();
rcu_read_lock();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
- if (!sk_storage) {
- rcu_read_unlock();
- return;
- }
+ if (!sk_storage)
+ goto out;
bpf_local_storage_destroy(sk_storage);
+out:
rcu_read_unlock();
+ migrate_enable();
}
static void bpf_sk_storage_map_free(struct bpf_map *map)
@@ -160,6 +161,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
+ migrate_disable();
rcu_read_lock();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
@@ -212,6 +214,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
out:
rcu_read_unlock();
+ migrate_enable();
/* In case of an error, don't free anything explicitly here, the
* caller is responsible to call bpf_sk_storage_free.
@@ -352,11 +355,6 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = {
static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
{
- const struct btf *btf_vmlinux;
- const struct btf_type *t;
- const char *tname;
- u32 btf_id;
-
if (prog->aux->dst_prog)
return false;
@@ -371,13 +369,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
return true;
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
- btf_vmlinux = bpf_get_btf_vmlinux();
- if (IS_ERR_OR_NULL(btf_vmlinux))
- return false;
- btf_id = prog->aux->attach_btf_id;
- t = btf_type_by_id(btf_vmlinux, btf_id);
- tname = btf_name_by_offset(btf_vmlinux, t->name_off);
- return !!strncmp(tname, "bpf_sk_storage",
+ return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage",
strlen("bpf_sk_storage"));
default:
return false;
diff --git a/net/core/datagram.c b/net/core/datagram.c
index f0693707aece..94cc4705e91d 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -52,6 +52,7 @@
#include <linux/pagemap.h>
#include <linux/iov_iter.h>
#include <linux/indirect_call_wrapper.h>
+#include <linux/crc32.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -61,7 +62,8 @@
#include <net/tcp_states.h>
#include <trace/events/skb.h>
#include <net/busy_poll.h>
-#include <crypto/hash.h>
+
+#include "devmem.h"
/*
* Is a socket 'connection oriented' ?
@@ -163,8 +165,7 @@ done:
return skb;
}
-struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
- struct sk_buff_head *queue,
+struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue,
unsigned int flags,
int *off, int *err,
struct sk_buff **last)
@@ -261,7 +262,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
* However, this function was correct in any case. 8)
*/
spin_lock_irqsave(&queue->lock, cpu_flags);
- skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error,
+ skb = __skb_try_recv_from_queue(queue, flags, off, &error,
last);
spin_unlock_irqrestore(&queue->lock, cpu_flags);
if (error)
@@ -482,41 +483,37 @@ short_copy:
return 0;
}
-static size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
- struct iov_iter *i)
+#ifdef CONFIG_NET_CRC32C
+static size_t crc32c_and_copy_to_iter(const void *addr, size_t bytes,
+ void *_crcp, struct iov_iter *i)
{
-#ifdef CONFIG_CRYPTO_HASH
- struct ahash_request *hash = hashp;
- struct scatterlist sg;
+ u32 *crcp = _crcp;
size_t copied;
copied = copy_to_iter(addr, bytes, i);
- sg_init_one(&sg, addr, copied);
- ahash_request_set_crypt(hash, &sg, NULL, copied);
- crypto_ahash_update(hash);
+ *crcp = crc32c(*crcp, addr, copied);
return copied;
-#else
- return 0;
-#endif
}
/**
- * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator
- * and update a hash.
+ * skb_copy_and_crc32c_datagram_iter - Copy datagram to an iovec iterator
+ * and update a CRC32C value.
* @skb: buffer to copy
* @offset: offset in the buffer to start copying from
* @to: iovec iterator to copy to
* @len: amount of data to copy from buffer to iovec
- * @hash: hash request to update
+ * @crcp: pointer to CRC32C value to update
+ *
+ * Return: 0 on success, -EFAULT if there was a fault during copy.
*/
-int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
- struct iov_iter *to, int len,
- struct ahash_request *hash)
+int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len, u32 *crcp)
{
return __skb_datagram_iter(skb, offset, to, len, true,
- hash_and_copy_to_iter, hash);
+ crc32c_and_copy_to_iter, crcp);
}
-EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter);
+EXPORT_SYMBOL(skb_copy_and_crc32c_datagram_iter);
+#endif /* CONFIG_NET_CRC32C */
static size_t simple_copy_to_iter(const void *addr, size_t bytes,
void *data __always_unused, struct iov_iter *i)
@@ -692,9 +689,50 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
return 0;
}
+static int
+zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from,
+ int length,
+ struct net_devmem_dmabuf_binding *binding)
+{
+ int i = skb_shinfo(skb)->nr_frags;
+ size_t virt_addr, size, off;
+ struct net_iov *niov;
+
+ /* Devmem filling works by taking an IOVEC from the user where the
+ * iov_addrs are interpreted as an offset in bytes into the dma-buf to
+ * send from. We do not support other iter types.
+ */
+ if (iov_iter_type(from) != ITER_IOVEC &&
+ iov_iter_type(from) != ITER_UBUF)
+ return -EFAULT;
+
+ while (length && iov_iter_count(from)) {
+ if (i == MAX_SKB_FRAGS)
+ return -EMSGSIZE;
+
+ virt_addr = (size_t)iter_iov_addr(from);
+ niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size);
+ if (!niov)
+ return -EFAULT;
+
+ size = min_t(size_t, size, length);
+ size = min_t(size_t, size, iter_iov_len(from));
+
+ get_netmem(net_iov_to_netmem(niov));
+ skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off,
+ size, PAGE_SIZE);
+ iov_iter_advance(from, size);
+ length -= size;
+ i++;
+ }
+
+ return 0;
+}
+
int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb, struct iov_iter *from,
- size_t length)
+ size_t length,
+ struct net_devmem_dmabuf_binding *binding)
{
unsigned long orig_size = skb->truesize;
unsigned long truesize;
@@ -702,6 +740,8 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
if (msg && msg->msg_ubuf && msg->sg_from_iter)
ret = msg->sg_from_iter(skb, from, length);
+ else if (binding)
+ ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding);
else
ret = zerocopy_fill_skb_from_iter(skb, from, length);
@@ -735,7 +775,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
if (skb_copy_datagram_from_iter(skb, 0, from, copy))
return -EFAULT;
- return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U);
+ return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL);
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);
diff --git a/net/core/dev.c b/net/core/dev.c
index a9f62f5aeb84..be97c440ecd5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -92,6 +92,7 @@
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
+#include <linux/ethtool_netlink.h>
#include <linux/skbuff.h>
#include <linux/kthread.h>
#include <linux/bpf.h>
@@ -105,6 +106,7 @@
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/gro.h>
+#include <net/netdev_queues.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
@@ -154,9 +156,11 @@
#include <linux/pm_runtime.h>
#include <linux/prandom.h>
#include <linux/once_lite.h>
+#include <net/netdev_lock.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/types.h>
#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
#include <net/rps.h>
#include <linux/phy_link_topology.h>
@@ -180,8 +184,6 @@ static DEFINE_SPINLOCK(napi_hash_lock);
static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
-static DECLARE_RWSEM(devnet_rename_sem);
-
static inline void dev_base_seq_inc(struct net *net)
{
unsigned int val = net->dev_base_seq + 1;
@@ -460,7 +462,9 @@ EXPORT_PER_CPU_SYMBOL(softnet_data);
* PP consumers must pay attention to run APIs in the appropriate context
* (e.g. NAPI context).
*/
-static DEFINE_PER_CPU(struct page_pool *, system_page_pool);
+DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
#ifdef CONFIG_LOCKDEP
/*
@@ -570,10 +574,18 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
- if (pt->type == htons(ETH_P_ALL))
- return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
- else
- return pt->dev ? &pt->dev->ptype_specific :
+ if (pt->type == htons(ETH_P_ALL)) {
+ if (!pt->af_packet_net && !pt->dev)
+ return NULL;
+
+ return pt->dev ? &pt->dev->ptype_all :
+ &pt->af_packet_net->ptype_all;
+ }
+
+ if (pt->dev)
+ return &pt->dev->ptype_specific;
+
+ return pt->af_packet_net ? &pt->af_packet_net->ptype_specific :
&ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}
@@ -594,6 +606,9 @@ void dev_add_pack(struct packet_type *pt)
{
struct list_head *head = ptype_head(pt);
+ if (WARN_ON_ONCE(!head))
+ return;
+
spin_lock(&ptype_lock);
list_add_rcu(&pt->list, head);
spin_unlock(&ptype_lock);
@@ -618,6 +633,9 @@ void __dev_remove_pack(struct packet_type *pt)
struct list_head *head = ptype_head(pt);
struct packet_type *pt1;
+ if (!head)
+ return;
+
spin_lock(&ptype_lock);
list_for_each_entry(pt1, head, list) {
@@ -767,7 +785,8 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
}
/* must be called under rcu_read_lock(), as we dont take a reference */
-struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id)
+static struct napi_struct *
+netdev_napi_by_id(struct net *net, unsigned int napi_id)
{
struct napi_struct *napi;
@@ -784,6 +803,49 @@ struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id)
}
/**
+ * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it
+ * @net: the applicable net namespace
+ * @napi_id: ID of a NAPI of a target device
+ *
+ * Find a NAPI instance with @napi_id. Lock its device.
+ * The device must be in %NETREG_REGISTERED state for lookup to succeed.
+ * netdev_unlock() must be called to release it.
+ *
+ * Return: pointer to NAPI, its device with lock held, NULL if not found.
+ */
+struct napi_struct *
+netdev_napi_by_id_lock(struct net *net, unsigned int napi_id)
+{
+ struct napi_struct *napi;
+ struct net_device *dev;
+
+ rcu_read_lock();
+ napi = netdev_napi_by_id(net, napi_id);
+ if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ dev = napi->dev;
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock(dev, net);
+ if (!dev)
+ return NULL;
+
+ rcu_read_lock();
+ napi = netdev_napi_by_id(net, napi_id);
+ if (napi && napi->dev != dev)
+ napi = NULL;
+ rcu_read_unlock();
+
+ if (!napi)
+ netdev_unlock(dev);
+ return napi;
+}
+
+/**
* __dev_get_by_name - find a device by its name
* @net: the applicable net namespace
* @name: name to find
@@ -957,21 +1019,138 @@ EXPORT_SYMBOL(netdev_get_by_index);
* its reference counter increased so the caller must be careful
* about locking. The caller must hold RCU lock.
*/
-
struct net_device *dev_get_by_napi_id(unsigned int napi_id)
{
struct napi_struct *napi;
WARN_ON_ONCE(!rcu_read_lock_held());
- if (napi_id < MIN_NAPI_ID)
+ if (!napi_id_valid(napi_id))
return NULL;
napi = napi_by_id(napi_id);
return napi ? napi->dev : NULL;
}
-EXPORT_SYMBOL(dev_get_by_napi_id);
+
+/* Release the held reference on the net_device, and if the net_device
+ * is still registered try to lock the instance lock. If device is being
+ * unregistered NULL will be returned (but the reference has been released,
+ * either way!)
+ *
+ * This helper is intended for locking net_device after it has been looked up
+ * using a lockless lookup helper. Lock prevents the instance from going away.
+ */
+struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net)
+{
+ netdev_lock(dev);
+ if (dev->reg_state > NETREG_REGISTERED ||
+ dev->moving_ns || !net_eq(dev_net(dev), net)) {
+ netdev_unlock(dev);
+ dev_put(dev);
+ return NULL;
+ }
+ dev_put(dev);
+ return dev;
+}
+
+static struct net_device *
+__netdev_put_lock_ops_compat(struct net_device *dev, struct net *net)
+{
+ netdev_lock_ops_compat(dev);
+ if (dev->reg_state > NETREG_REGISTERED ||
+ dev->moving_ns || !net_eq(dev_net(dev), net)) {
+ netdev_unlock_ops_compat(dev);
+ dev_put(dev);
+ return NULL;
+ }
+ dev_put(dev);
+ return dev;
+}
+
+/**
+ * netdev_get_by_index_lock() - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. If a valid device
+ * with @ifindex is found it will be returned with netdev->lock held.
+ * netdev_unlock() must be called to release it.
+ *
+ * Return: pointer to a device with lock held, NULL if not found.
+ */
+struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return NULL;
+
+ return __netdev_put_lock(dev, net);
+}
+
+struct net_device *
+netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return NULL;
+
+ return __netdev_put_lock_ops_compat(dev, net);
+}
+
+struct net_device *
+netdev_xa_find_lock(struct net *net, struct net_device *dev,
+ unsigned long *index)
+{
+ if (dev)
+ netdev_unlock(dev);
+
+ do {
+ rcu_read_lock();
+ dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
+ if (!dev) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock(dev, net);
+ if (dev)
+ return dev;
+
+ (*index)++;
+ } while (true);
+}
+
+struct net_device *
+netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev,
+ unsigned long *index)
+{
+ if (dev)
+ netdev_unlock_ops_compat(dev);
+
+ do {
+ rcu_read_lock();
+ dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
+ if (!dev) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock_ops_compat(dev, net);
+ if (dev)
+ return dev;
+
+ (*index)++;
+ } while (true);
+}
static DEFINE_SEQLOCK(netdev_rename_lock);
@@ -1012,6 +1191,12 @@ out:
return ret;
}
+static bool dev_addr_cmp(struct net_device *dev, unsigned short type,
+ const char *ha)
+{
+ return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len);
+}
+
/**
* dev_getbyhwaddr_rcu - find a device by its hardware address
* @net: the applicable net namespace
@@ -1020,7 +1205,7 @@ out:
*
* Search for an interface by MAC address. Returns NULL if the device
* is not found or a pointer to the device.
- * The caller must hold RCU or RTNL.
+ * The caller must hold RCU.
* The returned device has not had its ref count increased
* and the caller must therefore be careful about locking
*
@@ -1032,14 +1217,39 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
struct net_device *dev;
for_each_netdev_rcu(net, dev)
- if (dev->type == type &&
- !memcmp(dev->dev_addr, ha, dev->addr_len))
+ if (dev_addr_cmp(dev, type, ha))
return dev;
return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
+/**
+ * dev_getbyhwaddr() - find a device by its hardware address
+ * @net: the applicable net namespace
+ * @type: media type of device
+ * @ha: hardware address
+ *
+ * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold
+ * rtnl_lock.
+ *
+ * Context: rtnl_lock() must be held.
+ * Return: pointer to the net_device, or NULL if not found
+ */
+struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type,
+ const char *ha)
+{
+ struct net_device *dev;
+
+ ASSERT_RTNL();
+ for_each_netdev(net, dev)
+ if (dev_addr_cmp(dev, type, ha))
+ return dev;
+
+ return NULL;
+}
+EXPORT_SYMBOL(dev_getbyhwaddr);
+
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
struct net_device *dev, *ret = NULL;
@@ -1230,33 +1440,18 @@ static int dev_get_valid_name(struct net *net, struct net_device *dev,
return ret < 0 ? ret : 0;
}
-/**
- * dev_change_name - change name of a device
- * @dev: device
- * @newname: name (or format string) must be at least IFNAMSIZ
- *
- * Change name of a device, can pass format strings "eth%d".
- * for wildcarding.
- */
-int dev_change_name(struct net_device *dev, const char *newname)
+int netif_change_name(struct net_device *dev, const char *newname)
{
+ struct net *net = dev_net(dev);
unsigned char old_assign_type;
char oldname[IFNAMSIZ];
int err = 0;
int ret;
- struct net *net;
- ASSERT_RTNL();
- BUG_ON(!dev_net(dev));
-
- net = dev_net(dev);
+ ASSERT_RTNL_NET(net);
- down_write(&devnet_rename_sem);
-
- if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
- up_write(&devnet_rename_sem);
+ if (!strncmp(newname, dev->name, IFNAMSIZ))
return 0;
- }
memcpy(oldname, dev->name, IFNAMSIZ);
@@ -1264,10 +1459,8 @@ int dev_change_name(struct net_device *dev, const char *newname)
err = dev_get_valid_name(net, dev, newname);
write_sequnlock_bh(&netdev_rename_lock);
- if (err < 0) {
- up_write(&devnet_rename_sem);
+ if (err < 0)
return err;
- }
if (oldname[0] && !strchr(oldname, '%'))
netdev_info(dev, "renamed from %s%s\n", oldname,
@@ -1279,14 +1472,13 @@ int dev_change_name(struct net_device *dev, const char *newname)
rollback:
ret = device_rename(&dev->dev, dev->name);
if (ret) {
+ write_seqlock_bh(&netdev_rename_lock);
memcpy(dev->name, oldname, IFNAMSIZ);
+ write_sequnlock_bh(&netdev_rename_lock);
WRITE_ONCE(dev->name_assign_type, old_assign_type);
- up_write(&devnet_rename_sem);
return ret;
}
- up_write(&devnet_rename_sem);
-
netdev_adjacent_rename_links(dev, oldname);
netdev_name_node_del(dev->name_node);
@@ -1302,7 +1494,6 @@ rollback:
/* err >= 0 after dev_alloc_name() or stores the first errno */
if (err >= 0) {
err = ret;
- down_write(&devnet_rename_sem);
write_seqlock_bh(&netdev_rename_lock);
memcpy(dev->name, oldname, IFNAMSIZ);
write_sequnlock_bh(&netdev_rename_lock);
@@ -1319,15 +1510,7 @@ rollback:
return err;
}
-/**
- * dev_set_alias - change ifalias of a device
- * @dev: device
- * @alias: name up to IFALIASZ
- * @len: limit of bytes to copy from info
- *
- * Set ifalias for a device,
- */
-int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
+int netif_set_alias(struct net_device *dev, const char *alias, size_t len)
{
struct dev_ifalias *new_alias = NULL;
@@ -1353,7 +1536,6 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
return len;
}
-EXPORT_SYMBOL(dev_set_alias);
/**
* dev_get_alias - get ifalias of a device
@@ -1390,16 +1572,10 @@ void netdev_features_change(struct net_device *dev)
}
EXPORT_SYMBOL(netdev_features_change);
-/**
- * netdev_state_change - device changes state
- * @dev: device to cause notification
- *
- * Called to indicate a device has changed state. This function calls
- * the notifier chains for netdev_chain and sends a NEWLINK message
- * to the routing socket.
- */
-void netdev_state_change(struct net_device *dev)
+void netif_state_change(struct net_device *dev)
{
+ netdev_ops_assert_locked_or_invisible(dev);
+
if (dev->flags & IFF_UP) {
struct netdev_notifier_change_info change_info = {
.info.dev = dev,
@@ -1410,7 +1586,6 @@ void netdev_state_change(struct net_device *dev)
rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
}
}
-EXPORT_SYMBOL(netdev_state_change);
/**
* __netdev_notify_peers - notify network peers about existence of @dev,
@@ -1499,6 +1674,8 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
set_bit(__LINK_STATE_START, &dev->state);
+ netdev_ops_assert_locked(dev);
+
if (ops->ndo_validate_addr)
ret = ops->ndo_validate_addr(dev);
@@ -1510,7 +1687,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
if (ret)
clear_bit(__LINK_STATE_START, &dev->state);
else {
- dev->flags |= IFF_UP;
+ netif_set_up(dev, true);
dev_set_rx_mode(dev);
dev_activate(dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
@@ -1519,20 +1696,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
return ret;
}
-/**
- * dev_open - prepare an interface for use.
- * @dev: device to open
- * @extack: netlink extended ack
- *
- * Takes a device from down to up state. The device's private open
- * function is invoked and then the multicast lists are loaded. Finally
- * the device is moved into the up state and a %NETDEV_UP message is
- * sent to the netdev notifier chain.
- *
- * Calling this function on an active interface is a nop. On a failure
- * a negative errno code is returned.
- */
-int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
+int netif_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
int ret;
@@ -1548,7 +1712,6 @@ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
return ret;
}
-EXPORT_SYMBOL(dev_open);
static void __dev_close_many(struct list_head *head)
{
@@ -1586,10 +1749,13 @@ static void __dev_close_many(struct list_head *head)
* We allow it to be called even after a DETACH hot-plug
* event.
*/
+
+ netdev_ops_assert_locked(dev);
+
if (ops->ndo_stop)
ops->ndo_stop(dev);
- dev->flags &= ~IFF_UP;
+ netif_set_up(dev, false);
netpoll_poll_enable(dev);
}
}
@@ -1623,16 +1789,7 @@ void dev_close_many(struct list_head *head, bool unlink)
}
EXPORT_SYMBOL(dev_close_many);
-/**
- * dev_close - shutdown an interface.
- * @dev: device to shutdown
- *
- * This function moves an active device into down state. A
- * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
- * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
- * chain.
- */
-void dev_close(struct net_device *dev)
+void netif_close(struct net_device *dev)
{
if (dev->flags & IFF_UP) {
LIST_HEAD(single);
@@ -1642,18 +1799,9 @@ void dev_close(struct net_device *dev)
list_del(&single);
}
}
-EXPORT_SYMBOL(dev_close);
+EXPORT_SYMBOL(netif_close);
-
-/**
- * dev_disable_lro - disable Large Receive Offload on a device
- * @dev: device
- *
- * Disable Large Receive Offload (LRO) on a net device. Must be
- * called under RTNL. This is needed if received packets may be
- * forwarded to another interface.
- */
-void dev_disable_lro(struct net_device *dev)
+void netif_disable_lro(struct net_device *dev)
{
struct net_device *lower_dev;
struct list_head *iter;
@@ -1664,10 +1812,13 @@ void dev_disable_lro(struct net_device *dev)
if (unlikely(dev->features & NETIF_F_LRO))
netdev_WARN(dev, "failed to disable LRO!\n");
- netdev_for_each_lower_dev(dev, lower_dev, iter)
- dev_disable_lro(lower_dev);
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ netdev_lock_ops(lower_dev);
+ netif_disable_lro(lower_dev);
+ netdev_unlock_ops(lower_dev);
+ }
}
-EXPORT_SYMBOL(dev_disable_lro);
+EXPORT_IPV6_MOD(netif_disable_lro);
/**
* dev_disable_gro_hw - disable HW Generic Receive Offload on a device
@@ -1755,7 +1906,9 @@ static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
int err;
for_each_netdev(net, dev) {
+ netdev_lock_ops(dev);
err = call_netdevice_register_notifiers(nb, dev);
+ netdev_unlock_ops(dev);
if (err)
goto rollback;
}
@@ -1799,14 +1952,19 @@ int register_netdevice_notifier(struct notifier_block *nb)
/* Close race with setup_net() and cleanup_net() */
down_write(&pernet_ops_rwsem);
+
+ /* When RTNL is removed, we need protection for netdev_chain. */
rtnl_lock();
+
err = raw_notifier_chain_register(&netdev_chain, nb);
if (err)
goto unlock;
if (dev_boot_phase)
goto unlock;
for_each_net(net) {
+ __rtnl_net_lock(net);
err = call_netdevice_register_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
if (err)
goto rollback;
}
@@ -1817,8 +1975,11 @@ unlock:
return err;
rollback:
- for_each_net_continue_reverse(net)
+ for_each_net_continue_reverse(net) {
+ __rtnl_net_lock(net);
call_netdevice_unregister_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
+ }
raw_notifier_chain_unregister(&netdev_chain, nb);
goto unlock;
@@ -1851,8 +2012,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
if (err)
goto unlock;
- for_each_net(net)
+ for_each_net(net) {
+ __rtnl_net_lock(net);
call_netdevice_unregister_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
+ }
unlock:
rtnl_unlock();
@@ -1916,9 +2080,10 @@ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
{
int err;
- rtnl_lock();
+ rtnl_net_lock(net);
err = __register_netdevice_notifier_net(net, nb, false);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_net);
@@ -1944,9 +2109,10 @@ int unregister_netdevice_notifier_net(struct net *net,
{
int err;
- rtnl_lock();
+ rtnl_net_lock(net);
err = __unregister_netdevice_notifier_net(net, nb);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_net);
@@ -1959,19 +2125,56 @@ static void __move_netdevice_notifier_net(struct net *src_net,
__register_netdevice_notifier_net(dst_net, nb, true);
}
+static void rtnl_net_dev_lock(struct net_device *dev)
+{
+ bool again;
+
+ do {
+ struct net *net;
+
+ again = false;
+
+ /* netns might be being dismantled. */
+ rcu_read_lock();
+ net = dev_net_rcu(dev);
+ net_passive_inc(net);
+ rcu_read_unlock();
+
+ rtnl_net_lock(net);
+
+#ifdef CONFIG_NET_NS
+ /* dev might have been moved to another netns. */
+ if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) {
+ rtnl_net_unlock(net);
+ net_passive_dec(net);
+ again = true;
+ }
+#endif
+ } while (again);
+}
+
+static void rtnl_net_dev_unlock(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+
+ rtnl_net_unlock(net);
+ net_passive_dec(net);
+}
+
int register_netdevice_notifier_dev_net(struct net_device *dev,
struct notifier_block *nb,
struct netdev_net_notifier *nn)
{
int err;
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
if (!err) {
nn->nb = nb;
list_add(&nn->list, &dev->net_notifier_list);
}
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
+
return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
@@ -1982,10 +2185,11 @@ int unregister_netdevice_notifier_dev_net(struct net_device *dev,
{
int err;
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
list_del(&nn->list);
err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
+
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
@@ -2134,8 +2338,8 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
#ifdef CONFIG_NET_CLS_ACT
-DEFINE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key);
-EXPORT_SYMBOL(tcf_bypass_check_needed_key);
+DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key);
+EXPORT_SYMBOL(tcf_sw_enabled_key);
#endif
DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
@@ -2301,16 +2505,21 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
}
/**
- * dev_nit_active - return true if any network interface taps are in use
+ * dev_nit_active_rcu - return true if any network interface taps are in use
+ *
+ * The caller must hold the RCU lock
*
* @dev: network device to check for the presence of taps
*/
-bool dev_nit_active(struct net_device *dev)
+bool dev_nit_active_rcu(const struct net_device *dev)
{
- return !list_empty(&net_hotdata.ptype_all) ||
+ /* Callers may hold either RCU or RCU BH lock */
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+
+ return !list_empty(&dev_net(dev)->ptype_all) ||
!list_empty(&dev->ptype_all);
}
-EXPORT_SYMBOL_GPL(dev_nit_active);
+EXPORT_SYMBOL_GPL(dev_nit_active_rcu);
/*
* Support routine. Sends outgoing frames to any network
@@ -2319,11 +2528,12 @@ EXPORT_SYMBOL_GPL(dev_nit_active);
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
- struct list_head *ptype_list = &net_hotdata.ptype_all;
struct packet_type *ptype, *pt_prev = NULL;
+ struct list_head *ptype_list;
struct sk_buff *skb2 = NULL;
rcu_read_lock();
+ ptype_list = &dev_net_rcu(dev)->ptype_all;
again:
list_for_each_entry_rcu(ptype, ptype_list, list) {
if (READ_ONCE(ptype->ignore_outgoing))
@@ -2367,7 +2577,7 @@ again:
pt_prev = ptype;
}
- if (ptype_list == &net_hotdata.ptype_all) {
+ if (ptype_list != &dev->ptype_all) {
ptype_list = &dev->ptype_all;
goto again;
}
@@ -2970,6 +3180,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (dev->reg_state == NETREG_REGISTERED ||
dev->reg_state == NETREG_UNREGISTERING) {
ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
txq);
@@ -3000,7 +3211,6 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
-#ifdef CONFIG_SYSFS
/**
* netif_set_real_num_rx_queues - set actual number of RX queues used
* @dev: Network device
@@ -3020,6 +3230,7 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
if (dev->reg_state == NETREG_REGISTERED) {
ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
rxq);
@@ -3031,7 +3242,6 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
-#endif
/**
* netif_set_real_num_queues - set actual number of RX and TX queues used
@@ -3263,7 +3473,7 @@ void netif_device_attach(struct net_device *dev)
if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
netif_running(dev)) {
netif_tx_wake_all_queues(dev);
- __netdev_watchdog_up(dev);
+ netdev_watchdog_up(dev);
}
}
EXPORT_SYMBOL(netif_device_attach);
@@ -3386,9 +3596,10 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
+#ifdef CONFIG_NET_CRC32C
int skb_crc32c_csum_help(struct sk_buff *skb)
{
- __le32 crc32c_csum;
+ u32 crc;
int ret = 0, offset, start;
if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -3416,15 +3627,14 @@ int skb_crc32c_csum_help(struct sk_buff *skb)
if (ret)
goto out;
- crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
- skb->len - start, ~(__u32)0,
- crc32c_csum_stub));
- *(__le32 *)(skb->data + offset) = crc32c_csum;
+ crc = ~skb_crc32c(skb, start, skb->len - start, ~0);
+ *(__le32 *)(skb->data + offset) = cpu_to_le32(crc);
skb_reset_csum_not_inet(skb);
out:
return ret;
}
EXPORT_SYMBOL(skb_crc32c_csum_help);
+#endif /* CONFIG_NET_CRC32C */
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
@@ -3612,7 +3822,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
unsigned int len;
int rc;
- if (dev_nit_active(dev))
+ if (dev_nit_active_rcu(dev))
dev_queue_xmit_nit(skb, dev);
len = skb->len;
@@ -3688,10 +3898,43 @@ sw_checksum:
}
EXPORT_SYMBOL(skb_csum_hwoffload_help);
+static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct skb_shared_info *shinfo;
+ struct net_iov *niov;
+
+ if (likely(skb_frags_readable(skb)))
+ goto out;
+
+ if (!dev->netmem_tx)
+ goto out_free;
+
+ shinfo = skb_shinfo(skb);
+
+ if (shinfo->nr_frags > 0) {
+ niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
+ if (net_is_devmem_iov(niov) &&
+ net_devmem_iov_binding(niov)->dev != dev)
+ goto out_free;
+ }
+
+out:
+ return skb;
+
+out_free:
+ kfree_skb(skb);
+ return NULL;
+}
+
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
{
netdev_features_t features;
+ skb = validate_xmit_unreadable_skb(skb, dev);
+ if (unlikely(!skb))
+ goto out_null;
+
features = netif_skb_features(skb);
skb = validate_xmit_vlan(skb, features);
if (unlikely(!skb))
@@ -4030,10 +4273,13 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
if (!miniq)
return ret;
- if (static_branch_unlikely(&tcf_bypass_check_needed_key)) {
- if (tcf_block_bypass_sw(miniq->block))
- return ret;
- }
+ /* Global bypass */
+ if (!static_branch_likely(&tcf_sw_enabled_key))
+ return ret;
+
+ /* Block-wise bypass */
+ if (tcf_block_bypass_sw(miniq->block))
+ return ret;
tc_skb_cb(skb)->mru = 0;
tc_skb_cb(skb)->post_ct = false;
@@ -4382,7 +4628,8 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
skb_reset_mac_header(skb);
skb_assert_len(skb);
- if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
+ if (unlikely(skb_shinfo(skb)->tx_flags &
+ (SKBTX_SCHED_TSTAMP | SKBTX_BPF)))
__skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
/* Disable soft irqs for various locks below. Also
@@ -4568,13 +4815,14 @@ static inline void ____napi_schedule(struct softnet_data *sd,
}
use_local_napi:
+ DEBUG_NET_WARN_ON_ONCE(!list_empty(&napi->poll_list));
list_add_tail(&napi->poll_list, &sd->poll_list);
WRITE_ONCE(napi->list_owner, smp_processor_id());
/* If not called from net_rx_action()
* we have to raise NET_RX_SOFTIRQ.
*/
if (!sd->in_net_rx_action)
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
#ifdef CONFIG_RPS
@@ -4584,6 +4832,11 @@ EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
+static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
+{
+ return hash_32(hash, flow_table->log);
+}
+
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow *rflow, u16 next_cpu)
@@ -4610,7 +4863,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
- flow_id = skb_get_hash(skb) & flow_table->mask;
+ flow_id = rfs_slot(skb_get_hash(skb), flow_table);
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
rxq_index, flow_id);
if (rc < 0)
@@ -4689,7 +4942,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
*/
- rflow = &flow_table->flows[hash & flow_table->mask];
+ rflow = &flow_table->flows[rfs_slot(hash, flow_table)];
tcpu = rflow->cpu;
/*
@@ -4756,13 +5009,13 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
- if (flow_table && flow_id <= flow_table->mask) {
+ if (flow_table && flow_id < (1UL << flow_table->log)) {
rflow = &flow_table->flows[flow_id];
cpu = READ_ONCE(rflow->cpu);
if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids &&
((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
READ_ONCE(rflow->last_qtail)) <
- (int)(10 * flow_table->mask)))
+ (int)(10 << flow_table->log)))
expire = false;
}
rcu_read_unlock();
@@ -4778,7 +5031,8 @@ static void rps_trigger_softirq(void *data)
struct softnet_data *sd = data;
____napi_schedule(sd, &sd->backlog);
- sd->received_rps++;
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(sd->received_rps, sd->received_rps + 1);
}
#endif /* CONFIG_RPS */
@@ -4863,7 +5117,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
rcu_read_lock();
fl = rcu_dereference(sd->flow_limit);
if (fl) {
- new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
+ new_flow = hash_32(skb_get_hash(skb), fl->log_buckets);
old_flow = fl->history[fl->history_head];
fl->history[fl->history_head] = new_flow;
@@ -4874,7 +5128,8 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
fl->buckets[old_flow]--;
if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
- fl->count++;
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(fl->count, fl->count + 1);
rcu_read_unlock();
return true;
}
@@ -4963,7 +5218,7 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
}
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
void *orig_data, *orig_data_end, *hard_start;
struct netdev_rx_queue *rxqueue;
@@ -5065,12 +5320,15 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
}
static int
-netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
+netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog)
{
struct sk_buff *skb = *pskb;
int err, hroom, troom;
- if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog))
+ local_lock_nested_bh(&system_page_pool.bh_lock);
+ err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog);
+ local_unlock_nested_bh(&system_page_pool.bh_lock);
+ if (!err)
return 0;
/* In case we have to go down the path and also linearize,
@@ -5089,7 +5347,7 @@ netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
struct sk_buff *skb = *pskb;
u32 mac_len, act = XDP_DROP;
@@ -5142,7 +5400,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
* and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
* queues, so they do not have this starvation issue.
*/
-void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
+void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
@@ -5167,7 +5425,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
-int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
+int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb)
{
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
@@ -5506,8 +5764,14 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
orig_dev = skb->dev;
skb_reset_network_header(skb);
+#if !defined(CONFIG_DEBUG_NET)
+ /* We plan to no longer reset the transport header here.
+ * Give some time to fuzzers and dev build to catch bugs
+ * in network stacks.
+ */
if (!skb_transport_header_was_set(skb))
skb_reset_transport_header(skb);
+#endif
skb_reset_mac_len(skb);
pt_prev = NULL;
@@ -5543,7 +5807,8 @@ another_round:
if (pfmemalloc)
goto skip_taps;
- list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
+ list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all,
+ list) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
@@ -5655,6 +5920,14 @@ check_vlan_id:
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&ptype_base[ntohs(type) &
PTYPE_HASH_MASK]);
+
+ /* orig_dev and skb->dev could belong to different netns;
+ * Even in such case we need to traverse only the list
+ * coming from skb->dev, as the ptype owner (packet socket)
+ * will use dev_net(skb->dev) to do namespace filtering.
+ */
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &dev_net_rcu(skb->dev)->ptype_specific);
}
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
@@ -5865,7 +6138,7 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
static_branch_dec(&generic_xdp_needed_key);
} else if (new && !old) {
static_branch_inc(&generic_xdp_needed_key);
- dev_disable_lro(dev);
+ netif_disable_lro(dev);
dev_disable_gro_hw(dev);
}
break;
@@ -5991,22 +6264,22 @@ void netif_receive_skb_list(struct list_head *head)
}
EXPORT_SYMBOL(netif_receive_skb_list);
-static DEFINE_PER_CPU(struct work_struct, flush_works);
-
/* Network device is going away, flush any packets still pending */
static void flush_backlog(struct work_struct *work)
{
struct sk_buff *skb, *tmp;
+ struct sk_buff_head list;
struct softnet_data *sd;
+ __skb_queue_head_init(&list);
local_bh_disable();
sd = this_cpu_ptr(&softnet_data);
backlog_lock_irq_disable(sd);
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
- if (skb->dev->reg_state == NETREG_UNREGISTERING) {
+ if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->input_pkt_queue);
- dev_kfree_skb_irq(skb);
+ __skb_queue_tail(&list, skb);
rps_input_queue_head_incr(sd);
}
}
@@ -6014,14 +6287,16 @@ static void flush_backlog(struct work_struct *work)
local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
- if (skb->dev->reg_state == NETREG_UNREGISTERING) {
+ if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->process_queue);
- kfree_skb(skb);
+ __skb_queue_tail(&list, skb);
rps_input_queue_head_incr(sd);
}
}
local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
local_bh_enable();
+
+ __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY);
}
static bool flush_required(int cpu)
@@ -6049,36 +6324,54 @@ static bool flush_required(int cpu)
return true;
}
+struct flush_backlogs {
+ cpumask_t flush_cpus;
+ struct work_struct w[];
+};
+
+static struct flush_backlogs *flush_backlogs_alloc(void)
+{
+ return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids),
+ GFP_KERNEL);
+}
+
+static struct flush_backlogs *flush_backlogs_fallback;
+static DEFINE_MUTEX(flush_backlogs_mutex);
+
static void flush_all_backlogs(void)
{
- static cpumask_t flush_cpus;
+ struct flush_backlogs *ptr = flush_backlogs_alloc();
unsigned int cpu;
- /* since we are under rtnl lock protection we can use static data
- * for the cpumask and avoid allocating on stack the possibly
- * large mask
- */
- ASSERT_RTNL();
+ if (!ptr) {
+ mutex_lock(&flush_backlogs_mutex);
+ ptr = flush_backlogs_fallback;
+ }
+ cpumask_clear(&ptr->flush_cpus);
cpus_read_lock();
- cpumask_clear(&flush_cpus);
for_each_online_cpu(cpu) {
if (flush_required(cpu)) {
- queue_work_on(cpu, system_highpri_wq,
- per_cpu_ptr(&flush_works, cpu));
- cpumask_set_cpu(cpu, &flush_cpus);
+ INIT_WORK(&ptr->w[cpu], flush_backlog);
+ queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]);
+ __cpumask_set_cpu(cpu, &ptr->flush_cpus);
}
}
/* we can have in flight packet[s] on the cpus we are not flushing,
* synchronize_net() in unregister_netdevice_many() will take care of
- * them
+ * them.
*/
- for_each_cpu(cpu, &flush_cpus)
- flush_work(per_cpu_ptr(&flush_works, cpu));
+ for_each_cpu(cpu, &ptr->flush_cpus)
+ flush_work(&ptr->w[cpu]);
cpus_read_unlock();
+
+ if (ptr != flush_backlogs_fallback)
+ kfree(ptr);
+ else
+ mutex_unlock(&flush_backlogs_mutex);
}
static void net_rps_send_ipi(struct softnet_data *remsd)
@@ -6267,7 +6560,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
return false;
if (work_done) {
- if (n->gro_bitmask)
+ if (n->gro.bitmask)
timeout = napi_get_gro_flush_timeout(n);
n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
}
@@ -6277,15 +6570,14 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
if (timeout)
ret = false;
}
- if (n->gro_bitmask) {
- /* When the NAPI instance uses a timeout and keeps postponing
- * it, we need to bound somehow the time packets are kept in
- * the GRO layer
- */
- napi_gro_flush(n, !!timeout);
- }
- gro_normal_list(n);
+ /*
+ * When the NAPI instance uses a timeout and keeps postponing
+ * it, we need to bound somehow the time packets are kept in
+ * the GRO layer.
+ */
+ gro_flush(&n->gro, !!timeout);
+ gro_normal_list(&n->gro);
if (unlikely(!list_empty(&n->poll_list))) {
/* If n->poll_list is not empty, we need to mask irqs */
@@ -6349,19 +6641,15 @@ static void skb_defer_free_flush(struct softnet_data *sd)
static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
{
if (!skip_schedule) {
- gro_normal_list(napi);
+ gro_normal_list(&napi->gro);
__napi_schedule(napi);
return;
}
- if (napi->gro_bitmask) {
- /* flush too old packets
- * If HZ < 1000, flush all packets.
- */
- napi_gro_flush(napi, HZ >= 1000);
- }
+ /* Flush too old packets. If HZ < 1000, flush all packets */
+ gro_flush(&napi->gro, HZ >= 1000);
+ gro_normal_list(&napi->gro);
- gro_normal_list(napi);
clear_bit(NAPI_STATE_SCHED, &napi->state);
}
@@ -6468,7 +6756,7 @@ restart:
}
work = napi_poll(napi, budget);
trace_napi_poll(napi, work, budget);
- gro_normal_list(napi);
+ gro_normal_list(&napi->gro);
count:
if (work > 0)
__NET_ADD_STATS(dev_net(napi->dev),
@@ -6568,7 +6856,9 @@ void napi_resume_irqs(unsigned int napi_id)
static void __napi_hash_add_with_id(struct napi_struct *napi,
unsigned int napi_id)
{
- napi->napi_id = napi_id;
+ napi->gro.cached_napi_id = napi_id;
+
+ WRITE_ONCE(napi->napi_id, napi_id);
hlist_add_head_rcu(&napi->napi_hash_node,
&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
}
@@ -6595,7 +6885,7 @@ static void napi_hash_add(struct napi_struct *napi)
/* 0..NR_CPUS range is reserved for sender_cpu use */
do {
- if (unlikely(++napi_gen_id < MIN_NAPI_ID))
+ if (unlikely(!napi_id_valid(++napi_gen_id)))
napi_gen_id = MIN_NAPI_ID;
} while (napi_by_id(napi_gen_id));
@@ -6636,22 +6926,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void init_gro_hash(struct napi_struct *napi)
-{
- int i;
-
- for (i = 0; i < GRO_HASH_BUCKETS; i++) {
- INIT_LIST_HEAD(&napi->gro_hash[i].list);
- napi->gro_hash[i].count = 0;
- }
- napi->gro_bitmask = 0;
-}
-
int dev_set_threaded(struct net_device *dev, bool threaded)
{
struct napi_struct *napi;
int err = 0;
+ netdev_assert_locked_or_invisible(dev);
+
if (dev->threaded == threaded)
return 0;
@@ -6706,8 +6987,7 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
if (WARN_ON_ONCE(napi && !napi->dev))
return;
- if (dev->reg_state >= NETREG_REGISTERED)
- ASSERT_RTNL();
+ netdev_ops_assert_locked_or_invisible(dev);
switch (type) {
case NETDEV_QUEUE_TYPE_RX:
@@ -6724,19 +7004,184 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
}
EXPORT_SYMBOL(netif_queue_set_napi);
+static void
+netif_napi_irq_notify(struct irq_affinity_notify *notify,
+ const cpumask_t *mask)
+{
+ struct napi_struct *napi =
+ container_of(notify, struct napi_struct, notify);
+#ifdef CONFIG_RFS_ACCEL
+ struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;
+ int err;
+#endif
+
+ if (napi->config && napi->dev->irq_affinity_auto)
+ cpumask_copy(&napi->config->affinity_mask, mask);
+
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask);
+ if (err)
+ netdev_warn(napi->dev, "RMAP update failed (%d)\n",
+ err);
+ }
+#endif
+}
+
+#ifdef CONFIG_RFS_ACCEL
+static void netif_napi_affinity_release(struct kref *ref)
+{
+ struct napi_struct *napi =
+ container_of(ref, struct napi_struct, notify.kref);
+ struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;
+
+ netdev_assert_locked(napi->dev);
+ WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER,
+ &napi->state));
+
+ if (!napi->dev->rx_cpu_rmap_auto)
+ return;
+ rmap->obj[napi->napi_rmap_idx] = NULL;
+ napi->napi_rmap_idx = -1;
+ cpu_rmap_put(rmap);
+}
+
+int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
+{
+ if (dev->rx_cpu_rmap_auto)
+ return 0;
+
+ dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs);
+ if (!dev->rx_cpu_rmap)
+ return -ENOMEM;
+
+ dev->rx_cpu_rmap_auto = true;
+ return 0;
+}
+EXPORT_SYMBOL(netif_enable_cpu_rmap);
+
+static void netif_del_cpu_rmap(struct net_device *dev)
+{
+ struct cpu_rmap *rmap = dev->rx_cpu_rmap;
+
+ if (!dev->rx_cpu_rmap_auto)
+ return;
+
+ /* Free the rmap */
+ cpu_rmap_put(rmap);
+ dev->rx_cpu_rmap = NULL;
+ dev->rx_cpu_rmap_auto = false;
+}
+
+#else
+static void netif_napi_affinity_release(struct kref *ref)
+{
+}
+
+int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
+{
+ return 0;
+}
+EXPORT_SYMBOL(netif_enable_cpu_rmap);
+
+static void netif_del_cpu_rmap(struct net_device *dev)
+{
+}
+#endif
+
+void netif_set_affinity_auto(struct net_device *dev)
+{
+ unsigned int i, maxqs, numa;
+
+ maxqs = max(dev->num_tx_queues, dev->num_rx_queues);
+ numa = dev_to_node(&dev->dev);
+
+ for (i = 0; i < maxqs; i++)
+ cpumask_set_cpu(cpumask_local_spread(i, numa),
+ &dev->napi_config[i].affinity_mask);
+
+ dev->irq_affinity_auto = true;
+}
+EXPORT_SYMBOL(netif_set_affinity_auto);
+
+void netif_napi_set_irq_locked(struct napi_struct *napi, int irq)
+{
+ int rc;
+
+ netdev_assert_locked_or_invisible(napi->dev);
+
+ if (napi->irq == irq)
+ return;
+
+ /* Remove existing resources */
+ if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
+ irq_set_affinity_notifier(napi->irq, NULL);
+
+ napi->irq = irq;
+ if (irq < 0 ||
+ (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto))
+ return;
+
+ /* Abort for buggy drivers */
+ if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config))
+ return;
+
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi);
+ if (rc < 0)
+ return;
+
+ cpu_rmap_get(napi->dev->rx_cpu_rmap);
+ napi->napi_rmap_idx = rc;
+ }
+#endif
+
+ /* Use core IRQ notifier */
+ napi->notify.notify = netif_napi_irq_notify;
+ napi->notify.release = netif_napi_affinity_release;
+ rc = irq_set_affinity_notifier(irq, &napi->notify);
+ if (rc) {
+ netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n",
+ rc);
+ goto put_rmap;
+ }
+
+ set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state);
+ return;
+
+put_rmap:
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL;
+ cpu_rmap_put(napi->dev->rx_cpu_rmap);
+ napi->napi_rmap_idx = -1;
+ }
+#endif
+ napi->notify.notify = NULL;
+ napi->notify.release = NULL;
+}
+EXPORT_SYMBOL(netif_napi_set_irq_locked);
+
static void napi_restore_config(struct napi_struct *n)
{
n->defer_hard_irqs = n->config->defer_hard_irqs;
n->gro_flush_timeout = n->config->gro_flush_timeout;
n->irq_suspend_timeout = n->config->irq_suspend_timeout;
+
+ if (n->dev->irq_affinity_auto &&
+ test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state))
+ irq_set_affinity(n->irq, &n->config->affinity_mask);
+
/* a NAPI ID might be stored in the config, if so use it. if not, use
- * napi_hash_add to generate one for us. It will be saved to the config
- * in napi_disable.
+ * napi_hash_add to generate one for us.
*/
- if (n->config->napi_id)
+ if (n->config->napi_id) {
napi_hash_add_with_id(n, n->config->napi_id);
- else
+ } else {
napi_hash_add(n);
+ n->config->napi_id = n->napi_id;
+ }
}
static void napi_save_config(struct napi_struct *n)
@@ -6744,24 +7189,70 @@ static void napi_save_config(struct napi_struct *n)
n->config->defer_hard_irqs = n->defer_hard_irqs;
n->config->gro_flush_timeout = n->gro_flush_timeout;
n->config->irq_suspend_timeout = n->irq_suspend_timeout;
- n->config->napi_id = n->napi_id;
napi_hash_del(n);
}
-void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
- int (*poll)(struct napi_struct *, int), int weight)
+/* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will
+ * inherit an existing ID try to insert it at the right position.
+ */
+static void
+netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
+{
+ unsigned int new_id, pos_id;
+ struct list_head *higher;
+ struct napi_struct *pos;
+
+ new_id = UINT_MAX;
+ if (napi->config && napi->config->napi_id)
+ new_id = napi->config->napi_id;
+
+ higher = &dev->napi_list;
+ list_for_each_entry(pos, &dev->napi_list, dev_list) {
+ if (napi_id_valid(pos->napi_id))
+ pos_id = pos->napi_id;
+ else if (pos->config)
+ pos_id = pos->config->napi_id;
+ else
+ pos_id = UINT_MAX;
+
+ if (pos_id <= new_id)
+ break;
+ higher = &pos->dev_list;
+ }
+ list_add_rcu(&napi->dev_list, higher); /* adds after higher */
+}
+
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure bug fixed in 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
+ */
+static void napi_get_frags_check(struct napi_struct *napi)
+{
+ struct sk_buff *skb;
+
+ local_bh_disable();
+ skb = napi_get_frags(napi);
+ WARN_ON_ONCE(skb && skb->head_frag);
+ napi_free_frags(napi);
+ local_bh_enable();
+}
+
+void netif_napi_add_weight_locked(struct net_device *dev,
+ struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int),
+ int weight)
{
+ netdev_assert_locked(dev);
if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
return;
INIT_LIST_HEAD(&napi->poll_list);
INIT_HLIST_NODE(&napi->napi_hash_node);
- hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
- napi->timer.function = napi_watchdog;
- init_gro_hash(napi);
+ hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ gro_init(&napi->gro);
napi->skb = NULL;
- INIT_LIST_HEAD(&napi->rx_list);
- napi->rx_count = 0;
napi->poll = poll;
if (weight > NAPI_POLL_WEIGHT)
netdev_err_once(dev, "%s() called with weight %d\n", __func__,
@@ -6774,7 +7265,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
napi->list_owner = -1;
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
- list_add_rcu(&napi->dev_list, &dev->napi_list);
+ netif_napi_dev_list_add(dev, napi);
/* default settings from sysfs are applied to all NAPIs. any per-NAPI
* configuration will be loaded in napi_enable
@@ -6789,15 +7280,17 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
*/
if (dev->threaded && napi_kthread_create(napi))
dev->threaded = false;
- netif_napi_set_irq(napi, -1);
+ netif_napi_set_irq_locked(napi, -1);
}
-EXPORT_SYMBOL(netif_napi_add_weight);
+EXPORT_SYMBOL(netif_napi_add_weight_locked);
-void napi_disable(struct napi_struct *n)
+void napi_disable_locked(struct napi_struct *n)
{
unsigned long val, new;
might_sleep();
+ netdev_assert_locked(n->dev);
+
set_bit(NAPI_STATE_DISABLE, &n->state);
val = READ_ONCE(n->state);
@@ -6820,16 +7313,25 @@ void napi_disable(struct napi_struct *n)
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
-EXPORT_SYMBOL(napi_disable);
+EXPORT_SYMBOL(napi_disable_locked);
/**
- * napi_enable - enable NAPI scheduling
- * @n: NAPI context
+ * napi_disable() - prevent NAPI from scheduling
+ * @n: NAPI context
*
- * Resume NAPI from being scheduled on this context.
- * Must be paired with napi_disable.
+ * Stop NAPI from being scheduled on this context.
+ * Waits till any outstanding processing completes.
+ * Takes netdev_lock() for associated net_device.
*/
-void napi_enable(struct napi_struct *n)
+void napi_disable(struct napi_struct *n)
+{
+ netdev_lock(n->dev);
+ napi_disable_locked(n);
+ netdev_unlock(n->dev);
+}
+EXPORT_SYMBOL(napi_disable);
+
+void napi_enable_locked(struct napi_struct *n)
{
unsigned long new, val = READ_ONCE(n->state);
@@ -6846,27 +7348,38 @@ void napi_enable(struct napi_struct *n)
new |= NAPIF_STATE_THREADED;
} while (!try_cmpxchg(&n->state, &val, new));
}
-EXPORT_SYMBOL(napi_enable);
+EXPORT_SYMBOL(napi_enable_locked);
-static void flush_gro_hash(struct napi_struct *napi)
+/**
+ * napi_enable() - enable NAPI scheduling
+ * @n: NAPI context
+ *
+ * Enable scheduling of a NAPI instance.
+ * Must be paired with napi_disable().
+ * Takes netdev_lock() for associated net_device.
+ */
+void napi_enable(struct napi_struct *n)
{
- int i;
-
- for (i = 0; i < GRO_HASH_BUCKETS; i++) {
- struct sk_buff *skb, *n;
-
- list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
- kfree_skb(skb);
- napi->gro_hash[i].count = 0;
- }
+ netdev_lock(n->dev);
+ napi_enable_locked(n);
+ netdev_unlock(n->dev);
}
+EXPORT_SYMBOL(napi_enable);
/* Must be called in process context */
-void __netif_napi_del(struct napi_struct *napi)
+void __netif_napi_del_locked(struct napi_struct *napi)
{
+ netdev_assert_locked(napi->dev);
+
if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
return;
+ /* Make sure NAPI is disabled (or was never enabled). */
+ WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state));
+
+ if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
+ irq_set_affinity_notifier(napi->irq, NULL);
+
if (napi->config) {
napi->index = -1;
napi->config = NULL;
@@ -6875,15 +7388,14 @@ void __netif_napi_del(struct napi_struct *napi)
list_del_rcu(&napi->dev_list);
napi_free_frags(napi);
- flush_gro_hash(napi);
- napi->gro_bitmask = 0;
+ gro_cleanup(&napi->gro);
if (napi->thread) {
kthread_stop(napi->thread);
napi->thread = NULL;
}
}
-EXPORT_SYMBOL(__netif_napi_del);
+EXPORT_SYMBOL(__netif_napi_del_locked);
static int __napi_poll(struct napi_struct *n, bool *repoll)
{
@@ -6935,14 +7447,9 @@ static int __napi_poll(struct napi_struct *n, bool *repoll)
return work;
}
- if (n->gro_bitmask) {
- /* flush too old packets
- * If HZ < 1000, flush all packets.
- */
- napi_gro_flush(n, HZ >= 1000);
- }
-
- gro_normal_list(n);
+ /* Flush too old packets. If HZ < 1000, flush all packets */
+ gro_flush(&n->gro, HZ >= 1000);
+ gro_normal_list(&n->gro);
/* Some drivers may have called napi_schedule
* prior to exhausting their budget.
@@ -6970,9 +7477,14 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
work = __napi_poll(n, &do_repoll);
- if (do_repoll)
+ if (do_repoll) {
+#if defined(CONFIG_DEBUG_NET)
+ if (unlikely(!napi_is_scheduled(n)))
+ pr_crit("repoll requested for device %s %ps but napi is not scheduled.\n",
+ n->dev->name, n->poll);
+#endif
list_add_tail(&n->poll_list, repoll);
-
+ }
netpoll_poll_unlock(have);
return work;
@@ -7098,7 +7610,8 @@ start:
*/
if (unlikely(budget <= 0 ||
time_after_eq(jiffies, time_limit))) {
- sd->time_squeeze++;
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1);
break;
}
}
@@ -8771,23 +9284,20 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
dev_change_rx_flags(dev, IFF_PROMISC);
}
- if (notify)
+ if (notify) {
+ /* The ops lock is only required to ensure consistent locking
+ * for `NETDEV_CHANGE` notifiers. This function is sometimes
+ * called without the lock, even for devices that are ops
+ * locked, such as in `dev_uc_sync_multiple` when using
+ * bonding or teaming.
+ */
+ netdev_ops_assert_locked(dev);
__dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
+ }
return 0;
}
-/**
- * dev_set_promiscuity - update promiscuity count on a device
- * @dev: device
- * @inc: modifier
- *
- * Add or remove promiscuity from a device. While the count in the device
- * remains above zero the interface remains promiscuous. Once it hits zero
- * the device reverts back to normal filtering operation. A negative inc
- * value is used to drop promiscuity on the device.
- * Return 0 if successful or a negative errno code on error.
- */
-int dev_set_promiscuity(struct net_device *dev, int inc)
+int netif_set_promiscuity(struct net_device *dev, int inc)
{
unsigned int old_flags = dev->flags;
int err;
@@ -8799,9 +9309,8 @@ int dev_set_promiscuity(struct net_device *dev, int inc)
dev_set_rx_mode(dev);
return err;
}
-EXPORT_SYMBOL(dev_set_promiscuity);
-static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
+int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
unsigned int allmulti, flags;
@@ -8836,25 +9345,6 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
return 0;
}
-/**
- * dev_set_allmulti - update allmulti count on a device
- * @dev: device
- * @inc: modifier
- *
- * Add or remove reception of all multicast frames to a device. While the
- * count in the device remains above zero the interface remains listening
- * to all interfaces. Once it hits zero the device reverts back to normal
- * filtering operation. A negative @inc value is used to drop the counter
- * when releasing a resource needing all multicasts.
- * Return 0 if successful or a negative errno code on error.
- */
-
-int dev_set_allmulti(struct net_device *dev, int inc)
-{
- return __dev_set_allmulti(dev, inc, true);
-}
-EXPORT_SYMBOL(dev_set_allmulti);
-
/*
* Upload unicast and multicast address lists to device and
* configure RX filtering. When the device doesn't support unicast
@@ -8970,7 +9460,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags,
if ((flags ^ dev->gflags) & IFF_PROMISC) {
int inc = (flags & IFF_PROMISC) ? 1 : -1;
- unsigned int old_flags = dev->flags;
+ old_flags = dev->flags;
dev->gflags ^= IFF_PROMISC;
@@ -8987,7 +9477,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags,
int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
dev->gflags ^= IFF_ALLMULTI;
- __dev_set_allmulti(dev, inc, false);
+ netif_set_allmulti(dev, inc, false);
}
return ret;
@@ -9022,17 +9512,8 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
}
}
-/**
- * dev_change_flags - change device settings
- * @dev: device
- * @flags: device state flags
- * @extack: netlink extended ack
- *
- * Change settings on device based state flags. The flags are
- * in the userspace exported format.
- */
-int dev_change_flags(struct net_device *dev, unsigned int flags,
- struct netlink_ext_ack *extack)
+int netif_change_flags(struct net_device *dev, unsigned int flags,
+ struct netlink_ext_ack *extack)
{
int ret;
unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
@@ -9045,7 +9526,6 @@ int dev_change_flags(struct net_device *dev, unsigned int flags,
__dev_notify_flags(dev, old_flags, changes, 0, NULL);
return ret;
}
-EXPORT_SYMBOL(dev_change_flags);
int __dev_set_mtu(struct net_device *dev, int new_mtu)
{
@@ -9077,15 +9557,15 @@ int dev_validate_mtu(struct net_device *dev, int new_mtu,
}
/**
- * dev_set_mtu_ext - Change maximum transfer unit
+ * netif_set_mtu_ext - Change maximum transfer unit
* @dev: device
* @new_mtu: new transfer unit
* @extack: netlink extended ack
*
* Change the maximum transfer size of the network device.
*/
-int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
- struct netlink_ext_ack *extack)
+int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack)
{
int err, orig_mtu;
@@ -9123,25 +9603,20 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
return err;
}
-int dev_set_mtu(struct net_device *dev, int new_mtu)
+int netif_set_mtu(struct net_device *dev, int new_mtu)
{
struct netlink_ext_ack extack;
int err;
memset(&extack, 0, sizeof(extack));
- err = dev_set_mtu_ext(dev, new_mtu, &extack);
+ err = netif_set_mtu_ext(dev, new_mtu, &extack);
if (err && extack._msg)
net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
return err;
}
-EXPORT_SYMBOL(dev_set_mtu);
+EXPORT_SYMBOL(netif_set_mtu);
-/**
- * dev_change_tx_queue_len - Change TX queue length of a netdevice
- * @dev: device
- * @new_len: new tx queue length
- */
-int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
+int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
unsigned int orig_len = dev->tx_queue_len;
int res;
@@ -9168,12 +9643,7 @@ err_rollback:
return res;
}
-/**
- * dev_set_group - Change group this device belongs to
- * @dev: device
- * @new_group: group this device should belong to
- */
-void dev_set_group(struct net_device *dev, int new_group)
+void netif_set_group(struct net_device *dev, int new_group)
{
dev->group = new_group;
}
@@ -9199,31 +9669,23 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
}
EXPORT_SYMBOL(dev_pre_changeaddr_notify);
-/**
- * dev_set_mac_address - Change Media Access Control Address
- * @dev: device
- * @sa: new address
- * @extack: netlink extended ack
- *
- * Change the hardware (MAC) address of the device
- */
-int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
- struct netlink_ext_ack *extack)
+int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
+ struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
int err;
if (!ops->ndo_set_mac_address)
return -EOPNOTSUPP;
- if (sa->sa_family != dev->type)
+ if (ss->ss_family != dev->type)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
- err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
+ err = dev_pre_changeaddr_notify(dev, ss->__data, extack);
if (err)
return err;
- if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) {
- err = ops->ndo_set_mac_address(dev, sa);
+ if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) {
+ err = ops->ndo_set_mac_address(dev, ss);
if (err)
return err;
}
@@ -9232,22 +9694,10 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
add_device_randomness(dev->dev_addr, dev->addr_len);
return 0;
}
-EXPORT_SYMBOL(dev_set_mac_address);
DECLARE_RWSEM(dev_addr_sem);
-int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
- struct netlink_ext_ack *extack)
-{
- int ret;
-
- down_write(&dev_addr_sem);
- ret = dev_set_mac_address(dev, sa, extack);
- up_write(&dev_addr_sem);
- return ret;
-}
-EXPORT_SYMBOL(dev_set_mac_address_user);
-
+/* "sa" is a true struct sockaddr with limited "sa_data" member. */
int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
{
size_t size = sizeof(sa->sa_data_min);
@@ -9276,14 +9726,7 @@ unlock:
}
EXPORT_SYMBOL(dev_get_mac_address);
-/**
- * dev_change_carrier - Change device carrier
- * @dev: device
- * @new_carrier: new value
- *
- * Change device carrier
- */
-int dev_change_carrier(struct net_device *dev, bool new_carrier)
+int netif_change_carrier(struct net_device *dev, bool new_carrier)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -9394,13 +9837,7 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
}
EXPORT_SYMBOL(netdev_port_same_parent_id);
-/**
- * dev_change_proto_down - set carrier according to proto_down.
- *
- * @dev: device
- * @proto_down: new value
- */
-int dev_change_proto_down(struct net_device *dev, bool proto_down)
+int netif_change_proto_down(struct net_device *dev, bool proto_down)
{
if (!dev->change_proto_down)
return -EOPNOTSUPP;
@@ -9415,14 +9852,14 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
}
/**
- * dev_change_proto_down_reason - proto down reason
+ * netdev_change_proto_down_reason_locked - proto down reason
*
* @dev: device
* @mask: proto down mask
* @value: proto down value
*/
-void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
- u32 value)
+void netdev_change_proto_down_reason_locked(struct net_device *dev,
+ unsigned long mask, u32 value)
{
u32 proto_down_reason;
int b;
@@ -9499,11 +9936,31 @@ u8 dev_xdp_prog_count(struct net_device *dev)
}
EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
-int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
+u8 dev_xdp_sb_prog_count(struct net_device *dev)
+{
+ u8 count = 0; /* attached programs whose aux->xdp_has_frags is clear */
+ int i;
+
+ for (i = 0; i < __MAX_XDP_MODE; i++) /* walk every dev->xdp_state[] slot */
+ if (dev->xdp_state[i].prog &&
+ !dev->xdp_state[i].prog->aux->xdp_has_frags)
+ count++;
+ return count;
+}
+
+int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
{
if (!dev->netdev_ops->ndo_bpf)
return -EOPNOTSUPP;
+ if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ bpf->command == XDP_SETUP_PROG &&
+ bpf->prog && !bpf->prog->aux->xdp_has_frags) {
+ NL_SET_ERR_MSG(bpf->extack,
+ "unable to propagate XDP to device using tcp-data-split");
+ return -EBUSY;
+ }
+
if (dev_get_min_mp_channel_count(dev)) {
NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider");
return -EBUSY;
@@ -9511,7 +9968,7 @@ int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
return dev->netdev_ops->ndo_bpf(dev, bpf);
}
-EXPORT_SYMBOL_GPL(dev_xdp_propagate);
+EXPORT_SYMBOL_GPL(netif_xdp_propagate);
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
{
@@ -9541,6 +9998,14 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
struct netdev_bpf xdp;
int err;
+ netdev_ops_assert_locked(dev);
+
+ if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ prog && !prog->aux->xdp_has_frags) {
+ NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split");
+ return -EBUSY;
+ }
+
if (dev_get_min_mp_channel_count(dev)) {
NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider");
return -EBUSY;
@@ -9694,6 +10159,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack
NL_SET_ERR_MSG(extack, "Program bound to different device");
return -EINVAL;
}
+ if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) {
+ NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode");
+ return -EINVAL;
+ }
if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
return -EINVAL;
@@ -9763,7 +10232,9 @@ static void bpf_xdp_link_release(struct bpf_link *link)
* already NULL, in which case link was already auto-detached
*/
if (xdp_link->dev) {
+ netdev_lock_ops(xdp_link->dev);
WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
+ netdev_unlock_ops(xdp_link->dev);
xdp_link->dev = NULL;
}
@@ -9845,10 +10316,12 @@ static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
goto out_unlock;
}
+ netdev_lock_ops(xdp_link->dev);
mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
xdp_link->flags, new_prog);
+ netdev_unlock_ops(xdp_link->dev);
if (err)
goto out_unlock;
@@ -9901,7 +10374,9 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
goto unlock;
}
+ netdev_lock_ops(dev);
err = dev_xdp_attach_link(dev, &extack, link);
+ netdev_unlock_ops(dev);
rtnl_unlock();
if (err) {
@@ -9974,7 +10449,7 @@ u32 dev_get_min_mp_channel_count(const struct net_device *dev)
{
int i;
- ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
for (i = dev->real_num_rx_queues - 1; i >= 0; i--)
if (dev->_rx[i].mp_params.mp_priv)
@@ -10021,6 +10496,15 @@ static void dev_index_release(struct net *net, int ifindex)
WARN_ON(xa_erase(&net->dev_by_index, ifindex));
}
+static bool from_cleanup_net(void)
+{
+#ifdef CONFIG_NET_NS
+ return current == READ_ONCE(cleanup_net_task); /* true only when the caller is the task recorded in cleanup_net_task */
+#else /* !CONFIG_NET_NS */
+ return false; /* without CONFIG_NET_NS the comparison is compiled out */
+#endif /* CONFIG_NET_NS */
+}
+
/* Delayed registration/unregisteration */
LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
@@ -10063,6 +10547,7 @@ static void netdev_sync_lower_features(struct net_device *upper,
if (!(features & feature) && (lower->features & feature)) {
netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
&feature, lower->name);
+ netdev_lock_ops(lower);
lower->wanted_features &= ~feature;
__netdev_update_features(lower);
@@ -10071,6 +10556,7 @@ static void netdev_sync_lower_features(struct net_device *upper,
&feature, lower->name);
else
netdev_features_change(lower);
+ netdev_unlock_ops(lower);
}
}
}
@@ -10191,6 +10677,7 @@ int __netdev_update_features(struct net_device *dev)
int err = -1;
ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
features = netdev_get_wanted_features(dev);
@@ -10617,12 +11104,16 @@ int register_netdevice(struct net_device *dev)
ret = netdev_register_kobject(dev);
+ netdev_lock(dev);
WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
+ netdev_unlock(dev);
if (ret)
goto err_uninit_notify;
+ netdev_lock_ops(dev);
__netdev_update_features(dev);
+ netdev_unlock_ops(dev);
/*
* Default initial state at registry is that the
@@ -10648,7 +11139,9 @@ int register_netdevice(struct net_device *dev)
memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
/* Notify protocols, that a new device appeared. */
+ netdev_lock_ops(dev);
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
+ netdev_unlock_ops(dev);
ret = notifier_to_errno(ret);
if (ret) {
/* Expect explicit free_netdev() on failure */
@@ -10660,8 +11153,7 @@ int register_netdevice(struct net_device *dev)
* Prevent userspace races by waiting until the network
* device is fully setup before sending notifications.
*/
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
out:
@@ -10685,26 +11177,20 @@ err_free_name:
EXPORT_SYMBOL(register_netdevice);
/* Initialize the core of a dummy net device.
- * This is useful if you are calling this function after alloc_netdev(),
- * since it does not memset the net_device fields.
+ * The setup steps dummy netdevs need which normal netdevs get by going
+ * through register_netdevice().
*/
-static void init_dummy_netdev_core(struct net_device *dev)
+static void init_dummy_netdev(struct net_device *dev)
{
/* make sure we BUG if trying to hit standard
* register/unregister code path
*/
dev->reg_state = NETREG_DUMMY;
- /* NAPI wants this */
- INIT_LIST_HEAD(&dev->napi_list);
-
/* a dummy interface is started by default */
set_bit(__LINK_STATE_PRESENT, &dev->state);
set_bit(__LINK_STATE_START, &dev->state);
- /* napi_busy_loop stats accounting wants this */
- dev_net_set(dev, &init_net);
-
/* Note : We dont allocate pcpu_refcnt for dummy devices,
* because users of this 'device' dont need to change
* its refcount.
@@ -10712,28 +11198,6 @@ static void init_dummy_netdev_core(struct net_device *dev)
}
/**
- * init_dummy_netdev - init a dummy network device for NAPI
- * @dev: device to init
- *
- * This takes a network device structure and initializes the minimum
- * amount of fields so it can be used to schedule NAPI polls without
- * registering a full blown interface. This is to be used by drivers
- * that need to tie several hardware interfaces to a single NAPI
- * poll scheduler due to HW limitations.
- */
-void init_dummy_netdev(struct net_device *dev)
-{
- /* Clear everything. Note we don't initialize spinlocks
- * as they aren't supposed to be taken by any of the
- * NAPI code and this dummy netdev is supposed to be
- * only ever used for NAPI polls
- */
- memset(dev, 0, sizeof(struct net_device));
- init_dummy_netdev_core(dev);
-}
-EXPORT_SYMBOL_GPL(init_dummy_netdev);
-
-/**
* register_netdev - register a network device
* @dev: device to register
*
@@ -10748,12 +11212,16 @@ EXPORT_SYMBOL_GPL(init_dummy_netdev);
*/
int register_netdev(struct net_device *dev)
{
+ struct net *net = dev_net(dev);
int err;
- if (rtnl_lock_killable())
+ if (rtnl_net_lock_killable(net))
return -EINTR;
+
err = register_netdevice(dev);
- rtnl_unlock();
+
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(register_netdev);
@@ -10891,9 +11359,8 @@ void netdev_run_todo(void)
list_replace_init(&net_unlink_list, &unlink_list);
while (!list_empty(&unlink_list)) {
- struct net_device *dev = list_first_entry(&unlink_list,
- struct net_device,
- unlink_list);
+ dev = list_first_entry(&unlink_list, struct net_device,
+ unlink_list);
list_del_init(&dev->unlink_list);
dev->nested_level = dev->lower_level - 1;
}
@@ -10915,7 +11382,9 @@ void netdev_run_todo(void)
continue;
}
+ netdev_lock(dev);
WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
+ netdev_unlock(dev);
linkwatch_sync_dev(dev);
}
@@ -11063,6 +11532,20 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
const struct net_device_ops *ops = dev->netdev_ops;
const struct net_device_core_stats __percpu *p;
+ /*
+ * IPv{4,6} and udp tunnels share common stat helpers and use
+ * different stat type (NETDEV_PCPU_STAT_TSTATS vs
+ * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent.
+ */
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) !=
+ offsetof(struct pcpu_dstats, rx_bytes));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) !=
+ offsetof(struct pcpu_dstats, rx_packets));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) !=
+ offsetof(struct pcpu_dstats, tx_bytes));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) !=
+ offsetof(struct pcpu_dstats, tx_packets));
+
if (ops->ndo_get_stats64) {
memset(storage, 0, sizeof(*storage));
ops->ndo_get_stats64(dev, storage);
@@ -11301,6 +11784,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
if (!dev->ethtool)
goto free_all;
+ dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT);
+ if (!dev->cfg)
+ goto free_all;
+ dev->cfg_pending = dev->cfg;
+
napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
if (!dev->napi_config)
@@ -11330,6 +11818,22 @@ free_dev:
}
EXPORT_SYMBOL(alloc_netdev_mqs);
+static void netdev_napi_exit(struct net_device *dev)
+{
+ if (!list_empty(&dev->napi_list)) { /* no lock/unlock and no synchronize_net() wait when nothing is attached */
+ struct napi_struct *p, *n;
+
+ netdev_lock(dev);
+ list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) /* _safe walk: presumably the delete unlinks p - confirm */
+ __netif_napi_del_locked(p);
+ netdev_unlock(dev);
+
+ synchronize_net();
+ }
+
+ kvfree(dev->napi_config); /* freed whether or not any NAPI instance was attached */
+}
+
/**
* free_netdev - free network device
* @dev: device
@@ -11341,8 +11845,6 @@ EXPORT_SYMBOL(alloc_netdev_mqs);
*/
void free_netdev(struct net_device *dev)
{
- struct napi_struct *p, *n;
-
might_sleep();
/* When called immediately after register_netdevice() failed the unwind
@@ -11355,8 +11857,8 @@ void free_netdev(struct net_device *dev)
return;
}
- mutex_destroy(&dev->lock);
-
+ WARN_ON(dev->cfg != dev->cfg_pending);
+ kfree(dev->cfg);
kfree(dev->ethtool);
netif_free_tx_queues(dev);
netif_free_rx_queues(dev);
@@ -11366,10 +11868,9 @@ void free_netdev(struct net_device *dev)
/* Flush device addresses */
dev_addr_flush(dev);
- list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
- netif_napi_del(p);
+ netdev_napi_exit(dev);
- kvfree(dev->napi_config);
+ netif_del_cpu_rmap(dev);
ref_tracker_dir_exit(&dev->refcnt_tracker);
#ifdef CONFIG_PCPU_DEV_REFCNT
@@ -11383,6 +11884,8 @@ void free_netdev(struct net_device *dev)
netdev_free_phy_link_topology(dev);
+ mutex_destroy(&dev->lock);
+
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED ||
dev->reg_state == NETREG_DUMMY) {
@@ -11407,7 +11910,7 @@ EXPORT_SYMBOL(free_netdev);
struct net_device *alloc_netdev_dummy(int sizeof_priv)
{
return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
- init_dummy_netdev_core);
+ init_dummy_netdev);
}
EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
@@ -11420,7 +11923,7 @@ EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
void synchronize_net(void)
{
might_sleep();
- if (rtnl_is_locked())
+ if (from_cleanup_net() || rtnl_is_locked())
synchronize_rcu_expedited();
else
synchronize_rcu();
@@ -11483,6 +11986,19 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
}
EXPORT_SYMBOL(unregister_netdevice_queue);
+static void dev_memory_provider_uninstall(struct net_device *dev)
+{
+ unsigned int i;
+
+ for (i = 0; i < dev->real_num_rx_queues; i++) { /* visits dev->_rx[0 .. real_num_rx_queues - 1] */
+ struct netdev_rx_queue *rxq = &dev->_rx[i];
+ struct pp_memory_provider_params *p = &rxq->mp_params;
+
+ if (p->mp_ops && p->mp_ops->uninstall) /* queues without mp_ops, or without an ->uninstall hook, are skipped */
+ p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq); /* p points at rxq->mp_params, so this passes p->mp_priv */
+ }
+}
+
void unregister_netdevice_many_notify(struct list_head *head,
u32 portid, const struct nlmsghdr *nlh)
{
@@ -11513,15 +12029,29 @@ void unregister_netdevice_many_notify(struct list_head *head,
BUG_ON(dev->reg_state != NETREG_REGISTERED);
}
- /* If device is running, close it first. */
- list_for_each_entry(dev, head, unreg_list)
- list_add_tail(&dev->close_list, &close_head);
+ /* If device is running, close it first. Start with ops locked... */
+ list_for_each_entry(dev, head, unreg_list) {
+ if (netdev_need_ops_lock(dev)) {
+ list_add_tail(&dev->close_list, &close_head);
+ netdev_lock(dev);
+ }
+ }
+ dev_close_many(&close_head, true);
+ /* ... now unlock them and go over the rest. */
+ list_for_each_entry(dev, head, unreg_list) {
+ if (netdev_need_ops_lock(dev))
+ netdev_unlock(dev);
+ else
+ list_add_tail(&dev->close_list, &close_head);
+ }
dev_close_many(&close_head, true);
list_for_each_entry(dev, head, unreg_list) {
/* And unlink it from device chain. */
unlist_netdevice(dev);
+ netdev_lock(dev);
WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
+ netdev_unlock(dev);
}
flush_all_backlogs();
@@ -11531,11 +12061,13 @@ void unregister_netdevice_many_notify(struct list_head *head,
struct sk_buff *skb = NULL;
/* Shutdown queueing discipline. */
+ netdev_lock_ops(dev);
dev_shutdown(dev);
dev_tcx_uninstall(dev);
dev_xdp_uninstall(dev);
+ dev_memory_provider_uninstall(dev);
+ netdev_unlock_ops(dev);
bpf_dev_bound_netdev_unregister(dev);
- dev_dmabuf_uninstall(dev);
netdev_offload_xstats_disable_all(dev);
@@ -11544,8 +12076,7 @@ void unregister_netdevice_many_notify(struct list_head *head,
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
+ if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
GFP_KERNEL, NULL, 0,
portid, nlh);
@@ -11623,30 +12154,15 @@ EXPORT_SYMBOL(unregister_netdevice_many);
*/
void unregister_netdev(struct net_device *dev)
{
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
unregister_netdevice(dev);
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
}
EXPORT_SYMBOL(unregister_netdev);
-/**
- * __dev_change_net_namespace - move device to different nethost namespace
- * @dev: device
- * @net: network namespace
- * @pat: If not NULL name pattern to try if the current device name
- * is already taken in the destination network namespace.
- * @new_ifindex: If not zero, specifies device index in the target
- * namespace.
- *
- * This function shuts down a device interface and moves it
- * to a new network namespace. On success 0 is returned, on
- * a failure a netagive errno code is returned.
- *
- * Callers must hold the rtnl semaphore.
- */
-
int __dev_change_net_namespace(struct net_device *dev, struct net *net,
- const char *pat, int new_ifindex)
+ const char *pat, int new_ifindex,
+ struct netlink_ext_ack *extack)
{
struct netdev_name_node *name_node;
struct net *net_old = dev_net(dev);
@@ -11657,12 +12173,16 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
/* Don't allow namespace local devices to be moved. */
err = -EINVAL;
- if (dev->netns_local)
+ if (dev->netns_immutable) {
+ NL_SET_ERR_MSG(extack, "The interface netns is immutable");
goto out;
+ }
/* Ensure the device has been registered */
- if (dev->reg_state != NETREG_REGISTERED)
+ if (dev->reg_state != NETREG_REGISTERED) {
+ NL_SET_ERR_MSG(extack, "The interface isn't registered");
goto out;
+ }
/* Get out if there is nothing todo */
err = 0;
@@ -11675,30 +12195,49 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
err = -EEXIST;
if (netdev_name_in_use(net, dev->name)) {
/* We get here if we can't use the current device name */
- if (!pat)
+ if (!pat) {
+ NL_SET_ERR_MSG(extack,
+ "An interface with the same name exists in the target netns");
goto out;
+ }
err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "Unable to use '%s' for the new interface name in the target netns",
+ pat);
goto out;
+ }
}
/* Check that none of the altnames conflicts. */
err = -EEXIST;
- netdev_for_each_altname(dev, name_node)
- if (netdev_name_in_use(net, name_node->name))
+ netdev_for_each_altname(dev, name_node) {
+ if (netdev_name_in_use(net, name_node->name)) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "An interface with the altname %s exists in the target netns",
+ name_node->name);
goto out;
+ }
+ }
/* Check that new_ifindex isn't used yet. */
if (new_ifindex) {
err = dev_index_reserve(net, new_ifindex);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "The ifindex %d is not available in the target netns",
+ new_ifindex);
goto out;
+ }
} else {
/* If there is an ifindex conflict assign a new one */
err = dev_index_reserve(net, dev->ifindex);
if (err == -EBUSY)
err = dev_index_reserve(net, 0);
- if (err < 0)
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Unable to allocate a new ifindex in the target netns");
goto out;
+ }
new_ifindex = err;
}
@@ -11706,16 +12245,23 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
* And now a mini version of register_netdevice unregister_netdevice.
*/
+ netdev_lock_ops(dev);
/* If device is running close it first. */
- dev_close(dev);
-
+ netif_close(dev);
/* And unlink it from device chain */
unlist_netdevice(dev);
+ if (!netdev_need_ops_lock(dev))
+ netdev_lock(dev);
+ dev->moving_ns = true;
+ netdev_unlock(dev);
+
synchronize_net();
/* Shutdown queueing discipline. */
+ netdev_lock_ops(dev);
dev_shutdown(dev);
+ netdev_unlock_ops(dev);
/* Notify protocols, that we are about to destroy
* this device. They should clean all the things.
@@ -11746,7 +12292,9 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
move_netdevice_notifiers_dev_net(dev, net);
/* Actually switch the network namespace */
+ netdev_lock(dev);
dev_net_set(dev, net);
+ netdev_unlock(dev);
dev->ifindex = new_ifindex;
if (new_name[0]) {
@@ -11772,11 +12320,16 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
err = netdev_change_owner(dev, net_old, net);
WARN_ON(err);
+ netdev_lock(dev);
+ dev->moving_ns = false;
+ if (!netdev_need_ops_lock(dev))
+ netdev_unlock(dev);
+
/* Add the device back in the hashes */
list_netdevice(dev);
-
/* Notify protocols, that a new device appeared. */
call_netdevice_notifiers(NETDEV_REGISTER, dev);
+ netdev_unlock_ops(dev);
/*
* Prevent userspace races by waiting until the network
@@ -11789,7 +12342,6 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net,
out:
return err;
}
-EXPORT_SYMBOL_GPL(__dev_change_net_namespace);
static int dev_cpu_dead(unsigned int oldcpu)
{
@@ -11904,7 +12456,7 @@ static struct hlist_head * __net_init netdev_create_hash(void)
static int __net_init netdev_init(struct net *net)
{
BUILD_BUG_ON(GRO_HASH_BUCKETS >
- 8 * sizeof_field(struct napi_struct, gro_bitmask));
+ BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask));
INIT_LIST_HEAD(&net->dev_base_head);
@@ -12039,7 +12591,7 @@ static void __net_exit default_device_exit_net(struct net *net)
char fb_name[IFNAMSIZ];
/* Ignore unmoveable devices (i.e. loopback) */
- if (dev->netns_local)
+ if (dev->netns_immutable)
continue;
/* Leave virtual devices for the generic cleanup */
@@ -12171,12 +12723,19 @@ static int net_page_pool_create(int cpuid)
.nid = cpu_to_mem(cpuid),
};
struct page_pool *pp_ptr;
+ int err;
pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
if (IS_ERR(pp_ptr))
return -ENOMEM;
- per_cpu(system_page_pool, cpuid) = pp_ptr;
+ err = xdp_reg_page_pool(pp_ptr);
+ if (err) {
+ page_pool_destroy(pp_ptr);
+ return err;
+ }
+
+ per_cpu(system_page_pool.pool, cpuid) = pp_ptr;
#endif
return 0;
}
@@ -12241,12 +12800,13 @@ static int __init net_dev_init(void)
* Initialise the packet receive queues.
*/
+ flush_backlogs_fallback = flush_backlogs_alloc();
+ if (!flush_backlogs_fallback)
+ goto out;
+
for_each_possible_cpu(i) {
- struct work_struct *flush = per_cpu_ptr(&flush_works, i);
struct softnet_data *sd = &per_cpu(softnet_data, i);
- INIT_WORK(flush, flush_backlog);
-
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
@@ -12261,7 +12821,7 @@ static int __init net_dev_init(void)
INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
spin_lock_init(&sd->defer_lock);
- init_gro_hash(&sd->backlog);
+ gro_init(&sd->backlog.gro);
sd->backlog.poll = process_backlog;
sd->backlog.weight = weight_p;
INIT_LIST_HEAD(&sd->backlog.poll_list);
@@ -12305,12 +12865,13 @@ out:
for_each_possible_cpu(i) {
struct page_pool *pp_ptr;
- pp_ptr = per_cpu(system_page_pool, i);
+ pp_ptr = per_cpu(system_page_pool.pool, i);
if (!pp_ptr)
continue;
+ xdp_unreg_page_pool(pp_ptr);
page_pool_destroy(pp_ptr);
- per_cpu(system_page_pool, i) = NULL;
+ per_cpu(system_page_pool.pool, i) = NULL;
}
}
diff --git a/net/core/dev.h b/net/core/dev.h
index deb5eae5749f..e93f36b7ddf3 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -2,9 +2,11 @@
#ifndef _NET_CORE_DEV_H
#define _NET_CORE_DEV_H
+#include <linux/cleanup.h>
#include <linux/types.h>
#include <linux/rwsem.h>
#include <linux/netdevice.h>
+#include <net/netdev_lock.h>
struct net;
struct netlink_ext_ack;
@@ -13,8 +15,9 @@ struct cpumask;
/* Random bits of netdevice that don't need to be exposed */
#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */
struct sd_flow_limit {
- u64 count;
- unsigned int num_buckets;
+ struct rcu_head rcu;
+ unsigned int count;
+ u8 log_buckets;
unsigned int history_head;
u16 history[FLOW_LIMIT_HISTORY];
u8 buckets[];
@@ -22,7 +25,37 @@ struct sd_flow_limit {
extern int netdev_flow_limit_table_len;
-struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id);
+struct napi_struct *
+netdev_napi_by_id_lock(struct net *net, unsigned int napi_id);
+struct net_device *dev_get_by_napi_id(unsigned int napi_id);
+
+struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex);
+struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net);
+struct net_device *
+netdev_xa_find_lock(struct net *net, struct net_device *dev,
+ unsigned long *index);
+
+DEFINE_FREE(netdev_unlock, struct net_device *, if (_T) netdev_unlock(_T)); /* cleanup action for __free(netdev_unlock): unlocks only a non-NULL pointer */
+
+#define for_each_netdev_lock_scoped(net, var_name, ifindex) \
+ for (struct net_device *var_name __free(netdev_unlock) = NULL; \
+ (var_name = netdev_xa_find_lock(net, var_name, &ifindex)); \
+ ifindex++) /* stops when netdev_xa_find_lock() returns NULL; ifindex is written via &ifindex and ++, so pass a modifiable lvalue */
+
+struct net_device *
+netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex);
+struct net_device *
+netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev,
+ unsigned long *index);
+
+DEFINE_FREE(netdev_unlock_ops_compat, struct net_device *,
+ if (_T) netdev_unlock_ops_compat(_T)); /* cleanup action for __free(netdev_unlock_ops_compat): skips a NULL pointer */
+
+#define for_each_netdev_lock_ops_compat_scoped(net, var_name, ifindex) \
+ for (struct net_device *var_name __free(netdev_unlock_ops_compat) = NULL; \
+ (var_name = netdev_xa_find_lock_ops_compat(net, var_name, \
+ &ifindex)); \
+ ifindex++) /* stops when the lookup returns NULL; ifindex is written via &ifindex and ++, so pass a modifiable lvalue */
#ifdef CONFIG_PROC_FS
int __init dev_proc_init(void);
@@ -69,6 +102,7 @@ struct netdev_name_node {
};
int netdev_get_name(struct net *net, char *name, int ifindex);
+int netif_change_name(struct net_device *dev, const char *newname);
int dev_change_name(struct net_device *dev, const char *newname);
#define netdev_for_each_altname(dev, namenode) \
@@ -82,24 +116,28 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name);
int dev_validate_mtu(struct net_device *dev, int mtu,
struct netlink_ext_ack *extack);
-int dev_set_mtu_ext(struct net_device *dev, int mtu,
- struct netlink_ext_ack *extack);
+int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack);
int dev_get_phys_port_id(struct net_device *dev,
struct netdev_phys_item_id *ppid);
int dev_get_phys_port_name(struct net_device *dev,
char *name, size_t len);
+int netif_change_proto_down(struct net_device *dev, bool proto_down);
int dev_change_proto_down(struct net_device *dev, bool proto_down);
-void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
- u32 value);
+void netdev_change_proto_down_reason_locked(struct net_device *dev,
+ unsigned long mask, u32 value);
typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
int fd, int expected_fd, u32 flags);
+int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len);
int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len);
+void netif_set_group(struct net_device *dev, int new_group);
void dev_set_group(struct net_device *dev, int new_group);
+int netif_change_carrier(struct net_device *dev, bool new_carrier);
int dev_change_carrier(struct net_device *dev, bool new_carrier);
void __dev_set_rx_mode(struct net_device *dev);
@@ -111,6 +149,20 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
void unregister_netdevice_many_notify(struct list_head *head,
u32 portid, const struct nlmsghdr *nlh);
+static inline void netif_set_up(struct net_device *dev, bool value)
+{
+ if (value) /* keep IFF_UP in dev->flags in step with @value */
+ dev->flags |= IFF_UP;
+ else
+ dev->flags &= ~IFF_UP;
+
+ if (!netdev_need_ops_lock(dev)) /* NOTE(review): ops-locked devices are presumably already locked by the caller - confirm */
+ netdev_lock(dev);
+ dev->up = value; /* second copy of the up state, separate from the IFF_UP flag bit */
+ if (!netdev_need_ops_lock(dev)) /* same condition as above, so lock and unlock stay paired */
+ netdev_unlock(dev);
+}
+
static inline void netif_set_gso_max_size(struct net_device *dev,
unsigned int size)
{
@@ -271,6 +323,18 @@ void xdp_do_check_flushed(struct napi_struct *napi);
static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
#endif
+/* Best effort check that NAPI is not idle (can't be scheduled to run) */
+static inline void napi_assert_will_not_race(const struct napi_struct *napi)
+{
+ /* uninitialized instance (poll_list.next is still NULL), can't race */
+ if (!napi->poll_list.next)
+ return;
+
+ /* SCHED bit is set on disabled instances; warn if it is clear */
+ WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state));
+ WARN_ON(READ_ONCE(napi->list_owner) != -1); /* NOTE(review): -1 presumably means "no owning CPU" - confirm */
+}
+
void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
#define XMIT_RECURSION_LIMIT 8
@@ -311,5 +375,8 @@ static inline void dev_xmit_recursion_dec(void)
int dev_set_hwtstamp_phylib(struct net_device *dev,
struct kernel_hwtstamp_config *cfg,
struct netlink_ext_ack *extack);
+int dev_get_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg);
+int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg);
#endif
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 166e404f7c03..90716bd736f3 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -242,9 +242,9 @@ static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list,
__hw_addr_del_entry(from_list, ha, false, false);
}
-static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
- struct netdev_hw_addr_list *from_list,
- int addr_len)
+int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
{
int err = 0;
struct netdev_hw_addr *ha, *tmp;
@@ -260,6 +260,7 @@ static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
}
return err;
}
+EXPORT_SYMBOL(__hw_addr_sync_multiple);
/* This function only works where there is a strict 1-1 relationship
* between source and destination of they synch. If you ever need to
diff --git a/net/core/dev_api.c b/net/core/dev_api.c
new file mode 100644
index 000000000000..1bf0153195f2
--- /dev/null
+++ b/net/core/dev_api.c
@@ -0,0 +1,369 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/netdevice.h>
+#include <net/netdev_lock.h>
+
+#include "dev.h"
+
+/**
+ * dev_change_name() - change name of a device
+ * @dev: device
+ * @newname: name (or format string) must be at least IFNAMSIZ
+ *
+ * Change name of a device; a format string such as "eth%d" can be
+ * passed for wildcarding.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_name(struct net_device *dev, const char *newname)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_name(dev, newname);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+
+/**
+ * dev_set_alias() - change ifalias of a device
+ * @dev: device
+ * @alias: name up to IFALIASZ
+ * @len: limit of bytes to copy from @alias
+ *
+ * Set ifalias for a device.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_alias(dev, alias, len);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_alias);
+
+/**
+ * dev_change_flags() - change device settings
+ * @dev: device
+ * @flags: device state flags
+ * @extack: netlink extended ack
+ *
+ * Change settings on a device based on its state flags. The flags are
+ * in the userspace exported format.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_flags(struct net_device *dev, unsigned int flags,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_flags(dev, flags, extack);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_change_flags);
+
+/**
+ * dev_set_group() - change group this device belongs to
+ * @dev: device
+ * @new_group: group this device should belong to
+ */
+void dev_set_group(struct net_device *dev, int new_group)
+{
+ netdev_lock_ops(dev);
+ netif_set_group(dev, new_group);
+ netdev_unlock_ops(dev);
+}
+
+int dev_set_mac_address_user(struct net_device *dev,
+ struct sockaddr_storage *ss,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ down_write(&dev_addr_sem);
+ netdev_lock_ops(dev);
+ ret = netif_set_mac_address(dev, ss, extack);
+ netdev_unlock_ops(dev);
+ up_write(&dev_addr_sem);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_mac_address_user);
+
+/**
+ * dev_change_net_namespace() - move device to different network namespace
+ * @dev: device
+ * @net: network namespace
+ * @pat: If not NULL name pattern to try if the current device name
+ * is already taken in the destination network namespace.
+ *
+ * This function shuts down a device interface and moves it
+ * to a new network namespace. On success 0 is returned, on
+ * failure a negative errno code is returned.
+ *
+ * Callers must hold the rtnl semaphore.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_net_namespace(struct net_device *dev, struct net *net,
+ const char *pat)
+{
+ return __dev_change_net_namespace(dev, net, pat, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(dev_change_net_namespace);
+
+/**
+ * dev_change_carrier() - change device carrier
+ * @dev: device
+ * @new_carrier: new value
+ *
+ * Change device carrier
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_carrier(struct net_device *dev, bool new_carrier)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_carrier(dev, new_carrier);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+
+/**
+ * dev_change_tx_queue_len() - change TX queue length of a netdevice
+ * @dev: device
+ * @new_len: new tx queue length
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_tx_queue_len(dev, new_len);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+
+/**
+ * dev_change_proto_down() - set carrier according to proto_down
+ * @dev: device
+ * @proto_down: new value
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_proto_down(struct net_device *dev, bool proto_down)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_proto_down(dev, proto_down);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+
+/**
+ * dev_open() - prepare an interface for use
+ * @dev: device to open
+ * @extack: netlink extended ack
+ *
+ * Takes a device from down to up state. The device's private open
+ * function is invoked and then the multicast lists are loaded. Finally
+ * the device is moved into the up state and a %NETDEV_UP message is
+ * sent to the netdev notifier chain.
+ *
+ * Calling this function on an active interface is a nop. On a failure
+ * a negative errno code is returned.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_open(dev, extack);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_open);
+
+/**
+ * dev_close() - shutdown an interface
+ * @dev: device to shutdown
+ *
+ * This function moves an active device into down state. A
+ * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
+ * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
+ * chain.
+ */
+void dev_close(struct net_device *dev)
+{
+ netdev_lock_ops(dev);
+ netif_close(dev);
+ netdev_unlock_ops(dev);
+}
+EXPORT_SYMBOL(dev_close);
+
+int dev_eth_ioctl(struct net_device *dev,
+ struct ifreq *ifr, unsigned int cmd)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int ret = -ENODEV;
+
+ if (!ops->ndo_eth_ioctl)
+ return -EOPNOTSUPP;
+
+ netdev_lock_ops(dev);
+ if (netif_device_present(dev))
+ ret = ops->ndo_eth_ioctl(dev, ifr, cmd);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_eth_ioctl);
+
+int dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_mtu(dev, new_mtu);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_mtu);
+
+/**
+ * dev_disable_lro() - disable Large Receive Offload on a device
+ * @dev: device
+ *
+ * Disable Large Receive Offload (LRO) on a net device. Must be
+ * called under RTNL. This is needed if received packets may be
+ * forwarded to another interface.
+ */
+void dev_disable_lro(struct net_device *dev)
+{
+ netdev_lock_ops(dev);
+ netif_disable_lro(dev);
+ netdev_unlock_ops(dev);
+}
+EXPORT_SYMBOL(dev_disable_lro);
+
+/**
+ * dev_set_promiscuity() - update promiscuity count on a device
+ * @dev: device
+ * @inc: modifier
+ *
+ * Add or remove promiscuity from a device. While the count in the device
+ * remains above zero the interface remains promiscuous. Once it hits zero
+ * the device reverts back to normal filtering operation. A negative inc
+ * value is used to drop promiscuity on the device.
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_set_promiscuity(struct net_device *dev, int inc)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_promiscuity(dev, inc);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_promiscuity);
+
+/**
+ * dev_set_allmulti() - update allmulti count on a device
+ * @dev: device
+ * @inc: modifier
+ *
+ * Add or remove reception of all multicast frames to a device. While the
+ * count in the device remains above zero the interface keeps receiving
+ * all multicast frames. Once it hits zero the device reverts back to normal
+ * filtering operation. A negative @inc value is used to drop the counter
+ * when releasing a resource needing all multicasts.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+
+int dev_set_allmulti(struct net_device *dev, int inc)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_allmulti(dev, inc, true);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_allmulti);
+
+/**
+ * dev_set_mac_address() - change Media Access Control Address
+ * @dev: device
+ * @ss: new address
+ * @extack: netlink extended ack
+ *
+ * Change the hardware (MAC) address of the device
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_mac_address(dev, ss, extack);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_mac_address);
+
+int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_xdp_propagate(dev, bpf);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_xdp_propagate);
+
+/**
+ * netdev_state_change() - device changes state
+ * @dev: device to cause notification
+ *
+ * Called to indicate a device has changed state. This function calls
+ * the notifier chains for netdev_chain and sends a NEWLINK message
+ * to the routing socket.
+ */
+void netdev_state_change(struct net_device *dev)
+{
+ netdev_lock_ops(dev);
+ netif_state_change(dev);
+ netdev_unlock_ops(dev);
+}
+EXPORT_SYMBOL(netdev_state_change);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 46d43b950471..616479e71466 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -6,9 +6,11 @@
#include <linux/rtnetlink.h>
#include <linux/net_tstamp.h>
#include <linux/phylib_stubs.h>
+#include <linux/ptp_clock_kernel.h>
#include <linux/wireless.h>
#include <linux/if_bridge.h>
#include <net/dsa_stubs.h>
+#include <net/netdev_lock.h>
#include <net/wext.h>
#include "dev.h"
@@ -109,7 +111,7 @@ static int dev_getifmap(struct net_device *dev, struct ifreq *ifr)
return 0;
}
-static int dev_setifmap(struct net_device *dev, struct ifreq *ifr)
+static int netif_setifmap(struct net_device *dev, struct ifreq *ifr)
{
struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map;
@@ -184,7 +186,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
return err;
}
-static int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg)
+int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg)
{
enum hwtstamp_tx_types tx_type;
enum hwtstamp_rx_filters rx_filter;
@@ -239,20 +241,6 @@ static int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg)
return 0;
}
-static int dev_eth_ioctl(struct net_device *dev,
- struct ifreq *ifr, unsigned int cmd)
-{
- const struct net_device_ops *ops = dev->netdev_ops;
-
- if (!ops->ndo_eth_ioctl)
- return -EOPNOTSUPP;
-
- if (!netif_device_present(dev))
- return -ENODEV;
-
- return ops->ndo_eth_ioctl(dev, ifr, cmd);
-}
-
/**
* dev_get_hwtstamp_phylib() - Get hardware timestamping settings of NIC
* or of attached phylib PHY
@@ -266,9 +254,24 @@ static int dev_eth_ioctl(struct net_device *dev,
* -EOPNOTSUPP for phylib for now, which is still more accurate than letting
* the netdev handle the GET request.
*/
-static int dev_get_hwtstamp_phylib(struct net_device *dev,
- struct kernel_hwtstamp_config *cfg)
+int dev_get_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg)
{
+ struct hwtstamp_provider *hwprov;
+
+ hwprov = rtnl_dereference(dev->hwprov);
+ if (hwprov) {
+ cfg->qualifier = hwprov->desc.qualifier;
+ if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB &&
+ hwprov->phydev)
+ return phy_hwtstamp_get(hwprov->phydev, cfg);
+
+ if (hwprov->source == HWTSTAMP_SOURCE_NETDEV)
+ return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg);
+
+ return -EOPNOTSUPP;
+ }
+
if (phy_is_default_hwtstamp(dev->phydev))
return phy_hwtstamp_get(dev->phydev, cfg);
@@ -289,7 +292,9 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
return -ENODEV;
kernel_cfg.ifr = ifr;
+ netdev_lock_ops(dev);
err = dev_get_hwtstamp_phylib(dev, &kernel_cfg);
+ netdev_unlock_ops(dev);
if (err)
return err;
@@ -324,11 +329,32 @@ int dev_set_hwtstamp_phylib(struct net_device *dev,
struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
- bool phy_ts = phy_is_default_hwtstamp(dev->phydev);
struct kernel_hwtstamp_config old_cfg = {};
+ struct hwtstamp_provider *hwprov;
+ struct phy_device *phydev;
bool changed = false;
+ bool phy_ts;
int err;
+ hwprov = rtnl_dereference(dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB &&
+ hwprov->phydev) {
+ phy_ts = true;
+ phydev = hwprov->phydev;
+ } else if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) {
+ phy_ts = false;
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ cfg->qualifier = hwprov->desc.qualifier;
+ } else {
+ phy_ts = phy_is_default_hwtstamp(dev->phydev);
+ if (phy_ts)
+ phydev = dev->phydev;
+ }
+
cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV;
if (phy_ts && dev->see_all_hwtstamp_requests) {
@@ -350,7 +376,7 @@ int dev_set_hwtstamp_phylib(struct net_device *dev,
changed = kernel_hwtstamp_config_changed(&old_cfg, cfg);
if (phy_ts) {
- err = phy_hwtstamp_set(dev->phydev, cfg, extack);
+ err = phy_hwtstamp_set(phydev, cfg, extack);
if (err) {
if (changed)
ops->ndo_hwtstamp_set(dev, &old_cfg, NULL);
@@ -392,7 +418,9 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
if (!netif_device_present(dev))
return -ENODEV;
+ netdev_lock_ops(dev);
err = dev_set_hwtstamp_phylib(dev, &kernel_cfg, &extack);
+ netdev_unlock_ops(dev);
if (err)
return err;
@@ -467,10 +495,14 @@ static int dev_siocbond(struct net_device *dev,
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_siocbond) {
+ int ret = -ENODEV;
+
+ netdev_lock_ops(dev);
if (netif_device_present(dev))
- return ops->ndo_siocbond(dev, ifr, cmd);
- else
- return -ENODEV;
+ ret = ops->ndo_siocbond(dev, ifr, cmd);
+ netdev_unlock_ops(dev);
+
+ return ret;
}
return -EOPNOTSUPP;
@@ -482,10 +514,14 @@ static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_siocdevprivate) {
+ int ret = -ENODEV;
+
+ netdev_lock_ops(dev);
if (netif_device_present(dev))
- return ops->ndo_siocdevprivate(dev, ifr, data, cmd);
- else
- return -ENODEV;
+ ret = ops->ndo_siocdevprivate(dev, ifr, data, cmd);
+ netdev_unlock_ops(dev);
+
+ return ret;
}
return -EOPNOTSUPP;
@@ -496,17 +532,21 @@ static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs)
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_siocwandev) {
+ int ret = -ENODEV;
+
+ netdev_lock_ops(dev);
if (netif_device_present(dev))
- return ops->ndo_siocwandev(dev, ifs);
- else
- return -ENODEV;
+ ret = ops->ndo_siocwandev(dev, ifs);
+ netdev_unlock_ops(dev);
+
+ return ret;
}
return -EOPNOTSUPP;
}
/*
- * Perform the SIOCxIFxxx calls, inside rtnl_lock()
+ * Perform the SIOCxIFxxx calls, inside rtnl_net_lock()
*/
static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
unsigned int cmd)
@@ -514,7 +554,6 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
int err;
struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
const struct net_device_ops *ops;
- netdevice_tracker dev_tracker;
if (!dev)
return -ENODEV;
@@ -533,9 +572,11 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
return dev_set_mtu(dev, ifr->ifr_mtu);
case SIOCSIFHWADDR:
- if (dev->addr_len > sizeof(struct sockaddr))
+ if (dev->addr_len > sizeof(ifr->ifr_hwaddr))
return -EINVAL;
- return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL);
+ return dev_set_mac_address_user(dev,
+ (struct sockaddr_storage *)&ifr->ifr_hwaddr,
+ NULL);
case SIOCSIFHWBROADCAST:
if (ifr->ifr_hwaddr.sa_family != dev->type)
@@ -543,11 +584,16 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
min(sizeof(ifr->ifr_hwaddr.sa_data_min),
(size_t)dev->addr_len));
+ netdev_lock_ops(dev);
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ netdev_unlock_ops(dev);
return 0;
case SIOCSIFMAP:
- return dev_setifmap(dev, ifr);
+ netdev_lock_ops(dev);
+ err = netif_setifmap(dev, ifr);
+ netdev_unlock_ops(dev);
+ return err;
case SIOCADDMULTI:
if (!ops->ndo_set_rx_mode ||
@@ -555,7 +601,10 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
- return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
+ netdev_lock_ops(dev);
+ err = dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
+ netdev_unlock_ops(dev);
+ return err;
case SIOCDELMULTI:
if (!ops->ndo_set_rx_mode ||
@@ -563,7 +612,10 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
- return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
+ netdev_lock_ops(dev);
+ err = dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
+ netdev_unlock_ops(dev);
+ return err;
case SIOCSIFTXQLEN:
if (ifr->ifr_qlen < 0)
@@ -577,19 +629,6 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
case SIOCWANDEV:
return dev_siocwandev(dev, &ifr->ifr_settings);
- case SIOCBRADDIF:
- case SIOCBRDELIF:
- if (!netif_device_present(dev))
- return -ENODEV;
- if (!netif_is_bridge_master(dev))
- return -EOPNOTSUPP;
- netdev_hold(dev, &dev_tracker, GFP_KERNEL);
- rtnl_unlock();
- err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL);
- netdev_put(dev, &dev_tracker);
- rtnl_lock();
- return err;
-
case SIOCDEVPRIVATE ... SIOCDEVPRIVATE + 15:
return dev_siocdevprivate(dev, ifr, data, cmd);
@@ -733,9 +772,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
dev_load(net, ifr->ifr_name);
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- rtnl_lock();
+
+ rtnl_net_lock(net);
ret = dev_ifsioc(net, ifr, data, cmd);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
if (colon)
*colon = ':';
return ret;
@@ -770,8 +811,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
case SIOCBONDRELEASE:
case SIOCBONDSETHWADDR:
case SIOCBONDCHANGEACTIVE:
- case SIOCBRADDIF:
- case SIOCBRDELIF:
case SIOCSHWTSTAMP:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
@@ -779,9 +818,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
case SIOCBONDSLAVEINFOQUERY:
case SIOCBONDINFOQUERY:
dev_load(net, ifr->ifr_name);
- rtnl_lock();
+
+ rtnl_net_lock(net);
ret = dev_ifsioc(net, ifr, data, cmd);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
if (need_copyout)
*need_copyout = false;
return ret;
@@ -804,9 +845,10 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
(cmd >= SIOCDEVPRIVATE &&
cmd <= SIOCDEVPRIVATE + 15)) {
dev_load(net, ifr->ifr_name);
- rtnl_lock();
+
+ rtnl_net_lock(net);
ret = dev_ifsioc(net, ifr, data, cmd);
- rtnl_unlock();
+ rtnl_net_unlock(net);
return ret;
}
return -ENOTTY;
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 11b91c12ee11..b3a62ca0df65 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -15,6 +15,8 @@
#include <net/netdev_queues.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
+#include <net/sock.h>
#include <trace/events/page_pool.h>
#include "devmem.h"
@@ -23,29 +25,38 @@
/* Device memory support */
-/* Protected by rtnl_lock() */
static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1);
+static const struct memory_provider_ops dmabuf_devmem_ops;
+
+bool net_is_devmem_iov(struct net_iov *niov)
+{
+ return niov->type == NET_IOV_DMABUF;
+}
+
static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
struct gen_pool_chunk *chunk,
void *not_used)
{
struct dmabuf_genpool_chunk_owner *owner = chunk->owner;
- kvfree(owner->niovs);
+ kvfree(owner->area.niovs);
kfree(owner);
}
static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
{
- struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov);
+ struct dmabuf_genpool_chunk_owner *owner;
+ owner = net_devmem_iov_to_chunk_owner(niov);
return owner->base_dma_addr +
((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
}
-void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
+void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
{
+ struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w);
+
size_t size, avail;
gen_pool_for_each_chunk(binding->chunk_pool,
@@ -63,8 +74,10 @@ void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
dma_buf_detach(binding->dmabuf, binding->attachment);
dma_buf_put(binding->dmabuf);
xa_destroy(&binding->bound_rxqs);
+ kvfree(binding->tx_vec);
kfree(binding);
}
+EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free);
struct net_iov *
net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
@@ -82,7 +95,7 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
offset = dma_addr - owner->base_dma_addr;
index = offset / PAGE_SIZE;
- niov = &owner->niovs[index];
+ niov = &owner->area.niovs[index];
niov->pp_magic = 0;
niov->pp = NULL;
@@ -93,7 +106,7 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
void net_devmem_free_dmabuf(struct net_iov *niov)
{
- struct net_devmem_dmabuf_binding *binding = net_iov_binding(niov);
+ struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov);
unsigned long dma_addr = net_devmem_get_dma_addr(niov);
if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr,
@@ -109,21 +122,27 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
unsigned long xa_idx;
unsigned int rxq_idx;
+ xa_erase(&net_devmem_dmabuf_bindings, binding->id);
+
+ /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the
+ * erase.
+ */
+ synchronize_net();
+
if (binding->list.next)
list_del(&binding->list);
xa_for_each(&binding->bound_rxqs, xa_idx, rxq) {
- WARN_ON(rxq->mp_params.mp_priv != binding);
-
- rxq->mp_params.mp_priv = NULL;
+ const struct pp_memory_provider_params mp_params = {
+ .mp_priv = binding,
+ .mp_ops = &dmabuf_devmem_ops,
+ };
rxq_idx = get_netdev_rx_queue_index(rxq);
- WARN_ON(netdev_rx_queue_restart(binding->dev, rxq_idx));
+ __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params);
}
- xa_erase(&net_devmem_dmabuf_bindings, binding->id);
-
net_devmem_dmabuf_binding_put(binding);
}
@@ -131,50 +150,35 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
struct net_devmem_dmabuf_binding *binding,
struct netlink_ext_ack *extack)
{
+ struct pp_memory_provider_params mp_params = {
+ .mp_priv = binding,
+ .mp_ops = &dmabuf_devmem_ops,
+ };
struct netdev_rx_queue *rxq;
u32 xa_idx;
int err;
- if (rxq_idx >= dev->real_num_rx_queues) {
- NL_SET_ERR_MSG(extack, "rx queue index out of range");
- return -ERANGE;
- }
+ err = __net_mp_open_rxq(dev, rxq_idx, &mp_params, extack);
+ if (err)
+ return err;
rxq = __netif_get_rx_queue(dev, rxq_idx);
- if (rxq->mp_params.mp_priv) {
- NL_SET_ERR_MSG(extack, "designated queue already memory provider bound");
- return -EEXIST;
- }
-
-#ifdef CONFIG_XDP_SOCKETS
- if (rxq->pool) {
- NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP");
- return -EBUSY;
- }
-#endif
-
err = xa_alloc(&binding->bound_rxqs, &xa_idx, rxq, xa_limit_32b,
GFP_KERNEL);
if (err)
- return err;
-
- rxq->mp_params.mp_priv = binding;
-
- err = netdev_rx_queue_restart(dev, rxq_idx);
- if (err)
- goto err_xa_erase;
+ goto err_close_rxq;
return 0;
-err_xa_erase:
- rxq->mp_params.mp_priv = NULL;
- xa_erase(&binding->bound_rxqs, xa_idx);
-
+err_close_rxq:
+ __net_mp_close_rxq(dev, rxq_idx, &mp_params);
return err;
}
struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
+net_devmem_bind_dmabuf(struct net_device *dev,
+ enum dma_data_direction direction,
+ unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
struct netlink_ext_ack *extack)
{
struct net_devmem_dmabuf_binding *binding;
@@ -197,43 +201,48 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
}
binding->dev = dev;
-
- err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
- binding, xa_limit_32b, &id_alloc_next,
- GFP_KERNEL);
- if (err < 0)
- goto err_free_binding;
-
xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
refcount_set(&binding->ref, 1);
+ mutex_init(&binding->lock);
+
binding->dmabuf = dmabuf;
binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent);
if (IS_ERR(binding->attachment)) {
err = PTR_ERR(binding->attachment);
NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
- goto err_free_id;
+ goto err_free_binding;
}
binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment,
- DMA_FROM_DEVICE);
+ direction);
if (IS_ERR(binding->sgt)) {
err = PTR_ERR(binding->sgt);
NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment");
goto err_detach;
}
+ if (direction == DMA_TO_DEVICE) {
+ binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
+ sizeof(struct net_iov *),
+ GFP_KERNEL);
+ if (!binding->tx_vec) {
+ err = -ENOMEM;
+ goto err_unmap;
+ }
+ }
+
/* For simplicity we expect to make PAGE_SIZE allocations, but the
* binding can be much more flexible than that. We may be able to
* allocate MTU sized chunks here. Leave that for future work...
*/
- binding->chunk_pool =
- gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev));
+ binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
+ dev_to_node(&dev->dev));
if (!binding->chunk_pool) {
err = -ENOMEM;
- goto err_unmap;
+ goto err_tx_vec;
}
virtual = 0;
@@ -250,9 +259,9 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
goto err_free_chunks;
}
- owner->base_virtual = virtual;
+ owner->area.base_virtual = virtual;
owner->base_dma_addr = dma_addr;
- owner->num_niovs = len / PAGE_SIZE;
+ owner->area.num_niovs = len / PAGE_SIZE;
owner->binding = binding;
err = gen_pool_add_owner(binding->chunk_pool, dma_addr,
@@ -264,37 +273,48 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
goto err_free_chunks;
}
- owner->niovs = kvmalloc_array(owner->num_niovs,
- sizeof(*owner->niovs),
- GFP_KERNEL);
- if (!owner->niovs) {
+ owner->area.niovs = kvmalloc_array(owner->area.num_niovs,
+ sizeof(*owner->area.niovs),
+ GFP_KERNEL);
+ if (!owner->area.niovs) {
err = -ENOMEM;
goto err_free_chunks;
}
- for (i = 0; i < owner->num_niovs; i++) {
- niov = &owner->niovs[i];
- niov->owner = owner;
+ for (i = 0; i < owner->area.num_niovs; i++) {
+ niov = &owner->area.niovs[i];
+ niov->type = NET_IOV_DMABUF;
+ niov->owner = &owner->area;
page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
net_devmem_get_dma_addr(niov));
+ if (direction == DMA_TO_DEVICE)
+ binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
}
virtual += len;
}
+ err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
+ binding, xa_limit_32b, &id_alloc_next,
+ GFP_KERNEL);
+ if (err < 0)
+ goto err_free_chunks;
+
+ list_add(&binding->list, &priv->bindings);
+
return binding;
err_free_chunks:
gen_pool_for_each_chunk(binding->chunk_pool,
net_devmem_dmabuf_free_chunk_owner, NULL);
gen_pool_destroy(binding->chunk_pool);
+err_tx_vec:
+ kvfree(binding->tx_vec);
err_unmap:
dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
DMA_FROM_DEVICE);
err_detach:
dma_buf_detach(dmabuf, binding->attachment);
-err_free_id:
- xa_erase(&net_devmem_dmabuf_bindings, binding->id);
err_free_binding:
kfree(binding);
err_put_dmabuf:
@@ -302,24 +322,72 @@ err_put_dmabuf:
return ERR_PTR(err);
}
-void dev_dmabuf_uninstall(struct net_device *dev)
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
{
struct net_devmem_dmabuf_binding *binding;
- struct netdev_rx_queue *rxq;
- unsigned long xa_idx;
- unsigned int i;
- for (i = 0; i < dev->real_num_rx_queues; i++) {
- binding = dev->_rx[i].mp_params.mp_priv;
- if (!binding)
- continue;
+ rcu_read_lock();
+ binding = xa_load(&net_devmem_dmabuf_bindings, id);
+ if (binding) {
+ if (!net_devmem_dmabuf_binding_get(binding))
+ binding = NULL;
+ }
+ rcu_read_unlock();
- xa_for_each(&binding->bound_rxqs, xa_idx, rxq)
- if (rxq == &dev->_rx[i]) {
- xa_erase(&binding->bound_rxqs, xa_idx);
- break;
- }
+ return binding;
+}
+
+void net_devmem_get_net_iov(struct net_iov *niov)
+{
+ net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov));
+}
+
+void net_devmem_put_net_iov(struct net_iov *niov)
+{
+ net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov));
+}
+
+struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
+ unsigned int dmabuf_id)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ struct dst_entry *dst = __sk_dst_get(sk);
+ int err = 0;
+
+ binding = net_devmem_lookup_dmabuf(dmabuf_id);
+ if (!binding || !binding->tx_vec) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ /* The dma-addrs in this binding are only reachable by the corresponding
+ * net_device.
+ */
+ if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) {
+ err = -ENODEV;
+ goto out_err;
}
+
+ return binding;
+
+out_err:
+ if (binding)
+ net_devmem_dmabuf_binding_put(binding);
+
+ return ERR_PTR(err);
+}
+
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding,
+ size_t virt_addr, size_t *off, size_t *size)
+{
+ if (virt_addr >= binding->dmabuf->size)
+ return NULL;
+
+ *off = virt_addr % PAGE_SIZE;
+ *size = PAGE_SIZE - *off;
+
+ return binding->tx_vec[virt_addr / PAGE_SIZE];
}
/*** "Dmabuf devmem memory provider" ***/
@@ -331,11 +399,11 @@ int mp_dmabuf_devmem_init(struct page_pool *pool)
if (!binding)
return -EINVAL;
- if (!pool->dma_map)
- return -EOPNOTSUPP;
-
- if (pool->dma_sync)
- return -EOPNOTSUPP;
+ /* dma-buf dma addresses do not need and should not be used with
+ * dma_sync_for_cpu/device. Force disable dma_sync.
+ */
+ pool->dma_sync = false;
+ pool->dma_sync_for_cpu = false;
if (pool->p.order != 0)
return -E2BIG;
@@ -387,3 +455,41 @@ bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem)
/* We don't want the page pool put_page()ing our net_iovs. */
return false;
}
+
+static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp,
+ struct netdev_rx_queue *rxq)
+{
+ const struct net_devmem_dmabuf_binding *binding = mp_priv;
+ int type = rxq ? NETDEV_A_QUEUE_DMABUF : NETDEV_A_PAGE_POOL_DMABUF;
+
+ return nla_put_u32(rsp, type, binding->id);
+}
+
+static void mp_dmabuf_devmem_uninstall(void *mp_priv,
+ struct netdev_rx_queue *rxq)
+{
+ struct net_devmem_dmabuf_binding *binding = mp_priv;
+ struct netdev_rx_queue *bound_rxq;
+ unsigned long xa_idx;
+
+ xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) {
+ if (bound_rxq == rxq) {
+ xa_erase(&binding->bound_rxqs, xa_idx);
+ if (xa_empty(&binding->bound_rxqs)) {
+ mutex_lock(&binding->lock);
+ binding->dev = NULL;
+ mutex_unlock(&binding->lock);
+ }
+ break;
+ }
+ }
+}
+
+static const struct memory_provider_ops dmabuf_devmem_ops = {
+ .init = mp_dmabuf_devmem_init,
+ .destroy = mp_dmabuf_devmem_destroy,
+ .alloc_netmems = mp_dmabuf_devmem_alloc_netmems,
+ .release_netmem = mp_dmabuf_devmem_release_page,
+ .nl_fill = mp_dmabuf_devmem_nl_fill,
+ .uninstall = mp_dmabuf_devmem_uninstall,
+};
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 76099ef9c482..0a3b28ba5c13 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -10,6 +10,9 @@
#ifndef _NET_DEVMEM_H
#define _NET_DEVMEM_H
+#include <net/netmem.h>
+#include <net/netdev_netlink.h>
+
struct netlink_ext_ack;
struct net_devmem_dmabuf_binding {
@@ -18,15 +21,25 @@ struct net_devmem_dmabuf_binding {
struct sg_table *sgt;
struct net_device *dev;
struct gen_pool *chunk_pool;
+ /* Protect dev */
+ struct mutex lock;
/* The user holds a ref (via the netlink API) for as long as they want
* the binding to remain alive. Each page pool using this binding holds
- * a ref to keep the binding alive. Each allocated net_iov holds a
- * ref.
+ * a ref to keep the binding alive. The page_pool does not release the
+ * ref until all the net_iovs allocated from this binding are released
+ * back to the page_pool.
*
* The binding undos itself and unmaps the underlying dmabuf once all
* those refs are dropped and the binding is no longer desired or in
* use.
+ *
+ * net_devmem_get_net_iov() on dmabuf net_iovs will increment this
+ * reference, making sure that the binding remains alive until all the
+ * net_iovs are no longer used. net_iovs allocated from this binding
+ * that are stuck in the TX path for any reason (such as awaiting
+ * retransmits) hold a reference to the binding until the skb holding
+ * them is freed.
*/
refcount_t ref;
@@ -42,6 +55,14 @@ struct net_devmem_dmabuf_binding {
* active.
*/
u32 id;
+
+ /* Array of net_iov pointers for this binding, sorted by virtual
+ * address. This array is convenient to map the virtual addresses to
+ * net_iovs in the TX path.
+ */
+ struct net_iov **tx_vec;
+
+ struct work_struct unbind_w;
};
#if defined(CONFIG_NET_DEVMEM)
@@ -51,63 +72,57 @@ struct net_devmem_dmabuf_binding {
* allocations from this chunk.
*/
struct dmabuf_genpool_chunk_owner {
- /* Offset into the dma-buf where this chunk starts. */
- unsigned long base_virtual;
+ struct net_iov_area area;
+ struct net_devmem_dmabuf_binding *binding;
/* dma_addr of the start of the chunk. */
dma_addr_t base_dma_addr;
-
- /* Array of net_iovs for this chunk. */
- struct net_iov *niovs;
- size_t num_niovs;
-
- struct net_devmem_dmabuf_binding *binding;
};
-void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding);
+void __net_devmem_dmabuf_binding_free(struct work_struct *wq);
struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
+net_devmem_bind_dmabuf(struct net_device *dev,
+ enum dma_data_direction direction,
+ unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
struct netlink_ext_ack *extack);
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id);
void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding);
int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
struct net_devmem_dmabuf_binding *binding,
struct netlink_ext_ack *extack);
-void dev_dmabuf_uninstall(struct net_device *dev);
+void net_devmem_bind_tx_release(struct sock *sk);
static inline struct dmabuf_genpool_chunk_owner *
-net_iov_owner(const struct net_iov *niov)
+net_devmem_iov_to_chunk_owner(const struct net_iov *niov)
{
- return niov->owner;
+ struct net_iov_area *owner = net_iov_owner(niov);
+
+ return container_of(owner, struct dmabuf_genpool_chunk_owner, area);
}
-static inline unsigned int net_iov_idx(const struct net_iov *niov)
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_iov_binding(const struct net_iov *niov)
{
- return niov - net_iov_owner(niov)->niovs;
+ return net_devmem_iov_to_chunk_owner(niov)->binding;
}
-static inline struct net_devmem_dmabuf_binding *
-net_iov_binding(const struct net_iov *niov)
+static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
{
- return net_iov_owner(niov)->binding;
+ return net_devmem_iov_binding(niov)->id;
}
static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
{
- struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov);
+ struct net_iov_area *owner = net_iov_owner(niov);
return owner->base_virtual +
((unsigned long)net_iov_idx(niov) << PAGE_SHIFT);
}
-static inline u32 net_iov_binding_id(const struct net_iov *niov)
-{
- return net_iov_owner(niov)->binding->id;
-}
-
-static inline void
+static inline bool
net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding)
{
- refcount_inc(&binding->ref);
+ return refcount_inc_not_zero(&binding->ref);
}
static inline void
@@ -116,28 +131,59 @@ net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
if (!refcount_dec_and_test(&binding->ref))
return;
- __net_devmem_dmabuf_binding_free(binding);
+ INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free);
+ schedule_work(&binding->unbind_w);
}
+void net_devmem_get_net_iov(struct net_iov *niov);
+void net_devmem_put_net_iov(struct net_iov *niov);
+
struct net_iov *
net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding);
void net_devmem_free_dmabuf(struct net_iov *ppiov);
+bool net_is_devmem_iov(struct net_iov *niov);
+struct net_devmem_dmabuf_binding *
+net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id);
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
+ size_t *off, size_t *size);
+
#else
struct net_devmem_dmabuf_binding;
static inline void
-__net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding)
+net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
+{
+}
+
+static inline void net_devmem_get_net_iov(struct net_iov *niov)
+{
+}
+
+static inline void net_devmem_put_net_iov(struct net_iov *niov)
+{
+}
+
+static inline void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
{
}
static inline struct net_devmem_dmabuf_binding *
-net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd,
+net_devmem_bind_dmabuf(struct net_device *dev,
+ enum dma_data_direction direction,
+ unsigned int dmabuf_fd,
+ struct netdev_nl_sock *priv,
struct netlink_ext_ack *extack)
{
return ERR_PTR(-EOPNOTSUPP);
}
+static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
+{
+ return NULL;
+}
+
static inline void
net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
{
@@ -152,10 +198,6 @@ net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
return -EOPNOTSUPP;
}
-static inline void dev_dmabuf_uninstall(struct net_device *dev)
-{
-}
-
static inline struct net_iov *
net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
{
@@ -171,10 +213,34 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
return 0;
}
-static inline u32 net_iov_binding_id(const struct net_iov *niov)
+static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
{
return 0;
}
+
+static inline bool net_is_devmem_iov(struct net_iov *niov)
+{
+ return false;
+}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
+ size_t *off, size_t *size)
+{
+ return NULL;
+}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_iov_binding(const struct net_iov *niov)
+{
+ return NULL;
+}
#endif
#endif /* _NET_DEVMEM_H */
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 6efd4cccc9dd..8a7ce640f74d 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -1088,7 +1088,7 @@ err_module_put:
struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
struct sk_buff *skb;
- del_timer_sync(&hw_data->send_timer);
+ timer_delete_sync(&hw_data->send_timer);
cancel_work_sync(&hw_data->dm_alert_work);
while ((skb = __skb_dequeue(&hw_data->drop_queue))) {
struct devlink_trap_metadata *hw_metadata;
@@ -1122,7 +1122,7 @@ static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack)
struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
struct sk_buff *skb;
- del_timer_sync(&hw_data->send_timer);
+ timer_delete_sync(&hw_data->send_timer);
cancel_work_sync(&hw_data->dm_alert_work);
while ((skb = __skb_dequeue(&hw_data->drop_queue))) {
struct devlink_trap_metadata *hw_metadata;
@@ -1183,7 +1183,7 @@ err_module_put:
struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
struct sk_buff *skb;
- del_timer_sync(&data->send_timer);
+ timer_delete_sync(&data->send_timer);
cancel_work_sync(&data->dm_alert_work);
while ((skb = __skb_dequeue(&data->drop_queue)))
consume_skb(skb);
@@ -1211,7 +1211,7 @@ static void net_dm_trace_off_set(void)
struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
struct sk_buff *skb;
- del_timer_sync(&data->send_timer);
+ timer_delete_sync(&data->send_timer);
cancel_work_sync(&data->dm_alert_work);
while ((skb = __skb_dequeue(&data->drop_queue)))
consume_skb(skb);
@@ -1734,30 +1734,30 @@ static int __init init_net_drop_monitor(void)
return -ENOSPC;
}
- rc = genl_register_family(&net_drop_monitor_family);
- if (rc) {
- pr_err("Could not create drop monitor netlink family\n");
- return rc;
+ for_each_possible_cpu(cpu) {
+ net_dm_cpu_data_init(cpu);
+ net_dm_hw_cpu_data_init(cpu);
}
- WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);
rc = register_netdevice_notifier(&dropmon_net_notifier);
if (rc < 0) {
pr_crit("Failed to register netdevice notifier\n");
+ return rc;
+ }
+
+ rc = genl_register_family(&net_drop_monitor_family);
+ if (rc) {
+ pr_err("Could not create drop monitor netlink family\n");
goto out_unreg;
}
+ WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);
rc = 0;
- for_each_possible_cpu(cpu) {
- net_dm_cpu_data_init(cpu);
- net_dm_hw_cpu_data_init(cpu);
- }
-
goto out;
out_unreg:
- genl_unregister_family(&net_drop_monitor_family);
+ WARN_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
out:
return rc;
}
@@ -1766,19 +1766,18 @@ static void exit_net_drop_monitor(void)
{
int cpu;
- BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
-
/*
* Because of the module_get/put we do in the trace state change path
* we are guaranteed not to have any current users when we get here
*/
+ BUG_ON(genl_unregister_family(&net_drop_monitor_family));
+
+ BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
for_each_possible_cpu(cpu) {
net_dm_hw_cpu_data_fini(cpu);
net_dm_cpu_data_fini(cpu);
}
-
- BUG_ON(genl_unregister_family(&net_drop_monitor_family));
}
module_init(init_net_drop_monitor);
diff --git a/net/core/dst.c b/net/core/dst.c
index 9552a90d4772..795ca07e28a4 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -165,6 +165,14 @@ static void dst_count_dec(struct dst_entry *dst)
void dst_release(struct dst_entry *dst)
{
if (dst && rcuref_put(&dst->__rcuref)) {
+#ifdef CONFIG_DST_CACHE
+ if (dst->flags & DST_METADATA) {
+ struct metadata_dst *md_dst = (struct metadata_dst *)dst;
+
+ if (md_dst->type == METADATA_IP_TUNNEL)
+ dst_cache_reset_now(&md_dst->u.tun_info.dst_cache);
+ }
+#endif
dst_count_dec(dst);
call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
}
@@ -286,7 +294,8 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
{
struct metadata_dst *md_dst;
- md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);
+ md_dst = kmalloc(struct_size(md_dst, u.tun_info.options, optslen),
+ flags);
if (!md_dst)
return NULL;
@@ -314,7 +323,8 @@ metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
int cpu;
struct metadata_dst __percpu *md_dst;
- md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen,
+ md_dst = __alloc_percpu_gfp(struct_size(md_dst, u.tun_info.options,
+ optslen),
__alignof__(struct metadata_dst), flags);
if (!md_dst)
return NULL;
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 70c634b9e7b0..93a04d18e505 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -17,6 +17,7 @@
struct dst_cache_pcpu {
unsigned long refresh_ts;
struct dst_entry *dst;
+ local_lock_t bh_lock;
u32 cookie;
union {
struct in_addr in_saddr;
@@ -65,10 +66,15 @@ fail:
struct dst_entry *dst_cache_get(struct dst_cache *dst_cache)
{
+ struct dst_entry *dst;
+
if (!dst_cache->cache)
return NULL;
- return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+ dst = dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
+ return dst;
}
EXPORT_SYMBOL_GPL(dst_cache_get);
@@ -80,12 +86,16 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr)
if (!dst_cache->cache)
return NULL;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
idst = this_cpu_ptr(dst_cache->cache);
dst = dst_cache_per_cpu_get(dst_cache, idst);
- if (!dst)
+ if (!dst) {
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return NULL;
+ }
*saddr = idst->in_saddr.s_addr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return dst_rtable(dst);
}
EXPORT_SYMBOL_GPL(dst_cache_get_ip4);
@@ -98,9 +108,11 @@ void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst,
if (!dst_cache->cache)
return;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
idst = this_cpu_ptr(dst_cache->cache);
dst_cache_per_cpu_dst_set(idst, dst, 0);
idst->in_saddr.s_addr = saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
}
EXPORT_SYMBOL_GPL(dst_cache_set_ip4);
@@ -113,10 +125,13 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
if (!dst_cache->cache)
return;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+
idst = this_cpu_ptr(dst_cache->cache);
dst_cache_per_cpu_dst_set(idst, dst,
rt6_get_cookie(dst_rt6_info(dst)));
idst->in6_saddr = *saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
}
EXPORT_SYMBOL_GPL(dst_cache_set_ip6);
@@ -129,12 +144,17 @@ struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
if (!dst_cache->cache)
return NULL;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+
idst = this_cpu_ptr(dst_cache->cache);
dst = dst_cache_per_cpu_get(dst_cache, idst);
- if (!dst)
+ if (!dst) {
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return NULL;
+ }
*saddr = idst->in6_saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return dst;
}
EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
@@ -142,10 +162,14 @@ EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp)
{
+ unsigned int i;
+
dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu,
gfp | __GFP_ZERO);
if (!dst_cache->cache)
return -ENOMEM;
+ for_each_possible_cpu(i)
+ local_lock_init(&per_cpu_ptr(dst_cache->cache, i)->bh_lock);
dst_cache_reset(dst_cache);
return 0;
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 34185d138c95..8ca634964e36 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -37,8 +37,8 @@ static const struct fib_kuid_range fib_kuid_range_unset = {
bool fib_rule_matchall(const struct fib_rule *rule)
{
- if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id ||
- rule->flags)
+ if (READ_ONCE(rule->iifindex) || READ_ONCE(rule->oifindex) ||
+ rule->mark || rule->tun_id || rule->flags)
return false;
if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1)
return false;
@@ -257,16 +257,36 @@ static int nla_put_port_range(struct sk_buff *skb, int attrtype,
return nla_put(skb, attrtype, sizeof(*range), range);
}
+static bool fib_rule_iif_match(const struct fib_rule *rule, int iifindex,
+ const struct flowi *fl)
+{
+ u8 iif_is_l3_master = READ_ONCE(rule->iif_is_l3_master);
+
+ return iif_is_l3_master ? l3mdev_fib_rule_iif_match(fl, iifindex) :
+ fl->flowi_iif == iifindex;
+}
+
+static bool fib_rule_oif_match(const struct fib_rule *rule, int oifindex,
+ const struct flowi *fl)
+{
+ u8 oif_is_l3_master = READ_ONCE(rule->oif_is_l3_master);
+
+ return oif_is_l3_master ? l3mdev_fib_rule_oif_match(fl, oifindex) :
+ fl->flowi_oif == oifindex;
+}
+
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
struct flowi *fl, int flags,
struct fib_lookup_arg *arg)
{
- int ret = 0;
+ int iifindex, oifindex, ret = 0;
- if (rule->iifindex && (rule->iifindex != fl->flowi_iif))
+ iifindex = READ_ONCE(rule->iifindex);
+ if (iifindex && !fib_rule_iif_match(rule, iifindex, fl))
goto out;
- if (rule->oifindex && (rule->oifindex != fl->flowi_oif))
+ oifindex = READ_ONCE(rule->oifindex);
+ if (oifindex && !fib_rule_oif_match(rule, oifindex, fl))
goto out;
if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
@@ -371,7 +391,8 @@ static int call_fib_rule_notifiers(struct net *net,
.rule = rule,
};
- ASSERT_RTNL();
+ ASSERT_RTNL_NET(net);
+
/* Paired with READ_ONCE() in fib_rules_seq() */
WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1);
return call_fib_notifiers(net, event_type, &info.info);
@@ -459,9 +480,6 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops,
if (rule->tun_id && r->tun_id != rule->tun_id)
continue;
- if (r->fr_net != rule->fr_net)
- continue;
-
if (rule->l3mdev && r->l3mdev != rule->l3mdev)
continue;
@@ -481,11 +499,17 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops,
&rule->sport_range))
continue;
+ if (rule->sport_mask && r->sport_mask != rule->sport_mask)
+ continue;
+
if (fib_rule_port_range_set(&rule->dport_range) &&
!fib_rule_port_range_compare(&r->dport_range,
&rule->dport_range))
continue;
+ if (rule->dport_mask && r->dport_mask != rule->dport_mask)
+ continue;
+
if (!ops->compare(r, frh, tb))
continue;
return r;
@@ -515,14 +539,40 @@ static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
}
#endif
-static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
+static int fib_nl2rule_port_mask(const struct nlattr *mask_attr,
+ const struct fib_rule_port_range *range,
+ u16 *port_mask,
+ struct netlink_ext_ack *extack)
+{
+ if (!fib_rule_port_range_valid(range)) {
+ NL_SET_ERR_MSG_ATTR(extack, mask_attr,
+ "Cannot specify port mask without port value");
+ return -EINVAL;
+ }
+
+ if (fib_rule_port_is_range(range)) {
+ NL_SET_ERR_MSG_ATTR(extack, mask_attr,
+ "Cannot specify port mask for port range");
+ return -EINVAL;
+ }
+
+ if (range->start & ~nla_get_u16(mask_attr)) {
+ NL_SET_ERR_MSG_ATTR(extack, mask_attr, "Invalid port mask");
+ return -EINVAL;
+ }
+
+ *port_mask = nla_get_u16(mask_attr);
+
+ return 0;
+}
+
+static int fib_nl2rule(struct net *net, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack,
struct fib_rules_ops *ops,
struct nlattr *tb[],
struct fib_rule **rule,
bool *user_priority)
{
- struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
struct fib_rule *nlrule = NULL;
int err = -EINVAL;
@@ -554,30 +604,18 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[FRA_PRIORITY]) {
nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]);
*user_priority = true;
- } else {
- nlrule->pref = fib_default_rule_pref(ops);
}
nlrule->proto = nla_get_u8_default(tb[FRA_PROTOCOL], RTPROT_UNSPEC);
if (tb[FRA_IIFNAME]) {
- struct net_device *dev;
-
nlrule->iifindex = -1;
nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
- dev = __dev_get_by_name(net, nlrule->iifname);
- if (dev)
- nlrule->iifindex = dev->ifindex;
}
if (tb[FRA_OIFNAME]) {
- struct net_device *dev;
-
nlrule->oifindex = -1;
nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
- dev = __dev_get_by_name(net, nlrule->oifname);
- if (dev)
- nlrule->oifindex = dev->ifindex;
}
if (tb[FRA_FWMARK]) {
@@ -619,11 +657,6 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
}
nlrule->target = nla_get_u32(tb[FRA_GOTO]);
- /* Backward jumps are prohibited to avoid endless loops */
- if (nlrule->target <= nlrule->pref) {
- NL_SET_ERR_MSG(extack, "Backward goto not supported");
- goto errout_free;
- }
} else if (nlrule->action == FR_ACT_GOTO) {
NL_SET_ERR_MSG(extack, "Missing goto target for action goto");
goto errout_free;
@@ -662,6 +695,16 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
NL_SET_ERR_MSG(extack, "Invalid sport range");
goto errout_free;
}
+ if (!fib_rule_port_is_range(&nlrule->sport_range))
+ nlrule->sport_mask = U16_MAX;
+ }
+
+ if (tb[FRA_SPORT_MASK]) {
+ err = fib_nl2rule_port_mask(tb[FRA_SPORT_MASK],
+ &nlrule->sport_range,
+ &nlrule->sport_mask, extack);
+ if (err)
+ goto errout_free;
}
if (tb[FRA_DPORT_RANGE]) {
@@ -671,6 +714,16 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
NL_SET_ERR_MSG(extack, "Invalid dport range");
goto errout_free;
}
+ if (!fib_rule_port_is_range(&nlrule->dport_range))
+ nlrule->dport_mask = U16_MAX;
+ }
+
+ if (tb[FRA_DPORT_MASK]) {
+ err = fib_nl2rule_port_mask(tb[FRA_DPORT_MASK],
+ &nlrule->dport_range,
+ &nlrule->dport_mask, extack);
+ if (err)
+ goto errout_free;
}
*rule = nlrule;
@@ -683,6 +736,43 @@ errout:
return err;
}
+static int fib_nl2rule_rtnl(struct fib_rule *nlrule,
+ struct fib_rules_ops *ops,
+ struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ if (!tb[FRA_PRIORITY])
+ nlrule->pref = fib_default_rule_pref(ops);
+
+ /* Backward jumps are prohibited to avoid endless loops */
+ if (tb[FRA_GOTO] && nlrule->target <= nlrule->pref) {
+ NL_SET_ERR_MSG(extack, "Backward goto not supported");
+ return -EINVAL;
+ }
+
+ if (tb[FRA_IIFNAME]) {
+ struct net_device *dev;
+
+ dev = __dev_get_by_name(nlrule->fr_net, nlrule->iifname);
+ if (dev) {
+ nlrule->iifindex = dev->ifindex;
+ nlrule->iif_is_l3_master = netif_is_l3_master(dev);
+ }
+ }
+
+ if (tb[FRA_OIFNAME]) {
+ struct net_device *dev;
+
+ dev = __dev_get_by_name(nlrule->fr_net, nlrule->oifname);
+ if (dev) {
+ nlrule->oifindex = dev->ifindex;
+ nlrule->oif_is_l3_master = netif_is_l3_master(dev);
+ }
+ }
+
+ return 0;
+}
+
static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
struct nlattr **tb, struct fib_rule *rule)
{
@@ -719,9 +809,6 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
if (r->tun_id != rule->tun_id)
continue;
- if (r->fr_net != rule->fr_net)
- continue;
-
if (r->l3mdev != rule->l3mdev)
continue;
@@ -739,10 +826,16 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
&rule->sport_range))
continue;
+ if (r->sport_mask != rule->sport_mask)
+ continue;
+
if (!fib_rule_port_range_compare(&r->dport_range,
&rule->dport_range))
continue;
+ if (r->dport_mask != rule->dport_mask)
+ continue;
+
if (!ops->compare(r, frh, tb))
continue;
return 1;
@@ -770,20 +863,25 @@ static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = {
[FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) },
[FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) },
[FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2),
+ [FRA_FLOWLABEL] = { .type = NLA_BE32 },
+ [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 },
+ [FRA_SPORT_MASK] = { .type = NLA_U16 },
+ [FRA_DPORT_MASK] = { .type = NLA_U16 },
+ [FRA_DSCP_MASK] = NLA_POLICY_MASK(NLA_U8, INET_DSCP_MASK >> 2),
};
-int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
+int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack, bool rtnl_held)
{
- struct net *net = sock_net(skb->sk);
- struct fib_rule_hdr *frh = nlmsg_data(nlh);
- struct fib_rules_ops *ops = NULL;
struct fib_rule *rule = NULL, *r, *last = NULL;
- struct nlattr *tb[FRA_MAX + 1];
int err = -EINVAL, unresolved = 0;
+ struct fib_rules_ops *ops = NULL;
+ struct nlattr *tb[FRA_MAX + 1];
bool user_priority = false;
+ struct fib_rule_hdr *frh;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ frh = nlmsg_payload(nlh, sizeof(*frh));
+ if (!frh) {
NL_SET_ERR_MSG(extack, "Invalid msg length");
goto errout;
}
@@ -802,10 +900,17 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
- err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority);
+ err = fib_nl2rule(net, nlh, extack, ops, tb, &rule, &user_priority);
if (err)
goto errout;
+ if (!rtnl_held)
+ rtnl_net_lock(net);
+
+ err = fib_nl2rule_rtnl(rule, ops, tb, extack);
+ if (err)
+ goto errout_free;
+
if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
rule_exists(ops, frh, tb, rule)) {
err = -EEXIST;
@@ -867,31 +972,45 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
if (rule->tun_id)
ip_tunnel_need_metadata();
+ fib_rule_get(rule);
+
+ if (!rtnl_held)
+ rtnl_net_unlock(net);
+
notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
+ fib_rule_put(rule);
flush_route_cache(ops);
rules_ops_put(ops);
return 0;
errout_free:
+ if (!rtnl_held)
+ rtnl_net_unlock(net);
kfree(rule);
errout:
rules_ops_put(ops);
return err;
}
-EXPORT_SYMBOL_GPL(fib_nl_newrule);
+EXPORT_SYMBOL_GPL(fib_newrule);
-int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
+static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
- struct net *net = sock_net(skb->sk);
- struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ return fib_newrule(sock_net(skb->sk), skb, nlh, extack, false);
+}
+
+int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack, bool rtnl_held)
+{
+ struct fib_rule *rule = NULL, *nlrule = NULL;
struct fib_rules_ops *ops = NULL;
- struct fib_rule *rule = NULL, *r, *nlrule = NULL;
struct nlattr *tb[FRA_MAX+1];
- int err = -EINVAL;
bool user_priority = false;
+ struct fib_rule_hdr *frh;
+ int err = -EINVAL;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ frh = nlmsg_payload(nlh, sizeof(*frh));
+ if (!frh) {
NL_SET_ERR_MSG(extack, "Invalid msg length");
goto errout;
}
@@ -910,25 +1029,32 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
- err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority);
+ err = fib_nl2rule(net, nlh, extack, ops, tb, &nlrule, &user_priority);
if (err)
goto errout;
+ if (!rtnl_held)
+ rtnl_net_lock(net);
+
+ err = fib_nl2rule_rtnl(nlrule, ops, tb, extack);
+ if (err)
+ goto errout_free;
+
rule = rule_find(ops, frh, tb, nlrule, user_priority);
if (!rule) {
err = -ENOENT;
- goto errout;
+ goto errout_free;
}
if (rule->flags & FIB_RULE_PERMANENT) {
err = -EPERM;
- goto errout;
+ goto errout_free;
}
if (ops->delete) {
err = ops->delete(rule);
if (err)
- goto errout;
+ goto errout_free;
}
if (rule->tun_id)
@@ -950,7 +1076,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
* current if it is goto rule, have actually been added.
*/
if (ops->nr_goto_rules > 0) {
- struct fib_rule *n;
+ struct fib_rule *n, *r;
n = list_next_entry(rule, list);
if (&n->list == &ops->rules_list || n->pref != rule->pref)
@@ -964,22 +1090,33 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
}
}
- call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
- NULL);
- notify_rule_change(RTM_DELRULE, rule, ops, nlh,
- NETLINK_CB(skb).portid);
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL);
+
+ if (!rtnl_held)
+ rtnl_net_unlock(net);
+
+ notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
fib_rule_put(rule);
flush_route_cache(ops);
rules_ops_put(ops);
kfree(nlrule);
return 0;
-errout:
+errout_free:
+ if (!rtnl_held)
+ rtnl_net_unlock(net);
kfree(nlrule);
+errout:
rules_ops_put(ops);
return err;
}
-EXPORT_SYMBOL_GPL(fib_nl_delrule);
+EXPORT_SYMBOL_GPL(fib_delrule);
+
+static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ return fib_delrule(sock_net(skb->sk), skb, nlh, extack, false);
+}
static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
struct fib_rule *rule)
@@ -998,7 +1135,9 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(1) /* FRA_PROTOCOL */
+ nla_total_size(1) /* FRA_IP_PROTO */
+ nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */
- + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */
+ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_DPORT_RANGE */
+ + nla_total_size(2) /* FRA_SPORT_MASK */
+ + nla_total_size(2); /* FRA_DPORT_MASK */
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -1039,14 +1178,14 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
if (rule->iifname[0]) {
if (nla_put_string(skb, FRA_IIFNAME, rule->iifname))
goto nla_put_failure;
- if (rule->iifindex == -1)
+ if (READ_ONCE(rule->iifindex) == -1)
frh->flags |= FIB_RULE_IIF_DETACHED;
}
if (rule->oifname[0]) {
if (nla_put_string(skb, FRA_OIFNAME, rule->oifname))
goto nla_put_failure;
- if (rule->oifindex == -1)
+ if (READ_ONCE(rule->oifindex) == -1)
frh->flags |= FIB_RULE_OIF_DETACHED;
}
@@ -1066,8 +1205,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
nla_put_uid_range(skb, &rule->uid_range)) ||
(fib_rule_port_range_set(&rule->sport_range) &&
nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) ||
+ (rule->sport_mask && nla_put_u16(skb, FRA_SPORT_MASK,
+ rule->sport_mask)) ||
(fib_rule_port_range_set(&rule->dport_range) &&
nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) ||
+ (rule->dport_mask && nla_put_u16(skb, FRA_DPORT_MASK,
+ rule->dport_mask)) ||
(rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto)))
goto nla_put_failure;
@@ -1119,12 +1262,12 @@ static int fib_valid_dumprule_req(const struct nlmsghdr *nlh,
{
struct fib_rule_hdr *frh;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ frh = nlmsg_payload(nlh, sizeof(*frh));
+ if (!frh) {
NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request");
return -EINVAL;
}
- frh = nlmsg_data(nlh);
if (frh->dst_len || frh->src_len || frh->tos || frh->table ||
frh->res1 || frh->res2 || frh->action || frh->flags) {
NL_SET_ERR_MSG(extack,
@@ -1217,11 +1360,17 @@ static void attach_rules(struct list_head *rules, struct net_device *dev)
list_for_each_entry(rule, rules, list) {
if (rule->iifindex == -1 &&
- strcmp(dev->name, rule->iifname) == 0)
- rule->iifindex = dev->ifindex;
+ strcmp(dev->name, rule->iifname) == 0) {
+ WRITE_ONCE(rule->iifindex, dev->ifindex);
+ WRITE_ONCE(rule->iif_is_l3_master,
+ netif_is_l3_master(dev));
+ }
if (rule->oifindex == -1 &&
- strcmp(dev->name, rule->oifname) == 0)
- rule->oifindex = dev->ifindex;
+ strcmp(dev->name, rule->oifname) == 0) {
+ WRITE_ONCE(rule->oifindex, dev->ifindex);
+ WRITE_ONCE(rule->oif_is_l3_master,
+ netif_is_l3_master(dev));
+ }
}
}
@@ -1230,10 +1379,14 @@ static void detach_rules(struct list_head *rules, struct net_device *dev)
struct fib_rule *rule;
list_for_each_entry(rule, rules, list) {
- if (rule->iifindex == dev->ifindex)
- rule->iifindex = -1;
- if (rule->oifindex == dev->ifindex)
- rule->oifindex = -1;
+ if (rule->iifindex == dev->ifindex) {
+ WRITE_ONCE(rule->iifindex, -1);
+ WRITE_ONCE(rule->iif_is_l3_master, false);
+ }
+ if (rule->oifindex == dev->ifindex) {
+ WRITE_ONCE(rule->oifindex, -1);
+ WRITE_ONCE(rule->oif_is_l3_master, false);
+ }
}
}
@@ -1291,8 +1444,10 @@ static struct pernet_operations fib_rules_net_ops = {
};
static const struct rtnl_msg_handler fib_rules_rtnl_msg_handlers[] __initconst = {
- {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule},
- {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule},
+ {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule,
+ .flags = RTNL_FLAG_DOIT_PERNET},
{.msgtype = RTM_GETRULE, .dumpit = fib_nl_dumprule,
.flags = RTNL_FLAG_DUMP_UNLOCKED},
};
diff --git a/net/core/filter.c b/net/core/filter.c
index 2fb45a86f3dd..327ca73f9cd7 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -218,24 +218,36 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
return 0;
}
+static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset)
+{
+ if (likely(offset >= 0))
+ return offset;
+
+ if (offset >= SKF_NET_OFF)
+ return offset - SKF_NET_OFF + skb_network_offset(skb);
+
+ if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb))
+ return offset - SKF_LL_OFF + skb_mac_offset(skb);
+
+ return INT_MIN;
+}
+
BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
data, int, headlen, int, offset)
{
- u8 tmp, *ptr;
+ u8 tmp;
const int len = sizeof(tmp);
- if (offset >= 0) {
- if (headlen - offset >= len)
- return *(u8 *)(data + offset);
- if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
- return tmp;
- } else {
- ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
- if (likely(ptr))
- return *(u8 *)ptr;
- }
+ offset = bpf_skb_load_helper_convert_offset(skb, offset);
+ if (offset == INT_MIN)
+ return -EFAULT;
- return -EFAULT;
+ if (headlen - offset >= len)
+ return *(u8 *)(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return tmp;
+ else
+ return -EFAULT;
}
BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
@@ -248,21 +260,19 @@ BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
data, int, headlen, int, offset)
{
- __be16 tmp, *ptr;
+ __be16 tmp;
const int len = sizeof(tmp);
- if (offset >= 0) {
- if (headlen - offset >= len)
- return get_unaligned_be16(data + offset);
- if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
- return be16_to_cpu(tmp);
- } else {
- ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
- if (likely(ptr))
- return get_unaligned_be16(ptr);
- }
+ offset = bpf_skb_load_helper_convert_offset(skb, offset);
+ if (offset == INT_MIN)
+ return -EFAULT;
- return -EFAULT;
+ if (headlen - offset >= len)
+ return get_unaligned_be16(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return be16_to_cpu(tmp);
+ else
+ return -EFAULT;
}
BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
@@ -275,21 +285,19 @@ BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
data, int, headlen, int, offset)
{
- __be32 tmp, *ptr;
+ __be32 tmp;
const int len = sizeof(tmp);
- if (likely(offset >= 0)) {
- if (headlen - offset >= len)
- return get_unaligned_be32(data + offset);
- if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
- return be32_to_cpu(tmp);
- } else {
- ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
- if (likely(ptr))
- return get_unaligned_be32(ptr);
- }
+ offset = bpf_skb_load_helper_convert_offset(skb, offset);
+ if (offset == INT_MIN)
+ return -EFAULT;
- return -EFAULT;
+ if (headlen - offset >= len)
+ return get_unaligned_be32(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return be32_to_cpu(tmp);
+ else
+ return -EFAULT;
}
BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
@@ -1960,10 +1968,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
bool do_mforce = flags & BPF_F_MARK_ENFORCE;
+ bool is_ipv6 = flags & BPF_F_IPV6;
__sum16 *ptr;
if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
- BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
+ BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK | BPF_F_IPV6)))
return -EINVAL;
if (unlikely(offset > 0xffff || offset & 1))
return -EFAULT;
@@ -1979,7 +1988,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
if (unlikely(from != 0))
return -EINVAL;
- inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
+ inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, is_ipv6);
break;
case 2:
inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
@@ -2501,6 +2510,7 @@ int skb_do_redirect(struct sk_buff *skb)
goto out_drop;
skb->dev = dev;
dev_sw_netstats_rx_add(dev, skb->len);
+ skb_scrub_packet(skb, false);
return -EAGAIN;
}
return flags & BPF_F_NEIGH ?
@@ -4128,13 +4138,13 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
}
static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
- struct xdp_mem_info *mem_info, bool release)
+ enum xdp_mem_type mem_type, bool release)
{
struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp);
if (release) {
xsk_buff_del_tail(zc_frag);
- __xdp_return(NULL, mem_info, false, zc_frag);
+ __xdp_return(0, mem_type, false, zc_frag);
} else {
zc_frag->data_end -= shrink;
}
@@ -4143,19 +4153,16 @@ static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
int shrink)
{
- struct xdp_mem_info *mem_info = &xdp->rxq->mem;
+ enum xdp_mem_type mem_type = xdp->rxq->mem.type;
bool release = skb_frag_size(frag) == shrink;
- if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
- bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release);
+ if (mem_type == MEM_TYPE_XSK_BUFF_POOL) {
+ bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release);
goto out;
}
- if (release) {
- struct page *page = skb_frag_page(frag);
-
- __xdp_return(page_address(page), mem_info, false, NULL);
- }
+ if (release)
+ __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL);
out:
return release;
@@ -4357,9 +4364,9 @@ u32 xdp_master_redirect(struct xdp_buff *xdp)
EXPORT_SYMBOL_GPL(xdp_master_redirect);
static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri,
- struct net_device *dev,
+ const struct net_device *dev,
struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
@@ -4380,10 +4387,10 @@ err:
return err;
}
-static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,
- struct net_device *dev,
- struct xdp_frame *xdpf,
- struct bpf_prog *xdp_prog)
+static __always_inline int
+__xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev,
+ struct xdp_frame *xdpf,
+ const struct bpf_prog *xdp_prog)
{
enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
@@ -4452,7 +4459,7 @@ err:
}
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
@@ -4466,7 +4473,8 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
EXPORT_SYMBOL_GPL(xdp_do_redirect);
int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp,
- struct xdp_frame *xdpf, struct bpf_prog *xdp_prog)
+ struct xdp_frame *xdpf,
+ const struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
@@ -4481,9 +4489,9 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect_frame);
static int xdp_do_generic_redirect_map(struct net_device *dev,
struct sk_buff *skb,
struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog, void *fwd,
- enum bpf_map_type map_type, u32 map_id,
- u32 flags)
+ const struct bpf_prog *xdp_prog,
+ void *fwd, enum bpf_map_type map_type,
+ u32 map_id, u32 flags)
{
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
struct bpf_map *map;
@@ -4537,7 +4545,8 @@ err:
}
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
- struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
+ struct xdp_buff *xdp,
+ const struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
@@ -5223,6 +5232,25 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt)
+{
+ u32 sk_bpf_cb_flags;
+
+ if (getopt) {
+ *(u32 *)optval = sk->sk_bpf_cb_flags;
+ return 0;
+ }
+
+ sk_bpf_cb_flags = *(u32 *)optval;
+
+ if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK)
+ return -EINVAL;
+
+ sk->sk_bpf_cb_flags = sk_bpf_cb_flags;
+
+ return 0;
+}
+
static int sol_socket_sockopt(struct sock *sk, int optname,
char *optval, int *optlen,
bool getopt)
@@ -5239,6 +5267,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
case SO_MAX_PACING_RATE:
case SO_BINDTOIFINDEX:
case SO_TXREHASH:
+ case SK_BPF_CB_FLAGS:
if (*optlen != sizeof(int))
return -EINVAL;
break;
@@ -5248,6 +5277,9 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
return -EINVAL;
}
+ if (optname == SK_BPF_CB_FLAGS)
+ return sk_bpf_set_get_cb_flags(sk, optval, getopt);
+
if (getopt) {
if (optname == SO_BINDTODEVICE)
return -EINVAL;
@@ -5260,6 +5292,38 @@ static int sol_socket_sockopt(struct sock *sk, int optname,
KERNEL_SOCKPTR(optval), *optlen);
}
+static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname,
+ char *optval, int optlen)
+{
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ switch (optname) {
+ case TCP_BPF_SOCK_OPS_CB_FLAGS: {
+ int cb_flags = tcp_sk(sk)->bpf_sock_ops_cb_flags;
+
+ memcpy(optval, &cb_flags, optlen);
+ break;
+ }
+ case TCP_BPF_RTO_MIN: {
+ int rto_min_us = jiffies_to_usecs(inet_csk(sk)->icsk_rto_min);
+
+ memcpy(optval, &rto_min_us, optlen);
+ break;
+ }
+ case TCP_BPF_DELACK_MAX: {
+ int delack_max_us = jiffies_to_usecs(inet_csk(sk)->icsk_delack_max);
+
+ memcpy(optval, &delack_max_us, optlen);
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
char *optval, int optlen)
{
@@ -5383,6 +5447,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
case TCP_USER_TIMEOUT:
case TCP_NOTSENT_LOWAT:
case TCP_SAVE_SYN:
+ case TCP_RTO_MAX_MS:
if (*optlen != sizeof(int))
return -EINVAL;
break;
@@ -5392,20 +5457,9 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
if (*optlen < 1)
return -EINVAL;
break;
- case TCP_BPF_SOCK_OPS_CB_FLAGS:
- if (*optlen != sizeof(int))
- return -EINVAL;
- if (getopt) {
- struct tcp_sock *tp = tcp_sk(sk);
- int cb_flags = tp->bpf_sock_ops_cb_flags;
-
- memcpy(optval, &cb_flags, *optlen);
- return 0;
- }
- return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen);
default:
if (getopt)
- return -EINVAL;
+ return bpf_sol_tcp_getsockopt(sk, optname, optval, *optlen);
return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen);
}
@@ -5501,6 +5555,11 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname,
return -EINVAL;
}
+static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock)
+{
+ return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
+}
+
static int _bpf_setsockopt(struct sock *sk, int level, int optname,
char *optval, int optlen)
{
@@ -5651,6 +5710,9 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen);
}
@@ -5736,6 +5798,9 @@ static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
int, level, int, optname, char *, optval, int, optlen)
{
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP &&
optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) {
int ret, copy_len = 0;
@@ -5778,6 +5843,9 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
struct sock *sk = bpf_sock->sk;
int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
return -EINVAL;
@@ -7587,6 +7655,9 @@ BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
u8 search_kind, search_len, copy_len, magic_len;
int ret;
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
/* 2 byte is the minimal option len except TCPOPT_NOP and
* TCPOPT_EOL which are useless for the bpf prog to learn
* and this helper disallow loading them also.
@@ -7652,7 +7723,7 @@ static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
};
@@ -7953,10 +8024,6 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
if (func_proto)
return func_proto;
- func_proto = cgroup_current_func_proto(func_id, prog);
- if (func_proto)
- return func_proto;
-
switch (func_id) {
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_sock_proto;
@@ -7982,10 +8049,6 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
if (func_proto)
return func_proto;
- func_proto = cgroup_current_func_proto(func_id, prog);
- if (func_proto)
- return func_proto;
-
switch (func_id) {
case BPF_FUNC_bind:
switch (prog->expected_attach_type) {
@@ -8076,6 +8139,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_load_bytes_relative_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_proto;
case BPF_FUNC_get_socket_uid:
return &bpf_get_socket_uid_proto;
case BPF_FUNC_perf_event_output:
@@ -8417,18 +8482,12 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_msg_pop_data_proto;
case BPF_FUNC_perf_event_output:
return &bpf_event_output_data_proto;
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
case BPF_FUNC_get_netns_cookie:
return &bpf_get_netns_cookie_sk_msg_proto;
-#ifdef CONFIG_CGROUP_NET_CLASSID
- case BPF_FUNC_get_cgroup_classid:
- return &bpf_get_cgroup_classid_curr_proto;
-#endif
default:
return bpf_sk_base_func_proto(func_id, prog);
}
@@ -9079,7 +9138,8 @@ static bool xdp_is_valid_access(int off, int size,
return __is_valid_xdp_access(off, size);
}
-void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act)
+void bpf_warn_invalid_xdp_action(const struct net_device *dev,
+ const struct bpf_prog *prog, u32 act)
{
const u32 act_max = XDP_REDIRECT;
@@ -9635,7 +9695,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, queue_mapping):
if (type == BPF_WRITE) {
- u32 off = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size);
+ u32 offset = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size);
if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) {
*insn++ = BPF_JMP_A(0); /* noop */
@@ -9644,7 +9704,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
if (BPF_CLASS(si->code) == BPF_STX)
*insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1);
- *insn++ = BPF_EMIT_STORE(BPF_H, si, off);
+ *insn++ = BPF_EMIT_STORE(BPF_H, si, offset);
} else {
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
bpf_target_off(struct sk_buff,
@@ -10358,10 +10418,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
} \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
- is_fullsock), \
+ is_locked_tcp_sock), \
fullsock_reg, si->src_reg, \
offsetof(struct bpf_sock_ops_kern, \
- is_fullsock)); \
+ is_locked_tcp_sock)); \
*insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \
if (si->dst_reg == si->src_reg) \
*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
@@ -10446,10 +10506,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
temp)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
- is_fullsock), \
+ is_locked_tcp_sock), \
reg, si->dst_reg, \
offsetof(struct bpf_sock_ops_kern, \
- is_fullsock)); \
+ is_locked_tcp_sock)); \
*insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, sk),\
@@ -12062,6 +12122,25 @@ __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk,
#endif
}
+__bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
+ u64 flags)
+{
+ struct sk_buff *skb;
+
+ if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB)
+ return -EOPNOTSUPP;
+
+ if (flags)
+ return -EINVAL;
+
+ skb = skops->skb;
+ skb_shinfo(skb)->tx_flags |= SKBTX_BPF;
+ TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF;
+ skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
+
+ return 0;
+}
+
__bpf_kfunc_end_defs();
int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
@@ -12095,6 +12174,10 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk)
BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk)
+BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops)
+BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops)
+
static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
.owner = THIS_MODULE,
.set = &bpf_kfunc_check_set_skb,
@@ -12115,6 +12198,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = {
.set = &bpf_kfunc_check_set_tcp_reqsk,
};
+static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_sock_ops,
+};
+
static int __init bpf_kfunc_init(void)
{
int ret;
@@ -12133,7 +12221,8 @@ static int __init bpf_kfunc_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
&bpf_kfunc_set_sock_addr);
- return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops);
}
late_initcall(bpf_kfunc_init);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 0e638a37aa09..1b61bb25ba0e 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -106,7 +106,7 @@ int flow_dissector_bpf_prog_attach_check(struct net *net,
#endif /* CONFIG_BPF_SYSCALL */
/**
- * __skb_flow_get_ports - extract the upper layer ports and return them
+ * skb_flow_get_ports - extract the upper layer ports and return them
* @skb: sk_buff to extract the ports from
* @thoff: transport header offset
* @ip_proto: protocol for which to get port offset
@@ -116,8 +116,8 @@ int flow_dissector_bpf_prog_attach_check(struct net *net,
* The function will try to retrieve the ports at offset thoff + poff where poff
* is the protocol port offset returned from proto_ports_offset
*/
-__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
- const void *data, int hlen)
+__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
+ const void *data, int hlen)
{
int poff = proto_ports_offset(ip_proto);
@@ -137,7 +137,7 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
return 0;
}
-EXPORT_SYMBOL(__skb_flow_get_ports);
+EXPORT_SYMBOL(skb_flow_get_ports);
static bool icmp_has_id(u8 type)
{
@@ -853,23 +853,30 @@ __skb_flow_dissect_ports(const struct sk_buff *skb,
void *target_container, const void *data,
int nhoff, u8 ip_proto, int hlen)
{
- enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX;
- struct flow_dissector_key_ports *key_ports;
+ struct flow_dissector_key_ports_range *key_ports_range = NULL;
+ struct flow_dissector_key_ports *key_ports = NULL;
+ __be32 ports;
if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS))
- dissector_ports = FLOW_DISSECTOR_KEY_PORTS;
- else if (dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_PORTS_RANGE))
- dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE;
+ key_ports = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS,
+ target_container);
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE))
+ key_ports_range = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE,
+ target_container);
- if (dissector_ports == FLOW_DISSECTOR_KEY_MAX)
+ if (!key_ports && !key_ports_range)
return;
- key_ports = skb_flow_dissector_target(flow_dissector,
- dissector_ports,
- target_container);
- key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
- data, hlen);
+ ports = skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen);
+
+ if (key_ports)
+ key_ports->ports = ports;
+
+ if (key_ports_range)
+ key_ports_range->tp.ports = ports;
}
static void
@@ -924,6 +931,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
struct flow_dissector *flow_dissector,
void *target_container)
{
+ struct flow_dissector_key_ports_range *key_ports_range = NULL;
struct flow_dissector_key_ports *key_ports = NULL;
struct flow_dissector_key_control *key_control;
struct flow_dissector_key_basic *key_basic;
@@ -968,20 +976,21 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
- if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS))
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) {
key_ports = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_PORTS,
target_container);
- else if (dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_PORTS_RANGE))
- key_ports = skb_flow_dissector_target(flow_dissector,
- FLOW_DISSECTOR_KEY_PORTS_RANGE,
- target_container);
-
- if (key_ports) {
key_ports->src = flow_keys->sport;
key_ports->dst = flow_keys->dport;
}
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE)) {
+ key_ports_range = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE,
+ target_container);
+ key_ports_range->tp.src = flow_keys->sport;
+ key_ports_range->tp.dst = flow_keys->dport;
+ }
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
@@ -1108,10 +1117,12 @@ bool __skb_flow_dissect(const struct net *net,
FLOW_DISSECTOR_KEY_BASIC,
target_container);
+ rcu_read_lock();
+
if (skb) {
if (!net) {
if (skb->dev)
- net = dev_net(skb->dev);
+ net = dev_net_rcu(skb->dev);
else if (skb->sk)
net = sock_net(skb->sk);
}
@@ -1122,7 +1133,6 @@ bool __skb_flow_dissect(const struct net *net,
enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
struct bpf_prog_array *run_array;
- rcu_read_lock();
run_array = rcu_dereference(init_net.bpf.run_array[type]);
if (!run_array)
run_array = rcu_dereference(net->bpf.run_array[type]);
@@ -1150,17 +1160,17 @@ bool __skb_flow_dissect(const struct net *net,
prog = READ_ONCE(run_array->items[0].prog);
result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff,
hlen, flags);
- if (result == BPF_FLOW_DISSECTOR_CONTINUE)
- goto dissect_continue;
- __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
- target_container);
- rcu_read_unlock();
- return result == BPF_OK;
+ if (result != BPF_FLOW_DISSECTOR_CONTINUE) {
+ __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
+ target_container);
+ rcu_read_unlock();
+ return result == BPF_OK;
+ }
}
-dissect_continue:
- rcu_read_unlock();
}
+ rcu_read_unlock();
+
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
struct ethhdr *eth = eth_hdr(skb);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 412816076b8b..2b821b9a8699 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -177,7 +177,7 @@ int gen_new_estimator(struct gnet_stats_basic_sync *bstats,
spin_lock_bh(lock);
old = rcu_dereference_protected(*rate_est, 1);
if (old) {
- del_timer_sync(&old->timer);
+ timer_delete_sync(&old->timer);
est->avbps = old->avbps;
est->avpps = old->avpps;
}
diff --git a/net/core/gro.c b/net/core/gro.c
index d1f44084e978..b350e5b69549 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -7,9 +7,6 @@
#define MAX_GRO_SKBS 8
-/* This should be increased if a protocol with a bigger head is added. */
-#define GRO_MAX_HEAD (MAX_HEADER + 128)
-
static DEFINE_SPINLOCK(offload_lock);
/**
@@ -253,8 +250,7 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
return 0;
}
-
-static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
+static void gro_complete(struct gro_node *gro, struct sk_buff *skb)
{
struct list_head *head = &net_hotdata.offload_base;
struct packet_offload *ptype;
@@ -287,43 +283,43 @@ static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
}
out:
- gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
+ gro_normal_one(gro, skb, NAPI_GRO_CB(skb)->count);
}
-static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
- bool flush_old)
+static void __gro_flush_chain(struct gro_node *gro, u32 index, bool flush_old)
{
- struct list_head *head = &napi->gro_hash[index].list;
+ struct list_head *head = &gro->hash[index].list;
struct sk_buff *skb, *p;
list_for_each_entry_safe_reverse(skb, p, head, list) {
if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
return;
skb_list_del_init(skb);
- napi_gro_complete(napi, skb);
- napi->gro_hash[index].count--;
+ gro_complete(gro, skb);
+ gro->hash[index].count--;
}
- if (!napi->gro_hash[index].count)
- __clear_bit(index, &napi->gro_bitmask);
+ if (!gro->hash[index].count)
+ __clear_bit(index, &gro->bitmask);
}
-/* napi->gro_hash[].list contains packets ordered by age.
+/*
+ * gro->hash[].list contains packets ordered by age.
* youngest packets at the head of it.
* Complete skbs in reverse order to reduce latencies.
*/
-void napi_gro_flush(struct napi_struct *napi, bool flush_old)
+void __gro_flush(struct gro_node *gro, bool flush_old)
{
- unsigned long bitmask = napi->gro_bitmask;
+ unsigned long bitmask = gro->bitmask;
unsigned int i, base = ~0U;
while ((i = ffs(bitmask)) != 0) {
bitmask >>= i;
base += i;
- __napi_gro_flush_chain(napi, base, flush_old);
+ __gro_flush_chain(gro, base, flush_old);
}
}
-EXPORT_SYMBOL(napi_gro_flush);
+EXPORT_SYMBOL(__gro_flush);
static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb,
const struct sk_buff *p,
@@ -442,7 +438,7 @@ static void gro_try_pull_from_frag0(struct sk_buff *skb)
gro_pull_from_frag0(skb, grow);
}
-static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
+static void gro_flush_oldest(struct gro_node *gro, struct list_head *head)
{
struct sk_buff *oldest;
@@ -458,14 +454,15 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
* SKB to the chain.
*/
skb_list_del_init(oldest);
- napi_gro_complete(napi, oldest);
+ gro_complete(gro, oldest);
}
-static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+static enum gro_result dev_gro_receive(struct gro_node *gro,
+ struct sk_buff *skb)
{
u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
- struct gro_list *gro_list = &napi->gro_hash[bucket];
struct list_head *head = &net_hotdata.offload_base;
+ struct gro_list *gro_list = &gro->hash[bucket];
struct packet_offload *ptype;
__be16 type = skb->protocol;
struct sk_buff *pp = NULL;
@@ -529,7 +526,7 @@ found_ptype:
if (pp) {
skb_list_del_init(pp);
- napi_gro_complete(napi, pp);
+ gro_complete(gro, pp);
gro_list->count--;
}
@@ -540,7 +537,7 @@ found_ptype:
goto normal;
if (unlikely(gro_list->count >= MAX_GRO_SKBS))
- gro_flush_oldest(napi, &gro_list->list);
+ gro_flush_oldest(gro, &gro_list->list);
else
gro_list->count++;
@@ -554,10 +551,10 @@ found_ptype:
ret = GRO_HELD;
ok:
if (gro_list->count) {
- if (!test_bit(bucket, &napi->gro_bitmask))
- __set_bit(bucket, &napi->gro_bitmask);
- } else if (test_bit(bucket, &napi->gro_bitmask)) {
- __clear_bit(bucket, &napi->gro_bitmask);
+ if (!test_bit(bucket, &gro->bitmask))
+ __set_bit(bucket, &gro->bitmask);
+ } else if (test_bit(bucket, &gro->bitmask)) {
+ __clear_bit(bucket, &gro->bitmask);
}
return ret;
@@ -596,13 +593,12 @@ struct packet_offload *gro_find_complete_by_type(__be16 type)
}
EXPORT_SYMBOL(gro_find_complete_by_type);
-static gro_result_t napi_skb_finish(struct napi_struct *napi,
- struct sk_buff *skb,
- gro_result_t ret)
+static gro_result_t gro_skb_finish(struct gro_node *gro, struct sk_buff *skb,
+ gro_result_t ret)
{
switch (ret) {
case GRO_NORMAL:
- gro_normal_one(napi, skb, 1);
+ gro_normal_one(gro, skb, 1);
break;
case GRO_MERGED_FREE:
@@ -623,21 +619,21 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi,
return ret;
}
-gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
+gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb)
{
gro_result_t ret;
- skb_mark_napi_id(skb, napi);
+ __skb_mark_napi_id(skb, gro);
trace_napi_gro_receive_entry(skb);
skb_gro_reset_offset(skb, 0);
- ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
+ ret = gro_skb_finish(gro, skb, dev_gro_receive(gro, skb));
trace_napi_gro_receive_exit(ret);
return ret;
}
-EXPORT_SYMBOL(napi_gro_receive);
+EXPORT_SYMBOL(gro_receive_skb);
static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
@@ -656,6 +652,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
skb->pkt_type = PACKET_HOST;
skb->encapsulation = 0;
+ skb->ip_summed = CHECKSUM_NONE;
skb_shinfo(skb)->gso_type = 0;
skb_shinfo(skb)->gso_size = 0;
if (unlikely(skb->slow_gro)) {
@@ -693,7 +690,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
__skb_push(skb, ETH_HLEN);
skb->protocol = eth_type_trans(skb, skb->dev);
if (ret == GRO_NORMAL)
- gro_normal_one(napi, skb, 1);
+ gro_normal_one(&napi->gro, skb, 1);
break;
case GRO_MERGED_FREE:
@@ -762,7 +759,7 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)
trace_napi_gro_frags_entry(skb);
- ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
+ ret = napi_frags_finish(napi, skb, dev_gro_receive(&napi->gro, skb));
trace_napi_gro_frags_exit(ret);
return ret;
@@ -794,3 +791,37 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
return sum;
}
EXPORT_SYMBOL(__skb_gro_checksum_complete);
+
+void gro_init(struct gro_node *gro)
+{
+ for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) {
+ INIT_LIST_HEAD(&gro->hash[i].list);
+ gro->hash[i].count = 0;
+ }
+
+ gro->bitmask = 0;
+ gro->cached_napi_id = 0;
+
+ INIT_LIST_HEAD(&gro->rx_list);
+ gro->rx_count = 0;
+}
+
+void gro_cleanup(struct gro_node *gro)
+{
+ struct sk_buff *skb, *n;
+
+ for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) {
+ list_for_each_entry_safe(skb, n, &gro->hash[i].list, list)
+ kfree_skb(skb);
+
+ gro->hash[i].count = 0;
+ }
+
+ gro->bitmask = 0;
+ gro->cached_napi_id = 0;
+
+ list_for_each_entry_safe(skb, n, &gro->rx_list, list)
+ kfree_skb(skb);
+
+ gro->rx_count = 0;
+}
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
index d0aaaaa556f2..0bc893d5f07b 100644
--- a/net/core/hotdata.c
+++ b/net/core/hotdata.c
@@ -7,7 +7,6 @@
struct net_hotdata net_hotdata __cacheline_aligned = {
.offload_base = LIST_HEAD_INIT(net_hotdata.offload_base),
- .ptype_all = LIST_HEAD_INIT(net_hotdata.ptype_all),
.gro_normal_batch = 8,
.netdev_budget = 300,
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index cb04ef2b9807..864f3bbc3a4c 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -183,7 +183,7 @@ static void linkwatch_do_dev(struct net_device *dev)
else
dev_deactivate(dev);
- netdev_state_change(dev);
+ netif_state_change(dev);
}
/* Note: our callers are responsible for calling netdev_tracker_free().
* This is the reason we use __dev_put() instead of dev_put().
@@ -240,7 +240,9 @@ static void __linkwatch_run_queue(int urgent_only)
*/
netdev_tracker_free(dev, &dev->linkwatch_dev_tracker);
spin_unlock_irq(&lweventlist_lock);
+ netdev_lock_ops(dev);
linkwatch_do_dev(dev);
+ netdev_unlock_ops(dev);
do_dev--;
spin_lock_irq(&lweventlist_lock);
}
@@ -253,25 +255,41 @@ static void __linkwatch_run_queue(int urgent_only)
spin_unlock_irq(&lweventlist_lock);
}
-void linkwatch_sync_dev(struct net_device *dev)
+static bool linkwatch_clean_dev(struct net_device *dev)
{
unsigned long flags;
- int clean = 0;
+ bool clean = false;
spin_lock_irqsave(&lweventlist_lock, flags);
if (!list_empty(&dev->link_watch_list)) {
list_del_init(&dev->link_watch_list);
- clean = 1;
+ clean = true;
/* We must release netdev tracker under
* the spinlock protection.
*/
netdev_tracker_free(dev, &dev->linkwatch_dev_tracker);
}
spin_unlock_irqrestore(&lweventlist_lock, flags);
- if (clean)
+
+ return clean;
+}
+
+void __linkwatch_sync_dev(struct net_device *dev)
+{
+ netdev_ops_assert_locked(dev);
+
+ if (linkwatch_clean_dev(dev))
linkwatch_do_dev(dev);
}
+void linkwatch_sync_dev(struct net_device *dev)
+{
+ if (linkwatch_clean_dev(dev)) {
+ netdev_lock_ops(dev);
+ linkwatch_do_dev(dev);
+ netdev_unlock_ops(dev);
+ }
+}
/* Must be called with the rtnl semaphore held */
void linkwatch_run_queue(void)
diff --git a/net/core/rtnl_net_debug.c b/net/core/lock_debug.c
index f406045cbd0e..9e9fb25314b9 100644
--- a/net/core/rtnl_net_debug.c
+++ b/net/core/lock_debug.c
@@ -6,10 +6,11 @@
#include <linux/notifier.h>
#include <linux/rtnetlink.h>
#include <net/net_namespace.h>
+#include <net/netdev_lock.h>
#include <net/netns/generic.h>
-static int rtnl_net_debug_event(struct notifier_block *nb,
- unsigned long event, void *ptr)
+int netdev_debug_event(struct notifier_block *nb, unsigned long event,
+ void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net *net = dev_net(dev);
@@ -17,17 +18,21 @@ static int rtnl_net_debug_event(struct notifier_block *nb,
/* Keep enum and don't add default to trigger -Werror=switch */
switch (cmd) {
+ case NETDEV_XDP_FEAT_CHANGE:
+ netdev_assert_locked(dev);
+ fallthrough;
+ case NETDEV_CHANGE:
+ case NETDEV_REGISTER:
case NETDEV_UP:
+ netdev_ops_assert_locked(dev);
+ fallthrough;
case NETDEV_DOWN:
case NETDEV_REBOOT:
- case NETDEV_CHANGE:
- case NETDEV_REGISTER:
case NETDEV_UNREGISTER:
case NETDEV_CHANGEMTU:
case NETDEV_CHANGEADDR:
case NETDEV_PRE_CHANGEADDR:
case NETDEV_GOING_DOWN:
- case NETDEV_CHANGENAME:
case NETDEV_FEAT_CHANGE:
case NETDEV_BONDING_FAILOVER:
case NETDEV_PRE_UP:
@@ -56,25 +61,17 @@ static int rtnl_net_debug_event(struct notifier_block *nb,
case NETDEV_OFFLOAD_XSTATS_DISABLE:
case NETDEV_OFFLOAD_XSTATS_REPORT_USED:
case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA:
- case NETDEV_XDP_FEAT_CHANGE:
ASSERT_RTNL();
break;
- /* Once an event fully supports RTNL_NET, move it here
- * and remove "if (0)" below.
- *
- * case NETDEV_XXX:
- * ASSERT_RTNL_NET(net);
- * break;
- */
- }
-
- /* Just to avoid unused-variable error for dev and net. */
- if (0)
+ case NETDEV_CHANGENAME:
ASSERT_RTNL_NET(net);
+ break;
+ }
return NOTIFY_DONE;
}
+EXPORT_SYMBOL_NS_GPL(netdev_debug_event, "NETDEV_INTERNAL");
static int rtnl_net_debug_net_id;
@@ -83,7 +80,7 @@ static int __net_init rtnl_net_debug_net_init(struct net *net)
struct notifier_block *nb;
nb = net_generic(net, rtnl_net_debug_net_id);
- nb->notifier_call = rtnl_net_debug_event;
+ nb->notifier_call = netdev_debug_event;
return register_netdevice_notifier_net(net, nb);
}
@@ -104,14 +101,14 @@ static struct pernet_operations rtnl_net_debug_net_ops __net_initdata = {
};
static struct notifier_block rtnl_net_debug_block = {
- .notifier_call = rtnl_net_debug_event,
+ .notifier_call = netdev_debug_event,
};
static int __init rtnl_net_debug_init(void)
{
int ret;
- ret = register_pernet_device(&rtnl_net_debug_net_ops);
+ ret = register_pernet_subsys(&rtnl_net_debug_net_ops);
if (ret)
return ret;
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 711cd3b4347a..f9d76d85d04f 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -23,6 +23,8 @@
#include <net/ip6_fib.h>
#include <net/rtnh.h>
+#include "dev.h"
+
DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled);
EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled);
@@ -158,21 +160,14 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack)
return ret;
}
- rcu_read_lock();
- ops = rcu_dereference(lwtun_encaps[encap_type]);
- rcu_read_unlock();
+ ops = rcu_access_pointer(lwtun_encaps[encap_type]);
#ifdef CONFIG_MODULES
if (!ops) {
const char *encap_type_str = lwtunnel_encap_str(encap_type);
if (encap_type_str) {
- __rtnl_unlock();
request_module("rtnl-lwt-%s", encap_type_str);
- rtnl_lock();
-
- rcu_read_lock();
- ops = rcu_dereference(lwtun_encaps[encap_type]);
- rcu_read_unlock();
+ ops = rcu_access_pointer(lwtun_encaps[encap_type]);
}
}
#endif
@@ -206,8 +201,7 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining,
}
encap_type = nla_get_u16(nla_entype);
- if (lwtunnel_valid_encap_type(encap_type,
- extack) != 0)
+ if (lwtunnel_valid_encap_type(encap_type, extack))
return -EOPNOTSUPP;
}
}
@@ -325,82 +319,132 @@ EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap);
int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
- int ret = -EINVAL;
+ struct dst_entry *dst;
+ int ret;
+
+ local_bh_disable();
- if (!dst)
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
goto drop;
+ }
+
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
+ goto drop;
+ }
lwtstate = dst->lwtstate;
if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
- lwtstate->type > LWTUNNEL_ENCAP_MAX)
- return 0;
+ lwtstate->type > LWTUNNEL_ENCAP_MAX) {
+ ret = 0;
+ goto out;
+ }
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
- if (likely(ops && ops->output))
+ if (likely(ops && ops->output)) {
+ dev_xmit_recursion_inc();
ret = ops->output(net, sk, skb);
+ dev_xmit_recursion_dec();
+ }
rcu_read_unlock();
if (ret == -EOPNOTSUPP)
goto drop;
- return ret;
+ goto out;
drop:
kfree_skb(skb);
+out:
+ local_bh_enable();
return ret;
}
EXPORT_SYMBOL_GPL(lwtunnel_output);
int lwtunnel_xmit(struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
- int ret = -EINVAL;
+ struct dst_entry *dst;
+ int ret;
- if (!dst)
+ local_bh_disable();
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
goto drop;
+ }
+
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
+ goto drop;
+ }
lwtstate = dst->lwtstate;
if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
- lwtstate->type > LWTUNNEL_ENCAP_MAX)
- return 0;
+ lwtstate->type > LWTUNNEL_ENCAP_MAX) {
+ ret = 0;
+ goto out;
+ }
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
- if (likely(ops && ops->xmit))
+ if (likely(ops && ops->xmit)) {
+ dev_xmit_recursion_inc();
ret = ops->xmit(skb);
+ dev_xmit_recursion_dec();
+ }
rcu_read_unlock();
if (ret == -EOPNOTSUPP)
goto drop;
- return ret;
+ goto out;
drop:
kfree_skb(skb);
+out:
+ local_bh_enable();
return ret;
}
EXPORT_SYMBOL_GPL(lwtunnel_xmit);
int lwtunnel_input(struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
- int ret = -EINVAL;
+ struct dst_entry *dst;
+ int ret;
- if (!dst)
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
goto drop;
+ }
+
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
+ goto drop;
+ }
lwtstate = dst->lwtstate;
if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
@@ -410,8 +454,11 @@ int lwtunnel_input(struct sk_buff *skb)
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
- if (likely(ops && ops->input))
+ if (likely(ops && ops->input)) {
+ dev_xmit_recursion_inc();
ret = ops->input(skb);
+ dev_xmit_recursion_dec();
+ }
rcu_read_unlock();
if (ret == -EOPNOTSUPP)
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 89656d180bc6..a6e2c91ec3e7 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -309,7 +309,7 @@ static void neigh_add_timer(struct neighbour *n, unsigned long when)
static int neigh_del_timer(struct neighbour *n)
{
if ((n->nud_state & NUD_IN_TIMER) &&
- del_timer(&n->timer)) {
+ timer_delete(&n->timer)) {
neigh_release(n);
return 1;
}
@@ -427,7 +427,7 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL,
tbl->family);
if (skb_queue_empty_lockless(&tbl->proxy_queue))
- del_timer_sync(&tbl->proxy_timer);
+ timer_delete_sync(&tbl->proxy_timer);
return 0;
}
@@ -518,7 +518,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
if (!ret)
return NULL;
- hash_heads = kvzalloc(size, GFP_ATOMIC);
+ hash_heads = kzalloc(size, GFP_ATOMIC);
if (!hash_heads) {
kfree(ret);
return NULL;
@@ -536,7 +536,7 @@ static void neigh_hash_free_rcu(struct rcu_head *head)
struct neigh_hash_table,
rcu);
- kvfree(nht->hash_heads);
+ kfree(nht->hash_heads);
kfree(nht);
}
@@ -832,12 +832,10 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
return -ENOENT;
}
-static void neigh_parms_destroy(struct neigh_parms *parms);
-
static inline void neigh_parms_put(struct neigh_parms *parms)
{
if (refcount_dec_and_test(&parms->refcnt))
- neigh_parms_destroy(parms);
+ kfree(parms);
}
/*
@@ -1519,7 +1517,7 @@ out:
return rc;
out_kfree_skb:
rc = -EINVAL;
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL);
goto out;
}
EXPORT_SYMBOL(neigh_resolve_output);
@@ -1543,7 +1541,7 @@ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb)
err = dev_queue_xmit(skb);
else {
err = -EINVAL;
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL);
}
return err;
}
@@ -1599,7 +1597,7 @@ static void neigh_proxy_process(struct timer_list *t)
} else if (!sched_next || tdif < sched_next)
sched_next = tdif;
}
- del_timer(&tbl->proxy_timer);
+ timer_delete(&tbl->proxy_timer);
if (sched_next)
mod_timer(&tbl->proxy_timer, jiffies + sched_next);
spin_unlock(&tbl->proxy_queue.lock);
@@ -1630,7 +1628,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED;
spin_lock(&tbl->proxy_queue.lock);
- if (del_timer(&tbl->proxy_timer)) {
+ if (timer_delete(&tbl->proxy_timer)) {
if (time_before(tbl->proxy_timer.expires, sched_next))
sched_next = tbl->proxy_timer.expires;
}
@@ -1713,11 +1711,6 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
}
EXPORT_SYMBOL(neigh_parms_release);
-static void neigh_parms_destroy(struct neigh_parms *parms)
-{
- kfree(parms);
-}
-
static struct lock_class_key neigh_table_proxy_queue_class;
static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly;
@@ -1793,7 +1786,7 @@ int neigh_table_clear(int index, struct neigh_table *tbl)
/* It is not clean... Fix it to unload IPv6 module safely */
cancel_delayed_work_sync(&tbl->managed_work);
cancel_delayed_work_sync(&tbl->gc_work);
- del_timer_sync(&tbl->proxy_timer);
+ timer_delete_sync(&tbl->proxy_timer);
pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family);
neigh_ifdown(tbl, NULL);
if (atomic_read(&tbl->entries))
@@ -2250,6 +2243,7 @@ static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = {
static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
[NDTPA_IFINDEX] = { .type = NLA_U32 },
[NDTPA_QUEUE_LEN] = { .type = NLA_U32 },
+ [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 },
[NDTPA_PROXY_QLEN] = { .type = NLA_U32 },
[NDTPA_APP_PROBES] = { .type = NLA_U32 },
[NDTPA_UCAST_PROBES] = { .type = NLA_U32 },
@@ -2436,12 +2430,12 @@ static int neightbl_valid_dump_info(const struct nlmsghdr *nlh,
{
struct ndtmsg *ndtm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndtm))) {
+ ndtm = nlmsg_payload(nlh, sizeof(*ndtm));
+ if (!ndtm) {
NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request");
return -EINVAL;
}
- ndtm = nlmsg_data(nlh);
if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) {
NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request");
return -EINVAL;
@@ -2753,12 +2747,12 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh,
if (strict_check) {
struct ndmsg *ndm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request");
return -EINVAL;
}
- ndm = nlmsg_data(nlh);
if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex ||
ndm->ndm_state || ndm->ndm_type) {
NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request");
@@ -2861,12 +2855,12 @@ static int neigh_valid_get_req(const struct nlmsghdr *nlh,
struct ndmsg *ndm;
int err, i;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request");
return -EINVAL;
}
- ndm = nlmsg_data(nlh);
if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
ndm->ndm_type) {
NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request");
@@ -3447,10 +3441,12 @@ static const struct seq_operations neigh_stat_seq_ops = {
static void __neigh_notify(struct neighbour *n, int type, int flags,
u32 pid)
{
- struct net *net = dev_net(n->dev);
struct sk_buff *skb;
int err = -ENOBUFS;
+ struct net *net;
+ rcu_read_lock();
+ net = dev_net_rcu(n->dev);
skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC);
if (skb == NULL)
goto errout;
@@ -3463,9 +3459,11 @@ static void __neigh_notify(struct neighbour *n, int type, int flags,
goto errout;
}
rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
- return;
+ goto out;
errout:
rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+out:
+ rcu_read_unlock();
}
void neigh_app_ns(struct neighbour *n)
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index fa6d3969734a..4f0f0709a1cb 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -132,8 +132,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
rcu_read_lock();
fl = rcu_dereference(sd->flow_limit);
+ /* Pairs with WRITE_ONCE() in skb_flow_limit() */
if (fl)
- flow_limit_count = fl->count;
+ flow_limit_count = READ_ONCE(fl->count);
rcu_read_unlock();
#endif
@@ -144,11 +145,11 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,
"%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x "
"%08x %08x\n",
- sd->processed, atomic_read(&sd->dropped),
- sd->time_squeeze, 0,
+ READ_ONCE(sd->processed), atomic_read(&sd->dropped),
+ READ_ONCE(sd->time_squeeze), 0,
0, 0, 0, 0, /* was fastroute */
0, /* was cpu_collision */
- sd->received_rps, flow_limit_count,
+ READ_ONCE(sd->received_rps), flow_limit_count,
input_qlen + process_qlen, (int)seq->index,
input_qlen, process_qlen);
return 0;
@@ -185,7 +186,13 @@ static void *ptype_get_idx(struct seq_file *seq, loff_t pos)
}
}
- list_for_each_entry_rcu(pt, &net_hotdata.ptype_all, list) {
+ list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_all, list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+
+ list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_specific, list) {
if (i == pos)
return pt;
++i;
@@ -210,6 +217,7 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ struct net *net = seq_file_net(seq);
struct net_device *dev;
struct packet_type *pt;
struct list_head *nxt;
@@ -232,15 +240,22 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
goto found;
}
}
-
- nxt = net_hotdata.ptype_all.next;
- goto ptype_all;
+ nxt = net->ptype_all.next;
+ goto net_ptype_all;
}
- if (pt->type == htons(ETH_P_ALL)) {
-ptype_all:
- if (nxt != &net_hotdata.ptype_all)
+ if (pt->af_packet_net) {
+net_ptype_all:
+ if (nxt != &net->ptype_all && nxt != &net->ptype_specific)
goto found;
+
+ if (nxt == &net->ptype_all) {
+ /* continue with ->ptype_specific if it's not empty */
+ nxt = net->ptype_specific.next;
+ if (nxt != &net->ptype_specific)
+ goto found;
+ }
+
hash = 0;
nxt = ptype_base[0].next;
} else
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 2d9afc6e2161..1ace0cd01adc 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -23,6 +23,7 @@
#include <linux/of.h>
#include <linux/of_net.h>
#include <linux/cpu.h>
+#include <net/netdev_lock.h>
#include <net/netdev_rx_queue.h>
#include <net/rps.h>
@@ -36,12 +37,93 @@ static const char fmt_uint[] = "%u\n";
static const char fmt_ulong[] = "%lu\n";
static const char fmt_u64[] = "%llu\n";
-/* Caller holds RTNL or RCU */
+/* Caller holds RTNL, netdev->lock or RCU */
static inline int dev_isalive(const struct net_device *dev)
{
return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED;
}
+/* There is a possible ABBA deadlock between rtnl_lock and kernfs_node->active,
+ * when unregistering a net device and accessing associated sysfs files. The
+ * potential deadlock is as follows:
+ *
+ * CPU 0 CPU 1
+ *
+ * rtnl_lock vfs_read
+ * unregister_netdevice_many kernfs_seq_start
+ * device_del / kobject_put kernfs_get_active (kn->active++)
+ * kernfs_drain sysfs_kf_seq_show
+ * wait_event( rtnl_lock
+ * kn->active == KN_DEACTIVATED_BIAS) -> waits on CPU 0 to release
+ * -> waits on CPU 1 to decrease kn->active the rtnl lock.
+ *
+ * The historical fix was to use rtnl_trylock with restart_syscall to bail out
+ * of sysfs operations when the lock couldn't be taken. This fixed the above
+ * issue as it allowed CPU 1 to bail out of the ABBA situation.
+ *
+ * But it came with performance issues, as syscalls were being restarted in
+ * loops when there was contention on the rtnl lock, with huge slowdowns in
+ * specific scenarios (e.g. lots of virtual interfaces created and userspace
+ * daemons querying their attributes).
+ *
+ * The idea below is to bail out of the active kernfs_node protection
+ * (kn->active) while trying to take the rtnl lock.
+ *
+ * This replaces rtnl_lock() and still has to be used with rtnl_unlock(). The
+ * net device is guaranteed to be alive if this returns successfully.
+ */
+static int sysfs_rtnl_lock(struct kobject *kobj, struct attribute *attr,
+ struct net_device *ndev)
+{
+ struct kernfs_node *kn;
+ int ret = 0;
+
+ /* First, we hold a reference to the net device as the unregistration
+ * path might run in parallel. This will ensure the net device and the
+ * associated sysfs objects won't be freed while we try to take the rtnl
+ * lock.
+ */
+ dev_hold(ndev);
+ /* sysfs_break_active_protection was introduced to allow self-removal of
+ * devices and their associated sysfs files by bailing out of the
+ * sysfs/kernfs protection. We do this here to allow the unregistration
+ * path to complete in parallel. The following takes a reference on the
+ * kobject and the kernfs_node being accessed.
+ *
+ * This works because we hold a reference onto the net device and the
+ * unregistration path will wait for us eventually in netdev_run_todo
+ * (outside an rtnl lock section).
+ */
+ kn = sysfs_break_active_protection(kobj, attr);
+ /* We can now try to take the rtnl lock. This can't deadlock us as the
+ * unregistration path is able to drain sysfs files (kernfs_node) thanks
+ * to the above dance.
+ */
+ if (rtnl_lock_interruptible()) {
+ ret = -ERESTARTSYS;
+ goto unbreak;
+ }
+ /* Check dismantle on the device hasn't started, otherwise deny the
+ * operation.
+ */
+ if (!dev_isalive(ndev)) {
+ rtnl_unlock();
+ ret = -ENODEV;
+ goto unbreak;
+ }
+ /* We are now sure the device dismantle hasn't started, and that it
+ * can't start before we exit the locking section as we hold the rtnl
+ * lock. There's no need to keep the sysfs protection broken nor to hold
+ * a net device reference from that point; that was only needed to take
+ * the rtnl lock.
+ */
+unbreak:
+ sysfs_unbreak_active_protection(kn);
+ dev_put(ndev);
+
+ return ret;
+}
+
/* use same locking rules as GIF* ioctl's */
static ssize_t netdev_show(const struct device *dev,
struct device_attribute *attr, char *buf,
@@ -95,16 +177,46 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
if (ret)
goto err;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ goto err;
+
+ ret = (*set)(netdev, new);
+ if (ret == 0)
+ ret = len;
+
+ rtnl_unlock();
+ err:
+ return ret;
+}
+
+/* Same as netdev_store() but takes netdev_lock() instead of rtnl_lock() */
+static ssize_t
+netdev_lock_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len,
+ int (*set)(struct net_device *, unsigned long))
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct net *net = dev_net(netdev);
+ unsigned long new;
+ int ret;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = kstrtoul(buf, 0, &new);
+ if (ret)
+ return ret;
+
+ netdev_lock(netdev);
if (dev_isalive(netdev)) {
ret = (*set)(netdev, new);
if (ret == 0)
ret = len;
}
- rtnl_unlock();
- err:
+ netdev_unlock(netdev);
+
return ret;
}
@@ -190,7 +302,7 @@ static ssize_t carrier_store(struct device *dev, struct device_attribute *attr,
struct net_device *netdev = to_net_dev(dev);
/* The check is also done in change_carrier; this helps returning early
- * without hitting the trylock/restart in netdev_store.
+ * without hitting the locking section in netdev_store.
*/
if (!netdev->netdev_ops->ndo_change_carrier)
return -EOPNOTSUPP;
@@ -202,11 +314,13 @@ static ssize_t carrier_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
- int ret = -EINVAL;
+ int ret;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
+ ret = -EINVAL;
if (netif_running(netdev)) {
/* Synchronize carrier state with link watch,
* see also rtnl_getlink().
@@ -215,8 +329,8 @@ static ssize_t carrier_show(struct device *dev,
ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev));
}
- rtnl_unlock();
+ rtnl_unlock();
return ret;
}
static DEVICE_ATTR_RW(carrier);
@@ -228,14 +342,16 @@ static ssize_t speed_show(struct device *dev,
int ret = -EINVAL;
/* The check is also done in __ethtool_get_link_ksettings; this helps
- * returning early without hitting the trylock/restart below.
+ * returning early without hitting the locking section below.
*/
if (!netdev->ethtool_ops->get_link_ksettings)
return ret;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
+ ret = -EINVAL;
if (netif_running(netdev)) {
struct ethtool_link_ksettings cmd;
@@ -254,14 +370,16 @@ static ssize_t duplex_show(struct device *dev,
int ret = -EINVAL;
/* The check is also done in __ethtool_get_link_ksettings; this helps
- * returning early without hitting the trylock/restart below.
+ * returning early without hitting the locking section below.
*/
if (!netdev->ethtool_ops->get_link_ksettings)
return ret;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
+ ret = -EINVAL;
if (netif_running(netdev)) {
struct ethtool_link_ksettings cmd;
@@ -420,7 +538,7 @@ static ssize_t gro_flush_timeout_store(struct device *dev,
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- return netdev_store(dev, attr, buf, len, change_gro_flush_timeout);
+ return netdev_lock_store(dev, attr, buf, len, change_gro_flush_timeout);
}
NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
@@ -440,7 +558,8 @@ static ssize_t napi_defer_hard_irqs_store(struct device *dev,
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs);
+ return netdev_lock_store(dev, attr, buf, len,
+ change_napi_defer_hard_irqs);
}
NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_uint);
@@ -450,7 +569,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
struct net_device *netdev = to_net_dev(dev);
struct net *net = dev_net(netdev);
size_t count = len;
- ssize_t ret = 0;
+ ssize_t ret;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
@@ -459,16 +578,15 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
if (len > 0 && buf[len - 1] == '\n')
--count;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
- if (dev_isalive(netdev)) {
- ret = dev_set_alias(netdev, buf, count);
- if (ret < 0)
- goto err;
- ret = len;
- netdev_state_change(netdev);
- }
+ ret = dev_set_alias(netdev, buf, count);
+ if (ret < 0)
+ goto err;
+ ret = len;
+ netdev_state_change(netdev);
err:
rtnl_unlock();
@@ -480,7 +598,7 @@ static ssize_t ifalias_show(struct device *dev,
{
const struct net_device *netdev = to_net_dev(dev);
char tmp[IFALIASZ];
- ssize_t ret = 0;
+ ssize_t ret;
ret = dev_get_alias(netdev, tmp, sizeof(tmp));
if (ret > 0)
@@ -520,24 +638,23 @@ static ssize_t phys_port_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
- ssize_t ret = -EINVAL;
+ struct netdev_phys_item_id ppid;
+ ssize_t ret;
/* The check is also done in dev_get_phys_port_id; this helps returning
- * early without hitting the trylock/restart below.
+ * early without hitting the locking section below.
*/
if (!netdev->netdev_ops->ndo_get_phys_port_id)
return -EOPNOTSUPP;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
- if (dev_isalive(netdev)) {
- struct netdev_phys_item_id ppid;
+ ret = dev_get_phys_port_id(netdev, &ppid);
+ if (!ret)
+ ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
- ret = dev_get_phys_port_id(netdev, &ppid);
- if (!ret)
- ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
- }
rtnl_unlock();
return ret;
@@ -548,25 +665,24 @@ static ssize_t phys_port_name_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
- ssize_t ret = -EINVAL;
+ char name[IFNAMSIZ];
+ ssize_t ret;
/* The checks are also done in dev_get_phys_port_name; this helps
- * returning early without hitting the trylock/restart below.
+ * returning early without hitting the locking section below.
*/
if (!netdev->netdev_ops->ndo_get_phys_port_name &&
!netdev->devlink_port)
return -EOPNOTSUPP;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
- if (dev_isalive(netdev)) {
- char name[IFNAMSIZ];
+ ret = dev_get_phys_port_name(netdev, name, sizeof(name));
+ if (!ret)
+ ret = sysfs_emit(buf, "%s\n", name);
- ret = dev_get_phys_port_name(netdev, name, sizeof(name));
- if (!ret)
- ret = sysfs_emit(buf, "%s\n", name);
- }
rtnl_unlock();
return ret;
@@ -577,26 +693,25 @@ static ssize_t phys_switch_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
- ssize_t ret = -EINVAL;
+ struct netdev_phys_item_id ppid = { };
+ ssize_t ret;
/* The checks are also done in dev_get_phys_port_name; this helps
- * returning early without hitting the trylock/restart below. This works
+ * returning early without hitting the locking section below. This works
* because recurse is false when calling dev_get_port_parent_id.
*/
if (!netdev->netdev_ops->ndo_get_port_parent_id &&
!netdev->devlink_port)
return -EOPNOTSUPP;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
- if (dev_isalive(netdev)) {
- struct netdev_phys_item_id ppid = { };
+ ret = dev_get_port_parent_id(netdev, &ppid, false);
+ if (!ret)
+ ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
- ret = dev_get_port_parent_id(netdev, &ppid, false);
- if (!ret)
- ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
- }
rtnl_unlock();
return ret;
@@ -638,7 +753,7 @@ static ssize_t threaded_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t len)
{
- return netdev_store(dev, attr, buf, len, modify_napi_threaded);
+ return netdev_lock_store(dev, attr, buf, len, modify_napi_threaded);
}
static DEVICE_ATTR_RW(threaded);
@@ -941,7 +1056,7 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
rcu_read_lock();
flow_table = rcu_dereference(queue->rps_flow_table);
if (flow_table)
- val = (unsigned long)flow_table->mask + 1;
+ val = 1UL << flow_table->log;
rcu_read_unlock();
return sysfs_emit(buf, "%lu\n", val);
@@ -994,7 +1109,7 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
if (!table)
return -ENOMEM;
- table->mask = mask;
+ table->log = ilog2(mask) + 1;
for (count = 0; count <= mask; count++)
table->flows[count].cpu = RPS_NO_CPU;
} else {
@@ -1077,7 +1192,6 @@ static void rx_queue_get_ownership(const struct kobject *kobj,
static const struct kobj_type rx_queue_ktype = {
.sysfs_ops = &rx_queue_sysfs_ops,
.release = rx_queue_release,
- .default_groups = rx_queue_default_groups,
.namespace = rx_queue_namespace,
.get_ownership = rx_queue_get_ownership,
};
@@ -1100,6 +1214,22 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
struct kobject *kobj = &queue->kobj;
int error = 0;
+ /* Rx queues are cleared in rx_queue_release to allow later
+ * re-registration. This is triggered when their kobj refcount is
+ * dropped.
+ *
+ * If a queue is removed while both a read (or write) operation and
+ * the re-addition of the same queue are pending (waiting on rtnl_lock),
+ * it might happen that the re-addition will execute before the read,
+ * so the initial removal never happens (the queue's kobj refcount
+ * won't drop enough because of the pending read). In such a rare case,
+ * return to allow the removal operation to complete.
+ */
+ if (unlikely(kobj->state_initialized)) {
+ netdev_warn_once(dev, "Cannot re-add rx queues before their removal completed");
+ return -EAGAIN;
+ }
+
/* Kobject_put later will trigger rx_queue_release call which
* decreases dev refcount: Take that reference here
*/
@@ -1111,20 +1241,27 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
if (error)
goto err;
+ queue->groups = rx_queue_default_groups;
+ error = sysfs_create_groups(kobj, queue->groups);
+ if (error)
+ goto err;
+
if (dev->sysfs_rx_queue_group) {
error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group);
if (error)
- goto err;
+ goto err_default_groups;
}
error = rx_queue_default_mask(dev, queue);
if (error)
- goto err;
+ goto err_default_groups;
kobject_uevent(kobj, KOBJ_ADD);
return error;
+err_default_groups:
+ sysfs_remove_groups(kobj, queue->groups);
err:
kobject_put(kobj);
return error;
@@ -1169,12 +1306,14 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
}
while (--i >= new_num) {
- struct kobject *kobj = &dev->_rx[i].kobj;
+ struct netdev_rx_queue *queue = &dev->_rx[i];
+ struct kobject *kobj = &queue->kobj;
if (!refcount_read(&dev_net(dev)->ns.count))
kobj->uevent_suppress = 1;
if (dev->sysfs_rx_queue_group)
sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
+ sysfs_remove_groups(kobj, queue->groups);
kobject_put(kobj);
}
@@ -1213,9 +1352,11 @@ static int net_rx_queue_change_owner(struct net_device *dev, int num,
*/
struct netdev_queue_attribute {
struct attribute attr;
- ssize_t (*show)(struct netdev_queue *queue, char *buf);
- ssize_t (*store)(struct netdev_queue *queue,
- const char *buf, size_t len);
+ ssize_t (*show)(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf);
+ ssize_t (*store)(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len);
};
#define to_netdev_queue_attr(_attr) \
container_of(_attr, struct netdev_queue_attribute, attr)
@@ -1232,7 +1373,7 @@ static ssize_t netdev_queue_attr_show(struct kobject *kobj,
if (!attribute->show)
return -EIO;
- return attribute->show(queue, buf);
+ return attribute->show(kobj, attr, queue, buf);
}
static ssize_t netdev_queue_attr_store(struct kobject *kobj,
@@ -1246,7 +1387,7 @@ static ssize_t netdev_queue_attr_store(struct kobject *kobj,
if (!attribute->store)
return -EIO;
- return attribute->store(queue, buf, count);
+ return attribute->store(kobj, attr, queue, buf, count);
}
static const struct sysfs_ops netdev_queue_sysfs_ops = {
@@ -1254,7 +1395,8 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = {
.store = netdev_queue_attr_store,
};
-static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf)
+static ssize_t tx_timeout_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout);
@@ -1272,18 +1414,18 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
return i;
}
-static ssize_t traffic_class_show(struct netdev_queue *queue,
- char *buf)
+static ssize_t traffic_class_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct net_device *dev = queue->dev;
- int num_tc, tc;
- int index;
+ int num_tc, tc, index, ret;
if (!netif_is_multiqueue(dev))
return -ENOENT;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(kobj, attr, queue->dev);
+ if (ret)
+ return ret;
index = get_netdev_queue_index(queue);
@@ -1310,24 +1452,25 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
}
#ifdef CONFIG_XPS
-static ssize_t tx_maxrate_show(struct netdev_queue *queue,
- char *buf)
+static ssize_t tx_maxrate_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
return sysfs_emit(buf, "%lu\n", queue->tx_maxrate);
}
-static ssize_t tx_maxrate_store(struct netdev_queue *queue,
- const char *buf, size_t len)
+static ssize_t tx_maxrate_store(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
{
- struct net_device *dev = queue->dev;
int err, index = get_netdev_queue_index(queue);
+ struct net_device *dev = queue->dev;
u32 rate = 0;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
/* The check is also done later; this helps returning early without
- * hitting the trylock/restart below.
+ * hitting the locking section below.
*/
if (!dev->netdev_ops->ndo_set_tx_maxrate)
return -EOPNOTSUPP;
@@ -1336,18 +1479,23 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue,
if (err < 0)
return err;
- if (!rtnl_trylock())
- return restart_syscall();
+ err = sysfs_rtnl_lock(kobj, attr, dev);
+ if (err)
+ return err;
err = -EOPNOTSUPP;
+ netdev_lock_ops(dev);
if (dev->netdev_ops->ndo_set_tx_maxrate)
err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate);
+ netdev_unlock_ops(dev);
- rtnl_unlock();
if (!err) {
queue->tx_maxrate = rate;
+ rtnl_unlock();
return len;
}
+
+ rtnl_unlock();
return err;
}
@@ -1391,16 +1539,17 @@ static ssize_t bql_set(const char *buf, const size_t count,
return count;
}
-static ssize_t bql_show_hold_time(struct netdev_queue *queue,
- char *buf)
+static ssize_t bql_show_hold_time(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct dql *dql = &queue->dql;
return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
}
-static ssize_t bql_set_hold_time(struct netdev_queue *queue,
- const char *buf, size_t len)
+static ssize_t bql_set_hold_time(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
{
struct dql *dql = &queue->dql;
unsigned int value;
@@ -1419,15 +1568,17 @@ static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init
= __ATTR(hold_time, 0644,
bql_show_hold_time, bql_set_hold_time);
-static ssize_t bql_show_stall_thrs(struct netdev_queue *queue, char *buf)
+static ssize_t bql_show_stall_thrs(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct dql *dql = &queue->dql;
return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs));
}
-static ssize_t bql_set_stall_thrs(struct netdev_queue *queue,
- const char *buf, size_t len)
+static ssize_t bql_set_stall_thrs(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
{
struct dql *dql = &queue->dql;
unsigned int value;
@@ -1453,13 +1604,15 @@ static ssize_t bql_set_stall_thrs(struct netdev_queue *queue,
static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init =
__ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs);
-static ssize_t bql_show_stall_max(struct netdev_queue *queue, char *buf)
+static ssize_t bql_show_stall_max(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max));
}
-static ssize_t bql_set_stall_max(struct netdev_queue *queue,
- const char *buf, size_t len)
+static ssize_t bql_set_stall_max(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
{
WRITE_ONCE(queue->dql.stall_max, 0);
return len;
@@ -1468,7 +1621,8 @@ static ssize_t bql_set_stall_max(struct netdev_queue *queue,
static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init =
__ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max);
-static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf)
+static ssize_t bql_show_stall_cnt(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct dql *dql = &queue->dql;
@@ -1478,8 +1632,8 @@ static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf)
static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init =
__ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL);
-static ssize_t bql_show_inflight(struct netdev_queue *queue,
- char *buf)
+static ssize_t bql_show_inflight(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct dql *dql = &queue->dql;
@@ -1490,13 +1644,16 @@ static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init =
__ATTR(inflight, 0444, bql_show_inflight, NULL);
#define BQL_ATTR(NAME, FIELD) \
-static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \
- char *buf) \
+static ssize_t bql_show_ ## NAME(struct kobject *kobj, \
+ struct attribute *attr, \
+ struct netdev_queue *queue, char *buf) \
{ \
return bql_show(buf, queue->dql.FIELD); \
} \
\
-static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \
+static ssize_t bql_set_ ## NAME(struct kobject *kobj, \
+ struct attribute *attr, \
+ struct netdev_queue *queue, \
const char *buf, size_t len) \
{ \
return bql_set(buf, len, &queue->dql.FIELD); \
@@ -1582,19 +1739,21 @@ out_no_maps:
return len < PAGE_SIZE ? len : -EINVAL;
}
-static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf)
+static ssize_t xps_cpus_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct net_device *dev = queue->dev;
unsigned int index;
- int len, tc;
+ int len, tc, ret;
if (!netif_is_multiqueue(dev))
return -ENOENT;
index = get_netdev_queue_index(queue);
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(kobj, attr, queue->dev);
+ if (ret)
+ return ret;
/* If queue belongs to subordinate dev use its map */
dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
@@ -1605,18 +1764,21 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf)
return -EINVAL;
}
- /* Make sure the subordinate device can't be freed */
- get_device(&dev->dev);
+ /* Increase the net device refcnt to make sure it won't be freed while
+ * xps_queue_show is running.
+ */
+ dev_hold(dev);
rtnl_unlock();
len = xps_queue_show(dev, index, tc, buf, XPS_CPUS);
- put_device(&dev->dev);
+ dev_put(dev);
return len;
}
-static ssize_t xps_cpus_store(struct netdev_queue *queue,
- const char *buf, size_t len)
+static ssize_t xps_cpus_store(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
{
struct net_device *dev = queue->dev;
unsigned int index;
@@ -1640,9 +1802,10 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
return err;
}
- if (!rtnl_trylock()) {
+ err = sysfs_rtnl_lock(kobj, attr, dev);
+ if (err) {
free_cpumask_var(mask);
- return restart_syscall();
+ return err;
}
err = netif_set_xps_queue(dev, mask, index);
@@ -1656,26 +1819,34 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
= __ATTR_RW(xps_cpus);
-static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
+static ssize_t xps_rxqs_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct net_device *dev = queue->dev;
unsigned int index;
- int tc;
+ int tc, ret;
index = get_netdev_queue_index(queue);
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(kobj, attr, dev);
+ if (ret)
+ return ret;
tc = netdev_txq_to_tc(dev, index);
+
+ /* Increase the net device refcnt to make sure it won't be freed while
+ * xps_queue_show is running.
+ */
+ dev_hold(dev);
rtnl_unlock();
- if (tc < 0)
- return -EINVAL;
- return xps_queue_show(dev, index, tc, buf, XPS_RXQS);
+ ret = tc >= 0 ? xps_queue_show(dev, index, tc, buf, XPS_RXQS) : -EINVAL;
+ dev_put(dev);
+ return ret;
}
-static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
+static ssize_t xps_rxqs_store(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
size_t len)
{
struct net_device *dev = queue->dev;
@@ -1699,9 +1870,10 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
return err;
}
- if (!rtnl_trylock()) {
+ err = sysfs_rtnl_lock(kobj, attr, dev);
+ if (err) {
bitmap_free(mask);
- return restart_syscall();
+ return err;
}
cpus_read_lock();
@@ -1761,7 +1933,6 @@ static void netdev_queue_get_ownership(const struct kobject *kobj,
static const struct kobj_type netdev_queue_ktype = {
.sysfs_ops = &netdev_queue_sysfs_ops,
.release = netdev_queue_release,
- .default_groups = netdev_queue_default_groups,
.namespace = netdev_queue_namespace,
.get_ownership = netdev_queue_get_ownership,
};
@@ -1780,6 +1951,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index)
struct kobject *kobj = &queue->kobj;
int error = 0;
+ /* Tx queues are cleared in netdev_queue_release to allow later
+ * re-registration. This is triggered when their kobj refcount is
+ * dropped.
+ *
+ * If a queue is removed while both a read (or write) operation and
+ * the re-addition of the same queue are pending (waiting on rtnl_lock),
+ * it might happen that the re-addition will execute before the read,
+ * so the initial removal never happens (the queue's kobj refcount
+ * won't drop enough because of the pending read). In such a rare case,
+ * return to allow the removal operation to complete.
+ */
+ if (unlikely(kobj->state_initialized)) {
+ netdev_warn_once(dev, "Cannot re-add tx queues before their removal completed");
+ return -EAGAIN;
+ }
+
/* Kobject_put later will trigger netdev_queue_release call
* which decreases dev refcount: Take that reference here
*/
@@ -1791,15 +1978,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index)
if (error)
goto err;
+ queue->groups = netdev_queue_default_groups;
+ error = sysfs_create_groups(kobj, queue->groups);
+ if (error)
+ goto err;
+
if (netdev_uses_bql(dev)) {
error = sysfs_create_group(kobj, &dql_group);
if (error)
- goto err;
+ goto err_default_groups;
}
kobject_uevent(kobj, KOBJ_ADD);
return 0;
+err_default_groups:
+ sysfs_remove_groups(kobj, queue->groups);
err:
kobject_put(kobj);
return error;
@@ -1854,6 +2048,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
if (netdev_uses_bql(dev))
sysfs_remove_group(&queue->kobj, &dql_group);
+ sysfs_remove_groups(&queue->kobj, queue->groups);
kobject_put(&queue->kobj);
}
@@ -1953,8 +2148,10 @@ static void remove_queue_kobjects(struct net_device *dev)
net_rx_queue_update_kobjects(dev, real_rx, 0);
netdev_queue_update_kobjects(dev, real_tx, 0);
+ netdev_lock_ops(dev);
dev->real_num_rx_queues = 0;
dev->real_num_tx_queues = 0;
+ netdev_unlock_ops(dev);
#ifdef CONFIG_SYSFS
kset_unregister(dev->queues_kset);
#endif
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index b5cd3ae4f04c..ae54f26709ca 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -163,16 +163,45 @@ static void ops_pre_exit_list(const struct pernet_operations *ops,
}
}
+static void ops_exit_rtnl_list(const struct list_head *ops_list,
+ const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ const struct pernet_operations *saved_ops = ops;
+ LIST_HEAD(dev_kill_list);
+ struct net *net;
+
+ rtnl_lock();
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ __rtnl_net_lock(net);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, ops_list, list) {
+ if (ops->exit_rtnl)
+ ops->exit_rtnl(net, &dev_kill_list);
+ }
+
+ __rtnl_net_unlock(net);
+ }
+
+ unregister_netdevice_many(&dev_kill_list);
+
+ rtnl_unlock();
+}
+
static void ops_exit_list(const struct pernet_operations *ops,
struct list_head *net_exit_list)
{
- struct net *net;
if (ops->exit) {
+ struct net *net;
+
list_for_each_entry(net, net_exit_list, exit_list) {
ops->exit(net);
cond_resched();
}
}
+
if (ops->exit_batch)
ops->exit_batch(net_exit_list);
}
@@ -188,6 +217,56 @@ static void ops_free_list(const struct pernet_operations *ops,
}
}
+static void ops_undo_list(const struct list_head *ops_list,
+ const struct pernet_operations *ops,
+ struct list_head *net_exit_list,
+ bool expedite_rcu)
+{
+ const struct pernet_operations *saved_ops;
+ bool hold_rtnl = false;
+
+ if (!ops)
+ ops = list_entry(ops_list, typeof(*ops), list);
+
+ saved_ops = ops;
+
+ list_for_each_entry_continue_reverse(ops, ops_list, list) {
+ hold_rtnl |= !!ops->exit_rtnl;
+ ops_pre_exit_list(ops, net_exit_list);
+ }
+
+ /* Another CPU might be rcu-iterating the list, wait for it.
+ * This needs to be before calling the exit() notifiers, so the
+ * rcu_barrier() after ops_undo_list() isn't sufficient alone.
+ * Also the pre_exit() and exit() methods need this barrier.
+ */
+ if (expedite_rcu)
+ synchronize_rcu_expedited();
+ else
+ synchronize_rcu();
+
+ if (hold_rtnl)
+ ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, ops_list, list)
+ ops_exit_list(ops, net_exit_list);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, ops_list, list)
+ ops_free_list(ops, net_exit_list);
+}
+
+static void ops_undo_single(struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ LIST_HEAD(ops_list);
+
+ list_add(&ops->list, &ops_list);
+ ops_undo_list(&ops_list, NULL, net_exit_list, false);
+ list_del(&ops->list);
+}
+
/* should be called with nsid_lock held */
static int alloc_netid(struct net *net, struct net *peer, int reqid)
{
@@ -340,6 +419,8 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_
lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL);
#endif
+ INIT_LIST_HEAD(&net->ptype_all);
+ INIT_LIST_HEAD(&net->ptype_specific);
preinit_net_sysctl(net);
}
@@ -349,9 +430,8 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_
static __net_init int setup_net(struct net *net)
{
/* Must be called with pernet_ops_rwsem held */
- const struct pernet_operations *ops, *saved_ops;
+ const struct pernet_operations *ops;
LIST_HEAD(net_exit_list);
- LIST_HEAD(dev_kill_list);
int error = 0;
preempt_disable();
@@ -374,29 +454,7 @@ out_undo:
* for the pernet modules whose init functions did not fail.
*/
list_add(&net->exit_list, &net_exit_list);
- saved_ops = ops;
- list_for_each_entry_continue_reverse(ops, &pernet_list, list)
- ops_pre_exit_list(ops, &net_exit_list);
-
- synchronize_rcu();
-
- ops = saved_ops;
- rtnl_lock();
- list_for_each_entry_continue_reverse(ops, &pernet_list, list) {
- if (ops->exit_batch_rtnl)
- ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list);
- }
- unregister_netdevice_many(&dev_kill_list);
- rtnl_unlock();
-
- ops = saved_ops;
- list_for_each_entry_continue_reverse(ops, &pernet_list, list)
- ops_exit_list(ops, &net_exit_list);
-
- ops = saved_ops;
- list_for_each_entry_continue_reverse(ops, &pernet_list, list)
- ops_free_list(ops, &net_exit_list);
-
+ ops_undo_list(&pernet_list, ops, &net_exit_list, false);
rcu_barrier();
goto out;
}
@@ -464,7 +522,7 @@ static void net_complete_free(void)
}
-static void net_free(struct net *net)
+void net_passive_dec(struct net *net)
{
if (refcount_dec_and_test(&net->passive)) {
kfree(rcu_access_pointer(net->gen));
@@ -482,7 +540,7 @@ void net_drop_ns(void *p)
struct net *net = (struct net *)p;
if (net)
- net_free(net);
+ net_passive_dec(net);
}
struct net *copy_net_ns(unsigned long flags,
@@ -523,7 +581,7 @@ put_userns:
key_remove_domain(net->key_domain);
#endif
put_user_ns(user_ns);
- net_free(net);
+ net_passive_dec(net);
dec_ucounts:
dec_net_namespaces(ucounts);
return ERR_PTR(rv);
@@ -588,13 +646,15 @@ static void unhash_nsid(struct net *net, struct net *last)
static LLIST_HEAD(cleanup_list);
+struct task_struct *cleanup_net_task;
+
static void cleanup_net(struct work_struct *work)
{
- const struct pernet_operations *ops;
- struct net *net, *tmp, *last;
struct llist_node *net_kill_list;
+ struct net *net, *tmp, *last;
LIST_HEAD(net_exit_list);
- LIST_HEAD(dev_kill_list);
+
+ WRITE_ONCE(cleanup_net_task, current);
/* Atomically snapshot the list of namespaces to cleanup */
net_kill_list = llist_del_all(&cleanup_list);
@@ -623,33 +683,7 @@ static void cleanup_net(struct work_struct *work)
list_add_tail(&net->exit_list, &net_exit_list);
}
- /* Run all of the network namespace pre_exit methods */
- list_for_each_entry_reverse(ops, &pernet_list, list)
- ops_pre_exit_list(ops, &net_exit_list);
-
- /*
- * Another CPU might be rcu-iterating the list, wait for it.
- * This needs to be before calling the exit() notifiers, so
- * the rcu_barrier() below isn't sufficient alone.
- * Also the pre_exit() and exit() methods need this barrier.
- */
- synchronize_rcu_expedited();
-
- rtnl_lock();
- list_for_each_entry_reverse(ops, &pernet_list, list) {
- if (ops->exit_batch_rtnl)
- ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list);
- }
- unregister_netdevice_many(&dev_kill_list);
- rtnl_unlock();
-
- /* Run all of the network namespace exit methods */
- list_for_each_entry_reverse(ops, &pernet_list, list)
- ops_exit_list(ops, &net_exit_list);
-
- /* Free the net generic variables */
- list_for_each_entry_reverse(ops, &pernet_list, list)
- ops_free_list(ops, &net_exit_list);
+ ops_undo_list(&pernet_list, NULL, &net_exit_list, true);
up_read(&pernet_ops_rwsem);
@@ -668,8 +702,9 @@ static void cleanup_net(struct work_struct *work)
key_remove_domain(net->key_domain);
#endif
put_user_ns(net->user_ns);
- net_free(net);
+ net_passive_dec(net);
}
+ WRITE_ONCE(cleanup_net_task, NULL);
}
/**
@@ -1232,31 +1267,13 @@ void __init net_ns_init(void)
rtnl_register_many(net_ns_rtnl_msg_handlers);
}
-static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list)
-{
- ops_pre_exit_list(ops, net_exit_list);
- synchronize_rcu();
-
- if (ops->exit_batch_rtnl) {
- LIST_HEAD(dev_kill_list);
-
- rtnl_lock();
- ops->exit_batch_rtnl(net_exit_list, &dev_kill_list);
- unregister_netdevice_many(&dev_kill_list);
- rtnl_unlock();
- }
- ops_exit_list(ops, net_exit_list);
-
- ops_free_list(ops, net_exit_list);
-}
-
#ifdef CONFIG_NET_NS
static int __register_pernet_operations(struct list_head *list,
struct pernet_operations *ops)
{
+ LIST_HEAD(net_exit_list);
struct net *net;
int error;
- LIST_HEAD(net_exit_list);
list_add_tail(&ops->list, list);
if (ops->init || ops->id) {
@@ -1275,21 +1292,21 @@ static int __register_pernet_operations(struct list_head *list,
out_undo:
/* If I have an error cleanup all namespaces I initialized */
list_del(&ops->list);
- free_exit_list(ops, &net_exit_list);
+ ops_undo_single(ops, &net_exit_list);
return error;
}
static void __unregister_pernet_operations(struct pernet_operations *ops)
{
- struct net *net;
LIST_HEAD(net_exit_list);
+ struct net *net;
- list_del(&ops->list);
/* See comment in __register_pernet_operations() */
for_each_net(net)
list_add_tail(&net->exit_list, &net_exit_list);
- free_exit_list(ops, &net_exit_list);
+ list_del(&ops->list);
+ ops_undo_single(ops, &net_exit_list);
}
#else
@@ -1311,8 +1328,9 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
list_del(&ops->list);
} else {
LIST_HEAD(net_exit_list);
+
list_add(&init_net.exit_list, &net_exit_list);
- free_exit_list(ops, &net_exit_list);
+ ops_undo_single(ops, &net_exit_list);
}
}
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
index 996ac6a449eb..4fc44587f493 100644
--- a/net/core/netdev-genl-gen.c
+++ b/net/core/netdev-genl-gen.c
@@ -9,7 +9,7 @@
#include "netdev-genl-gen.h"
#include <uapi/linux/netdev.h>
-#include <linux/list.h>
+#include <net/netdev_netlink.h>
/* Integer value ranges */
static const struct netlink_range_validation netdev_a_page_pool_id_range = {
@@ -99,6 +99,12 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPE
[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, },
};
+/* NETDEV_CMD_BIND_TX - do */
+static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = {
+ [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
+};
+
/* Ops table for netdev */
static const struct genl_split_ops netdev_nl_ops[] = {
{
@@ -190,6 +196,13 @@ static const struct genl_split_ops netdev_nl_ops[] = {
.maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
.flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
},
+ {
+ .cmd = NETDEV_CMD_BIND_TX,
+ .doit = netdev_nl_bind_tx_doit,
+ .policy = netdev_bind_tx_nl_policy,
+ .maxattr = NETDEV_A_DMABUF_FD,
+ .flags = GENL_CMD_CAP_DO,
+ },
};
static const struct genl_multicast_group netdev_nl_mcgrps[] = {
@@ -217,7 +230,7 @@ struct genl_family netdev_nl_family __ro_after_init = {
.n_split_ops = ARRAY_SIZE(netdev_nl_ops),
.mcgrps = netdev_nl_mcgrps,
.n_mcgrps = ARRAY_SIZE(netdev_nl_mcgrps),
- .sock_priv_size = sizeof(struct list_head),
+ .sock_priv_size = sizeof(struct netdev_nl_sock),
.sock_priv_init = __netdev_nl_sock_priv_init,
.sock_priv_destroy = __netdev_nl_sock_priv_destroy,
};
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
index e09dd7539ff2..cf3fad74511f 100644
--- a/net/core/netdev-genl-gen.h
+++ b/net/core/netdev-genl-gen.h
@@ -10,7 +10,7 @@
#include <net/genetlink.h>
#include <uapi/linux/netdev.h>
-#include <linux/list.h>
+#include <net/netdev_netlink.h>
/* Common nested types */
extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1];
@@ -34,6 +34,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
struct netlink_callback *cb);
int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info);
int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info);
enum {
NETDEV_NLGRP_MGMT,
@@ -42,7 +43,7 @@ enum {
extern struct genl_family netdev_nl_family;
-void netdev_nl_sock_priv_init(struct list_head *priv);
-void netdev_nl_sock_priv_destroy(struct list_head *priv);
+void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv);
+void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv);
#endif /* _LINUX_NETDEV_GEN_H */
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index a3bdaf075b6b..2afa7b2141aa 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -10,6 +10,7 @@
#include <net/sock.h>
#include <net/xdp.h>
#include <net/xdp_sock.h>
+#include <net/page_pool/memory_provider.h>
#include "dev.h"
#include "devmem.h"
@@ -37,6 +38,8 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
u64 xdp_rx_meta = 0;
void *hdr;
+ netdev_assert_locked(netdev); /* note: rtnl_lock may not be held! */
+
hdr = genlmsg_iput(rsp, info);
if (!hdr)
return -EMSGSIZE;
@@ -52,6 +55,8 @@ XDP_METADATA_KFUNC_xxx
xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP;
if (netdev->xsk_tx_metadata_ops->tmo_request_checksum)
xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM;
+ if (netdev->xsk_tx_metadata_ops->tmo_request_launch_time)
+ xsk_features |= NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO;
}
if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
@@ -119,15 +124,14 @@ int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info)
if (!rsp)
return -ENOMEM;
- rtnl_lock();
-
- netdev = __dev_get_by_index(genl_info_net(info), ifindex);
- if (netdev)
- err = netdev_nl_dev_fill(netdev, rsp, info);
- else
+ netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ if (!netdev) {
err = -ENODEV;
+ goto err_free_msg;
+ }
- rtnl_unlock();
+ err = netdev_nl_dev_fill(netdev, rsp, info);
+ netdev_unlock(netdev);
if (err)
goto err_free_msg;
@@ -143,18 +147,15 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
struct net *net = sock_net(skb->sk);
- struct net_device *netdev;
- int err = 0;
+ int err;
- rtnl_lock();
- for_each_netdev_dump(net, netdev, ctx->ifindex) {
+ for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) {
err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb));
if (err < 0)
- break;
+ return err;
}
- rtnl_unlock();
- return err;
+ return 0;
}
static int
@@ -167,7 +168,7 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
void *hdr;
pid_t pid;
- if (!(napi->dev->flags & IFF_UP))
+ if (!napi->dev->up)
return 0;
hdr = genlmsg_iput(rsp, info);
@@ -229,20 +230,15 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info)
if (!rsp)
return -ENOMEM;
- rtnl_lock();
- rcu_read_lock();
-
- napi = netdev_napi_by_id(genl_info_net(info), napi_id);
+ napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id);
if (napi) {
err = netdev_nl_napi_fill_one(rsp, napi, info);
+ netdev_unlock(napi->dev);
} else {
NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]);
err = -ENOENT;
}
- rcu_read_unlock();
- rtnl_unlock();
-
if (err) {
goto err_free_msg;
} else if (!rsp->len) {
@@ -263,14 +259,21 @@ netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp,
struct netdev_nl_dump_ctx *ctx)
{
struct napi_struct *napi;
+ unsigned int prev_id;
int err = 0;
- if (!(netdev->flags & IFF_UP))
+ if (!netdev->up)
return err;
+ prev_id = UINT_MAX;
list_for_each_entry(napi, &netdev->napi_list, dev_list) {
- if (napi->napi_id < MIN_NAPI_ID)
+ if (!napi_id_valid(napi->napi_id))
continue;
+
+ /* Dump continuation below depends on the list being sorted */
+ WARN_ON_ONCE(napi->napi_id >= prev_id);
+ prev_id = napi->napi_id;
+
if (ctx->napi_id && napi->napi_id >= ctx->napi_id)
continue;
@@ -294,22 +297,22 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
if (info->attrs[NETDEV_A_NAPI_IFINDEX])
ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]);
- rtnl_lock();
if (ifindex) {
- netdev = __dev_get_by_index(net, ifindex);
- if (netdev)
+ netdev = netdev_get_by_index_lock(net, ifindex);
+ if (netdev) {
err = netdev_nl_napi_dump_one(netdev, skb, info, ctx);
- else
+ netdev_unlock(netdev);
+ } else {
err = -ENODEV;
+ }
} else {
- for_each_netdev_dump(net, netdev, ctx->ifindex) {
+ for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) {
err = netdev_nl_napi_dump_one(netdev, skb, info, ctx);
if (err < 0)
break;
ctx->napi_id = 0;
}
}
- rtnl_unlock();
return err;
}
@@ -350,28 +353,30 @@ int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info)
napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);
- rtnl_lock();
- rcu_read_lock();
-
- napi = netdev_napi_by_id(genl_info_net(info), napi_id);
+ napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id);
if (napi) {
err = netdev_nl_napi_set_config(napi, info);
+ netdev_unlock(napi->dev);
} else {
NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]);
err = -ENOENT;
}
- rcu_read_unlock();
- rtnl_unlock();
-
return err;
}
+static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi)
+{
+ if (napi && napi_id_valid(napi->napi_id))
+ return nla_put_u32(skb, NETDEV_A_QUEUE_NAPI_ID, napi->napi_id);
+ return 0;
+}
+
static int
netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
u32 q_idx, u32 q_type, const struct genl_info *info)
{
- struct net_devmem_dmabuf_binding *binding;
+ struct pp_memory_provider_params *params;
struct netdev_rx_queue *rxq;
struct netdev_queue *txq;
void *hdr;
@@ -388,21 +393,30 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
switch (q_type) {
case NETDEV_QUEUE_TYPE_RX:
rxq = __netif_get_rx_queue(netdev, q_idx);
- if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID,
- rxq->napi->napi_id))
+ if (nla_put_napi_id(rsp, rxq->napi))
goto nla_put_failure;
- binding = rxq->mp_params.mp_priv;
- if (binding &&
- nla_put_u32(rsp, NETDEV_A_QUEUE_DMABUF, binding->id))
+ params = &rxq->mp_params;
+ if (params->mp_ops &&
+ params->mp_ops->nl_fill(params->mp_priv, rsp, rxq))
goto nla_put_failure;
+#ifdef CONFIG_XDP_SOCKETS
+ if (rxq->pool)
+ if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK))
+ goto nla_put_failure;
+#endif
break;
case NETDEV_QUEUE_TYPE_TX:
txq = netdev_get_tx_queue(netdev, q_idx);
- if (txq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID,
- txq->napi->napi_id))
+ if (nla_put_napi_id(rsp, txq->napi))
goto nla_put_failure;
+#ifdef CONFIG_XDP_SOCKETS
+ if (txq->pool)
+ if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK))
+ goto nla_put_failure;
+#endif
+ break;
}
genlmsg_end(rsp, hdr);
@@ -435,7 +449,7 @@ netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx,
{
int err;
- if (!(netdev->flags & IFF_UP))
+ if (!netdev->up)
return -ENOENT;
err = netdev_nl_queue_validate(netdev, q_idx, q_type);
@@ -465,15 +479,14 @@ int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info)
if (!rsp)
return -ENOMEM;
- rtnl_lock();
-
- netdev = __dev_get_by_index(genl_info_net(info), ifindex);
- if (netdev)
+ netdev = netdev_get_by_index_lock_ops_compat(genl_info_net(info),
+ ifindex);
+ if (netdev) {
err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info);
- else
+ netdev_unlock_ops_compat(netdev);
+ } else {
err = -ENODEV;
-
- rtnl_unlock();
+ }
if (err)
goto err_free_msg;
@@ -492,7 +505,7 @@ netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp,
{
int err = 0;
- if (!(netdev->flags & IFF_UP))
+ if (!netdev->up)
return err;
for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) {
@@ -523,15 +536,17 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
if (info->attrs[NETDEV_A_QUEUE_IFINDEX])
ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]);
- rtnl_lock();
if (ifindex) {
- netdev = __dev_get_by_index(net, ifindex);
- if (netdev)
+ netdev = netdev_get_by_index_lock_ops_compat(net, ifindex);
+ if (netdev) {
err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
- else
+ netdev_unlock_ops_compat(netdev);
+ } else {
err = -ENODEV;
+ }
} else {
- for_each_netdev_dump(net, netdev, ctx->ifindex) {
+ for_each_netdev_lock_ops_compat_scoped(net, netdev,
+ ctx->ifindex) {
err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
if (err < 0)
break;
@@ -539,7 +554,6 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
ctx->txq_idx = 0;
}
}
- rtnl_unlock();
return err;
}
@@ -575,6 +589,7 @@ netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx)
netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) ||
netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) ||
netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_COMPLETE, rx->csum_complete) ||
netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) ||
netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) ||
netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) ||
@@ -687,25 +702,66 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp,
return 0;
}
+/**
+ * netdev_stat_queue_sum() - add up queue stats from range of queues
+ * @netdev: net_device
+ * @rx_start: index of the first Rx queue to query
+ * @rx_end: index after the last Rx queue (first *not* to query)
+ * @rx_sum: output Rx stats, should be already initialized
+ * @tx_start: index of the first Tx queue to query
+ * @tx_end: index after the last Tx queue (first *not* to query)
+ * @tx_sum: output Tx stats, should be already initialized
+ *
+ * Add stats from [start, end) range of queue IDs to *x_sum structs.
+ * The sum structs must be already initialized. Usually this
+ * helper is invoked from the .get_base_stats callbacks of drivers
+ * to account for stats of disabled queues. In that case the ranges
+ * are usually [netdev->real_num_*x_queues, netdev->num_*x_queues).
+ */
+void netdev_stat_queue_sum(struct net_device *netdev,
+ int rx_start, int rx_end,
+ struct netdev_queue_stats_rx *rx_sum,
+ int tx_start, int tx_end,
+ struct netdev_queue_stats_tx *tx_sum)
+{
+ const struct netdev_stat_ops *ops;
+ struct netdev_queue_stats_rx rx;
+ struct netdev_queue_stats_tx tx;
+ int i;
+
+ ops = netdev->stat_ops;
+
+ for (i = rx_start; i < rx_end; i++) {
+ memset(&rx, 0xff, sizeof(rx));
+ if (ops->get_queue_stats_rx)
+ ops->get_queue_stats_rx(netdev, i, &rx);
+ netdev_nl_stats_add(rx_sum, &rx, sizeof(rx));
+ }
+ for (i = tx_start; i < tx_end; i++) {
+ memset(&tx, 0xff, sizeof(tx));
+ if (ops->get_queue_stats_tx)
+ ops->get_queue_stats_tx(netdev, i, &tx);
+ netdev_nl_stats_add(tx_sum, &tx, sizeof(tx));
+ }
+}
+EXPORT_SYMBOL(netdev_stat_queue_sum);
+
static int
netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp,
const struct genl_info *info)
{
- struct netdev_queue_stats_rx rx_sum, rx;
- struct netdev_queue_stats_tx tx_sum, tx;
- const struct netdev_stat_ops *ops;
+ struct netdev_queue_stats_rx rx_sum;
+ struct netdev_queue_stats_tx tx_sum;
void *hdr;
- int i;
- ops = netdev->stat_ops;
/* Netdev can't guarantee any complete counters */
- if (!ops->get_base_stats)
+ if (!netdev->stat_ops->get_base_stats)
return 0;
memset(&rx_sum, 0xff, sizeof(rx_sum));
memset(&tx_sum, 0xff, sizeof(tx_sum));
- ops->get_base_stats(netdev, &rx_sum, &tx_sum);
+ netdev->stat_ops->get_base_stats(netdev, &rx_sum, &tx_sum);
/* The op was there, but nothing reported, don't bother */
if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) &&
@@ -718,18 +774,8 @@ netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp,
if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex))
goto nla_put_failure;
- for (i = 0; i < netdev->real_num_rx_queues; i++) {
- memset(&rx, 0xff, sizeof(rx));
- if (ops->get_queue_stats_rx)
- ops->get_queue_stats_rx(netdev, i, &rx);
- netdev_nl_stats_add(&rx_sum, &rx, sizeof(rx));
- }
- for (i = 0; i < netdev->real_num_tx_queues; i++) {
- memset(&tx, 0xff, sizeof(tx));
- if (ops->get_queue_stats_tx)
- ops->get_queue_stats_tx(netdev, i, &tx);
- netdev_nl_stats_add(&tx_sum, &tx, sizeof(tx));
- }
+ netdev_stat_queue_sum(netdev, 0, netdev->real_num_rx_queues, &rx_sum,
+ 0, netdev->real_num_tx_queues, &tx_sum);
if (netdev_nl_stats_write_rx(rsp, &rx_sum) ||
netdev_nl_stats_write_tx(rsp, &tx_sum))
@@ -780,26 +826,31 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
if (info->attrs[NETDEV_A_QSTATS_IFINDEX])
ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]);
- rtnl_lock();
if (ifindex) {
- netdev = __dev_get_by_index(net, ifindex);
- if (netdev && netdev->stat_ops) {
- err = netdev_nl_qstats_get_dump_one(netdev, scope, skb,
- info, ctx);
- } else {
+ netdev = netdev_get_by_index_lock_ops_compat(net, ifindex);
+ if (!netdev) {
NL_SET_BAD_ATTR(info->extack,
info->attrs[NETDEV_A_QSTATS_IFINDEX]);
- err = netdev ? -EOPNOTSUPP : -ENODEV;
+ return -ENODEV;
}
- } else {
- for_each_netdev_dump(net, netdev, ctx->ifindex) {
+ if (netdev->stat_ops) {
err = netdev_nl_qstats_get_dump_one(netdev, scope, skb,
info, ctx);
- if (err < 0)
- break;
+ } else {
+ NL_SET_BAD_ATTR(info->extack,
+ info->attrs[NETDEV_A_QSTATS_IFINDEX]);
+ err = -EOPNOTSUPP;
}
+ netdev_unlock_ops_compat(netdev);
+ return err;
+ }
+
+ for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) {
+ err = netdev_nl_qstats_get_dump_one(netdev, scope, skb,
+ info, ctx);
+ if (err < 0)
+ break;
}
- rtnl_unlock();
return err;
}
@@ -808,8 +859,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)];
struct net_devmem_dmabuf_binding *binding;
- struct list_head *sock_binding_list;
u32 ifindex, dmabuf_fd, rxq_idx;
+ struct netdev_nl_sock *priv;
struct net_device *netdev;
struct sk_buff *rsp;
struct nlattr *attr;
@@ -824,10 +875,9 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);
- sock_binding_list = genl_sk_priv_get(&netdev_nl_family,
- NETLINK_CB(skb).sk);
- if (IS_ERR(sock_binding_list))
- return PTR_ERR(sock_binding_list);
+ priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk);
+ if (IS_ERR(priv))
+ return PTR_ERR(priv);
rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!rsp)
@@ -839,21 +889,26 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
goto err_genlmsg_free;
}
- rtnl_lock();
+ mutex_lock(&priv->lock);
- netdev = __dev_get_by_index(genl_info_net(info), ifindex);
- if (!netdev || !netif_device_present(netdev)) {
+ err = 0;
+ netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ if (!netdev) {
err = -ENODEV;
- goto err_unlock;
+ goto err_unlock_sock;
}
-
- if (dev_xdp_prog_count(netdev)) {
- NL_SET_ERR_MSG(info->extack, "unable to bind dmabuf to device with XDP program attached");
- err = -EEXIST;
+ if (!netif_device_present(netdev))
+ err = -ENODEV;
+ else if (!netdev_need_ops_lock(netdev))
+ err = -EOPNOTSUPP;
+ if (err) {
+ NL_SET_BAD_ATTR(info->extack,
+ info->attrs[NETDEV_A_DEV_IFINDEX]);
goto err_unlock;
}
- binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack);
+ binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd,
+ priv, info->extack);
if (IS_ERR(binding)) {
err = PTR_ERR(binding);
goto err_unlock;
@@ -888,8 +943,6 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
goto err_unbind;
}
- list_add(&binding->list, sock_binding_list);
-
nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id);
genlmsg_end(rsp, hdr);
@@ -897,34 +950,129 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
if (err)
goto err_unbind;
- rtnl_unlock();
+ netdev_unlock(netdev);
+
+ mutex_unlock(&priv->lock);
return 0;
err_unbind:
net_devmem_unbind_dmabuf(binding);
err_unlock:
- rtnl_unlock();
+ netdev_unlock(netdev);
+err_unlock_sock:
+ mutex_unlock(&priv->lock);
+err_genlmsg_free:
+ nlmsg_free(rsp);
+ return err;
+}
+
+int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ struct netdev_nl_sock *priv;
+ struct net_device *netdev;
+ u32 ifindex, dmabuf_fd;
+ struct sk_buff *rsp;
+ int err = 0;
+ void *hdr;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD))
+ return -EINVAL;
+
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
+ dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);
+
+ priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk);
+ if (IS_ERR(priv))
+ return PTR_ERR(priv);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto err_genlmsg_free;
+ }
+
+ mutex_lock(&priv->lock);
+
+ netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ if (!netdev) {
+ err = -ENODEV;
+ goto err_unlock_sock;
+ }
+
+ if (!netif_device_present(netdev)) {
+ err = -ENODEV;
+ goto err_unlock_netdev;
+ }
+
+ if (!netdev->netmem_tx) {
+ err = -EOPNOTSUPP;
+ NL_SET_ERR_MSG(info->extack,
+ "Driver does not support netmem TX");
+ goto err_unlock_netdev;
+ }
+
+ binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, priv,
+ info->extack);
+ if (IS_ERR(binding)) {
+ err = PTR_ERR(binding);
+ goto err_unlock_netdev;
+ }
+
+ nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id);
+ genlmsg_end(rsp, hdr);
+
+ netdev_unlock(netdev);
+ mutex_unlock(&priv->lock);
+
+ return genlmsg_reply(rsp, info);
+
+err_unlock_netdev:
+ netdev_unlock(netdev);
+err_unlock_sock:
+ mutex_unlock(&priv->lock);
err_genlmsg_free:
nlmsg_free(rsp);
return err;
}
-void netdev_nl_sock_priv_init(struct list_head *priv)
+void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv)
{
- INIT_LIST_HEAD(priv);
+ INIT_LIST_HEAD(&priv->bindings);
+ mutex_init(&priv->lock);
}
-void netdev_nl_sock_priv_destroy(struct list_head *priv)
+void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv)
{
struct net_devmem_dmabuf_binding *binding;
struct net_devmem_dmabuf_binding *temp;
+ netdevice_tracker dev_tracker;
+ struct net_device *dev;
+
+ mutex_lock(&priv->lock);
+ list_for_each_entry_safe(binding, temp, &priv->bindings, list) {
+ mutex_lock(&binding->lock);
+ dev = binding->dev;
+ if (!dev) {
+ mutex_unlock(&binding->lock);
+ net_devmem_unbind_dmabuf(binding);
+ continue;
+ }
+ netdev_hold(dev, &dev_tracker, GFP_KERNEL);
+ mutex_unlock(&binding->lock);
- list_for_each_entry_safe(binding, temp, priv, list) {
- rtnl_lock();
+ netdev_lock(dev);
net_devmem_unbind_dmabuf(binding);
- rtnl_unlock();
+ netdev_unlock(dev);
+ netdev_put(dev, &dev_tracker);
}
+ mutex_unlock(&priv->lock);
}
static int netdev_genl_netdevice_event(struct notifier_block *nb,
@@ -934,10 +1082,14 @@ static int netdev_genl_netdevice_event(struct notifier_block *nb,
switch (event) {
case NETDEV_REGISTER:
+ netdev_lock_ops_to_full(netdev);
netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF);
+ netdev_unlock_full_to_ops(netdev);
break;
case NETDEV_UNREGISTER:
+ netdev_lock(netdev);
netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF);
+ netdev_unlock(netdev);
break;
case NETDEV_XDP_FEAT_CHANGE:
netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF);
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
index e217a5838c87..d126f10197bf 100644
--- a/net/core/netdev_rx_queue.c
+++ b/net/core/netdev_rx_queue.c
@@ -1,36 +1,38 @@
// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/ethtool_netlink.h>
#include <linux/netdevice.h>
+#include <net/netdev_lock.h>
#include <net/netdev_queues.h>
#include <net/netdev_rx_queue.h>
+#include <net/page_pool/memory_provider.h>
#include "page_pool_priv.h"
int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
{
struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx);
+ const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops;
void *new_mem, *old_mem;
int err;
- if (!dev->queue_mgmt_ops || !dev->queue_mgmt_ops->ndo_queue_stop ||
- !dev->queue_mgmt_ops->ndo_queue_mem_free ||
- !dev->queue_mgmt_ops->ndo_queue_mem_alloc ||
- !dev->queue_mgmt_ops->ndo_queue_start)
+ if (!qops || !qops->ndo_queue_stop || !qops->ndo_queue_mem_free ||
+ !qops->ndo_queue_mem_alloc || !qops->ndo_queue_start)
return -EOPNOTSUPP;
- ASSERT_RTNL();
+ netdev_assert_locked(dev);
- new_mem = kvzalloc(dev->queue_mgmt_ops->ndo_queue_mem_size, GFP_KERNEL);
+ new_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL);
if (!new_mem)
return -ENOMEM;
- old_mem = kvzalloc(dev->queue_mgmt_ops->ndo_queue_mem_size, GFP_KERNEL);
+ old_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL);
if (!old_mem) {
err = -ENOMEM;
goto err_free_new_mem;
}
- err = dev->queue_mgmt_ops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx);
+ err = qops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx);
if (err)
goto err_free_old_mem;
@@ -38,15 +40,19 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
if (err)
goto err_free_new_queue_mem;
- err = dev->queue_mgmt_ops->ndo_queue_stop(dev, old_mem, rxq_idx);
- if (err)
- goto err_free_new_queue_mem;
+ if (netif_running(dev)) {
+ err = qops->ndo_queue_stop(dev, old_mem, rxq_idx);
+ if (err)
+ goto err_free_new_queue_mem;
- err = dev->queue_mgmt_ops->ndo_queue_start(dev, new_mem, rxq_idx);
- if (err)
- goto err_start_queue;
+ err = qops->ndo_queue_start(dev, new_mem, rxq_idx);
+ if (err)
+ goto err_start_queue;
+ } else {
+ swap(new_mem, old_mem);
+ }
- dev->queue_mgmt_ops->ndo_queue_mem_free(dev, old_mem);
+ qops->ndo_queue_mem_free(dev, old_mem);
kvfree(old_mem);
kvfree(new_mem);
@@ -61,15 +67,15 @@ err_start_queue:
* WARN if we fail to recover the old rx queue, and at least free
* old_mem so we don't also leak that.
*/
- if (dev->queue_mgmt_ops->ndo_queue_start(dev, old_mem, rxq_idx)) {
+ if (qops->ndo_queue_start(dev, old_mem, rxq_idx)) {
WARN(1,
"Failed to restart old queue in error path. RX queue %d may be unhealthy.",
rxq_idx);
- dev->queue_mgmt_ops->ndo_queue_mem_free(dev, old_mem);
+ qops->ndo_queue_mem_free(dev, old_mem);
}
err_free_new_queue_mem:
- dev->queue_mgmt_ops->ndo_queue_mem_free(dev, new_mem);
+ qops->ndo_queue_mem_free(dev, new_mem);
err_free_old_mem:
kvfree(old_mem);
@@ -79,3 +85,103 @@ err_free_new_mem:
return err;
}
+EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL");
+
+/* Install memory provider @p on Rx queue @rxq_idx; caller holds the netdev lock. */
+int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+ const struct pp_memory_provider_params *p,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_rx_queue *rxq;
+ int ret;
+
+ if (!netdev_need_ops_lock(dev))
+ return -EOPNOTSUPP;
+
+ /* Validate first: array_index_nospec() only guards against speculation */
+ if (rxq_idx >= dev->real_num_rx_queues) {
+ NL_SET_ERR_MSG(extack, "rx queue index out of range");
+ return -EINVAL;
+ }
+ rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues);
+
+ if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) {
+ NL_SET_ERR_MSG(extack, "tcp-data-split is disabled");
+ return -EINVAL;
+ }
+ if (dev->cfg->hds_thresh) {
+ NL_SET_ERR_MSG(extack, "hds-thresh is not zero");
+ return -EINVAL;
+ }
+ if (dev_xdp_prog_count(dev)) {
+ NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached");
+ return -EEXIST;
+ }
+
+ rxq = __netif_get_rx_queue(dev, rxq_idx);
+ if (rxq->mp_params.mp_ops) {
+ NL_SET_ERR_MSG(extack, "designated queue already memory provider bound");
+ return -EEXIST;
+ }
+#ifdef CONFIG_XDP_SOCKETS
+ if (rxq->pool) {
+ NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP");
+ return -EBUSY;
+ }
+#endif
+
+ rxq->mp_params = *p;
+ ret = netdev_rx_queue_restart(dev, rxq_idx);
+ if (ret) {
+ rxq->mp_params.mp_ops = NULL;
+ rxq->mp_params.mp_priv = NULL;
+ }
+ return ret;
+}
+
+int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+ struct pp_memory_provider_params *p)
+{
+ int ret;
+
+ netdev_lock(dev);
+ ret = __net_mp_open_rxq(dev, rxq_idx, p, NULL);
+ netdev_unlock(dev);
+ return ret;
+}
+
+void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
+ const struct pp_memory_provider_params *old_p)
+{
+ struct netdev_rx_queue *rxq;
+ int err;
+
+ if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues))
+ return;
+
+ rxq = __netif_get_rx_queue(dev, ifq_idx);
+
+ /* Callers holding a netdev ref may get here after we already
+ * went thru shutdown via dev_memory_provider_uninstall().
+ */
+ if (dev->reg_state > NETREG_REGISTERED &&
+ !rxq->mp_params.mp_ops)
+ return;
+
+ if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops ||
+ rxq->mp_params.mp_priv != old_p->mp_priv))
+ return;
+
+ rxq->mp_params.mp_ops = NULL;
+ rxq->mp_params.mp_priv = NULL;
+ err = netdev_rx_queue_restart(dev, ifq_idx);
+ WARN_ON(err && err != -ENETDOWN);
+}
+
+void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx,
+ struct pp_memory_provider_params *old_p)
+{
+ netdev_lock(dev);
+ __net_mp_close_rxq(dev, ifq_idx, old_p);
+ netdev_unlock(dev);
+}
diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h
index 7eadb8393e00..cd95394399b4 100644
--- a/net/core/netmem_priv.h
+++ b/net/core/netmem_priv.h
@@ -5,7 +5,7 @@
static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
{
- return __netmem_clear_lsb(netmem)->pp_magic;
+ return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK;
}
static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
@@ -15,9 +15,16 @@ static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
static inline void netmem_clear_pp_magic(netmem_ref netmem)
{
+ WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK);
+
__netmem_clear_lsb(netmem)->pp_magic = 0;
}
+static inline bool netmem_is_pp(netmem_ref netmem)
+{
+ return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE;
+}
+
static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool)
{
__netmem_clear_lsb(netmem)->pp = pool;
@@ -28,4 +35,28 @@ static inline void netmem_set_dma_addr(netmem_ref netmem,
{
__netmem_clear_lsb(netmem)->dma_addr = dma_addr;
}
+
+static inline unsigned long netmem_get_dma_index(netmem_ref netmem)
+{
+ unsigned long magic;
+
+ if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
+ return 0;
+
+ magic = __netmem_clear_lsb(netmem)->pp_magic;
+
+ return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT;
+}
+
+static inline void netmem_set_dma_index(netmem_ref netmem,
+ unsigned long id)
+{
+ unsigned long magic;
+
+ if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
+ return;
+
+ magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT);
+ __netmem_clear_lsb(netmem)->pp_magic = magic;
+}
#endif
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 96a6ed37d4cc..4ddb7490df4b 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -284,12 +284,13 @@ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
struct sk_buff *skb;
zap_completion_queue();
- refill_skbs(np);
repeat:
skb = alloc_skb(len, GFP_ATOMIC);
- if (!skb)
+ if (!skb) {
skb = skb_dequeue(&np->skb_pool);
+ schedule_work(&np->refill_wq);
+ }
if (!skb) {
if (++count < 10) {
@@ -319,6 +320,7 @@ static int netpoll_owner_active(struct net_device *dev)
static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
{
netdev_tx_t status = NETDEV_TX_BUSY;
+ netdev_tx_t ret = NET_XMIT_DROP;
struct net_device *dev;
unsigned long tries;
/* It is up to the caller to keep npinfo alive. */
@@ -327,11 +329,12 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
lockdep_assert_irqs_disabled();
dev = np->dev;
+ rcu_read_lock();
npinfo = rcu_dereference_bh(dev->npinfo);
if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
dev_kfree_skb_irq(skb);
- return NET_XMIT_DROP;
+ goto out;
}
/* don't get messages out of order, and no recursion */
@@ -370,7 +373,10 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
skb_queue_tail(&npinfo->txq, skb);
schedule_delayed_work(&npinfo->tx_work,0);
}
- return NETDEV_TX_OK;
+ ret = NETDEV_TX_OK;
+out:
+ rcu_read_unlock();
+ return ret;
}
netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
@@ -390,7 +396,7 @@ netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
}
EXPORT_SYMBOL(netpoll_send_skb);
-void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
+int netpoll_send_udp(struct netpoll *np, const char *msg, int len)
{
int total_len, ip_len, udp_len;
struct sk_buff *skb;
@@ -414,7 +420,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
skb = find_skb(np, total_len + np->dev->needed_tailroom,
total_len - len);
if (!skb)
- return;
+ return -ENOMEM;
skb_copy_to_linear_data(skb, msg, len);
skb_put(skb, len);
@@ -427,7 +433,6 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
udph->len = htons(udp_len);
if (np->ipv6) {
- udph->check = 0;
udph->check = csum_ipv6_magic(&np->local_ip.in6,
&np->remote_ip.in6,
udp_len, IPPROTO_UDP,
@@ -490,7 +495,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
skb->dev = np->dev;
- netpoll_send_skb(np, skb);
+ return (int)netpoll_send_skb(np, skb);
}
EXPORT_SYMBOL(netpoll_send_udp);
@@ -501,7 +506,8 @@ void netpoll_print_options(struct netpoll *np)
np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6);
else
np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip);
- np_info(np, "interface '%s'\n", np->dev_name);
+ np_info(np, "interface name '%s'\n", np->dev_name);
+ np_info(np, "local ethernet address '%pM'\n", np->dev_mac);
np_info(np, "remote port %d\n", np->remote_port);
if (np->ipv6)
np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6);
@@ -535,6 +541,7 @@ static void skb_pool_flush(struct netpoll *np)
{
struct sk_buff_head *skb_pool;
+ cancel_work_sync(&np->refill_wq);
skb_pool = &np->skb_pool;
skb_queue_purge_reason(skb_pool, SKB_CONSUMED);
}
@@ -570,11 +577,18 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
cur++;
if (*cur != ',') {
- /* parse out dev name */
+ /* parse out dev_name or dev_mac */
if ((delim = strchr(cur, ',')) == NULL)
goto parse_failed;
*delim = 0;
- strscpy(np->dev_name, cur, sizeof(np->dev_name));
+
+ np->dev_name[0] = '\0';
+ eth_broadcast_addr(np->dev_mac);
+ if (!strchr(cur, ':'))
+ strscpy(np->dev_name, cur, sizeof(np->dev_name));
+ else if (!mac_pton(cur, np->dev_mac))
+ goto parse_failed;
+
cur = delim;
}
cur++;
@@ -621,6 +635,14 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
}
EXPORT_SYMBOL(netpoll_parse_options);
+static void refill_skbs_work_handler(struct work_struct *work)
+{
+ struct netpoll *np =
+ container_of(work, struct netpoll, refill_wq);
+
+ refill_skbs(np);
+}
+
int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
{
struct netpoll_info *npinfo;
@@ -636,7 +658,8 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
goto out;
}
- if (!rcu_access_pointer(ndev->npinfo)) {
+ npinfo = rtnl_dereference(ndev->npinfo);
+ if (!npinfo) {
npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
if (!npinfo) {
err = -ENOMEM;
@@ -656,7 +679,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
goto free_npinfo;
}
} else {
- npinfo = rtnl_dereference(ndev->npinfo);
refcount_inc(&npinfo->refcnt);
}
@@ -666,6 +688,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
/* fill up the skb queue */
refill_skbs(np);
+ INIT_WORK(&np->refill_wq, refill_skbs_work_handler);
/* last thing to do is link it to the net device structure */
rcu_assign_pointer(ndev->npinfo, npinfo);
@@ -679,27 +702,45 @@ out:
}
EXPORT_SYMBOL_GPL(__netpoll_setup);
+/*
+ * Return the identifier used to select np's egress interface: the device
+ * name if set, else np->dev_mac formatted into buf (>= MAC_ADDR_STR_LEN + 1).
+ */
+static char *egress_dev(struct netpoll *np, char *buf)
+{
+ if (np->dev_name[0])
+ return np->dev_name;
+
+ /* size includes the terminating NUL, or the last digit is cut off */
+ snprintf(buf, MAC_ADDR_STR_LEN + 1, "%pM", np->dev_mac);
+ return buf;
+}
+
int netpoll_setup(struct netpoll *np)
{
+ struct net *net = current->nsproxy->net_ns;
+ char buf[MAC_ADDR_STR_LEN + 1];
struct net_device *ndev = NULL;
bool ip_overwritten = false;
struct in_device *in_dev;
int err;
rtnl_lock();
- if (np->dev_name[0]) {
- struct net *net = current->nsproxy->net_ns;
+ if (np->dev_name[0])
ndev = __dev_get_by_name(net, np->dev_name);
- }
+ else if (is_valid_ether_addr(np->dev_mac))
+ ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac);
+
if (!ndev) {
- np_err(np, "%s doesn't exist, aborting\n", np->dev_name);
+ np_err(np, "%s doesn't exist, aborting\n", egress_dev(np, buf));
err = -ENODEV;
goto unlock;
}
netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL);
if (netdev_master_upper_dev_get(ndev)) {
- np_err(np, "%s is a slave device, aborting\n", np->dev_name);
+ np_err(np, "%s is a slave device, aborting\n",
+ egress_dev(np, buf));
err = -EBUSY;
goto put;
}
@@ -707,7 +748,8 @@ int netpoll_setup(struct netpoll *np)
if (!netif_running(ndev)) {
unsigned long atmost;
- np_info(np, "device %s not up yet, forcing it\n", np->dev_name);
+ np_info(np, "device %s not up yet, forcing it\n",
+ egress_dev(np, buf));
err = dev_open(ndev, NULL);
@@ -741,7 +783,7 @@ int netpoll_setup(struct netpoll *np)
if (!ifa) {
put_noaddr:
np_err(np, "no IP address for %s, aborting\n",
- np->dev_name);
+ egress_dev(np, buf));
err = -EDESTADDRREQ;
goto put;
}
@@ -772,13 +814,13 @@ put_noaddr:
}
if (err) {
np_err(np, "no IPv6 address for %s, aborting\n",
- np->dev_name);
+ egress_dev(np, buf));
goto put;
} else
np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6);
#else
np_err(np, "IPv6 is not supported %s, aborting\n",
- np->dev_name);
+ egress_dev(np, buf));
err = -EINVAL;
goto put;
#endif
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index f89cf93f6eb4..ba7cf3e3c32f 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -11,8 +11,10 @@
#include <linux/slab.h>
#include <linux/device.h>
+#include <net/netdev_lock.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
#include <net/xdp.h>
#include <linux/dma-direction.h>
@@ -25,6 +27,7 @@
#include <trace/events/page_pool.h>
+#include "dev.h"
#include "mp_dmabuf_devmem.h"
#include "netmem_priv.h"
#include "page_pool_priv.h"
@@ -150,9 +153,9 @@ u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
EXPORT_SYMBOL(page_pool_ethtool_stats_get);
#else
-#define alloc_stat_inc(pool, __stat)
-#define recycle_stat_inc(pool, __stat)
-#define recycle_stat_add(pool, __stat, val)
+#define alloc_stat_inc(...) do { } while (0)
+#define recycle_stat_inc(...) do { } while (0)
+#define recycle_stat_add(...) do { } while (0)
#endif
static bool page_pool_producer_lock(struct page_pool *pool)
@@ -201,6 +204,7 @@ static int page_pool_init(struct page_pool *pool,
memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
pool->cpuid = cpuid;
+ pool->dma_sync_for_cpu = true;
/* Validate only known flags were used */
if (pool->slow.flags & ~PP_FLAG_ALL)
@@ -272,22 +276,26 @@ static int page_pool_init(struct page_pool *pool,
/* Driver calling page_pool_create() also call page_pool_destroy() */
refcount_set(&pool->user_cnt, 1);
- if (pool->dma_map)
- get_device(pool->p.dev);
+ xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1);
if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
- /* We rely on rtnl_lock()ing to make sure netdev_rx_queue
- * configuration doesn't change while we're initializing
- * the page_pool.
- */
- ASSERT_RTNL();
+ netdev_assert_locked(pool->slow.netdev);
rxq = __netif_get_rx_queue(pool->slow.netdev,
pool->slow.queue_idx);
pool->mp_priv = rxq->mp_params.mp_priv;
+ pool->mp_ops = rxq->mp_params.mp_ops;
}
- if (pool->mp_priv) {
- err = mp_dmabuf_devmem_init(pool);
+ if (pool->mp_ops) {
+ if (!pool->dma_map || !pool->dma_sync)
+ return -EOPNOTSUPP;
+
+ if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) {
+ err = -EFAULT;
+ goto free_ptr_ring;
+ }
+
+ err = pool->mp_ops->init(pool);
if (err) {
pr_warn("%s() mem-provider init failed %d\n", __func__,
err);
@@ -311,9 +319,7 @@ free_ptr_ring:
static void page_pool_uninit(struct page_pool *pool)
{
ptr_ring_cleanup(&pool->ring, NULL);
-
- if (pool->dma_map)
- put_device(pool->p.dev);
+ xa_destroy(&pool->dma_mapped);
#ifdef CONFIG_PAGE_POOL_STATS
if (!pool->system)
@@ -454,13 +460,21 @@ page_pool_dma_sync_for_device(const struct page_pool *pool,
netmem_ref netmem,
u32 dma_sync_size)
{
- if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
- __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
+ if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) {
+ rcu_read_lock();
+ /* re-check under rcu_read_lock() to sync with page_pool_scrub() */
+ if (pool->dma_sync)
+ __page_pool_dma_sync_for_device(pool, netmem,
+ dma_sync_size);
+ rcu_read_unlock();
+ }
}
-static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
+static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp)
{
dma_addr_t dma;
+ int err;
+ u32 id;
/* Setup DMA mapping: use 'struct page' area for storing DMA-addr
* since dma_addr_t can be either 32 or 64 bits and does not always fit
@@ -474,15 +488,30 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem)
if (dma_mapping_error(pool->p.dev, dma))
return false;
- if (page_pool_set_dma_addr_netmem(netmem, dma))
+ if (page_pool_set_dma_addr_netmem(netmem, dma)) {
+ WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
goto unmap_failed;
+ }
+ if (in_softirq())
+ err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem),
+ PP_DMA_INDEX_LIMIT, gfp);
+ else
+ err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem),
+ PP_DMA_INDEX_LIMIT, gfp);
+ if (err) {
+ WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@");
+ goto unset_failed;
+ }
+
+ netmem_set_dma_index(netmem, id);
page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
return true;
+unset_failed:
+ page_pool_set_dma_addr_netmem(netmem, 0);
unmap_failed:
- WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
dma_unmap_page_attrs(pool->p.dev, dma,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
@@ -499,7 +528,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
if (unlikely(!page))
return NULL;
- if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) {
+ if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) {
put_page(page);
return NULL;
}
@@ -532,12 +561,11 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
if (unlikely(pool->alloc.count > 0))
return pool->alloc.cache[--pool->alloc.count];
- /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
+ /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */
memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
- nr_pages = alloc_pages_bulk_array_node(gfp,
- pool->p.nid, bulk,
- (struct page **)pool->alloc.cache);
+ nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk,
+ (struct page **)pool->alloc.cache);
if (unlikely(!nr_pages))
return 0;
@@ -546,7 +574,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
*/
for (i = 0; i < nr_pages; i++) {
netmem = pool->alloc.cache[i];
- if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) {
+ if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) {
put_page(netmem_to_page(netmem));
continue;
}
@@ -574,7 +602,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
/* For using page_pool replace: alloc_pages() API calls, but provide
* synchronization guarantee for allocation side.
*/
-netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp)
+netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp)
{
netmem_ref netmem;
@@ -584,20 +612,20 @@ netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp)
return netmem;
/* Slow-path: cache empty, do real allocation */
- if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
- netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp);
+ if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops)
+ netmem = pool->mp_ops->alloc_netmems(pool, gfp);
else
netmem = __page_pool_alloc_pages_slow(pool, gfp);
return netmem;
}
-EXPORT_SYMBOL(page_pool_alloc_netmem);
+EXPORT_SYMBOL(page_pool_alloc_netmems);
+ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL);
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
- return netmem_to_page(page_pool_alloc_netmem(pool, gfp));
+ return netmem_to_page(page_pool_alloc_netmems(pool, gfp));
}
EXPORT_SYMBOL(page_pool_alloc_pages);
-ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL);
/* Calculate distance between two u32 values, valid if distance is below 2^(31)
* https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
@@ -648,6 +676,8 @@ void page_pool_clear_pp_info(netmem_ref netmem)
static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
netmem_ref netmem)
{
+ struct page *old, *page = netmem_to_page(netmem);
+ unsigned long id;
dma_addr_t dma;
if (!pool->dma_map)
@@ -656,6 +686,17 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
*/
return;
+ id = netmem_get_dma_index(netmem);
+ if (!id)
+ return;
+
+ if (in_softirq())
+ old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0);
+ else
+ old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0);
+ if (old != page)
+ return;
+
dma = page_pool_get_dma_addr_netmem(netmem);
/* When page is unmapped, it cannot be returned to our pool */
@@ -663,6 +704,7 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
page_pool_set_dma_addr_netmem(netmem, 0);
+ netmem_set_dma_index(netmem, 0);
}
/* Disconnects a page (from a page_pool). API users can have a need
@@ -676,8 +718,8 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
bool put;
put = true;
- if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv)
- put = mp_dmabuf_devmem_release_page(pool, netmem);
+ if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops)
+ put = pool->mp_ops->release_netmem(pool, netmem);
else
__page_pool_release_page_dma(pool, netmem);
@@ -699,19 +741,16 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem)
static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
{
- int ret;
- /* BH protection not needed if current is softirq */
- if (in_softirq())
- ret = ptr_ring_produce(&pool->ring, (__force void *)netmem);
- else
- ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem);
+ bool in_softirq, ret;
- if (!ret) {
+ /* BH protection not needed if current is softirq */
+ in_softirq = page_pool_producer_lock(pool);
+ ret = !__ptr_ring_produce(&pool->ring, (__force void *)netmem);
+ if (ret)
recycle_stat_inc(pool, ring);
- return true;
- }
+ page_pool_producer_unlock(pool, in_softirq);
- return false;
+ return ret;
}
/* Only allow direct recycling in special circumstances, into the
@@ -797,6 +836,10 @@ static bool page_pool_napi_local(const struct page_pool *pool)
const struct napi_struct *napi;
u32 cpuid;
+ /* On PREEMPT_RT the softirq can be preempted by the consumer */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return false;
+
if (unlikely(!in_softirq()))
return false;
@@ -821,8 +864,8 @@ void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
if (!allow_direct)
allow_direct = page_pool_napi_local(pool);
- netmem =
- __page_pool_put_page(pool, netmem, dma_sync_size, allow_direct);
+ netmem = __page_pool_put_page(pool, netmem, dma_sync_size,
+ allow_direct);
if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
/* Cache full, fallback to free pages */
recycle_stat_inc(pool, ring_full);
@@ -839,69 +882,104 @@ void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
}
EXPORT_SYMBOL(page_pool_put_unrefed_page);
-/**
- * page_pool_put_page_bulk() - release references on multiple pages
- * @pool: pool from which pages were allocated
- * @data: array holding page pointers
- * @count: number of pages in @data
- *
- * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring
- * producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
- * will release leftover pages to the page allocator.
- * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
- * completion loop for the XDP_REDIRECT use case.
- *
- * Please note the caller must not use data area after running
- * page_pool_put_page_bulk(), as this function overwrites it.
- */
-void page_pool_put_page_bulk(struct page_pool *pool, void **data,
- int count)
+static void page_pool_recycle_ring_bulk(struct page_pool *pool,
+ netmem_ref *bulk,
+ u32 bulk_len)
{
- int i, bulk_len = 0;
- bool allow_direct;
bool in_softirq;
+ u32 i;
- allow_direct = page_pool_napi_local(pool);
-
- for (i = 0; i < count; i++) {
- netmem_ref netmem = page_to_netmem(virt_to_head_page(data[i]));
-
- /* It is not the last user for the page frag case */
- if (!page_pool_is_last_ref(netmem))
- continue;
-
- netmem = __page_pool_put_page(pool, netmem, -1, allow_direct);
- /* Approved for bulk recycling in ptr_ring cache */
- if (netmem)
- data[bulk_len++] = (__force void *)netmem;
- }
-
- if (!bulk_len)
- return;
-
- /* Bulk producer into ptr_ring page_pool cache */
+ /* Bulk produce into ptr_ring page_pool cache */
in_softirq = page_pool_producer_lock(pool);
+
for (i = 0; i < bulk_len; i++) {
- if (__ptr_ring_produce(&pool->ring, data[i])) {
+ if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) {
/* ring full */
recycle_stat_inc(pool, ring_full);
break;
}
}
- recycle_stat_add(pool, ring, i);
+
page_pool_producer_unlock(pool, in_softirq);
+ recycle_stat_add(pool, ring, i);
- /* Hopefully all pages was return into ptr_ring */
+ /* Hopefully all pages were returned into ptr_ring */
if (likely(i == bulk_len))
return;
- /* ptr_ring cache full, free remaining pages outside producer lock
- * since put_page() with refcnt == 1 can be an expensive operation
+ /*
+ * ptr_ring cache is full, free remaining pages outside producer lock
+ * since put_page() with refcnt == 1 can be an expensive operation.
*/
for (; i < bulk_len; i++)
- page_pool_return_page(pool, (__force netmem_ref)data[i]);
+ page_pool_return_page(pool, bulk[i]);
}
-EXPORT_SYMBOL(page_pool_put_page_bulk);
+
+/**
+ * page_pool_put_netmem_bulk() - release references on multiple netmems
+ * @data: array holding netmem references
+ * @count: number of entries in @data
+ *
+ * Tries to refill a number of netmems into the ptr_ring cache holding ptr_ring
+ * producer lock. If the ptr_ring is full, page_pool_put_netmem_bulk()
+ * will release leftover netmems to the memory provider.
+ * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx
+ * completion loop for the XDP_REDIRECT use case.
+ *
+ * Please note the caller must not use data area after running
+ * page_pool_put_netmem_bulk(), as this function overwrites it.
+ */
+void page_pool_put_netmem_bulk(netmem_ref *data, u32 count)
+{
+ u32 bulk_len = 0;
+
+ for (u32 i = 0; i < count; i++) {
+ netmem_ref netmem = netmem_compound_head(data[i]);
+
+ if (page_pool_unref_and_test(netmem))
+ data[bulk_len++] = netmem;
+ }
+
+ count = bulk_len;
+ while (count) {
+ netmem_ref bulk[XDP_BULK_QUEUE_SIZE];
+ struct page_pool *pool = NULL;
+ bool allow_direct;
+ u32 foreign = 0;
+
+ bulk_len = 0;
+
+ for (u32 i = 0; i < count; i++) {
+ struct page_pool *netmem_pp;
+ netmem_ref netmem = data[i];
+
+ netmem_pp = netmem_get_pp(netmem);
+ if (unlikely(!pool)) {
+ pool = netmem_pp;
+ allow_direct = page_pool_napi_local(pool);
+ } else if (netmem_pp != pool) {
+ /*
+ * If the netmem belongs to a different
+ * page_pool, save it for another round.
+ */
+ data[foreign++] = netmem;
+ continue;
+ }
+
+ netmem = __page_pool_put_page(pool, netmem, -1,
+ allow_direct);
+ /* Approved for bulk recycling in ptr_ring cache */
+ if (netmem)
+ bulk[bulk_len++] = netmem;
+ }
+
+ if (bulk_len)
+ page_pool_recycle_ring_bulk(pool, bulk, bulk_len);
+
+ count = foreign;
+ }
+}
+EXPORT_SYMBOL(page_pool_put_netmem_bulk);
static netmem_ref page_pool_drain_frag(struct page_pool *pool,
netmem_ref netmem)
@@ -957,7 +1035,7 @@ netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
}
if (!netmem) {
- netmem = page_pool_alloc_netmem(pool, gfp);
+ netmem = page_pool_alloc_netmems(pool, gfp);
if (unlikely(!netmem)) {
pool->frag_page = 0;
return 0;
@@ -1010,8 +1088,8 @@ static void __page_pool_destroy(struct page_pool *pool)
page_pool_unlist(pool);
page_pool_uninit(pool);
- if (pool->mp_priv) {
- mp_dmabuf_devmem_destroy(pool);
+ if (pool->mp_ops) {
+ pool->mp_ops->destroy(pool);
static_branch_dec(&page_pool_mem_providers);
}
@@ -1037,8 +1115,29 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
static void page_pool_scrub(struct page_pool *pool)
{
+ unsigned long id;
+ void *ptr;
+
page_pool_empty_alloc_cache_once(pool);
- pool->destroy_cnt++;
+ if (!pool->destroy_cnt++ && pool->dma_map) {
+ if (pool->dma_sync) {
+ /* Disable page_pool_dma_sync_for_device() */
+ pool->dma_sync = false;
+
+ /* Make sure all concurrent returns that may see the old
+ * value of dma_sync (and thus perform a sync) have
+ * finished before doing the unmapping below. Skip the
+ * wait if the device doesn't actually need syncing, or
+ * if there are no outstanding mapped pages.
+ */
+ if (dma_dev_need_sync(pool->p.dev) &&
+ !xa_empty(&pool->dma_mapped))
+ synchronize_net();
+ }
+
+ xa_for_each(&pool->dma_mapped, id, ptr)
+ __page_pool_release_page_dma(pool, page_to_netmem(ptr));
+ }
/* No more consumers should exist, but producers could still
* be in-flight.
@@ -1048,10 +1147,14 @@ static void page_pool_scrub(struct page_pool *pool)
static int page_pool_release(struct page_pool *pool)
{
+ bool in_softirq;
int inflight;
page_pool_scrub(pool);
inflight = page_pool_inflight(pool, true);
+ /* Acquire producer lock to make sure producers have exited. */
+ in_softirq = page_pool_producer_lock(pool);
+ page_pool_producer_unlock(pool, in_softirq);
if (!inflight)
__page_pool_destroy(pool);
@@ -1066,7 +1169,13 @@ static void page_pool_release_retry(struct work_struct *wq)
int inflight;
inflight = page_pool_release(pool);
- if (!inflight)
+ /* In rare cases, a driver bug may cause inflight to go negative.
+ * Don't reschedule release if inflight is 0 or negative.
+ * - If 0, the page_pool has been destroyed
+ * - if negative, we will never recover
+ * in both cases no reschedule is necessary.
+ */
+ if (inflight <= 0)
return;
/* Periodic warning for page pools the user can't see */
@@ -1102,13 +1211,11 @@ void page_pool_disable_direct_recycling(struct page_pool *pool)
if (!pool->p.napi)
return;
- /* To avoid races with recycling and additional barriers make sure
- * pool and NAPI are unlinked when NAPI is disabled.
- */
- WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state));
- WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1);
+ napi_assert_will_not_race(pool->p.napi);
+ mutex_lock(&page_pools_lock);
WRITE_ONCE(pool->p.napi, NULL);
+ mutex_unlock(&page_pools_lock);
}
EXPORT_SYMBOL(page_pool_disable_direct_recycling);
@@ -1150,3 +1257,31 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid)
}
}
EXPORT_SYMBOL(page_pool_update_nid);
+
+bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr)
+{
+ return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr);
+}
+
+/* Associate a niov with a page pool. Should follow with a matching
+ * net_mp_niov_clear_page_pool()
+ */
+void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov)
+{
+ netmem_ref netmem = net_iov_to_netmem(niov);
+
+ page_pool_set_pp_info(pool, netmem);
+
+ pool->pages_state_hold_cnt++;
+ trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt);
+}
+
+/* Disassociate a niov from a page pool. Should only be used in the
+ * ->release_netmem() path.
+ */
+void net_mp_niov_clear_page_pool(struct net_iov *niov)
+{
+ netmem_ref netmem = net_iov_to_netmem(niov);
+
+ page_pool_clear_pp_info(netmem);
+}
diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h
index 57439787b9c2..2fb06d5f6d55 100644
--- a/net/core/page_pool_priv.h
+++ b/net/core/page_pool_priv.h
@@ -7,6 +7,8 @@
#include "netmem_priv.h"
+extern struct mutex page_pools_lock;
+
s32 page_pool_inflight(const struct page_pool *pool, bool strict);
int page_pool_list(struct page_pool *pool);
diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c
index 48335766c1bf..c82a95beceff 100644
--- a/net/core/page_pool_user.c
+++ b/net/core/page_pool_user.c
@@ -3,21 +3,23 @@
#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/xarray.h>
+#include <net/busy_poll.h>
#include <net/net_debug.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
#include <net/page_pool/types.h>
+#include <net/page_pool/memory_provider.h>
#include <net/sock.h>
-#include "devmem.h"
#include "page_pool_priv.h"
#include "netdev-genl-gen.h"
static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1);
-/* Protects: page_pools, netdevice->page_pools, pool->slow.netdev, pool->user.
+/* Protects: page_pools, netdevice->page_pools, pool->p.napi, pool->slow.netdev,
+ * pool->user.
* Ordering: inside rtnl_lock
*/
-static DEFINE_MUTEX(page_pools_lock);
+DEFINE_MUTEX(page_pools_lock);
/* Page pools are only reachable from user space (via netlink) if they are
* linked to a netdev at creation time. Following page pool "visibility"
@@ -214,8 +216,8 @@ static int
page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
const struct genl_info *info)
{
- struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
size_t inflight, refsz;
+ unsigned int napi_id;
void *hdr;
hdr = genlmsg_iput(rsp, info);
@@ -229,8 +231,10 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX,
pool->slow.netdev->ifindex))
goto err_cancel;
- if (pool->user.napi_id &&
- nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, pool->user.napi_id))
+
+ napi_id = pool->p.napi ? READ_ONCE(pool->p.napi->napi_id) : 0;
+ if (napi_id_valid(napi_id) &&
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, napi_id))
goto err_cancel;
inflight = page_pool_inflight(pool, false);
@@ -244,7 +248,7 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
pool->user.detach_time))
goto err_cancel;
- if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id))
+ if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL))
goto err_cancel;
genlmsg_end(rsp, hdr);
@@ -319,8 +323,6 @@ int page_pool_list(struct page_pool *pool)
if (pool->slow.netdev) {
hlist_add_head(&pool->user.list,
&pool->slow.netdev->page_pools);
- pool->user.napi_id = pool->p.napi ? pool->p.napi->napi_id : 0;
-
netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF);
}
@@ -353,7 +355,7 @@ void page_pool_unlist(struct page_pool *pool)
int page_pool_check_memory_provider(struct net_device *dev,
struct netdev_rx_queue *rxq)
{
- struct net_devmem_dmabuf_binding *binding = rxq->mp_params.mp_priv;
+ void *binding = rxq->mp_params.mp_priv;
struct page_pool *pool;
struct hlist_node *n;
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 4cb547fae91f..0ebe5461d4d9 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -158,9 +158,7 @@
#include <net/udp.h>
#include <net/ip6_checksum.h>
#include <net/addrconf.h>
-#ifdef CONFIG_XFRM
#include <net/xfrm.h>
-#endif
#include <net/netns/generic.h>
#include <asm/byteorder.h>
#include <linux/rcupdate.h>
@@ -179,7 +177,7 @@
#define MAX_IMIX_ENTRIES 20
#define IMIX_PRECISION 100 /* Precision of IMIX distribution */
-#define func_enter() pr_debug("entering %s\n", __func__);
+#define func_enter() pr_debug("entering %s\n", __func__)
#define PKT_FLAGS \
pf(IPV6) /* Interface in IPV6 Mode */ \
@@ -229,12 +227,12 @@ static char *pkt_flag_names[] = {
/* Xmit modes */
#define M_START_XMIT 0 /* Default normal TX */
-#define M_NETIF_RECEIVE 1 /* Inject packets into stack */
+#define M_NETIF_RECEIVE 1 /* Inject packets into stack */
#define M_QUEUE_XMIT 2 /* Inject packet into qdisc */
/* If lock -- protects updating of if_list */
-#define if_lock(t) mutex_lock(&(t->if_lock));
-#define if_unlock(t) mutex_unlock(&(t->if_lock));
+#define if_lock(t) mutex_lock(&(t->if_lock))
+#define if_unlock(t) mutex_unlock(&(t->if_lock))
/* Used to help with determining the pkts on receive */
#define PKTGEN_MAGIC 0xbe9be955
@@ -285,7 +283,8 @@ struct pktgen_dev {
int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */
int nfrags;
int removal_mark; /* non-zero => the device is marked for
- * removal by worker thread */
+ * removal by worker thread
+ */
struct page *page;
u64 delay; /* nano-seconds */
@@ -348,10 +347,12 @@ struct pktgen_dev {
__u16 udp_dst_max; /* exclusive, dest UDP port */
/* DSCP + ECN */
- __u8 tos; /* six MSB of (former) IPv4 TOS
- are for dscp codepoint */
- __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6
- (see RFC 3260, sec. 4) */
+ __u8 tos; /* six MSB of (former) IPv4 TOS
+ * are for dscp codepoint
+ */
+ __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6
+ * (see RFC 3260, sec. 4)
+ */
/* IMIX */
unsigned int n_imix_entries;
@@ -391,12 +392,12 @@ struct pktgen_dev {
__u8 hh[14];
/* = {
- 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
-
- We fill in SRC address later
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x08, 0x00
- };
+ * 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
+ *
+ * We fill in SRC address later
+ * 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ * 0x08, 0x00
+ * };
*/
__u16 pad; /* pad out the hh struct to an even 16 bytes */
@@ -460,7 +461,8 @@ struct pktgen_thread {
char result[512];
/* Field for thread to receive "posted" events terminate,
- stop ifs etc. */
+ * stop ifs etc.
+ */
u32 control;
int cpu;
@@ -474,8 +476,7 @@ struct pktgen_thread {
#define FIND 0
static const char version[] =
- "Packet Generator for packet performance testing. "
- "Version: " VERSION "\n";
+ "Packet Generator for packet performance testing. Version: " VERSION "\n";
static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i);
static int pktgen_add_device(struct pktgen_thread *t, const char *ifname);
@@ -517,21 +518,23 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
char data[128];
+ size_t max;
struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id);
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- if (count == 0)
+ if (count < 1)
return -EINVAL;
- if (count > sizeof(data))
- count = sizeof(data);
-
- if (copy_from_user(data, buf, count))
+ max = min(count, sizeof(data) - 1);
+ if (copy_from_user(data, buf, max))
return -EFAULT;
- data[count - 1] = 0; /* Strip trailing '\n' and terminate string */
+ if (data[max - 1] == '\n')
+ data[max - 1] = 0; /* strip trailing '\n', terminate string */
+ else
+ data[max] = 0; /* terminate string */
if (!strcmp(data, "stop"))
pktgen_stop_all_threads(pn);
@@ -624,8 +627,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
seq_printf(seq, "%pM\n", pkt_dev->dst_mac);
seq_printf(seq,
- " udp_src_min: %d udp_src_max: %d"
- " udp_dst_min: %d udp_dst_max: %d\n",
+ " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n",
pkt_dev->udp_src_min, pkt_dev->udp_src_max,
pkt_dev->udp_dst_min, pkt_dev->udp_dst_max);
@@ -744,34 +746,37 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
}
-static int hex32_arg(const char __user *user_buffer, unsigned long maxlen,
- __u32 *num)
+static ssize_t hex32_arg(const char __user *user_buffer, size_t maxlen,
+ __u32 *num)
{
- int i = 0;
+ size_t i = 0;
+
*num = 0;
for (; i < maxlen; i++) {
int value;
char c;
- *num <<= 4;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
value = hex_to_bin(c);
- if (value >= 0)
+ if (value >= 0) {
+ *num <<= 4;
*num |= value;
- else
+ } else {
break;
+ }
}
return i;
}
-static int count_trail_chars(const char __user * user_buffer,
- unsigned int maxlen)
+static ssize_t count_trail_chars(const char __user *user_buffer, size_t maxlen)
{
- int i;
+ size_t i;
for (i = 0; i < maxlen; i++) {
char c;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
switch (c) {
@@ -790,14 +795,15 @@ done:
return i;
}
-static long num_arg(const char __user *user_buffer, unsigned long maxlen,
- unsigned long *num)
+static ssize_t num_arg(const char __user *user_buffer, size_t maxlen,
+ unsigned long *num)
{
- int i;
+ size_t i;
*num = 0;
for (i = 0; i < maxlen; i++) {
char c;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
if ((c >= '0') && (c <= '9')) {
@@ -809,12 +815,13 @@ static long num_arg(const char __user *user_buffer, unsigned long maxlen,
return i;
}
-static int strn_len(const char __user * user_buffer, unsigned int maxlen)
+static ssize_t strn_len(const char __user *user_buffer, size_t maxlen)
{
- int i;
+ size_t i;
for (i = 0; i < maxlen; i++) {
char c;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
switch (c) {
@@ -823,6 +830,7 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen)
case '\r':
case '\t':
case ' ':
+ case '=':
goto done_str;
default:
break;
@@ -838,11 +846,11 @@ done_str:
* "size1,weight_1 size2,weight_2 ... size_n,weight_n" for example.
*/
static ssize_t get_imix_entries(const char __user *buffer,
+ size_t maxlen,
struct pktgen_dev *pkt_dev)
{
- const int max_digits = 10;
- int i = 0;
- long len;
+ size_t i = 0, max;
+ ssize_t len;
char c;
pkt_dev->n_imix_entries = 0;
@@ -854,21 +862,30 @@ static ssize_t get_imix_entries(const char __user *buffer,
if (pkt_dev->n_imix_entries >= MAX_IMIX_ENTRIES)
return -E2BIG;
- len = num_arg(&buffer[i], max_digits, &size);
+ if (i >= maxlen)
+ return -EINVAL;
+
+ max = min(10, maxlen - i);
+ len = num_arg(&buffer[i], max, &size);
if (len < 0)
return len;
i += len;
+ if (i >= maxlen)
+ return -EINVAL;
if (get_user(c, &buffer[i]))
return -EFAULT;
/* Check for comma between size_i and weight_i */
if (c != ',')
return -EINVAL;
i++;
+ if (i >= maxlen)
+ return -EINVAL;
if (size < 14 + 20 + 8)
size = 14 + 20 + 8;
- len = num_arg(&buffer[i], max_digits, &weight);
+ max = min(10, maxlen - i);
+ len = num_arg(&buffer[i], max, &weight);
if (len < 0)
return len;
if (weight <= 0)
@@ -878,39 +895,55 @@ static ssize_t get_imix_entries(const char __user *buffer,
pkt_dev->imix_entries[pkt_dev->n_imix_entries].weight = weight;
i += len;
+ pkt_dev->n_imix_entries++;
+
+ if (i >= maxlen)
+ break;
if (get_user(c, &buffer[i]))
return -EFAULT;
-
i++;
- pkt_dev->n_imix_entries++;
} while (c == ' ');
return i;
}
-static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev)
+static ssize_t get_labels(const char __user *buffer,
+ size_t maxlen, struct pktgen_dev *pkt_dev)
{
unsigned int n = 0;
+ size_t i = 0, max;
+ ssize_t len;
char c;
- ssize_t i = 0;
- int len;
pkt_dev->nr_labels = 0;
do {
__u32 tmp;
- len = hex32_arg(&buffer[i], 8, &tmp);
- if (len <= 0)
+
+ if (n >= MAX_MPLS_LABELS)
+ return -E2BIG;
+
+ if (i >= maxlen)
+ return -EINVAL;
+
+ max = min(8, maxlen - i);
+ len = hex32_arg(&buffer[i], max, &tmp);
+ if (len < 0)
return len;
+
+ /* return empty list in case of invalid input or zero value */
+ if (len == 0 || tmp == 0)
+ return maxlen;
+
pkt_dev->labels[n] = htonl(tmp);
if (pkt_dev->labels[n] & MPLS_STACK_BOTTOM)
pkt_dev->flags |= F_MPLS_RND;
i += len;
+ n++;
+ if (i >= maxlen)
+ break;
if (get_user(c, &buffer[i]))
return -EFAULT;
i++;
- n++;
- if (n >= MAX_MPLS_LABELS)
- return -E2BIG;
} while (c == ',');
pkt_dev->nr_labels = n;
@@ -947,16 +980,16 @@ static __u32 pktgen_read_flag(const char *f, bool *disable)
}
static ssize_t pktgen_if_write(struct file *file,
- const char __user * user_buffer, size_t count,
- loff_t * offset)
+ const char __user *user_buffer, size_t count,
+ loff_t *offset)
{
struct seq_file *seq = file->private_data;
struct pktgen_dev *pkt_dev = seq->private;
- int i, max, len;
+ size_t i, max;
+ ssize_t len;
char name[16], valstr[32];
unsigned long value = 0;
char *pg_result = NULL;
- int tmp = 0;
char buf[128];
pg_result = &(pkt_dev->result[0]);
@@ -967,16 +1000,16 @@ static ssize_t pktgen_if_write(struct file *file,
}
max = count;
- tmp = count_trail_chars(user_buffer, max);
- if (tmp < 0) {
+ len = count_trail_chars(user_buffer, max);
+ if (len < 0) {
pr_warn("illegal format\n");
- return tmp;
+ return len;
}
- i = tmp;
+ i = len;
/* Read variable name */
-
- len = strn_len(&user_buffer[i], sizeof(name) - 1);
+ max = min(sizeof(name) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1004,11 +1037,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "min_pkt_size")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value < 14 + 20 + 8)
value = 14 + 20 + 8;
if (value != pkt_dev->min_pkt_size) {
@@ -1021,11 +1054,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "max_pkt_size")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value < 14 + 20 + 8)
value = 14 + 20 + 8;
if (value != pkt_dev->max_pkt_size) {
@@ -1040,11 +1073,11 @@ static ssize_t pktgen_if_write(struct file *file,
/* Shortcut for min = max */
if (!strcmp(name, "pkt_size")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value < 14 + 20 + 8)
value = 14 + 20 + 8;
if (value != pkt_dev->min_pkt_size) {
@@ -1060,43 +1093,43 @@ static ssize_t pktgen_if_write(struct file *file,
if (pkt_dev->clone_skb > 0)
return -EINVAL;
- len = get_imix_entries(&user_buffer[i], pkt_dev);
+ max = count - i;
+ len = get_imix_entries(&user_buffer[i], max, pkt_dev);
if (len < 0)
return len;
fill_imix_distribution(pkt_dev);
- i += len;
return count;
}
if (!strcmp(name, "debug")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
debug = value;
sprintf(pg_result, "OK: debug=%u", debug);
return count;
}
if (!strcmp(name, "frags")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->nfrags = value;
sprintf(pg_result, "OK: frags=%d", pkt_dev->nfrags);
return count;
}
if (!strcmp(name, "delay")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value == 0x7FFFFFFF)
pkt_dev->delay = ULLONG_MAX;
else
@@ -1107,13 +1140,13 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "rate")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (!value)
- return len;
+ return -EINVAL;
pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value;
if (debug)
pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
@@ -1122,13 +1155,13 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "ratep")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (!value)
- return len;
+ return -EINVAL;
pkt_dev->delay = NSEC_PER_SEC/value;
if (debug)
pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
@@ -1137,11 +1170,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "udp_src_min")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value != pkt_dev->udp_src_min) {
pkt_dev->udp_src_min = value;
pkt_dev->cur_udp_src = value;
@@ -1150,11 +1183,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "udp_dst_min")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value != pkt_dev->udp_dst_min) {
pkt_dev->udp_dst_min = value;
pkt_dev->cur_udp_dst = value;
@@ -1163,11 +1196,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "udp_src_max")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value != pkt_dev->udp_src_max) {
pkt_dev->udp_src_max = value;
pkt_dev->cur_udp_src = value;
@@ -1176,11 +1209,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "udp_dst_max")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value != pkt_dev->udp_dst_max) {
pkt_dev->udp_dst_max = value;
pkt_dev->cur_udp_dst = value;
@@ -1189,7 +1222,8 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "clone_skb")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
/* clone_skb is not supported for netif_receive xmit_mode and
@@ -1198,34 +1232,33 @@ static ssize_t pktgen_if_write(struct file *file,
if ((value > 0) &&
((pkt_dev->xmit_mode == M_NETIF_RECEIVE) ||
!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
if (value > 0 && (pkt_dev->n_imix_entries > 0 ||
!(pkt_dev->flags & F_SHARED)))
return -EINVAL;
- i += len;
pkt_dev->clone_skb = value;
sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb);
return count;
}
if (!strcmp(name, "count")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->count = value;
sprintf(pg_result, "OK: count=%llu",
(unsigned long long)pkt_dev->count);
return count;
}
if (!strcmp(name, "src_mac_count")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (pkt_dev->src_mac_count != value) {
pkt_dev->src_mac_count = value;
pkt_dev->cur_src_mac_offset = 0;
@@ -1235,11 +1268,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "dst_mac_count")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (pkt_dev->dst_mac_count != value) {
pkt_dev->dst_mac_count = value;
pkt_dev->cur_dst_mac_offset = 0;
@@ -1249,16 +1282,16 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "burst")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value > 1) &&
((pkt_dev->xmit_mode == M_QUEUE_XMIT) ||
((pkt_dev->xmit_mode == M_START_XMIT) &&
(!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
if (value > 1 && !(pkt_dev->flags & F_SHARED))
return -EINVAL;
@@ -1268,12 +1301,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "node")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
-
if (node_possible(value)) {
pkt_dev->node = value;
sprintf(pg_result, "OK: node=%d", pkt_dev->node);
@@ -1281,29 +1313,29 @@ static ssize_t pktgen_if_write(struct file *file,
put_page(pkt_dev->page);
pkt_dev->page = NULL;
}
- }
- else
+ } else {
sprintf(pg_result, "ERROR: node not possible");
+ }
return count;
}
if (!strcmp(name, "xmit_mode")) {
char f[32];
- memset(f, 0, 32);
- len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ max = min(sizeof(f) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
+ memset(f, 0, sizeof(f));
if (copy_from_user(f, &user_buffer[i], len))
return -EFAULT;
- i += len;
if (strcmp(f, "start_xmit") == 0) {
pkt_dev->xmit_mode = M_START_XMIT;
} else if (strcmp(f, "netif_receive") == 0) {
/* clone_skb set earlier, not supported in this mode */
if (pkt_dev->clone_skb > 0)
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
pkt_dev->xmit_mode = M_NETIF_RECEIVE;
@@ -1329,14 +1361,14 @@ static ssize_t pktgen_if_write(struct file *file,
char f[32];
char *end;
- memset(f, 0, 32);
- len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ max = min(sizeof(f) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
+ /* Zero all of f[]; sized from the array, as the xmit_mode branch does */
+ memset(f, 0, sizeof(f));
if (copy_from_user(f, &user_buffer[i], len))
return -EFAULT;
- i += len;
flag = pktgen_read_flag(f, &disable);
if (flag) {
@@ -1378,7 +1410,8 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) {
- len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1);
+ max = min(sizeof(pkt_dev->dst_min) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1386,19 +1419,19 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->dst_min) != 0) {
- memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min));
- strcpy(pkt_dev->dst_min, buf);
+ strscpy_pad(pkt_dev->dst_min, buf);
pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
pkt_dev->cur_daddr = pkt_dev->daddr_min;
}
if (debug)
pr_debug("dst_min set to: %s\n", pkt_dev->dst_min);
- i += len;
+
sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min);
return count;
}
if (!strcmp(name, "dst_max")) {
- len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1);
+ max = min(sizeof(pkt_dev->dst_max) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1406,19 +1439,19 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->dst_max) != 0) {
- memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max));
- strcpy(pkt_dev->dst_max, buf);
+ strscpy_pad(pkt_dev->dst_max, buf);
pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
pkt_dev->cur_daddr = pkt_dev->daddr_max;
}
if (debug)
pr_debug("dst_max set to: %s\n", pkt_dev->dst_max);
- i += len;
+
sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max);
return count;
}
if (!strcmp(name, "dst6")) {
- len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ max = min(sizeof(buf) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1436,12 +1469,12 @@ static ssize_t pktgen_if_write(struct file *file,
if (debug)
pr_debug("dst6 set to: %s\n", buf);
- i += len;
sprintf(pg_result, "OK: dst6=%s", buf);
return count;
}
if (!strcmp(name, "dst6_min")) {
- len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ max = min(sizeof(buf) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1458,12 +1491,12 @@ static ssize_t pktgen_if_write(struct file *file,
if (debug)
pr_debug("dst6_min set to: %s\n", buf);
- i += len;
sprintf(pg_result, "OK: dst6_min=%s", buf);
return count;
}
if (!strcmp(name, "dst6_max")) {
- len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ max = min(sizeof(buf) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1479,12 +1512,12 @@ static ssize_t pktgen_if_write(struct file *file,
if (debug)
pr_debug("dst6_max set to: %s\n", buf);
- i += len;
sprintf(pg_result, "OK: dst6_max=%s", buf);
return count;
}
if (!strcmp(name, "src6")) {
- len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ max = min(sizeof(buf) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1502,12 +1535,12 @@ static ssize_t pktgen_if_write(struct file *file,
if (debug)
pr_debug("src6 set to: %s\n", buf);
- i += len;
sprintf(pg_result, "OK: src6=%s", buf);
return count;
}
if (!strcmp(name, "src_min")) {
- len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1);
+ max = min(sizeof(pkt_dev->src_min) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1515,19 +1548,19 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->src_min) != 0) {
- memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min));
- strcpy(pkt_dev->src_min, buf);
+ strscpy_pad(pkt_dev->src_min, buf);
pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
pkt_dev->cur_saddr = pkt_dev->saddr_min;
}
if (debug)
pr_debug("src_min set to: %s\n", pkt_dev->src_min);
- i += len;
+
sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min);
return count;
}
if (!strcmp(name, "src_max")) {
- len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1);
+ max = min(sizeof(pkt_dev->src_max) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1535,19 +1568,19 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->src_max) != 0) {
- memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max));
- strcpy(pkt_dev->src_max, buf);
+ strscpy_pad(pkt_dev->src_max, buf);
pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
pkt_dev->cur_saddr = pkt_dev->saddr_max;
}
if (debug)
pr_debug("src_max set to: %s\n", pkt_dev->src_max);
- i += len;
+
sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max);
return count;
}
if (!strcmp(name, "dst_mac")) {
- len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
+ max = min(sizeof(valstr) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1564,7 +1597,8 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "src_mac")) {
- len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
+ max = min(sizeof(valstr) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1588,11 +1622,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "flows")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value > MAX_CFLOWS)
value = MAX_CFLOWS;
@@ -1602,44 +1636,44 @@ static ssize_t pktgen_if_write(struct file *file,
}
#ifdef CONFIG_XFRM
if (!strcmp(name, "spi")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->spi = value;
sprintf(pg_result, "OK: spi=%u", pkt_dev->spi);
return count;
}
#endif
if (!strcmp(name, "flowlen")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->lflow = value;
sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow);
return count;
}
if (!strcmp(name, "queue_map_min")) {
- len = num_arg(&user_buffer[i], 5, &value);
+ max = min(5, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->queue_map_min = value;
sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min);
return count;
}
if (!strcmp(name, "queue_map_max")) {
- len = num_arg(&user_buffer[i], 5, &value);
+ max = min(5, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->queue_map_max = value;
sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max);
return count;
@@ -1648,10 +1682,11 @@ static ssize_t pktgen_if_write(struct file *file,
if (!strcmp(name, "mpls")) {
unsigned int n, cnt;
- len = get_labels(&user_buffer[i], pkt_dev);
+ max = count - i;
+ len = get_labels(&user_buffer[i], max, pkt_dev);
if (len < 0)
return len;
- i += len;
+
cnt = sprintf(pg_result, "OK: mpls=");
for (n = 0; n < pkt_dev->nr_labels; n++)
cnt += sprintf(pg_result + cnt,
@@ -1669,11 +1704,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "vlan_id")) {
- len = num_arg(&user_buffer[i], 4, &value);
+ max = min(4, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value <= 4095) {
pkt_dev->vlan_id = value; /* turn on VLAN */
@@ -1696,11 +1731,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "vlan_p")) {
- len = num_arg(&user_buffer[i], 1, &value);
+ max = min(1, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) {
pkt_dev->vlan_p = value;
sprintf(pg_result, "OK: vlan_p=%u", pkt_dev->vlan_p);
@@ -1711,11 +1746,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "vlan_cfi")) {
- len = num_arg(&user_buffer[i], 1, &value);
+ max = min(1, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) {
pkt_dev->vlan_cfi = value;
sprintf(pg_result, "OK: vlan_cfi=%u", pkt_dev->vlan_cfi);
@@ -1726,11 +1761,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "svlan_id")) {
- len = num_arg(&user_buffer[i], 4, &value);
+ max = min(4, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) {
pkt_dev->svlan_id = value; /* turn on SVLAN */
@@ -1753,11 +1788,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "svlan_p")) {
- len = num_arg(&user_buffer[i], 1, &value);
+ max = min(1, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) {
pkt_dev->svlan_p = value;
sprintf(pg_result, "OK: svlan_p=%u", pkt_dev->svlan_p);
@@ -1768,11 +1803,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "svlan_cfi")) {
- len = num_arg(&user_buffer[i], 1, &value);
+ max = min(1, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) {
pkt_dev->svlan_cfi = value;
sprintf(pg_result, "OK: svlan_cfi=%u", pkt_dev->svlan_cfi);
@@ -1783,12 +1818,13 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "tos")) {
- __u32 tmp_value = 0;
- len = hex32_arg(&user_buffer[i], 2, &tmp_value);
+ __u32 tmp_value;
+
+ max = min(2, count - i);
+ len = hex32_arg(&user_buffer[i], max, &tmp_value);
if (len < 0)
return len;
- i += len;
if (len == 2) {
pkt_dev->tos = tmp_value;
sprintf(pg_result, "OK: tos=0x%02x", pkt_dev->tos);
@@ -1799,12 +1835,13 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "traffic_class")) {
- __u32 tmp_value = 0;
- len = hex32_arg(&user_buffer[i], 2, &tmp_value);
+ __u32 tmp_value;
+
+ max = min(2, count - i);
+ len = hex32_arg(&user_buffer[i], max, &tmp_value);
if (len < 0)
return len;
- i += len;
if (len == 2) {
pkt_dev->traffic_class = tmp_value;
sprintf(pg_result, "OK: traffic_class=0x%02x", pkt_dev->traffic_class);
@@ -1815,11 +1852,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "skb_priority")) {
- len = num_arg(&user_buffer[i], 9, &value);
+ max = min(9, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->skb_priority = value;
sprintf(pg_result, "OK: skb_priority=%i",
pkt_dev->skb_priority);
@@ -1874,12 +1911,13 @@ static int pktgen_thread_show(struct seq_file *seq, void *v)
}
static ssize_t pktgen_thread_write(struct file *file,
- const char __user * user_buffer,
- size_t count, loff_t * offset)
+ const char __user *user_buffer,
+ size_t count, loff_t *offset)
{
struct seq_file *seq = file->private_data;
struct pktgen_thread *t = seq->private;
- int i, max, len, ret;
+ size_t i, max;
+ ssize_t len, ret;
char name[40];
char *pg_result;
@@ -1896,8 +1934,8 @@ static ssize_t pktgen_thread_write(struct file *file,
i = len;
/* Read variable name */
-
- len = strn_len(&user_buffer[i], sizeof(name) - 1);
+ max = min(sizeof(name) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1926,15 +1964,17 @@ static ssize_t pktgen_thread_write(struct file *file,
if (!strcmp(name, "add_device")) {
char f[32];
+
- memset(f, 0, 32);
+ /* Size the clear from the array itself instead of repeating the literal 32 */
+ memset(f, 0, sizeof(f));
- len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ max = min(sizeof(f) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0) {
ret = len;
goto out;
}
if (copy_from_user(f, &user_buffer[i], len))
return -EFAULT;
- i += len;
+
mutex_lock(&pktgen_thread_lock);
ret = pktgen_add_device(t, f);
mutex_unlock(&pktgen_thread_lock);
@@ -2358,15 +2398,16 @@ static inline int f_pick(struct pktgen_dev *pkt_dev)
}
-#ifdef CONFIG_XFRM
/* If there was already an IPSEC SA, we keep it as is, else
* we go look for it ...
-*/
+ */
#define DUMMY_MARK 0
static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
{
+#ifdef CONFIG_XFRM
struct xfrm_state *x = pkt_dev->flows[flow].x;
struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id);
+
if (!x) {
if (pkt_dev->spi) {
@@ -2390,16 +2431,16 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
}
}
-}
#endif
+}
static void set_cur_queue_map(struct pktgen_dev *pkt_dev)
{
-
if (pkt_dev->flags & F_QUEUE_MAP_CPU)
pkt_dev->cur_queue_map = smp_processor_id();
else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) {
__u16 t;
+
if (pkt_dev->flags & F_QUEUE_MAP_RND) {
t = get_random_u32_inclusive(pkt_dev->queue_map_min,
pkt_dev->queue_map_max);
@@ -2481,6 +2522,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->flags & F_MPLS_RND) {
unsigned int i;
+
for (i = 0; i < pkt_dev->nr_labels; i++)
if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM)
pkt_dev->labels[i] = MPLS_STACK_BOTTOM |
@@ -2525,6 +2567,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
imx = ntohl(pkt_dev->saddr_max);
if (imn < imx) {
__u32 t;
+
if (pkt_dev->flags & F_IPSRC_RND)
t = get_random_u32_inclusive(imn, imx - 1);
else {
@@ -2545,6 +2588,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (imn < imx) {
__u32 t;
__be32 s;
+
if (pkt_dev->flags & F_IPDST_RND) {
do {
@@ -2569,10 +2613,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
pkt_dev->flows[flow].flags |= F_INIT;
pkt_dev->flows[flow].cur_daddr =
pkt_dev->cur_daddr;
-#ifdef CONFIG_XFRM
if (pkt_dev->flags & F_IPSEC)
get_ipsec_sa(pkt_dev, flow);
-#endif
pkt_dev->nflows++;
}
}
@@ -2594,6 +2636,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) {
__u32 t;
+
if (pkt_dev->flags & F_TXSIZE_RND) {
t = get_random_u32_inclusive(pkt_dev->min_pkt_size,
pkt_dev->max_pkt_size - 1);
@@ -2660,7 +2703,8 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
if (!x)
return 0;
/* XXX: we dont support tunnel mode for now until
- * we resolve the dst issue */
+ * we resolve the dst issue
+ */
if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0))
return 0;
@@ -2695,8 +2739,10 @@ static void free_SAs(struct pktgen_dev *pkt_dev)
if (pkt_dev->cflows) {
/* let go of the SAs if we have them */
int i;
+
for (i = 0; i < pkt_dev->cflows; i++) {
struct xfrm_state *x = pkt_dev->flows[i].x;
+
if (x) {
xfrm_state_put(x);
pkt_dev->flows[i].x = NULL;
@@ -2711,6 +2757,7 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,
if (pkt_dev->flags & F_IPSEC) {
struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
int nhead = 0;
+
if (x) {
struct ethhdr *eth;
struct iphdr *iph;
@@ -2754,6 +2801,7 @@ err:
static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev)
{
unsigned int i;
+
for (i = 0; i < pkt_dev->nr_labels; i++)
*mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM;
@@ -2866,7 +2914,7 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
skb->dev = dev;
}
} else {
- skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
+ skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
}
/* the caller pre-fetches from skb->data and reserves for the mac hdr */
@@ -2947,7 +2995,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
skb->priority = pkt_dev->skb_priority;
memcpy(eth, pkt_dev->hh, 12);
- *(__be16 *) & eth[12] = protocol;
+ *(__be16 *)&eth[12] = protocol;
/* Eth + IPh + UDPh + mpls */
datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 -
@@ -3176,11 +3224,11 @@ static void pktgen_run(struct pktgen_thread *t)
set_pkt_overhead(pkt_dev);
- strcpy(pkt_dev->result, "Starting");
+ strscpy(pkt_dev->result, "Starting");
pkt_dev->running = 1; /* Cranke yeself! */
started++;
} else
- strcpy(pkt_dev->result, "Error starting");
+ strscpy(pkt_dev->result, "Error starting");
}
rcu_read_unlock();
if (started)
@@ -3439,6 +3487,7 @@ static void pktgen_rem_thread(struct pktgen_thread *t)
+/* Call schedule() and add the time spent there, measured with ktime_get()
+ * before and after the call, to pkt_dev->idle_acc in nanoseconds.
+ */
static void pktgen_resched(struct pktgen_dev *pkt_dev)
{
ktime_t idle_start = ktime_get();
+
schedule();
pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));
}
@@ -3754,7 +3803,8 @@ static int add_dev_to_thread(struct pktgen_thread *t,
* userspace on another CPU than the kthread. The if_lock()
* is used here to sync with concurrent instances of
* _rem_dev_from_if_list() invoked via kthread, which is also
- * updating the if_list */
+ * updating the if_list
+ */
if_lock(t);
if (pkt_dev->pg_thread) {
@@ -3792,7 +3842,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
if (!pkt_dev)
return -ENOMEM;
- strcpy(pkt_dev->odevname, ifname);
+ strscpy(pkt_dev->odevname, ifname);
pkt_dev->flows = vzalloc_node(array_size(MAX_CFLOWS,
sizeof(struct flow_state)),
node);
@@ -3883,17 +3933,14 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
list_add_tail(&t->th_list, &pn->pktgen_threads);
init_completion(&t->start_done);
- p = kthread_create_on_node(pktgen_thread_worker,
- t,
- cpu_to_node(cpu),
- "kpktgend_%d", cpu);
+ p = kthread_create_on_cpu(pktgen_thread_worker, t, cpu, "kpktgend_%d");
if (IS_ERR(p)) {
- pr_err("kthread_create_on_node() failed for cpu %d\n", t->cpu);
+ /* Name the helper that is actually called above */
+ pr_err("kthread_create_on_cpu() failed for cpu %d\n", t->cpu);
list_del(&t->th_list);
kfree(t);
return PTR_ERR(p);
}
- kthread_bind(p, cpu);
+
t->tsk = p;
pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir,
@@ -3952,7 +3999,8 @@ static int pktgen_remove_device(struct pktgen_thread *t,
/* Remove proc before if_list entry, because add_device uses
* list to determine if interface already exist, avoid race
- * with proc_create_data() */
+ * with proc_create_data()
+ */
proc_remove(pkt_dev->entry);
/* And update the thread if_list */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d9f959c619d9..c57692eb8da9 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -53,6 +53,7 @@
#include <net/fib_rules.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
+#include <net/netdev_lock.h>
#include <net/devlink.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/addrconf.h>
@@ -80,11 +81,15 @@ void rtnl_lock(void)
}
EXPORT_SYMBOL(rtnl_lock);
+/* Take rtnl_mutex with mutex_lock_interruptible() and hand that call's
+ * return value straight back to the caller.
+ */
+int rtnl_lock_interruptible(void)
+{
+ return mutex_lock_interruptible(&rtnl_mutex);
+}
+
+/* Take rtnl_mutex with mutex_lock_killable() and hand that call's
+ * return value straight back to the caller.
+ */
int rtnl_lock_killable(void)
{
return mutex_lock_killable(&rtnl_mutex);
}
-EXPORT_SYMBOL(rtnl_lock_killable);
static struct sk_buff *defer_kfree_skb_list;
void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
@@ -221,6 +226,16 @@ int rtnl_net_trylock(struct net *net)
}
EXPORT_SYMBOL(rtnl_net_trylock);
+/* Killable counterpart for the per-netns lock: call rtnl_lock_killable()
+ * first and, only if it returned 0, follow up with __rtnl_net_lock(net).
+ * Returns 0 on success, otherwise the error from rtnl_lock_killable().
+ */
+int rtnl_net_lock_killable(struct net *net)
+{
+	int err;
+
+	err = rtnl_lock_killable();
+	if (err)
+		return err;
+
+	__rtnl_net_lock(net);
+	return 0;
+}
+
static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
{
if (net_eq(net_a, net_b))
@@ -1028,7 +1043,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
}
EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);
-void netdev_set_operstate(struct net_device *dev, int newstate)
+void netif_set_operstate(struct net_device *dev, int newstate)
{
unsigned int old = READ_ONCE(dev->operstate);
@@ -1037,9 +1052,9 @@ void netdev_set_operstate(struct net_device *dev, int newstate)
return;
} while (!try_cmpxchg(&dev->operstate, &old, newstate));
- netdev_state_change(dev);
+ netif_state_change(dev);
}
-EXPORT_SYMBOL(netdev_set_operstate);
+EXPORT_SYMBOL(netif_set_operstate);
static void set_operstate(struct net_device *dev, unsigned char transition)
{
@@ -1065,7 +1080,7 @@ static void set_operstate(struct net_device *dev, unsigned char transition)
break;
}
- netdev_set_operstate(dev, operstate);
+ netif_set_operstate(dev, operstate);
}
static unsigned int rtnl_dev_get_flags(const struct net_device *dev)
@@ -1162,6 +1177,9 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
/* IFLA_VF_STATS_TX_DROPPED */
nla_total_size_64bit(sizeof(__u64)));
}
+ if (dev->netdev_ops->ndo_get_vf_guid)
+ size += num_vfs * 2 *
+ nla_total_size(sizeof(struct ifla_vf_guid));
return size;
} else
return 0;
@@ -1278,6 +1296,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_TSO_MAX_SEGS */
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
+ + nla_total_size(1) /* IFLA_NETNS_IMMUTABLE */
+ nla_total_size(4) /* IFLA_CARRIER_CHANGES */
+ nla_total_size(4) /* IFLA_LINK_NETNSID */
+ nla_total_size(4) /* IFLA_GROUP */
@@ -2032,6 +2051,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
netif_running(dev) ? READ_ONCE(dev->operstate) :
IF_OPER_DOWN) ||
nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) ||
+ nla_put_u8(skb, IFLA_NETNS_IMMUTABLE, dev->netns_immutable) ||
nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) ||
nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) ||
nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) ||
@@ -2220,6 +2240,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_ALLMULTI] = { .type = NLA_REJECT },
[IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1),
[IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 },
+ [IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -2369,12 +2390,12 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh,
if (strict_check) {
struct ifinfomsg *ifm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
NL_SET_ERR_MSG(extack, "Invalid header for link dump");
return -EINVAL;
}
- ifm = nlmsg_data(nlh);
if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
ifm->ifi_change) {
NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request");
@@ -2895,12 +2916,19 @@ static int do_set_master(struct net_device *dev, int ifindex,
const struct net_device_ops *ops;
int err;
+ /* Release the lower lock, the upper is responsible for locking
+ * the lower if needed. None of the existing upper devices
+ * use netdev instance lock, so don't grab it.
+ */
+
if (upper_dev) {
if (upper_dev->ifindex == ifindex)
return 0;
ops = upper_dev->netdev_ops;
if (ops->ndo_del_slave) {
+ netdev_unlock_ops(dev);
err = ops->ndo_del_slave(upper_dev, dev);
+ netdev_lock_ops(dev);
if (err)
return err;
} else {
@@ -2914,7 +2942,9 @@ static int do_set_master(struct net_device *dev, int ifindex,
return -EINVAL;
ops = upper_dev->netdev_ops;
if (ops->ndo_add_slave) {
+ netdev_unlock_ops(dev);
err = ops->ndo_add_slave(upper_dev, dev, extack);
+ netdev_lock_ops(dev);
if (err)
return err;
} else {
@@ -2964,7 +2994,7 @@ static int do_set_proto_down(struct net_device *dev,
if (pdreason[IFLA_PROTO_DOWN_REASON_MASK])
mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]);
- dev_change_proto_down_reason(dev, mask, value);
+ netdev_change_proto_down_reason_locked(dev, mask, value);
}
if (nl_proto_down) {
@@ -2975,8 +3005,7 @@ static int do_set_proto_down(struct net_device *dev,
NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons");
return -EBUSY;
}
- err = dev_change_proto_down(dev,
- proto_down);
+ err = netif_change_proto_down(dev, proto_down);
if (err)
return err;
}
@@ -2998,7 +3027,7 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev,
err = validate_linkmsg(dev, tb, extack);
if (err < 0)
- goto errout;
+ return err;
if (tb[IFLA_IFNAME])
nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
@@ -3011,13 +3040,16 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev,
new_ifindex = nla_get_s32_default(tb[IFLA_NEW_IFINDEX], 0);
- err = __dev_change_net_namespace(dev, tgt_net, pat, new_ifindex);
+ err = __dev_change_net_namespace(dev, tgt_net, pat,
+ new_ifindex, extack);
if (err)
- goto errout;
+ return err;
status |= DO_SETLINK_MODIFIED;
}
+ netdev_lock_ops(dev);
+
if (tb[IFLA_MAP]) {
struct rtnl_link_ifmap *u_map;
struct ifmap k_map;
@@ -3048,35 +3080,35 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev,
}
if (tb[IFLA_ADDRESS]) {
- struct sockaddr *sa;
- int len;
-
- len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len,
- sizeof(*sa));
- sa = kmalloc(len, GFP_KERNEL);
- if (!sa) {
- err = -ENOMEM;
+ struct sockaddr_storage ss = { };
+
+ netdev_unlock_ops(dev);
+
+ /* dev_addr_sem is an outer lock, enforce proper ordering */
+ down_write(&dev_addr_sem);
+ netdev_lock_ops(dev);
+
+ ss.ss_family = dev->type;
+ memcpy(ss.__data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len);
+ err = netif_set_mac_address(dev, &ss, extack);
+ if (err) {
+ up_write(&dev_addr_sem);
goto errout;
}
- sa->sa_family = dev->type;
- memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
- dev->addr_len);
- err = dev_set_mac_address_user(dev, sa, extack);
- kfree(sa);
- if (err)
- goto errout;
status |= DO_SETLINK_MODIFIED;
+
+ up_write(&dev_addr_sem);
}
if (tb[IFLA_MTU]) {
- err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack);
+ err = netif_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack);
if (err < 0)
goto errout;
status |= DO_SETLINK_MODIFIED;
}
if (tb[IFLA_GROUP]) {
- dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+ netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
status |= DO_SETLINK_NOTIFY;
}
@@ -3086,15 +3118,15 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev,
* requested.
*/
if (ifm->ifi_index > 0 && ifname[0]) {
- err = dev_change_name(dev, ifname);
+ err = netif_change_name(dev, ifname);
if (err < 0)
goto errout;
status |= DO_SETLINK_MODIFIED;
}
if (tb[IFLA_IFALIAS]) {
- err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]),
- nla_len(tb[IFLA_IFALIAS]));
+ err = netif_set_alias(dev, nla_data(tb[IFLA_IFALIAS]),
+ nla_len(tb[IFLA_IFALIAS]));
if (err < 0)
goto errout;
status |= DO_SETLINK_NOTIFY;
@@ -3106,8 +3138,8 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev,
}
if (ifm->ifi_flags || ifm->ifi_change) {
- err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
- extack);
+ err = netif_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
+ extack);
if (err < 0)
goto errout;
}
@@ -3120,7 +3152,7 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev,
}
if (tb[IFLA_CARRIER]) {
- err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
+ err = netif_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
if (err)
goto errout;
status |= DO_SETLINK_MODIFIED;
@@ -3129,7 +3161,7 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev,
if (tb[IFLA_TXQLEN]) {
unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]);
- err = dev_change_tx_queue_len(dev, value);
+ err = netif_change_tx_queue_len(dev, value);
if (err)
goto errout;
status |= DO_SETLINK_MODIFIED;
@@ -3353,13 +3385,15 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev,
errout:
if (status & DO_SETLINK_MODIFIED) {
if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY)
- netdev_state_change(dev);
+ netif_state_change(dev);
if (err < 0)
net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n",
dev->name);
}
+ netdev_unlock_ops(dev);
+
return err;
}
@@ -3423,6 +3457,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
err = -ENODEV;
rtnl_nets_unlock(&rtnl_nets);
+ rtnl_nets_destroy(&rtnl_nets);
errout:
return err;
}
@@ -3534,7 +3569,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm,
u32 portid, const struct nlmsghdr *nlh)
{
- unsigned int old_flags;
+ unsigned int old_flags, changed;
int err;
old_flags = dev->flags;
@@ -3545,12 +3580,13 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm,
return err;
}
- if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) {
- __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags), portid, nlh);
- } else {
- dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
- __dev_notify_flags(dev, old_flags, ~0U, portid, nlh);
+ changed = old_flags ^ dev->flags;
+ if (dev->rtnl_link_initializing) {
+ dev->rtnl_link_initializing = false;
+ changed = ~0U;
}
+
+ __dev_notify_flags(dev, old_flags, changed, portid, nlh);
return 0;
}
EXPORT_SYMBOL(rtnl_configure_link);
@@ -3608,7 +3644,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
dev_net_set(dev, net);
dev->rtnl_link_ops = ops;
- dev->rtnl_link_state = RTNL_LINK_INITIALIZING;
+ dev->rtnl_link_initializing = true;
if (tb[IFLA_MTU]) {
u32 mtu = nla_get_u32(tb[IFLA_MTU]);
@@ -3635,7 +3671,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname,
if (tb[IFLA_LINKMODE])
dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
if (tb[IFLA_GROUP])
- dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+ netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
if (tb[IFLA_GSO_MAX_SIZE])
netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE]));
if (tb[IFLA_GSO_MAX_SEGS])
@@ -3752,7 +3788,13 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
struct netlink_ext_ack *extack)
{
unsigned char name_assign_type = NET_NAME_USER;
- struct net *net = sock_net(skb->sk);
+ struct rtnl_newlink_params params = {
+ .src_net = sock_net(skb->sk),
+ .link_net = link_net,
+ .peer_net = peer_net,
+ .tb = tb,
+ .data = data,
+ };
u32 portid = NETLINK_CB(skb).portid;
struct net_device *dev;
char ifname[IFNAMSIZ];
@@ -3768,8 +3810,8 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
name_assign_type = NET_NAME_ENUM;
}
- dev = rtnl_create_link(link_net ? : tgt_net, ifname,
- name_assign_type, ops, tb, extack);
+ dev = rtnl_create_link(tgt_net, ifname, name_assign_type, ops, tb,
+ extack);
if (IS_ERR(dev)) {
err = PTR_ERR(dev);
goto out;
@@ -3777,13 +3819,8 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
dev->ifindex = ifm->ifi_index;
- if (link_net)
- net = link_net;
- if (peer_net)
- net = peer_net;
-
if (ops->newlink)
- err = ops->newlink(net, dev, tb, data, extack);
+ err = ops->newlink(dev, &params, extack);
else
err = register_netdevice(dev);
if (err < 0) {
@@ -3791,22 +3828,22 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
goto out;
}
+ netdev_lock_ops(dev);
+
err = rtnl_configure_link(dev, ifm, portid, nlh);
if (err < 0)
goto out_unregister;
- if (link_net) {
- err = dev_change_net_namespace(dev, tgt_net, ifname);
- if (err < 0)
- goto out_unregister;
- }
if (tb[IFLA_MASTER]) {
err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack);
if (err)
goto out_unregister;
}
+
+ netdev_unlock_ops(dev);
out:
return err;
out_unregister:
+ netdev_unlock_ops(dev);
if (ops->newlink) {
LIST_HEAD(list_kill);
@@ -3852,20 +3889,26 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
{
struct nlattr ** const tb = tbs->tb;
struct net *net = sock_net(skb->sk);
+ struct net *device_net;
struct net_device *dev;
struct ifinfomsg *ifm;
bool link_specified;
+ /* When creating, lookup for existing device in target net namespace */
+ device_net = (nlh->nlmsg_flags & NLM_F_CREATE) &&
+ (nlh->nlmsg_flags & NLM_F_EXCL) ?
+ tgt_net : net;
+
ifm = nlmsg_data(nlh);
if (ifm->ifi_index > 0) {
link_specified = true;
- dev = __dev_get_by_index(net, ifm->ifi_index);
+ dev = __dev_get_by_index(device_net, ifm->ifi_index);
} else if (ifm->ifi_index < 0) {
NL_SET_ERR_MSG(extack, "ifindex can't be negative");
return -EINVAL;
} else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) {
link_specified = true;
- dev = rtnl_dev_get(net, tb);
+ dev = rtnl_dev_get(device_net, tb);
} else {
link_specified = false;
dev = NULL;
@@ -4030,7 +4073,8 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb,
struct ifinfomsg *ifm;
int i, err;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
NL_SET_ERR_MSG(extack, "Invalid header for get link");
return -EINVAL;
}
@@ -4039,7 +4083,6 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb,
return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
ifla_policy, extack);
- ifm = nlmsg_data(nlh);
if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
ifm->ifi_change) {
NL_SET_ERR_MSG(extack, "Invalid values in header for get link request");
@@ -4765,15 +4808,16 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,
int *idx,
struct netdev_hw_addr_list *list)
{
+ struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
struct netdev_hw_addr *ha;
- int err;
u32 portid, seq;
+ int err;
portid = NETLINK_CB(cb->skb).portid;
seq = cb->nlh->nlmsg_seq;
list_for_each_entry(ha, &list->list, list) {
- if (*idx < cb->args[2])
+ if (*idx < ctx->fdb_idx)
goto skip;
err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0,
@@ -4829,12 +4873,12 @@ static int valid_fdb_dump_strict(const struct nlmsghdr *nlh,
struct ndmsg *ndm;
int err, i;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request");
return -EINVAL;
}
- ndm = nlmsg_data(nlh);
if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
ndm->ndm_flags || ndm->ndm_type) {
NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request");
@@ -4912,18 +4956,16 @@ static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh,
static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct net_device *dev;
- struct net_device *br_dev = NULL;
- const struct net_device_ops *ops = NULL;
- const struct net_device_ops *cops = NULL;
+ const struct net_device_ops *ops = NULL, *cops = NULL;
+ struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
+ struct net_device *dev, *br_dev = NULL;
struct net *net = sock_net(skb->sk);
- struct hlist_head *head;
int brport_idx = 0;
int br_idx = 0;
- int h, s_h;
- int idx = 0, s_idx;
- int err = 0;
int fidx = 0;
+ int err;
+
+ NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context);
if (cb->strict_check)
err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx,
@@ -4942,70 +4984,51 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
ops = br_dev->netdev_ops;
}
- s_h = cb->args[0];
- s_idx = cb->args[1];
-
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &net->dev_index_head[h];
- hlist_for_each_entry(dev, head, index_hlist) {
-
- if (brport_idx && (dev->ifindex != brport_idx))
- continue;
-
- if (!br_idx) { /* user did not specify a specific bridge */
- if (netif_is_bridge_port(dev)) {
- br_dev = netdev_master_upper_dev_get(dev);
- cops = br_dev->netdev_ops;
- }
- } else {
- if (dev != br_dev &&
- !netif_is_bridge_port(dev))
- continue;
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ if (brport_idx && (dev->ifindex != brport_idx))
+ continue;
- if (br_dev != netdev_master_upper_dev_get(dev) &&
- !netif_is_bridge_master(dev))
- continue;
- cops = ops;
+ if (!br_idx) { /* user did not specify a specific bridge */
+ if (netif_is_bridge_port(dev)) {
+ br_dev = netdev_master_upper_dev_get(dev);
+ cops = br_dev->netdev_ops;
}
+ } else {
+ if (dev != br_dev &&
+ !netif_is_bridge_port(dev))
+ continue;
- if (idx < s_idx)
- goto cont;
+ if (br_dev != netdev_master_upper_dev_get(dev) &&
+ !netif_is_bridge_master(dev))
+ continue;
+ cops = ops;
+ }
- if (netif_is_bridge_port(dev)) {
- if (cops && cops->ndo_fdb_dump) {
- err = cops->ndo_fdb_dump(skb, cb,
- br_dev, dev,
- &fidx);
- if (err == -EMSGSIZE)
- goto out;
- }
+ if (netif_is_bridge_port(dev)) {
+ if (cops && cops->ndo_fdb_dump) {
+ err = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
+ &fidx);
+ if (err == -EMSGSIZE)
+ break;
}
+ }
- if (dev->netdev_ops->ndo_fdb_dump)
- err = dev->netdev_ops->ndo_fdb_dump(skb, cb,
- dev, NULL,
- &fidx);
- else
- err = ndo_dflt_fdb_dump(skb, cb, dev, NULL,
- &fidx);
- if (err == -EMSGSIZE)
- goto out;
+ if (dev->netdev_ops->ndo_fdb_dump)
+ err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL,
+ &fidx);
+ else
+ err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx);
+ if (err == -EMSGSIZE)
+ break;
- cops = NULL;
+ cops = NULL;
- /* reset fdb offset to 0 for rest of the interfaces */
- cb->args[2] = 0;
- fidx = 0;
-cont:
- idx++;
- }
+ /* reset fdb offset to 0 for rest of the interfaces */
+ ctx->fdb_idx = 0;
+ fidx = 0;
}
-out:
- cb->args[0] = h;
- cb->args[1] = idx;
- cb->args[2] = fidx;
+ ctx->fdb_idx = fidx;
return skb->len;
}
@@ -5018,12 +5041,12 @@ static int valid_fdb_get_strict(const struct nlmsghdr *nlh,
struct ndmsg *ndm;
int err, i;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
NL_SET_ERR_MSG(extack, "Invalid header for fdb get request");
return -EINVAL;
}
- ndm = nlmsg_data(nlh);
if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
ndm->ndm_type) {
NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request");
@@ -5290,12 +5313,12 @@ static int valid_bridge_getlink_req(const struct nlmsghdr *nlh,
if (strict_check) {
struct ifinfomsg *ifm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump");
return -EINVAL;
}
- ifm = nlmsg_data(nlh);
if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
ifm->ifi_change || ifm->ifi_index) {
NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request");
@@ -6187,7 +6210,8 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check,
{
struct if_stats_msg *ifsm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifsm))) {
+ ifsm = nlmsg_payload(nlh, sizeof(*ifsm));
+ if (!ifsm) {
NL_SET_ERR_MSG(extack, "Invalid header for stats dump");
return -EINVAL;
}
@@ -6195,8 +6219,6 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check,
if (!strict_check)
return 0;
- ifsm = nlmsg_data(nlh);
-
/* only requests using strict checks can pass data to influence
* the dump. The legacy exception is filter_mask.
*/
@@ -6424,12 +6446,12 @@ static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh,
{
struct br_port_msg *bpm;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*bpm))) {
+ bpm = nlmsg_payload(nlh, sizeof(*bpm));
+ if (!bpm) {
NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request");
return -EINVAL;
}
- bpm = nlmsg_data(nlh);
if (bpm->ifindex) {
NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request");
return -EINVAL;
diff --git a/net/core/scm.c b/net/core/scm.c
index 4f6a14babe5a..0225bd94170f 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -282,6 +282,16 @@ efault:
}
EXPORT_SYMBOL(put_cmsg);
+int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len,
+ void *data)
+{
+ /* Don't produce truncated CMSGs */
+ if (!msg->msg_control || msg->msg_controllen < CMSG_LEN(len))
+ return -ETOOSMALL;
+
+ return put_cmsg(msg, level, type, len, data);
+}
+
void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal)
{
struct scm_timestamping64 tss;
@@ -394,3 +404,125 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
return new_fpl;
}
EXPORT_SYMBOL(scm_fp_dup);
+
+#ifdef CONFIG_SECURITY_NETWORK
+static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct lsm_context ctx;
+ int err;
+
+ if (sk->sk_scm_security) {
+ err = security_secid_to_secctx(scm->secid, &ctx);
+
+ if (err >= 0) {
+ put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len,
+ ctx.context);
+
+ security_release_secctx(&ctx);
+ }
+ }
+}
+
+static bool scm_has_secdata(struct sock *sk)
+{
+ return sk->sk_scm_security;
+}
+#else
+static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm)
+{
+}
+
+static bool scm_has_secdata(struct sock *sk)
+{
+ return false;
+}
+#endif
+
+static void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct file *pidfd_file = NULL;
+ int len, pidfd;
+
+ /* put_cmsg() doesn't return an error if the CMSG is truncated,
+ * which is why we need to open-code these checks here.
+ */
+ if (msg->msg_flags & MSG_CMSG_COMPAT)
+ len = sizeof(struct compat_cmsghdr) + sizeof(int);
+ else
+ len = sizeof(struct cmsghdr) + sizeof(int);
+
+ if (msg->msg_controllen < len) {
+ msg->msg_flags |= MSG_CTRUNC;
+ return;
+ }
+
+ if (!scm->pid)
+ return;
+
+ pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file);
+
+ if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {
+ if (pidfd_file) {
+ put_unused_fd(pidfd);
+ fput(pidfd_file);
+ }
+
+ return;
+ }
+
+ if (pidfd_file)
+ fd_install(pidfd, pidfd_file);
+}
+
+static bool __scm_recv_common(struct sock *sk, struct msghdr *msg,
+ struct scm_cookie *scm, int flags)
+{
+ if (!msg->msg_control) {
+ if (sk->sk_scm_credentials || sk->sk_scm_pidfd ||
+ scm->fp || scm_has_secdata(sk))
+ msg->msg_flags |= MSG_CTRUNC;
+
+ scm_destroy(scm);
+ return false;
+ }
+
+ if (sk->sk_scm_credentials) {
+ struct user_namespace *current_ns = current_user_ns();
+ struct ucred ucreds = {
+ .pid = scm->creds.pid,
+ .uid = from_kuid_munged(current_ns, scm->creds.uid),
+ .gid = from_kgid_munged(current_ns, scm->creds.gid),
+ };
+
+ put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds);
+ }
+
+ scm_passec(sk, msg, scm);
+
+ if (scm->fp)
+ scm_detach_fds(msg, scm);
+
+ return true;
+}
+
+void scm_recv(struct socket *sock, struct msghdr *msg,
+ struct scm_cookie *scm, int flags)
+{
+ if (!__scm_recv_common(sock->sk, msg, scm, flags))
+ return;
+
+ scm_destroy_cred(scm);
+}
+EXPORT_SYMBOL(scm_recv);
+
+void scm_recv_unix(struct socket *sock, struct msghdr *msg,
+ struct scm_cookie *scm, int flags)
+{
+ if (!__scm_recv_common(sock->sk, msg, scm, flags))
+ return;
+
+ if (sock->sk->sk_scm_pidfd)
+ scm_pidfd_recv(msg, scm);
+
+ scm_destroy_cred(scm);
+}
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index b0ff6153be62..9a3965680451 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -71,7 +71,7 @@ u32 secure_tcpv6_ts_off(const struct net *net,
return siphash(&combined, offsetofend(typeof(combined), daddr),
&ts_secret);
}
-EXPORT_SYMBOL(secure_tcpv6_ts_off);
+EXPORT_IPV6_MOD(secure_tcpv6_ts_off);
u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
__be16 sport, __be16 dport)
@@ -156,45 +156,3 @@ u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
}
EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
#endif
-
-#if IS_ENABLED(CONFIG_IP_DCCP)
-u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport)
-{
- u64 seq;
- net_secret_init();
- seq = siphash_3u32((__force u32)saddr, (__force u32)daddr,
- (__force u32)sport << 16 | (__force u32)dport,
- &net_secret);
- seq += ktime_get_real_ns();
- seq &= (1ull << 48) - 1;
- return seq;
-}
-EXPORT_SYMBOL(secure_dccp_sequence_number);
-
-#if IS_ENABLED(CONFIG_IPV6)
-u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
- __be16 sport, __be16 dport)
-{
- const struct {
- struct in6_addr saddr;
- struct in6_addr daddr;
- __be16 sport;
- __be16 dport;
- } __aligned(SIPHASH_ALIGNMENT) combined = {
- .saddr = *(struct in6_addr *)saddr,
- .daddr = *(struct in6_addr *)daddr,
- .sport = sport,
- .dport = dport
- };
- u64 seq;
- net_secret_init();
- seq = siphash(&combined, offsetofend(typeof(combined), dport),
- &net_secret);
- seq += ktime_get_real_ns();
- seq &= (1ull << 48) - 1;
- return seq;
-}
-EXPORT_SYMBOL(secure_dccpv6_sequence_number);
-#endif
-#endif
diff --git a/net/core/selftests.c b/net/core/selftests.c
index 8f801e6e3b91..35f807ea9952 100644
--- a/net/core/selftests.c
+++ b/net/core/selftests.c
@@ -100,10 +100,10 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev,
ehdr->h_proto = htons(ETH_P_IP);
if (attr->tcp) {
+ memset(thdr, 0, sizeof(*thdr));
thdr->source = htons(attr->sport);
thdr->dest = htons(attr->dport);
thdr->doff = sizeof(struct tcphdr) / 4;
- thdr->check = 0;
} else {
uhdr->source = htons(attr->sport);
uhdr->dest = htons(attr->dport);
@@ -144,10 +144,18 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev,
attr->id = net_test_next_id;
shdr->id = net_test_next_id++;
- if (attr->size)
- skb_put(skb, attr->size);
- if (attr->max_size && attr->max_size > skb->len)
- skb_put(skb, attr->max_size - skb->len);
+ if (attr->size) {
+ void *payload = skb_put(skb, attr->size);
+
+ memset(payload, 0, attr->size);
+ }
+
+ if (attr->max_size && attr->max_size > skb->len) {
+ size_t pad_len = attr->max_size - skb->len;
+ void *pad = skb_put(skb, pad_len);
+
+ memset(pad, 0, pad_len);
+ }
skb->csum = 0;
skb->ip_summed = CHECKSUM_PARTIAL;
@@ -299,7 +307,7 @@ static int net_test_phy_loopback_enable(struct net_device *ndev)
if (!ndev->phydev)
return -EOPNOTSUPP;
- return phy_loopback(ndev->phydev, true);
+ return phy_loopback(ndev->phydev, true, 0);
}
static int net_test_phy_loopback_disable(struct net_device *ndev)
@@ -307,7 +315,7 @@ static int net_test_phy_loopback_disable(struct net_device *ndev)
if (!ndev->phydev)
return -EOPNOTSUPP;
- return phy_loopback(ndev->phydev, false);
+ return phy_loopback(ndev->phydev, false, 0);
}
static int net_test_phy_loopback_udp(struct net_device *ndev)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6841e61a6bd0..85fc82f72d26 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -64,11 +64,13 @@
#include <linux/mpls.h>
#include <linux/kcov.h>
#include <linux/iov_iter.h>
+#include <linux/crc32.h>
#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
+#include <net/gro.h>
#include <net/gso.h>
#include <net/hotdata.h>
#include <net/ip6_checksum.h>
@@ -88,6 +90,7 @@
#include <linux/textsearch.h>
#include "dev.h"
+#include "devmem.h"
#include "netmem_priv.h"
#include "sock_destructor.h"
@@ -95,7 +98,9 @@
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
-#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)
+#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN)
+#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \
+ GRO_MAX_HEAD_PAD))
/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
* This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
@@ -220,67 +225,9 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
#define NAPI_SKB_CACHE_BULK 16
#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
-#if PAGE_SIZE == SZ_4K
-
-#define NAPI_HAS_SMALL_PAGE_FRAG 1
-#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc)
-
-/* specialized page frag allocator using a single order 0 page
- * and slicing it into 1K sized fragment. Constrained to systems
- * with a very limited amount of 1K fragments fitting a single
- * page - to avoid excessive truesize underestimation
- */
-
-struct page_frag_1k {
- void *va;
- u16 offset;
- bool pfmemalloc;
-};
-
-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
-{
- struct page *page;
- int offset;
-
- offset = nc->offset - SZ_1K;
- if (likely(offset >= 0))
- goto use_frag;
-
- page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
- if (!page)
- return NULL;
-
- nc->va = page_address(page);
- nc->pfmemalloc = page_is_pfmemalloc(page);
- offset = PAGE_SIZE - SZ_1K;
- page_ref_add(page, offset / SZ_1K);
-
-use_frag:
- nc->offset = offset;
- return nc->va + offset;
-}
-#else
-
-/* the small page is actually unused in this build; add dummy helpers
- * to please the compiler and avoid later preprocessor's conditionals
- */
-#define NAPI_HAS_SMALL_PAGE_FRAG 0
-#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false
-
-struct page_frag_1k {
-};
-
-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
-{
- return NULL;
-}
-
-#endif
-
struct napi_alloc_cache {
local_lock_t bh_lock;
struct page_frag_cache page;
- struct page_frag_1k page_small;
unsigned int skb_count;
void *skb_cache[NAPI_SKB_CACHE_SIZE];
};
@@ -290,23 +237,6 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
-/* Double check that napi_get_frags() allocates skbs with
- * skb->head being backed by slab, not a page fragment.
- * This is to make sure bug fixed in 3226b158e67c
- * ("net: avoid 32 x truesize under-estimation for tiny skbs")
- * does not accidentally come back.
- */
-void napi_get_frags_check(struct napi_struct *napi)
-{
- struct sk_buff *skb;
-
- local_bh_disable();
- skb = napi_get_frags(napi);
- WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
- napi_free_frags(napi);
- local_bh_enable();
-}
-
void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
@@ -367,6 +297,68 @@ static struct sk_buff *napi_skb_cache_get(void)
return skb;
}
+/**
+ * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache
+ * @skbs: pointer to an at least @n-sized array to fill with skb pointers
+ * @n: number of entries to provide
+ *
+ * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes
+ * the pointers into the provided array @skbs. If there are fewer entries
+ * available, tries to replenish the cache and bulk-allocates the diff from
+ * the MM layer if needed.
+ * The heads are being zeroed with either memset() or %__GFP_ZERO, so they are
+ * ready for {,__}build_skb_around() and don't have any data buffers attached.
+ * Must be called *only* from the BH context.
+ *
+ * Return: number of successfully allocated skbs (@n if no actual allocation
+ * needed or kmem_cache_alloc_bulk() didn't fail).
+ */
+u32 napi_skb_cache_get_bulk(void **skbs, u32 n)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ u32 bulk, total = n;
+
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+
+ if (nc->skb_count >= n)
+ goto get;
+
+ /* Not enough cached skbs. Try refilling the cache first */
+ bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK);
+ nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN, bulk,
+ &nc->skb_cache[nc->skb_count]);
+ if (likely(nc->skb_count >= n))
+ goto get;
+
+ /* Still not enough. Bulk-allocate the missing part directly, zeroed */
+ n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN,
+ n - nc->skb_count, &skbs[nc->skb_count]);
+ if (likely(nc->skb_count >= n))
+ goto get;
+
+ /* kmem_cache didn't allocate the number we need, limit the output */
+ total -= n - nc->skb_count;
+ n = nc->skb_count;
+
+get:
+ for (u32 base = nc->skb_count - n, i = 0; i < n; i++) {
+ u32 cache_size = kmem_cache_size(net_hotdata.skbuff_cache);
+
+ skbs[i] = nc->skb_cache[base + i];
+
+ kasan_mempool_unpoison_object(skbs[i], cache_size);
+ memset(skbs[i], 0, offsetof(struct sk_buff, tail));
+ }
+
+ nc->skb_count -= n;
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+
+ return total;
+}
+EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk);
+
static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
unsigned int size)
{
@@ -736,7 +728,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
/* If requested length is either too small or too big,
* we use kmalloc() for skb->head allocation.
*/
- if (len <= SKB_WITH_OVERHEAD(1024) ||
+ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
@@ -813,10 +805,8 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
/* If requested length is either too small or too big,
* we use kmalloc() for skb->head allocation.
- * When the small frag allocator is available, prefer it over kmalloc
- * for small fragments
*/
- if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
+ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
@@ -826,32 +816,16 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
goto skb_success;
}
+ len = SKB_HEAD_ALIGN(len);
+
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
nc = this_cpu_ptr(&napi_alloc_cache);
- if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
- /* we are artificially inflating the allocation size, but
- * that is not as bad as it may look like, as:
- * - 'len' less than GRO_MAX_HEAD makes little sense
- * - On most systems, larger 'len' values lead to fragment
- * size above 512 bytes
- * - kmalloc would use the kmalloc-1k slab for such values
- * - Builds with smaller GRO_MAX_HEAD will very likely do
- * little networking, as that implies no WiFi and no
- * tunnels support, and 32 bits arches.
- */
- len = SZ_1K;
-
- data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
- pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
- } else {
- len = SKB_HEAD_ALIGN(len);
- data = page_frag_alloc(&nc->page, len, gfp_mask);
- pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
- }
+ data = page_frag_alloc(&nc->page, len, gfp_mask);
+ pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
if (unlikely(!data))
@@ -921,11 +895,6 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
-static bool is_pp_netmem(netmem_ref netmem)
-{
- return (netmem_get_pp_magic(netmem) & ~0x3UL) == PP_SIGNATURE;
-}
-
int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
unsigned int headroom)
{
@@ -1009,7 +978,7 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
EXPORT_SYMBOL(skb_pp_cow_data);
int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
- struct bpf_prog *prog)
+ const struct bpf_prog *prog)
{
if (!prog->aux->xdp_has_frags)
return -EINVAL;
@@ -1023,14 +992,7 @@ bool napi_pp_put_page(netmem_ref netmem)
{
netmem = netmem_compound_head(netmem);
- /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
- * in order to preserve any existing bits, such as bit 0 for the
- * head page of compound page and bit 1 for pfmemalloc page, so
- * mask those bits for freeing side when doing below checking,
- * and page_is_pfmemalloc() is checked in __page_pool_put_page()
- * to avoid recycling the pfmemalloc page.
- */
- if (unlikely(!is_pp_netmem(netmem)))
+ if (unlikely(!netmem_is_pp(netmem)))
return false;
page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);
@@ -1070,7 +1032,7 @@ static int skb_pp_frag_ref(struct sk_buff *skb)
for (i = 0; i < shinfo->nr_frags; i++) {
head_netmem = netmem_compound_head(shinfo->frags[i].netmem);
- if (likely(is_pp_netmem(head_netmem)))
+ if (likely(netmem_is_pp(head_netmem)))
page_pool_ref_netmem(head_netmem);
else
page_ref_inc(netmem_to_page(head_netmem));
@@ -1694,7 +1656,8 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp)
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
-static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
+static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size,
+ bool devmem)
{
struct ubuf_info_msgzc *uarg;
struct sk_buff *skb;
@@ -1709,7 +1672,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
uarg = (void *)skb->cb;
uarg->mmp.user = NULL;
- if (mm_account_pinned_pages(&uarg->mmp, size)) {
+ if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) {
kfree_skb(skb);
return NULL;
}
@@ -1732,7 +1695,7 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
}
struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
- struct ubuf_info *uarg)
+ struct ubuf_info *uarg, bool devmem)
{
if (uarg) {
struct ubuf_info_msgzc *uarg_zc;
@@ -1762,7 +1725,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
next = (u32)atomic_read(&sk->sk_zckey);
if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
- if (mm_account_pinned_pages(&uarg_zc->mmp, size))
+ if (likely(!devmem) &&
+ mm_account_pinned_pages(&uarg_zc->mmp, size))
return NULL;
uarg_zc->len++;
uarg_zc->bytelen = bytelen;
@@ -1777,7 +1741,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
}
new_alloc:
- return msg_zerocopy_alloc(sk, size);
+ return msg_zerocopy_alloc(sk, size, devmem);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);
@@ -1881,7 +1845,8 @@ EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
- struct ubuf_info *uarg)
+ struct ubuf_info *uarg,
+ struct net_devmem_dmabuf_binding *binding)
{
int err, orig_len = skb->len;
@@ -1900,7 +1865,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
return -EEXIST;
}
- err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
+ err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len,
+ binding);
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
struct sock *save_sk = skb->sk;
@@ -3267,7 +3233,7 @@ static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg)
typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg);
static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
- int len, sendmsg_func sendmsg)
+ int len, sendmsg_func sendmsg, int flags)
{
unsigned int orig_len = len;
struct sk_buff *head = skb;
@@ -3285,7 +3251,7 @@ do_frag_list:
kv.iov_base = skb->data + offset;
kv.iov_len = slen;
memset(&msg, 0, sizeof(msg));
- msg.msg_flags = MSG_DONTWAIT;
+ msg.msg_flags = MSG_DONTWAIT | flags;
iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen);
ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
@@ -3322,7 +3288,8 @@ do_frag_list:
while (slen) {
struct bio_vec bvec;
struct msghdr msg = {
- .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT,
+ .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT |
+ flags,
};
bvec_set_page(&bvec, skb_frag_page(frag), slen,
@@ -3368,14 +3335,21 @@ error:
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
int len)
{
- return __skb_send_sock(sk, skb, offset, len, sendmsg_locked);
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0);
}
EXPORT_SYMBOL_GPL(skb_send_sock_locked);
+int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb,
+ int offset, int len, int flags)
+{
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags);
+}
+EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags);
+
/* Send skb data on a socket. Socket must be unlocked. */
int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
{
- return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked);
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0);
}
/**
@@ -3471,8 +3445,7 @@ fault:
EXPORT_SYMBOL(skb_store_bits);
/* Checksum skb data. */
-__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
- __wsum csum, const struct skb_checksum_ops *ops)
+__wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
@@ -3483,8 +3456,7 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
if (copy > 0) {
if (copy > len)
copy = len;
- csum = INDIRECT_CALL_1(ops->update, csum_partial_ext,
- skb->data + offset, copy, csum);
+ csum = csum_partial(skb->data + offset, copy, csum);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -3514,13 +3486,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
skb_frag_off(frag) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
- csum2 = INDIRECT_CALL_1(ops->update,
- csum_partial_ext,
- vaddr + p_off, p_len, 0);
+ csum2 = csum_partial(vaddr + p_off, p_len, 0);
kunmap_atomic(vaddr);
- csum = INDIRECT_CALL_1(ops->combine,
- csum_block_add_ext, csum,
- csum2, pos, p_len);
+ csum = csum_block_add(csum, csum2, pos);
pos += p_len;
}
@@ -3541,10 +3509,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
__wsum csum2;
if (copy > len)
copy = len;
- csum2 = __skb_checksum(frag_iter, offset - start,
- copy, 0, ops);
- csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext,
- csum, csum2, pos, copy);
+ csum2 = skb_checksum(frag_iter, offset - start, copy,
+ 0);
+ csum = csum_block_add(csum, csum2, pos);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -3556,18 +3523,6 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
return csum;
}
-EXPORT_SYMBOL(__skb_checksum);
-
-__wsum skb_checksum(const struct sk_buff *skb, int offset,
- int len, __wsum csum)
-{
- const struct skb_checksum_ops ops = {
- .update = csum_partial_ext,
- .combine = csum_block_add_ext,
- };
-
- return __skb_checksum(skb, offset, len, csum, &ops);
-}
EXPORT_SYMBOL(skb_checksum);
/* Both of above in one bottle. */
@@ -3660,6 +3615,78 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);
+#ifdef CONFIG_NET_CRC32C
+u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+
+ if (copy > 0) {
+ copy = min(copy, len);
+ crc = crc32c(crc, skb->data + offset, copy);
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+
+ if (WARN_ON_ONCE(!skb_frags_readable(skb)))
+ return 0;
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ copy = end - offset;
+ if (copy > 0) {
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
+ copy = min(copy, len);
+ skb_frag_foreach_page(frag,
+ skb_frag_off(frag) + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ crc = crc32c(crc, vaddr + p_off, p_len);
+ kunmap_atomic(vaddr);
+ }
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ copy = end - offset;
+ if (copy > 0) {
+ copy = min(copy, len);
+ crc = skb_crc32c(frag_iter, offset - start, copy, crc);
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+ start = end;
+ }
+ BUG_ON(len);
+
+ return crc;
+}
+EXPORT_SYMBOL(skb_crc32c);
+#endif /* CONFIG_NET_CRC32C */
+
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
__sum16 sum;
@@ -3719,32 +3746,6 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb)
}
EXPORT_SYMBOL(__skb_checksum_complete);
-static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
-{
- net_warn_ratelimited(
- "%s: attempt to compute crc32c without libcrc32c.ko\n",
- __func__);
- return 0;
-}
-
-static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2,
- int offset, int len)
-{
- net_warn_ratelimited(
- "%s: attempt to compute crc32c without libcrc32c.ko\n",
- __func__);
- return 0;
-}
-
-static const struct skb_checksum_ops default_crc32c_ops = {
- .update = warn_crc32c_csum_update,
- .combine = warn_crc32c_csum_combine,
-};
-
-const struct skb_checksum_ops *crc32c_csum_stub __read_mostly =
- &default_crc32c_ops;
-EXPORT_SYMBOL(crc32c_csum_stub);
-
/**
* skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
* @from: source buffer
@@ -5539,6 +5540,54 @@ err:
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ int tstype)
+{
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP;
+ case SCM_TSTAMP_SND:
+ return skb_shinfo(skb)->tx_flags & (hwtstamps ? SKBTX_HW_TSTAMP_NOBPF :
+ SKBTX_SW_TSTAMP);
+ case SCM_TSTAMP_ACK:
+ return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK;
+ case SCM_TSTAMP_COMPLETION:
+ return skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP;
+ }
+
+ return false;
+}
+
+static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk,
+ int tstype)
+{
+ int op;
+
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ op = BPF_SOCK_OPS_TSTAMP_SCHED_CB;
+ break;
+ case SCM_TSTAMP_SND:
+ if (hwtstamps) {
+ op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB;
+ *skb_hwtstamps(skb) = *hwtstamps;
+ } else {
+ op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB;
+ }
+ break;
+ case SCM_TSTAMP_ACK:
+ op = BPF_SOCK_OPS_TSTAMP_ACK_CB;
+ break;
+ default:
+ return;
+ }
+
+ bpf_skops_tx_timestamping(sk, skb, op);
+}
+
void __skb_tstamp_tx(struct sk_buff *orig_skb,
const struct sk_buff *ack_skb,
struct skb_shared_hwtstamps *hwtstamps,
@@ -5551,6 +5600,13 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (!sk)
return;
+ if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF)
+ skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps,
+ sk, tstype);
+
+ if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype))
+ return;
+
tsflags = READ_ONCE(sk->sk_tsflags);
if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
@@ -6123,11 +6179,11 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
skb->offload_fwd_mark = 0;
skb->offload_l3_fwd_mark = 0;
#endif
+ ipvs_reset(skb);
if (!xnet)
return;
- ipvs_reset(skb);
skb->mark = 0;
skb_clear_tstamp(skb);
}
@@ -7290,3 +7346,32 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
return false;
}
EXPORT_SYMBOL(csum_and_copy_from_iter_full);
+
+void get_netmem(netmem_ref netmem)
+{
+ struct net_iov *niov;
+
+ if (netmem_is_net_iov(netmem)) {
+ niov = netmem_to_net_iov(netmem);
+ if (net_is_devmem_iov(niov))
+ net_devmem_get_net_iov(netmem_to_net_iov(netmem));
+ return;
+ }
+ get_page(netmem_to_page(netmem));
+}
+EXPORT_SYMBOL(get_netmem);
+
+void put_netmem(netmem_ref netmem)
+{
+ struct net_iov *niov;
+
+ if (netmem_is_net_iov(netmem)) {
+ niov = netmem_to_net_iov(netmem);
+ if (net_is_devmem_iov(niov))
+ net_devmem_put_net_iov(netmem_to_net_iov(netmem));
+ return;
+ }
+
+ put_page(netmem_to_page(netmem));
+}
+EXPORT_SYMBOL(put_netmem);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 61f3f3d4e528..34c51eb1a14f 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -530,16 +530,22 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
u32 off, u32 len,
struct sk_psock *psock,
struct sock *sk,
- struct sk_msg *msg)
+ struct sk_msg *msg,
+ bool take_ref)
{
int num_sge, copied;
+ /* skb_to_sgvec will fail when the total number of fragments in
+ * frag_list and frags exceeds MAX_MSG_FRAGS. For example, the
+ * caller may aggregate multiple skbs.
+ */
num_sge = skb_to_sgvec(skb, msg->sg.data, off, len);
if (num_sge < 0) {
/* skb linearize may fail with ENOMEM, but lets simply try again
* later if this happens. Under memory pressure we don't want to
* drop the skb. We need to linearize the skb so that the mapping
* in skb_to_sgvec can not error.
+ * Note that skb_linearize requires the skb not to be shared.
*/
if (skb_linearize(skb))
return -EAGAIN;
@@ -549,11 +555,14 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
return num_sge;
}
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+ psock->ingress_bytes += len;
+#endif
copied = len;
msg->sg.start = 0;
msg->sg.size = copied;
msg->sg.end = num_sge;
- msg->skb = skb;
+ msg->skb = take_ref ? skb_get(skb) : skb;
sk_psock_queue_msg(psock, msg);
sk_psock_data_ready(sk, psock);
@@ -561,7 +570,7 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
}
static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
- u32 off, u32 len);
+ u32 off, u32 len, bool take_ref);
static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
u32 off, u32 len)
@@ -575,7 +584,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
* correctly.
*/
if (unlikely(skb->sk == sk))
- return sk_psock_skb_ingress_self(psock, skb, off, len);
+ return sk_psock_skb_ingress_self(psock, skb, off, len, true);
msg = sk_psock_create_ingress_msg(sk, skb);
if (!msg)
return -EAGAIN;
@@ -587,7 +596,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
* into user buffers.
*/
skb_set_owner_r(skb, sk);
- err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
+ err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true);
if (err < 0)
kfree(msg);
return err;
@@ -598,7 +607,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
* because the skb is already accounted for here.
*/
static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
- u32 off, u32 len)
+ u32 off, u32 len, bool take_ref)
{
struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC);
struct sock *sk = psock->sk;
@@ -607,7 +616,7 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
if (unlikely(!msg))
return -EAGAIN;
skb_set_owner_r(skb, sk);
- err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
+ err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref);
if (err < 0)
kfree(msg);
return err;
@@ -616,18 +625,13 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
u32 off, u32 len, bool ingress)
{
- int err = 0;
-
if (!ingress) {
if (!sock_writeable(psock->sk))
return -EAGAIN;
return skb_send_sock(psock->sk, skb, off, len);
}
- skb_get(skb);
- err = sk_psock_skb_ingress(psock, skb, off, len);
- if (err < 0)
- kfree_skb(skb);
- return err;
+
+ return sk_psock_skb_ingress(psock, skb, off, len);
}
static void sk_psock_skb_state(struct sk_psock *psock,
@@ -652,12 +656,14 @@ static void sk_psock_backlog(struct work_struct *work)
bool ingress;
int ret;
+ /* Increment the psock refcnt to synchronize with close(fd) path in
+ * sock_map_close(), ensuring we wait for backlog thread completion
+ * before sk_socket is freed. If the refcnt increment fails, then
+ * sock_map_close() has completed and sk_socket may already be freed.
+ */
+ if (!sk_psock_get(psock->sk))
+ return;
mutex_lock(&psock->work_mutex);
- if (unlikely(state->len)) {
- len = state->len;
- off = state->off;
- }
-
while ((skb = skb_peek(&psock->ingress_skb))) {
len = skb->len;
off = 0;
@@ -667,6 +673,13 @@ static void sk_psock_backlog(struct work_struct *work)
off = stm->offset;
len = stm->full_len;
}
+
+ /* Resume processing from previous partial state */
+ if (unlikely(state->len)) {
+ len = state->len;
+ off = state->off;
+ }
+
ingress = skb_bpf_ingress(skb);
skb_bpf_redirect_clear(skb);
do {
@@ -677,7 +690,8 @@ static void sk_psock_backlog(struct work_struct *work)
if (ret <= 0) {
if (ret == -EAGAIN) {
sk_psock_skb_state(psock, state, len, off);
-
+ /* Restore redir info we cleared before */
+ skb_bpf_set_redir(skb, psock->sk, ingress);
/* Delay slightly to prioritize any
* other work that might be here.
*/
@@ -694,11 +708,14 @@ static void sk_psock_backlog(struct work_struct *work)
len -= ret;
} while (len);
+ /* The entire skb was sent, clear the saved partial state */
+ sk_psock_skb_state(psock, state, 0, 0);
skb = skb_dequeue(&psock->ingress_skb);
kfree_skb(skb);
}
end:
mutex_unlock(&psock->work_mutex);
+ sk_psock_put(psock->sk, psock);
}
struct sk_psock *sk_psock_init(struct sock *sk, int node)
@@ -1011,7 +1028,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
off = stm->offset;
len = stm->full_len;
}
- err = sk_psock_skb_ingress_self(psock, skb, off, len);
+ err = sk_psock_skb_ingress_self(psock, skb, off, len, false);
}
if (err < 0) {
spin_lock_bh(&psock->ingress_lock);
@@ -1144,6 +1161,10 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
if (!ret)
sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED);
+ if (sk_is_tcp(sk)) {
+ psock->strp.cb.read_sock = tcp_bpf_strp_read_sock;
+ psock->copied_seq = tcp_sk(sk)->copied_seq;
+ }
return ret;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index be84885f9290..3b409bc8ef6d 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -148,6 +148,8 @@
#include <linux/ethtool.h>
+#include <uapi/linux/pidfd.h>
+
#include "dev.h"
static DEFINE_MUTEX(proto_list_mutex);
@@ -454,6 +456,13 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
return 0;
}
+static bool sk_set_prio_allowed(const struct sock *sk, int val)
+{
+ return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
+}
+
static bool sock_needs_netstamp(const struct sock *sk)
{
switch (sk->sk_family) {
@@ -931,6 +940,7 @@ int sock_set_timestamping(struct sock *sk, int optname,
WRITE_ONCE(sk->sk_tsflags, val);
sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
+ sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));
if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
sock_enable_timestamp(sk,
@@ -941,6 +951,20 @@ int sock_set_timestamping(struct sock *sk, int optname,
return 0;
}
+#if defined(CONFIG_CGROUP_BPF)
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
+{
+ struct bpf_sock_ops_kern sock_ops;
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+ sock_ops.op = op;
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ bpf_skops_init_skb(&sock_ops, skb, 0);
+ __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
+}
+#endif
+
void sock_set_keepalive(struct sock *sk)
{
lock_sock(sk);
@@ -1193,22 +1217,11 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
/* handle options which do not require locking the socket. */
switch (optname) {
case SO_PRIORITY:
- if ((val >= 0 && val <= 6) ||
- sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
- sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ if (sk_set_prio_allowed(sk, val)) {
sock_set_priority(sk, val);
return 0;
}
return -EPERM;
- case SO_PASSSEC:
- assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
- return 0;
- case SO_PASSCRED:
- assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
- return 0;
- case SO_PASSPIDFD:
- assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
- return 0;
case SO_TYPE:
case SO_PROTOCOL:
case SO_DOMAIN:
@@ -1256,6 +1269,8 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
return 0;
}
case SO_TXREHASH:
+ if (!sk_is_tcp(sk))
+ return -EOPNOTSUPP;
if (val < -1 || val > 1)
return -EINVAL;
if ((u8)val == SOCK_TXREHASH_DEFAULT)
@@ -1517,6 +1532,10 @@ set_sndbuf:
sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
break;
+ case SO_RCVPRIORITY:
+ sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
+ break;
+
case SO_RXQ_OVFL:
sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
break;
@@ -1533,6 +1552,33 @@ set_sndbuf:
sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
break;
+ case SO_PASSCRED:
+ if (sk_may_scm_recv(sk))
+ sk->sk_scm_credentials = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_PASSSEC:
+ if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk))
+ sk->sk_scm_security = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_PASSPIDFD:
+ if (sk_is_unix(sk))
+ sk->sk_scm_pidfd = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_PASSRIGHTS:
+ if (sk_is_unix(sk))
+ sk->sk_scm_rights = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
case SO_INCOMING_CPU:
reuseport_update_incoming_cpu(sk, val);
@@ -1829,11 +1875,24 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
break;
case SO_PASSCRED:
- v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
+ if (!sk_may_scm_recv(sk))
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_scm_credentials;
break;
case SO_PASSPIDFD:
- v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
+ if (!sk_is_unix(sk))
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_scm_pidfd;
+ break;
+
+ case SO_PASSRIGHTS:
+ if (!sk_is_unix(sk))
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_scm_rights;
break;
case SO_PEERCRED:
@@ -1855,6 +1914,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
{
struct pid *peer_pid;
struct file *pidfd_file = NULL;
+ unsigned int flags = 0;
int pidfd;
if (len > sizeof(pidfd))
@@ -1867,7 +1927,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
if (!peer_pid)
return -ENODATA;
- pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
+ /* The use of PIDFD_STALE requires stashing struct pid on
+ * pidfs with pidfs_register_pid(), and only AF_UNIX sockets
+ * are prepared for this.
+ */
+ if (sk->sk_family == AF_UNIX)
+ flags = PIDFD_STALE;
+
+ pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
put_pid(peer_pid);
if (pidfd < 0)
return pidfd;
@@ -1930,7 +1997,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
break;
case SO_PASSSEC:
- v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
+ if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk))
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_scm_security;
break;
case SO_PEERSEC:
@@ -1945,6 +2015,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
v.val = sock_flag(sk, SOCK_RCVMARK);
break;
+ case SO_RCVPRIORITY:
+ v.val = sock_flag(sk, SOCK_RCVPRIORITY);
+ break;
+
case SO_RXQ_OVFL:
v.val = sock_flag(sk, SOCK_RXQ_OVFL);
break;
@@ -2028,7 +2102,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
v.val = READ_ONCE(sk->sk_napi_id);
/* aggregate non-NAPI IDs down to 0 */
- if (v.val < MIN_NAPI_ID)
+ if (!napi_id_valid(v.val))
v.val = 0;
break;
@@ -2074,6 +2148,9 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
break;
case SO_TXREHASH:
+ if (!sk_is_tcp(sk))
+ return -EOPNOTSUPP;
+
/* Paired with WRITE_ONCE() in sk_setsockopt() */
v.val = READ_ONCE(sk->sk_txrehash);
break;
@@ -2102,6 +2179,8 @@ lenout:
*/
static inline void sock_lock_init(struct sock *sk)
{
+ sk_owner_clear(sk);
+
if (sk->sk_kern_sock)
sock_lock_init_class_and_name(
sk,
@@ -2198,6 +2277,9 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
cgroup_sk_free(&sk->sk_cgrp_data);
mem_cgroup_sk_free(sk);
security_sk_free(sk);
+
+ sk_owner_put(sk);
+
if (slab != NULL)
kmem_cache_free(slab, sk);
else
@@ -2233,6 +2315,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
get_net_track(net, &sk->ns_tracker, priority);
sock_inuse_add(net, 1);
} else {
+ net_passive_inc(net);
__netns_tracker_alloc(net, &sk->ns_tracker,
false, priority);
}
@@ -2257,6 +2340,7 @@ EXPORT_SYMBOL(sk_alloc);
static void __sk_destruct(struct rcu_head *head)
{
struct sock *sk = container_of(head, struct sock, sk_rcu);
+ struct net *net = sock_net(sk);
struct sk_filter *filter;
if (sk->sk_destruct)
@@ -2288,14 +2372,28 @@ static void __sk_destruct(struct rcu_head *head)
put_cred(sk->sk_peer_cred);
put_pid(sk->sk_peer_pid);
- if (likely(sk->sk_net_refcnt))
- put_net_track(sock_net(sk), &sk->ns_tracker);
- else
- __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
-
+ if (likely(sk->sk_net_refcnt)) {
+ put_net_track(net, &sk->ns_tracker);
+ } else {
+ __netns_tracker_free(net, &sk->ns_tracker, false);
+ net_passive_dec(net);
+ }
sk_prot_free(sk->sk_prot_creator, sk);
}
+void sk_net_refcnt_upgrade(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+
+ WARN_ON_ONCE(sk->sk_net_refcnt);
+ __netns_tracker_free(net, &sk->ns_tracker, false);
+ net_passive_dec(net);
+ sk->sk_net_refcnt = 1;
+ get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
+ sock_inuse_add(net, 1);
+}
+EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);
+
void sk_destruct(struct sock *sk)
{
bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
@@ -2392,6 +2490,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
* is not properly dismantling its kernel sockets at netns
* destroy time.
*/
+ net_passive_inc(sock_net(newsk));
__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
false, priority);
}
@@ -2444,17 +2543,14 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
*/
if (!is_charged)
RCU_INIT_POINTER(newsk->sk_filter, NULL);
- sk_free_unlock_clone(newsk);
- newsk = NULL;
- goto out;
+
+ goto free;
}
+
RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
- if (bpf_sk_storage_clone(sk, newsk)) {
- sk_free_unlock_clone(newsk);
- newsk = NULL;
- goto out;
- }
+ if (bpf_sk_storage_clone(sk, newsk))
+ goto free;
/* Clear sk_user_data if parent had the pointer tagged
* as not suitable for copying when cloning.
@@ -2484,18 +2580,17 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
net_enable_timestamp();
out:
return newsk;
-}
-EXPORT_SYMBOL_GPL(sk_clone_lock);
-
-void sk_free_unlock_clone(struct sock *sk)
-{
+free:
/* It is still raw copy of parent, so invalidate
- * destructor and make plain sk_free() */
- sk->sk_destruct = NULL;
- bh_unlock_sock(sk);
- sk_free(sk);
+ * destructor and make plain sk_free()
+ */
+ newsk->sk_destruct = NULL;
+ bh_unlock_sock(newsk);
+ sk_free(newsk);
+ newsk = NULL;
+ goto out;
}
-EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
+EXPORT_SYMBOL_GPL(sk_clone_lock);
static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
{
@@ -2520,8 +2615,12 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
u32 max_segs = 1;
sk->sk_route_caps = dst->dev->features;
- if (sk_is_tcp(sk))
+ if (sk_is_tcp(sk)) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
sk->sk_route_caps |= NETIF_F_GSO;
+ icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK);
+ }
if (sk->sk_route_caps & NETIF_F_GSO)
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
if (unlikely(sk->sk_gso_disabled))
@@ -2791,6 +2890,22 @@ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
}
EXPORT_SYMBOL(sock_kmalloc);
+/*
+ * Duplicate the input "src" memory block using the socket's
+ * option memory buffer.
+ */
+void *sock_kmemdup(struct sock *sk, const void *src,
+ int size, gfp_t priority)
+{
+ void *mem;
+
+ mem = sock_kmalloc(sk, size, priority);
+ if (mem)
+ memcpy(mem, src, size);
+ return mem;
+}
+EXPORT_SYMBOL(sock_kmemdup);
+
/* Free an option memory block. Note, we actually want the inline
* here as this allows gcc to detect the nullify and fold away the
* condition entirely.
@@ -2945,6 +3060,18 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
case SCM_RIGHTS:
case SCM_CREDENTIALS:
break;
+ case SO_PRIORITY:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
+ return -EPERM;
+ sockc->priority = *(u32 *)CMSG_DATA(cmsg);
+ break;
+ case SCM_DEVMEM_DMABUF:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
+ break;
default:
return -EINVAL;
}
@@ -3157,16 +3284,16 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
struct proto *prot = sk->sk_prot;
- bool charged = false;
+ bool charged = true;
long allocated;
sk_memory_allocated_add(sk, amt);
allocated = sk_memory_allocated(sk);
if (memcg) {
- if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
+ charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge());
+ if (!charged)
goto suppress_allocation;
- charged = true;
}
/* Under limit. */
@@ -3251,7 +3378,7 @@ suppress_allocation:
sk_memory_allocated_sub(sk, amt);
- if (charged)
+ if (memcg && charged)
mem_cgroup_uncharge_skmem(memcg, amt);
return 0;
@@ -3526,14 +3653,14 @@ EXPORT_SYMBOL(sk_reset_timer);
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
- if (del_timer(timer))
+ if (timer_delete(timer))
__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
- if (del_timer_sync(timer))
+ if (timer_delete_sync(timer))
__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);
@@ -3861,7 +3988,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem)
mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
- mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
+ mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc);
mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
@@ -3927,7 +4054,7 @@ static int assign_proto_idx(struct proto *prot)
{
prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
- if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
+ if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
pr_err("PROTO_INUSE_NR exhausted\n");
return -ENOSPC;
}
@@ -3938,7 +4065,7 @@ static int assign_proto_idx(struct proto *prot)
static void release_proto_idx(struct proto *prot)
{
- if (prot->inuse_idx != PROTO_INUSE_NR - 1)
+ if (prot->inuse_idx != PROTO_INUSE_NR)
clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index a08eed9b9142..b23594c767f2 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -264,8 +264,6 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
switch (nlh->nlmsg_type) {
case TCPDIAG_GETSOCK:
- case DCCPDIAG_GETSOCK:
-
if (!rcu_access_pointer(inet_rcv_compat))
sock_load_diag_module(AF_INET, 0);
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index f1b9b3958792..82a14f131d00 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -303,7 +303,10 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk)
write_lock_bh(&sk->sk_callback_lock);
if (stream_parser && stream_verdict && !psock->saved_data_ready) {
- ret = sk_psock_init_strp(sk, psock);
+ if (sk_is_tcp(sk))
+ ret = sk_psock_init_strp(sk, psock);
+ else
+ ret = -EOPNOTSUPP;
if (ret) {
write_unlock_bh(&sk->sk_callback_lock);
sk_psock_put(sk, psock);
@@ -541,6 +544,9 @@ static bool sock_map_sk_state_allowed(const struct sock *sk)
return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN);
if (sk_is_stream_unix(sk))
return (1 << sk->sk_state) & TCPF_ESTABLISHED;
+ if (sk_is_vsock(sk) &&
+ (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET))
+ return (1 << sk->sk_state) & TCPF_ESTABLISHED;
return true;
}
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index cb8d32e5c14e..5dbb2c6f371d 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -34,6 +34,7 @@ static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
static int max_skb_frags = MAX_SKB_FRAGS;
static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE;
+static int netdev_budget_usecs_min = 2 * USEC_PER_SEC / HZ;
static int net_msg_warn; /* Unused, but still a sysctl */
@@ -200,7 +201,7 @@ static int rps_sock_flow_sysctl(const struct ctl_table *table, int write,
if (orig_sock_table) {
static_branch_dec(&rps_needed);
static_branch_dec(&rfs_needed);
- kvfree_rcu_mightsleep(orig_sock_table);
+ kvfree_rcu(orig_sock_table, rcu);
}
}
}
@@ -238,7 +239,7 @@ static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write,
lockdep_is_held(&flow_limit_update_mutex));
if (cur && !cpumask_test_cpu(i, mask)) {
RCU_INIT_POINTER(sd->flow_limit, NULL);
- kfree_rcu_mightsleep(cur);
+ kfree_rcu(cur, rcu);
} else if (!cur && cpumask_test_cpu(i, mask)) {
cur = kzalloc_node(len, GFP_KERNEL,
cpu_to_node(i));
@@ -247,7 +248,7 @@ static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write,
ret = -ENOMEM;
goto write_unlock;
}
- cur->num_buckets = netdev_flow_limit_table_len;
+ cur->log_buckets = ilog2(netdev_flow_limit_table_len);
rcu_assign_pointer(sd->flow_limit, cur);
}
}
@@ -319,7 +320,7 @@ static int proc_do_dev_weight(const struct ctl_table *table, int write,
int ret, weight;
mutex_lock(&dev_weight_mutex);
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write) {
weight = READ_ONCE(weight_p);
WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias);
@@ -412,6 +413,7 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_dev_weight,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "dev_weight_rx_bias",
@@ -419,6 +421,7 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_dev_weight,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "dev_weight_tx_bias",
@@ -426,6 +429,7 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_dev_weight,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "netdev_max_backlog",
@@ -584,7 +588,7 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ZERO,
+ .extra1 = &netdev_budget_usecs_min,
},
{
.procname = "fb_tunnels_only_for_init_net",
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 3717fb152ecc..a50a7ef49ae8 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -9,6 +9,7 @@
#include <linux/ptp_classify.h>
#include <linux/skbuff.h>
#include <linux/export.h>
+#include <linux/ptp_clock_kernel.h>
static unsigned int classify(const struct sk_buff *skb)
{
@@ -21,19 +22,39 @@ static unsigned int classify(const struct sk_buff *skb)
void skb_clone_tx_timestamp(struct sk_buff *skb)
{
+ struct hwtstamp_provider *hwprov;
struct mii_timestamper *mii_ts;
+ struct phy_device *phydev;
struct sk_buff *clone;
unsigned int type;
- if (!skb->sk || !skb->dev ||
- !phy_is_default_hwtstamp(skb->dev->phydev))
+ if (!skb->sk || !skb->dev)
return;
+ rcu_read_lock();
+ hwprov = rcu_dereference(skb->dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB ||
+ !hwprov->phydev) {
+ rcu_read_unlock();
+ return;
+ }
+
+ phydev = hwprov->phydev;
+ } else {
+ phydev = skb->dev->phydev;
+ if (!phy_is_default_hwtstamp(phydev)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+
type = classify(skb);
if (type == PTP_CLASS_NONE)
return;
- mii_ts = skb->dev->phydev->mii_ts;
+ mii_ts = phydev->mii_ts;
if (likely(mii_ts->txtstamp)) {
clone = skb_clone_sk(skb);
if (!clone)
@@ -45,12 +66,33 @@ EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
bool skb_defer_rx_timestamp(struct sk_buff *skb)
{
+ struct hwtstamp_provider *hwprov;
struct mii_timestamper *mii_ts;
+ struct phy_device *phydev;
unsigned int type;
- if (!skb->dev || !phy_is_default_hwtstamp(skb->dev->phydev))
+ if (!skb->dev)
return false;
+ rcu_read_lock();
+ hwprov = rcu_dereference(skb->dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB ||
+ !hwprov->phydev) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ phydev = hwprov->phydev;
+ } else {
+ phydev = skb->dev->phydev;
+ if (!phy_is_default_hwtstamp(phydev)) {
+ rcu_read_unlock();
+ return false;
+ }
+ }
+ rcu_read_unlock();
+
if (skb_headroom(skb) < ETH_HLEN)
return false;
@@ -63,7 +105,7 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)
if (type == PTP_CLASS_NONE)
return false;
- mii_ts = skb->dev->phydev->mii_ts;
+ mii_ts = phydev->mii_ts;
if (likely(mii_ts->rxtstamp))
return mii_ts->rxtstamp(mii_ts, skb, type);
diff --git a/net/core/utils.c b/net/core/utils.c
index 27f4cffaae05..5e63b0ea21f3 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -399,9 +399,9 @@ int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af,
}
EXPORT_SYMBOL(inet_pton_with_scope);
-bool inet_addr_is_any(struct sockaddr *addr)
+bool inet_addr_is_any(struct sockaddr_storage *addr)
{
- if (addr->sa_family == AF_INET6) {
+ if (addr->ss_family == AF_INET6) {
struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
const struct sockaddr_in6 in6_any =
{ .sin6_addr = IN6ADDR_ANY_INIT };
@@ -409,13 +409,13 @@ bool inet_addr_is_any(struct sockaddr *addr)
if (!memcmp(in6->sin6_addr.s6_addr,
in6_any.sin6_addr.s6_addr, 16))
return true;
- } else if (addr->sa_family == AF_INET) {
+ } else if (addr->ss_family == AF_INET) {
struct sockaddr_in *in = (struct sockaddr_in *)addr;
if (in->sin_addr.s_addr == htonl(INADDR_ANY))
return true;
} else {
- pr_warn("unexpected address family %u\n", addr->sa_family);
+ pr_warn("unexpected address family %u\n", addr->ss_family);
}
return false;
@@ -473,11 +473,11 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
EXPORT_SYMBOL(inet_proto_csum_replace16);
void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb,
- __wsum diff, bool pseudohdr)
+ __wsum diff, bool pseudohdr, bool ipv6)
{
if (skb->ip_summed != CHECKSUM_PARTIAL) {
csum_replace_by_diff(sum, diff);
- if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+ if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr && !ipv6)
skb->csum = ~csum_sub(diff, skb->csum);
} else if (pseudohdr) {
*sum = ~csum_fold(csum_add(diff, csum_unfold(*sum)));
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 2315feed94ef..491334b9b8be 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -17,6 +17,7 @@
#include <net/page_pool/helpers.h>
#include <net/hotdata.h>
+#include <net/netdev_lock.h>
#include <net/xdp.h>
#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
#include <trace/events/xdp.h>
@@ -357,6 +358,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
if (IS_ERR(xdp_alloc))
return PTR_ERR(xdp_alloc);
+ if (type == MEM_TYPE_XSK_BUFF_POOL && allocator)
+ xsk_pool_set_rxq_info(allocator, xdp_rxq);
+
if (trace_mem_connect_enabled() && xdp_alloc)
trace_mem_connect(xdp_alloc, xdp_rxq);
return 0;
@@ -364,33 +368,87 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
+/**
+ * xdp_reg_page_pool - register &page_pool as a memory provider for XDP
+ * @pool: &page_pool to register
+ *
+ * Can be used to register pools manually without connecting to any XDP RxQ
+ * info, so that the XDP layer will be aware of them. Then, they can be
+ * attached to an RxQ info manually via xdp_rxq_info_attach_page_pool().
+ *
+ * Return: %0 on success, -errno on error.
+ */
+int xdp_reg_page_pool(struct page_pool *pool)
+{
+ struct xdp_mem_info mem;
+
+ return xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pool);
+}
+EXPORT_SYMBOL_GPL(xdp_reg_page_pool);
+
+/**
+ * xdp_unreg_page_pool - unregister &page_pool from the memory providers list
+ * @pool: &page_pool to unregister
+ *
+ * A shorthand for manually unregistering page pools. If the pool was
+ * previously attached to an RxQ info, it must be detached first.
+ */
+void xdp_unreg_page_pool(const struct page_pool *pool)
+{
+ struct xdp_mem_info mem = {
+ .type = MEM_TYPE_PAGE_POOL,
+ .id = pool->xdp_mem_id,
+ };
+
+ xdp_unreg_mem_model(&mem);
+}
+EXPORT_SYMBOL_GPL(xdp_unreg_page_pool);
+
+/**
+ * xdp_rxq_info_attach_page_pool - attach registered pool to RxQ info
+ * @xdp_rxq: XDP RxQ info to attach the pool to
+ * @pool: pool to attach
+ *
+ * If the pool was registered manually, this function must be called instead
+ * of xdp_rxq_info_reg_mem_model() to connect it to the RxQ info.
+ */
+void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq,
+ const struct page_pool *pool)
+{
+ struct xdp_mem_info mem = {
+ .type = MEM_TYPE_PAGE_POOL,
+ .id = pool->xdp_mem_id,
+ };
+
+ xdp_rxq_info_attach_mem_model(xdp_rxq, &mem);
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool);
+
/* XDP RX runs under NAPI protection, and in different delivery error
* scenarios (e.g. queue full), it is possible to return the xdp_frame
* while still leveraging this protection. The @napi_direct boolean
* is used for those calls sites. Thus, allowing for faster recycling
* of xdp_frames/pages in those cases.
*/
-void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
- struct xdp_buff *xdp)
+void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type,
+ bool napi_direct, struct xdp_buff *xdp)
{
- struct page *page;
-
- switch (mem->type) {
+ switch (mem_type) {
case MEM_TYPE_PAGE_POOL:
- page = virt_to_head_page(data);
+ netmem = netmem_compound_head(netmem);
if (napi_direct && xdp_return_frame_no_direct())
napi_direct = false;
- /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE)
- * as mem->type knows this a page_pool page
+ /* No need to check netmem_is_pp() as mem_type already tells us
+ * this is a page_pool page
*/
- page_pool_put_full_page(page->pp, page, napi_direct);
+ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem,
+ napi_direct);
break;
case MEM_TYPE_PAGE_SHARED:
- page_frag_free(data);
+ page_frag_free(__netmem_address(netmem));
break;
case MEM_TYPE_PAGE_ORDER0:
- page = virt_to_page(data); /* Assumes order0 page*/
- put_page(page);
+ put_page(__netmem_to_page(netmem));
break;
case MEM_TYPE_XSK_BUFF_POOL:
/* NB! Only valid from an xdp_buff! */
@@ -398,7 +456,7 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
break;
default:
/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
- WARN(1, "Incorrect XDP memory type (%d) usage", mem->type);
+ WARN(1, "Incorrect XDP memory type (%d) usage", mem_type);
break;
}
}
@@ -406,38 +464,34 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
void xdp_return_frame(struct xdp_frame *xdpf)
{
struct skb_shared_info *sinfo;
- int i;
if (likely(!xdp_frame_has_frags(xdpf)))
goto out;
sinfo = xdp_get_shared_info_from_frame(xdpf);
- for (i = 0; i < sinfo->nr_frags; i++) {
- struct page *page = skb_frag_page(&sinfo->frags[i]);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
+ false, NULL);
- __xdp_return(page_address(page), &xdpf->mem, false, NULL);
- }
out:
- __xdp_return(xdpf->data, &xdpf->mem, false, NULL);
+ __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
{
struct skb_shared_info *sinfo;
- int i;
if (likely(!xdp_frame_has_frags(xdpf)))
goto out;
sinfo = xdp_get_shared_info_from_frame(xdpf);
- for (i = 0; i < sinfo->nr_frags; i++) {
- struct page *page = skb_frag_page(&sinfo->frags[i]);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
+ true, NULL);
- __xdp_return(page_address(page), &xdpf->mem, true, NULL);
- }
out:
- __xdp_return(xdpf->data, &xdpf->mem, true, NULL);
+ __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
@@ -451,46 +505,19 @@ EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
* xdp_frame_bulk is usually stored/allocated on the function
* call-stack to avoid locking penalties.
*/
-void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq)
-{
- struct xdp_mem_allocator *xa = bq->xa;
-
- if (unlikely(!xa || !bq->count))
- return;
-
- page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count);
- /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */
- bq->count = 0;
-}
-EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk);
/* Must be called with rcu_read_lock held */
void xdp_return_frame_bulk(struct xdp_frame *xdpf,
struct xdp_frame_bulk *bq)
{
- struct xdp_mem_info *mem = &xdpf->mem;
- struct xdp_mem_allocator *xa;
-
- if (mem->type != MEM_TYPE_PAGE_POOL) {
+ if (xdpf->mem_type != MEM_TYPE_PAGE_POOL) {
xdp_return_frame(xdpf);
return;
}
- xa = bq->xa;
- if (unlikely(!xa)) {
- xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
- bq->count = 0;
- bq->xa = xa;
- }
-
if (bq->count == XDP_BULK_QUEUE_SIZE)
xdp_flush_frame_bulk(bq);
- if (unlikely(mem->id != xa->mem.id)) {
- xdp_flush_frame_bulk(bq);
- bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
- }
-
if (unlikely(xdp_frame_has_frags(xdpf))) {
struct skb_shared_info *sinfo;
int i;
@@ -499,31 +526,40 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf,
for (i = 0; i < sinfo->nr_frags; i++) {
skb_frag_t *frag = &sinfo->frags[i];
- bq->q[bq->count++] = skb_frag_address(frag);
+ bq->q[bq->count++] = skb_frag_netmem(frag);
if (bq->count == XDP_BULK_QUEUE_SIZE)
xdp_flush_frame_bulk(bq);
}
}
- bq->q[bq->count++] = xdpf->data;
+ bq->q[bq->count++] = virt_to_netmem(xdpf->data);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
+/**
+ * xdp_return_frag - free one XDP frag or decrement its refcount
+ * @netmem: network memory reference to release
+ * @xdp: &xdp_buff to release the frag for
+ */
+void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp)
+{
+ __xdp_return(netmem, xdp->rxq->mem.type, true, NULL);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frag);
+
void xdp_return_buff(struct xdp_buff *xdp)
{
struct skb_shared_info *sinfo;
- int i;
if (likely(!xdp_buff_has_frags(xdp)))
goto out;
sinfo = xdp_get_shared_info_from_buff(xdp);
- for (i = 0; i < sinfo->nr_frags; i++) {
- struct page *page = skb_frag_page(&sinfo->frags[i]);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]),
+ xdp->rxq->mem.type, true, xdp);
- __xdp_return(page_address(page), &xdp->rxq->mem, true, xdp);
- }
out:
- __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp);
+ __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp);
}
EXPORT_SYMBOL_GPL(xdp_return_buff);
@@ -569,7 +605,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
xdpf->headroom = 0;
xdpf->metasize = metasize;
xdpf->frame_sz = PAGE_SIZE;
- xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
+ xdpf->mem_type = MEM_TYPE_PAGE_ORDER0;
xsk_buff_free(xdp);
return xdpf;
@@ -583,15 +619,177 @@ void xdp_warn(const char *msg, const char *func, const int line)
};
EXPORT_SYMBOL_GPL(xdp_warn);
-int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp)
+/**
+ * xdp_build_skb_from_buff - create an skb from &xdp_buff
+ * @xdp: &xdp_buff to convert to an skb
+ *
+ * Perform common operations to create a new skb to pass up the stack from
+ * &xdp_buff: allocate an skb head from the NAPI percpu cache, initialize
+ * skb data pointers and offsets, set the recycle bit if the buff is
+ * PP-backed, Rx queue index, protocol and update frags info.
+ *
+ * Return: new &sk_buff on success, %NULL on error.
+ */
+struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp)
{
- n_skb = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, n_skb, skbs);
- if (unlikely(!n_skb))
- return -ENOMEM;
+ const struct xdp_rxq_info *rxq = xdp->rxq;
+ const struct skb_shared_info *sinfo;
+ struct sk_buff *skb;
+ u32 nr_frags = 0;
+ int metalen;
- return 0;
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ nr_frags = sinfo->nr_frags;
+ }
+
+ skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz);
+ if (unlikely(!skb))
+ return NULL;
+
+ skb_reserve(skb, xdp->data - xdp->data_hard_start);
+ __skb_put(skb, xdp->data_end - xdp->data);
+
+ metalen = xdp->data - xdp->data_meta;
+ if (metalen > 0)
+ skb_metadata_set(skb, metalen);
+
+ if (rxq->mem.type == MEM_TYPE_PAGE_POOL)
+ skb_mark_for_recycle(skb);
+
+ skb_record_rx_queue(skb, rxq->queue_index);
+
+ if (unlikely(nr_frags)) {
+ u32 tsize;
+
+ tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz;
+ xdp_update_skb_shared_info(skb, nr_frags,
+ sinfo->xdp_frags_size, tsize,
+ xdp_buff_is_frag_pfmemalloc(xdp));
+ }
+
+ skb->protocol = eth_type_trans(skb, rxq->dev);
+
+ return skb;
+}
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_buff);
+
+/**
+ * xdp_copy_frags_from_zc - copy frags from XSk buff to skb
+ * @skb: skb to copy frags to
+ * @xdp: XSk &xdp_buff from which the frags will be copied
+ * @pp: &page_pool backing page allocation, if available
+ *
+ * Copy all frags from XSk &xdp_buff to the skb to pass it up the stack.
+ * Allocate a new buffer for each frag, copy it and attach to the skb.
+ *
+ * Return: true on success, false on page allocation failure.
+ */
+static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb,
+ const struct xdp_buff *xdp,
+ struct page_pool *pp)
+{
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+ const struct skb_shared_info *xinfo;
+ u32 nr_frags, tsize = 0;
+ bool pfmemalloc = false;
+
+ xinfo = xdp_get_shared_info_from_buff(xdp);
+ nr_frags = xinfo->nr_frags;
+
+ for (u32 i = 0; i < nr_frags; i++) {
+ const skb_frag_t *frag = &xinfo->frags[i];
+ u32 len = skb_frag_size(frag);
+ u32 offset, truesize = len;
+ struct page *page;
+
+ page = page_pool_dev_alloc(pp, &offset, &truesize);
+ if (unlikely(!page)) {
+ sinfo->nr_frags = i;
+ return false;
+ }
+
+ memcpy(page_address(page) + offset, skb_frag_address(frag),
+ LARGEST_ALIGN(len));
+ __skb_fill_page_desc_noacc(sinfo, i, page, offset, len);
+
+ tsize += truesize;
+ pfmemalloc |= page_is_pfmemalloc(page);
+ }
+
+ xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size,
+ tsize, pfmemalloc);
+
+ return true;
+}
+
+/**
+ * xdp_build_skb_from_zc - create an skb from XSk &xdp_buff
+ * @xdp: source XSk buff
+ *
+ * Similar to xdp_build_skb_from_buff(), but for XSk frames. Allocate an skb
+ * head, new buffer for the head, copy the data and initialize the skb fields.
+ * If there are frags, allocate new buffers for them and copy.
+ * Buffers are allocated from the system percpu pools to try recycling them.
+ * If the new skb is built successfully, @xdp is returned to the XSk pool's
+ * freelist. On error, @xdp is left untouched and the caller still owns it.
+ *
+ * Return: new &sk_buff on success, %NULL on error.
+ */
+struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp)
+{
+ const struct xdp_rxq_info *rxq = xdp->rxq;
+ u32 len = xdp->data_end - xdp->data_meta;
+ u32 truesize = xdp->frame_sz;
+ struct sk_buff *skb = NULL;
+ struct page_pool *pp;
+ int metalen;
+ void *data;
+
+ if (!IS_ENABLED(CONFIG_PAGE_POOL))
+ return NULL;
+
+ local_lock_nested_bh(&system_page_pool.bh_lock);
+ pp = this_cpu_read(system_page_pool.pool);
+ data = page_pool_dev_alloc_va(pp, &truesize);
+ if (unlikely(!data))
+ goto out;
+
+ skb = napi_build_skb(data, truesize);
+ if (unlikely(!skb)) {
+ page_pool_free_va(pp, data, true);
+ goto out;
+ }
+
+ skb_mark_for_recycle(skb);
+ skb_reserve(skb, xdp->data_meta - xdp->data_hard_start);
+
+ memcpy(__skb_put(skb, len), xdp->data_meta, LARGEST_ALIGN(len));
+
+ metalen = xdp->data - xdp->data_meta;
+ if (metalen > 0) {
+ skb_metadata_set(skb, metalen);
+ __skb_pull(skb, metalen);
+ }
+
+ skb_record_rx_queue(skb, rxq->queue_index);
+
+ if (unlikely(xdp_buff_has_frags(xdp)) &&
+ unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) {
+ napi_consume_skb(skb, true);
+ skb = NULL;
+ goto out;
+ }
+
+ xsk_buff_free(xdp);
+
+ skb->protocol = eth_type_trans(skb, rxq->dev);
+
+out:
+ local_unlock_nested_bh(&system_page_pool.bh_lock);
+ return skb;
}
-EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk);
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc);
struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct sk_buff *skb,
@@ -639,7 +837,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
* - RX ring dev queue index (skb_record_rx_queue)
*/
- if (xdpf->mem.type == MEM_TYPE_PAGE_POOL)
+ if (xdpf->mem_type == MEM_TYPE_PAGE_POOL)
skb_mark_for_recycle(skb);
/* Allow SKB to reuse area used by xdp_frame */
@@ -686,8 +884,7 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
nxdpf = addr;
nxdpf->data = addr + headroom;
nxdpf->frame_sz = PAGE_SIZE;
- nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
- nxdpf->mem.id = 0;
+ nxdpf->mem_type = MEM_TYPE_PAGE_ORDER0;
return nxdpf;
}
@@ -800,34 +997,60 @@ static int __init xdp_metadata_init(void)
}
late_initcall(xdp_metadata_init);
-void xdp_set_features_flag(struct net_device *dev, xdp_features_t val)
+void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val)
{
val &= NETDEV_XDP_ACT_MASK;
if (dev->xdp_features == val)
return;
+ netdev_assert_locked_or_invisible(dev);
dev->xdp_features = val;
if (dev->reg_state == NETREG_REGISTERED)
call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev);
}
+EXPORT_SYMBOL_GPL(xdp_set_features_flag_locked);
+
+void xdp_set_features_flag(struct net_device *dev, xdp_features_t val)
+{
+ netdev_lock(dev);
+ xdp_set_features_flag_locked(dev, val);
+ netdev_unlock(dev);
+}
EXPORT_SYMBOL_GPL(xdp_set_features_flag);
-void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg)
+void xdp_features_set_redirect_target_locked(struct net_device *dev,
+ bool support_sg)
{
xdp_features_t val = (dev->xdp_features | NETDEV_XDP_ACT_NDO_XMIT);
if (support_sg)
val |= NETDEV_XDP_ACT_NDO_XMIT_SG;
- xdp_set_features_flag(dev, val);
+ xdp_set_features_flag_locked(dev, val);
+}
+EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target_locked);
+
+void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg)
+{
+ netdev_lock(dev);
+ xdp_features_set_redirect_target_locked(dev, support_sg);
+ netdev_unlock(dev);
}
EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target);
-void xdp_features_clear_redirect_target(struct net_device *dev)
+void xdp_features_clear_redirect_target_locked(struct net_device *dev)
{
xdp_features_t val = dev->xdp_features;
val &= ~(NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG);
- xdp_set_features_flag(dev, val);
+ xdp_set_features_flag_locked(dev, val);
+}
+EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target_locked);
+
+void xdp_features_clear_redirect_target(struct net_device *dev)
+{
+ netdev_lock(dev);
+ xdp_features_clear_redirect_target_locked(dev);
+ netdev_unlock(dev);
}
EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target);