summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/bluetooth/hci_conn.c23
-rw-r--r--net/bluetooth/l2cap_core.c24
-rw-r--r--net/bluetooth/l2cap_sock.c8
-rw-r--r--net/bluetooth/mgmt.c12
-rw-r--r--net/bpf/test_run.c72
-rw-r--r--net/bridge/br_mdb.c66
-rw-r--r--net/bridge/br_multicast.c179
-rw-r--r--net/bridge/br_netlink.c19
-rw-r--r--net/bridge/br_netlink_tunnel.c3
-rw-r--r--net/bridge/br_private.h12
-rw-r--r--net/bridge/br_vlan.c11
-rw-r--r--net/bridge/br_vlan_options.c27
-rw-r--r--net/can/gw.c7
-rw-r--r--net/can/isotp.c3
-rw-r--r--net/can/j1939/address-claim.c40
-rw-r--r--net/can/raw.c11
-rw-r--r--net/core/Makefile3
-rw-r--r--net/core/dev.c1
-rw-r--r--net/core/dev.h2
-rw-r--r--net/core/filter.c17
-rw-r--r--net/core/neighbour.c18
-rw-r--r--net/core/net-sysfs.c79
-rw-r--r--net/core/net-traces.c1
-rw-r--r--net/core/netdev-genl-gen.c48
-rw-r--r--net/core/netdev-genl-gen.h23
-rw-r--r--net/core/netdev-genl.c179
-rw-r--r--net/core/page_pool.c6
-rw-r--r--net/core/rtnetlink.c2
-rw-r--r--net/core/skbuff.c156
-rw-r--r--net/core/sock.c18
-rw-r--r--net/core/sysctl_net_core.c81
-rw-r--r--net/core/xdp.c28
-rw-r--r--net/devlink/Makefile2
-rw-r--r--net/devlink/core.c5
-rw-r--r--net/devlink/dev.c1343
-rw-r--r--net/devlink/devl_internal.h30
-rw-r--r--net/devlink/leftover.c1376
-rw-r--r--net/dsa/slave.c11
-rw-r--r--net/ethtool/mm.c2
-rw-r--r--net/ipv4/af_inet.c1
-rw-r--r--net/ipv4/inet_connection_sock.c3
-rw-r--r--net/ipv4/raw.c21
-rw-r--r--net/ipv4/tcp_bbr.c16
-rw-r--r--net/ipv4/tcp_cong.c10
-rw-r--r--net/ipv4/tcp_cubic.c12
-rw-r--r--net/ipv4/tcp_dctcp.c12
-rw-r--r--net/ipv6/af_inet6.c1
-rw-r--r--net/ipv6/raw.c16
-rw-r--r--net/mptcp/pm_netlink.c10
-rw-r--r--net/mptcp/protocol.c9
-rw-r--r--net/mptcp/sockopt.c11
-rw-r--r--net/mptcp/subflow.c12
-rw-r--r--net/netfilter/Kconfig3
-rw-r--r--net/netfilter/Makefile1
-rw-r--r--net/netfilter/nf_conntrack_bpf.c20
-rw-r--r--net/netfilter/nf_conntrack_helper.c98
-rw-r--r--net/netfilter/nf_conntrack_ovs.c178
-rw-r--r--net/netfilter/nf_nat_bpf.c6
-rw-r--r--net/netlink/genetlink.c4
-rw-r--r--net/openvswitch/Kconfig1
-rw-r--r--net/openvswitch/conntrack.c85
-rw-r--r--net/openvswitch/flow.c9
-rw-r--r--net/openvswitch/flow.h2
-rw-r--r--net/openvswitch/flow_table.c8
-rw-r--r--net/rds/message.c6
-rw-r--r--net/rxrpc/call_object.c6
-rw-r--r--net/rxrpc/conn_event.c2
-rw-r--r--net/rxrpc/output.c10
-rw-r--r--net/rxrpc/recvmsg.c2
-rw-r--r--net/rxrpc/skbuff.c4
-rw-r--r--net/sched/Kconfig8
-rw-r--r--net/sched/Makefile1
-rw-r--r--net/sched/act_ct.c76
-rw-r--r--net/sched/sch_api.c29
-rw-r--r--net/sched/sch_htb.c2
-rw-r--r--net/sched/sch_mqprio.c291
-rw-r--r--net/sched/sch_mqprio_lib.c117
-rw-r--r--net/sched/sch_mqprio_lib.h18
-rw-r--r--net/sched/sch_taprio.c709
-rw-r--r--net/smc/af_smc.c25
-rw-r--r--net/smc/smc_core.c75
-rw-r--r--net/smc/smc_core.h6
-rw-r--r--net/smc/smc_llc.c34
-rw-r--r--net/tipc/netlink_compat.c16
-rw-r--r--net/xdp/xsk_buff_pool.c7
-rw-r--r--net/xfrm/xfrm_compat.c4
-rw-r--r--net/xfrm/xfrm_input.c3
-rw-r--r--net/xfrm/xfrm_interface_bpf.c7
-rw-r--r--net/xfrm/xfrm_interface_core.c54
-rw-r--r--net/xfrm/xfrm_policy.c14
-rw-r--r--net/xfrm/xfrm_state.c18
91 files changed, 3629 insertions, 2412 deletions
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index acf563fbdfd9..17b946f9ba31 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1061,8 +1061,15 @@ int hci_conn_del(struct hci_conn *conn)
if (conn->type == ACL_LINK) {
struct hci_conn *sco = conn->link;
- if (sco)
+ if (sco) {
sco->link = NULL;
+ /* Due to race, SCO connection might be not established
+ * yet at this point. Delete it now, otherwise it is
+ * possible for it to be stuck and can't be deleted.
+ */
+ if (sco->handle == HCI_CONN_HANDLE_UNSET)
+ hci_conn_del(sco);
+ }
/* Unacked frames */
hdev->acl_cnt += conn->sent;
@@ -1243,6 +1250,8 @@ static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err)
if (conn != hci_lookup_le_connect(hdev))
goto done;
+ /* Flush to make sure we send create conn cancel command if needed */
+ flush_delayed_work(&conn->le_conn_timeout);
hci_conn_failed(conn, bt_status(err));
done:
@@ -1981,16 +1990,14 @@ static void hci_iso_qos_setup(struct hci_dev *hdev, struct hci_conn *conn,
qos->latency = conn->le_conn_latency;
}
-static struct hci_conn *hci_bind_bis(struct hci_conn *conn,
- struct bt_iso_qos *qos)
+static void hci_bind_bis(struct hci_conn *conn,
+ struct bt_iso_qos *qos)
{
/* Update LINK PHYs according to QoS preference */
conn->le_tx_phy = qos->out.phy;
conn->le_tx_phy = qos->out.phy;
conn->iso_qos = *qos;
conn->state = BT_BOUND;
-
- return conn;
}
static int create_big_sync(struct hci_dev *hdev, void *data)
@@ -2119,11 +2126,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
if (IS_ERR(conn))
return conn;
- conn = hci_bind_bis(conn, qos);
- if (!conn) {
- hci_conn_drop(conn);
- return ERR_PTR(-ENOMEM);
- }
+ hci_bind_bis(conn, qos);
/* Add Basic Announcement into Peridic Adv Data if BASE is set */
if (base_len && base) {
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index a3e0dc6a6e73..adfc3ea06d08 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -2683,14 +2683,6 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len)
if (IS_ERR(skb))
return PTR_ERR(skb);
- /* Channel lock is released before requesting new skb and then
- * reacquired thus we need to recheck channel state.
- */
- if (chan->state != BT_CONNECTED) {
- kfree_skb(skb);
- return -ENOTCONN;
- }
-
l2cap_do_send(chan, skb);
return len;
}
@@ -2735,14 +2727,6 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len)
if (IS_ERR(skb))
return PTR_ERR(skb);
- /* Channel lock is released before requesting new skb and then
- * reacquired thus we need to recheck channel state.
- */
- if (chan->state != BT_CONNECTED) {
- kfree_skb(skb);
- return -ENOTCONN;
- }
-
l2cap_do_send(chan, skb);
err = len;
break;
@@ -2763,14 +2747,6 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len)
*/
err = l2cap_segment_sdu(chan, &seg_queue, msg, len);
- /* The channel could have been closed while segmenting,
- * check that it is still connected.
- */
- if (chan->state != BT_CONNECTED) {
- __skb_queue_purge(&seg_queue);
- err = -ENOTCONN;
- }
-
if (err)
break;
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index ca8f07f3542b..eebe256104bc 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1624,6 +1624,14 @@ static struct sk_buff *l2cap_sock_alloc_skb_cb(struct l2cap_chan *chan,
if (!skb)
return ERR_PTR(err);
+ /* Channel lock is released before requesting new skb and then
+ * reacquired thus we need to recheck channel state.
+ */
+ if (chan->state != BT_CONNECTED) {
+ kfree_skb(skb);
+ return ERR_PTR(-ENOTCONN);
+ }
+
skb->priority = sk->sk_priority;
bt_cb(skb)->l2cap.chan = chan;
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index d2ea8e19aa1b..7add66f30e4d 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -859,6 +859,12 @@ static u32 get_supported_settings(struct hci_dev *hdev)
hdev->set_bdaddr)
settings |= MGMT_SETTING_CONFIGURATION;
+ if (cis_central_capable(hdev))
+ settings |= MGMT_SETTING_CIS_CENTRAL;
+
+ if (cis_peripheral_capable(hdev))
+ settings |= MGMT_SETTING_CIS_PERIPHERAL;
+
settings |= MGMT_SETTING_PHY_CONFIGURATION;
return settings;
@@ -932,6 +938,12 @@ static u32 get_current_settings(struct hci_dev *hdev)
if (hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED))
settings |= MGMT_SETTING_WIDEBAND_SPEECH;
+ if (cis_central_capable(hdev))
+ settings |= MGMT_SETTING_CIS_CENTRAL;
+
+ if (cis_peripheral_capable(hdev))
+ settings |= MGMT_SETTING_CIS_PERIPHERAL;
+
return settings;
}
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 8da0d73b368e..b766a84c8536 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -234,7 +234,7 @@ static int xdp_recv_frames(struct xdp_frame **frames, int nframes,
int i, n;
LIST_HEAD(list);
- n = kmem_cache_alloc_bulk(skbuff_head_cache, gfp, nframes, (void **)skbs);
+ n = kmem_cache_alloc_bulk(skbuff_cache, gfp, nframes, (void **)skbs);
if (unlikely(n == 0)) {
for (i = 0; i < nframes; i++)
xdp_return_frame(frames[i]);
@@ -484,7 +484,7 @@ out:
__diag_push();
__diag_ignore_all("-Wmissing-prototypes",
"Global functions as their definitions will be in vmlinux BTF");
-int noinline bpf_fentry_test1(int a)
+__bpf_kfunc int bpf_fentry_test1(int a)
{
return a + 1;
}
@@ -529,27 +529,35 @@ int noinline bpf_fentry_test8(struct bpf_fentry_test_t *arg)
return (long)arg->a;
}
-int noinline bpf_modify_return_test(int a, int *b)
+__bpf_kfunc int bpf_modify_return_test(int a, int *b)
{
*b += 1;
return a + *b;
}
-u64 noinline bpf_kfunc_call_test1(struct sock *sk, u32 a, u64 b, u32 c, u64 d)
+__bpf_kfunc u64 bpf_kfunc_call_test1(struct sock *sk, u32 a, u64 b, u32 c, u64 d)
{
return a + b + c + d;
}
-int noinline bpf_kfunc_call_test2(struct sock *sk, u32 a, u32 b)
+__bpf_kfunc int bpf_kfunc_call_test2(struct sock *sk, u32 a, u32 b)
{
return a + b;
}
-struct sock * noinline bpf_kfunc_call_test3(struct sock *sk)
+__bpf_kfunc struct sock *bpf_kfunc_call_test3(struct sock *sk)
{
return sk;
}
+long noinline bpf_kfunc_call_test4(signed char a, short b, int c, long d)
+{
+ /* Provoke the compiler to assume that the caller has sign-extended a,
+ * b and c on platforms where this is required (e.g. s390x).
+ */
+ return (long)a + (long)b + (long)c + d;
+}
+
struct prog_test_member1 {
int a;
};
@@ -574,21 +582,21 @@ static struct prog_test_ref_kfunc prog_test_struct = {
.cnt = REFCOUNT_INIT(1),
};
-noinline struct prog_test_ref_kfunc *
+__bpf_kfunc struct prog_test_ref_kfunc *
bpf_kfunc_call_test_acquire(unsigned long *scalar_ptr)
{
refcount_inc(&prog_test_struct.cnt);
return &prog_test_struct;
}
-noinline struct prog_test_member *
+__bpf_kfunc struct prog_test_member *
bpf_kfunc_call_memb_acquire(void)
{
WARN_ON_ONCE(1);
return NULL;
}
-noinline void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p)
+__bpf_kfunc void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p)
{
if (!p)
return;
@@ -596,11 +604,11 @@ noinline void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p)
refcount_dec(&p->cnt);
}
-noinline void bpf_kfunc_call_memb_release(struct prog_test_member *p)
+__bpf_kfunc void bpf_kfunc_call_memb_release(struct prog_test_member *p)
{
}
-noinline void bpf_kfunc_call_memb1_release(struct prog_test_member1 *p)
+__bpf_kfunc void bpf_kfunc_call_memb1_release(struct prog_test_member1 *p)
{
WARN_ON_ONCE(1);
}
@@ -613,12 +621,14 @@ static int *__bpf_kfunc_call_test_get_mem(struct prog_test_ref_kfunc *p, const i
return (int *)p;
}
-noinline int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p, const int rdwr_buf_size)
+__bpf_kfunc int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p,
+ const int rdwr_buf_size)
{
return __bpf_kfunc_call_test_get_mem(p, rdwr_buf_size);
}
-noinline int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size)
+__bpf_kfunc int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p,
+ const int rdonly_buf_size)
{
return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size);
}
@@ -628,16 +638,17 @@ noinline int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p,
* Acquire functions must return struct pointers, so these ones are
* failing.
*/
-noinline int *bpf_kfunc_call_test_acq_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size)
+__bpf_kfunc int *bpf_kfunc_call_test_acq_rdonly_mem(struct prog_test_ref_kfunc *p,
+ const int rdonly_buf_size)
{
return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size);
}
-noinline void bpf_kfunc_call_int_mem_release(int *p)
+__bpf_kfunc void bpf_kfunc_call_int_mem_release(int *p)
{
}
-noinline struct prog_test_ref_kfunc *
+__bpf_kfunc struct prog_test_ref_kfunc *
bpf_kfunc_call_test_kptr_get(struct prog_test_ref_kfunc **pp, int a, int b)
{
struct prog_test_ref_kfunc *p = READ_ONCE(*pp);
@@ -686,48 +697,53 @@ struct prog_test_fail3 {
char arr2[];
};
-noinline void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb)
+__bpf_kfunc void bpf_kfunc_call_test_pass_ctx(struct __sk_buff *skb)
+{
+}
+
+__bpf_kfunc void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p)
{
}
-noinline void bpf_kfunc_call_test_pass1(struct prog_test_pass1 *p)
+__bpf_kfunc void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p)
{
}
-noinline void bpf_kfunc_call_test_pass2(struct prog_test_pass2 *p)
+__bpf_kfunc void bpf_kfunc_call_test_fail1(struct prog_test_fail1 *p)
{
}
-noinline void bpf_kfunc_call_test_fail1(struct prog_test_fail1 *p)
+__bpf_kfunc void bpf_kfunc_call_test_fail2(struct prog_test_fail2 *p)
{
}
-noinline void bpf_kfunc_call_test_fail2(struct prog_test_fail2 *p)
+__bpf_kfunc void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p)
{
}
-noinline void bpf_kfunc_call_test_fail3(struct prog_test_fail3 *p)
+__bpf_kfunc void bpf_kfunc_call_test_mem_len_pass1(void *mem, int mem__sz)
{
}
-noinline void bpf_kfunc_call_test_mem_len_pass1(void *mem, int mem__sz)
+__bpf_kfunc void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len)
{
}
-noinline void bpf_kfunc_call_test_mem_len_fail1(void *mem, int len)
+__bpf_kfunc void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
{
}
-noinline void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
+__bpf_kfunc void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p)
{
}
-noinline void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p)
+__bpf_kfunc void bpf_kfunc_call_test_destructive(void)
{
}
-noinline void bpf_kfunc_call_test_destructive(void)
+__bpf_kfunc static u32 bpf_kfunc_call_test_static_unused_arg(u32 arg, u32 unused)
{
+ return arg;
}
__diag_pop();
@@ -746,6 +762,7 @@ BTF_SET8_START(test_sk_check_kfunc_ids)
BTF_ID_FLAGS(func, bpf_kfunc_call_test1)
BTF_ID_FLAGS(func, bpf_kfunc_call_test2)
BTF_ID_FLAGS(func, bpf_kfunc_call_test3)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test4)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_acquire, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_kfunc_call_memb_acquire, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE)
@@ -767,6 +784,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg)
BTF_SET8_END(test_sk_check_kfunc_ids)
static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size,
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 00e5743647b0..25c48d81a597 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -259,7 +259,7 @@ static int __mdb_fill_info(struct sk_buff *skb,
#endif
} else {
ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr);
- e.state = MDB_PG_FLAGS_PERMANENT;
+ e.state = MDB_PERMANENT;
}
e.addr.proto = mp->addr.proto;
nest_ent = nla_nest_start_noflag(skb,
@@ -421,8 +421,6 @@ static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock();
- cb->seq = net->dev_base_seq;
-
for_each_netdev_rcu(net, dev) {
if (netif_is_bridge_master(dev)) {
struct net_bridge *br = netdev_priv(dev);
@@ -685,51 +683,58 @@ static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = {
[MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC),
};
-static bool is_valid_mdb_entry(struct br_mdb_entry *entry,
- struct netlink_ext_ack *extack)
+static int validate_mdb_entry(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
{
+ struct br_mdb_entry *entry = nla_data(attr);
+
+ if (nla_len(attr) != sizeof(struct br_mdb_entry)) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid MDBA_SET_ENTRY attribute length");
+ return -EINVAL;
+ }
+
if (entry->ifindex == 0) {
NL_SET_ERR_MSG_MOD(extack, "Zero entry ifindex is not allowed");
- return false;
+ return -EINVAL;
}
if (entry->addr.proto == htons(ETH_P_IP)) {
if (!ipv4_is_multicast(entry->addr.u.ip4)) {
NL_SET_ERR_MSG_MOD(extack, "IPv4 entry group address is not multicast");
- return false;
+ return -EINVAL;
}
if (ipv4_is_local_multicast(entry->addr.u.ip4)) {
NL_SET_ERR_MSG_MOD(extack, "IPv4 entry group address is local multicast");
- return false;
+ return -EINVAL;
}
#if IS_ENABLED(CONFIG_IPV6)
} else if (entry->addr.proto == htons(ETH_P_IPV6)) {
if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6)) {
NL_SET_ERR_MSG_MOD(extack, "IPv6 entry group address is link-local all nodes");
- return false;
+ return -EINVAL;
}
#endif
} else if (entry->addr.proto == 0) {
/* L2 mdb */
if (!is_multicast_ether_addr(entry->addr.u.mac_addr)) {
NL_SET_ERR_MSG_MOD(extack, "L2 entry group is not multicast");
- return false;
+ return -EINVAL;
}
} else {
NL_SET_ERR_MSG_MOD(extack, "Unknown entry protocol");
- return false;
+ return -EINVAL;
}
if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) {
NL_SET_ERR_MSG_MOD(extack, "Unknown entry state");
- return false;
+ return -EINVAL;
}
if (entry->vid >= VLAN_VID_MASK) {
NL_SET_ERR_MSG_MOD(extack, "Invalid entry VLAN id");
- return false;
+ return -EINVAL;
}
- return true;
+ return 0;
}
static bool is_valid_mdb_source(struct nlattr *attr, __be16 proto,
@@ -849,11 +854,10 @@ static int br_mdb_add_group_sg(const struct br_mdb_config *cfg,
}
p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL,
- MCAST_INCLUDE, cfg->rt_protocol);
- if (unlikely(!p)) {
- NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new (S, G) port group");
+ MCAST_INCLUDE, cfg->rt_protocol, extack);
+ if (unlikely(!p))
return -ENOMEM;
- }
+
rcu_assign_pointer(*pp, p);
if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry)
mod_timer(&p->timer,
@@ -1075,11 +1079,10 @@ static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg,
}
p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL,
- cfg->filter_mode, cfg->rt_protocol);
- if (unlikely(!p)) {
- NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new (*, G) port group");
+ cfg->filter_mode, cfg->rt_protocol,
+ extack);
+ if (unlikely(!p))
return -ENOMEM;
- }
err = br_mdb_add_group_srcs(cfg, p, brmctx, extack);
if (err)
@@ -1101,8 +1104,7 @@ static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg,
return 0;
err_del_port_group:
- hlist_del_init(&p->mglist);
- kfree(p);
+ br_multicast_del_port_group(p);
return err;
}
@@ -1297,6 +1299,14 @@ static int br_mdb_config_attrs_init(struct nlattr *set_attrs,
return 0;
}
+static const struct nla_policy mdba_policy[MDBA_SET_ENTRY_MAX + 1] = {
+ [MDBA_SET_ENTRY_UNSPEC] = { .strict_start_type = MDBA_SET_ENTRY_ATTRS + 1 },
+ [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
+ validate_mdb_entry,
+ sizeof(struct br_mdb_entry)),
+ [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED },
+};
+
static int br_mdb_config_init(struct net *net, const struct nlmsghdr *nlh,
struct br_mdb_config *cfg,
struct netlink_ext_ack *extack)
@@ -1307,7 +1317,7 @@ static int br_mdb_config_init(struct net *net, const struct nlmsghdr *nlh,
int err;
err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb,
- MDBA_SET_ENTRY_MAX, NULL, extack);
+ MDBA_SET_ENTRY_MAX, mdba_policy, extack);
if (err)
return err;
@@ -1349,14 +1359,8 @@ static int br_mdb_config_init(struct net *net, const struct nlmsghdr *nlh,
NL_SET_ERR_MSG_MOD(extack, "Missing MDBA_SET_ENTRY attribute");
return -EINVAL;
}
- if (nla_len(tb[MDBA_SET_ENTRY]) != sizeof(struct br_mdb_entry)) {
- NL_SET_ERR_MSG_MOD(extack, "Invalid MDBA_SET_ENTRY attribute length");
- return -EINVAL;
- }
cfg->entry = nla_data(tb[MDBA_SET_ENTRY]);
- if (!is_valid_mdb_entry(cfg->entry, extack))
- return -EINVAL;
if (cfg->entry->ifindex != cfg->br->dev->ifindex) {
struct net_device *pdev;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index dea1ee1bd095..96d1fc78dd39 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -31,6 +31,7 @@
#include <net/ip6_checksum.h>
#include <net/addrconf.h>
#endif
+#include <trace/events/bridge.h>
#include "br_private.h"
#include "br_private_mcast_eht.h"
@@ -234,6 +235,29 @@ out:
return pmctx;
}
+static struct net_bridge_mcast_port *
+br_multicast_port_vid_to_port_ctx(struct net_bridge_port *port, u16 vid)
+{
+ struct net_bridge_mcast_port *pmctx = NULL;
+ struct net_bridge_vlan *vlan;
+
+ lockdep_assert_held_once(&port->br->multicast_lock);
+
+ if (!br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED))
+ return NULL;
+
+ /* Take RCU to access the vlan. */
+ rcu_read_lock();
+
+ vlan = br_vlan_find(nbp_vlan_group_rcu(port), vid);
+ if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx))
+ pmctx = &vlan->port_mcast_ctx;
+
+ rcu_read_unlock();
+
+ return pmctx;
+}
+
/* when snooping we need to check if the contexts should be used
* in the following order:
* - if pmctx is non-NULL (port), check if it should be used
@@ -668,6 +692,101 @@ void br_multicast_del_group_src(struct net_bridge_group_src *src,
__br_multicast_del_group_src(src);
}
+static int
+br_multicast_port_ngroups_inc_one(struct net_bridge_mcast_port *pmctx,
+ struct netlink_ext_ack *extack,
+ const char *what)
+{
+ u32 max = READ_ONCE(pmctx->mdb_max_entries);
+ u32 n = READ_ONCE(pmctx->mdb_n_entries);
+
+ if (max && n >= max) {
+ NL_SET_ERR_MSG_FMT_MOD(extack, "%s is already in %u groups, and mcast_max_groups=%u",
+ what, n, max);
+ return -E2BIG;
+ }
+
+ WRITE_ONCE(pmctx->mdb_n_entries, n + 1);
+ return 0;
+}
+
+static void br_multicast_port_ngroups_dec_one(struct net_bridge_mcast_port *pmctx)
+{
+ u32 n = READ_ONCE(pmctx->mdb_n_entries);
+
+ WARN_ON_ONCE(n == 0);
+ WRITE_ONCE(pmctx->mdb_n_entries, n - 1);
+}
+
+static int br_multicast_port_ngroups_inc(struct net_bridge_port *port,
+ const struct br_ip *group,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_mcast_port *pmctx;
+ int err;
+
+ lockdep_assert_held_once(&port->br->multicast_lock);
+
+ /* Always count on the port context. */
+ err = br_multicast_port_ngroups_inc_one(&port->multicast_ctx, extack,
+ "Port");
+ if (err) {
+ trace_br_mdb_full(port->dev, group);
+ return err;
+ }
+
+ /* Only count on the VLAN context if VID is given, and if snooping on
+ * that VLAN is enabled.
+ */
+ if (!group->vid)
+ return 0;
+
+ pmctx = br_multicast_port_vid_to_port_ctx(port, group->vid);
+ if (!pmctx)
+ return 0;
+
+ err = br_multicast_port_ngroups_inc_one(pmctx, extack, "Port-VLAN");
+ if (err) {
+ trace_br_mdb_full(port->dev, group);
+ goto dec_one_out;
+ }
+
+ return 0;
+
+dec_one_out:
+ br_multicast_port_ngroups_dec_one(&port->multicast_ctx);
+ return err;
+}
+
+static void br_multicast_port_ngroups_dec(struct net_bridge_port *port, u16 vid)
+{
+ struct net_bridge_mcast_port *pmctx;
+
+ lockdep_assert_held_once(&port->br->multicast_lock);
+
+ if (vid) {
+ pmctx = br_multicast_port_vid_to_port_ctx(port, vid);
+ if (pmctx)
+ br_multicast_port_ngroups_dec_one(pmctx);
+ }
+ br_multicast_port_ngroups_dec_one(&port->multicast_ctx);
+}
+
+u32 br_multicast_ngroups_get(const struct net_bridge_mcast_port *pmctx)
+{
+ return READ_ONCE(pmctx->mdb_n_entries);
+}
+
+void br_multicast_ngroups_set_max(struct net_bridge_mcast_port *pmctx, u32 max)
+{
+ WRITE_ONCE(pmctx->mdb_max_entries, max);
+}
+
+u32 br_multicast_ngroups_get_max(const struct net_bridge_mcast_port *pmctx)
+{
+ return READ_ONCE(pmctx->mdb_max_entries);
+}
+
static void br_multicast_destroy_port_group(struct net_bridge_mcast_gc *gc)
{
struct net_bridge_port_group *pg;
@@ -702,6 +821,7 @@ void br_multicast_del_pg(struct net_bridge_mdb_entry *mp,
} else {
br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE);
}
+ br_multicast_port_ngroups_dec(pg->key.port, pg->key.addr.vid);
hlist_add_head(&pg->mcast_gc.gc_node, &br->mcast_gc_list);
queue_work(system_long_wq, &br->mcast_gc_work);
@@ -1165,6 +1285,7 @@ struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
return mp;
if (atomic_read(&br->mdb_hash_tbl.nelems) >= br->hash_max) {
+ trace_br_mdb_full(br->dev, group);
br_mc_disabled_update(br->dev, false, NULL);
br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false);
return ERR_PTR(-E2BIG);
@@ -1284,14 +1405,22 @@ struct net_bridge_port_group *br_multicast_new_port_group(
unsigned char flags,
const unsigned char *src,
u8 filter_mode,
- u8 rt_protocol)
+ u8 rt_protocol,
+ struct netlink_ext_ack *extack)
{
struct net_bridge_port_group *p;
+ int err;
- p = kzalloc(sizeof(*p), GFP_ATOMIC);
- if (unlikely(!p))
+ err = br_multicast_port_ngroups_inc(port, group, extack);
+ if (err)
return NULL;
+ p = kzalloc(sizeof(*p), GFP_ATOMIC);
+ if (unlikely(!p)) {
+ NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group");
+ goto dec_out;
+ }
+
p->key.addr = *group;
p->key.port = port;
p->flags = flags;
@@ -1305,8 +1434,8 @@ struct net_bridge_port_group *br_multicast_new_port_group(
if (!br_multicast_is_star_g(group) &&
rhashtable_lookup_insert_fast(&port->br->sg_port_tbl, &p->rhnode,
br_sg_port_rht_params)) {
- kfree(p);
- return NULL;
+ NL_SET_ERR_MSG_MOD(extack, "Couldn't insert new port group");
+ goto free_out;
}
rcu_assign_pointer(p->next, next);
@@ -1320,6 +1449,25 @@ struct net_bridge_port_group *br_multicast_new_port_group(
eth_broadcast_addr(p->eth_addr);
return p;
+
+free_out:
+ kfree(p);
+dec_out:
+ br_multicast_port_ngroups_dec(port, group->vid);
+ return NULL;
+}
+
+void br_multicast_del_port_group(struct net_bridge_port_group *p)
+{
+ struct net_bridge_port *port = p->key.port;
+ __u16 vid = p->key.addr.vid;
+
+ hlist_del_init(&p->mglist);
+ if (!br_multicast_is_star_g(&p->key.addr))
+ rhashtable_remove_fast(&port->br->sg_port_tbl, &p->rhnode,
+ br_sg_port_rht_params);
+ kfree(p);
+ br_multicast_port_ngroups_dec(port, vid);
}
void br_multicast_host_join(const struct net_bridge_mcast *brmctx,
@@ -1387,7 +1535,7 @@ __br_multicast_add_group(struct net_bridge_mcast *brmctx,
}
p = br_multicast_new_port_group(pmctx->port, group, *pp, 0, src,
- filter_mode, RTPROT_KERNEL);
+ filter_mode, RTPROT_KERNEL, NULL);
if (unlikely(!p)) {
p = ERR_PTR(-ENOMEM);
goto out;
@@ -1933,6 +2081,25 @@ static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx)
br_ip4_multicast_add_router(brmctx, pmctx);
br_ip6_multicast_add_router(brmctx, pmctx);
}
+
+ if (br_multicast_port_ctx_is_vlan(pmctx)) {
+ struct net_bridge_port_group *pg;
+ u32 n = 0;
+
+ /* The mcast_n_groups counter might be wrong. First,
+ * BR_VLFLAG_MCAST_ENABLED is toggled before temporary entries
+ * are flushed, thus mcast_n_groups after the toggle does not
+ * reflect the true values. And second, permanent entries added
+ * while BR_VLFLAG_MCAST_ENABLED was disabled, are not reflected
+ * either. Thus we have to refresh the counter.
+ */
+
+ hlist_for_each_entry(pg, &pmctx->port->mglist, mglist) {
+ if (pg->key.addr.vid == pmctx->vlan->vid)
+ n++;
+ }
+ WRITE_ONCE(pmctx->mdb_n_entries, n);
+ }
}
void br_multicast_enable_port(struct net_bridge_port *port)
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 4316cc82ae17..9173e52b89e2 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -202,6 +202,8 @@ static inline size_t br_port_info_size(void)
+ nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */
+ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_N_GROUPS */
+ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_MAX_GROUPS */
#endif
+ nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */
+ nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */
@@ -298,7 +300,11 @@ static int br_port_fill_attrs(struct sk_buff *skb,
nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT,
p->multicast_eht_hosts_limit) ||
nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT,
- p->multicast_eht_hosts_cnt))
+ p->multicast_eht_hosts_cnt) ||
+ nla_put_u32(skb, IFLA_BRPORT_MCAST_N_GROUPS,
+ br_multicast_ngroups_get(&p->multicast_ctx)) ||
+ nla_put_u32(skb, IFLA_BRPORT_MCAST_MAX_GROUPS,
+ br_multicast_ngroups_get_max(&p->multicast_ctx)))
return -EMSGSIZE;
#endif
@@ -858,6 +864,8 @@ static int br_afspec(struct net_bridge *br,
}
static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
+ [IFLA_BRPORT_UNSPEC] = { .strict_start_type =
+ IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT + 1 },
[IFLA_BRPORT_STATE] = { .type = NLA_U8 },
[IFLA_BRPORT_COST] = { .type = NLA_U32 },
[IFLA_BRPORT_PRIORITY] = { .type = NLA_U16 },
@@ -881,6 +889,8 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
[IFLA_BRPORT_MAB] = { .type = NLA_U8 },
[IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 },
[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 },
+ [IFLA_BRPORT_MCAST_N_GROUPS] = { .type = NLA_REJECT },
+ [IFLA_BRPORT_MCAST_MAX_GROUPS] = { .type = NLA_U32 },
};
/* Change the state of the port and notify spanning tree */
@@ -1015,6 +1025,13 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[],
if (err)
return err;
}
+
+ if (tb[IFLA_BRPORT_MCAST_MAX_GROUPS]) {
+ u32 max_groups;
+
+ max_groups = nla_get_u32(tb[IFLA_BRPORT_MCAST_MAX_GROUPS]);
+ br_multicast_ngroups_set_max(&p->multicast_ctx, max_groups);
+ }
#endif
if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) {
diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c
index 8914290c75d4..17abf092f7ca 100644
--- a/net/bridge/br_netlink_tunnel.c
+++ b/net/bridge/br_netlink_tunnel.c
@@ -188,6 +188,9 @@ initvars:
}
static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1] = {
+ [IFLA_BRIDGE_VLAN_TUNNEL_UNSPEC] = {
+ .strict_start_type = IFLA_BRIDGE_VLAN_TUNNEL_FLAGS + 1
+ },
[IFLA_BRIDGE_VLAN_TUNNEL_ID] = { .type = NLA_U32 },
[IFLA_BRIDGE_VLAN_TUNNEL_VID] = { .type = NLA_U16 },
[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = { .type = NLA_U16 },
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 15ef7fd508ee..cef5f6ea850c 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -126,6 +126,8 @@ struct net_bridge_mcast_port {
struct hlist_node ip6_rlist;
#endif /* IS_ENABLED(CONFIG_IPV6) */
unsigned char multicast_router;
+ u32 mdb_n_entries;
+ u32 mdb_max_entries;
#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */
};
@@ -956,7 +958,9 @@ br_multicast_new_port_group(struct net_bridge_port *port,
const struct br_ip *group,
struct net_bridge_port_group __rcu *next,
unsigned char flags, const unsigned char *src,
- u8 filter_mode, u8 rt_protocol);
+ u8 filter_mode, u8 rt_protocol,
+ struct netlink_ext_ack *extack);
+void br_multicast_del_port_group(struct net_bridge_port_group *p);
int br_mdb_hash_init(struct net_bridge *br);
void br_mdb_hash_fini(struct net_bridge *br);
void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp,
@@ -974,6 +978,9 @@ void br_multicast_uninit_stats(struct net_bridge *br);
void br_multicast_get_stats(const struct net_bridge *br,
const struct net_bridge_port *p,
struct br_mcast_stats *dest);
+u32 br_multicast_ngroups_get(const struct net_bridge_mcast_port *pmctx);
+void br_multicast_ngroups_set_max(struct net_bridge_mcast_port *pmctx, u32 max);
+u32 br_multicast_ngroups_get_max(const struct net_bridge_mcast_port *pmctx);
void br_mdb_init(void);
void br_mdb_uninit(void);
void br_multicast_host_join(const struct net_bridge_mcast *brmctx,
@@ -1757,7 +1764,8 @@ static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid)
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr,
const struct net_bridge_vlan *range_end);
-bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v);
+bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v,
+ const struct net_bridge_port *p);
size_t br_vlan_opts_nl_size(void);
int br_vlan_process_options(const struct net_bridge *br,
const struct net_bridge_port *p,
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index bc75fa1e4666..8a3dbc09ba38 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1816,6 +1816,7 @@ out_err:
/* v_opts is used to dump the options which must be equal in the whole range */
static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range,
const struct net_bridge_vlan *v_opts,
+ const struct net_bridge_port *p,
u16 flags,
bool dump_stats)
{
@@ -1842,7 +1843,7 @@ static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range,
goto out_err;
if (v_opts) {
- if (!br_vlan_opts_fill(skb, v_opts))
+ if (!br_vlan_opts_fill(skb, v_opts, p))
goto out_err;
if (dump_stats && !br_vlan_stats_fill(skb, v_opts))
@@ -1925,7 +1926,7 @@ void br_vlan_notify(const struct net_bridge *br,
goto out_kfree;
}
- if (!br_vlan_fill_vids(skb, vid, vid_range, v, flags, false))
+ if (!br_vlan_fill_vids(skb, vid, vid_range, v, p, flags, false))
goto out_err;
nlmsg_end(skb, nlh);
@@ -2030,7 +2031,7 @@ static int br_vlan_dump_dev(const struct net_device *dev,
if (!br_vlan_fill_vids(skb, range_start->vid,
range_end->vid, range_start,
- vlan_flags, dump_stats)) {
+ p, vlan_flags, dump_stats)) {
err = -EMSGSIZE;
break;
}
@@ -2056,7 +2057,7 @@ update_end:
else if (!dump_global &&
!br_vlan_fill_vids(skb, range_start->vid,
range_end->vid, range_start,
- br_vlan_flags(range_start, pvid),
+ p, br_vlan_flags(range_start, pvid),
dump_stats))
err = -EMSGSIZE;
}
@@ -2131,6 +2132,8 @@ static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] =
[BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 },
[BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED },
[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER] = { .type = NLA_U8 },
+ [BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS] = { .type = NLA_REJECT },
+ [BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS] = { .type = NLA_U32 },
};
static int br_vlan_rtm_process_one(struct net_device *dev,
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
index a2724d03278c..e378c2f3a9e2 100644
--- a/net/bridge/br_vlan_options.c
+++ b/net/bridge/br_vlan_options.c
@@ -48,7 +48,8 @@ bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr,
curr_mc_rtr == range_mc_rtr;
}
-bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v)
+bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v,
+ const struct net_bridge_port *p)
{
if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, br_vlan_get_state(v)) ||
!__vlan_tun_put(skb, v))
@@ -58,6 +59,12 @@ bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v)
if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_MCAST_ROUTER,
br_vlan_multicast_router(v)))
return false;
+ if (p && !br_multicast_port_ctx_vlan_disabled(&v->port_mcast_ctx) &&
+ (nla_put_u32(skb, BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS,
+ br_multicast_ngroups_get(&v->port_mcast_ctx)) ||
+ nla_put_u32(skb, BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS,
+ br_multicast_ngroups_get_max(&v->port_mcast_ctx))))
+ return false;
#endif
return true;
@@ -70,6 +77,8 @@ size_t br_vlan_opts_nl_size(void)
+ nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_TINFO_ID */
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_MCAST_ROUTER */
+ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS */
+ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS */
#endif
+ 0;
}
@@ -212,6 +221,22 @@ static int br_vlan_process_one_opts(const struct net_bridge *br,
return err;
*changed = true;
}
+ if (tb[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS]) {
+ u32 val;
+
+ if (!p) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't set mcast_max_groups for non-port vlans");
+ return -EINVAL;
+ }
+ if (br_multicast_port_ctx_vlan_disabled(&v->port_mcast_ctx)) {
+ NL_SET_ERR_MSG_MOD(extack, "Multicast snooping disabled on this VLAN");
+ return -EINVAL;
+ }
+
+ val = nla_get_u32(tb[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS]);
+ br_multicast_ngroups_set_max(&v->port_mcast_ctx, val);
+ *changed = true;
+ }
#endif
return 0;
diff --git a/net/can/gw.c b/net/can/gw.c
index 23a3d89cad81..37528826935e 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -1139,6 +1139,13 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh,
if (gwj->dst.dev->type != ARPHRD_CAN)
goto out;
+ /* is sending the skb back to the incoming interface intended? */
+ if (gwj->src.dev == gwj->dst.dev &&
+ !(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK)) {
+ err = -EINVAL;
+ goto out;
+ }
+
ASSERT_RTNL();
err = cgw_register_filter(net, gwj);
diff --git a/net/can/isotp.c b/net/can/isotp.c
index fc81d77724a1..9bc344851704 100644
--- a/net/can/isotp.c
+++ b/net/can/isotp.c
@@ -1220,6 +1220,9 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len)
if (len < ISOTP_MIN_NAMELEN)
return -EINVAL;
+ if (addr->can_family != AF_CAN)
+ return -EINVAL;
+
/* sanitize tx CAN identifier */
if (tx_id & CAN_EFF_FLAG)
tx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK);
diff --git a/net/can/j1939/address-claim.c b/net/can/j1939/address-claim.c
index f33c47327927..ca4ad6cdd5cb 100644
--- a/net/can/j1939/address-claim.c
+++ b/net/can/j1939/address-claim.c
@@ -165,6 +165,46 @@ static void j1939_ac_process(struct j1939_priv *priv, struct sk_buff *skb)
* leaving this function.
*/
ecu = j1939_ecu_get_by_name_locked(priv, name);
+
+ if (ecu && ecu->addr == skcb->addr.sa) {
+ /* The ISO 11783-5 standard, in "4.5.2 - Address claim
+ * requirements", states:
+ * d) No CF shall begin, or resume, transmission on the
+ * network until 250 ms after it has successfully claimed
+ * an address except when responding to a request for
+ * address-claimed.
+ *
+ * But "Figure 6" and "Figure 7" in "4.5.4.2 - Address-claim
+ * prioritization" show that the CF begins the transmission
+ * after 250 ms from the first AC (address-claimed) message
+ * even if it sends another AC message during that time window
+ * to resolve the address contention with another CF.
+ *
+ * As stated in "4.4.2.3 - Address-claimed message":
+ * In order to successfully claim an address, the CF sending
+ * an address claimed message shall not receive a contending
+ * claim from another CF for at least 250 ms.
+ *
+ * As stated in "4.4.3.2 - NAME management (NM) message":
+ * 1) A commanding CF can
+ * d) request that a CF with a specified NAME transmit
+ * the address-claimed message with its current NAME.
+ * 2) A target CF shall
+ * d) send an address-claimed message in response to a
+ * request for a matching NAME
+ *
+ * Taking the above arguments into account, the 250 ms wait is
+ * requested only during network initialization.
+ *
+ * Do not restart the timer on AC message if both the NAME and
+ * the address match and so if the address has already been
+ * claimed (timer has expired) or the AC message has been sent
+ * to resolve the contention with another CF (timer is still
+ * running).
+ */
+ goto out_ecu_put;
+ }
+
if (!ecu && j1939_address_is_unicast(skcb->addr.sa))
ecu = j1939_ecu_create_locked(priv, name);
diff --git a/net/can/raw.c b/net/can/raw.c
index ba86782ba8bb..f64469b98260 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -523,6 +523,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
struct can_filter sfilter; /* single filter */
struct net_device *dev = NULL;
can_err_mask_t err_mask = 0;
+ int fd_frames;
int count = 0;
int err = 0;
@@ -664,17 +665,17 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
break;
case CAN_RAW_FD_FRAMES:
- if (optlen != sizeof(ro->fd_frames))
+ if (optlen != sizeof(fd_frames))
return -EINVAL;
- if (copy_from_sockptr(&ro->fd_frames, optval, optlen))
+ if (copy_from_sockptr(&fd_frames, optval, optlen))
return -EFAULT;
/* Enabling CAN XL includes CAN FD */
- if (ro->xl_frames && !ro->fd_frames) {
- ro->fd_frames = ro->xl_frames;
+ if (ro->xl_frames && !fd_frames)
return -EINVAL;
- }
+
+ ro->fd_frames = fd_frames;
break;
case CAN_RAW_XL_FRAMES:
diff --git a/net/core/Makefile b/net/core/Makefile
index 10edd66a8a37..8f367813bc68 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -12,7 +12,8 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
obj-y += dev.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
- fib_notifier.o xdp.o flow_offload.o gro.o
+ fib_notifier.o xdp.o flow_offload.o gro.o \
+ netdev-genl.o netdev-genl-gen.o
obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
diff --git a/net/core/dev.c b/net/core/dev.c
index bb42150a38ec..7307a0c15c9f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1614,6 +1614,7 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
+ N(XDP_FEAT_CHANGE)
}
#undef N
return "UNKNOWN_NETDEV_EVENT";
diff --git a/net/core/dev.h b/net/core/dev.h
index a065b7571441..e075e198092c 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -9,6 +9,7 @@ struct net_device;
struct netdev_bpf;
struct netdev_phys_item_id;
struct netlink_ext_ack;
+struct cpumask;
/* Random bits of netdevice that don't need to be exposed */
#define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */
@@ -134,4 +135,5 @@ static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
WRITE_ONCE(dev->gro_ipv4_max_size, size);
}
+int rps_cpumask_housekeeping(struct cpumask *mask);
#endif
diff --git a/net/core/filter.c b/net/core/filter.c
index d8f9b53f3db6..2ce06a72a5ba 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4318,16 +4318,13 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
enum bpf_map_type map_type = ri->map_type;
- /* XDP_REDIRECT is not fully supported yet for xdp frags since
- * not all XDP capable drivers can map non-linear xdp_frame in
- * ndo_xdp_xmit.
- */
- if (unlikely(xdp_buff_has_frags(xdp) &&
- map_type != BPF_MAP_TYPE_CPUMAP))
- return -EOPNOTSUPP;
+ if (map_type == BPF_MAP_TYPE_XSKMAP) {
+ /* XDP_REDIRECT is not supported AF_XDP yet. */
+ if (unlikely(xdp_buff_has_frags(xdp)))
+ return -EOPNOTSUPP;
- if (map_type == BPF_MAP_TYPE_XSKMAP)
return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
+ }
return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp),
xdp_prog);
@@ -7536,7 +7533,7 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = {
.arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
.arg1_size = sizeof(struct iphdr),
.arg2_type = ARG_PTR_TO_MEM,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
};
BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph,
@@ -7568,7 +7565,7 @@ static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = {
.arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
.arg1_size = sizeof(struct ipv6hdr),
.arg2_type = ARG_PTR_TO_MEM,
- .arg3_type = ARG_CONST_SIZE,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
};
BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph,
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 57258110bccd..6798f6d2423b 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -269,7 +269,7 @@ static int neigh_forced_gc(struct neigh_table *tbl)
(n->nud_state == NUD_NOARP) ||
(tbl->is_multicast &&
tbl->is_multicast(n->primary_key)) ||
- time_after(tref, n->updated))
+ !time_in_range(n->updated, tref, jiffies))
remove = true;
write_unlock(&n->lock);
@@ -289,7 +289,17 @@ static int neigh_forced_gc(struct neigh_table *tbl)
static void neigh_add_timer(struct neighbour *n, unsigned long when)
{
+ /* Use safe distance from the jiffies - LONG_MAX point while timer
+ * is running in DELAY/PROBE state but still show to user space
+ * large times in the past.
+ */
+ unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ);
+
neigh_hold(n);
+ if (!time_in_range(n->confirmed, mint, jiffies))
+ n->confirmed = mint;
+ if (time_before(n->used, n->confirmed))
+ n->used = n->confirmed;
if (unlikely(mod_timer(&n->timer, when))) {
printk("NEIGH: BUG, double timer add, state is %x\n",
n->nud_state);
@@ -1001,12 +1011,14 @@ static void neigh_periodic_work(struct work_struct *work)
goto next_elt;
}
- if (time_before(n->used, n->confirmed))
+ if (time_before(n->used, n->confirmed) &&
+ time_is_before_eq_jiffies(n->confirmed))
n->used = n->confirmed;
if (refcount_read(&n->refcnt) == 1 &&
(state == NUD_FAILED ||
- time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
+ !time_in_range_open(jiffies, n->used,
+ n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
*np = n->next;
neigh_mark_dead(n);
write_unlock(&n->lock);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index ca55dd747d6c..4b361ac6a252 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -831,42 +831,18 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf)
return len < PAGE_SIZE ? len : -EINVAL;
}
-static ssize_t store_rps_map(struct netdev_rx_queue *queue,
- const char *buf, size_t len)
+static int netdev_rx_queue_set_rps_mask(struct netdev_rx_queue *queue,
+ cpumask_var_t mask)
{
- struct rps_map *old_map, *map;
- cpumask_var_t mask;
- int err, cpu, i;
static DEFINE_MUTEX(rps_map_mutex);
-
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
- return -ENOMEM;
-
- err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
- if (err) {
- free_cpumask_var(mask);
- return err;
- }
-
- if (!cpumask_empty(mask)) {
- cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
- cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ));
- if (cpumask_empty(mask)) {
- free_cpumask_var(mask);
- return -EINVAL;
- }
- }
+ struct rps_map *old_map, *map;
+ int cpu, i;
map = kzalloc(max_t(unsigned int,
RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
GFP_KERNEL);
- if (!map) {
- free_cpumask_var(mask);
+ if (!map)
return -ENOMEM;
- }
i = 0;
for_each_cpu_and(cpu, mask, cpu_online_mask)
@@ -893,9 +869,45 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
if (old_map)
kfree_rcu(old_map, rcu);
+ return 0;
+}
+int rps_cpumask_housekeeping(struct cpumask *mask)
+{
+ if (!cpumask_empty(mask)) {
+ cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ));
+ if (cpumask_empty(mask))
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static ssize_t store_rps_map(struct netdev_rx_queue *queue,
+ const char *buf, size_t len)
+{
+ cpumask_var_t mask;
+ int err;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+ if (err)
+ goto out;
+
+ err = rps_cpumask_housekeeping(mask);
+ if (err)
+ goto out;
+
+ err = netdev_rx_queue_set_rps_mask(queue, mask);
+
+out:
free_cpumask_var(mask);
- return len;
+ return err ? : len;
}
static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
@@ -1071,6 +1083,13 @@ static int rx_queue_add_kobject(struct net_device *dev, int index)
goto err;
}
+#if IS_ENABLED(CONFIG_RPS) && IS_ENABLED(CONFIG_SYSCTL)
+ if (!cpumask_empty(&rps_default_mask)) {
+ error = netdev_rx_queue_set_rps_mask(queue, &rps_default_mask);
+ if (error)
+ goto err;
+ }
+#endif
kobject_uevent(kobj, KOBJ_ADD);
return error;
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index ee7006bbe49b..805b7385dd8d 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -41,6 +41,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add);
EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_external_learn_add);
EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete);
EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update);
+EXPORT_TRACEPOINT_SYMBOL_GPL(br_mdb_full);
#endif
#if IS_ENABLED(CONFIG_PAGE_POOL)
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
new file mode 100644
index 000000000000..48812ec843f5
--- /dev/null
+++ b/net/core/netdev-genl-gen.c
@@ -0,0 +1,48 @@
+// SPDX-License-Identifier: BSD-3-Clause
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/netdev.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "netdev-genl-gen.h"
+
+#include <linux/netdev.h>
+
+/* NETDEV_CMD_DEV_GET - do */
+static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = {
+ [NETDEV_A_DEV_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+/* Ops table for netdev */
+static const struct genl_split_ops netdev_nl_ops[2] = {
+ {
+ .cmd = NETDEV_CMD_DEV_GET,
+ .doit = netdev_nl_dev_get_doit,
+ .policy = netdev_dev_get_nl_policy,
+ .maxattr = NETDEV_A_DEV_IFINDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_DEV_GET,
+ .dumpit = netdev_nl_dev_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+};
+
+static const struct genl_multicast_group netdev_nl_mcgrps[] = {
+ [NETDEV_NLGRP_MGMT] = { "mgmt", },
+};
+
+struct genl_family netdev_nl_family __ro_after_init = {
+ .name = NETDEV_FAMILY_NAME,
+ .version = NETDEV_FAMILY_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .split_ops = netdev_nl_ops,
+ .n_split_ops = ARRAY_SIZE(netdev_nl_ops),
+ .mcgrps = netdev_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(netdev_nl_mcgrps),
+};
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
new file mode 100644
index 000000000000..b16dc7e026bb
--- /dev/null
+++ b/net/core/netdev-genl-gen.h
@@ -0,0 +1,23 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/netdev.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_NETDEV_GEN_H
+#define _LINUX_NETDEV_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <linux/netdev.h>
+
+int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+
+enum {
+ NETDEV_NLGRP_MGMT,
+};
+
+extern struct genl_family netdev_nl_family;
+
+#endif /* _LINUX_NETDEV_GEN_H */
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
new file mode 100644
index 000000000000..a4270fafdf11
--- /dev/null
+++ b/net/core/netdev-genl.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/netdevice.h>
+#include <linux/notifier.h>
+#include <linux/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+
+#include "netdev-genl-gen.h"
+
+static int
+netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
+ u32 portid, u32 seq, int flags, u32 cmd)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(rsp, portid, seq, &netdev_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
+ nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES,
+ netdev->xdp_features, NETDEV_A_DEV_PAD)) {
+ genlmsg_cancel(rsp, hdr);
+ return -EINVAL;
+ }
+
+ genlmsg_end(rsp, hdr);
+
+ return 0;
+}
+
+static void
+netdev_genl_dev_notify(struct net_device *netdev, int cmd)
+{
+ struct sk_buff *ntf;
+
+ if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev),
+ NETDEV_NLGRP_MGMT))
+ return;
+
+ ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!ntf)
+ return;
+
+ if (netdev_nl_dev_fill(netdev, ntf, 0, 0, 0, cmd)) {
+ nlmsg_free(ntf);
+ return;
+ }
+
+ genlmsg_multicast_netns(&netdev_nl_family, dev_net(netdev), ntf,
+ 0, NETDEV_NLGRP_MGMT, GFP_KERNEL);
+}
+
+int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *netdev;
+ struct sk_buff *rsp;
+ u32 ifindex;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX))
+ return -EINVAL;
+
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ rtnl_lock();
+
+ netdev = __dev_get_by_index(genl_info_net(info), ifindex);
+ if (netdev)
+ err = netdev_nl_dev_fill(netdev, rsp, info->snd_portid,
+ info->snd_seq, 0, info->genlhdr->cmd);
+ else
+ err = -ENODEV;
+
+ rtnl_unlock();
+
+ if (err)
+ goto err_free_msg;
+
+ return genlmsg_reply(rsp, info);
+
+err_free_msg:
+ nlmsg_free(rsp);
+ return err;
+}
+
+int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct net_device *netdev;
+ int idx = 0, s_idx;
+ int h, s_h;
+ int err;
+
+ s_h = cb->args[0];
+ s_idx = cb->args[1];
+
+ rtnl_lock();
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ struct hlist_head *head;
+
+ idx = 0;
+ head = &net->dev_index_head[h];
+ hlist_for_each_entry(netdev, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ err = netdev_nl_dev_fill(netdev, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, 0,
+ NETDEV_CMD_DEV_GET);
+ if (err < 0)
+ break;
+cont:
+ idx++;
+ }
+ }
+
+ rtnl_unlock();
+
+ if (err != -EMSGSIZE)
+ return err;
+
+ cb->args[1] = idx;
+ cb->args[0] = h;
+ cb->seq = net->dev_base_seq;
+
+ return skb->len;
+}
+
+static int netdev_genl_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF);
+ break;
+ case NETDEV_UNREGISTER:
+ netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF);
+ break;
+ case NETDEV_XDP_FEAT_CHANGE:
+ netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block netdev_genl_nb = {
+ .notifier_call = netdev_genl_netdevice_event,
+};
+
+static int __init netdev_genl_init(void)
+{
+ int err;
+
+ err = register_netdevice_notifier(&netdev_genl_nb);
+ if (err)
+ return err;
+
+ err = genl_register_family(&netdev_nl_family);
+ if (err)
+ goto err_unreg_ntf;
+
+ return 0;
+
+err_unreg_ntf:
+ unregister_netdevice_notifier(&netdev_genl_nb);
+ return err;
+}
+
+subsys_initcall(netdev_genl_init);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 9b203d8660e4..193c18799865 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -511,8 +511,8 @@ static void page_pool_return_page(struct page_pool *pool, struct page *page)
static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
{
int ret;
- /* BH protection not needed if current is serving softirq */
- if (in_serving_softirq())
+ /* BH protection not needed if current is softirq */
+ if (in_softirq())
ret = ptr_ring_produce(&pool->ring, page);
else
ret = ptr_ring_produce_bh(&pool->ring, page);
@@ -570,7 +570,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
page_pool_dma_sync_for_device(pool, page,
dma_sync_size);
- if (allow_direct && in_serving_softirq() &&
+ if (allow_direct && in_softirq() &&
page_pool_recycle_in_cache(page, pool))
return NULL;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index b9f584955b77..5d8eb57867a9 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -58,7 +58,7 @@
#include "dev.h"
#define RTNL_MAX_TYPE 50
-#define RTNL_SLAVE_MAX_TYPE 40
+#define RTNL_SLAVE_MAX_TYPE 42
struct rtnl_link {
rtnl_doit_func doit;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 44a19805c355..13ea10cf8544 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -84,11 +84,39 @@
#include "dev.h"
#include "sock_destructor.h"
-struct kmem_cache *skbuff_head_cache __ro_after_init;
+struct kmem_cache *skbuff_cache __ro_after_init;
static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
+
+/* skb_small_head_cache and related code is only supported
+ * for CONFIG_SLAB and CONFIG_SLUB.
+ * As soon as SLOB is removed from the kernel, we can clean up this.
+ */
+#if !defined(CONFIG_SLOB)
+# define HAVE_SKB_SMALL_HEAD_CACHE 1
+#endif
+
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+static struct kmem_cache *skb_small_head_cache __ro_after_init;
+
+#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)
+
+/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
+ * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
+ * size, and we can differentiate heads from skb_small_head_cache
+ * vs system slabs by looking at their size (skb_end_offset()).
+ */
+#define SKB_SMALL_HEAD_CACHE_SIZE \
+ (is_power_of_2(SKB_SMALL_HEAD_SIZE) ? \
+ (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) : \
+ SKB_SMALL_HEAD_SIZE)
+
+#define SKB_SMALL_HEAD_HEADROOM \
+ SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
+#endif /* HAVE_SKB_SMALL_HEAD_CACHE */
+
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);
@@ -257,7 +285,7 @@ static struct sk_buff *napi_skb_cache_get(void)
struct sk_buff *skb;
if (unlikely(!nc->skb_count)) {
- nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
+ nc->skb_count = kmem_cache_alloc_bulk(skbuff_cache,
GFP_ATOMIC,
NAPI_SKB_CACHE_BULK,
nc->skb_cache);
@@ -266,7 +294,7 @@ static struct sk_buff *napi_skb_cache_get(void)
}
skb = nc->skb_cache[--nc->skb_count];
- kasan_unpoison_object_data(skbuff_head_cache, skb);
+ kasan_unpoison_object_data(skbuff_cache, skb);
return skb;
}
@@ -324,7 +352,7 @@ struct sk_buff *slab_build_skb(void *data)
struct sk_buff *skb;
unsigned int size;
- skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+ skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
if (unlikely(!skb))
return NULL;
@@ -375,7 +403,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
struct sk_buff *skb;
- skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+ skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
if (unlikely(!skb))
return NULL;
@@ -478,17 +506,37 @@ EXPORT_SYMBOL(napi_build_skb);
* may be used. Otherwise, the packet data may be discarded until enough
* memory is free
*/
-static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
+static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
bool *pfmemalloc)
{
- void *obj;
bool ret_pfmemalloc = false;
+ unsigned int obj_size;
+ void *obj;
+
+ obj_size = SKB_HEAD_ALIGN(*size);
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+ if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
+ !(flags & KMALLOC_NOT_NORMAL_BITS)) {
+ /* skb_small_head_cache has non power of two size,
+ * likely forcing SLUB to use order-3 pages.
+ * We deliberately attempt a NOMEMALLOC allocation only.
+ */
+ obj = kmem_cache_alloc_node(skb_small_head_cache,
+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ node);
+ if (obj) {
+ *size = SKB_SMALL_HEAD_CACHE_SIZE;
+ goto out;
+ }
+ }
+#endif
+ *size = obj_size = kmalloc_size_roundup(obj_size);
/*
* Try a regular allocation, when that fails and we're not entitled
* to the reserves, fail.
*/
- obj = kmalloc_node_track_caller(size,
+ obj = kmalloc_node_track_caller(obj_size,
flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
node);
if (obj || !(gfp_pfmemalloc_allowed(flags)))
@@ -496,7 +544,7 @@ static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
/* Try again but now we are using pfmemalloc reserves */
ret_pfmemalloc = true;
- obj = kmalloc_node_track_caller(size, flags, node);
+ obj = kmalloc_node_track_caller(obj_size, flags, node);
out:
if (pfmemalloc)
@@ -533,12 +581,11 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
{
struct kmem_cache *cache;
struct sk_buff *skb;
- unsigned int osize;
bool pfmemalloc;
u8 *data;
cache = (flags & SKB_ALLOC_FCLONE)
- ? skbuff_fclone_cache : skbuff_head_cache;
+ ? skbuff_fclone_cache : skbuff_cache;
if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
gfp_mask |= __GFP_MEMALLOC;
@@ -558,18 +605,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
* aligned memory blocks, unless SLUB/SLAB debug is enabled.
* Both skb->head and skb_shared_info are cache line aligned.
*/
- size = SKB_DATA_ALIGN(size);
- size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- osize = kmalloc_size_roundup(size);
- data = kmalloc_reserve(osize, gfp_mask, node, &pfmemalloc);
+ data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
if (unlikely(!data))
goto nodata;
/* kmalloc_size_roundup() might give us more room than requested.
* Put skb_shared_info exactly at the end of allocated zone,
* to allow max possible filling before reallocation.
*/
- size = SKB_WITH_OVERHEAD(osize);
- prefetchw(data + size);
+ prefetchw(data + SKB_WITH_OVERHEAD(size));
/*
* Only clear those fields we need to clear, not those that we will
@@ -577,7 +620,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
* the tail pointer in struct sk_buff!
*/
memset(skb, 0, offsetof(struct sk_buff, tail));
- __build_skb_around(skb, data, osize);
+ __build_skb_around(skb, data, size);
skb->pfmemalloc = pfmemalloc;
if (flags & SKB_ALLOC_FCLONE) {
@@ -632,8 +675,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
goto skb_success;
}
- len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- len = SKB_DATA_ALIGN(len);
+ len = SKB_HEAD_ALIGN(len);
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
@@ -732,8 +774,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
} else {
- len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- len = SKB_DATA_ALIGN(len);
+ len = SKB_HEAD_ALIGN(len);
data = page_frag_alloc(&nc->page, len, gfp_mask);
pfmemalloc = nc->page.pfmemalloc;
@@ -809,6 +850,16 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data)
return page_pool_return_skb_page(virt_to_page(data));
}
+static void skb_kfree_head(void *head, unsigned int end_offset)
+{
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+ if (end_offset == SKB_SMALL_HEAD_HEADROOM)
+ kmem_cache_free(skb_small_head_cache, head);
+ else
+#endif
+ kfree(head);
+}
+
static void skb_free_head(struct sk_buff *skb)
{
unsigned char *head = skb->head;
@@ -818,7 +869,7 @@ static void skb_free_head(struct sk_buff *skb)
return;
skb_free_frag(head);
} else {
- kfree(head);
+ skb_kfree_head(head, skb_end_offset(skb));
}
}
@@ -870,7 +921,7 @@ static void kfree_skbmem(struct sk_buff *skb)
switch (skb->fclone) {
case SKB_FCLONE_UNAVAILABLE:
- kmem_cache_free(skbuff_head_cache, skb);
+ kmem_cache_free(skbuff_cache, skb);
return;
case SKB_FCLONE_ORIG:
@@ -984,7 +1035,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
sa->skb_array[sa->skb_count++] = skb;
if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
- kmem_cache_free_bulk(skbuff_head_cache, KFREE_SKB_BULK_SIZE,
+ kmem_cache_free_bulk(skbuff_cache, KFREE_SKB_BULK_SIZE,
sa->skb_array);
sa->skb_count = 0;
}
@@ -1000,15 +1051,16 @@ kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
while (segs) {
struct sk_buff *next = segs->next;
- if (__kfree_skb_reason(segs, reason))
+ if (__kfree_skb_reason(segs, reason)) {
+ skb_poison_list(segs);
kfree_skb_add_bulk(segs, &sa, reason);
+ }
segs = next;
}
if (sa.skb_count)
- kmem_cache_free_bulk(skbuff_head_cache, sa.skb_count,
- sa.skb_array);
+ kmem_cache_free_bulk(skbuff_cache, sa.skb_count, sa.skb_array);
}
EXPORT_SYMBOL(kfree_skb_list_reason);
@@ -1162,15 +1214,15 @@ static void napi_skb_cache_put(struct sk_buff *skb)
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
u32 i;
- kasan_poison_object_data(skbuff_head_cache, skb);
+ kasan_poison_object_data(skbuff_cache, skb);
nc->skb_cache[nc->skb_count++] = skb;
if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
- kasan_unpoison_object_data(skbuff_head_cache,
+ kasan_unpoison_object_data(skbuff_cache,
nc->skb_cache[i]);
- kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF,
+ kmem_cache_free_bulk(skbuff_cache, NAPI_SKB_CACHE_HALF,
nc->skb_cache + NAPI_SKB_CACHE_HALF);
nc->skb_count = NAPI_SKB_CACHE_HALF;
}
@@ -1754,7 +1806,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+ n = kmem_cache_alloc(skbuff_cache, gfp_mask);
if (!n)
return NULL;
@@ -1936,10 +1988,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- size = SKB_DATA_ALIGN(size);
- size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- size = kmalloc_size_roundup(size);
- data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
+ data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
goto nodata;
size = SKB_WITH_OVERHEAD(size);
@@ -2002,7 +2051,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
return 0;
nofrags:
- kfree(data);
+ skb_kfree_head(data, size);
nodata:
return -ENOMEM;
}
@@ -4627,7 +4676,7 @@ static void skb_extensions_init(void) {}
void __init skb_init(void)
{
- skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache",
+ skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache",
sizeof(struct sk_buff),
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
@@ -4639,6 +4688,19 @@ void __init skb_init(void)
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+ /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes.
+ * struct skb_shared_info is located at the end of skb->head,
+ * and should not be copied to/from user.
+ */
+ skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head",
+ SKB_SMALL_HEAD_CACHE_SIZE,
+ 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC,
+ 0,
+ SKB_SMALL_HEAD_HEADROOM,
+ NULL);
+#endif
skb_extensions_init();
}
@@ -5493,7 +5555,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
{
if (head_stolen) {
skb_release_head_state(skb);
- kmem_cache_free(skbuff_head_cache, skb);
+ kmem_cache_free(skbuff_cache, skb);
} else {
__kfree_skb(skb);
}
@@ -6287,10 +6349,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- size = SKB_DATA_ALIGN(size);
- size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- size = kmalloc_size_roundup(size);
- data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
+ data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
return -ENOMEM;
size = SKB_WITH_OVERHEAD(size);
@@ -6306,7 +6365,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
if (skb_cloned(skb)) {
/* drop the old head gracefully */
if (skb_orphan_frags(skb, gfp_mask)) {
- kfree(data);
+ skb_kfree_head(data, size);
return -ENOMEM;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
@@ -6406,10 +6465,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- size = SKB_DATA_ALIGN(size);
- size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- size = kmalloc_size_roundup(size);
- data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
+ data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
return -ENOMEM;
size = SKB_WITH_OVERHEAD(size);
@@ -6417,7 +6473,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
memcpy((struct skb_shared_info *)(data + size),
skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
if (skb_orphan_frags(skb, gfp_mask)) {
- kfree(data);
+ skb_kfree_head(data, size);
return -ENOMEM;
}
shinfo = (struct skb_shared_info *)(data + size);
@@ -6453,7 +6509,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
/* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
if (skb_has_frag_list(skb))
kfree_skb_list(skb_shinfo(skb)->frag_list);
- kfree(data);
+ skb_kfree_head(data, size);
return -ENOMEM;
}
skb_release_data(skb, SKB_CONSUMED);
diff --git a/net/core/sock.c b/net/core/sock.c
index f08b76acde9b..afbb02984d5f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1531,6 +1531,8 @@ set_sndbuf:
ret = -EINVAL;
break;
}
+ if ((u8)val == SOCK_TXREHASH_DEFAULT)
+ val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
/* Paired with READ_ONCE() in tcp_rtx_synack() */
WRITE_ONCE(sk->sk_txrehash, (u8)val);
break;
@@ -3383,7 +3385,7 @@ void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
}
EXPORT_SYMBOL(sk_stop_timer_sync);
-void sock_init_data(struct socket *sock, struct sock *sk)
+void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
sk_init_common(sk);
sk->sk_send_head = NULL;
@@ -3403,11 +3405,10 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_type = sock->type;
RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
sock->sk = sk;
- sk->sk_uid = SOCK_INODE(sock)->i_uid;
} else {
RCU_INIT_POINTER(sk->sk_wq, NULL);
- sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
}
+ sk->sk_uid = uid;
rwlock_init(&sk->sk_callback_lock);
if (sk->sk_kern_sock)
@@ -3455,7 +3456,6 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_pacing_rate = ~0UL;
WRITE_ONCE(sk->sk_pacing_shift, 10);
sk->sk_incoming_cpu = -1;
- sk->sk_txrehash = SOCK_TXREHASH_DEFAULT;
sk_rx_queue_clear(sk);
/*
@@ -3466,6 +3466,16 @@ void sock_init_data(struct socket *sock, struct sock *sk)
refcount_set(&sk->sk_refcnt, 1);
atomic_set(&sk->sk_drops, 0);
}
+EXPORT_SYMBOL(sock_init_data_uid);
+
+void sock_init_data(struct socket *sock, struct sock *sk)
+{
+ kuid_t uid = sock ?
+ SOCK_INODE(sock)->i_uid :
+ make_kuid(sock_net(sk)->user_ns, 0);
+
+ sock_init_data_uid(sock, sk, uid);
+}
EXPORT_SYMBOL(sock_init_data);
void lock_sock_nested(struct sock *sk, int subclass)
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index e7b98162c632..7130e6d9e263 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -16,6 +16,7 @@
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/sched/isolation.h>
#include <net/ip.h>
#include <net/sock.h>
@@ -45,7 +46,59 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
int sysctl_devconf_inherit_init_net __read_mostly;
EXPORT_SYMBOL(sysctl_devconf_inherit_init_net);
+#if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS)
+static void dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos,
+ struct cpumask *mask)
+{
+ char kbuf[128];
+ int len;
+
+ if (*ppos || !*lenp) {
+ *lenp = 0;
+ return;
+ }
+
+ len = min(sizeof(kbuf) - 1, *lenp);
+ len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
+ if (!len) {
+ *lenp = 0;
+ return;
+ }
+
+ if (len < *lenp)
+ kbuf[len++] = '\n';
+ memcpy(buffer, kbuf, len);
+ *lenp = len;
+ *ppos += len;
+}
+#endif
+
#ifdef CONFIG_RPS
+struct cpumask rps_default_mask;
+
+static int rps_default_mask_sysctl(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int err = 0;
+
+ rtnl_lock();
+ if (write) {
+ err = cpumask_parse(buffer, &rps_default_mask);
+ if (err)
+ goto done;
+
+ err = rps_cpumask_housekeeping(&rps_default_mask);
+ if (err)
+ goto done;
+ } else {
+ dump_cpumask(buffer, lenp, ppos, &rps_default_mask);
+ }
+
+done:
+ rtnl_unlock();
+ return err;
+}
+
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -155,13 +208,6 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
write_unlock:
mutex_unlock(&flow_limit_update_mutex);
} else {
- char kbuf[128];
-
- if (*ppos || !*lenp) {
- *lenp = 0;
- goto done;
- }
-
cpumask_clear(mask);
rcu_read_lock();
for_each_possible_cpu(i) {
@@ -171,17 +217,7 @@ write_unlock:
}
rcu_read_unlock();
- len = min(sizeof(kbuf) - 1, *lenp);
- len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
- if (!len) {
- *lenp = 0;
- goto done;
- }
- if (len < *lenp)
- kbuf[len++] = '\n';
- memcpy(buffer, kbuf, len);
- *lenp = len;
- *ppos += len;
+ dump_cpumask(buffer, lenp, ppos, mask);
}
done:
@@ -472,6 +508,11 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = rps_sock_flow_sysctl
},
+ {
+ .procname = "rps_default_mask",
+ .mode = 0644,
+ .proc_handler = rps_default_mask_sysctl
+ },
#endif
#ifdef CONFIG_NET_FLOW_LIMIT
{
@@ -675,6 +716,10 @@ static __net_initdata struct pernet_operations sysctl_core_ops = {
static __init int sysctl_core_init(void)
{
+#if IS_ENABLED(CONFIG_RPS)
+ cpumask_copy(&rps_default_mask, cpu_none_mask);
+#endif
+
register_net_sysctl(&init_net, "net/core", net_core_table);
return register_pernet_subsys(&sysctl_core_ops);
}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index a5a7ecf6391c..8c92fc553317 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -4,6 +4,7 @@
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
*/
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/filter.h>
#include <linux/types.h>
@@ -603,8 +604,7 @@ EXPORT_SYMBOL_GPL(xdp_warn);
int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp)
{
- n_skb = kmem_cache_alloc_bulk(skbuff_head_cache, gfp,
- n_skb, skbs);
+ n_skb = kmem_cache_alloc_bulk(skbuff_cache, gfp, n_skb, skbs);
if (unlikely(!n_skb))
return -ENOMEM;
@@ -673,7 +673,7 @@ struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
{
struct sk_buff *skb;
- skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+ skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
if (unlikely(!skb))
return NULL;
@@ -722,7 +722,7 @@ __diag_ignore_all("-Wmissing-prototypes",
*
* Returns 0 on success or ``-errno`` on error.
*/
-int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
+__bpf_kfunc int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
{
return -EOPNOTSUPP;
}
@@ -734,7 +734,7 @@ int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
*
* Returns 0 on success or ``-errno`` on error.
*/
-int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash)
+__bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash)
{
return -EOPNOTSUPP;
}
@@ -773,3 +773,21 @@ static int __init xdp_metadata_init(void)
return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set);
}
late_initcall(xdp_metadata_init);
+
+void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg)
+{
+ dev->xdp_features |= NETDEV_XDP_ACT_NDO_XMIT;
+ if (support_sg)
+ dev->xdp_features |= NETDEV_XDP_ACT_NDO_XMIT_SG;
+
+ call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev);
+}
+EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target);
+
+void xdp_features_clear_redirect_target(struct net_device *dev)
+{
+ dev->xdp_features &= ~(NETDEV_XDP_ACT_NDO_XMIT |
+ NETDEV_XDP_ACT_NDO_XMIT_SG);
+ call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev);
+}
+EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target);
diff --git a/net/devlink/Makefile b/net/devlink/Makefile
index 1b1eeac59cb3..daad4521c61e 100644
--- a/net/devlink/Makefile
+++ b/net/devlink/Makefile
@@ -1,3 +1,3 @@
# SPDX-License-Identifier: GPL-2.0
-obj-y := leftover.o core.o netlink.o
+obj-y := leftover.o core.o netlink.o dev.o
diff --git a/net/devlink/core.c b/net/devlink/core.c
index aeffd1b8206d..a4f47dafb864 100644
--- a/net/devlink/core.c
+++ b/net/devlink/core.c
@@ -205,7 +205,7 @@ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
goto err_xa_alloc;
devlink->netdevice_nb.notifier_call = devlink_port_netdevice_event;
- ret = register_netdevice_notifier_net(net, &devlink->netdevice_nb);
+ ret = register_netdevice_notifier(&devlink->netdevice_nb);
if (ret)
goto err_register_netdevice_notifier;
@@ -266,8 +266,7 @@ void devlink_free(struct devlink *devlink)
xa_destroy(&devlink->snapshot_ids);
xa_destroy(&devlink->ports);
- WARN_ON_ONCE(unregister_netdevice_notifier_net(devlink_net(devlink),
- &devlink->netdevice_nb));
+ WARN_ON_ONCE(unregister_netdevice_notifier(&devlink->netdevice_nb));
xa_erase(&devlinks, devlink->index);
diff --git a/net/devlink/dev.c b/net/devlink/dev.c
new file mode 100644
index 000000000000..78d824eda5ec
--- /dev/null
+++ b/net/devlink/dev.c
@@ -0,0 +1,1343 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include <net/genetlink.h>
+#include <net/sock.h>
+#include "devl_internal.h"
+
+struct devlink_info_req {
+ struct sk_buff *msg;
+ void (*version_cb)(const char *version_name,
+ enum devlink_info_version_type version_type,
+ void *version_cb_priv);
+ void *version_cb_priv;
+};
+
+struct devlink_reload_combination {
+ enum devlink_reload_action action;
+ enum devlink_reload_limit limit;
+};
+
+static const struct devlink_reload_combination devlink_reload_invalid_combinations[] = {
+ {
+ /* can't reinitialize driver with no down time */
+ .action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
+ .limit = DEVLINK_RELOAD_LIMIT_NO_RESET,
+ },
+};
+
+static bool
+devlink_reload_combination_is_invalid(enum devlink_reload_action action,
+ enum devlink_reload_limit limit)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++)
+ if (devlink_reload_invalid_combinations[i].action == action &&
+ devlink_reload_invalid_combinations[i].limit == limit)
+ return true;
+ return false;
+}
+
+static bool
+devlink_reload_action_is_supported(struct devlink *devlink, enum devlink_reload_action action)
+{
+ return test_bit(action, &devlink->ops->reload_actions);
+}
+
+static bool
+devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_limit limit)
+{
+ return test_bit(limit, &devlink->ops->reload_limits);
+}
+
+static int devlink_reload_stat_put(struct sk_buff *msg,
+ enum devlink_reload_limit limit, u32 value)
+{
+ struct nlattr *reload_stats_entry;
+
+ reload_stats_entry = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS_ENTRY);
+ if (!reload_stats_entry)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) ||
+ nla_put_u32(msg, DEVLINK_ATTR_RELOAD_STATS_VALUE, value))
+ goto nla_put_failure;
+ nla_nest_end(msg, reload_stats_entry);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, reload_stats_entry);
+ return -EMSGSIZE;
+}
+
+static int
+devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote)
+{
+ struct nlattr *reload_stats_attr, *act_info, *act_stats;
+ int i, j, stat_idx;
+ u32 value;
+
+ if (!is_remote)
+ reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS);
+ else
+ reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_REMOTE_RELOAD_STATS);
+
+ if (!reload_stats_attr)
+ return -EMSGSIZE;
+
+ for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) {
+ if ((!is_remote &&
+ !devlink_reload_action_is_supported(devlink, i)) ||
+ i == DEVLINK_RELOAD_ACTION_UNSPEC)
+ continue;
+ act_info = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_INFO);
+ if (!act_info)
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, i))
+ goto action_info_nest_cancel;
+ act_stats = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_STATS);
+ if (!act_stats)
+ goto action_info_nest_cancel;
+
+ for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) {
+ /* Remote stats are shown even if not locally supported.
+ * Stats of actions with unspecified limit are shown
+ * though drivers don't need to register unspecified
+ * limit.
+ */
+ if ((!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC &&
+ !devlink_reload_limit_is_supported(devlink, j)) ||
+ devlink_reload_combination_is_invalid(i, j))
+ continue;
+
+ stat_idx = j * __DEVLINK_RELOAD_ACTION_MAX + i;
+ if (!is_remote)
+ value = devlink->stats.reload_stats[stat_idx];
+ else
+ value = devlink->stats.remote_reload_stats[stat_idx];
+ if (devlink_reload_stat_put(msg, j, value))
+ goto action_stats_nest_cancel;
+ }
+ nla_nest_end(msg, act_stats);
+ nla_nest_end(msg, act_info);
+ }
+ nla_nest_end(msg, reload_stats_attr);
+ return 0;
+
+action_stats_nest_cancel:
+ nla_nest_cancel(msg, act_stats);
+action_info_nest_cancel:
+ nla_nest_cancel(msg, act_info);
+nla_put_failure:
+ nla_nest_cancel(msg, reload_stats_attr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ struct nlattr *dev_stats;
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed))
+ goto nla_put_failure;
+
+ dev_stats = nla_nest_start(msg, DEVLINK_ATTR_DEV_STATS);
+ if (!dev_stats)
+ goto nla_put_failure;
+
+ if (devlink_reload_stats_put(msg, devlink, false))
+ goto dev_stats_nest_cancel;
+ if (devlink_reload_stats_put(msg, devlink, true))
+ goto dev_stats_nest_cancel;
+
+ nla_nest_end(msg, dev_stats);
+ genlmsg_end(msg, hdr);
+ return 0;
+
+dev_stats_nest_cancel:
+ nla_nest_cancel(msg, dev_stats);
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL);
+ WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED));
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int
+devlink_nl_cmd_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
+ struct netlink_callback *cb)
+{
+ return devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI);
+}
+
+const struct devlink_cmd devl_cmd_get = {
+ .dump_one = devlink_nl_cmd_get_dump_one,
+};
+
+static void devlink_reload_failed_set(struct devlink *devlink,
+ bool reload_failed)
+{
+ if (devlink->reload_failed == reload_failed)
+ return;
+ devlink->reload_failed = reload_failed;
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+}
+
+bool devlink_is_reload_failed(const struct devlink *devlink)
+{
+ return devlink->reload_failed;
+}
+EXPORT_SYMBOL_GPL(devlink_is_reload_failed);
+
+static void
+__devlink_reload_stats_update(struct devlink *devlink, u32 *reload_stats,
+ enum devlink_reload_limit limit, u32 actions_performed)
+{
+ unsigned long actions = actions_performed;
+ int stat_idx;
+ int action;
+
+ for_each_set_bit(action, &actions, __DEVLINK_RELOAD_ACTION_MAX) {
+ stat_idx = limit * __DEVLINK_RELOAD_ACTION_MAX + action;
+ reload_stats[stat_idx]++;
+ }
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+}
+
+static void
+devlink_reload_stats_update(struct devlink *devlink, enum devlink_reload_limit limit,
+ u32 actions_performed)
+{
+ __devlink_reload_stats_update(devlink, devlink->stats.reload_stats, limit,
+ actions_performed);
+}
+
+/**
+ * devlink_remote_reload_actions_performed - Update devlink on reload actions
+ * performed which are not a direct result of devlink reload call.
+ *
+ * This should be called by a driver after performing reload actions in case it was not
+ * a result of devlink reload call. For example fw_activate was performed as a result
+ * of devlink reload triggered fw_activate on another host.
+ * The motivation for this function is to keep data on reload actions performed on this
+ * function whether it was done due to direct devlink reload call or not.
+ *
+ * @devlink: devlink
+ * @limit: reload limit
+ * @actions_performed: bitmask of actions performed
+ */
+void devlink_remote_reload_actions_performed(struct devlink *devlink,
+ enum devlink_reload_limit limit,
+ u32 actions_performed)
+{
+ if (WARN_ON(!actions_performed ||
+ actions_performed & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) ||
+ actions_performed >= BIT(__DEVLINK_RELOAD_ACTION_MAX) ||
+ limit > DEVLINK_RELOAD_LIMIT_MAX))
+ return;
+
+ __devlink_reload_stats_update(devlink, devlink->stats.remote_reload_stats, limit,
+ actions_performed);
+}
+EXPORT_SYMBOL_GPL(devlink_remote_reload_actions_performed);
+
+static struct net *devlink_netns_get(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct nlattr *netns_pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID];
+ struct nlattr *netns_fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD];
+ struct nlattr *netns_id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID];
+ struct net *net;
+
+ if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) {
+ NL_SET_ERR_MSG_MOD(info->extack, "multiple netns identifying attributes specified");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (netns_pid_attr) {
+ net = get_net_ns_by_pid(nla_get_u32(netns_pid_attr));
+ } else if (netns_fd_attr) {
+ net = get_net_ns_by_fd(nla_get_u32(netns_fd_attr));
+ } else if (netns_id_attr) {
+ net = get_net_ns_by_id(sock_net(skb->sk),
+ nla_get_u32(netns_id_attr));
+ if (!net)
+ net = ERR_PTR(-EINVAL);
+ } else {
+ WARN_ON(1);
+ net = ERR_PTR(-EINVAL);
+ }
+ if (IS_ERR(net)) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Unknown network namespace");
+ return ERR_PTR(-EINVAL);
+ }
+ if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
+ put_net(net);
+ return ERR_PTR(-EPERM);
+ }
+ return net;
+}
+
+static void devlink_reload_netns_change(struct devlink *devlink,
+ struct net *curr_net,
+ struct net *dest_net)
+{
+ /* Userspace needs to be notified about devlink objects
+ * removed from original and entering new network namespace.
+ * The rest of the devlink objects are re-created during
+ * reload process so the notifications are generated separatelly.
+ */
+ devlink_notify_unregister(devlink);
+ move_netdevice_notifier_net(curr_net, dest_net,
+ &devlink->netdevice_nb);
+ write_pnet(&devlink->_net, dest_net);
+ devlink_notify_register(devlink);
+}
+
+int devlink_reload(struct devlink *devlink, struct net *dest_net,
+ enum devlink_reload_action action,
+ enum devlink_reload_limit limit,
+ u32 *actions_performed, struct netlink_ext_ack *extack)
+{
+ u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE];
+ struct net *curr_net;
+ int err;
+
+ memcpy(remote_reload_stats, devlink->stats.remote_reload_stats,
+ sizeof(remote_reload_stats));
+
+ err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack);
+ if (err)
+ return err;
+
+ curr_net = devlink_net(devlink);
+ if (dest_net && !net_eq(dest_net, curr_net))
+ devlink_reload_netns_change(devlink, curr_net, dest_net);
+
+ err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack);
+ devlink_reload_failed_set(devlink, !!err);
+ if (err)
+ return err;
+
+ WARN_ON(!(*actions_performed & BIT(action)));
+ /* Catch driver on updating the remote action within devlink reload */
+ WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats,
+ sizeof(remote_reload_stats)));
+ devlink_reload_stats_update(devlink, limit, *actions_performed);
+ return 0;
+}
+
+static int
+devlink_nl_reload_actions_performed_snd(struct devlink *devlink, u32 actions_performed,
+ enum devlink_command cmd, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &devlink_nl_family, 0, cmd);
+ if (!hdr)
+ goto free_msg;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_bitfield32(msg, DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, actions_performed,
+ actions_performed))
+ goto nla_put_failure;
+ genlmsg_end(msg, hdr);
+
+ return genlmsg_reply(msg, info);
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ enum devlink_reload_action action;
+ enum devlink_reload_limit limit;
+ struct net *dest_net = NULL;
+ u32 actions_performed;
+ int err;
+
+ err = devlink_resources_validate(devlink, NULL, info);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed");
+ return err;
+ }
+
+ if (info->attrs[DEVLINK_ATTR_RELOAD_ACTION])
+ action = nla_get_u8(info->attrs[DEVLINK_ATTR_RELOAD_ACTION]);
+ else
+ action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT;
+
+ if (!devlink_reload_action_is_supported(devlink, action)) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Requested reload action is not supported by the driver");
+ return -EOPNOTSUPP;
+ }
+
+ limit = DEVLINK_RELOAD_LIMIT_UNSPEC;
+ if (info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) {
+ struct nla_bitfield32 limits;
+ u32 limits_selected;
+
+ limits = nla_get_bitfield32(info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]);
+ limits_selected = limits.value & limits.selector;
+ if (!limits_selected) {
+ NL_SET_ERR_MSG_MOD(info->extack, "Invalid limit selected");
+ return -EINVAL;
+ }
+ for (limit = 0 ; limit <= DEVLINK_RELOAD_LIMIT_MAX ; limit++)
+ if (limits_selected & BIT(limit))
+ break;
+ /* UAPI enables multiselection, but currently it is not used */
+ if (limits_selected != BIT(limit)) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Multiselection of limit is not supported");
+ return -EOPNOTSUPP;
+ }
+ if (!devlink_reload_limit_is_supported(devlink, limit)) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Requested limit is not supported by the driver");
+ return -EOPNOTSUPP;
+ }
+ if (devlink_reload_combination_is_invalid(action, limit)) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Requested limit is invalid for this action");
+ return -EINVAL;
+ }
+ }
+ if (info->attrs[DEVLINK_ATTR_NETNS_PID] ||
+ info->attrs[DEVLINK_ATTR_NETNS_FD] ||
+ info->attrs[DEVLINK_ATTR_NETNS_ID]) {
+ dest_net = devlink_netns_get(skb, info);
+ if (IS_ERR(dest_net))
+ return PTR_ERR(dest_net);
+ }
+
+ err = devlink_reload(devlink, dest_net, action, limit, &actions_performed, info->extack);
+
+ if (dest_net)
+ put_net(dest_net);
+
+ if (err)
+ return err;
+ /* For backward compatibility generate reply only if attributes used by user */
+ if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION] && !info->attrs[DEVLINK_ATTR_RELOAD_LIMITS])
+ return 0;
+
+ return devlink_nl_reload_actions_performed_snd(devlink, actions_performed,
+ DEVLINK_CMD_RELOAD, info);
+}
+
+bool devlink_reload_actions_valid(const struct devlink_ops *ops)
+{
+ const struct devlink_reload_combination *comb;
+ int i;
+
+ if (!devlink_reload_supported(ops)) {
+ if (WARN_ON(ops->reload_actions))
+ return false;
+ return true;
+ }
+
+ if (WARN_ON(!ops->reload_actions ||
+ ops->reload_actions & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) ||
+ ops->reload_actions >= BIT(__DEVLINK_RELOAD_ACTION_MAX)))
+ return false;
+
+ if (WARN_ON(ops->reload_limits & BIT(DEVLINK_RELOAD_LIMIT_UNSPEC) ||
+ ops->reload_limits >= BIT(__DEVLINK_RELOAD_LIMIT_MAX)))
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) {
+ comb = &devlink_reload_invalid_combinations[i];
+ if (ops->reload_actions == BIT(comb->action) &&
+ ops->reload_limits == BIT(comb->limit))
+ return false;
+ }
+ return true;
+}
+
+static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ const struct devlink_ops *ops = devlink->ops;
+ enum devlink_eswitch_encap_mode encap_mode;
+ u8 inline_mode;
+ void *hdr;
+ int err = 0;
+ u16 mode;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ err = devlink_nl_put_handle(msg, devlink);
+ if (err)
+ goto nla_put_failure;
+
+ if (ops->eswitch_mode_get) {
+ err = ops->eswitch_mode_get(devlink, &mode);
+ if (err)
+ goto nla_put_failure;
+ err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ if (ops->eswitch_inline_mode_get) {
+ err = ops->eswitch_inline_mode_get(devlink, &inline_mode);
+ if (err)
+ goto nla_put_failure;
+ err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE,
+ inline_mode);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ if (ops->eswitch_encap_mode_get) {
+ err = ops->eswitch_encap_mode_get(devlink, &encap_mode);
+ if (err)
+ goto nla_put_failure;
+ err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_ENCAP_MODE, encap_mode);
+ if (err)
+ goto nla_put_failure;
+ }
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return err;
+}
+
+int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET,
+ info->snd_portid, info->snd_seq, 0);
+
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ const struct devlink_ops *ops = devlink->ops;
+ enum devlink_eswitch_encap_mode encap_mode;
+ u8 inline_mode;
+ int err = 0;
+ u16 mode;
+
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) {
+ if (!ops->eswitch_mode_set)
+ return -EOPNOTSUPP;
+ mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
+ err = devlink_rate_nodes_check(devlink, mode, info->extack);
+ if (err)
+ return err;
+ err = ops->eswitch_mode_set(devlink, mode, info->extack);
+ if (err)
+ return err;
+ }
+
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) {
+ if (!ops->eswitch_inline_mode_set)
+ return -EOPNOTSUPP;
+ inline_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]);
+ err = ops->eswitch_inline_mode_set(devlink, inline_mode,
+ info->extack);
+ if (err)
+ return err;
+ }
+
+ if (info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]) {
+ if (!ops->eswitch_encap_mode_set)
+ return -EOPNOTSUPP;
+ encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]);
+ err = ops->eswitch_encap_mode_set(devlink, encap_mode,
+ info->extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn)
+{
+ if (!req->msg)
+ return 0;
+ return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn);
+}
+EXPORT_SYMBOL_GPL(devlink_info_serial_number_put);
+
+int devlink_info_board_serial_number_put(struct devlink_info_req *req,
+ const char *bsn)
+{
+ if (!req->msg)
+ return 0;
+ return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER,
+ bsn);
+}
+EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put);
+
+static int devlink_info_version_put(struct devlink_info_req *req, int attr,
+ const char *version_name,
+ const char *version_value,
+ enum devlink_info_version_type version_type)
+{
+ struct nlattr *nest;
+ int err;
+
+ if (req->version_cb)
+ req->version_cb(version_name, version_type,
+ req->version_cb_priv);
+
+ if (!req->msg)
+ return 0;
+
+ nest = nla_nest_start_noflag(req->msg, attr);
+ if (!nest)
+ return -EMSGSIZE;
+
+ err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_NAME,
+ version_name);
+ if (err)
+ goto nla_put_failure;
+
+ err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_VALUE,
+ version_value);
+ if (err)
+ goto nla_put_failure;
+
+ nla_nest_end(req->msg, nest);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(req->msg, nest);
+ return err;
+}
+
+int devlink_info_version_fixed_put(struct devlink_info_req *req,
+ const char *version_name,
+ const char *version_value)
+{
+ return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED,
+ version_name, version_value,
+ DEVLINK_INFO_VERSION_TYPE_NONE);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put);
+
+int devlink_info_version_stored_put(struct devlink_info_req *req,
+ const char *version_name,
+ const char *version_value)
+{
+ return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED,
+ version_name, version_value,
+ DEVLINK_INFO_VERSION_TYPE_NONE);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_stored_put);
+
+int devlink_info_version_stored_put_ext(struct devlink_info_req *req,
+ const char *version_name,
+ const char *version_value,
+ enum devlink_info_version_type version_type)
+{
+ return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED,
+ version_name, version_value,
+ version_type);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_stored_put_ext);
+
+int devlink_info_version_running_put(struct devlink_info_req *req,
+ const char *version_name,
+ const char *version_value)
+{
+ return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING,
+ version_name, version_value,
+ DEVLINK_INFO_VERSION_TYPE_NONE);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_running_put);
+
+int devlink_info_version_running_put_ext(struct devlink_info_req *req,
+ const char *version_name,
+ const char *version_value,
+ enum devlink_info_version_type version_type)
+{
+ return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING,
+ version_name, version_value,
+ version_type);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_running_put_ext);
+
+static int devlink_nl_driver_info_get(struct device_driver *drv,
+ struct devlink_info_req *req)
+{
+ if (!drv)
+ return 0;
+
+ if (drv->name[0])
+ return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME,
+ drv->name);
+
+ return 0;
+}
+
+static int
+devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags, struct netlink_ext_ack *extack)
+{
+ struct device *dev = devlink_to_dev(devlink);
+ struct devlink_info_req req = {};
+ void *hdr;
+ int err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ err = -EMSGSIZE;
+ if (devlink_nl_put_handle(msg, devlink))
+ goto err_cancel_msg;
+
+ req.msg = msg;
+ if (devlink->ops->info_get) {
+ err = devlink->ops->info_get(devlink, &req, extack);
+ if (err)
+ goto err_cancel_msg;
+ }
+
+ err = devlink_nl_driver_info_get(dev->driver, &req);
+ if (err)
+ goto err_cancel_msg;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+err_cancel_msg:
+ genlmsg_cancel(msg, hdr);
+ return err;
+}
+
+int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET,
+ info->snd_portid, info->snd_seq, 0,
+ info->extack);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int
+devlink_nl_cmd_info_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
+ struct netlink_callback *cb)
+{
+ int err;
+
+ err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ cb->extack);
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ return err;
+}
+
+const struct devlink_cmd devl_cmd_info_get = {
+ .dump_one = devlink_nl_cmd_info_get_dump_one,
+};
+
+static int devlink_nl_flash_update_fill(struct sk_buff *msg,
+ struct devlink *devlink,
+ enum devlink_command cmd,
+ struct devlink_flash_notify *params)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS)
+ goto out;
+
+ if (params->status_msg &&
+ nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG,
+ params->status_msg))
+ goto nla_put_failure;
+ if (params->component &&
+ nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,
+ params->component))
+ goto nla_put_failure;
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,
+ params->done, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL,
+ params->total, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT,
+ params->timeout, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+out:
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void __devlink_flash_update_notify(struct devlink *devlink,
+ enum devlink_command cmd,
+ struct devlink_flash_notify *params)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE &&
+ cmd != DEVLINK_CMD_FLASH_UPDATE_END &&
+ cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS);
+
+ if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_flash_update_fill(msg, devlink, cmd, params);
+ if (err)
+ goto out_free_msg;
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+ return;
+
+out_free_msg:
+ nlmsg_free(msg);
+}
+
+static void devlink_flash_update_begin_notify(struct devlink *devlink)
+{
+ struct devlink_flash_notify params = {};
+
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE,
+ &params);
+}
+
+static void devlink_flash_update_end_notify(struct devlink *devlink)
+{
+ struct devlink_flash_notify params = {};
+
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE_END,
+ &params);
+}
+
+void devlink_flash_update_status_notify(struct devlink *devlink,
+ const char *status_msg,
+ const char *component,
+ unsigned long done,
+ unsigned long total)
+{
+ struct devlink_flash_notify params = {
+ .status_msg = status_msg,
+ .component = component,
+ .done = done,
+ .total = total,
+ };
+
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE_STATUS,
+ &params);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify);
+
+void devlink_flash_update_timeout_notify(struct devlink *devlink,
+ const char *status_msg,
+ const char *component,
+ unsigned long timeout)
+{
+ struct devlink_flash_notify params = {
+ .status_msg = status_msg,
+ .component = component,
+ .timeout = timeout,
+ };
+
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE_STATUS,
+ &params);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify);
+
+struct devlink_flash_component_lookup_ctx {
+ const char *lookup_name;
+ bool lookup_name_found;
+};
+
+static void
+devlink_flash_component_lookup_cb(const char *version_name,
+ enum devlink_info_version_type version_type,
+ void *version_cb_priv)
+{
+ struct devlink_flash_component_lookup_ctx *lookup_ctx = version_cb_priv;
+
+ if (version_type != DEVLINK_INFO_VERSION_TYPE_COMPONENT ||
+ lookup_ctx->lookup_name_found)
+ return;
+
+ lookup_ctx->lookup_name_found =
+ !strcmp(lookup_ctx->lookup_name, version_name);
+}
+
+static int devlink_flash_component_get(struct devlink *devlink,
+ struct nlattr *nla_component,
+ const char **p_component,
+ struct netlink_ext_ack *extack)
+{
+ struct devlink_flash_component_lookup_ctx lookup_ctx = {};
+ struct devlink_info_req req = {};
+ const char *component;
+ int ret;
+
+ if (!nla_component)
+ return 0;
+
+ component = nla_data(nla_component);
+
+ if (!devlink->ops->info_get) {
+ NL_SET_ERR_MSG_ATTR(extack, nla_component,
+ "component update is not supported by this device");
+ return -EOPNOTSUPP;
+ }
+
+ lookup_ctx.lookup_name = component;
+ req.version_cb = devlink_flash_component_lookup_cb;
+ req.version_cb_priv = &lookup_ctx;
+
+ ret = devlink->ops->info_get(devlink, &req, NULL);
+ if (ret)
+ return ret;
+
+ if (!lookup_ctx.lookup_name_found) {
+ NL_SET_ERR_MSG_ATTR(extack, nla_component,
+ "selected component is not supported by this device");
+ return -EINVAL;
+ }
+ *p_component = component;
+ return 0;
+}
+
+int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *nla_overwrite_mask, *nla_file_name;
+ struct devlink_flash_update_params params = {};
+ struct devlink *devlink = info->user_ptr[0];
+ const char *file_name;
+ u32 supported_params;
+ int ret;
+
+ if (!devlink->ops->flash_update)
+ return -EOPNOTSUPP;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME))
+ return -EINVAL;
+
+ ret = devlink_flash_component_get(devlink,
+ info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT],
+ &params.component, info->extack);
+ if (ret)
+ return ret;
+
+ supported_params = devlink->ops->supported_flash_update_params;
+
+ nla_overwrite_mask = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK];
+ if (nla_overwrite_mask) {
+ struct nla_bitfield32 sections;
+
+ if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK)) {
+ NL_SET_ERR_MSG_ATTR(info->extack, nla_overwrite_mask,
+ "overwrite settings are not supported by this device");
+ return -EOPNOTSUPP;
+ }
+ sections = nla_get_bitfield32(nla_overwrite_mask);
+ params.overwrite_mask = sections.value & sections.selector;
+ }
+
+ nla_file_name = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME];
+ file_name = nla_data(nla_file_name);
+ ret = request_firmware(&params.fw, file_name, devlink->dev);
+ if (ret) {
+ NL_SET_ERR_MSG_ATTR(info->extack, nla_file_name,
+ "failed to locate the requested firmware file");
+ return ret;
+ }
+
+ devlink_flash_update_begin_notify(devlink);
+ ret = devlink->ops->flash_update(devlink, &params, info->extack);
+ devlink_flash_update_end_notify(devlink);
+
+ release_firmware(params.fw);
+
+ return ret;
+}
+
+static void __devlink_compat_running_version(struct devlink *devlink,
+ char *buf, size_t len)
+{
+ struct devlink_info_req req = {};
+ const struct nlattr *nlattr;
+ struct sk_buff *msg;
+ int rem, err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ req.msg = msg;
+ err = devlink->ops->info_get(devlink, &req, NULL);
+ if (err)
+ goto free_msg;
+
+ nla_for_each_attr(nlattr, (void *)msg->data, msg->len, rem) {
+ const struct nlattr *kv;
+ int rem_kv;
+
+ if (nla_type(nlattr) != DEVLINK_ATTR_INFO_VERSION_RUNNING)
+ continue;
+
+ nla_for_each_nested(kv, nlattr, rem_kv) {
+ if (nla_type(kv) != DEVLINK_ATTR_INFO_VERSION_VALUE)
+ continue;
+
+ strlcat(buf, nla_data(kv), len);
+ strlcat(buf, " ", len);
+ }
+ }
+free_msg:
+ nlmsg_free(msg);
+}
+
+void devlink_compat_running_version(struct devlink *devlink,
+ char *buf, size_t len)
+{
+ if (!devlink->ops->info_get)
+ return;
+
+ devl_lock(devlink);
+ if (devl_is_registered(devlink))
+ __devlink_compat_running_version(devlink, buf, len);
+ devl_unlock(devlink);
+}
+
+int devlink_compat_flash_update(struct devlink *devlink, const char *file_name)
+{
+ struct devlink_flash_update_params params = {};
+ int ret;
+
+ devl_lock(devlink);
+ if (!devl_is_registered(devlink)) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+
+ if (!devlink->ops->flash_update) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
+ ret = request_firmware(&params.fw, file_name, devlink->dev);
+ if (ret)
+ goto out_unlock;
+
+ devlink_flash_update_begin_notify(devlink);
+ ret = devlink->ops->flash_update(devlink, &params, NULL);
+ devlink_flash_update_end_notify(devlink);
+
+ release_firmware(params.fw);
+out_unlock:
+ devl_unlock(devlink);
+
+ return ret;
+}
+
+static int
+devlink_nl_selftests_fill(struct sk_buff *msg, struct devlink *devlink,
+ u32 portid, u32 seq, int flags,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *selftests;
+ void *hdr;
+ int err;
+ int i;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags,
+ DEVLINK_CMD_SELFTESTS_GET);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ err = -EMSGSIZE;
+ if (devlink_nl_put_handle(msg, devlink))
+ goto err_cancel_msg;
+
+ selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS);
+ if (!selftests)
+ goto err_cancel_msg;
+
+ for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1;
+ i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) {
+ if (devlink->ops->selftest_check(devlink, i, extack)) {
+ err = nla_put_flag(msg, i);
+ if (err)
+ goto err_cancel_msg;
+ }
+ }
+
+ nla_nest_end(msg, selftests);
+ genlmsg_end(msg, hdr);
+ return 0;
+
+err_cancel_msg:
+ genlmsg_cancel(msg, hdr);
+ return err;
+}
+
+int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ if (!devlink->ops->selftest_check)
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_selftests_fill(msg, devlink, info->snd_portid,
+ info->snd_seq, 0, info->extack);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int
+devlink_nl_cmd_selftests_get_dump_one(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct netlink_callback *cb)
+{
+ if (!devlink->ops->selftest_check)
+ return 0;
+
+ return devlink_nl_selftests_fill(msg, devlink,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ cb->extack);
+}
+
+const struct devlink_cmd devl_cmd_selftests_get = {
+ .dump_one = devlink_nl_cmd_selftests_get_dump_one,
+};
+
+static int devlink_selftest_result_put(struct sk_buff *skb, unsigned int id,
+ enum devlink_selftest_status test_status)
+{
+ struct nlattr *result_attr;
+
+ result_attr = nla_nest_start(skb, DEVLINK_ATTR_SELFTEST_RESULT);
+ if (!result_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, DEVLINK_ATTR_SELFTEST_RESULT_ID, id) ||
+ nla_put_u8(skb, DEVLINK_ATTR_SELFTEST_RESULT_STATUS,
+ test_status))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, result_attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, result_attr);
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = {
+ [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG },
+};
+
+int devlink_nl_cmd_selftests_run(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *tb[DEVLINK_ATTR_SELFTEST_ID_MAX + 1];
+ struct devlink *devlink = info->user_ptr[0];
+ struct nlattr *attrs, *selftests;
+ struct sk_buff *msg;
+ void *hdr;
+ int err;
+ int i;
+
+ if (!devlink->ops->selftest_run || !devlink->ops->selftest_check)
+ return -EOPNOTSUPP;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SELFTESTS))
+ return -EINVAL;
+
+ attrs = info->attrs[DEVLINK_ATTR_SELFTESTS];
+
+ err = nla_parse_nested(tb, DEVLINK_ATTR_SELFTEST_ID_MAX, attrs,
+ devlink_selftest_nl_policy, info->extack);
+ if (err < 0)
+ return err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = -EMSGSIZE;
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &devlink_nl_family, 0, DEVLINK_CMD_SELFTESTS_RUN);
+ if (!hdr)
+ goto free_msg;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto genlmsg_cancel;
+
+ selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS);
+ if (!selftests)
+ goto genlmsg_cancel;
+
+ for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1;
+ i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) {
+ enum devlink_selftest_status test_status;
+
+ if (nla_get_flag(tb[i])) {
+ if (!devlink->ops->selftest_check(devlink, i,
+ info->extack)) {
+ if (devlink_selftest_result_put(msg, i,
+ DEVLINK_SELFTEST_STATUS_SKIP))
+ goto selftests_nest_cancel;
+ continue;
+ }
+
+ test_status = devlink->ops->selftest_run(devlink, i,
+ info->extack);
+ if (devlink_selftest_result_put(msg, i, test_status))
+ goto selftests_nest_cancel;
+ }
+ }
+
+ nla_nest_end(msg, selftests);
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+selftests_nest_cancel:
+ nla_nest_cancel(msg, selftests);
+genlmsg_cancel:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return err;
+}
diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h
index bdd7ad25c7e8..941174e157d4 100644
--- a/net/devlink/devl_internal.h
+++ b/net/devlink/devl_internal.h
@@ -139,6 +139,16 @@ devlink_dump_state(struct netlink_callback *cb)
return (struct devlink_nl_dump_state *)cb->ctx;
}
+static inline int
+devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
+{
+ if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name))
+ return -EMSGSIZE;
+ if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev)))
+ return -EMSGSIZE;
+ return 0;
+}
+
/* Commands */
extern const struct devlink_cmd devl_cmd_get;
extern const struct devlink_cmd devl_cmd_port_get;
@@ -157,6 +167,9 @@ extern const struct devlink_cmd devl_cmd_rate_get;
extern const struct devlink_cmd devl_cmd_linecard_get;
extern const struct devlink_cmd devl_cmd_selftests_get;
+/* Notify */
+void devlink_notify(struct devlink *devlink, enum devlink_command cmd);
+
/* Ports */
int devlink_port_netdevice_event(struct notifier_block *nb,
unsigned long event, void *ptr);
@@ -176,6 +189,12 @@ static inline bool devlink_reload_supported(const struct devlink_ops *ops)
return ops->reload_down && ops->reload_up;
}
+/* Resources */
+struct devlink_resource;
+int devlink_resources_validate(struct devlink *devlink,
+ struct devlink_resource *resource,
+ struct genl_info *info);
+
/* Line cards */
struct devlink_linecard;
@@ -183,8 +202,19 @@ struct devlink_linecard *
devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info);
/* Rates */
+int devlink_rate_nodes_check(struct devlink *devlink, u16 mode,
+ struct netlink_ext_ack *extack);
struct devlink_rate *
devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info);
struct devlink_rate *
devlink_rate_node_get_from_info(struct devlink *devlink,
struct genl_info *info);
+/* Devlink nl cmds */
+int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_cmd_flash_update(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_cmd_selftests_run(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c
index 056d9ca14a3d..f05ab093d231 100644
--- a/net/devlink/leftover.c
+++ b/net/devlink/leftover.c
@@ -143,10 +143,6 @@ static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_
NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK),
};
-static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = {
- [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG },
-};
-
#define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \
WARN_ON_ONCE(!(devlink_port)->registered)
#define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \
@@ -596,15 +592,6 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id)
return NULL;
}
-static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
-{
- if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name))
- return -EMSGSIZE;
- if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev)))
- return -EMSGSIZE;
- return 0;
-}
-
static int devlink_nl_put_nested_handle(struct sk_buff *msg, struct devlink *devlink)
{
struct nlattr *nested_attr;
@@ -641,185 +628,6 @@ size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port)
+ nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */
}
-struct devlink_reload_combination {
- enum devlink_reload_action action;
- enum devlink_reload_limit limit;
-};
-
-static const struct devlink_reload_combination devlink_reload_invalid_combinations[] = {
- {
- /* can't reinitialize driver with no down time */
- .action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
- .limit = DEVLINK_RELOAD_LIMIT_NO_RESET,
- },
-};
-
-static bool
-devlink_reload_combination_is_invalid(enum devlink_reload_action action,
- enum devlink_reload_limit limit)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++)
- if (devlink_reload_invalid_combinations[i].action == action &&
- devlink_reload_invalid_combinations[i].limit == limit)
- return true;
- return false;
-}
-
-static bool
-devlink_reload_action_is_supported(struct devlink *devlink, enum devlink_reload_action action)
-{
- return test_bit(action, &devlink->ops->reload_actions);
-}
-
-static bool
-devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_limit limit)
-{
- return test_bit(limit, &devlink->ops->reload_limits);
-}
-
-static int devlink_reload_stat_put(struct sk_buff *msg,
- enum devlink_reload_limit limit, u32 value)
-{
- struct nlattr *reload_stats_entry;
-
- reload_stats_entry = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS_ENTRY);
- if (!reload_stats_entry)
- return -EMSGSIZE;
-
- if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) ||
- nla_put_u32(msg, DEVLINK_ATTR_RELOAD_STATS_VALUE, value))
- goto nla_put_failure;
- nla_nest_end(msg, reload_stats_entry);
- return 0;
-
-nla_put_failure:
- nla_nest_cancel(msg, reload_stats_entry);
- return -EMSGSIZE;
-}
-
-static int devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote)
-{
- struct nlattr *reload_stats_attr, *act_info, *act_stats;
- int i, j, stat_idx;
- u32 value;
-
- if (!is_remote)
- reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS);
- else
- reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_REMOTE_RELOAD_STATS);
-
- if (!reload_stats_attr)
- return -EMSGSIZE;
-
- for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) {
- if ((!is_remote &&
- !devlink_reload_action_is_supported(devlink, i)) ||
- i == DEVLINK_RELOAD_ACTION_UNSPEC)
- continue;
- act_info = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_INFO);
- if (!act_info)
- goto nla_put_failure;
-
- if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, i))
- goto action_info_nest_cancel;
- act_stats = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_STATS);
- if (!act_stats)
- goto action_info_nest_cancel;
-
- for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) {
- /* Remote stats are shown even if not locally supported.
- * Stats of actions with unspecified limit are shown
- * though drivers don't need to register unspecified
- * limit.
- */
- if ((!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC &&
- !devlink_reload_limit_is_supported(devlink, j)) ||
- devlink_reload_combination_is_invalid(i, j))
- continue;
-
- stat_idx = j * __DEVLINK_RELOAD_ACTION_MAX + i;
- if (!is_remote)
- value = devlink->stats.reload_stats[stat_idx];
- else
- value = devlink->stats.remote_reload_stats[stat_idx];
- if (devlink_reload_stat_put(msg, j, value))
- goto action_stats_nest_cancel;
- }
- nla_nest_end(msg, act_stats);
- nla_nest_end(msg, act_info);
- }
- nla_nest_end(msg, reload_stats_attr);
- return 0;
-
-action_stats_nest_cancel:
- nla_nest_cancel(msg, act_stats);
-action_info_nest_cancel:
- nla_nest_cancel(msg, act_info);
-nla_put_failure:
- nla_nest_cancel(msg, reload_stats_attr);
- return -EMSGSIZE;
-}
-
-static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
- enum devlink_command cmd, u32 portid,
- u32 seq, int flags)
-{
- struct nlattr *dev_stats;
- void *hdr;
-
- hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
- if (!hdr)
- return -EMSGSIZE;
-
- if (devlink_nl_put_handle(msg, devlink))
- goto nla_put_failure;
- if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed))
- goto nla_put_failure;
-
- dev_stats = nla_nest_start(msg, DEVLINK_ATTR_DEV_STATS);
- if (!dev_stats)
- goto nla_put_failure;
-
- if (devlink_reload_stats_put(msg, devlink, false))
- goto dev_stats_nest_cancel;
- if (devlink_reload_stats_put(msg, devlink, true))
- goto dev_stats_nest_cancel;
-
- nla_nest_end(msg, dev_stats);
- genlmsg_end(msg, hdr);
- return 0;
-
-dev_stats_nest_cancel:
- nla_nest_cancel(msg, dev_stats);
-nla_put_failure:
- genlmsg_cancel(msg, hdr);
- return -EMSGSIZE;
-}
-
-static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
-{
- struct sk_buff *msg;
- int err;
-
- WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL);
- WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED));
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return;
-
- err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0);
- if (err) {
- nlmsg_free(msg);
- return;
- }
-
- genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
- msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
-}
-
static int devlink_nl_port_attrs_put(struct sk_buff *msg,
struct devlink_port *devlink_port)
{
@@ -1274,39 +1082,6 @@ devlink_rate_is_parent_node(struct devlink_rate *devlink_rate,
return false;
}
-static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- struct sk_buff *msg;
- int err;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return -ENOMEM;
-
- err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
- info->snd_portid, info->snd_seq, 0);
- if (err) {
- nlmsg_free(msg);
- return err;
- }
-
- return genlmsg_reply(msg, info);
-}
-
-static int
-devlink_nl_cmd_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
- struct netlink_callback *cb)
-{
- return devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI);
-}
-
-const struct devlink_cmd devl_cmd_get = {
- .dump_one = devlink_nl_cmd_get_dump_one,
-};
-
static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb,
struct genl_info *info)
{
@@ -3064,85 +2839,8 @@ static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
return -EOPNOTSUPP;
}
-static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
- enum devlink_command cmd, u32 portid,
- u32 seq, int flags)
-{
- const struct devlink_ops *ops = devlink->ops;
- enum devlink_eswitch_encap_mode encap_mode;
- u8 inline_mode;
- void *hdr;
- int err = 0;
- u16 mode;
-
- hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
- if (!hdr)
- return -EMSGSIZE;
-
- err = devlink_nl_put_handle(msg, devlink);
- if (err)
- goto nla_put_failure;
-
- if (ops->eswitch_mode_get) {
- err = ops->eswitch_mode_get(devlink, &mode);
- if (err)
- goto nla_put_failure;
- err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
- if (err)
- goto nla_put_failure;
- }
-
- if (ops->eswitch_inline_mode_get) {
- err = ops->eswitch_inline_mode_get(devlink, &inline_mode);
- if (err)
- goto nla_put_failure;
- err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE,
- inline_mode);
- if (err)
- goto nla_put_failure;
- }
-
- if (ops->eswitch_encap_mode_get) {
- err = ops->eswitch_encap_mode_get(devlink, &encap_mode);
- if (err)
- goto nla_put_failure;
- err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_ENCAP_MODE, encap_mode);
- if (err)
- goto nla_put_failure;
- }
-
- genlmsg_end(msg, hdr);
- return 0;
-
-nla_put_failure:
- genlmsg_cancel(msg, hdr);
- return err;
-}
-
-static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- struct sk_buff *msg;
- int err;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return -ENOMEM;
-
- err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET,
- info->snd_portid, info->snd_seq, 0);
-
- if (err) {
- nlmsg_free(msg);
- return err;
- }
-
- return genlmsg_reply(msg, info);
-}
-
-static int devlink_rate_nodes_check(struct devlink *devlink, u16 mode,
- struct netlink_ext_ack *extack)
+int devlink_rate_nodes_check(struct devlink *devlink, u16 mode,
+ struct netlink_ext_ack *extack)
{
struct devlink_rate *devlink_rate;
@@ -3154,52 +2852,6 @@ static int devlink_rate_nodes_check(struct devlink *devlink, u16 mode,
return 0;
}
-static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- const struct devlink_ops *ops = devlink->ops;
- enum devlink_eswitch_encap_mode encap_mode;
- u8 inline_mode;
- int err = 0;
- u16 mode;
-
- if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) {
- if (!ops->eswitch_mode_set)
- return -EOPNOTSUPP;
- mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
- err = devlink_rate_nodes_check(devlink, mode, info->extack);
- if (err)
- return err;
- err = ops->eswitch_mode_set(devlink, mode, info->extack);
- if (err)
- return err;
- }
-
- if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) {
- if (!ops->eswitch_inline_mode_set)
- return -EOPNOTSUPP;
- inline_mode = nla_get_u8(
- info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]);
- err = ops->eswitch_inline_mode_set(devlink, inline_mode,
- info->extack);
- if (err)
- return err;
- }
-
- if (info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]) {
- if (!ops->eswitch_encap_mode_set)
- return -EOPNOTSUPP;
- encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]);
- err = ops->eswitch_encap_mode_set(devlink, encap_mode,
- info->extack);
- if (err)
- return err;
- }
-
- return 0;
-}
-
int devlink_dpipe_match_put(struct sk_buff *skb,
struct devlink_dpipe_match *match)
{
@@ -4170,10 +3822,9 @@ static int devlink_nl_cmd_resource_dump(struct sk_buff *skb,
return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0);
}
-static int
-devlink_resources_validate(struct devlink *devlink,
- struct devlink_resource *resource,
- struct genl_info *info)
+int devlink_resources_validate(struct devlink *devlink,
+ struct devlink_resource *resource,
+ struct genl_info *info)
{
struct list_head *resource_list;
int err = 0;
@@ -4193,698 +3844,6 @@ devlink_resources_validate(struct devlink *devlink,
return err;
}
-static struct net *devlink_netns_get(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct nlattr *netns_pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID];
- struct nlattr *netns_fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD];
- struct nlattr *netns_id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID];
- struct net *net;
-
- if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) {
- NL_SET_ERR_MSG_MOD(info->extack, "multiple netns identifying attributes specified");
- return ERR_PTR(-EINVAL);
- }
-
- if (netns_pid_attr) {
- net = get_net_ns_by_pid(nla_get_u32(netns_pid_attr));
- } else if (netns_fd_attr) {
- net = get_net_ns_by_fd(nla_get_u32(netns_fd_attr));
- } else if (netns_id_attr) {
- net = get_net_ns_by_id(sock_net(skb->sk),
- nla_get_u32(netns_id_attr));
- if (!net)
- net = ERR_PTR(-EINVAL);
- } else {
- WARN_ON(1);
- net = ERR_PTR(-EINVAL);
- }
- if (IS_ERR(net)) {
- NL_SET_ERR_MSG_MOD(info->extack, "Unknown network namespace");
- return ERR_PTR(-EINVAL);
- }
- if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
- put_net(net);
- return ERR_PTR(-EPERM);
- }
- return net;
-}
-
-static void devlink_reload_netns_change(struct devlink *devlink,
- struct net *curr_net,
- struct net *dest_net)
-{
- /* Userspace needs to be notified about devlink objects
- * removed from original and entering new network namespace.
- * The rest of the devlink objects are re-created during
- * reload process so the notifications are generated separatelly.
- */
- devlink_notify_unregister(devlink);
- move_netdevice_notifier_net(curr_net, dest_net,
- &devlink->netdevice_nb);
- write_pnet(&devlink->_net, dest_net);
- devlink_notify_register(devlink);
-}
-
-static void devlink_reload_failed_set(struct devlink *devlink,
- bool reload_failed)
-{
- if (devlink->reload_failed == reload_failed)
- return;
- devlink->reload_failed = reload_failed;
- devlink_notify(devlink, DEVLINK_CMD_NEW);
-}
-
-bool devlink_is_reload_failed(const struct devlink *devlink)
-{
- return devlink->reload_failed;
-}
-EXPORT_SYMBOL_GPL(devlink_is_reload_failed);
-
-static void
-__devlink_reload_stats_update(struct devlink *devlink, u32 *reload_stats,
- enum devlink_reload_limit limit, u32 actions_performed)
-{
- unsigned long actions = actions_performed;
- int stat_idx;
- int action;
-
- for_each_set_bit(action, &actions, __DEVLINK_RELOAD_ACTION_MAX) {
- stat_idx = limit * __DEVLINK_RELOAD_ACTION_MAX + action;
- reload_stats[stat_idx]++;
- }
- devlink_notify(devlink, DEVLINK_CMD_NEW);
-}
-
-static void
-devlink_reload_stats_update(struct devlink *devlink, enum devlink_reload_limit limit,
- u32 actions_performed)
-{
- __devlink_reload_stats_update(devlink, devlink->stats.reload_stats, limit,
- actions_performed);
-}
-
-/**
- * devlink_remote_reload_actions_performed - Update devlink on reload actions
- * performed which are not a direct result of devlink reload call.
- *
- * This should be called by a driver after performing reload actions in case it was not
- * a result of devlink reload call. For example fw_activate was performed as a result
- * of devlink reload triggered fw_activate on another host.
- * The motivation for this function is to keep data on reload actions performed on this
- * function whether it was done due to direct devlink reload call or not.
- *
- * @devlink: devlink
- * @limit: reload limit
- * @actions_performed: bitmask of actions performed
- */
-void devlink_remote_reload_actions_performed(struct devlink *devlink,
- enum devlink_reload_limit limit,
- u32 actions_performed)
-{
- if (WARN_ON(!actions_performed ||
- actions_performed & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) ||
- actions_performed >= BIT(__DEVLINK_RELOAD_ACTION_MAX) ||
- limit > DEVLINK_RELOAD_LIMIT_MAX))
- return;
-
- __devlink_reload_stats_update(devlink, devlink->stats.remote_reload_stats, limit,
- actions_performed);
-}
-EXPORT_SYMBOL_GPL(devlink_remote_reload_actions_performed);
-
-int devlink_reload(struct devlink *devlink, struct net *dest_net,
- enum devlink_reload_action action,
- enum devlink_reload_limit limit,
- u32 *actions_performed, struct netlink_ext_ack *extack)
-{
- u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE];
- struct net *curr_net;
- int err;
-
- memcpy(remote_reload_stats, devlink->stats.remote_reload_stats,
- sizeof(remote_reload_stats));
-
- err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack);
- if (err)
- return err;
-
- curr_net = devlink_net(devlink);
- if (dest_net && !net_eq(dest_net, curr_net))
- devlink_reload_netns_change(devlink, curr_net, dest_net);
-
- err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack);
- devlink_reload_failed_set(devlink, !!err);
- if (err)
- return err;
-
- WARN_ON(!(*actions_performed & BIT(action)));
- /* Catch driver on updating the remote action within devlink reload */
- WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats,
- sizeof(remote_reload_stats)));
- devlink_reload_stats_update(devlink, limit, *actions_performed);
- return 0;
-}
-
-static int
-devlink_nl_reload_actions_performed_snd(struct devlink *devlink, u32 actions_performed,
- enum devlink_command cmd, struct genl_info *info)
-{
- struct sk_buff *msg;
- void *hdr;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return -ENOMEM;
-
- hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq, &devlink_nl_family, 0, cmd);
- if (!hdr)
- goto free_msg;
-
- if (devlink_nl_put_handle(msg, devlink))
- goto nla_put_failure;
-
- if (nla_put_bitfield32(msg, DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED, actions_performed,
- actions_performed))
- goto nla_put_failure;
- genlmsg_end(msg, hdr);
-
- return genlmsg_reply(msg, info);
-
-nla_put_failure:
- genlmsg_cancel(msg, hdr);
-free_msg:
- nlmsg_free(msg);
- return -EMSGSIZE;
-}
-
-static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- enum devlink_reload_action action;
- enum devlink_reload_limit limit;
- struct net *dest_net = NULL;
- u32 actions_performed;
- int err;
-
- err = devlink_resources_validate(devlink, NULL, info);
- if (err) {
- NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed");
- return err;
- }
-
- if (info->attrs[DEVLINK_ATTR_RELOAD_ACTION])
- action = nla_get_u8(info->attrs[DEVLINK_ATTR_RELOAD_ACTION]);
- else
- action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT;
-
- if (!devlink_reload_action_is_supported(devlink, action)) {
- NL_SET_ERR_MSG_MOD(info->extack,
- "Requested reload action is not supported by the driver");
- return -EOPNOTSUPP;
- }
-
- limit = DEVLINK_RELOAD_LIMIT_UNSPEC;
- if (info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) {
- struct nla_bitfield32 limits;
- u32 limits_selected;
-
- limits = nla_get_bitfield32(info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]);
- limits_selected = limits.value & limits.selector;
- if (!limits_selected) {
- NL_SET_ERR_MSG_MOD(info->extack, "Invalid limit selected");
- return -EINVAL;
- }
- for (limit = 0 ; limit <= DEVLINK_RELOAD_LIMIT_MAX ; limit++)
- if (limits_selected & BIT(limit))
- break;
- /* UAPI enables multiselection, but currently it is not used */
- if (limits_selected != BIT(limit)) {
- NL_SET_ERR_MSG_MOD(info->extack,
- "Multiselection of limit is not supported");
- return -EOPNOTSUPP;
- }
- if (!devlink_reload_limit_is_supported(devlink, limit)) {
- NL_SET_ERR_MSG_MOD(info->extack,
- "Requested limit is not supported by the driver");
- return -EOPNOTSUPP;
- }
- if (devlink_reload_combination_is_invalid(action, limit)) {
- NL_SET_ERR_MSG_MOD(info->extack,
- "Requested limit is invalid for this action");
- return -EINVAL;
- }
- }
- if (info->attrs[DEVLINK_ATTR_NETNS_PID] ||
- info->attrs[DEVLINK_ATTR_NETNS_FD] ||
- info->attrs[DEVLINK_ATTR_NETNS_ID]) {
- dest_net = devlink_netns_get(skb, info);
- if (IS_ERR(dest_net))
- return PTR_ERR(dest_net);
- }
-
- err = devlink_reload(devlink, dest_net, action, limit, &actions_performed, info->extack);
-
- if (dest_net)
- put_net(dest_net);
-
- if (err)
- return err;
- /* For backward compatibility generate reply only if attributes used by user */
- if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION] && !info->attrs[DEVLINK_ATTR_RELOAD_LIMITS])
- return 0;
-
- return devlink_nl_reload_actions_performed_snd(devlink, actions_performed,
- DEVLINK_CMD_RELOAD, info);
-}
-
-static int devlink_nl_flash_update_fill(struct sk_buff *msg,
- struct devlink *devlink,
- enum devlink_command cmd,
- struct devlink_flash_notify *params)
-{
- void *hdr;
-
- hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd);
- if (!hdr)
- return -EMSGSIZE;
-
- if (devlink_nl_put_handle(msg, devlink))
- goto nla_put_failure;
-
- if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS)
- goto out;
-
- if (params->status_msg &&
- nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG,
- params->status_msg))
- goto nla_put_failure;
- if (params->component &&
- nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,
- params->component))
- goto nla_put_failure;
- if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,
- params->done, DEVLINK_ATTR_PAD))
- goto nla_put_failure;
- if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL,
- params->total, DEVLINK_ATTR_PAD))
- goto nla_put_failure;
- if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT,
- params->timeout, DEVLINK_ATTR_PAD))
- goto nla_put_failure;
-
-out:
- genlmsg_end(msg, hdr);
- return 0;
-
-nla_put_failure:
- genlmsg_cancel(msg, hdr);
- return -EMSGSIZE;
-}
-
-static void __devlink_flash_update_notify(struct devlink *devlink,
- enum devlink_command cmd,
- struct devlink_flash_notify *params)
-{
- struct sk_buff *msg;
- int err;
-
- WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE &&
- cmd != DEVLINK_CMD_FLASH_UPDATE_END &&
- cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS);
-
- if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED))
- return;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return;
-
- err = devlink_nl_flash_update_fill(msg, devlink, cmd, params);
- if (err)
- goto out_free_msg;
-
- genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
- msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
- return;
-
-out_free_msg:
- nlmsg_free(msg);
-}
-
-static void devlink_flash_update_begin_notify(struct devlink *devlink)
-{
- struct devlink_flash_notify params = {};
-
- __devlink_flash_update_notify(devlink,
- DEVLINK_CMD_FLASH_UPDATE,
- &params);
-}
-
-static void devlink_flash_update_end_notify(struct devlink *devlink)
-{
- struct devlink_flash_notify params = {};
-
- __devlink_flash_update_notify(devlink,
- DEVLINK_CMD_FLASH_UPDATE_END,
- &params);
-}
-
-void devlink_flash_update_status_notify(struct devlink *devlink,
- const char *status_msg,
- const char *component,
- unsigned long done,
- unsigned long total)
-{
- struct devlink_flash_notify params = {
- .status_msg = status_msg,
- .component = component,
- .done = done,
- .total = total,
- };
-
- __devlink_flash_update_notify(devlink,
- DEVLINK_CMD_FLASH_UPDATE_STATUS,
- &params);
-}
-EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify);
-
-void devlink_flash_update_timeout_notify(struct devlink *devlink,
- const char *status_msg,
- const char *component,
- unsigned long timeout)
-{
- struct devlink_flash_notify params = {
- .status_msg = status_msg,
- .component = component,
- .timeout = timeout,
- };
-
- __devlink_flash_update_notify(devlink,
- DEVLINK_CMD_FLASH_UPDATE_STATUS,
- &params);
-}
-EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify);
-
-struct devlink_info_req {
- struct sk_buff *msg;
- void (*version_cb)(const char *version_name,
- enum devlink_info_version_type version_type,
- void *version_cb_priv);
- void *version_cb_priv;
-};
-
-struct devlink_flash_component_lookup_ctx {
- const char *lookup_name;
- bool lookup_name_found;
-};
-
-static void
-devlink_flash_component_lookup_cb(const char *version_name,
- enum devlink_info_version_type version_type,
- void *version_cb_priv)
-{
- struct devlink_flash_component_lookup_ctx *lookup_ctx = version_cb_priv;
-
- if (version_type != DEVLINK_INFO_VERSION_TYPE_COMPONENT ||
- lookup_ctx->lookup_name_found)
- return;
-
- lookup_ctx->lookup_name_found =
- !strcmp(lookup_ctx->lookup_name, version_name);
-}
-
-static int devlink_flash_component_get(struct devlink *devlink,
- struct nlattr *nla_component,
- const char **p_component,
- struct netlink_ext_ack *extack)
-{
- struct devlink_flash_component_lookup_ctx lookup_ctx = {};
- struct devlink_info_req req = {};
- const char *component;
- int ret;
-
- if (!nla_component)
- return 0;
-
- component = nla_data(nla_component);
-
- if (!devlink->ops->info_get) {
- NL_SET_ERR_MSG_ATTR(extack, nla_component,
- "component update is not supported by this device");
- return -EOPNOTSUPP;
- }
-
- lookup_ctx.lookup_name = component;
- req.version_cb = devlink_flash_component_lookup_cb;
- req.version_cb_priv = &lookup_ctx;
-
- ret = devlink->ops->info_get(devlink, &req, NULL);
- if (ret)
- return ret;
-
- if (!lookup_ctx.lookup_name_found) {
- NL_SET_ERR_MSG_ATTR(extack, nla_component,
- "selected component is not supported by this device");
- return -EINVAL;
- }
- *p_component = component;
- return 0;
-}
-
-static int devlink_nl_cmd_flash_update(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct nlattr *nla_overwrite_mask, *nla_file_name;
- struct devlink_flash_update_params params = {};
- struct devlink *devlink = info->user_ptr[0];
- const char *file_name;
- u32 supported_params;
- int ret;
-
- if (!devlink->ops->flash_update)
- return -EOPNOTSUPP;
-
- if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME))
- return -EINVAL;
-
- ret = devlink_flash_component_get(devlink,
- info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT],
- &params.component, info->extack);
- if (ret)
- return ret;
-
- supported_params = devlink->ops->supported_flash_update_params;
-
- nla_overwrite_mask = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK];
- if (nla_overwrite_mask) {
- struct nla_bitfield32 sections;
-
- if (!(supported_params & DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK)) {
- NL_SET_ERR_MSG_ATTR(info->extack, nla_overwrite_mask,
- "overwrite settings are not supported by this device");
- return -EOPNOTSUPP;
- }
- sections = nla_get_bitfield32(nla_overwrite_mask);
- params.overwrite_mask = sections.value & sections.selector;
- }
-
- nla_file_name = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME];
- file_name = nla_data(nla_file_name);
- ret = request_firmware(&params.fw, file_name, devlink->dev);
- if (ret) {
- NL_SET_ERR_MSG_ATTR(info->extack, nla_file_name, "failed to locate the requested firmware file");
- return ret;
- }
-
- devlink_flash_update_begin_notify(devlink);
- ret = devlink->ops->flash_update(devlink, &params, info->extack);
- devlink_flash_update_end_notify(devlink);
-
- release_firmware(params.fw);
-
- return ret;
-}
-
-static int
-devlink_nl_selftests_fill(struct sk_buff *msg, struct devlink *devlink,
- u32 portid, u32 seq, int flags,
- struct netlink_ext_ack *extack)
-{
- struct nlattr *selftests;
- void *hdr;
- int err;
- int i;
-
- hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags,
- DEVLINK_CMD_SELFTESTS_GET);
- if (!hdr)
- return -EMSGSIZE;
-
- err = -EMSGSIZE;
- if (devlink_nl_put_handle(msg, devlink))
- goto err_cancel_msg;
-
- selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS);
- if (!selftests)
- goto err_cancel_msg;
-
- for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1;
- i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) {
- if (devlink->ops->selftest_check(devlink, i, extack)) {
- err = nla_put_flag(msg, i);
- if (err)
- goto err_cancel_msg;
- }
- }
-
- nla_nest_end(msg, selftests);
- genlmsg_end(msg, hdr);
- return 0;
-
-err_cancel_msg:
- genlmsg_cancel(msg, hdr);
- return err;
-}
-
-static int devlink_nl_cmd_selftests_get_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- struct sk_buff *msg;
- int err;
-
- if (!devlink->ops->selftest_check)
- return -EOPNOTSUPP;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return -ENOMEM;
-
- err = devlink_nl_selftests_fill(msg, devlink, info->snd_portid,
- info->snd_seq, 0, info->extack);
- if (err) {
- nlmsg_free(msg);
- return err;
- }
-
- return genlmsg_reply(msg, info);
-}
-
-static int
-devlink_nl_cmd_selftests_get_dump_one(struct sk_buff *msg,
- struct devlink *devlink,
- struct netlink_callback *cb)
-{
- if (!devlink->ops->selftest_check)
- return 0;
-
- return devlink_nl_selftests_fill(msg, devlink,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI,
- cb->extack);
-}
-
-const struct devlink_cmd devl_cmd_selftests_get = {
- .dump_one = devlink_nl_cmd_selftests_get_dump_one,
-};
-
-static int devlink_selftest_result_put(struct sk_buff *skb, unsigned int id,
- enum devlink_selftest_status test_status)
-{
- struct nlattr *result_attr;
-
- result_attr = nla_nest_start(skb, DEVLINK_ATTR_SELFTEST_RESULT);
- if (!result_attr)
- return -EMSGSIZE;
-
- if (nla_put_u32(skb, DEVLINK_ATTR_SELFTEST_RESULT_ID, id) ||
- nla_put_u8(skb, DEVLINK_ATTR_SELFTEST_RESULT_STATUS,
- test_status))
- goto nla_put_failure;
-
- nla_nest_end(skb, result_attr);
- return 0;
-
-nla_put_failure:
- nla_nest_cancel(skb, result_attr);
- return -EMSGSIZE;
-}
-
-static int devlink_nl_cmd_selftests_run(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct nlattr *tb[DEVLINK_ATTR_SELFTEST_ID_MAX + 1];
- struct devlink *devlink = info->user_ptr[0];
- struct nlattr *attrs, *selftests;
- struct sk_buff *msg;
- void *hdr;
- int err;
- int i;
-
- if (!devlink->ops->selftest_run || !devlink->ops->selftest_check)
- return -EOPNOTSUPP;
-
- if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SELFTESTS))
- return -EINVAL;
-
- attrs = info->attrs[DEVLINK_ATTR_SELFTESTS];
-
- err = nla_parse_nested(tb, DEVLINK_ATTR_SELFTEST_ID_MAX, attrs,
- devlink_selftest_nl_policy, info->extack);
- if (err < 0)
- return err;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return -ENOMEM;
-
- err = -EMSGSIZE;
- hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
- &devlink_nl_family, 0, DEVLINK_CMD_SELFTESTS_RUN);
- if (!hdr)
- goto free_msg;
-
- if (devlink_nl_put_handle(msg, devlink))
- goto genlmsg_cancel;
-
- selftests = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS);
- if (!selftests)
- goto genlmsg_cancel;
-
- for (i = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1;
- i <= DEVLINK_ATTR_SELFTEST_ID_MAX; i++) {
- enum devlink_selftest_status test_status;
-
- if (nla_get_flag(tb[i])) {
- if (!devlink->ops->selftest_check(devlink, i,
- info->extack)) {
- if (devlink_selftest_result_put(msg, i,
- DEVLINK_SELFTEST_STATUS_SKIP))
- goto selftests_nest_cancel;
- continue;
- }
-
- test_status = devlink->ops->selftest_run(devlink, i,
- info->extack);
- if (devlink_selftest_result_put(msg, i, test_status))
- goto selftests_nest_cancel;
- }
- }
-
- nla_nest_end(msg, selftests);
- genlmsg_end(msg, hdr);
- return genlmsg_reply(msg, info);
-
-selftests_nest_cancel:
- nla_nest_cancel(msg, selftests);
-genlmsg_cancel:
- genlmsg_cancel(msg, hdr);
-free_msg:
- nlmsg_free(msg);
- return err;
-}
-
static const struct devlink_param devlink_param_generic[] = {
{
.id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET,
@@ -6430,205 +5389,6 @@ out_unlock:
return err;
}
-int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn)
-{
- if (!req->msg)
- return 0;
- return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn);
-}
-EXPORT_SYMBOL_GPL(devlink_info_serial_number_put);
-
-int devlink_info_board_serial_number_put(struct devlink_info_req *req,
- const char *bsn)
-{
- if (!req->msg)
- return 0;
- return nla_put_string(req->msg, DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER,
- bsn);
-}
-EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put);
-
-static int devlink_info_version_put(struct devlink_info_req *req, int attr,
- const char *version_name,
- const char *version_value,
- enum devlink_info_version_type version_type)
-{
- struct nlattr *nest;
- int err;
-
- if (req->version_cb)
- req->version_cb(version_name, version_type,
- req->version_cb_priv);
-
- if (!req->msg)
- return 0;
-
- nest = nla_nest_start_noflag(req->msg, attr);
- if (!nest)
- return -EMSGSIZE;
-
- err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_NAME,
- version_name);
- if (err)
- goto nla_put_failure;
-
- err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_VALUE,
- version_value);
- if (err)
- goto nla_put_failure;
-
- nla_nest_end(req->msg, nest);
-
- return 0;
-
-nla_put_failure:
- nla_nest_cancel(req->msg, nest);
- return err;
-}
-
-int devlink_info_version_fixed_put(struct devlink_info_req *req,
- const char *version_name,
- const char *version_value)
-{
- return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED,
- version_name, version_value,
- DEVLINK_INFO_VERSION_TYPE_NONE);
-}
-EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put);
-
-int devlink_info_version_stored_put(struct devlink_info_req *req,
- const char *version_name,
- const char *version_value)
-{
- return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED,
- version_name, version_value,
- DEVLINK_INFO_VERSION_TYPE_NONE);
-}
-EXPORT_SYMBOL_GPL(devlink_info_version_stored_put);
-
-int devlink_info_version_stored_put_ext(struct devlink_info_req *req,
- const char *version_name,
- const char *version_value,
- enum devlink_info_version_type version_type)
-{
- return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED,
- version_name, version_value,
- version_type);
-}
-EXPORT_SYMBOL_GPL(devlink_info_version_stored_put_ext);
-
-int devlink_info_version_running_put(struct devlink_info_req *req,
- const char *version_name,
- const char *version_value)
-{
- return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING,
- version_name, version_value,
- DEVLINK_INFO_VERSION_TYPE_NONE);
-}
-EXPORT_SYMBOL_GPL(devlink_info_version_running_put);
-
-int devlink_info_version_running_put_ext(struct devlink_info_req *req,
- const char *version_name,
- const char *version_value,
- enum devlink_info_version_type version_type)
-{
- return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING,
- version_name, version_value,
- version_type);
-}
-EXPORT_SYMBOL_GPL(devlink_info_version_running_put_ext);
-
-static int devlink_nl_driver_info_get(struct device_driver *drv,
- struct devlink_info_req *req)
-{
- if (!drv)
- return 0;
-
- if (drv->name[0])
- return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME,
- drv->name);
-
- return 0;
-}
-
-static int
-devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink,
- enum devlink_command cmd, u32 portid,
- u32 seq, int flags, struct netlink_ext_ack *extack)
-{
- struct device *dev = devlink_to_dev(devlink);
- struct devlink_info_req req = {};
- void *hdr;
- int err;
-
- hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
- if (!hdr)
- return -EMSGSIZE;
-
- err = -EMSGSIZE;
- if (devlink_nl_put_handle(msg, devlink))
- goto err_cancel_msg;
-
- req.msg = msg;
- if (devlink->ops->info_get) {
- err = devlink->ops->info_get(devlink, &req, extack);
- if (err)
- goto err_cancel_msg;
- }
-
- err = devlink_nl_driver_info_get(dev->driver, &req);
- if (err)
- goto err_cancel_msg;
-
- genlmsg_end(msg, hdr);
- return 0;
-
-err_cancel_msg:
- genlmsg_cancel(msg, hdr);
- return err;
-}
-
-static int devlink_nl_cmd_info_get_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- struct sk_buff *msg;
- int err;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return -ENOMEM;
-
- err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET,
- info->snd_portid, info->snd_seq, 0,
- info->extack);
- if (err) {
- nlmsg_free(msg);
- return err;
- }
-
- return genlmsg_reply(msg, info);
-}
-
-static int
-devlink_nl_cmd_info_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
- struct netlink_callback *cb)
-{
- int err;
-
- err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI,
- cb->extack);
- if (err == -EOPNOTSUPP)
- err = 0;
- return err;
-}
-
-const struct devlink_cmd devl_cmd_info_get = {
- .dump_one = devlink_nl_cmd_info_get_dump_one,
-};
-
struct devlink_fmsg_item {
struct list_head list;
int attrtype;
@@ -7866,18 +6626,22 @@ static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb,
err = devlink_fmsg_obj_nest_start(fmsg);
if (err)
- return err;
+ goto out;
err = reporter->ops->diagnose(reporter, fmsg, info->extack);
if (err)
- return err;
+ goto out;
err = devlink_fmsg_obj_nest_end(fmsg);
if (err)
- return err;
+ goto out;
+
+ err = devlink_fmsg_snd(fmsg, info,
+ DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0);
- return devlink_fmsg_snd(fmsg, info,
- DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0);
+out:
+ devlink_fmsg_free(fmsg);
+ return err;
}
static int
@@ -9257,35 +8021,6 @@ const struct genl_small_ops devlink_nl_ops[56] = {
/* -- No new ops here! Use split ops going forward! -- */
};
-bool devlink_reload_actions_valid(const struct devlink_ops *ops)
-{
- const struct devlink_reload_combination *comb;
- int i;
-
- if (!devlink_reload_supported(ops)) {
- if (WARN_ON(ops->reload_actions))
- return false;
- return true;
- }
-
- if (WARN_ON(!ops->reload_actions ||
- ops->reload_actions & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) ||
- ops->reload_actions >= BIT(__DEVLINK_RELOAD_ACTION_MAX)))
- return false;
-
- if (WARN_ON(ops->reload_limits & BIT(DEVLINK_RELOAD_LIMIT_UNSPEC) ||
- ops->reload_limits >= BIT(__DEVLINK_RELOAD_LIMIT_MAX)))
- return false;
-
- for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++) {
- comb = &devlink_reload_invalid_combinations[i];
- if (ops->reload_actions == BIT(comb->action) &&
- ops->reload_limits == BIT(comb->limit))
- return false;
- }
- return true;
-}
-
static void
devlink_trap_policer_notify(struct devlink *devlink,
const struct devlink_trap_policer_item *policer_item,
@@ -9695,6 +8430,8 @@ int devlink_port_netdevice_event(struct notifier_block *nb,
break;
case NETDEV_REGISTER:
case NETDEV_CHANGENAME:
+ if (devlink_net(devlink) != dev_net(netdev))
+ return NOTIFY_OK;
/* Set the netdev on top of previously set type. Note this
* event happens also during net namespace change so here
* we take into account netdev pointer appearing in this
@@ -9704,6 +8441,8 @@ int devlink_port_netdevice_event(struct notifier_block *nb,
netdev);
break;
case NETDEV_UNREGISTER:
+ if (devlink_net(devlink) != dev_net(netdev))
+ return NOTIFY_OK;
/* Clear netdev pointer, but not the type. This event happens
* also during net namespace change so we need to clear
* pointer to netdev that is going to another net namespace.
@@ -12068,85 +10807,6 @@ devl_trap_policers_unregister(struct devlink *devlink,
}
EXPORT_SYMBOL_GPL(devl_trap_policers_unregister);
-static void __devlink_compat_running_version(struct devlink *devlink,
- char *buf, size_t len)
-{
- struct devlink_info_req req = {};
- const struct nlattr *nlattr;
- struct sk_buff *msg;
- int rem, err;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return;
-
- req.msg = msg;
- err = devlink->ops->info_get(devlink, &req, NULL);
- if (err)
- goto free_msg;
-
- nla_for_each_attr(nlattr, (void *)msg->data, msg->len, rem) {
- const struct nlattr *kv;
- int rem_kv;
-
- if (nla_type(nlattr) != DEVLINK_ATTR_INFO_VERSION_RUNNING)
- continue;
-
- nla_for_each_nested(kv, nlattr, rem_kv) {
- if (nla_type(kv) != DEVLINK_ATTR_INFO_VERSION_VALUE)
- continue;
-
- strlcat(buf, nla_data(kv), len);
- strlcat(buf, " ", len);
- }
- }
-free_msg:
- nlmsg_free(msg);
-}
-
-void devlink_compat_running_version(struct devlink *devlink,
- char *buf, size_t len)
-{
- if (!devlink->ops->info_get)
- return;
-
- devl_lock(devlink);
- if (devl_is_registered(devlink))
- __devlink_compat_running_version(devlink, buf, len);
- devl_unlock(devlink);
-}
-
-int devlink_compat_flash_update(struct devlink *devlink, const char *file_name)
-{
- struct devlink_flash_update_params params = {};
- int ret;
-
- devl_lock(devlink);
- if (!devl_is_registered(devlink)) {
- ret = -ENODEV;
- goto out_unlock;
- }
-
- if (!devlink->ops->flash_update) {
- ret = -EOPNOTSUPP;
- goto out_unlock;
- }
-
- ret = request_firmware(&params.fw, file_name, devlink->dev);
- if (ret)
- goto out_unlock;
-
- devlink_flash_update_begin_notify(devlink);
- ret = devlink->ops->flash_update(devlink, &params, NULL);
- devlink_flash_update_end_notify(devlink);
-
- release_firmware(params.fw);
-out_unlock:
- devl_unlock(devlink);
-
- return ret;
-}
-
int devlink_compat_phys_port_name_get(struct net_device *dev,
char *name, size_t len)
{
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 26c458f50ac6..6957971c2db2 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -2692,7 +2692,8 @@ static int dsa_slave_changeupper(struct net_device *dev,
if (!err)
dsa_bridge_mtu_normalization(dp);
if (err == -EOPNOTSUPP) {
- NL_SET_ERR_MSG_WEAK_MOD(extack, "Offloading not supported");
+ NL_SET_ERR_MSG_WEAK_MOD(extack,
+ "Offloading not supported");
err = 0;
}
err = notifier_from_errno(err);
@@ -2705,8 +2706,8 @@ static int dsa_slave_changeupper(struct net_device *dev,
err = dsa_port_lag_join(dp, info->upper_dev,
info->upper_info, extack);
if (err == -EOPNOTSUPP) {
- NL_SET_ERR_MSG_MOD(info->info.extack,
- "Offloading not supported");
+ NL_SET_ERR_MSG_WEAK_MOD(extack,
+ "Offloading not supported");
err = 0;
}
err = notifier_from_errno(err);
@@ -2718,8 +2719,8 @@ static int dsa_slave_changeupper(struct net_device *dev,
if (info->linking) {
err = dsa_port_hsr_join(dp, info->upper_dev);
if (err == -EOPNOTSUPP) {
- NL_SET_ERR_MSG_MOD(info->info.extack,
- "Offloading not supported");
+ NL_SET_ERR_MSG_WEAK_MOD(extack,
+ "Offloading not supported");
err = 0;
}
err = notifier_from_errno(err);
diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c
index 7e51f7633001..e612856eed8c 100644
--- a/net/ethtool/mm.c
+++ b/net/ethtool/mm.c
@@ -56,7 +56,7 @@ static int mm_prepare_data(const struct ethnl_req_info *req_base,
out_complete:
ethnl_ops_complete(dev);
- return 0;
+ return ret;
}
static int mm_reply_size(const struct ethnl_req_info *req_base,
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 2f992a323b95..2c778b013cb0 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -347,6 +347,7 @@ lookup_protocol:
sk->sk_destruct = inet_sock_destruct;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+ sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash);
inet->uc_ttl = -1;
inet->mc_loop = 1;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 6ed7e65de494..7d206a10ad14 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1246,9 +1246,6 @@ int inet_csk_listen_start(struct sock *sk)
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
- if (sk->sk_txrehash == SOCK_TXREHASH_DEFAULT)
- sk->sk_txrehash = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
-
/* There is race window here: we announce ourselves listening,
* but this transition is still not validated by get_port().
* It is OK, because this socket enters to hash table only
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 006c1f0ed8b4..94df935ee0c5 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -93,7 +93,7 @@ int raw_hash_sk(struct sock *sk)
struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
struct hlist_nulls_head *hlist;
- hlist = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
+ hlist = &h->ht[raw_hashfunc(sock_net(sk), inet_sk(sk)->inet_num)];
spin_lock(&h->lock);
__sk_nulls_add_node_rcu(sk, hlist);
@@ -160,9 +160,9 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
* RFC 1122: SHOULD pass TOS value up to the transport layer.
* -> It does. And not only TOS, but all IP header.
*/
-static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
+static int raw_v4_input(struct net *net, struct sk_buff *skb,
+ const struct iphdr *iph, int hash)
{
- struct net *net = dev_net(skb->dev);
struct hlist_nulls_head *hlist;
struct hlist_nulls_node *hnode;
int sdif = inet_sdif(skb);
@@ -193,9 +193,10 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
int raw_local_deliver(struct sk_buff *skb, int protocol)
{
- int hash = protocol & (RAW_HTABLE_SIZE - 1);
+ struct net *net = dev_net(skb->dev);
- return raw_v4_input(skb, ip_hdr(skb), hash);
+ return raw_v4_input(net, skb, ip_hdr(skb),
+ raw_hashfunc(net, protocol));
}
static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
@@ -271,7 +272,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
struct sock *sk;
int hash;
- hash = protocol & (RAW_HTABLE_SIZE - 1);
+ hash = raw_hashfunc(net, protocol);
hlist = &raw_v4_hashinfo.ht[hash];
rcu_read_lock();
@@ -287,11 +288,13 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason reason;
+
/* Charge it to the socket. */
ipv4_pktinfo_prepare(sk, skb);
- if (sock_queue_rcv_skb(sk, skb) < 0) {
- kfree_skb(skb);
+ if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
+ kfree_skb_reason(skb, reason);
return NET_RX_DROP;
}
@@ -302,7 +305,7 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb)
{
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
atomic_inc(&sk->sk_drops);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY);
return NET_RX_DROP;
}
nf_reset_ct(skb);
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index d2c470524e58..146792cd26fe 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -295,7 +295,7 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
}
/* override sysctl_tcp_min_tso_segs */
-static u32 bbr_min_tso_segs(struct sock *sk)
+__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
{
return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
}
@@ -328,7 +328,7 @@ static void bbr_save_cwnd(struct sock *sk)
bbr->prior_cwnd = max(bbr->prior_cwnd, tcp_snd_cwnd(tp));
}
-static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+__bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
@@ -1023,7 +1023,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
bbr_update_gains(sk);
}
-static void bbr_main(struct sock *sk, const struct rate_sample *rs)
+__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs)
{
struct bbr *bbr = inet_csk_ca(sk);
u32 bw;
@@ -1035,7 +1035,7 @@ static void bbr_main(struct sock *sk, const struct rate_sample *rs)
bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
}
-static void bbr_init(struct sock *sk)
+__bpf_kfunc static void bbr_init(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
@@ -1077,7 +1077,7 @@ static void bbr_init(struct sock *sk)
cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
}
-static u32 bbr_sndbuf_expand(struct sock *sk)
+__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk)
{
/* Provision 3 * cwnd since BBR may slow-start even during recovery. */
return 3;
@@ -1086,7 +1086,7 @@ static u32 bbr_sndbuf_expand(struct sock *sk)
/* In theory BBR does not need to undo the cwnd since it does not
* always reduce cwnd on losses (see bbr_main()). Keep it for now.
*/
-static u32 bbr_undo_cwnd(struct sock *sk)
+__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk)
{
struct bbr *bbr = inet_csk_ca(sk);
@@ -1097,7 +1097,7 @@ static u32 bbr_undo_cwnd(struct sock *sk)
}
/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
-static u32 bbr_ssthresh(struct sock *sk)
+__bpf_kfunc static u32 bbr_ssthresh(struct sock *sk)
{
bbr_save_cwnd(sk);
return tcp_sk(sk)->snd_ssthresh;
@@ -1125,7 +1125,7 @@ static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
return 0;
}
-static void bbr_set_state(struct sock *sk, u8 new_state)
+__bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state)
{
struct bbr *bbr = inet_csk_ca(sk);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index d3cae40749e8..db8b4b488c31 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -403,7 +403,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
* ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
* returns the leftover acks to adjust cwnd in congestion avoidance mode.
*/
-u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
+__bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh);
@@ -417,7 +417,7 @@ EXPORT_SYMBOL_GPL(tcp_slow_start);
/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),
* for every packet that was ACKed.
*/
-void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
+__bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
{
/* If credits accumulated at a higher w, apply them gently now. */
if (tp->snd_cwnd_cnt >= w) {
@@ -443,7 +443,7 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
/* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM '88, p. 328.
*/
-void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+__bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -462,7 +462,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
/* Slow start threshold is half the congestion window (min 2) */
-u32 tcp_reno_ssthresh(struct sock *sk)
+__bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -470,7 +470,7 @@ u32 tcp_reno_ssthresh(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
-u32 tcp_reno_undo_cwnd(struct sock *sk)
+__bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 768c10c1f649..0fd78ecb67e7 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -126,7 +126,7 @@ static inline void bictcp_hystart_reset(struct sock *sk)
ca->sample_cnt = 0;
}
-static void cubictcp_init(struct sock *sk)
+__bpf_kfunc static void cubictcp_init(struct sock *sk)
{
struct bictcp *ca = inet_csk_ca(sk);
@@ -139,7 +139,7 @@ static void cubictcp_init(struct sock *sk)
tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
-static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+__bpf_kfunc static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
if (event == CA_EVENT_TX_START) {
struct bictcp *ca = inet_csk_ca(sk);
@@ -321,7 +321,7 @@ tcp_friendliness:
ca->cnt = max(ca->cnt, 2U);
}
-static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+__bpf_kfunc static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
@@ -338,7 +338,7 @@ static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
tcp_cong_avoid_ai(tp, ca->cnt, acked);
}
-static u32 cubictcp_recalc_ssthresh(struct sock *sk)
+__bpf_kfunc static u32 cubictcp_recalc_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
@@ -355,7 +355,7 @@ static u32 cubictcp_recalc_ssthresh(struct sock *sk)
return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U);
}
-static void cubictcp_state(struct sock *sk, u8 new_state)
+__bpf_kfunc static void cubictcp_state(struct sock *sk, u8 new_state)
{
if (new_state == TCP_CA_Loss) {
bictcp_reset(inet_csk_ca(sk));
@@ -445,7 +445,7 @@ static void hystart_update(struct sock *sk, u32 delay)
}
}
-static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample)
+__bpf_kfunc static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index e0a2ca7456ff..bb23bb5b387a 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -75,7 +75,7 @@ static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
ca->old_delivered_ce = tp->delivered_ce;
}
-static void dctcp_init(struct sock *sk)
+__bpf_kfunc static void dctcp_init(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -104,7 +104,7 @@ static void dctcp_init(struct sock *sk)
INET_ECN_dontxmit(sk);
}
-static u32 dctcp_ssthresh(struct sock *sk)
+__bpf_kfunc static u32 dctcp_ssthresh(struct sock *sk)
{
struct dctcp *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -113,7 +113,7 @@ static u32 dctcp_ssthresh(struct sock *sk)
return max(tcp_snd_cwnd(tp) - ((tcp_snd_cwnd(tp) * ca->dctcp_alpha) >> 11U), 2U);
}
-static void dctcp_update_alpha(struct sock *sk, u32 flags)
+__bpf_kfunc static void dctcp_update_alpha(struct sock *sk, u32 flags)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct dctcp *ca = inet_csk_ca(sk);
@@ -169,7 +169,7 @@ static void dctcp_react_to_loss(struct sock *sk)
tp->snd_ssthresh = max(tcp_snd_cwnd(tp) >> 1U, 2U);
}
-static void dctcp_state(struct sock *sk, u8 new_state)
+__bpf_kfunc static void dctcp_state(struct sock *sk, u8 new_state)
{
if (new_state == TCP_CA_Recovery &&
new_state != inet_csk(sk)->icsk_ca_state)
@@ -179,7 +179,7 @@ static void dctcp_state(struct sock *sk, u8 new_state)
*/
}
-static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+__bpf_kfunc static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
{
struct dctcp *ca = inet_csk_ca(sk);
@@ -229,7 +229,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
return 0;
}
-static u32 dctcp_cwnd_undo(struct sock *sk)
+__bpf_kfunc static u32 dctcp_cwnd_undo(struct sock *sk)
{
const struct dctcp *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index fee9163382c2..847934763868 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -222,6 +222,7 @@ lookup_protocol:
np->pmtudisc = IPV6_PMTUDISC_WANT;
np->repflow = net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ESTABLISHED;
sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
+ sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash);
/* Init the ipv4 part of the socket since we can have sockets
* using v6 API for ipv4.
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index ada087b50541..bac9ba747bde 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -152,7 +152,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
saddr = &ipv6_hdr(skb)->saddr;
daddr = saddr + 1;
- hash = nexthdr & (RAW_HTABLE_SIZE - 1);
+ hash = raw_hashfunc(net, nexthdr);
hlist = &raw_v6_hashinfo.ht[hash];
rcu_read_lock();
sk_nulls_for_each(sk, hnode, hlist) {
@@ -338,7 +338,7 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
struct sock *sk;
int hash;
- hash = nexthdr & (RAW_HTABLE_SIZE - 1);
+ hash = raw_hashfunc(net, nexthdr);
hlist = &raw_v6_hashinfo.ht[hash];
rcu_read_lock();
sk_nulls_for_each(sk, hnode, hlist) {
@@ -355,17 +355,19 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason reason;
+
if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) &&
skb_checksum_complete(skb)) {
atomic_inc(&sk->sk_drops);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM);
return NET_RX_DROP;
}
/* Charge it to the socket. */
skb_dst_drop(skb);
- if (sock_queue_rcv_skb(sk, skb) < 0) {
- kfree_skb(skb);
+ if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
+ kfree_skb_reason(skb, reason);
return NET_RX_DROP;
}
@@ -386,7 +388,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
atomic_inc(&sk->sk_drops);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_XFRM_POLICY);
return NET_RX_DROP;
}
@@ -410,7 +412,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
if (inet->hdrincl) {
if (skb_checksum_complete(skb)) {
atomic_inc(&sk->sk_drops);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_SKB_CSUM);
return NET_RX_DROP;
}
}
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index db07cc5b4fcb..56628b52d100 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -1002,8 +1002,8 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
{
int addrlen = sizeof(struct sockaddr_in);
struct sockaddr_storage addr;
- struct mptcp_sock *msk;
struct socket *ssock;
+ struct sock *newsk;
int backlog = 1024;
int err;
@@ -1012,11 +1012,13 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
if (err)
return err;
- msk = mptcp_sk(entry->lsk->sk);
- if (!msk)
+ newsk = entry->lsk->sk;
+ if (!newsk)
return -EINVAL;
- ssock = __mptcp_nmpc_socket(msk);
+ lock_sock(newsk);
+ ssock = __mptcp_nmpc_socket(mptcp_sk(newsk));
+ release_sock(newsk);
if (!ssock)
return -EINVAL;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 1ff2efbab29e..c9817aa0f413 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -2902,6 +2902,7 @@ bool __mptcp_close(struct sock *sk, long timeout)
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
bool do_cancel_work = false;
+ int subflows_alive = 0;
sk->sk_shutdown = SHUTDOWN_MASK;
@@ -2928,6 +2929,8 @@ cleanup:
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow = lock_sock_fast_nested(ssk);
+ subflows_alive += ssk->sk_state != TCP_CLOSE;
+
/* since the close timeout takes precedence on the fail one,
* cancel the latter
*/
@@ -2943,6 +2946,12 @@ cleanup:
}
sock_orphan(sk);
+ /* all the subflows are closed, only timeout can change the msk
+ * state, let's not keep resources busy for no reasons
+ */
+ if (subflows_alive == 0)
+ inet_sk_state_store(sk, TCP_CLOSE);
+
sock_hold(sk);
pr_debug("msk=%p state=%d", sk, sk->sk_state);
if (msk->token)
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 9986681aaf40..8a9656248b0f 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -760,14 +760,21 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname,
static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
+ struct sock *sk = (struct sock *)msk;
struct socket *sock;
+ int ret = -EINVAL;
/* Limit to first subflow, before the connection establishment */
+ lock_sock(sk);
sock = __mptcp_nmpc_socket(msk);
if (!sock)
- return -EINVAL;
+ goto unlock;
- return tcp_setsockopt(sock->sk, level, optname, optval, optlen);
+ ret = tcp_setsockopt(sock->sk, level, optname, optval, optlen);
+
+unlock:
+ release_sock(sk);
+ return ret;
}
static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname,
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index beaec843f5ca..4ae1a7304cf0 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1400,6 +1400,7 @@ void __mptcp_error_report(struct sock *sk)
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
int err = sock_error(ssk);
+ int ssk_state;
if (!err)
continue;
@@ -1410,7 +1411,14 @@ void __mptcp_error_report(struct sock *sk)
if (sk->sk_state != TCP_SYN_SENT && !__mptcp_check_fallback(msk))
continue;
- inet_sk_state_store(sk, inet_sk_state_load(ssk));
+ /* We need to propagate only transition to CLOSE state.
+ * Orphaned socket will see such state change via
+ * subflow_sched_work_if_closed() and that path will properly
+ * destroy the msk as needed.
+ */
+ ssk_state = inet_sk_state_load(ssk);
+ if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD))
+ inet_sk_state_store(sk, ssk_state);
sk->sk_err = -err;
/* This barrier is coupled with smp_rmb() in mptcp_poll() */
@@ -1682,7 +1690,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
if (err)
return err;
- lock_sock(sf->sk);
+ lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING);
/* the newly created socket has to be in the same cgroup as its parent */
mptcp_attach_cgroup(sk, sf->sk);
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index f71b41c7ce2f..4d6737160857 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -189,6 +189,9 @@ config NF_CONNTRACK_LABELS
to connection tracking entries. It can be used with xtables connlabel
match and the nftables ct expression.
+config NF_CONNTRACK_OVS
+ bool
+
config NF_CT_PROTO_DCCP
bool 'DCCP protocol connection tracking support'
depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index ba2a6b5e93d9..5ffef1cd6143 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -11,6 +11,7 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o
+nf_conntrack-$(CONFIG_NF_CONNTRACK_OVS) += nf_conntrack_ovs.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o
nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index 24002bc61e07..34913521c385 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -249,7 +249,7 @@ __diag_ignore_all("-Wmissing-prototypes",
* @opts__sz - Length of the bpf_ct_opts structure
* Must be NF_BPF_CT_OPTS_SZ (12)
*/
-struct nf_conn___init *
+__bpf_kfunc struct nf_conn___init *
bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
@@ -283,7 +283,7 @@ bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
* @opts__sz - Length of the bpf_ct_opts structure
* Must be NF_BPF_CT_OPTS_SZ (12)
*/
-struct nf_conn *
+__bpf_kfunc struct nf_conn *
bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
@@ -316,7 +316,7 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple,
* @opts__sz - Length of the bpf_ct_opts structure
* Must be NF_BPF_CT_OPTS_SZ (12)
*/
-struct nf_conn___init *
+__bpf_kfunc struct nf_conn___init *
bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
@@ -351,7 +351,7 @@ bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
* @opts__sz - Length of the bpf_ct_opts structure
* Must be NF_BPF_CT_OPTS_SZ (12)
*/
-struct nf_conn *
+__bpf_kfunc struct nf_conn *
bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
u32 tuple__sz, struct bpf_ct_opts *opts, u32 opts__sz)
{
@@ -376,7 +376,7 @@ bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple,
* @nfct - Pointer to referenced nf_conn___init object, obtained
* using bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
*/
-struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
+__bpf_kfunc struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
{
struct nf_conn *nfct = (struct nf_conn *)nfct_i;
int err;
@@ -400,7 +400,7 @@ struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i)
* @nf_conn - Pointer to referenced nf_conn object, obtained using
* bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
*/
-void bpf_ct_release(struct nf_conn *nfct)
+__bpf_kfunc void bpf_ct_release(struct nf_conn *nfct)
{
if (!nfct)
return;
@@ -417,7 +417,7 @@ void bpf_ct_release(struct nf_conn *nfct)
* bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
* @timeout - Timeout in msecs.
*/
-void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
+__bpf_kfunc void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
{
__nf_ct_set_timeout((struct nf_conn *)nfct, msecs_to_jiffies(timeout));
}
@@ -432,7 +432,7 @@ void bpf_ct_set_timeout(struct nf_conn___init *nfct, u32 timeout)
* bpf_ct_insert_entry, bpf_xdp_ct_lookup, or bpf_skb_ct_lookup.
* @timeout - New timeout in msecs.
*/
-int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
+__bpf_kfunc int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
{
return __nf_ct_change_timeout(nfct, msecs_to_jiffies(timeout));
}
@@ -447,7 +447,7 @@ int bpf_ct_change_timeout(struct nf_conn *nfct, u32 timeout)
* bpf_xdp_ct_alloc or bpf_skb_ct_alloc.
* @status - New status value.
*/
-int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
+__bpf_kfunc int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
{
return nf_ct_change_status_common((struct nf_conn *)nfct, status);
}
@@ -462,7 +462,7 @@ int bpf_ct_set_status(const struct nf_conn___init *nfct, u32 status)
* bpf_ct_insert_entry, bpf_xdp_ct_lookup or bpf_skb_ct_lookup.
* @status - New status value.
*/
-int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
+__bpf_kfunc int bpf_ct_change_status(struct nf_conn *nfct, u32 status)
{
return nf_ct_change_status_common(nfct, status);
}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 48ea6d0264b5..0c4db2f2ac43 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -242,104 +242,6 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
}
EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper);
-/* 'skb' should already be pulled to nh_ofs. */
-int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct,
- enum ip_conntrack_info ctinfo, u16 proto)
-{
- const struct nf_conntrack_helper *helper;
- const struct nf_conn_help *help;
- unsigned int protoff;
- int err;
-
- if (ctinfo == IP_CT_RELATED_REPLY)
- return NF_ACCEPT;
-
- help = nfct_help(ct);
- if (!help)
- return NF_ACCEPT;
-
- helper = rcu_dereference(help->helper);
- if (!helper)
- return NF_ACCEPT;
-
- if (helper->tuple.src.l3num != NFPROTO_UNSPEC &&
- helper->tuple.src.l3num != proto)
- return NF_ACCEPT;
-
- switch (proto) {
- case NFPROTO_IPV4:
- protoff = ip_hdrlen(skb);
- proto = ip_hdr(skb)->protocol;
- break;
- case NFPROTO_IPV6: {
- u8 nexthdr = ipv6_hdr(skb)->nexthdr;
- __be16 frag_off;
- int ofs;
-
- ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
- &frag_off);
- if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
- pr_debug("proto header not found\n");
- return NF_ACCEPT;
- }
- protoff = ofs;
- proto = nexthdr;
- break;
- }
- default:
- WARN_ONCE(1, "helper invoked on non-IP family!");
- return NF_DROP;
- }
-
- if (helper->tuple.dst.protonum != proto)
- return NF_ACCEPT;
-
- err = helper->help(skb, protoff, ct, ctinfo);
- if (err != NF_ACCEPT)
- return err;
-
- /* Adjust seqs after helper. This is needed due to some helpers (e.g.,
- * FTP with NAT) adusting the TCP payload size when mangling IP
- * addresses and/or port numbers in the text-based control connection.
- */
- if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
- !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
- return NF_DROP;
- return NF_ACCEPT;
-}
-EXPORT_SYMBOL_GPL(nf_ct_helper);
-
-int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family,
- u8 proto, bool nat, struct nf_conntrack_helper **hp)
-{
- struct nf_conntrack_helper *helper;
- struct nf_conn_help *help;
- int ret = 0;
-
- helper = nf_conntrack_helper_try_module_get(name, family, proto);
- if (!helper)
- return -EINVAL;
-
- help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
- if (!help) {
- nf_conntrack_helper_put(helper);
- return -ENOMEM;
- }
-#if IS_ENABLED(CONFIG_NF_NAT)
- if (nat) {
- ret = nf_nat_helper_try_module_get(name, family, proto);
- if (ret) {
- nf_conntrack_helper_put(helper);
- return ret;
- }
- }
-#endif
- rcu_assign_pointer(help->helper, helper);
- *hp = helper;
- return ret;
-}
-EXPORT_SYMBOL_GPL(nf_ct_add_helper);
-
/* appropriate ct lock protecting must be taken by caller */
static int unhelp(struct nf_conn *ct, void *me)
{
diff --git a/net/netfilter/nf_conntrack_ovs.c b/net/netfilter/nf_conntrack_ovs.c
new file mode 100644
index 000000000000..52b776bdf526
--- /dev/null
+++ b/net/netfilter/nf_conntrack_ovs.c
@@ -0,0 +1,178 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Support ct functions for openvswitch and used by OVS and TC conntrack. */
+
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/ipv6_frag.h>
+#include <net/ip.h>
+
+/* 'skb' should already be pulled to nh_ofs. */
+int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo, u16 proto)
+{
+ const struct nf_conntrack_helper *helper;
+ const struct nf_conn_help *help;
+ unsigned int protoff;
+ int err;
+
+ if (ctinfo == IP_CT_RELATED_REPLY)
+ return NF_ACCEPT;
+
+ help = nfct_help(ct);
+ if (!help)
+ return NF_ACCEPT;
+
+ helper = rcu_dereference(help->helper);
+ if (!helper)
+ return NF_ACCEPT;
+
+ if (helper->tuple.src.l3num != NFPROTO_UNSPEC &&
+ helper->tuple.src.l3num != proto)
+ return NF_ACCEPT;
+
+ switch (proto) {
+ case NFPROTO_IPV4:
+ protoff = ip_hdrlen(skb);
+ proto = ip_hdr(skb)->protocol;
+ break;
+ case NFPROTO_IPV6: {
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ __be16 frag_off;
+ int ofs;
+
+ ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
+ if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
+ pr_debug("proto header not found\n");
+ return NF_ACCEPT;
+ }
+ protoff = ofs;
+ proto = nexthdr;
+ break;
+ }
+ default:
+ WARN_ONCE(1, "helper invoked on non-IP family!");
+ return NF_DROP;
+ }
+
+ if (helper->tuple.dst.protonum != proto)
+ return NF_ACCEPT;
+
+ err = helper->help(skb, protoff, ct, ctinfo);
+ if (err != NF_ACCEPT)
+ return err;
+
+ /* Adjust seqs after helper. This is needed due to some helpers (e.g.,
+ * FTP with NAT) adusting the TCP payload size when mangling IP
+ * addresses and/or port numbers in the text-based control connection.
+ */
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
+ return NF_DROP;
+ return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_ct_helper);
+
+int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family,
+ u8 proto, bool nat, struct nf_conntrack_helper **hp)
+{
+ struct nf_conntrack_helper *helper;
+ struct nf_conn_help *help;
+ int ret = 0;
+
+ helper = nf_conntrack_helper_try_module_get(name, family, proto);
+ if (!helper)
+ return -EINVAL;
+
+ help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
+ if (!help) {
+ nf_conntrack_helper_put(helper);
+ return -ENOMEM;
+ }
+#if IS_ENABLED(CONFIG_NF_NAT)
+ if (nat) {
+ ret = nf_nat_helper_try_module_get(name, family, proto);
+ if (ret) {
+ nf_conntrack_helper_put(helper);
+ return ret;
+ }
+ }
+#endif
+ rcu_assign_pointer(help->helper, helper);
+ *hp = helper;
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nf_ct_add_helper);
+
+/* Trim the skb to the length specified by the IP/IPv6 header,
+ * removing any trailing lower-layer padding. This prepares the skb
+ * for higher-layer processing that assumes skb->len excludes padding
+ * (such as nf_ip_checksum). The caller needs to pull the skb to the
+ * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
+ */
+int nf_ct_skb_network_trim(struct sk_buff *skb, int family)
+{
+ unsigned int len;
+
+ switch (family) {
+ case NFPROTO_IPV4:
+ len = skb_ip_totlen(skb);
+ break;
+ case NFPROTO_IPV6:
+ len = sizeof(struct ipv6hdr)
+ + ntohs(ipv6_hdr(skb)->payload_len);
+ break;
+ default:
+ len = skb->len;
+ }
+
+ return pskb_trim_rcsum(skb, len);
+}
+EXPORT_SYMBOL_GPL(nf_ct_skb_network_trim);
+
+/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
+ * value if 'skb' is freed.
+ */
+int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
+ u16 zone, u8 family, u8 *proto, u16 *mru)
+{
+ int err;
+
+ if (family == NFPROTO_IPV4) {
+ enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
+
+ memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+ local_bh_disable();
+ err = ip_defrag(net, skb, user);
+ local_bh_enable();
+ if (err)
+ return err;
+
+ *mru = IPCB(skb)->frag_max_size;
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+ } else if (family == NFPROTO_IPV6) {
+ enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
+
+ memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+ err = nf_ct_frag6_gather(net, skb, user);
+ if (err) {
+ if (err != -EINPROGRESS)
+ kfree_skb(skb);
+ return err;
+ }
+
+ *proto = ipv6_hdr(skb)->nexthdr;
+ *mru = IP6CB(skb)->frag_max_size;
+#endif
+ } else {
+ kfree_skb(skb);
+ return -EPFNOSUPPORT;
+ }
+
+ skb_clear_hash(skb);
+ skb->ignore_df = 1;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nf_ct_handle_fragments);
diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c
index 0fa5a0bbb0ff..141ee7783223 100644
--- a/net/netfilter/nf_nat_bpf.c
+++ b/net/netfilter/nf_nat_bpf.c
@@ -30,9 +30,9 @@ __diag_ignore_all("-Wmissing-prototypes",
* interpreted as select a random port.
* @manip - NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST
*/
-int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
- union nf_inet_addr *addr, int port,
- enum nf_nat_manip_type manip)
+__bpf_kfunc int bpf_ct_set_nat_info(struct nf_conn___init *nfct,
+ union nf_inet_addr *addr, int port,
+ enum nf_nat_manip_type manip)
{
struct nf_conn *ct = (struct nf_conn *)nfct;
u16 proto = nf_ct_l3num(ct);
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 600993c80050..04c4036bf406 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -13,7 +13,7 @@
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
-#include <linux/string.h>
+#include <linux/string_helpers.h>
#include <linux/skbuff.h>
#include <linux/mutex.h>
#include <linux/bitmap.h>
@@ -457,7 +457,7 @@ static int genl_validate_assign_mc_groups(struct genl_family *family)
if (WARN_ON(grp->name[0] == '\0'))
return -EINVAL;
- if (WARN_ON(memchr(grp->name, '\0', GENL_NAMSIZ) == NULL))
+ if (WARN_ON(!string_is_terminated(grp->name, GENL_NAMSIZ)))
return -EINVAL;
}
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 747d537a3f06..29a7081858cd 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -15,6 +15,7 @@ config OPENVSWITCH
select NET_MPLS_GSO
select DST_CACHE
select NET_NSH
+ select NF_CONNTRACK_OVS if NF_CONNTRACK
select NF_NAT_OVS if NF_NAT
help
Open vSwitch is a multilayer Ethernet switch targeted at virtualized
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 2172930b1f17..331730fd3580 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -9,6 +9,7 @@
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/static_key.h>
+#include <linux/string_helpers.h>
#include <net/ip.h>
#include <net/genetlink.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -434,52 +435,21 @@ static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key,
return 0;
}
-/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
- * value if 'skb' is freed.
- */
-static int handle_fragments(struct net *net, struct sw_flow_key *key,
- u16 zone, struct sk_buff *skb)
+static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key,
+ u16 zone, int family, struct sk_buff *skb)
{
struct ovs_skb_cb ovs_cb = *OVS_CB(skb);
int err;
- if (key->eth.type == htons(ETH_P_IP)) {
- enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
-
- memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
- err = ip_defrag(net, skb, user);
- if (err)
- return err;
-
- ovs_cb.mru = IPCB(skb)->frag_max_size;
-#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
- } else if (key->eth.type == htons(ETH_P_IPV6)) {
- enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
-
- memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
- err = nf_ct_frag6_gather(net, skb, user);
- if (err) {
- if (err != -EINPROGRESS)
- kfree_skb(skb);
- return err;
- }
-
- key->ip.proto = ipv6_hdr(skb)->nexthdr;
- ovs_cb.mru = IP6CB(skb)->frag_max_size;
-#endif
- } else {
- kfree_skb(skb);
- return -EPFNOSUPPORT;
- }
+ err = nf_ct_handle_fragments(net, skb, zone, family, &key->ip.proto, &ovs_cb.mru);
+ if (err)
+ return err;
/* The key extracted from the fragment that completed this datagram
* likely didn't have an L4 header, so regenerate it.
*/
ovs_flow_key_update_l3l4(skb, key);
-
key->ip.frag = OVS_FRAG_TYPE_NONE;
- skb_clear_hash(skb);
- skb->ignore_df = 1;
*OVS_CB(skb) = ovs_cb;
return 0;
@@ -1090,36 +1060,6 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
return 0;
}
-/* Trim the skb to the length specified by the IP/IPv6 header,
- * removing any trailing lower-layer padding. This prepares the skb
- * for higher-layer processing that assumes skb->len excludes padding
- * (such as nf_ip_checksum). The caller needs to pull the skb to the
- * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
- */
-static int ovs_skb_network_trim(struct sk_buff *skb)
-{
- unsigned int len;
- int err;
-
- switch (skb->protocol) {
- case htons(ETH_P_IP):
- len = skb_ip_totlen(skb);
- break;
- case htons(ETH_P_IPV6):
- len = sizeof(struct ipv6hdr)
- + ntohs(ipv6_hdr(skb)->payload_len);
- break;
- default:
- len = skb->len;
- }
-
- err = pskb_trim_rcsum(skb, len);
- if (err)
- kfree_skb(skb);
-
- return err;
-}
-
/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
* value if 'skb' is freed.
*/
@@ -1134,12 +1074,15 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
nh_ofs = skb_network_offset(skb);
skb_pull_rcsum(skb, nh_ofs);
- err = ovs_skb_network_trim(skb);
- if (err)
+ err = nf_ct_skb_network_trim(skb, info->family);
+ if (err) {
+ kfree_skb(skb);
return err;
+ }
if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
- err = handle_fragments(net, key, info->zone.id, skb);
+ err = ovs_ct_handle_fragments(net, key, info->zone.id,
+ info->family, skb);
if (err)
return err;
}
@@ -1383,7 +1326,7 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
#endif
case OVS_CT_ATTR_HELPER:
*helper = nla_data(a);
- if (!memchr(*helper, '\0', nla_len(a))) {
+ if (!string_is_terminated(*helper, nla_len(a))) {
OVS_NLERR(log, "Invalid conntrack helper");
return -EINVAL;
}
@@ -1404,7 +1347,7 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
case OVS_CT_ATTR_TIMEOUT:
memcpy(info->timeout, nla_data(a), nla_len(a));
- if (!memchr(info->timeout, '\0', nla_len(a))) {
+ if (!string_is_terminated(info->timeout, nla_len(a))) {
OVS_NLERR(log, "Invalid conntrack timeout");
return -EINVAL;
}
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index e20d1a973417..416976f70322 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -107,7 +107,8 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
rcu_assign_pointer(flow->stats[cpu],
new_stats);
- cpumask_set_cpu(cpu, &flow->cpu_used_mask);
+ cpumask_set_cpu(cpu,
+ flow->cpu_used_mask);
goto unlock;
}
}
@@ -135,7 +136,8 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
memset(ovs_stats, 0, sizeof(*ovs_stats));
/* We open code this to make sure cpu 0 is always considered */
- for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) {
+ for (cpu = 0; cpu < nr_cpu_ids;
+ cpu = cpumask_next(cpu, flow->cpu_used_mask)) {
struct sw_flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]);
if (stats) {
@@ -159,7 +161,8 @@ void ovs_flow_stats_clear(struct sw_flow *flow)
int cpu;
/* We open code this to make sure cpu 0 is always considered */
- for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) {
+ for (cpu = 0; cpu < nr_cpu_ids;
+ cpu = cpumask_next(cpu, flow->cpu_used_mask)) {
struct sw_flow_stats *stats = ovsl_dereference(flow->stats[cpu]);
if (stats) {
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 073ab73ffeaa..b5711aff6e76 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -229,7 +229,7 @@ struct sw_flow {
*/
struct sw_flow_key key;
struct sw_flow_id id;
- struct cpumask cpu_used_mask;
+ struct cpumask *cpu_used_mask;
struct sw_flow_mask *mask;
struct sw_flow_actions __rcu *sf_acts;
struct sw_flow_stats __rcu *stats[]; /* One for each CPU. First one
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 0a0e4c283f02..791504b7f42b 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -79,6 +79,7 @@ struct sw_flow *ovs_flow_alloc(void)
return ERR_PTR(-ENOMEM);
flow->stats_last_writer = -1;
+ flow->cpu_used_mask = (struct cpumask *)&flow->stats[nr_cpu_ids];
/* Initialize the default stat node. */
stats = kmem_cache_alloc_node(flow_stats_cache,
@@ -91,7 +92,7 @@ struct sw_flow *ovs_flow_alloc(void)
RCU_INIT_POINTER(flow->stats[0], stats);
- cpumask_set_cpu(0, &flow->cpu_used_mask);
+ cpumask_set_cpu(0, flow->cpu_used_mask);
return flow;
err:
@@ -115,7 +116,7 @@ static void flow_free(struct sw_flow *flow)
flow->sf_acts);
/* We open code this to make sure cpu 0 is always considered */
for (cpu = 0; cpu < nr_cpu_ids;
- cpu = cpumask_next(cpu, &flow->cpu_used_mask)) {
+ cpu = cpumask_next(cpu, flow->cpu_used_mask)) {
if (flow->stats[cpu])
kmem_cache_free(flow_stats_cache,
(struct sw_flow_stats __force *)flow->stats[cpu]);
@@ -1196,7 +1197,8 @@ int ovs_flow_init(void)
flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
+ (nr_cpu_ids
- * sizeof(struct sw_flow_stats *)),
+ * sizeof(struct sw_flow_stats *))
+ + cpumask_size(),
0, 0, NULL);
if (flow_cache == NULL)
return -ENOMEM;
diff --git a/net/rds/message.c b/net/rds/message.c
index b47e4f0a1639..c19c93561227 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -104,9 +104,9 @@ static void rds_rm_zerocopy_callback(struct rds_sock *rs,
spin_lock_irqsave(&q->lock, flags);
head = &q->zcookie_head;
if (!list_empty(head)) {
- info = list_entry(head, struct rds_msg_zcopy_info,
- rs_zcookie_next);
- if (info && rds_zcookie_add(info, cookie)) {
+ info = list_first_entry(head, struct rds_msg_zcopy_info,
+ rs_zcookie_next);
+ if (rds_zcookie_add(info, cookie)) {
spin_unlock_irqrestore(&q->lock, flags);
kfree(rds_info_from_znotifier(znotif));
/* caller invokes rds_wake_sk_sleep() */
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index 6eaffb0d8fdc..e9f1f49d18c2 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -54,12 +54,14 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what)
spin_lock_bh(&local->lock);
busy = !list_empty(&call->attend_link);
trace_rxrpc_poke_call(call, busy, what);
+ if (!busy && !rxrpc_try_get_call(call, rxrpc_call_get_poke))
+ busy = true;
if (!busy) {
- rxrpc_get_call(call, rxrpc_call_get_poke);
list_add_tail(&call->attend_link, &local->call_attend_q);
}
spin_unlock_bh(&local->lock);
- rxrpc_wake_up_io_thread(local);
+ if (!busy)
+ rxrpc_wake_up_io_thread(local);
}
}
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 44414e724415..95f4bc206b3d 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -163,7 +163,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
trace_rxrpc_tx_ack(chan->call_debug_id, serial,
ntohl(pkt.ack.firstPacket),
ntohl(pkt.ack.serial),
- pkt.ack.reason, 0);
+ pkt.ack.reason, 0, rxrpc_rx_window_size);
break;
default:
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 6b2022240076..5e53429c6922 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -80,7 +80,8 @@ static void rxrpc_set_keepalive(struct rxrpc_call *call)
*/
static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn,
struct rxrpc_call *call,
- struct rxrpc_txbuf *txb)
+ struct rxrpc_txbuf *txb,
+ u16 *_rwind)
{
struct rxrpc_ackinfo ackinfo;
unsigned int qsize, sack, wrap, to;
@@ -124,6 +125,7 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn,
jmax = rxrpc_rx_jumbo_max;
qsize = (window - 1) - call->rx_consumed;
rsize = max_t(int, call->rx_winsize - qsize, 0);
+ *_rwind = rsize;
ackinfo.rxMTU = htonl(rxrpc_rx_mtu);
ackinfo.maxMTU = htonl(mtu);
ackinfo.rwind = htonl(rsize);
@@ -190,6 +192,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
rxrpc_serial_t serial;
size_t len, n;
int ret, rtt_slot = -1;
+ u16 rwind;
if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
return -ECONNRESET;
@@ -205,7 +208,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
if (txb->ack.reason == RXRPC_ACK_PING)
txb->wire.flags |= RXRPC_REQUEST_ACK;
- n = rxrpc_fill_out_ack(conn, call, txb);
+ n = rxrpc_fill_out_ack(conn, call, txb, &rwind);
if (n == 0)
return 0;
@@ -217,7 +220,8 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
txb->wire.serial = htonl(serial);
trace_rxrpc_tx_ack(call->debug_id, serial,
ntohl(txb->ack.firstPacket),
- ntohl(txb->ack.serial), txb->ack.reason, txb->ack.nAcks);
+ ntohl(txb->ack.serial), txb->ack.reason, txb->ack.nAcks,
+ rwind);
if (txb->ack.reason == RXRPC_ACK_PING)
rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_ping);
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 50d263a6359d..76eb2b9cd936 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -137,7 +137,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call)
/* Check to see if there's an ACK that needs sending. */
acked = atomic_add_return(call->rx_consumed - old_consumed,
&call->ackr_nr_consumed);
- if (acked > 2 &&
+ if (acked > 8 &&
!test_and_set_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags))
rxrpc_poke_call(call, rxrpc_call_poke_idle);
}
diff --git a/net/rxrpc/skbuff.c b/net/rxrpc/skbuff.c
index 944320e65ea8..3bcd6ee80396 100644
--- a/net/rxrpc/skbuff.c
+++ b/net/rxrpc/skbuff.c
@@ -63,7 +63,7 @@ void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace why)
if (skb) {
int n = atomic_dec_return(select_skb_count(skb));
trace_rxrpc_skb(skb, refcount_read(&skb->users), n, why);
- kfree_skb_reason(skb, SKB_CONSUMED);
+ consume_skb(skb);
}
}
@@ -78,6 +78,6 @@ void rxrpc_purge_queue(struct sk_buff_head *list)
int n = atomic_dec_return(select_skb_count(skb));
trace_rxrpc_skb(skb, refcount_read(&skb->users), n,
rxrpc_skb_put_purge);
- kfree_skb_reason(skb, SKB_CONSUMED);
+ consume_skb(skb);
}
}
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index de18a0dda6df..4f7b52f5a11c 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -195,8 +195,14 @@ config NET_SCH_ETF
To compile this code as a module, choose M here: the
module will be called sch_etf.
+config NET_SCH_MQPRIO_LIB
+ tristate
+ help
+ Common library for manipulating mqprio queue configurations.
+
config NET_SCH_TAPRIO
tristate "Time Aware Priority (taprio) Scheduler"
+ select NET_SCH_MQPRIO_LIB
help
Say Y here if you want to use the Time Aware Priority (taprio) packet
scheduling algorithm.
@@ -253,6 +259,7 @@ config NET_SCH_DRR
config NET_SCH_MQPRIO
tristate "Multi-queue priority scheduler (MQPRIO)"
+ select NET_SCH_MQPRIO_LIB
help
Say Y here if you want to use the Multi-queue Priority scheduler.
This scheduler allows QOS to be offloaded on NICs that have support
@@ -977,6 +984,7 @@ config NET_ACT_TUNNEL_KEY
config NET_ACT_CT
tristate "connection tracking tc action"
depends on NET_CLS_ACT && NF_CONNTRACK && (!NF_NAT || NF_NAT) && NF_FLOW_TABLE
+ select NF_CONNTRACK_OVS
select NF_NAT_OVS if NF_NAT
help
Say Y here to allow sending the packets to conntrack module.
diff --git a/net/sched/Makefile b/net/sched/Makefile
index dd14ef413fda..7911eec09837 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -52,6 +52,7 @@ obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
obj-$(CONFIG_NET_SCH_ETS) += sch_ets.o
obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
+obj-$(CONFIG_NET_SCH_MQPRIO_LIB) += sch_mqprio_lib.o
obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o
obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index b126f03c1bb6..9cc0bc7c71ed 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -726,31 +726,6 @@ drop_ct:
return false;
}
-/* Trim the skb to the length specified by the IP/IPv6 header,
- * removing any trailing lower-layer padding. This prepares the skb
- * for higher-layer processing that assumes skb->len excludes padding
- * (such as nf_ip_checksum). The caller needs to pull the skb to the
- * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
- */
-static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family)
-{
- unsigned int len;
-
- switch (family) {
- case NFPROTO_IPV4:
- len = skb_ip_totlen(skb);
- break;
- case NFPROTO_IPV6:
- len = sizeof(struct ipv6hdr)
- + ntohs(ipv6_hdr(skb)->payload_len);
- break;
- default:
- len = skb->len;
- }
-
- return pskb_trim_rcsum(skb, len);
-}
-
static u8 tcf_ct_skb_nf_family(struct sk_buff *skb)
{
u8 family = NFPROTO_UNSPEC;
@@ -810,6 +785,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
struct nf_conn *ct;
int err = 0;
bool frag;
+ u8 proto;
u16 mru;
/* Previously seen (loopback)? Ignore. */
@@ -825,50 +801,14 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
return err;
skb_get(skb);
- mru = tc_skb_cb(skb)->mru;
-
- if (family == NFPROTO_IPV4) {
- enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
-
- memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
- local_bh_disable();
- err = ip_defrag(net, skb, user);
- local_bh_enable();
- if (err && err != -EINPROGRESS)
- return err;
-
- if (!err) {
- *defrag = true;
- mru = IPCB(skb)->frag_max_size;
- }
- } else { /* NFPROTO_IPV6 */
-#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
- enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
-
- memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
- err = nf_ct_frag6_gather(net, skb, user);
- if (err && err != -EINPROGRESS)
- goto out_free;
-
- if (!err) {
- *defrag = true;
- mru = IP6CB(skb)->frag_max_size;
- }
-#else
- err = -EOPNOTSUPP;
- goto out_free;
-#endif
- }
+ err = nf_ct_handle_fragments(net, skb, zone, family, &proto, &mru);
+ if (err)
+ return err;
- if (err != -EINPROGRESS)
- tc_skb_cb(skb)->mru = mru;
- skb_clear_hash(skb);
- skb->ignore_df = 1;
- return err;
+ *defrag = true;
+ tc_skb_cb(skb)->mru = mru;
-out_free:
- kfree_skb(skb);
- return err;
+ return 0;
}
static void tcf_ct_params_free(struct tcf_ct_params *params)
@@ -1011,7 +951,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
if (err)
goto drop;
- err = tcf_ct_skb_network_trim(skb, family);
+ err = nf_ct_skb_network_trim(skb, family);
if (err)
goto drop;
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index c14018a8052c..e9780631b5b5 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1282,12 +1282,6 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
if (err)
goto err_out3;
- if (ops->init) {
- err = ops->init(sch, tca[TCA_OPTIONS], extack);
- if (err != 0)
- goto err_out5;
- }
-
if (tca[TCA_STAB]) {
stab = qdisc_get_stab(tca[TCA_STAB], extack);
if (IS_ERR(stab)) {
@@ -1296,11 +1290,18 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
}
rcu_assign_pointer(sch->stab, stab);
}
+
+ if (ops->init) {
+ err = ops->init(sch, tca[TCA_OPTIONS], extack);
+ if (err != 0)
+ goto err_out5;
+ }
+
if (tca[TCA_RATE]) {
err = -EOPNOTSUPP;
if (sch->flags & TCQ_F_MQROOT) {
NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
- goto err_out4;
+ goto err_out5;
}
err = gen_new_estimator(&sch->bstats,
@@ -1311,7 +1312,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
tca[TCA_RATE]);
if (err) {
NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
- goto err_out4;
+ goto err_out5;
}
}
@@ -1321,6 +1322,8 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
return sch;
err_out5:
+ qdisc_put_stab(rtnl_dereference(sch->stab));
+err_out4:
/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
if (ops->destroy)
ops->destroy(sch);
@@ -1332,16 +1335,6 @@ err_out2:
err_out:
*errp = err;
return NULL;
-
-err_out4:
- /*
- * Any broken qdiscs that would require a ops->reset() here?
- * The qdisc was never in action so it shouldn't be necessary.
- */
- qdisc_put_stab(rtnl_dereference(sch->stab));
- if (ops->destroy)
- ops->destroy(sch);
- goto err_out3;
}
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index cc28e41fb745..92f2975b6a82 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -433,7 +433,7 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
while (m) {
unsigned int prio = ffz(~m);
- if (WARN_ON_ONCE(prio > ARRAY_SIZE(p->inner.clprio)))
+ if (WARN_ON_ONCE(prio >= ARRAY_SIZE(p->inner.clprio)))
break;
m &= ~(1 << prio);
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 4c68abaa289b..48ed87b91086 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -17,6 +17,8 @@
#include <net/sch_generic.h>
#include <net/pkt_cls.h>
+#include "sch_mqprio_lib.h"
+
struct mqprio_sched {
struct Qdisc **qdiscs;
u16 mode;
@@ -27,6 +29,62 @@ struct mqprio_sched {
u64 max_rate[TC_QOPT_MAX_QUEUE];
};
+static int mqprio_enable_offload(struct Qdisc *sch,
+ const struct tc_mqprio_qopt *qopt,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt};
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ int err, i;
+
+ switch (priv->mode) {
+ case TC_MQPRIO_MODE_DCB:
+ if (priv->shaper != TC_MQPRIO_SHAPER_DCB)
+ return -EINVAL;
+ break;
+ case TC_MQPRIO_MODE_CHANNEL:
+ mqprio.flags = priv->flags;
+ if (priv->flags & TC_MQPRIO_F_MODE)
+ mqprio.mode = priv->mode;
+ if (priv->flags & TC_MQPRIO_F_SHAPER)
+ mqprio.shaper = priv->shaper;
+ if (priv->flags & TC_MQPRIO_F_MIN_RATE)
+ for (i = 0; i < mqprio.qopt.num_tc; i++)
+ mqprio.min_rate[i] = priv->min_rate[i];
+ if (priv->flags & TC_MQPRIO_F_MAX_RATE)
+ for (i = 0; i < mqprio.qopt.num_tc; i++)
+ mqprio.max_rate[i] = priv->max_rate[i];
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO,
+ &mqprio);
+ if (err)
+ return err;
+
+ priv->hw_offload = mqprio.qopt.hw;
+
+ return 0;
+}
+
+static void mqprio_disable_offload(struct Qdisc *sch)
+{
+ struct tc_mqprio_qopt_offload mqprio = { { 0 } };
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+
+ switch (priv->mode) {
+ case TC_MQPRIO_MODE_DCB:
+ case TC_MQPRIO_MODE_CHANNEL:
+ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO,
+ &mqprio);
+ break;
+ }
+}
+
static void mqprio_destroy(struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
@@ -41,37 +99,17 @@ static void mqprio_destroy(struct Qdisc *sch)
kfree(priv->qdiscs);
}
- if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) {
- struct tc_mqprio_qopt_offload mqprio = { { 0 } };
-
- switch (priv->mode) {
- case TC_MQPRIO_MODE_DCB:
- case TC_MQPRIO_MODE_CHANNEL:
- dev->netdev_ops->ndo_setup_tc(dev,
- TC_SETUP_QDISC_MQPRIO,
- &mqprio);
- break;
- default:
- return;
- }
- } else {
+ if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc)
+ mqprio_disable_offload(sch);
+ else
netdev_set_num_tc(dev, 0);
- }
}
-static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
+static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
+ const struct tc_mqprio_caps *caps,
+ struct netlink_ext_ack *extack)
{
- int i, j;
-
- /* Verify num_tc is not out of max range */
- if (qopt->num_tc > TC_MAX_QUEUE)
- return -EINVAL;
-
- /* Verify priority mapping uses valid tcs */
- for (i = 0; i < TC_BITMASK + 1; i++) {
- if (qopt->prio_tc_map[i] >= qopt->num_tc)
- return -EINVAL;
- }
+ int err;
/* Limit qopt->hw to maximum supported offload value. Drivers have
* the option of overriding this later if they don't support the a
@@ -80,31 +118,23 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
if (qopt->hw > TC_MQPRIO_HW_OFFLOAD_MAX)
qopt->hw = TC_MQPRIO_HW_OFFLOAD_MAX;
- /* If hardware offload is requested we will leave it to the device
- * to either populate the queue counts itself or to validate the
- * provided queue counts. If ndo_setup_tc is not present then
- * hardware doesn't support offload and we should return an error.
+ /* If hardware offload is requested, we will leave 3 options to the
+ * device driver:
+ * - populate the queue counts itself (and ignore what was requested)
+ * - validate the provided queue counts by itself (and apply them)
+ * - request queue count validation here (and apply them)
*/
- if (qopt->hw)
- return dev->netdev_ops->ndo_setup_tc ? 0 : -EINVAL;
-
- for (i = 0; i < qopt->num_tc; i++) {
- unsigned int last = qopt->offset[i] + qopt->count[i];
-
- /* Verify the queue count is in tx range being equal to the
- * real_num_tx_queues indicates the last queue is in use.
- */
- if (qopt->offset[i] >= dev->real_num_tx_queues ||
- !qopt->count[i] ||
- last > dev->real_num_tx_queues)
- return -EINVAL;
-
- /* Verify that the offset and counts do not overlap */
- for (j = i + 1; j < qopt->num_tc; j++) {
- if (last > qopt->offset[j])
- return -EINVAL;
- }
- }
+ err = mqprio_validate_qopt(dev, qopt,
+ !qopt->hw || caps->validate_queue_counts,
+ false, extack);
+ if (err)
+ return err;
+
+ /* If ndo_setup_tc is not present then hardware doesn't support offload
+ * and we should return an error.
+ */
+ if (qopt->hw && !dev->netdev_ops->ndo_setup_tc)
+ return -EINVAL;
return 0;
}
@@ -130,6 +160,67 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
return 0;
}
+static int mqprio_parse_nlattr(struct Qdisc *sch, struct tc_mqprio_qopt *qopt,
+ struct nlattr *opt)
+{
+ struct mqprio_sched *priv = qdisc_priv(sch);
+ struct nlattr *tb[TCA_MQPRIO_MAX + 1];
+ struct nlattr *attr;
+ int i, rem, err;
+
+ err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy,
+ sizeof(*qopt));
+ if (err < 0)
+ return err;
+
+ if (!qopt->hw)
+ return -EINVAL;
+
+ if (tb[TCA_MQPRIO_MODE]) {
+ priv->flags |= TC_MQPRIO_F_MODE;
+ priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]);
+ }
+
+ if (tb[TCA_MQPRIO_SHAPER]) {
+ priv->flags |= TC_MQPRIO_F_SHAPER;
+ priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]);
+ }
+
+ if (tb[TCA_MQPRIO_MIN_RATE64]) {
+ if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
+ return -EINVAL;
+ i = 0;
+ nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64],
+ rem) {
+ if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64)
+ return -EINVAL;
+ if (i >= qopt->num_tc)
+ break;
+ priv->min_rate[i] = *(u64 *)nla_data(attr);
+ i++;
+ }
+ priv->flags |= TC_MQPRIO_F_MIN_RATE;
+ }
+
+ if (tb[TCA_MQPRIO_MAX_RATE64]) {
+ if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
+ return -EINVAL;
+ i = 0;
+ nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64],
+ rem) {
+ if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64)
+ return -EINVAL;
+ if (i >= qopt->num_tc)
+ break;
+ priv->max_rate[i] = *(u64 *)nla_data(attr);
+ i++;
+ }
+ priv->flags |= TC_MQPRIO_F_MAX_RATE;
+ }
+
+ return 0;
+}
+
static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -139,9 +230,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
struct Qdisc *qdisc;
int i, err = -EOPNOTSUPP;
struct tc_mqprio_qopt *qopt = NULL;
- struct nlattr *tb[TCA_MQPRIO_MAX + 1];
- struct nlattr *attr;
- int rem;
+ struct tc_mqprio_caps caps;
int len;
BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
@@ -160,61 +249,18 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
if (!opt || nla_len(opt) < sizeof(*qopt))
return -EINVAL;
+ qdisc_offload_query_caps(dev, TC_SETUP_QDISC_MQPRIO,
+ &caps, sizeof(caps));
+
qopt = nla_data(opt);
- if (mqprio_parse_opt(dev, qopt))
+ if (mqprio_parse_opt(dev, qopt, &caps, extack))
return -EINVAL;
len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt));
if (len > 0) {
- err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy,
- sizeof(*qopt));
- if (err < 0)
+ err = mqprio_parse_nlattr(sch, qopt, opt);
+ if (err)
return err;
-
- if (!qopt->hw)
- return -EINVAL;
-
- if (tb[TCA_MQPRIO_MODE]) {
- priv->flags |= TC_MQPRIO_F_MODE;
- priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]);
- }
-
- if (tb[TCA_MQPRIO_SHAPER]) {
- priv->flags |= TC_MQPRIO_F_SHAPER;
- priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]);
- }
-
- if (tb[TCA_MQPRIO_MIN_RATE64]) {
- if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
- return -EINVAL;
- i = 0;
- nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64],
- rem) {
- if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64)
- return -EINVAL;
- if (i >= qopt->num_tc)
- break;
- priv->min_rate[i] = *(u64 *)nla_data(attr);
- i++;
- }
- priv->flags |= TC_MQPRIO_F_MIN_RATE;
- }
-
- if (tb[TCA_MQPRIO_MAX_RATE64]) {
- if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE)
- return -EINVAL;
- i = 0;
- nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64],
- rem) {
- if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64)
- return -EINVAL;
- if (i >= qopt->num_tc)
- break;
- priv->max_rate[i] = *(u64 *)nla_data(attr);
- i++;
- }
- priv->flags |= TC_MQPRIO_F_MAX_RATE;
- }
}
/* pre-allocate qdisc, attachment can't fail */
@@ -241,36 +287,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt,
* supplied and verified mapping
*/
if (qopt->hw) {
- struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt};
-
- switch (priv->mode) {
- case TC_MQPRIO_MODE_DCB:
- if (priv->shaper != TC_MQPRIO_SHAPER_DCB)
- return -EINVAL;
- break;
- case TC_MQPRIO_MODE_CHANNEL:
- mqprio.flags = priv->flags;
- if (priv->flags & TC_MQPRIO_F_MODE)
- mqprio.mode = priv->mode;
- if (priv->flags & TC_MQPRIO_F_SHAPER)
- mqprio.shaper = priv->shaper;
- if (priv->flags & TC_MQPRIO_F_MIN_RATE)
- for (i = 0; i < mqprio.qopt.num_tc; i++)
- mqprio.min_rate[i] = priv->min_rate[i];
- if (priv->flags & TC_MQPRIO_F_MAX_RATE)
- for (i = 0; i < mqprio.qopt.num_tc; i++)
- mqprio.max_rate[i] = priv->max_rate[i];
- break;
- default:
- return -EINVAL;
- }
- err = dev->netdev_ops->ndo_setup_tc(dev,
- TC_SETUP_QDISC_MQPRIO,
- &mqprio);
+ err = mqprio_enable_offload(sch, qopt, extack);
if (err)
return err;
-
- priv->hw_offload = mqprio.qopt.hw;
} else {
netdev_set_num_tc(dev, qopt->num_tc);
for (i = 0; i < qopt->num_tc; i++)
@@ -387,7 +406,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb);
struct tc_mqprio_qopt opt = { 0 };
struct Qdisc *qdisc;
- unsigned int ntx, tc;
+ unsigned int ntx;
sch->q.qlen = 0;
gnet_stats_basic_sync_init(&sch->bstats);
@@ -411,15 +430,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
spin_unlock_bh(qdisc_lock(qdisc));
}
- opt.num_tc = netdev_get_num_tc(dev);
- memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
+ mqprio_qopt_reconstruct(dev, &opt);
opt.hw = priv->hw_offload;
- for (tc = 0; tc < netdev_get_num_tc(dev); tc++) {
- opt.count[tc] = dev->tc_to_txq[tc].count;
- opt.offset[tc] = dev->tc_to_txq[tc].offset;
- }
-
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
goto nla_put_failure;
diff --git a/net/sched/sch_mqprio_lib.c b/net/sched/sch_mqprio_lib.c
new file mode 100644
index 000000000000..c58a533b8ec5
--- /dev/null
+++ b/net/sched/sch_mqprio_lib.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/types.h>
+#include <net/pkt_sched.h>
+
+#include "sch_mqprio_lib.h"
+
+/* Returns true if the intervals [a, b) and [c, d) overlap. */
+static bool intervals_overlap(int a, int b, int c, int d)
+{
+ int left = max(a, c), right = min(b, d);
+
+ return left < right;
+}
+
+static int mqprio_validate_queue_counts(struct net_device *dev,
+ const struct tc_mqprio_qopt *qopt,
+ bool allow_overlapping_txqs,
+ struct netlink_ext_ack *extack)
+{
+ int i, j;
+
+ for (i = 0; i < qopt->num_tc; i++) {
+ unsigned int last = qopt->offset[i] + qopt->count[i];
+
+ if (!qopt->count[i]) {
+ NL_SET_ERR_MSG_FMT_MOD(extack, "No queues for TC %d",
+ i);
+ return -EINVAL;
+ }
+
+ /* Verify the queue count is in tx range being equal to the
+ * real_num_tx_queues indicates the last queue is in use.
+ */
+ if (qopt->offset[i] >= dev->real_num_tx_queues ||
+ last > dev->real_num_tx_queues) {
+ NL_SET_ERR_MSG_FMT_MOD(extack,
+ "Queues %d:%d for TC %d exceed the %d TX queues available",
+ qopt->count[i], qopt->offset[i],
+ i, dev->real_num_tx_queues);
+ return -EINVAL;
+ }
+
+ if (allow_overlapping_txqs)
+ continue;
+
+ /* Verify that the offset and counts do not overlap */
+ for (j = i + 1; j < qopt->num_tc; j++) {
+ if (intervals_overlap(qopt->offset[i], last,
+ qopt->offset[j],
+ qopt->offset[j] +
+ qopt->count[j])) {
+ NL_SET_ERR_MSG_FMT_MOD(extack,
+ "TC %d queues %d@%d overlap with TC %d queues %d@%d",
+ i, qopt->count[i], qopt->offset[i],
+ j, qopt->count[j], qopt->offset[j]);
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
+ bool validate_queue_counts,
+ bool allow_overlapping_txqs,
+ struct netlink_ext_ack *extack)
+{
+ int i, err;
+
+ /* Verify num_tc is not out of max range */
+ if (qopt->num_tc > TC_MAX_QUEUE) {
+ NL_SET_ERR_MSG(extack,
+ "Number of traffic classes is outside valid range");
+ return -EINVAL;
+ }
+
+ /* Verify priority mapping uses valid tcs */
+ for (i = 0; i <= TC_BITMASK; i++) {
+ if (qopt->prio_tc_map[i] >= qopt->num_tc) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid traffic class in priority to traffic class mapping");
+ return -EINVAL;
+ }
+ }
+
+ if (validate_queue_counts) {
+ err = mqprio_validate_queue_counts(dev, qopt,
+ allow_overlapping_txqs,
+ extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mqprio_validate_qopt);
+
+void mqprio_qopt_reconstruct(struct net_device *dev, struct tc_mqprio_qopt *qopt)
+{
+ int tc, num_tc = netdev_get_num_tc(dev);
+
+ qopt->num_tc = num_tc;
+ memcpy(qopt->prio_tc_map, dev->prio_tc_map, sizeof(qopt->prio_tc_map));
+
+ for (tc = 0; tc < num_tc; tc++) {
+ qopt->count[tc] = dev->tc_to_txq[tc].count;
+ qopt->offset[tc] = dev->tc_to_txq[tc].offset;
+ }
+}
+EXPORT_SYMBOL_GPL(mqprio_qopt_reconstruct);
+
+MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_mqprio_lib.h b/net/sched/sch_mqprio_lib.h
new file mode 100644
index 000000000000..63f725ab8761
--- /dev/null
+++ b/net/sched/sch_mqprio_lib.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __SCH_MQPRIO_LIB_H
+#define __SCH_MQPRIO_LIB_H
+
+#include <linux/types.h>
+
+struct net_device;
+struct netlink_ext_ack;
+struct tc_mqprio_qopt;
+
+int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt,
+ bool validate_queue_counts,
+ bool allow_overlapping_txqs,
+ struct netlink_ext_ack *extack);
+void mqprio_qopt_reconstruct(struct net_device *dev,
+ struct tc_mqprio_qopt *qopt);
+
+#endif
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index c322a61eaeea..9781b47962bb 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -26,7 +26,11 @@
#include <net/sock.h>
#include <net/tcp.h>
+#include "sch_mqprio_lib.h"
+
static LIST_HEAD(taprio_list);
+static struct static_key_false taprio_have_broken_mqprio;
+static struct static_key_false taprio_have_working_mqprio;
#define TAPRIO_ALL_GATES_OPEN -1
@@ -35,15 +39,19 @@ static LIST_HEAD(taprio_list);
#define TAPRIO_FLAGS_INVALID U32_MAX
struct sched_entry {
- struct list_head list;
-
- /* The instant that this entry "closes" and the next one
- * should open, the qdisc will make some effort so that no
- * packet leaves after this time.
+ /* Durations between this GCL entry and the GCL entry where the
+ * respective traffic class gate closes
*/
- ktime_t close_time;
+ u64 gate_duration[TC_MAX_QUEUE];
+ atomic_t budget[TC_MAX_QUEUE];
+ /* The qdisc makes some effort so that no packet leaves
+ * after this time
+ */
+ ktime_t gate_close_time[TC_MAX_QUEUE];
+ struct list_head list;
+ /* Used to calculate when to advance the schedule */
+ ktime_t end_time;
ktime_t next_txtime;
- atomic_t budget;
int index;
u32 gate_mask;
u32 interval;
@@ -51,10 +59,16 @@ struct sched_entry {
};
struct sched_gate_list {
+ /* Longest non-zero contiguous gate durations per traffic class,
+ * or 0 if a traffic class gate never opens during the schedule.
+ */
+ u64 max_open_gate_duration[TC_MAX_QUEUE];
+ u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */
+ u32 max_sdu[TC_MAX_QUEUE]; /* for dump */
struct rcu_head rcu;
struct list_head entries;
size_t num_entries;
- ktime_t cycle_close_time;
+ ktime_t cycle_end_time;
s64 cycle_time;
s64 cycle_time_extension;
s64 base_time;
@@ -67,6 +81,8 @@ struct taprio_sched {
enum tk_offsets tk_offset;
int clockid;
bool offloaded;
+ bool detected_mqprio;
+ bool broken_mqprio;
atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+
* speeds it's sub-nanoseconds per byte
*/
@@ -78,8 +94,8 @@ struct taprio_sched {
struct sched_gate_list __rcu *admin_sched;
struct hrtimer advance_timer;
struct list_head taprio_list;
- u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */
- u32 max_sdu[TC_MAX_QUEUE]; /* for dump and offloading */
+ int cur_txq[TC_MAX_QUEUE];
+ u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */
u32 txtime_delay;
};
@@ -88,6 +104,57 @@ struct __tc_taprio_qopt_offload {
struct tc_taprio_qopt_offload offload;
};
+static void taprio_calculate_gate_durations(struct taprio_sched *q,
+ struct sched_gate_list *sched)
+{
+ struct net_device *dev = qdisc_dev(q->root);
+ int num_tc = netdev_get_num_tc(dev);
+ struct sched_entry *entry, *cur;
+ int tc;
+
+ list_for_each_entry(entry, &sched->entries, list) {
+ u32 gates_still_open = entry->gate_mask;
+
+ /* For each traffic class, calculate each open gate duration,
+ * starting at this schedule entry and ending at the schedule
+ * entry containing a gate close event for that TC.
+ */
+ cur = entry;
+
+ do {
+ if (!gates_still_open)
+ break;
+
+ for (tc = 0; tc < num_tc; tc++) {
+ if (!(gates_still_open & BIT(tc)))
+ continue;
+
+ if (cur->gate_mask & BIT(tc))
+ entry->gate_duration[tc] += cur->interval;
+ else
+ gates_still_open &= ~BIT(tc);
+ }
+
+ cur = list_next_entry_circular(cur, &sched->entries, list);
+ } while (cur != entry);
+
+ /* Keep track of the maximum gate duration for each traffic
+ * class, taking care to not confuse a traffic class which is
+ * temporarily closed with one that is always closed.
+ */
+ for (tc = 0; tc < num_tc; tc++)
+ if (entry->gate_duration[tc] &&
+ sched->max_open_gate_duration[tc] < entry->gate_duration[tc])
+ sched->max_open_gate_duration[tc] = entry->gate_duration[tc];
+ }
+}
+
+static bool taprio_entry_allows_tx(ktime_t skb_end_time,
+ struct sched_entry *entry, int tc)
+{
+ return ktime_before(skb_end_time, entry->gate_close_time[tc]);
+}
+
static ktime_t sched_base_time(const struct sched_gate_list *sched)
{
if (!sched)
@@ -180,6 +247,55 @@ static int length_to_duration(struct taprio_sched *q, int len)
return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC);
}
+static int duration_to_length(struct taprio_sched *q, u64 duration)
+{
+ return div_u64(duration * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte));
+}
+
+/* Sets sched->max_sdu[] and sched->max_frm_len[] to the minimum between the
+ * q->max_sdu[] requested by the user and the max_sdu dynamically determined by
+ * the maximum open gate durations at the given link speed.
+ */
+static void taprio_update_queue_max_sdu(struct taprio_sched *q,
+ struct sched_gate_list *sched,
+ struct qdisc_size_table *stab)
+{
+ struct net_device *dev = qdisc_dev(q->root);
+ int num_tc = netdev_get_num_tc(dev);
+ u32 max_sdu_from_user;
+ u32 max_sdu_dynamic;
+ u32 max_sdu;
+ int tc;
+
+ for (tc = 0; tc < num_tc; tc++) {
+ max_sdu_from_user = q->max_sdu[tc] ?: U32_MAX;
+
+ /* TC gate never closes => keep the queueMaxSDU
+ * selected by the user
+ */
+ if (sched->max_open_gate_duration[tc] == sched->cycle_time) {
+ max_sdu_dynamic = U32_MAX;
+ } else {
+ u32 max_frm_len;
+
+ max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]);
+ if (stab)
+ max_frm_len -= stab->szopts.overhead;
+ max_sdu_dynamic = max_frm_len - dev->hard_header_len;
+ }
+
+ max_sdu = min(max_sdu_dynamic, max_sdu_from_user);
+
+ if (max_sdu != U32_MAX) {
+ sched->max_frm_len[tc] = max_sdu + dev->hard_header_len;
+ sched->max_sdu[tc] = max_sdu;
+ } else {
+ sched->max_frm_len[tc] = U32_MAX; /* never oversized */
+ sched->max_sdu[tc] = 0;
+ }
+ }
+}
+
/* Returns the entry corresponding to next available interval. If
* validate_interval is set, it only validates whether the timestamp occurs
* when the gate corresponding to the skb's traffic class is open.
@@ -413,14 +529,33 @@ done:
return txtime;
}
-static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
- struct Qdisc *child, struct sk_buff **to_free)
+/* Devices with full offload are expected to honor this in hardware */
+static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch,
+ struct sk_buff *skb)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
+ struct sched_gate_list *sched;
int prio = skb->priority;
+ bool exceeds = false;
u8 tc;
+ tc = netdev_get_prio_tc_map(dev, prio);
+
+ rcu_read_lock();
+ sched = rcu_dereference(q->oper_sched);
+ if (sched && skb->len > sched->max_frm_len[tc])
+ exceeds = true;
+ rcu_read_unlock();
+
+ return exceeds;
+}
+
+static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
+ struct Qdisc *child, struct sk_buff **to_free)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+
/* sk_flags are only safe to use on full sockets. */
if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) {
if (!is_valid_interval(skb, sch))
@@ -431,17 +566,53 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
return qdisc_drop(skb, sch, to_free);
}
- /* Devices with full offload are expected to honor this in hardware */
- tc = netdev_get_prio_tc_map(dev, prio);
- if (skb->len > q->max_frm_len[tc])
- return qdisc_drop(skb, sch, to_free);
-
qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return qdisc_enqueue(skb, child, to_free);
}
+static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch,
+ struct Qdisc *child,
+ struct sk_buff **to_free)
+{
+ unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb);
+ netdev_features_t features = netif_skb_features(skb);
+ struct sk_buff *segs, *nskb;
+ int ret;
+
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR_OR_NULL(segs))
+ return qdisc_drop(skb, sch, to_free);
+
+ skb_list_walk_safe(segs, segs, nskb) {
+ skb_mark_not_on_list(segs);
+ qdisc_skb_cb(segs)->pkt_len = segs->len;
+ slen += segs->len;
+
+ /* FIXME: we should be segmenting to a smaller size
+ * rather than dropping these
+ */
+ if (taprio_skb_exceeds_queue_max_sdu(sch, segs))
+ ret = qdisc_drop(segs, sch, to_free);
+ else
+ ret = taprio_enqueue_one(segs, sch, child, to_free);
+
+ if (ret != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(ret))
+ qdisc_qstats_drop(sch);
+ } else {
+ numsegs++;
+ }
+ }
+
+ if (numsegs > 1)
+ qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen);
+ consume_skb(skb);
+
+ return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
+}
+
/* Will not be called in the full offload case, since the TX queues are
* attached to the Qdisc created using qdisc_create_dflt()
*/
@@ -458,97 +629,190 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (unlikely(!child))
return qdisc_drop(skb, sch, to_free);
- /* Large packets might not be transmitted when the transmission duration
- * exceeds any configured interval. Therefore, segment the skb into
- * smaller chunks. Drivers with full offload are expected to handle
- * this in hardware.
- */
- if (skb_is_gso(skb)) {
- unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb);
- netdev_features_t features = netif_skb_features(skb);
- struct sk_buff *segs, *nskb;
- int ret;
-
- segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
- if (IS_ERR_OR_NULL(segs))
- return qdisc_drop(skb, sch, to_free);
+ if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) {
+ /* Large packets might not be transmitted when the transmission
+ * duration exceeds any configured interval. Therefore, segment
+ * the skb into smaller chunks. Drivers with full offload are
+ * expected to handle this in hardware.
+ */
+ if (skb_is_gso(skb))
+ return taprio_enqueue_segmented(skb, sch, child,
+ to_free);
- skb_list_walk_safe(segs, segs, nskb) {
- skb_mark_not_on_list(segs);
- qdisc_skb_cb(segs)->pkt_len = segs->len;
- slen += segs->len;
+ return qdisc_drop(skb, sch, to_free);
+ }
- ret = taprio_enqueue_one(segs, sch, child, to_free);
- if (ret != NET_XMIT_SUCCESS) {
- if (net_xmit_drop_count(ret))
- qdisc_qstats_drop(sch);
- } else {
- numsegs++;
- }
- }
+ return taprio_enqueue_one(skb, sch, child, to_free);
+}
+
+static struct sk_buff *taprio_peek(struct Qdisc *sch)
+{
+ WARN_ONCE(1, "taprio only supports operating as root qdisc, peek() not implemented");
+ return NULL;
+}
+
+static void taprio_set_budgets(struct taprio_sched *q,
+ struct sched_gate_list *sched,
+ struct sched_entry *entry)
+{
+ struct net_device *dev = qdisc_dev(q->root);
+ int num_tc = netdev_get_num_tc(dev);
+ int tc, budget;
- if (numsegs > 1)
- qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen);
- consume_skb(skb);
+ for (tc = 0; tc < num_tc; tc++) {
+ /* Traffic classes which never close have infinite budget */
+ if (entry->gate_duration[tc] == sched->cycle_time)
+ budget = INT_MAX;
+ else
+ budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC,
+ atomic64_read(&q->picos_per_byte));
- return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
+ atomic_set(&entry->budget[tc], budget);
}
+}
- return taprio_enqueue_one(skb, sch, child, to_free);
+/* When an skb is sent, it consumes from the budget of all traffic classes */
+static int taprio_update_budgets(struct sched_entry *entry, size_t len,
+ int tc_consumed, int num_tc)
+{
+ int tc, budget, new_budget = 0;
+
+ for (tc = 0; tc < num_tc; tc++) {
+ budget = atomic_read(&entry->budget[tc]);
+ /* Don't consume from infinite budget */
+ if (budget == INT_MAX) {
+ if (tc == tc_consumed)
+ new_budget = budget;
+ continue;
+ }
+
+ if (tc == tc_consumed)
+ new_budget = atomic_sub_return(len, &entry->budget[tc]);
+ else
+ atomic_sub(len, &entry->budget[tc]);
+ }
+
+ return new_budget;
}
-/* Will not be called in the full offload case, since the TX queues are
- * attached to the Qdisc created using qdisc_create_dflt()
- */
-static struct sk_buff *taprio_peek(struct Qdisc *sch)
+static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq,
+ struct sched_entry *entry,
+ u32 gate_mask)
{
struct taprio_sched *q = qdisc_priv(sch);
struct net_device *dev = qdisc_dev(sch);
- struct sched_entry *entry;
+ struct Qdisc *child = q->qdiscs[txq];
+ int num_tc = netdev_get_num_tc(dev);
struct sk_buff *skb;
- u32 gate_mask;
- int i;
+ ktime_t guard;
+ int prio;
+ int len;
+ u8 tc;
- rcu_read_lock();
- entry = rcu_dereference(q->current_entry);
- gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
- rcu_read_unlock();
+ if (unlikely(!child))
+ return NULL;
- if (!gate_mask)
+ if (TXTIME_ASSIST_IS_ENABLED(q->flags))
+ goto skip_peek_checks;
+
+ skb = child->ops->peek(child);
+ if (!skb)
return NULL;
- for (i = 0; i < dev->num_tx_queues; i++) {
- struct Qdisc *child = q->qdiscs[i];
- int prio;
- u8 tc;
+ prio = skb->priority;
+ tc = netdev_get_prio_tc_map(dev, prio);
- if (unlikely(!child))
- continue;
+ if (!(gate_mask & BIT(tc)))
+ return NULL;
- skb = child->ops->peek(child);
- if (!skb)
- continue;
+ len = qdisc_pkt_len(skb);
+ guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len));
- if (TXTIME_ASSIST_IS_ENABLED(q->flags))
- return skb;
+ /* In the case that there's no gate entry, there's no
+ * guard band ...
+ */
+ if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
+ !taprio_entry_allows_tx(guard, entry, tc))
+ return NULL;
+
+ /* ... and no budget. */
+ if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
+ taprio_update_budgets(entry, len, tc, num_tc) < 0)
+ return NULL;
+
+skip_peek_checks:
+ skb = child->ops->dequeue(child);
+ if (unlikely(!skb))
+ return NULL;
+
+ qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
+ sch->q.qlen--;
+
+ return skb;
+}
+
+static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq)
+{
+ int offset = dev->tc_to_txq[tc].offset;
+ int count = dev->tc_to_txq[tc].count;
+
+ (*txq)++;
+ if (*txq == offset + count)
+ *txq = offset;
+}
- prio = skb->priority;
- tc = netdev_get_prio_tc_map(dev, prio);
+/* Prioritize higher traffic classes, and select among TXQs belonging to the
+ * same TC using round robin
+ */
+static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch,
+ struct sched_entry *entry,
+ u32 gate_mask)
+{
+ struct taprio_sched *q = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ int num_tc = netdev_get_num_tc(dev);
+ struct sk_buff *skb;
+ int tc;
+
+ for (tc = num_tc - 1; tc >= 0; tc--) {
+ int first_txq = q->cur_txq[tc];
if (!(gate_mask & BIT(tc)))
continue;
- return skb;
+ do {
+ skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc],
+ entry, gate_mask);
+
+ taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]);
+
+ if (skb)
+ return skb;
+ } while (q->cur_txq[tc] != first_txq);
}
return NULL;
}
-static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
+/* Broken way of prioritizing smaller TXQ indices and ignoring the traffic
+ * class other than to determine whether the gate is open or not
+ */
+static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch,
+ struct sched_entry *entry,
+ u32 gate_mask)
{
- atomic_set(&entry->budget,
- div64_u64((u64)entry->interval * PSEC_PER_NSEC,
- atomic64_read(&q->picos_per_byte)));
+ struct net_device *dev = qdisc_dev(sch);
+ struct sk_buff *skb;
+ int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask);
+ if (skb)
+ return skb;
+ }
+
+ return NULL;
}
/* Will not be called in the full offload case, since the TX queues are
@@ -557,11 +821,9 @@ static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry)
static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
{
struct taprio_sched *q = qdisc_priv(sch);
- struct net_device *dev = qdisc_dev(sch);
struct sk_buff *skb = NULL;
struct sched_entry *entry;
u32 gate_mask;
- int i;
rcu_read_lock();
entry = rcu_dereference(q->current_entry);
@@ -571,69 +833,23 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
* "AdminGateStates"
*/
gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN;
-
if (!gate_mask)
goto done;
- for (i = 0; i < dev->num_tx_queues; i++) {
- struct Qdisc *child = q->qdiscs[i];
- ktime_t guard;
- int prio;
- int len;
- u8 tc;
-
- if (unlikely(!child))
- continue;
-
- if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
- skb = child->ops->dequeue(child);
- if (!skb)
- continue;
- goto skb_found;
- }
-
- skb = child->ops->peek(child);
- if (!skb)
- continue;
-
- prio = skb->priority;
- tc = netdev_get_prio_tc_map(dev, prio);
-
- if (!(gate_mask & BIT(tc))) {
- skb = NULL;
- continue;
- }
-
- len = qdisc_pkt_len(skb);
- guard = ktime_add_ns(taprio_get_time(q),
- length_to_duration(q, len));
-
- /* In the case that there's no gate entry, there's no
- * guard band ...
- */
- if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
- ktime_after(guard, entry->close_time)) {
- skb = NULL;
- continue;
- }
-
- /* ... and no budget. */
- if (gate_mask != TAPRIO_ALL_GATES_OPEN &&
- atomic_sub_return(len, &entry->budget) < 0) {
- skb = NULL;
- continue;
- }
-
- skb = child->ops->dequeue(child);
- if (unlikely(!skb))
- goto done;
-
-skb_found:
- qdisc_bstats_update(sch, skb);
- qdisc_qstats_backlog_dec(sch, skb);
- sch->q.qlen--;
-
- goto done;
+ if (static_branch_unlikely(&taprio_have_broken_mqprio) &&
+ !static_branch_likely(&taprio_have_working_mqprio)) {
+ /* Single NIC kind which is broken */
+ skb = taprio_dequeue_txq_priority(sch, entry, gate_mask);
+ } else if (static_branch_likely(&taprio_have_working_mqprio) &&
+ !static_branch_unlikely(&taprio_have_broken_mqprio)) {
+ /* Single NIC kind which prioritizes properly */
+ skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
+ } else {
+ /* Mixed NIC kinds present in system, need dynamic testing */
+ if (q->broken_mqprio)
+ skb = taprio_dequeue_txq_priority(sch, entry, gate_mask);
+ else
+ skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
}
done:
@@ -648,7 +864,7 @@ static bool should_restart_cycle(const struct sched_gate_list *oper,
if (list_is_last(&entry->list, &oper->entries))
return true;
- if (ktime_compare(entry->close_time, oper->cycle_close_time) == 0)
+ if (ktime_compare(entry->end_time, oper->cycle_end_time) == 0)
return true;
return false;
@@ -656,7 +872,7 @@ static bool should_restart_cycle(const struct sched_gate_list *oper,
static bool should_change_schedules(const struct sched_gate_list *admin,
const struct sched_gate_list *oper,
- ktime_t close_time)
+ ktime_t end_time)
{
ktime_t next_base_time, extension_time;
@@ -665,18 +881,18 @@ static bool should_change_schedules(const struct sched_gate_list *admin,
next_base_time = sched_base_time(admin);
- /* This is the simple case, the close_time would fall after
+ /* This is the simple case, the end_time would fall after
* the next schedule base_time.
*/
- if (ktime_compare(next_base_time, close_time) <= 0)
+ if (ktime_compare(next_base_time, end_time) <= 0)
return true;
- /* This is the cycle_time_extension case, if the close_time
+ /* This is the cycle_time_extension case, if the end_time
* plus the amount that can be extended would fall after the
* next schedule base_time, we can extend the current schedule
* for that amount.
*/
- extension_time = ktime_add_ns(close_time, oper->cycle_time_extension);
+ extension_time = ktime_add_ns(end_time, oper->cycle_time_extension);
/* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about
* how precisely the extension should be made. So after
@@ -692,10 +908,13 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer)
{
struct taprio_sched *q = container_of(timer, struct taprio_sched,
advance_timer);
+ struct net_device *dev = qdisc_dev(q->root);
struct sched_gate_list *oper, *admin;
+ int num_tc = netdev_get_num_tc(dev);
struct sched_entry *entry, *next;
struct Qdisc *sch = q->root;
- ktime_t close_time;
+ ktime_t end_time;
+ int tc;
spin_lock(&q->current_entry_lock);
entry = rcu_dereference_protected(q->current_entry,
@@ -714,41 +933,49 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer)
* entry of all schedules are pre-calculated during the
* schedule initialization.
*/
- if (unlikely(!entry || entry->close_time == oper->base_time)) {
+ if (unlikely(!entry || entry->end_time == oper->base_time)) {
next = list_first_entry(&oper->entries, struct sched_entry,
list);
- close_time = next->close_time;
+ end_time = next->end_time;
goto first_run;
}
if (should_restart_cycle(oper, entry)) {
next = list_first_entry(&oper->entries, struct sched_entry,
list);
- oper->cycle_close_time = ktime_add_ns(oper->cycle_close_time,
- oper->cycle_time);
+ oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time,
+ oper->cycle_time);
} else {
next = list_next_entry(entry, list);
}
- close_time = ktime_add_ns(entry->close_time, next->interval);
- close_time = min_t(ktime_t, close_time, oper->cycle_close_time);
+ end_time = ktime_add_ns(entry->end_time, next->interval);
+ end_time = min_t(ktime_t, end_time, oper->cycle_end_time);
+
+ for (tc = 0; tc < num_tc; tc++) {
+ if (next->gate_duration[tc] == oper->cycle_time)
+ next->gate_close_time[tc] = KTIME_MAX;
+ else
+ next->gate_close_time[tc] = ktime_add_ns(entry->end_time,
+ next->gate_duration[tc]);
+ }
- if (should_change_schedules(admin, oper, close_time)) {
+ if (should_change_schedules(admin, oper, end_time)) {
/* Set things so the next time this runs, the new
* schedule runs.
*/
- close_time = sched_base_time(admin);
+ end_time = sched_base_time(admin);
switch_schedules(q, &admin, &oper);
}
- next->close_time = close_time;
- taprio_set_budget(q, next);
+ next->end_time = end_time;
+ taprio_set_budgets(q, oper, next);
first_run:
rcu_assign_pointer(q->current_entry, next);
spin_unlock(&q->current_entry_lock);
- hrtimer_set_expires(&q->advance_timer, close_time);
+ hrtimer_set_expires(&q->advance_timer, end_time);
rcu_read_lock();
__netif_schedule(sch);
@@ -916,6 +1143,8 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb,
new->cycle_time = cycle;
}
+ taprio_calculate_gate_durations(q, new);
+
return 0;
}
@@ -924,7 +1153,7 @@ static int taprio_parse_mqprio_opt(struct net_device *dev,
struct netlink_ext_ack *extack,
u32 taprio_flags)
{
- int i, j;
+ bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags);
if (!qopt && !dev->num_tc) {
NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary");
@@ -937,52 +1166,17 @@ static int taprio_parse_mqprio_opt(struct net_device *dev,
if (dev->num_tc)
return 0;
- /* Verify num_tc is not out of max range */
- if (qopt->num_tc > TC_MAX_QUEUE) {
- NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range");
- return -EINVAL;
- }
-
/* taprio imposes that traffic classes map 1:n to tx queues */
if (qopt->num_tc > dev->num_tx_queues) {
NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues");
return -EINVAL;
}
- /* Verify priority mapping uses valid tcs */
- for (i = 0; i <= TC_BITMASK; i++) {
- if (qopt->prio_tc_map[i] >= qopt->num_tc) {
- NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping");
- return -EINVAL;
- }
- }
-
- for (i = 0; i < qopt->num_tc; i++) {
- unsigned int last = qopt->offset[i] + qopt->count[i];
-
- /* Verify the queue count is in tx range being equal to the
- * real_num_tx_queues indicates the last queue is in use.
- */
- if (qopt->offset[i] >= dev->num_tx_queues ||
- !qopt->count[i] ||
- last > dev->real_num_tx_queues) {
- NL_SET_ERR_MSG(extack, "Invalid queue in traffic class to queue mapping");
- return -EINVAL;
- }
-
- if (TXTIME_ASSIST_IS_ENABLED(taprio_flags))
- continue;
-
- /* Verify that the offset and counts do not overlap */
- for (j = i + 1; j < qopt->num_tc; j++) {
- if (last > qopt->offset[j]) {
- NL_SET_ERR_MSG(extack, "Detected overlap in the traffic class to queue mapping");
- return -EINVAL;
- }
- }
- }
-
- return 0;
+ /* For some reason, in txtime-assist mode, we allow TXQ ranges for
+ * different TCs to overlap, and just validate the TXQ ranges.
+ */
+ return mqprio_validate_qopt(dev, qopt, true, allow_overlapping_txqs,
+ extack);
}
static int taprio_get_start_time(struct Qdisc *sch,
@@ -1019,11 +1213,14 @@ static int taprio_get_start_time(struct Qdisc *sch,
return 0;
}
-static void setup_first_close_time(struct taprio_sched *q,
- struct sched_gate_list *sched, ktime_t base)
+static void setup_first_end_time(struct taprio_sched *q,
+ struct sched_gate_list *sched, ktime_t base)
{
+ struct net_device *dev = qdisc_dev(q->root);
+ int num_tc = netdev_get_num_tc(dev);
struct sched_entry *first;
ktime_t cycle;
+ int tc;
first = list_first_entry(&sched->entries,
struct sched_entry, list);
@@ -1031,10 +1228,18 @@ static void setup_first_close_time(struct taprio_sched *q,
cycle = sched->cycle_time;
/* FIXME: find a better place to do this */
- sched->cycle_close_time = ktime_add_ns(base, cycle);
+ sched->cycle_end_time = ktime_add_ns(base, cycle);
+
+ first->end_time = ktime_add_ns(base, first->interval);
+ taprio_set_budgets(q, sched, first);
+
+ for (tc = 0; tc < num_tc; tc++) {
+ if (first->gate_duration[tc] == sched->cycle_time)
+ first->gate_close_time[tc] = KTIME_MAX;
+ else
+ first->gate_close_time[tc] = ktime_add_ns(base, first->gate_duration[tc]);
+ }
- first->close_time = ktime_add_ns(base, first->interval);
- taprio_set_budget(q, first);
rcu_assign_pointer(q->current_entry, NULL);
}
@@ -1088,6 +1293,8 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct sched_gate_list *oper, *admin;
+ struct qdisc_size_table *stab;
struct taprio_sched *q;
ASSERT_RTNL();
@@ -1100,6 +1307,17 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
continue;
taprio_set_picos_per_byte(dev, q);
+
+ stab = rtnl_dereference(q->root->stab);
+
+ oper = rtnl_dereference(q->oper_sched);
+ if (oper)
+ taprio_update_queue_max_sdu(q, oper, stab);
+
+ admin = rtnl_dereference(q->admin_sched);
+ if (admin)
+ taprio_update_queue_max_sdu(q, admin, stab);
+
break;
}
@@ -1203,7 +1421,8 @@ static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask)
static void taprio_sched_to_offload(struct net_device *dev,
struct sched_gate_list *sched,
- struct tc_taprio_qopt_offload *offload)
+ struct tc_taprio_qopt_offload *offload,
+ const struct tc_taprio_caps *caps)
{
struct sched_entry *entry;
int i = 0;
@@ -1217,7 +1436,11 @@ static void taprio_sched_to_offload(struct net_device *dev,
e->command = entry->command;
e->interval = entry->interval;
- e->gate_mask = tc_map_to_queue_mask(dev, entry->gate_mask);
+ if (caps->gate_mask_per_txq)
+ e->gate_mask = tc_map_to_queue_mask(dev,
+ entry->gate_mask);
+ else
+ e->gate_mask = entry->gate_mask;
i++;
}
@@ -1225,6 +1448,34 @@ static void taprio_sched_to_offload(struct net_device *dev,
offload->num_entries = i;
}
+static void taprio_detect_broken_mqprio(struct taprio_sched *q)
+{
+ struct net_device *dev = qdisc_dev(q->root);
+ struct tc_taprio_caps caps;
+
+ qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO,
+ &caps, sizeof(caps));
+
+ q->broken_mqprio = caps.broken_mqprio;
+ if (q->broken_mqprio)
+ static_branch_inc(&taprio_have_broken_mqprio);
+ else
+ static_branch_inc(&taprio_have_working_mqprio);
+
+ q->detected_mqprio = true;
+}
+
+static void taprio_cleanup_broken_mqprio(struct taprio_sched *q)
+{
+ if (!q->detected_mqprio)
+ return;
+
+ if (q->broken_mqprio)
+ static_branch_dec(&taprio_have_broken_mqprio);
+ else
+ static_branch_dec(&taprio_have_working_mqprio);
+}
+
static int taprio_enable_offload(struct net_device *dev,
struct taprio_sched *q,
struct sched_gate_list *sched,
@@ -1261,7 +1512,8 @@ static int taprio_enable_offload(struct net_device *dev,
return -ENOMEM;
}
offload->enable = 1;
- taprio_sched_to_offload(dev, sched, offload);
+ mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt);
+ taprio_sched_to_offload(dev, sched, offload, &caps);
for (tc = 0; tc < TC_MAX_QUEUE; tc++)
offload->max_sdu[tc] = q->max_sdu[tc];
@@ -1452,7 +1704,6 @@ static int taprio_parse_tc_entries(struct Qdisc *sch,
struct netlink_ext_ack *extack)
{
struct taprio_sched *q = qdisc_priv(sch);
- struct net_device *dev = qdisc_dev(sch);
u32 max_sdu[TC_QOPT_MAX_QUEUE];
unsigned long seen_tcs = 0;
struct nlattr *n;
@@ -1466,18 +1717,14 @@ static int taprio_parse_tc_entries(struct Qdisc *sch,
if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY)
continue;
- err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs, extack);
+ err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs,
+ extack);
if (err)
goto out;
}
- for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+ for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
q->max_sdu[tc] = max_sdu[tc];
- if (max_sdu[tc])
- q->max_frm_len[tc] = max_sdu[tc] + dev->hard_header_len;
- else
- q->max_frm_len[tc] = U32_MAX; /* never oversized */
- }
out:
return err;
@@ -1533,6 +1780,7 @@ static int taprio_new_flags(const struct nlattr *attr, u32 old,
static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
+ struct qdisc_size_table *stab = rtnl_dereference(sch->stab);
struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { };
struct sched_gate_list *oper, *admin, *new_admin;
struct taprio_sched *q = qdisc_priv(sch);
@@ -1600,15 +1848,18 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
goto free_sched;
taprio_set_picos_per_byte(dev, q);
+ taprio_update_queue_max_sdu(q, new_admin, stab);
if (mqprio) {
err = netdev_set_num_tc(dev, mqprio->num_tc);
if (err)
goto free_sched;
- for (i = 0; i < mqprio->num_tc; i++)
+ for (i = 0; i < mqprio->num_tc; i++) {
netdev_set_tc_queue(dev, i,
mqprio->count[i],
mqprio->offset[i]);
+ q->cur_txq[i] = mqprio->offset[i];
+ }
/* Always use supplied priority mappings */
for (i = 0; i <= TC_BITMASK; i++)
@@ -1663,7 +1914,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
if (admin)
call_rcu(&admin->rcu, taprio_free_sched_cb);
} else {
- setup_first_close_time(q, new_admin, start);
+ setup_first_end_time(q, new_admin, start);
/* Protects against advance_sched() */
spin_lock_irqsave(&q->current_entry_lock, flags);
@@ -1683,6 +1934,10 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
new_admin = NULL;
err = 0;
+ if (!stab)
+ NL_SET_ERR_MSG_MOD(extack,
+ "Size table not specified, frame length estimations may be inaccurate");
+
unlock:
spin_unlock_bh(qdisc_lock(sch));
@@ -1743,6 +1998,8 @@ static void taprio_destroy(struct Qdisc *sch)
if (admin)
call_rcu(&admin->rcu, taprio_free_sched_cb);
+
+ taprio_cleanup_broken_mqprio(q);
}
static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
@@ -1807,6 +2064,8 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
q->qdiscs[i] = qdisc;
}
+ taprio_detect_broken_mqprio(q);
+
return taprio_change(sch, opt, extack);
}
@@ -1947,7 +2206,8 @@ error_nest:
return -1;
}
-static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb)
+static int taprio_dump_tc_entries(struct sk_buff *skb,
+ struct sched_gate_list *sched)
{
struct nlattr *n;
int tc;
@@ -1961,7 +2221,7 @@ static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb)
goto nla_put_failure;
if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU,
- q->max_sdu[tc]))
+ sched->max_sdu[tc]))
goto nla_put_failure;
nla_nest_end(skb, n);
@@ -1981,18 +2241,11 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
struct sched_gate_list *oper, *admin;
struct tc_mqprio_qopt opt = { 0 };
struct nlattr *nest, *sched_nest;
- unsigned int i;
oper = rtnl_dereference(q->oper_sched);
admin = rtnl_dereference(q->admin_sched);
- opt.num_tc = netdev_get_num_tc(dev);
- memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
-
- for (i = 0; i < netdev_get_num_tc(dev); i++) {
- opt.count[i] = dev->tc_to_txq[i].count;
- opt.offset[i] = dev->tc_to_txq[i].offset;
- }
+ mqprio_qopt_reconstruct(dev, &opt);
nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!nest)
@@ -2012,7 +2265,7 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
goto options_error;
- if (taprio_dump_tc_entries(q, skb))
+ if (oper && taprio_dump_tc_entries(skb, oper))
goto options_error;
if (oper && dump_schedule(skb, oper))
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 1c0fe9ba5358..b163266e581a 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -502,7 +502,7 @@ static int smcr_lgr_reg_sndbufs(struct smc_link *link,
return -EINVAL;
/* protect against parallel smcr_link_reg_buf() */
- mutex_lock(&lgr->llc_conf_mutex);
+ down_write(&lgr->llc_conf_mutex);
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
if (!smc_link_active(&lgr->lnk[i]))
continue;
@@ -510,7 +510,7 @@ static int smcr_lgr_reg_sndbufs(struct smc_link *link,
if (rc)
break;
}
- mutex_unlock(&lgr->llc_conf_mutex);
+ up_write(&lgr->llc_conf_mutex);
return rc;
}
@@ -519,15 +519,30 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link,
struct smc_buf_desc *rmb_desc)
{
struct smc_link_group *lgr = link->lgr;
+ bool do_slow = false;
int i, rc = 0;
rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
if (rc)
return rc;
+
+ down_read(&lgr->llc_conf_mutex);
+ for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+ if (!smc_link_active(&lgr->lnk[i]))
+ continue;
+ if (!rmb_desc->is_reg_mr[link->link_idx]) {
+ up_read(&lgr->llc_conf_mutex);
+ goto slow_path;
+ }
+ }
+ /* mr register already */
+ goto fast_path;
+slow_path:
+ do_slow = true;
/* protect against parallel smc_llc_cli_rkey_exchange() and
* parallel smcr_link_reg_buf()
*/
- mutex_lock(&lgr->llc_conf_mutex);
+ down_write(&lgr->llc_conf_mutex);
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
if (!smc_link_active(&lgr->lnk[i]))
continue;
@@ -535,7 +550,7 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link,
if (rc)
goto out;
}
-
+fast_path:
/* exchange confirm_rkey msg with peer */
rc = smc_llc_do_confirm_rkey(link, rmb_desc);
if (rc) {
@@ -544,7 +559,7 @@ static int smcr_lgr_reg_rmbs(struct smc_link *link,
}
rmb_desc->is_conf_rkey = true;
out:
- mutex_unlock(&lgr->llc_conf_mutex);
+ do_slow ? up_write(&lgr->llc_conf_mutex) : up_read(&lgr->llc_conf_mutex);
smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
return rc;
}
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 7642b16c41d1..b330a1fa453e 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -854,8 +854,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
lgr->freeing = 0;
lgr->vlan_id = ini->vlan_id;
refcount_set(&lgr->refcnt, 1); /* set lgr refcnt to 1 */
- mutex_init(&lgr->sndbufs_lock);
- mutex_init(&lgr->rmbs_lock);
+ init_rwsem(&lgr->sndbufs_lock);
+ init_rwsem(&lgr->rmbs_lock);
rwlock_init(&lgr->conns_lock);
for (i = 0; i < SMC_RMBE_SIZES; i++) {
INIT_LIST_HEAD(&lgr->sndbufs[i]);
@@ -1098,7 +1098,7 @@ err_out:
static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb,
struct smc_link_group *lgr)
{
- struct mutex *lock; /* lock buffer list */
+ struct rw_semaphore *lock; /* lock buffer list */
int rc;
if (is_rmb && buf_desc->is_conf_rkey && !list_empty(&lgr->list)) {
@@ -1106,10 +1106,10 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb,
rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
if (!rc) {
/* protect against smc_llc_cli_rkey_exchange() */
- mutex_lock(&lgr->llc_conf_mutex);
+ down_read(&lgr->llc_conf_mutex);
smc_llc_do_delete_rkey(lgr, buf_desc);
buf_desc->is_conf_rkey = false;
- mutex_unlock(&lgr->llc_conf_mutex);
+ up_read(&lgr->llc_conf_mutex);
smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
}
}
@@ -1118,9 +1118,9 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb,
/* buf registration failed, reuse not possible */
lock = is_rmb ? &lgr->rmbs_lock :
&lgr->sndbufs_lock;
- mutex_lock(lock);
+ down_write(lock);
list_del(&buf_desc->list);
- mutex_unlock(lock);
+ up_write(lock);
smc_buf_free(lgr, is_rmb, buf_desc);
} else {
@@ -1224,15 +1224,16 @@ static void smcr_buf_unmap_lgr(struct smc_link *lnk)
int i;
for (i = 0; i < SMC_RMBE_SIZES; i++) {
- mutex_lock(&lgr->rmbs_lock);
+ down_write(&lgr->rmbs_lock);
list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list)
smcr_buf_unmap_link(buf_desc, true, lnk);
- mutex_unlock(&lgr->rmbs_lock);
- mutex_lock(&lgr->sndbufs_lock);
+ up_write(&lgr->rmbs_lock);
+
+ down_write(&lgr->sndbufs_lock);
list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i],
list)
smcr_buf_unmap_link(buf_desc, false, lnk);
- mutex_unlock(&lgr->sndbufs_lock);
+ up_write(&lgr->sndbufs_lock);
}
}
@@ -1377,12 +1378,12 @@ static void smc_lgr_free(struct smc_link_group *lgr)
int i;
if (!lgr->is_smcd) {
- mutex_lock(&lgr->llc_conf_mutex);
+ down_write(&lgr->llc_conf_mutex);
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
if (lgr->lnk[i].state != SMC_LNK_UNUSED)
smcr_link_clear(&lgr->lnk[i], false);
}
- mutex_unlock(&lgr->llc_conf_mutex);
+ up_write(&lgr->llc_conf_mutex);
smc_llc_lgr_clear(lgr);
}
@@ -1696,12 +1697,12 @@ static void smcr_link_down(struct smc_link *lnk)
} else {
if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) {
/* another llc task is ongoing */
- mutex_unlock(&lgr->llc_conf_mutex);
+ up_write(&lgr->llc_conf_mutex);
wait_event_timeout(lgr->llc_flow_waiter,
(list_empty(&lgr->list) ||
lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE),
SMC_LLC_WAIT_TIME);
- mutex_lock(&lgr->llc_conf_mutex);
+ down_write(&lgr->llc_conf_mutex);
}
if (!list_empty(&lgr->list)) {
smc_llc_send_delete_link(to_lnk, del_link_id,
@@ -1761,9 +1762,9 @@ static void smc_link_down_work(struct work_struct *work)
if (list_empty(&lgr->list))
return;
wake_up_all(&lgr->llc_msg_waiter);
- mutex_lock(&lgr->llc_conf_mutex);
+ down_write(&lgr->llc_conf_mutex);
smcr_link_down(link);
- mutex_unlock(&lgr->llc_conf_mutex);
+ up_write(&lgr->llc_conf_mutex);
}
static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev,
@@ -1990,19 +1991,19 @@ int smc_uncompress_bufsize(u8 compressed)
* buffer size; if not available, return NULL
*/
static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
- struct mutex *lock,
+ struct rw_semaphore *lock,
struct list_head *buf_list)
{
struct smc_buf_desc *buf_slot;
- mutex_lock(lock);
+ down_read(lock);
list_for_each_entry(buf_slot, buf_list, list) {
if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
- mutex_unlock(lock);
+ up_read(lock);
return buf_slot;
}
}
- mutex_unlock(lock);
+ up_read(lock);
return NULL;
}
@@ -2111,13 +2112,13 @@ int smcr_link_reg_buf(struct smc_link *link, struct smc_buf_desc *buf_desc)
return 0;
}
-static int _smcr_buf_map_lgr(struct smc_link *lnk, struct mutex *lock,
+static int _smcr_buf_map_lgr(struct smc_link *lnk, struct rw_semaphore *lock,
struct list_head *lst, bool is_rmb)
{
struct smc_buf_desc *buf_desc, *bf;
int rc = 0;
- mutex_lock(lock);
+ down_write(lock);
list_for_each_entry_safe(buf_desc, bf, lst, list) {
if (!buf_desc->used)
continue;
@@ -2126,7 +2127,7 @@ static int _smcr_buf_map_lgr(struct smc_link *lnk, struct mutex *lock,
goto out;
}
out:
- mutex_unlock(lock);
+ up_write(lock);
return rc;
}
@@ -2159,37 +2160,37 @@ int smcr_buf_reg_lgr(struct smc_link *lnk)
int i, rc = 0;
/* reg all RMBs for a new link */
- mutex_lock(&lgr->rmbs_lock);
+ down_write(&lgr->rmbs_lock);
for (i = 0; i < SMC_RMBE_SIZES; i++) {
list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) {
if (!buf_desc->used)
continue;
rc = smcr_link_reg_buf(lnk, buf_desc);
if (rc) {
- mutex_unlock(&lgr->rmbs_lock);
+ up_write(&lgr->rmbs_lock);
return rc;
}
}
}
- mutex_unlock(&lgr->rmbs_lock);
+ up_write(&lgr->rmbs_lock);
if (lgr->buf_type == SMCR_PHYS_CONT_BUFS)
return rc;
/* reg all vzalloced sndbufs for a new link */
- mutex_lock(&lgr->sndbufs_lock);
+ down_write(&lgr->sndbufs_lock);
for (i = 0; i < SMC_RMBE_SIZES; i++) {
list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], list) {
if (!buf_desc->used || !buf_desc->is_vm)
continue;
rc = smcr_link_reg_buf(lnk, buf_desc);
if (rc) {
- mutex_unlock(&lgr->sndbufs_lock);
+ up_write(&lgr->sndbufs_lock);
return rc;
}
}
}
- mutex_unlock(&lgr->sndbufs_lock);
+ up_write(&lgr->sndbufs_lock);
return rc;
}
@@ -2247,7 +2248,7 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr,
int i, rc = 0, cnt = 0;
/* protect against parallel link reconfiguration */
- mutex_lock(&lgr->llc_conf_mutex);
+ down_read(&lgr->llc_conf_mutex);
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
struct smc_link *lnk = &lgr->lnk[i];
@@ -2260,7 +2261,7 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr,
cnt++;
}
out:
- mutex_unlock(&lgr->llc_conf_mutex);
+ up_read(&lgr->llc_conf_mutex);
if (!rc && !cnt)
rc = -EINVAL;
return rc;
@@ -2309,8 +2310,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
struct smc_link_group *lgr = conn->lgr;
struct list_head *buf_list;
int bufsize, bufsize_short;
+ struct rw_semaphore *lock; /* lock buffer list */
bool is_dgraded = false;
- struct mutex *lock; /* lock buffer list */
int sk_buf_size;
if (is_rmb)
@@ -2358,9 +2359,9 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb);
SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, bufsize);
buf_desc->used = 1;
- mutex_lock(lock);
+ down_write(lock);
list_add(&buf_desc->list, buf_list);
- mutex_unlock(lock);
+ up_write(lock);
break; /* found */
}
@@ -2434,9 +2435,9 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd)
/* create rmb */
rc = __smc_buf_create(smc, is_smcd, true);
if (rc) {
- mutex_lock(&smc->conn.lgr->sndbufs_lock);
+ down_write(&smc->conn.lgr->sndbufs_lock);
list_del(&smc->conn.sndbuf_desc->list);
- mutex_unlock(&smc->conn.lgr->sndbufs_lock);
+ up_write(&smc->conn.lgr->sndbufs_lock);
smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc);
smc->conn.sndbuf_desc = NULL;
}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 285f9bd8e232..08b457c2d294 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -252,9 +252,9 @@ struct smc_link_group {
unsigned short vlan_id; /* vlan id of link group */
struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */
- struct mutex sndbufs_lock; /* protects tx buffers */
+ struct rw_semaphore sndbufs_lock; /* protects tx buffers */
struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */
- struct mutex rmbs_lock; /* protects rx buffers */
+ struct rw_semaphore rmbs_lock; /* protects rx buffers */
u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */
struct delayed_work free_work; /* delayed freeing of an lgr */
@@ -298,7 +298,7 @@ struct smc_link_group {
/* queue for llc events */
spinlock_t llc_event_q_lock;
/* protects llc_event_q */
- struct mutex llc_conf_mutex;
+ struct rw_semaphore llc_conf_mutex;
/* protects lgr reconfig. */
struct work_struct llc_add_link_work;
struct work_struct llc_del_link_work;
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 524649d0ab65..a0840b8c935b 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -608,7 +608,7 @@ static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext,
prim_lnk_idx = link->link_idx;
lnk_idx = link_new->link_idx;
- mutex_lock(&lgr->rmbs_lock);
+ down_write(&lgr->rmbs_lock);
ext->num_rkeys = lgr->conns_num;
if (!ext->num_rkeys)
goto out;
@@ -628,7 +628,7 @@ static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext,
}
len += i * sizeof(ext->rt[0]);
out:
- mutex_unlock(&lgr->rmbs_lock);
+ up_write(&lgr->rmbs_lock);
return len;
}
@@ -889,7 +889,7 @@ static int smc_llc_cli_rkey_exchange(struct smc_link *link,
int rc = 0;
int i;
- mutex_lock(&lgr->rmbs_lock);
+ down_write(&lgr->rmbs_lock);
num_rkeys_send = lgr->conns_num;
buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst);
do {
@@ -916,7 +916,7 @@ static int smc_llc_cli_rkey_exchange(struct smc_link *link,
break;
} while (num_rkeys_send || num_rkeys_recv);
- mutex_unlock(&lgr->rmbs_lock);
+ up_write(&lgr->rmbs_lock);
return rc;
}
@@ -999,14 +999,14 @@ static void smc_llc_save_add_link_rkeys(struct smc_link *link,
ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)lgr->wr_rx_buf_v2 +
SMC_WR_TX_SIZE);
max = min_t(u8, ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2);
- mutex_lock(&lgr->rmbs_lock);
+ down_write(&lgr->rmbs_lock);
for (i = 0; i < max; i++) {
smc_rtoken_set(lgr, link->link_idx, link_new->link_idx,
ext->rt[i].rmb_key,
ext->rt[i].rmb_vaddr_new,
ext->rt[i].rmb_key_new);
}
- mutex_unlock(&lgr->rmbs_lock);
+ up_write(&lgr->rmbs_lock);
}
static void smc_llc_save_add_link_info(struct smc_link *link,
@@ -1202,12 +1202,12 @@ static void smc_llc_process_cli_add_link(struct smc_link_group *lgr)
qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
- mutex_lock(&lgr->llc_conf_mutex);
+ down_write(&lgr->llc_conf_mutex);
if (smc_llc_is_local_add_link(&qentry->msg))
smc_llc_cli_add_link_invite(qentry->link, qentry);
else
smc_llc_cli_add_link(qentry->link, qentry);
- mutex_unlock(&lgr->llc_conf_mutex);
+ up_write(&lgr->llc_conf_mutex);
}
static int smc_llc_active_link_count(struct smc_link_group *lgr)
@@ -1313,7 +1313,7 @@ static int smc_llc_srv_rkey_exchange(struct smc_link *link,
int rc = 0;
int i;
- mutex_lock(&lgr->rmbs_lock);
+ down_write(&lgr->rmbs_lock);
num_rkeys_send = lgr->conns_num;
buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst);
do {
@@ -1338,7 +1338,7 @@ static int smc_llc_srv_rkey_exchange(struct smc_link *link,
smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
} while (num_rkeys_send || num_rkeys_recv);
out:
- mutex_unlock(&lgr->rmbs_lock);
+ up_write(&lgr->rmbs_lock);
return rc;
}
@@ -1509,13 +1509,13 @@ static void smc_llc_process_srv_add_link(struct smc_link_group *lgr)
qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
- mutex_lock(&lgr->llc_conf_mutex);
+ down_write(&lgr->llc_conf_mutex);
rc = smc_llc_srv_add_link(link, qentry);
if (!rc && lgr->type == SMC_LGR_SYMMETRIC) {
/* delete any asymmetric link */
smc_llc_delete_asym_link(lgr);
}
- mutex_unlock(&lgr->llc_conf_mutex);
+ up_write(&lgr->llc_conf_mutex);
kfree(qentry);
}
@@ -1582,7 +1582,7 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr)
smc_lgr_terminate_sched(lgr);
goto out;
}
- mutex_lock(&lgr->llc_conf_mutex);
+ down_write(&lgr->llc_conf_mutex);
/* delete single link */
for (lnk_idx = 0; lnk_idx < SMC_LINKS_PER_LGR_MAX; lnk_idx++) {
if (lgr->lnk[lnk_idx].link_id != del_llc->link_num)
@@ -1616,7 +1616,7 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr)
smc_lgr_terminate_sched(lgr);
}
out_unlock:
- mutex_unlock(&lgr->llc_conf_mutex);
+ up_write(&lgr->llc_conf_mutex);
out:
kfree(qentry);
}
@@ -1652,7 +1652,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr)
int active_links;
int i;
- mutex_lock(&lgr->llc_conf_mutex);
+ down_write(&lgr->llc_conf_mutex);
qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
lnk = qentry->link;
del_llc = &qentry->msg.delete_link;
@@ -1708,7 +1708,7 @@ static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr)
smc_llc_add_link_local(lnk);
}
out:
- mutex_unlock(&lgr->llc_conf_mutex);
+ up_write(&lgr->llc_conf_mutex);
kfree(qentry);
}
@@ -2126,7 +2126,7 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc)
spin_lock_init(&lgr->llc_flow_lock);
init_waitqueue_head(&lgr->llc_flow_waiter);
init_waitqueue_head(&lgr->llc_msg_waiter);
- mutex_init(&lgr->llc_conf_mutex);
+ init_rwsem(&lgr->llc_conf_mutex);
lgr->llc_testlink_time = READ_ONCE(net->smc.sysctl_smcr_testlink_time);
}
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index dfea27a906f2..9b47c8409231 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -39,6 +39,7 @@
#include "node.h"
#include "net.h"
#include <net/genetlink.h>
+#include <linux/string_helpers.h>
#include <linux/tipc_config.h>
/* The legacy API had an artificial message length limit called
@@ -173,11 +174,6 @@ static struct sk_buff *tipc_get_err_tlv(char *str)
return buf;
}
-static inline bool string_is_valid(char *s, int len)
-{
- return memchr(s, '\0', len) ? true : false;
-}
-
static int __tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd,
struct tipc_nl_compat_msg *msg,
struct sk_buff *arg)
@@ -445,7 +441,7 @@ static int tipc_nl_compat_bearer_enable(struct tipc_nl_compat_cmd_doit *cmd,
return -EINVAL;
len = min_t(int, len, TIPC_MAX_BEARER_NAME);
- if (!string_is_valid(b->name, len))
+ if (!string_is_terminated(b->name, len))
return -EINVAL;
if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, b->name))
@@ -486,7 +482,7 @@ static int tipc_nl_compat_bearer_disable(struct tipc_nl_compat_cmd_doit *cmd,
return -EINVAL;
len = min_t(int, len, TIPC_MAX_BEARER_NAME);
- if (!string_is_valid(name, len))
+ if (!string_is_terminated(name, len))
return -EINVAL;
if (nla_put_string(skb, TIPC_NLA_BEARER_NAME, name))
@@ -584,7 +580,7 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg,
return -EINVAL;
len = min_t(int, len, TIPC_MAX_LINK_NAME);
- if (!string_is_valid(name, len))
+ if (!string_is_terminated(name, len))
return -EINVAL;
if (strcmp(name, nla_data(link[TIPC_NLA_LINK_NAME])) != 0)
@@ -819,7 +815,7 @@ static int tipc_nl_compat_link_set(struct tipc_nl_compat_cmd_doit *cmd,
return -EINVAL;
len = min_t(int, len, TIPC_MAX_LINK_NAME);
- if (!string_is_valid(lc->name, len))
+ if (!string_is_terminated(lc->name, len))
return -EINVAL;
media = tipc_media_find(lc->name);
@@ -856,7 +852,7 @@ static int tipc_nl_compat_link_reset_stats(struct tipc_nl_compat_cmd_doit *cmd,
return -EINVAL;
len = min_t(int, len, TIPC_MAX_LINK_NAME);
- if (!string_is_valid(name, len))
+ if (!string_is_terminated(name, len))
return -EINVAL;
if (nla_put_string(skb, TIPC_NLA_LINK_NAME, name))
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index ed6c71826d31..b2df1e0f8153 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -140,6 +140,10 @@ static void xp_disable_drv_zc(struct xsk_buff_pool *pool)
}
}
+#define NETDEV_XDP_ACT_ZC (NETDEV_XDP_ACT_BASIC | \
+ NETDEV_XDP_ACT_REDIRECT | \
+ NETDEV_XDP_ACT_XSK_ZEROCOPY)
+
int xp_assign_dev(struct xsk_buff_pool *pool,
struct net_device *netdev, u16 queue_id, u16 flags)
{
@@ -178,8 +182,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool,
/* For copy-mode, we are done. */
return 0;
- if (!netdev->netdev_ops->ndo_bpf ||
- !netdev->netdev_ops->ndo_xsk_wakeup) {
+ if ((netdev->xdp_features & NETDEV_XDP_ACT_ZC) != NETDEV_XDP_ACT_ZC) {
err = -EOPNOTSUPP;
goto err_unreg_pool;
}
diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c
index a0f62fa02e06..8cbf45a8bcdc 100644
--- a/net/xfrm/xfrm_compat.c
+++ b/net/xfrm/xfrm_compat.c
@@ -5,6 +5,7 @@
* Based on code and translator idea by: Florian Westphal <fw@strlen.de>
*/
#include <linux/compat.h>
+#include <linux/nospec.h>
#include <linux/xfrm.h>
#include <net/xfrm.h>
@@ -302,7 +303,7 @@ static int xfrm_xlate64(struct sk_buff *dst, const struct nlmsghdr *nlh_src)
nla_for_each_attr(nla, attrs, len, remaining) {
int err;
- switch (type) {
+ switch (nlh_src->nlmsg_type) {
case XFRM_MSG_NEWSPDINFO:
err = xfrm_nla_cpy(dst, nla, nla_len(nla));
break;
@@ -437,6 +438,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla,
NL_SET_ERR_MSG(extack, "Bad attribute");
return -EOPNOTSUPP;
}
+ type = array_index_nospec(type, XFRMA_MAX + 1);
if (nla_len(nla) < compat_policy[type].len) {
NL_SET_ERR_MSG(extack, "Attribute bad length");
return -EOPNOTSUPP;
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index c06e54a10540..436d29640ac2 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -279,8 +279,7 @@ static int xfrm6_remove_tunnel_encap(struct xfrm_state *x, struct sk_buff *skb)
goto out;
if (x->props.flags & XFRM_STATE_DECAP_DSCP)
- ipv6_copy_dscp(ipv6_get_dsfield(ipv6_hdr(skb)),
- ipipv6_hdr(skb));
+ ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipipv6_hdr(skb));
if (!(x->props.flags & XFRM_STATE_NOECN))
ipip6_ecn_decapsulate(skb);
diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c
index 1ef2162cebcf..d74f3fd20f2b 100644
--- a/net/xfrm/xfrm_interface_bpf.c
+++ b/net/xfrm/xfrm_interface_bpf.c
@@ -39,8 +39,7 @@ __diag_ignore_all("-Wmissing-prototypes",
* @to - Pointer to memory to which the metadata will be copied
* Cannot be NULL
*/
-__used noinline
-int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to)
+__bpf_kfunc int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to)
{
struct sk_buff *skb = (struct sk_buff *)skb_ctx;
struct xfrm_md_info *info;
@@ -62,9 +61,7 @@ int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to)
* @from - Pointer to memory from which the metadata will be copied
* Cannot be NULL
*/
-__used noinline
-int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
- const struct bpf_xfrm_info *from)
+__bpf_kfunc int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx, const struct bpf_xfrm_info *from)
{
struct sk_buff *skb = (struct sk_buff *)skb_ctx;
struct metadata_dst *md_dst;
diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c
index 1f99dc469027..35279c220bd7 100644
--- a/net/xfrm/xfrm_interface_core.c
+++ b/net/xfrm/xfrm_interface_core.c
@@ -310,6 +310,52 @@ static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet)
skb->mark = 0;
}
+static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type, unsigned short family)
+{
+ struct sec_path *sp;
+
+ sp = skb_sec_path(skb);
+ if (sp && (sp->len || sp->olen) &&
+ !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family))
+ goto discard;
+
+ XFRM_SPI_SKB_CB(skb)->family = family;
+ if (family == AF_INET) {
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
+ } else {
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
+ }
+
+ return xfrm_input(skb, nexthdr, spi, encap_type);
+discard:
+ kfree_skb(skb);
+ return 0;
+}
+
+static int xfrmi4_rcv(struct sk_buff *skb)
+{
+ return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET);
+}
+
+static int xfrmi6_rcv(struct sk_buff *skb)
+{
+ return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff],
+ 0, 0, AF_INET6);
+}
+
+static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
+{
+ return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET);
+}
+
+static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
+{
+ return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6);
+}
+
static int xfrmi_rcv_cb(struct sk_buff *skb, int err)
{
const struct xfrm_mode *inner_mode;
@@ -945,8 +991,8 @@ static struct pernet_operations xfrmi_net_ops = {
};
static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = {
- .handler = xfrm6_rcv,
- .input_handler = xfrm_input,
+ .handler = xfrmi6_rcv,
+ .input_handler = xfrmi6_input,
.cb_handler = xfrmi_rcv_cb,
.err_handler = xfrmi6_err,
.priority = 10,
@@ -996,8 +1042,8 @@ static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = {
#endif
static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = {
- .handler = xfrm4_rcv,
- .input_handler = xfrm_input,
+ .handler = xfrmi4_rcv,
+ .input_handler = xfrmi4_input,
.cb_handler = xfrmi_rcv_cb,
.err_handler = xfrmi4_err,
.priority = 10,
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index e9eb82c5457d..5c61ec04b839 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -336,7 +336,7 @@ static void xfrm_policy_timer(struct timer_list *t)
}
if (xp->lft.hard_use_expires_seconds) {
time64_t tmo = xp->lft.hard_use_expires_seconds +
- (xp->curlft.use_time ? : xp->curlft.add_time) - now;
+ (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now;
if (tmo <= 0)
goto expired;
if (tmo < next)
@@ -354,7 +354,7 @@ static void xfrm_policy_timer(struct timer_list *t)
}
if (xp->lft.soft_use_expires_seconds) {
time64_t tmo = xp->lft.soft_use_expires_seconds +
- (xp->curlft.use_time ? : xp->curlft.add_time) - now;
+ (READ_ONCE(xp->curlft.use_time) ? : xp->curlft.add_time) - now;
if (tmo <= 0) {
warn = 1;
tmo = XFRM_KM_TIMEOUT;
@@ -3661,7 +3661,8 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
return 1;
}
- pol->curlft.use_time = ktime_get_real_seconds();
+ /* This lockless write can happen from different cpus. */
+ WRITE_ONCE(pol->curlft.use_time, ktime_get_real_seconds());
pols[0] = pol;
npols++;
@@ -3676,7 +3677,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
xfrm_pol_put(pols[0]);
return 0;
}
- pols[1]->curlft.use_time = ktime_get_real_seconds();
+ /* This write can happen from different cpus. */
+ WRITE_ONCE(pols[1]->curlft.use_time,
+ ktime_get_real_seconds());
npols++;
}
}
@@ -3742,6 +3745,9 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
goto reject;
}
+ if (if_id)
+ secpath_reset(skb);
+
xfrm_pols_put(pols, npols);
return 1;
}
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 59fffa02d1cc..2ab3e09e2227 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -577,7 +577,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
if (x->km.state == XFRM_STATE_EXPIRED)
goto expired;
if (x->lft.hard_add_expires_seconds) {
- long tmo = x->lft.hard_add_expires_seconds +
+ time64_t tmo = x->lft.hard_add_expires_seconds +
x->curlft.add_time - now;
if (tmo <= 0) {
if (x->xflags & XFRM_SOFT_EXPIRE) {
@@ -594,8 +594,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
next = tmo;
}
if (x->lft.hard_use_expires_seconds) {
- long tmo = x->lft.hard_use_expires_seconds +
- (x->curlft.use_time ? : now) - now;
+ time64_t tmo = x->lft.hard_use_expires_seconds +
+ (READ_ONCE(x->curlft.use_time) ? : now) - now;
if (tmo <= 0)
goto expired;
if (tmo < next)
@@ -604,7 +604,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
if (x->km.dying)
goto resched;
if (x->lft.soft_add_expires_seconds) {
- long tmo = x->lft.soft_add_expires_seconds +
+ time64_t tmo = x->lft.soft_add_expires_seconds +
x->curlft.add_time - now;
if (tmo <= 0) {
warn = 1;
@@ -616,8 +616,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
}
}
if (x->lft.soft_use_expires_seconds) {
- long tmo = x->lft.soft_use_expires_seconds +
- (x->curlft.use_time ? : now) - now;
+ time64_t tmo = x->lft.soft_use_expires_seconds +
+ (READ_ONCE(x->curlft.use_time) ? : now) - now;
if (tmo <= 0)
warn = 1;
else if (tmo < next)
@@ -1906,7 +1906,7 @@ out:
hrtimer_start(&x1->mtimer, ktime_set(1, 0),
HRTIMER_MODE_REL_SOFT);
- if (x1->curlft.use_time)
+ if (READ_ONCE(x1->curlft.use_time))
xfrm_state_check_expire(x1);
if (x->props.smark.m || x->props.smark.v || x->if_id) {
@@ -1940,8 +1940,8 @@ int xfrm_state_check_expire(struct xfrm_state *x)
{
xfrm_dev_state_update_curlft(x);
- if (!x->curlft.use_time)
- x->curlft.use_time = ktime_get_real_seconds();
+ if (!READ_ONCE(x->curlft.use_time))
+ WRITE_ONCE(x->curlft.use_time, ktime_get_real_seconds());
if (x->curlft.bytes >= x->lft.hard_byte_limit ||
x->curlft.packets >= x->lft.hard_packet_limit) {