Diffstat (limited to 'net')
-rw-r--r--  net/9p/client.c | 134
-rw-r--r--  net/Kconfig | 3
-rw-r--r--  net/bluetooth/hci_core.c | 39
-rw-r--r--  net/bluetooth/hci_event.c | 25
-rw-r--r--  net/bluetooth/l2cap_core.c | 137
-rw-r--r--  net/bluetooth/l2cap_sock.c | 8
-rw-r--r--  net/bpf/test_run.c | 4
-rw-r--r--  net/bpfilter/.gitignore | 1
-rw-r--r--  net/bpfilter/main.c | 13
-rw-r--r--  net/ceph/debugfs.c | 20
-rw-r--r--  net/ceph/messenger.c | 9
-rw-r--r--  net/ceph/mon_client.c | 8
-rw-r--r--  net/ceph/osd_client.c | 96
-rw-r--r--  net/ceph/osdmap.c | 9
-rw-r--r--  net/compat.c | 30
-rw-r--r--  net/core/datagram.c | 14
-rw-r--r--  net/core/dev.c | 36
-rw-r--r--  net/core/dev_ioctl.c | 6
-rw-r--r--  net/core/devlink.c | 919
-rw-r--r--  net/core/filter.c | 141
-rw-r--r--  net/core/flow_offload.c | 6
-rw-r--r--  net/core/lwt_bpf.c | 2
-rw-r--r--  net/core/lwtunnel.c | 6
-rw-r--r--  net/core/neighbour.c | 10
-rw-r--r--  net/core/net_namespace.c | 15
-rw-r--r--  net/core/page_pool.c | 8
-rw-r--r--  net/core/pktgen.c | 2
-rw-r--r--  net/core/rtnetlink.c | 14
-rw-r--r--  net/core/skbuff.c | 1
-rw-r--r--  net/core/sock.c | 14
-rw-r--r--  net/core/sock_map.c | 12
-rw-r--r--  net/dns_resolver/dns_key.c | 2
-rw-r--r--  net/dsa/dsa.c | 6
-rw-r--r--  net/dsa/dsa2.c | 2
-rw-r--r--  net/dsa/dsa_priv.h | 15
-rw-r--r--  net/dsa/master.c | 21
-rw-r--r--  net/dsa/port.c | 13
-rw-r--r--  net/dsa/slave.c | 367
-rw-r--r--  net/dsa/switch.c | 37
-rw-r--r--  net/dsa/tag_8021q.c | 43
-rw-r--r--  net/dsa/tag_brcm.c | 2
-rw-r--r--  net/dsa/tag_sja1105.c | 19
-rw-r--r--  net/ethtool/Makefile | 2
-rw-r--r--  net/ethtool/coalesce.c | 353
-rw-r--r--  net/ethtool/common.c | 71
-rw-r--r--  net/ethtool/common.h | 6
-rw-r--r--  net/ethtool/debug.c | 4
-rw-r--r--  net/ethtool/eee.c | 206
-rw-r--r--  net/ethtool/ioctl.c | 41
-rw-r--r--  net/ethtool/linkinfo.c | 4
-rw-r--r--  net/ethtool/linkmodes.c | 4
-rw-r--r--  net/ethtool/netlink.c | 69
-rw-r--r--  net/ethtool/netlink.h | 7
-rw-r--r--  net/ethtool/pause.c | 145
-rw-r--r--  net/ethtool/privflags.c | 4
-rw-r--r--  net/ethtool/strset.c | 15
-rw-r--r--  net/ethtool/tsinfo.c | 143
-rw-r--r--  net/ethtool/wol.c | 4
-rw-r--r--  net/hsr/hsr_framereg.c | 9
-rw-r--r--  net/hsr/hsr_netlink.c | 70
-rw-r--r--  net/hsr/hsr_slave.c | 8
-rw-r--r--  net/ipv4/Kconfig | 7
-rw-r--r--  net/ipv4/af_inet.c | 4
-rw-r--r--  net/ipv4/bpf_tcp_ca.c | 40
-rw-r--r--  net/ipv4/esp4.c | 16
-rw-r--r--  net/ipv4/esp4_offload.c | 32
-rw-r--r--  net/ipv4/fib_frontend.c | 2
-rw-r--r--  net/ipv4/fib_lookup.h | 2
-rw-r--r--  net/ipv4/fib_semantics.c | 22
-rw-r--r--  net/ipv4/fib_trie.c | 5
-rw-r--r--  net/ipv4/ip_gre.c | 105
-rw-r--r--  net/ipv4/ip_input.c | 3
-rw-r--r--  net/ipv4/ip_output.c | 2
-rw-r--r--  net/ipv4/ip_tunnel.c | 6
-rw-r--r--  net/ipv4/ip_tunnel_core.c | 4
-rw-r--r--  net/ipv4/ip_vti.c | 38
-rw-r--r--  net/ipv4/ipconfig.c | 2
-rw-r--r--  net/ipv4/proc.c | 2
-rw-r--r--  net/ipv4/tcp.c | 4
-rw-r--r--  net/ipv4/tcp_bpf.c | 152
-rw-r--r--  net/ipv4/tcp_minisocks.c | 6
-rw-r--r--  net/ipv4/tcp_output.c | 12
-rw-r--r--  net/ipv4/udp.c | 8
-rw-r--r--  net/ipv4/udp_offload.c | 1
-rw-r--r--  net/ipv6/Kconfig | 10
-rw-r--r--  net/ipv6/Makefile | 3
-rw-r--r--  net/ipv6/addrconf.c | 74
-rw-r--r--  net/ipv6/af_inet6.c | 7
-rw-r--r--  net/ipv6/esp6.c | 16
-rw-r--r--  net/ipv6/esp6_offload.c | 36
-rw-r--r--  net/ipv6/exthdrs.c | 201
-rw-r--r--  net/ipv6/ila/ila_lwt.c | 2
-rw-r--r--  net/ipv6/ip6_input.c | 3
-rw-r--r--  net/ipv6/ip6_vti.c | 34
-rw-r--r--  net/ipv6/ndisc.c | 5
-rw-r--r--  net/ipv6/route.c | 2
-rw-r--r--  net/ipv6/rpl.c | 123
-rw-r--r--  net/ipv6/rpl_iptunnel.c | 382
-rw-r--r--  net/ipv6/seg6_iptunnel.c | 2
-rw-r--r--  net/ipv6/seg6_local.c | 5
-rw-r--r--  net/ipv6/udp.c | 9
-rw-r--r--  net/ipv6/xfrm6_tunnel.c | 2
-rw-r--r--  net/mac80211/debugfs_sta.c | 3
-rw-r--r--  net/mac80211/key.c | 20
-rw-r--r--  net/mac80211/sta_info.c | 5
-rw-r--r--  net/mac80211/sta_info.h | 1
-rw-r--r--  net/mac80211/tx.c | 38
-rw-r--r--  net/mpls/mpls_iptunnel.c | 2
-rw-r--r--  net/mptcp/Makefile | 3
-rw-r--r--  net/mptcp/crypto.c | 17
-rw-r--r--  net/mptcp/diag.c | 104
-rw-r--r--  net/mptcp/mib.c | 69
-rw-r--r--  net/mptcp/mib.h | 40
-rw-r--r--  net/mptcp/options.c | 517
-rw-r--r--  net/mptcp/pm.c | 244
-rw-r--r--  net/mptcp/pm_netlink.c | 859
-rw-r--r--  net/mptcp/protocol.c | 697
-rw-r--r--  net/mptcp/protocol.h | 189
-rw-r--r--  net/mptcp/subflow.c | 340
-rw-r--r--  net/mptcp/token.c | 36
-rw-r--r--  net/netfilter/Makefile | 2
-rw-r--r--  net/netfilter/ipvs/ip_vs_core.c | 45
-rw-r--r--  net/netfilter/nf_conntrack_core.c | 18
-rw-r--r--  net/netfilter/nf_conntrack_netlink.c | 3
-rw-r--r--  net/netfilter/nf_flow_table_core.c | 14
-rw-r--r--  net/netfilter/nf_flow_table_ip.c | 21
-rw-r--r--  net/netfilter/nf_flow_table_offload.c | 71
-rw-r--r--  net/netfilter/nf_queue.c | 96
-rw-r--r--  net/netfilter/nf_tables_api.c | 132
-rw-r--r--  net/netfilter/nf_tables_offload.c | 2
-rw-r--r--  net/netfilter/nfnetlink_queue.c | 10
-rw-r--r--  net/netfilter/nft_dynset.c | 26
-rw-r--r--  net/netfilter/nft_exthdr.c | 8
-rw-r--r--  net/netfilter/nft_fwd_netdev.c | 12
-rw-r--r--  net/netfilter/nft_set_bitmap.c | 3
-rw-r--r--  net/netfilter/nft_set_pipapo.c | 36
-rw-r--r--  net/netfilter/nft_set_pipapo_avx2.h | 4
-rw-r--r--  net/netfilter/nft_set_rbtree.c | 87
-rw-r--r--  net/netlink/af_netlink.c | 43
-rw-r--r--  net/openvswitch/datapath.c | 2
-rw-r--r--  net/openvswitch/flow_table.c | 10
-rw-r--r--  net/packet/af_packet.c | 21
-rw-r--r--  net/packet/internal.h | 5
-rw-r--r--  net/rxrpc/af_rxrpc.c | 37
-rw-r--r--  net/rxrpc/ar-internal.h | 5
-rw-r--r--  net/rxrpc/call_object.c | 3
-rw-r--r--  net/rxrpc/conn_client.c | 13
-rw-r--r--  net/rxrpc/input.c | 1
-rw-r--r--  net/rxrpc/key.c | 27
-rw-r--r--  net/rxrpc/sendmsg.c | 75
-rw-r--r--  net/sched/act_api.c | 27
-rw-r--r--  net/sched/act_bpf.c | 3
-rw-r--r--  net/sched/act_ct.c | 2
-rw-r--r--  net/sched/act_mirred.c | 6
-rw-r--r--  net/sched/act_pedit.c | 11
-rw-r--r--  net/sched/act_skbedit.c | 11
-rw-r--r--  net/sched/cls_api.c | 2
-rw-r--r--  net/sched/cls_flower.c | 64
-rw-r--r--  net/sched/cls_matchall.c | 4
-rw-r--r--  net/sched/cls_route.c | 4
-rw-r--r--  net/sched/cls_tcindex.c | 48
-rw-r--r--  net/sched/sch_cbs.c | 12
-rw-r--r--  net/sched/sch_red.c | 7
-rw-r--r--  net/sctp/ipv6.c | 20
-rw-r--r--  net/sctp/protocol.c | 28
-rw-r--r--  net/sctp/socket.c | 31
-rw-r--r--  net/socket.c | 33
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c | 96
-rw-r--r--  net/sunrpc/auth_gss/svcauth_gss.c | 60
-rw-r--r--  net/sunrpc/cache.c | 128
-rw-r--r--  net/sunrpc/clnt.c | 9
-rw-r--r--  net/sunrpc/sched.c | 22
-rw-r--r--  net/sunrpc/socklib.c | 141
-rw-r--r--  net/sunrpc/socklib.h | 15
-rw-r--r--  net/sunrpc/sunrpc.h | 4
-rw-r--r--  net/sunrpc/svc.c | 20
-rw-r--r--  net/sunrpc/svc_xprt.c | 22
-rw-r--r--  net/sunrpc/svcauth_unix.c | 12
-rw-r--r--  net/sunrpc/svcsock.c | 202
-rw-r--r--  net/sunrpc/xdr.c | 55
-rw-r--r--  net/sunrpc/xprt.c | 3
-rw-r--r--  net/sunrpc/xprtrdma/backchannel.c | 8
-rw-r--r--  net/sunrpc/xprtrdma/frwr_ops.c | 154
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c | 68
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_backchannel.c | 17
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 244
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_rw.c | 57
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_sendto.c | 512
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_transport.c | 8
-rw-r--r--  net/sunrpc/xprtrdma/transport.c | 72
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c | 683
-rw-r--r--  net/sunrpc/xprtrdma/xprt_rdma.h | 89
-rw-r--r--  net/sunrpc/xprtsock.c | 190
-rw-r--r--  net/tipc/msg.h | 5
-rw-r--r--  net/tipc/node.c | 3
-rw-r--r--  net/tipc/socket.c | 2
-rw-r--r--  net/wireless/.gitignore | 1
-rw-r--r--  net/wireless/nl80211.c | 2
-rw-r--r--  net/wireless/scan.c | 6
-rw-r--r--  net/xfrm/xfrm_device.c | 37
-rw-r--r--  net/xfrm/xfrm_output.c | 4
-rw-r--r--  net/xfrm/xfrm_policy.c | 2
-rw-r--r--  net/xfrm/xfrm_state.c | 2
-rw-r--r--  net/xfrm/xfrm_user.c | 6
204 files changed, 9356 insertions(+), 2759 deletions(-)
diff --git a/net/9p/client.c b/net/9p/client.c
index 1d48afc7033c..fc1f3635e5dd 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -1549,82 +1549,94 @@ EXPORT_SYMBOL(p9_client_unlinkat);
int
p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
{
- struct p9_client *clnt = fid->clnt;
- struct p9_req_t *req;
int total = 0;
*err = 0;
+ while (iov_iter_count(to)) {
+ int count;
+
+ count = p9_client_read_once(fid, offset, to, err);
+ if (!count || *err)
+ break;
+ offset += count;
+ total += count;
+ }
+ return total;
+}
+EXPORT_SYMBOL(p9_client_read);
+
+int
+p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to,
+ int *err)
+{
+ struct p9_client *clnt = fid->clnt;
+ struct p9_req_t *req;
+ int count = iov_iter_count(to);
+ int rsize, non_zc = 0;
+ char *dataptr;
+
+ *err = 0;
p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n",
fid->fid, (unsigned long long) offset, (int)iov_iter_count(to));
- while (iov_iter_count(to)) {
- int count = iov_iter_count(to);
- int rsize, non_zc = 0;
- char *dataptr;
+ rsize = fid->iounit;
+ if (!rsize || rsize > clnt->msize - P9_IOHDRSZ)
+ rsize = clnt->msize - P9_IOHDRSZ;
- rsize = fid->iounit;
- if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
- rsize = clnt->msize - P9_IOHDRSZ;
+ if (count < rsize)
+ rsize = count;
- if (count < rsize)
- rsize = count;
+ /* Don't bother zerocopy for small IO (< 1024) */
+ if (clnt->trans_mod->zc_request && rsize > 1024) {
+ /* response header len is 11
+ * PDU Header(7) + IO Size (4)
+ */
+ req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize,
+ 0, 11, "dqd", fid->fid,
+ offset, rsize);
+ } else {
+ non_zc = 1;
+ req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset,
+ rsize);
+ }
+ if (IS_ERR(req)) {
+ *err = PTR_ERR(req);
+ return 0;
+ }
- /* Don't bother zerocopy for small IO (< 1024) */
- if (clnt->trans_mod->zc_request && rsize > 1024) {
- /*
- * response header len is 11
- * PDU Header(7) + IO Size (4)
- */
- req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize,
- 0, 11, "dqd", fid->fid,
- offset, rsize);
- } else {
- non_zc = 1;
- req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset,
- rsize);
- }
- if (IS_ERR(req)) {
- *err = PTR_ERR(req);
- break;
- }
+ *err = p9pdu_readf(&req->rc, clnt->proto_version,
+ "D", &count, &dataptr);
+ if (*err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_tag_remove(clnt, req);
+ return 0;
+ }
+ if (rsize < count) {
+ pr_err("bogus RREAD count (%d > %d)\n", count, rsize);
+ count = rsize;
+ }
- *err = p9pdu_readf(&req->rc, clnt->proto_version,
- "D", &count, &dataptr);
- if (*err) {
- trace_9p_protocol_dump(clnt, &req->rc);
- p9_tag_remove(clnt, req);
- break;
- }
- if (rsize < count) {
- pr_err("bogus RREAD count (%d > %d)\n", count, rsize);
- count = rsize;
- }
+ p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count);
+ if (!count) {
+ p9_tag_remove(clnt, req);
+ return 0;
+ }
- p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count);
- if (!count) {
- p9_tag_remove(clnt, req);
- break;
- }
+ if (non_zc) {
+ int n = copy_to_iter(dataptr, count, to);
- if (non_zc) {
- int n = copy_to_iter(dataptr, count, to);
- total += n;
- offset += n;
- if (n != count) {
- *err = -EFAULT;
- p9_tag_remove(clnt, req);
- break;
- }
- } else {
- iov_iter_advance(to, count);
- total += count;
- offset += count;
+ if (n != count) {
+ *err = -EFAULT;
+ p9_tag_remove(clnt, req);
+ return n;
}
- p9_tag_remove(clnt, req);
+ } else {
+ iov_iter_advance(to, count);
}
- return total;
+ p9_tag_remove(clnt, req);
+ return count;
}
-EXPORT_SYMBOL(p9_client_read);
+EXPORT_SYMBOL(p9_client_read_once);
int
p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
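
Note on the 9p change above: p9_client_read() is now a thin loop over the new
p9_client_read_once() helper, which issues exactly one TREAD RPC and may return
a short count at an iounit/msize boundary. A minimal caller sketch
(hypothetical code, not part of this patch):

        int err = 0;
        int total;

        /* Drains 'to' by repeated single-RPC reads; stops on error,
         * a zero-byte reply (EOF), or when the iov_iter is empty.
         */
        total = p9_client_read(fid, offset, to, &err);
        if (err)
                return err;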
diff --git a/net/Kconfig b/net/Kconfig
index 2eeb0e55f7c9..df8d8c9bd021 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -52,6 +52,9 @@ config NET_INGRESS
config NET_EGRESS
bool
+config NET_REDIRECT
+ bool
+
config SKB_EXTENSIONS
bool
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index dbd2ad3a26ed..2e7bc2da8371 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -3305,6 +3305,15 @@ static void hci_prepare_suspend(struct work_struct *work)
hci_dev_unlock(hdev);
}
+static int hci_change_suspend_state(struct hci_dev *hdev,
+ enum suspended_state next)
+{
+ hdev->suspend_state_next = next;
+ set_bit(SUSPEND_PREPARE_NOTIFIER, hdev->suspend_tasks);
+ queue_work(hdev->req_workqueue, &hdev->suspend_prepare);
+ return hci_suspend_wait_event(hdev);
+}
+
static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action,
void *data)
{
@@ -3330,32 +3339,24 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action,
* connectable (disabling scanning)
* - Second, program event filter/whitelist and enable scan
*/
- hdev->suspend_state_next = BT_SUSPEND_DISCONNECT;
- set_bit(SUSPEND_PREPARE_NOTIFIER, hdev->suspend_tasks);
- queue_work(hdev->req_workqueue, &hdev->suspend_prepare);
- ret = hci_suspend_wait_event(hdev);
+ ret = hci_change_suspend_state(hdev, BT_SUSPEND_DISCONNECT);
- /* If the disconnect portion failed, don't attempt to complete
- * by configuring the whitelist. The suspend notifier will
- * follow a cancelled suspend with a PM_POST_SUSPEND
- * notification.
- */
- if (!ret) {
- hdev->suspend_state_next = BT_SUSPEND_COMPLETE;
- set_bit(SUSPEND_PREPARE_NOTIFIER, hdev->suspend_tasks);
- queue_work(hdev->req_workqueue, &hdev->suspend_prepare);
- ret = hci_suspend_wait_event(hdev);
- }
+ /* Only configure whitelist if disconnect succeeded */
+ if (!ret)
+ ret = hci_change_suspend_state(hdev,
+ BT_SUSPEND_COMPLETE);
} else if (action == PM_POST_SUSPEND) {
- hdev->suspend_state_next = BT_RUNNING;
- set_bit(SUSPEND_PREPARE_NOTIFIER, hdev->suspend_tasks);
- queue_work(hdev->req_workqueue, &hdev->suspend_prepare);
- ret = hci_suspend_wait_event(hdev);
+ ret = hci_change_suspend_state(hdev, BT_RUNNING);
}
+ /* If suspend failed, restore it to running */
+ if (ret && action == PM_SUSPEND_PREPARE)
+ hci_change_suspend_state(hdev, BT_RUNNING);
+
done:
return ret ? notifier_from_errno(-EBUSY) : NOTIFY_STOP;
}
+
/* Alloc HCI device */
struct hci_dev *hci_alloc_dev(void)
{
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 20408d386268..0a591be8b0ae 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2539,16 +2539,17 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
bt_dev_err(hdev, "no memory for new conn");
goto unlock;
}
- }
-
- if (ev->link_type != SCO_LINK)
- goto unlock;
+ } else {
+ if (ev->link_type != SCO_LINK)
+ goto unlock;
- conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK, &ev->bdaddr);
- if (!conn)
- goto unlock;
+ conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK,
+ &ev->bdaddr);
+ if (!conn)
+ goto unlock;
- conn->type = SCO_LINK;
+ conn->type = SCO_LINK;
+ }
}
if (!ev->status) {
@@ -2962,14 +2963,14 @@ static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status,
if (!conn)
goto unlock;
- /* If we fail to read the encryption key size, assume maximum
- * (which is the same we do also when this HCI command isn't
- * supported.
+ /* While unexpected, the read_enc_key_size command may fail. The most
+ * secure approach is to then assume the key size is 0 to force a
+ * disconnection.
*/
if (rp->status) {
bt_dev_err(hdev, "failed to read key size for handle %u",
handle);
- conn->enc_key_size = HCI_LINK_KEY_SIZE;
+ conn->enc_key_size = 0;
} else {
conn->enc_key_size = rp->key_size;
}
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 8b0fca39989d..fd9d0d08f9c9 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -678,6 +678,29 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err)
}
EXPORT_SYMBOL_GPL(l2cap_chan_del);
+static void __l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func,
+ void *data)
+{
+ struct l2cap_chan *chan;
+
+ list_for_each_entry(chan, &conn->chan_l, list) {
+ func(chan, data);
+ }
+}
+
+void l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func,
+ void *data)
+{
+ if (!conn)
+ return;
+
+ mutex_lock(&conn->chan_lock);
+ __l2cap_chan_list(conn, func, data);
+ mutex_unlock(&conn->chan_lock);
+}
+
+EXPORT_SYMBOL_GPL(l2cap_chan_list);
+
static void l2cap_conn_update_id_addr(struct work_struct *work)
{
struct l2cap_conn *conn = container_of(work, struct l2cap_conn,
@@ -1356,29 +1379,79 @@ static void l2cap_le_connect(struct l2cap_chan *chan)
sizeof(req), &req);
}
-static void l2cap_ecred_connect(struct l2cap_chan *chan)
-{
- struct l2cap_conn *conn = chan->conn;
+struct l2cap_ecred_conn_data {
struct {
struct l2cap_ecred_conn_req req;
- __le16 scid;
+ __le16 scid[5];
} __packed pdu;
+ struct l2cap_chan *chan;
+ struct pid *pid;
+ int count;
+};
+
+static void l2cap_ecred_defer_connect(struct l2cap_chan *chan, void *data)
+{
+ struct l2cap_ecred_conn_data *conn = data;
+ struct pid *pid;
+
+ if (chan == conn->chan)
+ return;
+
+ if (!test_and_clear_bit(FLAG_DEFER_SETUP, &chan->flags))
+ return;
+
+ pid = chan->ops->get_peer_pid(chan);
+
+ /* Only add deferred channels with the same PID/PSM */
+ if (conn->pid != pid || chan->psm != conn->chan->psm || chan->ident ||
+ chan->mode != L2CAP_MODE_EXT_FLOWCTL || chan->state != BT_CONNECT)
+ return;
if (test_and_set_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags))
return;
l2cap_ecred_init(chan, 0);
- pdu.req.psm = chan->psm;
- pdu.req.mtu = cpu_to_le16(chan->imtu);
- pdu.req.mps = cpu_to_le16(chan->mps);
- pdu.req.credits = cpu_to_le16(chan->rx_credits);
- pdu.scid = cpu_to_le16(chan->scid);
+ /* Set the same ident so we can match on the rsp */
+ chan->ident = conn->chan->ident;
+
+ /* Include all deferred channels */
+ conn->pdu.scid[conn->count] = cpu_to_le16(chan->scid);
+
+ conn->count++;
+}
+
+static void l2cap_ecred_connect(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct l2cap_ecred_conn_data data;
+
+ if (test_bit(FLAG_DEFER_SETUP, &chan->flags))
+ return;
+
+ if (test_and_set_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags))
+ return;
+
+ l2cap_ecred_init(chan, 0);
+
+ data.pdu.req.psm = chan->psm;
+ data.pdu.req.mtu = cpu_to_le16(chan->imtu);
+ data.pdu.req.mps = cpu_to_le16(chan->mps);
+ data.pdu.req.credits = cpu_to_le16(chan->rx_credits);
+ data.pdu.scid[0] = cpu_to_le16(chan->scid);
chan->ident = l2cap_get_ident(conn);
+ data.pid = chan->ops->get_peer_pid(chan);
+
+ data.count = 1;
+ data.chan = chan;
+
+ __l2cap_chan_list(conn, l2cap_ecred_defer_connect, &data);
l2cap_send_cmd(conn, chan->ident, L2CAP_ECRED_CONN_REQ,
- sizeof(pdu), &pdu);
+ sizeof(data.pdu.req) + data.count * sizeof(__le16),
+ &data.pdu);
}
static void l2cap_le_start(struct l2cap_chan *chan)
@@ -7693,6 +7766,33 @@ static bool is_valid_psm(u16 psm, u8 dst_type) {
return ((psm & 0x0101) == 0x0001);
}
+struct l2cap_chan_data {
+ struct l2cap_chan *chan;
+ struct pid *pid;
+ int count;
+};
+
+static void l2cap_chan_by_pid(struct l2cap_chan *chan, void *data)
+{
+ struct l2cap_chan_data *d = data;
+ struct pid *pid;
+
+ if (chan == d->chan)
+ return;
+
+ if (!test_bit(FLAG_DEFER_SETUP, &chan->flags))
+ return;
+
+ pid = chan->ops->get_peer_pid(chan);
+
+ /* Only count deferred channels with the same PID/PSM */
+ if (d->pid != pid || chan->psm != d->chan->psm || chan->ident ||
+ chan->mode != L2CAP_MODE_EXT_FLOWCTL || chan->state != BT_CONNECT)
+ return;
+
+ d->count++;
+}
+
int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
bdaddr_t *dst, u8 dst_type)
{
@@ -7812,6 +7912,23 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
goto done;
}
+ if (chan->mode == L2CAP_MODE_EXT_FLOWCTL) {
+ struct l2cap_chan_data data;
+
+ data.chan = chan;
+ data.pid = chan->ops->get_peer_pid(chan);
+ data.count = 1;
+
+ l2cap_chan_list(conn, l2cap_chan_by_pid, &data);
+
+ /* Check that there aren't too many channels being connected */
+ if (data.count > L2CAP_ECRED_CONN_SCID_MAX) {
+ hci_conn_drop(hcon);
+ err = -EPROTO;
+ goto done;
+ }
+ }
+
mutex_lock(&conn->chan_lock);
l2cap_chan_lock(chan);
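
The new l2cap_chan_list() iterator above is what lets l2cap_ecred_connect()
fold up to L2CAP_ECRED_CONN_SCID_MAX deferred channels (the scid[5] array)
into a single L2CAP_ECRED_CONN_REQ. A minimal sketch of the callback pattern
it enables (hypothetical counter, modeled on l2cap_chan_by_pid above):

        /* Runs with conn->chan_lock held; 'data' carries caller state. */
        static void example_count_chan(struct l2cap_chan *chan, void *data)
        {
                int *count = data;

                (*count)++;
        }

        int count = 0;

        l2cap_chan_list(conn, example_count_chan, &count);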
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 40fb10b591bd..117ba20ea194 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1504,6 +1504,13 @@ static long l2cap_sock_get_sndtimeo_cb(struct l2cap_chan *chan)
return sk->sk_sndtimeo;
}
+static struct pid *l2cap_sock_get_peer_pid_cb(struct l2cap_chan *chan)
+{
+ struct sock *sk = chan->data;
+
+ return sk->sk_peer_pid;
+}
+
static void l2cap_sock_suspend_cb(struct l2cap_chan *chan)
{
struct sock *sk = chan->data;
@@ -1525,6 +1532,7 @@ static const struct l2cap_ops l2cap_chan_ops = {
.suspend = l2cap_sock_suspend_cb,
.set_shutdown = l2cap_sock_set_shutdown_cb,
.get_sndtimeo = l2cap_sock_get_sndtimeo_cb,
+ .get_peer_pid = l2cap_sock_get_peer_pid_cb,
.alloc_skb = l2cap_sock_alloc_skb_cb,
};
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 4c921f5154e0..29dbdd4c29f6 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -114,6 +114,9 @@ out:
* architecture dependent calling conventions. 7+ can be supported in the
* future.
*/
+__diag_push();
+__diag_ignore(GCC, 8, "-Wmissing-prototypes",
+ "Global functions as their definitions will be in vmlinux BTF");
int noinline bpf_fentry_test1(int a)
{
return a + 1;
@@ -149,6 +152,7 @@ int noinline bpf_modify_return_test(int a, int *b)
*b += 1;
return a + *b;
}
+__diag_pop();
ALLOW_ERROR_INJECTION(bpf_modify_return_test, ERRNO);
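
The __diag_push()/__diag_ignore()/__diag_pop() markers above scope a
compiler-diagnostic override to just these definitions: the BPF test functions
are intentionally global without prototypes (their descriptions live in
vmlinux BTF), so -Wmissing-prototypes is silenced for GCC >= 8 only around
them. The general pattern, sketched with a hypothetical function:

        __diag_push();
        __diag_ignore(GCC, 8, "-Wmissing-prototypes",
                      "prototype intentionally omitted; described in BTF");
        int noinline example_test_func(int a)
        {
                return a + 1;
        }
        __diag_pop();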
diff --git a/net/bpfilter/.gitignore b/net/bpfilter/.gitignore
index e97084e3eea2..f34e85ee8204 100644
--- a/net/bpfilter/.gitignore
+++ b/net/bpfilter/.gitignore
@@ -1 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
bpfilter_umh
diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
index 77396a098fbe..05e1cfc1e5cd 100644
--- a/net/bpfilter/main.c
+++ b/net/bpfilter/main.c
@@ -10,7 +10,7 @@
#include <asm/unistd.h>
#include "msgfmt.h"
-int debug_fd;
+FILE *debug_f;
static int handle_get_cmd(struct mbox_request *cmd)
{
@@ -37,7 +37,7 @@ static void loop(void)
n = read(0, &req, sizeof(req));
if (n != sizeof(req)) {
- dprintf(debug_fd, "invalid request %d\n", n);
+ fprintf(debug_f, "invalid request %d\n", n);
return;
}
@@ -47,7 +47,7 @@ static void loop(void)
n = write(1, &reply, sizeof(reply));
if (n != sizeof(reply)) {
- dprintf(debug_fd, "reply failed %d\n", n);
+ fprintf(debug_f, "reply failed %d\n", n);
return;
}
}
@@ -55,9 +55,10 @@ static void loop(void)
int main(void)
{
- debug_fd = open("/dev/kmsg", 00000002);
- dprintf(debug_fd, "Started bpfilter\n");
+ debug_f = fopen("/dev/kmsg", "w");
+ setvbuf(debug_f, 0, _IOLBF, 0);
+ fprintf(debug_f, "Started bpfilter\n");
loop();
- close(debug_fd);
+ fclose(debug_f);
return 0;
}
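
Replacing the raw fd + dprintf() pair with a line-buffered FILE * means each
log line reaches /dev/kmsg as one write(), and drops the magic 00000002
(octal O_RDWR) open flag. The same pattern as a self-contained userspace
sketch (assuming /dev/kmsg is writable):

        #include <stdio.h>

        int main(void)
        {
                FILE *f = fopen("/dev/kmsg", "w");

                if (!f)
                        return 1;
                setvbuf(f, NULL, _IOLBF, 0);    /* flush on each newline */
                fprintf(f, "one kmsg record per line\n");
                fclose(f);
                return 0;
        }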
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 7cb992e55475..1344f232ecc5 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -383,11 +383,11 @@ static int client_options_show(struct seq_file *s, void *p)
return 0;
}
-CEPH_DEFINE_SHOW_FUNC(monmap_show)
-CEPH_DEFINE_SHOW_FUNC(osdmap_show)
-CEPH_DEFINE_SHOW_FUNC(monc_show)
-CEPH_DEFINE_SHOW_FUNC(osdc_show)
-CEPH_DEFINE_SHOW_FUNC(client_options_show)
+DEFINE_SHOW_ATTRIBUTE(monmap);
+DEFINE_SHOW_ATTRIBUTE(osdmap);
+DEFINE_SHOW_ATTRIBUTE(monc);
+DEFINE_SHOW_ATTRIBUTE(osdc);
+DEFINE_SHOW_ATTRIBUTE(client_options);
void __init ceph_debugfs_init(void)
{
@@ -414,31 +414,31 @@ void ceph_debugfs_client_init(struct ceph_client *client)
0400,
client->debugfs_dir,
client,
- &monc_show_fops);
+ &monc_fops);
client->osdc.debugfs_file = debugfs_create_file("osdc",
0400,
client->debugfs_dir,
client,
- &osdc_show_fops);
+ &osdc_fops);
client->debugfs_monmap = debugfs_create_file("monmap",
0400,
client->debugfs_dir,
client,
- &monmap_show_fops);
+ &monmap_fops);
client->debugfs_osdmap = debugfs_create_file("osdmap",
0400,
client->debugfs_dir,
client,
- &osdmap_show_fops);
+ &osdmap_fops);
client->debugfs_options = debugfs_create_file("client_options",
0400,
client->debugfs_dir,
client,
- &client_options_show_fops);
+ &client_options_fops);
}
void ceph_debugfs_client_cleanup(struct ceph_client *client)
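
Ceph's private CEPH_DEFINE_SHOW_FUNC is dropped in favor of the generic
DEFINE_SHOW_ATTRIBUTE() from <linux/seq_file.h>, which is why every
*_show_fops above becomes *_fops. Given a foo_show() seq_file routine, the
macro generates foo_open() (via single_open()) and a foo_fops
file_operations. A sketch with a hypothetical debugfs file:

        #include <linux/debugfs.h>
        #include <linux/seq_file.h>

        static int example_show(struct seq_file *s, void *p)
        {
                seq_puts(s, "hello\n");
                return 0;
        }
        DEFINE_SHOW_ATTRIBUTE(example); /* emits example_open + example_fops */

        /* debugfs_create_file("example", 0400, parent, NULL, &example_fops); */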
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 5b4bd8261002..f8ca5edc5f2c 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -3248,12 +3248,16 @@ static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg)
static void ceph_msg_data_destroy(struct ceph_msg_data *data)
{
- if (data->type == CEPH_MSG_DATA_PAGELIST)
+ if (data->type == CEPH_MSG_DATA_PAGES && data->own_pages) {
+ int num_pages = calc_pages_for(data->alignment, data->length);
+ ceph_release_page_vector(data->pages, num_pages);
+ } else if (data->type == CEPH_MSG_DATA_PAGELIST) {
ceph_pagelist_release(data->pagelist);
+ }
}
void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
- size_t length, size_t alignment)
+ size_t length, size_t alignment, bool own_pages)
{
struct ceph_msg_data *data;
@@ -3265,6 +3269,7 @@ void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
data->pages = pages;
data->length = length;
data->alignment = alignment & ~PAGE_MASK;
+ data->own_pages = own_pages;
msg->data_length += length;
}
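
The new own_pages flag transfers ownership of a page vector to the message:
when set, ceph_msg_data_destroy() releases the pages itself (first hunk
above), so callers no longer free them by hand. A sketch of the hand-off,
mirroring alloc_msg_with_page_vector() in the osd_client.c diff below:

        /* Messenger frees the vector on destroy because own_pages = true. */
        pages = ceph_alloc_page_vector(calc_pages_for(0, data_len), GFP_NOIO);
        if (IS_ERR(pages))
                return PTR_ERR(pages);
        ceph_msg_data_add_pages(msg, pages, data_len, 0, true);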
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 9d9e4e4ea600..3d8c8015e976 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -467,7 +467,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
struct ceph_msg *msg)
{
struct ceph_client *client = monc->client;
- struct ceph_monmap *monmap = NULL, *old = monc->monmap;
+ struct ceph_monmap *monmap;
void *p, *end;
mutex_lock(&monc->mutex);
@@ -484,13 +484,13 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
goto out;
}
- if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
+ if (ceph_check_fsid(client, &monmap->fsid) < 0) {
kfree(monmap);
goto out;
}
- client->monc.monmap = monmap;
- kfree(old);
+ kfree(monc->monmap);
+ monc->monmap = monmap;
__ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap->epoch);
client->have_fsid = true;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index b68b376d8c2f..998e26b75a78 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -962,7 +962,7 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
BUG_ON(length > (u64) SIZE_MAX);
if (length)
ceph_msg_data_add_pages(msg, osd_data->pages,
- length, osd_data->alignment);
+ length, osd_data->alignment, false);
} else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
BUG_ON(!length);
ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
@@ -3483,9 +3483,6 @@ static int ceph_redirect_decode(void **p, void *end,
goto e_inval;
}
- len = ceph_decode_32(p);
- *p += len; /* skip osd_instructions */
-
/* skip the rest */
*p = struct_end;
out:
@@ -4436,9 +4433,7 @@ static void handle_watch_notify(struct ceph_osd_client *osdc,
CEPH_MSG_DATA_PAGES);
*lreq->preply_pages = data->pages;
*lreq->preply_len = data->length;
- } else {
- ceph_release_page_vector(data->pages,
- calc_pages_for(0, data->length));
+ data->own_pages = false;
}
}
lreq->notify_finish_error = return_code;
@@ -5230,85 +5225,6 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
ceph_msgpool_destroy(&osdc->msgpool_op_reply);
}
-/*
- * Read some contiguous pages. If we cross a stripe boundary, shorten
- * *plen. Return number of bytes read, or error.
- */
-int ceph_osdc_readpages(struct ceph_osd_client *osdc,
- struct ceph_vino vino, struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- u32 truncate_seq, u64 truncate_size,
- struct page **pages, int num_pages, int page_align)
-{
- struct ceph_osd_request *req;
- int rc = 0;
-
- dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
- vino.snap, off, *plen);
- req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
- CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
- NULL, truncate_seq, truncate_size,
- false);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- /* it may be a short read due to an object boundary */
- osd_req_op_extent_osd_data_pages(req, 0,
- pages, *plen, page_align, false, false);
-
- dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
- off, *plen, *plen, page_align);
-
- rc = ceph_osdc_start_request(osdc, req, false);
- if (!rc)
- rc = ceph_osdc_wait_request(osdc, req);
-
- ceph_osdc_put_request(req);
- dout("readpages result %d\n", rc);
- return rc;
-}
-EXPORT_SYMBOL(ceph_osdc_readpages);
-
-/*
- * do a synchronous write on N pages
- */
-int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
- struct ceph_file_layout *layout,
- struct ceph_snap_context *snapc,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- struct timespec64 *mtime,
- struct page **pages, int num_pages)
-{
- struct ceph_osd_request *req;
- int rc = 0;
- int page_align = off & ~PAGE_MASK;
-
- req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
- CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
- snapc, truncate_seq, truncate_size,
- true);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- /* it may be a short write due to an object boundary */
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
- false, false);
- dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
-
- req->r_mtime = *mtime;
- rc = ceph_osdc_start_request(osdc, req, true);
- if (!rc)
- rc = ceph_osdc_wait_request(osdc, req);
-
- ceph_osdc_put_request(req);
- if (rc == 0)
- rc = len;
- dout("writepages result %d\n", rc);
- return rc;
-}
-EXPORT_SYMBOL(ceph_osdc_writepages);
-
static int osd_req_op_copy_from_init(struct ceph_osd_request *req,
u64 src_snapid, u64 src_version,
struct ceph_object_id *src_oid,
@@ -5506,9 +5422,6 @@ out_unlock_osdc:
return m;
}
-/*
- * TODO: switch to a msg-owned pagelist
- */
static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
{
struct ceph_msg *m;
@@ -5522,7 +5435,6 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
if (data_len) {
struct page **pages;
- struct ceph_osd_data osd_data;
pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
GFP_NOIO);
@@ -5531,9 +5443,7 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
return NULL;
}
- ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
- false);
- ceph_osdc_msg_data_add(m, &osd_data);
+ ceph_msg_data_add_pages(m, pages, data_len, 0, true);
}
return m;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 4e0de14f80bb..2a6e63a8edbe 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -710,6 +710,15 @@ int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
}
EXPORT_SYMBOL(ceph_pg_poolid_by_name);
+u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id)
+{
+ struct ceph_pg_pool_info *pi;
+
+ pi = __lookup_pg_pool(&map->pg_pools, id);
+ return pi ? pi->flags : 0;
+}
+EXPORT_SYMBOL(ceph_pg_pool_flags);
+
static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
{
rb_erase(&pi->node, root);
diff --git a/net/compat.c b/net/compat.c
index 47d99c784947..4bed96e84d9a 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -33,10 +33,10 @@
#include <linux/uaccess.h>
#include <net/compat.h>
-int get_compat_msghdr(struct msghdr *kmsg,
- struct compat_msghdr __user *umsg,
- struct sockaddr __user **save_addr,
- struct iovec **iov)
+int __get_compat_msghdr(struct msghdr *kmsg,
+ struct compat_msghdr __user *umsg,
+ struct sockaddr __user **save_addr,
+ compat_uptr_t *ptr, compat_size_t *len)
{
struct compat_msghdr msg;
ssize_t err;
@@ -79,10 +79,26 @@ int get_compat_msghdr(struct msghdr *kmsg,
return -EMSGSIZE;
kmsg->msg_iocb = NULL;
+ *ptr = msg.msg_iov;
+ *len = msg.msg_iovlen;
+ return 0;
+}
+
+int get_compat_msghdr(struct msghdr *kmsg,
+ struct compat_msghdr __user *umsg,
+ struct sockaddr __user **save_addr,
+ struct iovec **iov)
+{
+ compat_uptr_t ptr;
+ compat_size_t len;
+ ssize_t err;
+
+ err = __get_compat_msghdr(kmsg, umsg, save_addr, &ptr, &len);
+ if (err)
+ return err;
- err = compat_import_iovec(save_addr ? READ : WRITE,
- compat_ptr(msg.msg_iov), msg.msg_iovlen,
- UIO_FASTIOV, iov, &kmsg->msg_iter);
+ err = compat_import_iovec(save_addr ? READ : WRITE, compat_ptr(ptr),
+ len, UIO_FASTIOV, iov, &kmsg->msg_iter);
return err < 0 ? err : 0;
}
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 4213081c6ed3..639745d4f3b9 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -51,6 +51,7 @@
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/uio.h>
+#include <linux/indirect_call_wrapper.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -403,6 +404,11 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
}
EXPORT_SYMBOL(skb_kill_datagram);
+INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr,
+ size_t bytes,
+ void *data __always_unused,
+ struct iov_iter *i));
+
static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
struct iov_iter *to, int len, bool fault_short,
size_t (*cb)(const void *, size_t, void *,
@@ -416,7 +422,8 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
if (copy > 0) {
if (copy > len)
copy = len;
- n = cb(skb->data + offset, copy, data, to);
+ n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
+ skb->data + offset, copy, data, to);
offset += n;
if (n != copy)
goto short_copy;
@@ -438,8 +445,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
- n = cb(vaddr + skb_frag_off(frag) + offset - start,
- copy, data, to);
+ n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
+ vaddr + skb_frag_off(frag) + offset - start,
+ copy, data, to);
kunmap(page);
offset += n;
if (n != copy)
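
INDIRECT_CALL_1() (from <linux/indirect_call_wrapper.h>) compares the
function pointer against one likely target and, on a match, makes a direct
call, avoiding the retpoline cost of an indirect call; the
INDIRECT_CALLABLE_DECLARE() above only forward-declares the static
simple_copy_to_iter so it can be named as that target. Conceptually the
macro behaves like:

        /* INDIRECT_CALL_1(f, f1, args...) is roughly: */
        likely(f == f1) ? f1(args) : f(args)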
diff --git a/net/core/dev.c b/net/core/dev.c
index 021e18251465..9c9e763bfe0e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3266,7 +3266,7 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
* It may return NULL if the skb requires no segmentation. This is
* only possible when GSO is used for verifying header integrity.
*
- * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
+ * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
*/
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
netdev_features_t features, bool tx_path)
@@ -3295,7 +3295,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
features &= ~NETIF_F_GSO_PARTIAL;
}
- BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
+ BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
@@ -4516,7 +4516,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
/* Reinjected packets coming from act_mirred or similar should
* not get XDP generic processing.
*/
- if (skb_is_tc_redirected(skb))
+ if (skb_is_redirected(skb))
return XDP_PASS;
/* XDP packets must be linear and must have sufficient headroom
@@ -5063,7 +5063,7 @@ skip_taps:
goto out;
}
#endif
- skb_reset_tc(skb);
+ skb_reset_redirect(skb);
skip_classify:
if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
goto drop;
@@ -5195,7 +5195,7 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
*
* More direct receive version of netif_receive_skb(). It should
* only be used by callers that have a need to skip RPS and Generic XDP.
- * Caller must also take care of handling if (page_is_)pfmemalloc.
+ * Caller must also take care of handling if ``(page_is_)pfmemalloc``.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
@@ -8655,15 +8655,17 @@ static void dev_xdp_uninstall(struct net_device *dev)
* @dev: device
* @extack: netlink extended ack
* @fd: new program fd or negative value to clear
+ * @expected_fd: old program fd that userspace expects to replace or clear
* @flags: xdp-related flags
*
* Set or clear a bpf program for a device
*/
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
- int fd, u32 flags)
+ int fd, int expected_fd, u32 flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
enum bpf_netdev_command query;
+ u32 prog_id, expected_id = 0;
struct bpf_prog *prog = NULL;
bpf_op_t bpf_op, bpf_chk;
bool offload;
@@ -8684,15 +8686,29 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
if (bpf_op == bpf_chk)
bpf_chk = generic_xdp_install;
- if (fd >= 0) {
- u32 prog_id;
+ prog_id = __dev_xdp_query(dev, bpf_op, query);
+ if (flags & XDP_FLAGS_REPLACE) {
+ if (expected_fd >= 0) {
+ prog = bpf_prog_get_type_dev(expected_fd,
+ BPF_PROG_TYPE_XDP,
+ bpf_op == ops->ndo_bpf);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+ expected_id = prog->aux->id;
+ bpf_prog_put(prog);
+ }
+ if (prog_id != expected_id) {
+ NL_SET_ERR_MSG(extack, "Active program does not match expected");
+ return -EEXIST;
+ }
+ }
+ if (fd >= 0) {
if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) {
NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time");
return -EEXIST;
}
- prog_id = __dev_xdp_query(dev, bpf_op, query);
if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) {
NL_SET_ERR_MSG(extack, "XDP program already attached");
return -EBUSY;
@@ -8715,7 +8731,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
return 0;
}
} else {
- if (!__dev_xdp_query(dev, bpf_op, query))
+ if (!prog_id)
return 0;
}
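
The expected_fd/XDP_FLAGS_REPLACE pair turns XDP attach into a
compare-and-swap: the operation succeeds only if the id of the currently
installed program matches the one behind expected_fd (or if nothing is
attached, when expected_fd is negative), so two daemons can no longer
silently clobber each other. A hypothetical userspace sketch using the
libbpf opts struct that grew alongside this kernel change (API assumed;
verify against your libbpf version):

        DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts,
                            .old_fd = old_prog_fd);

        /* Fails with -EEXIST if the attached program is not old_prog_fd. */
        err = bpf_set_link_xdp_fd_opts(ifindex, new_prog_fd,
                                       XDP_FLAGS_REPLACE, &opts);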
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index dbaebbe573f0..547b587c1950 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -190,6 +190,9 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
case HWTSTAMP_TX_ONESTEP_P2P:
tx_type_valid = 1;
break;
+ case __HWTSTAMP_TX_CNT:
+ /* not a real value */
+ break;
}
switch (rx_filter) {
@@ -211,6 +214,9 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
case HWTSTAMP_FILTER_NTP_ALL:
rx_filter_valid = 1;
break;
+ case __HWTSTAMP_FILTER_CNT:
+ /* not a real value */
+ break;
}
if (!tx_type_valid || !rx_filter_valid)
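
The two new cases handle the __HWTSTAMP_TX_CNT / __HWTSTAMP_FILTER_CNT count
sentinels added to the UAPI enums; listing them keeps these switches
exhaustive (so -Wswitch-style checking flags future enum additions) while the
comments record that they are not valid wire values. The sentinel pattern,
sketched with a hypothetical enum:

        enum example_cmd {
                EXAMPLE_CMD_A,
                EXAMPLE_CMD_B,
                __EXAMPLE_CMD_CNT       /* sentinel: number of real values */
        };

        switch (cmd) {
        case EXAMPLE_CMD_A:
        case EXAMPLE_CMD_B:
                valid = 1;
                break;
        case __EXAMPLE_CMD_CNT:
                /* not a real value */
                break;
        }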
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 73bb8fbe3393..80f97722f31f 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -344,7 +344,7 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
struct devlink_region {
struct devlink *devlink;
struct list_head list;
- const char *name;
+ const struct devlink_region_ops *ops;
struct list_head snapshot_list;
u32 max_snapshots;
u32 cur_snapshots;
@@ -354,7 +354,6 @@ struct devlink_region {
struct devlink_snapshot {
struct list_head list;
struct devlink_region *region;
- devlink_snapshot_data_dest_t *data_destructor;
u8 *data;
u32 id;
};
@@ -365,7 +364,7 @@ devlink_region_get_by_name(struct devlink *devlink, const char *region_name)
struct devlink_region *region;
list_for_each_entry(region, &devlink->region_list, list)
- if (!strcmp(region->name, region_name))
+ if (!strcmp(region->ops->name, region_name))
return region;
return NULL;
@@ -2710,7 +2709,7 @@ static struct net *devlink_netns_get(struct sk_buff *skb,
struct net *net;
if (!!netns_pid_attr + !!netns_fd_attr + !!netns_id_attr > 1) {
- NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified");
+ NL_SET_ERR_MSG_MOD(info->extack, "multiple netns identifying attributes specified");
return ERR_PTR(-EINVAL);
}
@@ -2728,7 +2727,7 @@ static struct net *devlink_netns_get(struct sk_buff *skb,
net = ERR_PTR(-EINVAL);
}
if (IS_ERR(net)) {
- NL_SET_ERR_MSG(info->extack, "Unknown network namespace");
+ NL_SET_ERR_MSG_MOD(info->extack, "Unknown network namespace");
return ERR_PTR(-EINVAL);
}
if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
@@ -3695,7 +3694,7 @@ static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
if (err)
goto nla_put_failure;
- err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->name);
+ err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME, region->ops->name);
if (err)
goto nla_put_failure;
@@ -3741,7 +3740,7 @@ static void devlink_nl_region_notify(struct devlink_region *region,
goto out_cancel_msg;
err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME,
- region->name);
+ region->ops->name);
if (err)
goto out_cancel_msg;
@@ -3769,13 +3768,201 @@ out_free_msg:
nlmsg_free(msg);
}
+/**
+ * __devlink_snapshot_id_increment - Increment number of snapshots using an id
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Track when a new snapshot begins using an id. Load the count for the
+ * given id from the snapshot xarray, increment it, and store it back.
+ *
+ * Called when a new snapshot is created with the given id.
+ *
+ * The id *must* have been previously allocated by
+ * devlink_region_snapshot_id_get().
+ *
+ * Returns 0 on success, or an error on failure.
+ */
+static int __devlink_snapshot_id_increment(struct devlink *devlink, u32 id)
+{
+ unsigned long count;
+ void *p;
+
+ lockdep_assert_held(&devlink->lock);
+
+ p = xa_load(&devlink->snapshot_ids, id);
+ if (WARN_ON(!p))
+ return -EINVAL;
+
+ if (WARN_ON(!xa_is_value(p)))
+ return -EINVAL;
+
+ count = xa_to_value(p);
+ count++;
+
+ return xa_err(xa_store(&devlink->snapshot_ids, id, xa_mk_value(count),
+ GFP_KERNEL));
+}
+
+/**
+ * __devlink_snapshot_id_decrement - Decrease number of snapshots using an id
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Track when a snapshot is deleted and stops using an id. Load the count
+ * for the given id from the snapshot xarray, decrement it, and store it
+ * back.
+ *
+ * If the count reaches zero, erase this id from the xarray, freeing it
+ * up for future re-use by devlink_region_snapshot_id_get().
+ *
+ * Called when a snapshot using the given id is deleted, and when the
+ * initial allocator of the id is finished using it.
+ */
+static void __devlink_snapshot_id_decrement(struct devlink *devlink, u32 id)
+{
+ unsigned long count;
+ void *p;
+
+ lockdep_assert_held(&devlink->lock);
+
+ p = xa_load(&devlink->snapshot_ids, id);
+ if (WARN_ON(!p))
+ return;
+
+ if (WARN_ON(!xa_is_value(p)))
+ return;
+
+ count = xa_to_value(p);
+
+ if (count > 1) {
+ count--;
+ xa_store(&devlink->snapshot_ids, id, xa_mk_value(count),
+ GFP_KERNEL);
+ } else {
+ /* If this was the last user, we can erase this id */
+ xa_erase(&devlink->snapshot_ids, id);
+ }
+}
+
+/**
+ * __devlink_snapshot_id_insert - Insert a specific snapshot ID
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Mark the given snapshot id as used by inserting a zero value into the
+ * snapshot xarray.
+ *
+ * This must be called while holding the devlink instance lock. Unlike
+ * devlink_region_snapshot_id_get(), the initial reference count is zero,
+ * not one.
+ * It is expected that the id will immediately be used before
+ * releasing the devlink instance lock.
+ *
+ * Returns zero on success, or an error code if the snapshot id could not
+ * be inserted.
+ */
+static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id)
+{
+ lockdep_assert_held(&devlink->lock);
+
+ if (WARN_ON(xa_load(&devlink->snapshot_ids, id)))
+ return -EEXIST;
+
+ return xa_err(xa_store(&devlink->snapshot_ids, id, xa_mk_value(0),
+ GFP_KERNEL));
+}
+
+/**
+ * __devlink_region_snapshot_id_get - get snapshot ID
+ * @devlink: devlink instance
+ * @id: storage to return snapshot id
+ *
+ * Allocates a new snapshot id. Returns zero on success, or a negative
+ * error on failure. Must be called while holding the devlink instance
+ * lock.
+ *
+ * Snapshot IDs are tracked using an xarray which stores the number of
+ * users of the snapshot id.
+ *
+ * Note that the caller of this function counts as a 'user', in order to
+ * avoid race conditions. The caller must release its hold on the
+ * snapshot by using devlink_region_snapshot_id_put.
+ */
+static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id)
+{
+ lockdep_assert_held(&devlink->lock);
+
+ return xa_alloc(&devlink->snapshot_ids, id, xa_mk_value(1),
+ xa_limit_32b, GFP_KERNEL);
+}
+
+/**
+ * __devlink_region_snapshot_create - create a new snapshot
+ * @region: devlink region of the snapshot
+ * @data: snapshot data
+ * @snapshot_id: snapshot id to be created
+ *
+ * This will add a new snapshot of a region. The snapshot
+ * will be stored on the region struct and can be accessed
+ * from devlink. This is useful for future analyses of snapshots.
+ * Multiple snapshots can be created on a region.
+ * The @snapshot_id should be obtained using the getter function.
+ *
+ * Must be called only while holding the devlink instance lock.
+ */
+static int
+__devlink_region_snapshot_create(struct devlink_region *region,
+ u8 *data, u32 snapshot_id)
+{
+ struct devlink *devlink = region->devlink;
+ struct devlink_snapshot *snapshot;
+ int err;
+
+ lockdep_assert_held(&devlink->lock);
+
+ /* check if region can hold one more snapshot */
+ if (region->cur_snapshots == region->max_snapshots)
+ return -ENOSPC;
+
+ if (devlink_region_snapshot_get_by_id(region, snapshot_id))
+ return -EEXIST;
+
+ snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
+ if (!snapshot)
+ return -ENOMEM;
+
+ err = __devlink_snapshot_id_increment(devlink, snapshot_id);
+ if (err)
+ goto err_snapshot_id_increment;
+
+ snapshot->id = snapshot_id;
+ snapshot->region = region;
+ snapshot->data = data;
+
+ list_add_tail(&snapshot->list, &region->snapshot_list);
+
+ region->cur_snapshots++;
+
+ devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW);
+ return 0;
+
+err_snapshot_id_increment:
+ kfree(snapshot);
+ return err;
+}
+
static void devlink_region_snapshot_del(struct devlink_region *region,
struct devlink_snapshot *snapshot)
{
+ struct devlink *devlink = region->devlink;
+
+ lockdep_assert_held(&devlink->lock);
+
devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL);
region->cur_snapshots--;
list_del(&snapshot->list);
- (*snapshot->data_destructor)(snapshot->data);
+ region->ops->destructor(snapshot->data);
+ __devlink_snapshot_id_decrement(devlink, snapshot->id);
kfree(snapshot);
}
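
The snapshot-id helpers above keep a use count per id directly inside the
xarray entry via xa_mk_value()/xa_to_value(), so no per-id allocation is
needed and xa_alloc() hands out free ids. The same pattern as a
self-contained sketch (hypothetical id map, standard xarray API):

        #include <linux/xarray.h>

        static DEFINE_XARRAY_ALLOC(example_ids);

        static int example_id_get(u32 *id)      /* initial use count of 1 */
        {
                return xa_alloc(&example_ids, id, xa_mk_value(1),
                                xa_limit_32b, GFP_KERNEL);
        }

        /* Assumes 'id' was previously allocated by example_id_get(). */
        static void example_id_put(u32 id)
        {
                unsigned long count = xa_to_value(xa_load(&example_ids, id));

                if (count > 1)
                        xa_store(&example_ids, id, xa_mk_value(count - 1),
                                 GFP_KERNEL);
                else
                        xa_erase(&example_ids, id);     /* last user */
        }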
@@ -3878,6 +4065,71 @@ static int devlink_nl_cmd_region_del(struct sk_buff *skb,
return 0;
}
+static int
+devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_region *region;
+ const char *region_name;
+ u32 snapshot_id;
+ u8 *data;
+ int err;
+
+ if (!info->attrs[DEVLINK_ATTR_REGION_NAME]) {
+ NL_SET_ERR_MSG_MOD(info->extack, "No region name provided");
+ return -EINVAL;
+ }
+
+ if (!info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) {
+ NL_SET_ERR_MSG_MOD(info->extack, "No snapshot id provided");
+ return -EINVAL;
+ }
+
+ region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
+ region = devlink_region_get_by_name(devlink, region_name);
+ if (!region) {
+ NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not exist");
+ return -EINVAL;
+ }
+
+ if (!region->ops->snapshot) {
+ NL_SET_ERR_MSG_MOD(info->extack, "The requested region does not support taking an immediate snapshot");
+ return -EOPNOTSUPP;
+ }
+
+ if (region->cur_snapshots == region->max_snapshots) {
+ NL_SET_ERR_MSG_MOD(info->extack, "The region has reached the maximum number of stored snapshots");
+ return -ENOSPC;
+ }
+
+ snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
+
+ if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
+ NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use");
+ return -EEXIST;
+ }
+
+ err = __devlink_snapshot_id_insert(devlink, snapshot_id);
+ if (err)
+ return err;
+
+ err = region->ops->snapshot(devlink, info->extack, &data);
+ if (err)
+ goto err_snapshot_capture;
+
+ err = __devlink_region_snapshot_create(region, data, snapshot_id);
+ if (err)
+ goto err_snapshot_create;
+
+ return 0;
+
+err_snapshot_create:
+ region->ops->destructor(data);
+err_snapshot_capture:
+ __devlink_snapshot_id_decrement(devlink, snapshot_id);
+ return err;
+}
+
static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg,
struct devlink *devlink,
u8 *chunk, u32 chunk_size,
@@ -4837,6 +5089,7 @@ struct devlink_health_reporter {
struct mutex dump_lock; /* lock parallel read/write from dump buffers */
u64 graceful_period;
bool auto_recover;
+ bool auto_dump;
u8 health_state;
u64 dump_ts;
u64 dump_real_ts;
@@ -4872,14 +5125,12 @@ devlink_health_reporter_find_by_name(struct devlink *devlink,
* @devlink: devlink
* @ops: ops
* @graceful_period: to avoid recovery loops, in msecs
- * @auto_recover: auto recover when error occurs
* @priv: priv
*/
struct devlink_health_reporter *
devlink_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, bool auto_recover,
- void *priv)
+ u64 graceful_period, void *priv)
{
struct devlink_health_reporter *reporter;
@@ -4889,8 +5140,7 @@ devlink_health_reporter_create(struct devlink *devlink,
goto unlock;
}
- if (WARN_ON(auto_recover && !ops->recover) ||
- WARN_ON(graceful_period && !ops->recover)) {
+ if (WARN_ON(graceful_period && !ops->recover)) {
reporter = ERR_PTR(-EINVAL);
goto unlock;
}
@@ -4905,7 +5155,8 @@ devlink_health_reporter_create(struct devlink *devlink,
reporter->ops = ops;
reporter->devlink = devlink;
reporter->graceful_period = graceful_period;
- reporter->auto_recover = auto_recover;
+ reporter->auto_recover = !!ops->recover;
+ reporter->auto_dump = !!ops->dump;
mutex_init(&reporter->dump_lock);
refcount_set(&reporter->refcount, 1);
list_add_tail(&reporter->list, &devlink->reporter_list);
@@ -4986,6 +5237,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg,
nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS,
reporter->dump_real_ts, DEVLINK_ATTR_PAD))
goto reporter_nest_cancel;
+ if (reporter->ops->dump &&
+ nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP,
+ reporter->auto_dump))
+ goto reporter_nest_cancel;
nla_nest_end(msg, reporter_attr);
genlmsg_end(msg, hdr);
@@ -5132,10 +5387,12 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
- mutex_lock(&reporter->dump_lock);
- /* store current dump of current error, for later analysis */
- devlink_health_do_dump(reporter, priv_ctx, NULL);
- mutex_unlock(&reporter->dump_lock);
+ if (reporter->auto_dump) {
+ mutex_lock(&reporter->dump_lock);
+ /* store current dump of current error, for later analysis */
+ devlink_health_do_dump(reporter, priv_ctx, NULL);
+ mutex_unlock(&reporter->dump_lock);
+ }
if (reporter->auto_recover)
return devlink_health_reporter_recover(reporter,
@@ -5309,6 +5566,11 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb,
err = -EOPNOTSUPP;
goto out;
}
+ if (!reporter->ops->dump &&
+ info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD])
reporter->graceful_period =
@@ -5318,6 +5580,10 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb,
reporter->auto_recover =
nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]);
+ if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])
+ reporter->auto_dump =
+ nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]);
+
devlink_health_reporter_put(reporter);
return 0;
out:
@@ -5454,8 +5720,26 @@ struct devlink_stats {
};
/**
+ * struct devlink_trap_policer_item - Packet trap policer attributes.
+ * @policer: Immutable packet trap policer attributes.
+ * @rate: Rate in packets / sec.
+ * @burst: Burst size in packets.
+ * @list: trap_policer_list member.
+ *
+ * Describes packet trap policer attributes. Created by devlink during trap
+ * policer registration.
+ */
+struct devlink_trap_policer_item {
+ const struct devlink_trap_policer *policer;
+ u64 rate;
+ u64 burst;
+ struct list_head list;
+};
+
+/**
* struct devlink_trap_group_item - Packet trap group attributes.
* @group: Immutable packet trap group attributes.
+ * @policer_item: Associated policer item. Can be NULL.
* @list: trap_group_list member.
* @stats: Trap group statistics.
*
@@ -5464,6 +5748,7 @@ struct devlink_stats {
*/
struct devlink_trap_group_item {
const struct devlink_trap_group *group;
+ struct devlink_trap_policer_item *policer_item;
struct list_head list;
struct devlink_stats __percpu *stats;
};
@@ -5489,6 +5774,19 @@ struct devlink_trap_item {
void *priv;
};
+static struct devlink_trap_policer_item *
+devlink_trap_policer_item_lookup(struct devlink *devlink, u32 id)
+{
+ struct devlink_trap_policer_item *policer_item;
+
+ list_for_each_entry(policer_item, &devlink->trap_policer_list, list) {
+ if (policer_item->policer->id == id)
+ return policer_item;
+ }
+
+ return NULL;
+}
+
static struct devlink_trap_item *
devlink_trap_item_lookup(struct devlink *devlink, const char *name)
{
@@ -5865,6 +6163,11 @@ devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink,
nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC))
goto nla_put_failure;
+ if (group_item->policer_item &&
+ nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID,
+ group_item->policer_item->policer->id))
+ goto nla_put_failure;
+
err = devlink_trap_stats_put(msg, group_item->stats);
if (err)
goto nla_put_failure;
@@ -5980,7 +6283,7 @@ __devlink_trap_group_action_set(struct devlink *devlink,
static int
devlink_trap_group_action_set(struct devlink *devlink,
struct devlink_trap_group_item *group_item,
- struct genl_info *info)
+ struct genl_info *info, bool *p_modified)
{
enum devlink_trap_action trap_action;
int err;
@@ -5999,6 +6302,47 @@ devlink_trap_group_action_set(struct devlink *devlink,
if (err)
return err;
+ *p_modified = true;
+
+ return 0;
+}
+
+static int devlink_trap_group_set(struct devlink *devlink,
+ struct devlink_trap_group_item *group_item,
+ struct genl_info *info)
+{
+ struct devlink_trap_policer_item *policer_item;
+ struct netlink_ext_ack *extack = info->extack;
+ const struct devlink_trap_policer *policer;
+ struct nlattr **attrs = info->attrs;
+ int err;
+
+ if (!attrs[DEVLINK_ATTR_TRAP_POLICER_ID])
+ return 0;
+
+ if (!devlink->ops->trap_group_set)
+ return -EOPNOTSUPP;
+
+ policer_item = group_item->policer_item;
+ if (attrs[DEVLINK_ATTR_TRAP_POLICER_ID]) {
+ u32 policer_id;
+
+ policer_id = nla_get_u32(attrs[DEVLINK_ATTR_TRAP_POLICER_ID]);
+ policer_item = devlink_trap_policer_item_lookup(devlink,
+ policer_id);
+ if (policer_id && !policer_item) {
+ NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer");
+ return -ENOENT;
+ }
+ }
+ policer = policer_item ? policer_item->policer : NULL;
+
+ err = devlink->ops->trap_group_set(devlink, group_item->group, policer);
+ if (err)
+ return err;
+
+ group_item->policer_item = policer_item;
+
return 0;
}
@@ -6008,6 +6352,7 @@ static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb,
struct netlink_ext_ack *extack = info->extack;
struct devlink *devlink = info->user_ptr[0];
struct devlink_trap_group_item *group_item;
+ bool modified = false;
int err;
if (list_empty(&devlink->trap_group_list))
@@ -6019,14 +6364,262 @@ static int devlink_nl_cmd_trap_group_set_doit(struct sk_buff *skb,
return -ENOENT;
}
- err = devlink_trap_group_action_set(devlink, group_item, info);
+ err = devlink_trap_group_action_set(devlink, group_item, info,
+ &modified);
+ if (err)
+ return err;
+
+ err = devlink_trap_group_set(devlink, group_item, info);
+ if (err)
+ goto err_trap_group_set;
+
+ return 0;
+
+err_trap_group_set:
+ if (modified)
+ NL_SET_ERR_MSG_MOD(extack, "Trap group set failed, but some changes were committed already");
+ return err;
+}
+
+static struct devlink_trap_policer_item *
+devlink_trap_policer_item_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ u32 id;
+
+ if (!info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID])
+ return NULL;
+ id = nla_get_u32(info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID]);
+
+ return devlink_trap_policer_item_lookup(devlink, id);
+}
+
+static int
+devlink_trap_policer_stats_put(struct sk_buff *msg, struct devlink *devlink,
+ const struct devlink_trap_policer *policer)
+{
+ struct nlattr *attr;
+ u64 drops;
+ int err;
+
+ if (!devlink->ops->trap_policer_counter_get)
+ return 0;
+
+ err = devlink->ops->trap_policer_counter_get(devlink, policer, &drops);
if (err)
return err;
+ attr = nla_nest_start(msg, DEVLINK_ATTR_STATS);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_STATS_RX_DROPPED, drops,
+ DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static int
+devlink_nl_trap_policer_fill(struct sk_buff *msg, struct devlink *devlink,
+ const struct devlink_trap_policer_item *policer_item,
+ enum devlink_command cmd, u32 portid, u32 seq,
+ int flags)
+{
+ void *hdr;
+ int err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID,
+ policer_item->policer->id))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_RATE,
+ policer_item->rate, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_TRAP_POLICER_BURST,
+ policer_item->burst, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+ err = devlink_trap_policer_stats_put(msg, devlink,
+ policer_item->policer);
+ if (err)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_cmd_trap_policer_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_trap_policer_item *policer_item;
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ if (list_empty(&devlink->trap_policer_list))
+ return -EOPNOTSUPP;
+
+ policer_item = devlink_trap_policer_item_get_from_info(devlink, info);
+ if (!policer_item) {
+ NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer");
+ return -ENOENT;
+ }
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_trap_policer_fill(msg, devlink, policer_item,
+ DEVLINK_CMD_TRAP_POLICER_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err)
+ goto err_trap_policer_fill;
+
+ return genlmsg_reply(msg, info);
+
+err_trap_policer_fill:
+ nlmsg_free(msg);
+ return err;
+}
+
+static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ enum devlink_command cmd = DEVLINK_CMD_TRAP_POLICER_NEW;
+ struct devlink_trap_policer_item *policer_item;
+ u32 portid = NETLINK_CB(cb->skb).portid;
+ struct devlink *devlink;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ mutex_lock(&devlink->lock);
+ list_for_each_entry(policer_item, &devlink->trap_policer_list,
+ list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_trap_policer_fill(msg, devlink,
+ policer_item, cmd,
+ portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI);
+ if (err) {
+ mutex_unlock(&devlink->lock);
+ goto out;
+ }
+ idx++;
+ }
+ mutex_unlock(&devlink->lock);
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int
+devlink_trap_policer_set(struct devlink *devlink,
+ struct devlink_trap_policer_item *policer_item,
+ struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct nlattr **attrs = info->attrs;
+ u64 rate, burst;
+ int err;
+
+ rate = policer_item->rate;
+ burst = policer_item->burst;
+
+ if (attrs[DEVLINK_ATTR_TRAP_POLICER_RATE])
+ rate = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]);
+
+ if (attrs[DEVLINK_ATTR_TRAP_POLICER_BURST])
+ burst = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]);
+
+ if (rate < policer_item->policer->min_rate) {
+ NL_SET_ERR_MSG_MOD(extack, "Policer rate lower than limit");
+ return -EINVAL;
+ }
+
+ if (rate > policer_item->policer->max_rate) {
+ NL_SET_ERR_MSG_MOD(extack, "Policer rate higher than limit");
+ return -EINVAL;
+ }
+
+ if (burst < policer_item->policer->min_burst) {
+ NL_SET_ERR_MSG_MOD(extack, "Policer burst size lower than limit");
+ return -EINVAL;
+ }
+
+ if (burst > policer_item->policer->max_burst) {
+ NL_SET_ERR_MSG_MOD(extack, "Policer burst size higher than limit");
+ return -EINVAL;
+ }
+
+ err = devlink->ops->trap_policer_set(devlink, policer_item->policer,
+ rate, burst, info->extack);
+ if (err)
+ return err;
+
+ policer_item->rate = rate;
+ policer_item->burst = burst;
+
+ return 0;
+}
+
+static int devlink_nl_cmd_trap_policer_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_trap_policer_item *policer_item;
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+
+ if (list_empty(&devlink->trap_policer_list))
+ return -EOPNOTSUPP;
+
+ if (!devlink->ops->trap_policer_set)
+ return -EOPNOTSUPP;
+
+ policer_item = devlink_trap_policer_item_get_from_info(devlink, info);
+ if (!policer_item) {
+ NL_SET_ERR_MSG_MOD(extack, "Device did not register this trap policer");
+ return -ENOENT;
+ }
+
+ return devlink_trap_policer_set(devlink, policer_item, info);
}
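
The doit handler above leaves all range checking to devlink_trap_policer_set(), so a driver's trap_policer_set callback only has to program the hardware. A minimal sketch of such a callback, assuming a hypothetical my_priv/my_hw_policer_configure() back-end:

    static int my_trap_policer_set(struct devlink *devlink,
                                   const struct devlink_trap_policer *policer,
                                   u64 rate, u64 burst,
                                   struct netlink_ext_ack *extack)
    {
            struct my_priv *priv = devlink_priv(devlink);

            /* devlink already validated rate/burst against the policer's
             * registered {min,max}_rate and {min,max}_burst limits.
             */
            return my_hw_policer_configure(priv, policer->id, rate, burst);
    }
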
static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
+ [DEVLINK_ATTR_UNSPEC] = { .strict_start_type =
+ DEVLINK_ATTR_TRAP_POLICER_ID },
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
[DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 },
@@ -6064,6 +6657,10 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
[DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32 },
[DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32 },
[DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8 },
+ [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64 },
+ [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64 },
};
static const struct genl_ops devlink_nl_ops[] = {
@@ -6187,7 +6784,8 @@ static const struct genl_ops devlink_nl_ops[] = {
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_eswitch_get_doit,
.flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
+ DEVLINK_NL_FLAG_NO_LOCK,
},
{
.cmd = DEVLINK_CMD_ESWITCH_SET,
@@ -6286,6 +6884,13 @@ static const struct genl_ops devlink_nl_ops[] = {
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
{
+ .cmd = DEVLINK_CMD_REGION_NEW,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = devlink_nl_cmd_region_new,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
+ {
.cmd = DEVLINK_CMD_REGION_DEL,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = devlink_nl_cmd_region_del,
@@ -6391,6 +6996,19 @@ static const struct genl_ops devlink_nl_ops[] = {
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
},
+ {
+ .cmd = DEVLINK_CMD_TRAP_POLICER_GET,
+ .doit = devlink_nl_cmd_trap_policer_get_doit,
+ .dumpit = devlink_nl_cmd_trap_policer_get_dumpit,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_POLICER_SET,
+ .doit = devlink_nl_cmd_trap_policer_set_doit,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
+ },
};
static struct genl_family devlink_nl_family __ro_after_init = {
@@ -6428,6 +7046,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
if (!devlink)
return NULL;
devlink->ops = ops;
+ xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC);
__devlink_net_set(devlink, &init_net);
INIT_LIST_HEAD(&devlink->port_list);
INIT_LIST_HEAD(&devlink->sb_list);
@@ -6438,6 +7057,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
INIT_LIST_HEAD(&devlink->reporter_list);
INIT_LIST_HEAD(&devlink->trap_list);
INIT_LIST_HEAD(&devlink->trap_group_list);
+ INIT_LIST_HEAD(&devlink->trap_policer_list);
mutex_init(&devlink->lock);
mutex_init(&devlink->reporters_lock);
return devlink;
@@ -6522,6 +7142,7 @@ void devlink_free(struct devlink *devlink)
{
mutex_destroy(&devlink->reporters_lock);
mutex_destroy(&devlink->lock);
+ WARN_ON(!list_empty(&devlink->trap_policer_list));
WARN_ON(!list_empty(&devlink->trap_group_list));
WARN_ON(!list_empty(&devlink->trap_list));
WARN_ON(!list_empty(&devlink->reporter_list));
@@ -6532,6 +7153,8 @@ void devlink_free(struct devlink *devlink)
WARN_ON(!list_empty(&devlink->sb_list));
WARN_ON(!list_empty(&devlink->port_list));
+ xa_destroy(&devlink->snapshot_ids);
+
kfree(devlink);
}
EXPORT_SYMBOL_GPL(devlink_free);
@@ -7647,21 +8270,24 @@ EXPORT_SYMBOL_GPL(devlink_param_value_str_fill);
* devlink_region_create - create a new address region
*
* @devlink: devlink
- * @region_name: region name
+ * @ops: region operations and name
* @region_max_snapshots: Maximum supported number of snapshots for region
* @region_size: size of region
*/
-struct devlink_region *devlink_region_create(struct devlink *devlink,
- const char *region_name,
- u32 region_max_snapshots,
- u64 region_size)
+struct devlink_region *
+devlink_region_create(struct devlink *devlink,
+ const struct devlink_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
{
struct devlink_region *region;
int err = 0;
+ if (WARN_ON(!ops) || WARN_ON(!ops->destructor))
+ return ERR_PTR(-EINVAL);
+
mutex_lock(&devlink->lock);
- if (devlink_region_get_by_name(devlink, region_name)) {
+ if (devlink_region_get_by_name(devlink, ops->name)) {
err = -EEXIST;
goto unlock;
}
@@ -7674,7 +8300,7 @@ struct devlink_region *devlink_region_create(struct devlink *devlink,
region->devlink = devlink;
region->max_snapshots = region_max_snapshots;
- region->name = region_name;
+ region->ops = ops;
region->size = region_size;
INIT_LIST_HEAD(&region->snapshot_list);
list_add_tail(&region->list, &devlink->region_list);
@@ -7720,75 +8346,66 @@ EXPORT_SYMBOL_GPL(devlink_region_destroy);
* Driver should use the same id for multiple snapshots taken
* on multiple regions at the same time/by the same trigger.
*
+ * The caller of this function must use devlink_region_snapshot_id_put
+ * when finished creating snapshots using this id.
+ *
+ * Returns zero on success, or a negative error code on failure.
+ *
* @devlink: devlink
+ * @id: storage to return id
*/
-u32 devlink_region_snapshot_id_get(struct devlink *devlink)
+int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id)
{
- u32 id;
+ int err;
mutex_lock(&devlink->lock);
- id = ++devlink->snapshot_id;
+ err = __devlink_region_snapshot_id_get(devlink, id);
mutex_unlock(&devlink->lock);
- return id;
+ return err;
}
EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get);
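
The locked helper __devlink_region_snapshot_id_get() is outside this hunk; given the xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC) call added to devlink_alloc() below, a plausible sketch is an xa_alloc()-based allocator that stores a per-ID use count as an xarray value entry (__devlink_snapshot_id_decrement() would then drop that count and xa_erase() the ID once it hits zero):

    static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id)
    {
            lockdep_assert_held(&devlink->lock);

            /* Sketch only: allocate a free 32-bit ID and record one user. */
            return xa_alloc(&devlink->snapshot_ids, id, xa_mk_value(1),
                            xa_limit_32b, GFP_KERNEL);
    }
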
/**
+ * devlink_region_snapshot_id_put - put snapshot ID reference
+ *
+ * This should be called by a driver after it has finished creating
+ * snapshots with an id. It ensures that the ID can be released once
+ * all snapshots using it have been destroyed.
+ *
+ * @devlink: devlink
+ * @id: id to release reference on
+ */
+void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id)
+{
+ mutex_lock(&devlink->lock);
+ __devlink_snapshot_id_decrement(devlink, id);
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_put);
+
+/**
* devlink_region_snapshot_create - create a new snapshot
* This will add a new snapshot of a region. The snapshot
* will be stored on the region struct and can be accessed
- * from devlink. This is useful for future  analyses of snapshots.
+ * from devlink. This is useful for future analyses of snapshots.
* Multiple snapshots can be created on a region.
* The @snapshot_id should be obtained using the getter function.
*
* @region: devlink region of the snapshot
* @data: snapshot data
* @snapshot_id: snapshot id to be created
- * @data_destructor: pointer to destructor function to free data
*/
int devlink_region_snapshot_create(struct devlink_region *region,
- u8 *data, u32 snapshot_id,
- devlink_snapshot_data_dest_t *data_destructor)
+ u8 *data, u32 snapshot_id)
{
struct devlink *devlink = region->devlink;
- struct devlink_snapshot *snapshot;
int err;
mutex_lock(&devlink->lock);
-
- /* check if region can hold one more snapshot */
- if (region->cur_snapshots == region->max_snapshots) {
- err = -ENOMEM;
- goto unlock;
- }
-
- if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
- err = -EEXIST;
- goto unlock;
- }
-
- snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
- if (!snapshot) {
- err = -ENOMEM;
- goto unlock;
- }
-
- snapshot->id = snapshot_id;
- snapshot->region = region;
- snapshot->data = data;
- snapshot->data_destructor = data_destructor;
-
- list_add_tail(&snapshot->list, &region->snapshot_list);
-
- region->cur_snapshots++;
-
- devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW);
+ err = __devlink_region_snapshot_create(region, data, snapshot_id);
mutex_unlock(&devlink->lock);
- return 0;
-unlock:
- mutex_unlock(&devlink->lock);
return err;
}
EXPORT_SYMBOL_GPL(devlink_region_snapshot_create);
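
With reference-counted IDs, a region driver brackets snapshot creation with the get/put pair. A minimal usage sketch (my_take_snapshot() is hypothetical; 'data' is a driver-allocated buffer whose ownership passes to devlink on success and is later freed through the region ops' destructor):

    static int my_take_snapshot(struct devlink *devlink,
                                struct devlink_region *region, u8 *data)
    {
            u32 snapshot_id;
            int err;

            err = devlink_region_snapshot_id_get(devlink, &snapshot_id);
            if (err)
                    return err;

            err = devlink_region_snapshot_create(region, data, snapshot_id);

            /* Drop the creation-time reference; the ID stays alive for as
             * long as any snapshot still uses it.
             */
            devlink_region_snapshot_id_put(devlink, snapshot_id);

            return err;
    }
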
@@ -8202,6 +8819,25 @@ void *devlink_trap_ctx_priv(void *trap_ctx)
EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv);
static int
+devlink_trap_group_item_policer_link(struct devlink *devlink,
+ struct devlink_trap_group_item *group_item)
+{
+ u32 policer_id = group_item->group->init_policer_id;
+ struct devlink_trap_policer_item *policer_item;
+
+ if (policer_id == 0)
+ return 0;
+
+ policer_item = devlink_trap_policer_item_lookup(devlink, policer_id);
+ if (WARN_ON_ONCE(!policer_item))
+ return -EINVAL;
+
+ group_item->policer_item = policer_item;
+
+ return 0;
+}
+
+static int
devlink_trap_group_register(struct devlink *devlink,
const struct devlink_trap_group *group)
{
@@ -8223,6 +8859,10 @@ devlink_trap_group_register(struct devlink *devlink,
group_item->group = group;
+ err = devlink_trap_group_item_policer_link(devlink, group_item);
+ if (err)
+ goto err_policer_link;
+
if (devlink->ops->trap_group_init) {
err = devlink->ops->trap_group_init(devlink, group);
if (err)
@@ -8236,6 +8876,7 @@ devlink_trap_group_register(struct devlink *devlink,
return 0;
err_group_init:
+err_policer_link:
free_percpu(group_item->stats);
err_stats_alloc:
kfree(group_item);
@@ -8317,6 +8958,148 @@ void devlink_trap_groups_unregister(struct devlink *devlink,
}
EXPORT_SYMBOL_GPL(devlink_trap_groups_unregister);
+static void
+devlink_trap_policer_notify(struct devlink *devlink,
+ const struct devlink_trap_policer_item *policer_item,
+ enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_POLICER_NEW &&
+ cmd != DEVLINK_CMD_TRAP_POLICER_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_trap_policer_fill(msg, devlink, policer_item, cmd, 0,
+ 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int
+devlink_trap_policer_register(struct devlink *devlink,
+ const struct devlink_trap_policer *policer)
+{
+ struct devlink_trap_policer_item *policer_item;
+ int err;
+
+ if (devlink_trap_policer_item_lookup(devlink, policer->id))
+ return -EEXIST;
+
+ policer_item = kzalloc(sizeof(*policer_item), GFP_KERNEL);
+ if (!policer_item)
+ return -ENOMEM;
+
+ policer_item->policer = policer;
+ policer_item->rate = policer->init_rate;
+ policer_item->burst = policer->init_burst;
+
+ if (devlink->ops->trap_policer_init) {
+ err = devlink->ops->trap_policer_init(devlink, policer);
+ if (err)
+ goto err_policer_init;
+ }
+
+ list_add_tail(&policer_item->list, &devlink->trap_policer_list);
+ devlink_trap_policer_notify(devlink, policer_item,
+ DEVLINK_CMD_TRAP_POLICER_NEW);
+
+ return 0;
+
+err_policer_init:
+ kfree(policer_item);
+ return err;
+}
+
+static void
+devlink_trap_policer_unregister(struct devlink *devlink,
+ const struct devlink_trap_policer *policer)
+{
+ struct devlink_trap_policer_item *policer_item;
+
+ policer_item = devlink_trap_policer_item_lookup(devlink, policer->id);
+ if (WARN_ON_ONCE(!policer_item))
+ return;
+
+ devlink_trap_policer_notify(devlink, policer_item,
+ DEVLINK_CMD_TRAP_POLICER_DEL);
+ list_del(&policer_item->list);
+ if (devlink->ops->trap_policer_fini)
+ devlink->ops->trap_policer_fini(devlink, policer);
+ kfree(policer_item);
+}
+
+/**
+ * devlink_trap_policers_register - Register packet trap policers with devlink.
+ * @devlink: devlink.
+ * @policers: Packet trap policers.
+ * @policers_count: Count of provided packet trap policers.
+ *
+ * Return: Non-zero value on failure.
+ */
+int
+devlink_trap_policers_register(struct devlink *devlink,
+ const struct devlink_trap_policer *policers,
+ size_t policers_count)
+{
+ int i, err;
+
+ mutex_lock(&devlink->lock);
+ for (i = 0; i < policers_count; i++) {
+ const struct devlink_trap_policer *policer = &policers[i];
+
+ if (WARN_ON(policer->id == 0 ||
+ policer->max_rate < policer->min_rate ||
+ policer->max_burst < policer->min_burst)) {
+ err = -EINVAL;
+ goto err_trap_policer_verify;
+ }
+
+ err = devlink_trap_policer_register(devlink, policer);
+ if (err)
+ goto err_trap_policer_register;
+ }
+ mutex_unlock(&devlink->lock);
+
+ return 0;
+
+err_trap_policer_register:
+err_trap_policer_verify:
+ for (i--; i >= 0; i--)
+ devlink_trap_policer_unregister(devlink, &policers[i]);
+ mutex_unlock(&devlink->lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_trap_policers_register);
+
+/**
+ * devlink_trap_policers_unregister - Unregister packet trap policers from devlink.
+ * @devlink: devlink.
+ * @policers: Packet trap policers.
+ * @policers_count: Count of provided packet trap policers.
+ */
+void
+devlink_trap_policers_unregister(struct devlink *devlink,
+ const struct devlink_trap_policer *policers,
+ size_t policers_count)
+{
+ int i;
+
+ mutex_lock(&devlink->lock);
+ for (i = policers_count - 1; i >= 0; i--)
+ devlink_trap_policer_unregister(devlink, &policers[i]);
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devlink_trap_policers_unregister);
+
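
For reference, registration works from a static array whose fields mirror the checks above: a non-zero .id, and min/max bounds that bracket the initial rate and burst. A hedged sketch (the values are placeholders; rate is in packets per second per the devlink trap documentation):

    static const struct devlink_trap_policer my_trap_policers[] = {
            {
                    .id = 1,        /* 0 is rejected by the WARN_ON above */
                    .init_rate = 1000,
                    .init_burst = 128,
                    .min_rate = 1,
                    .max_rate = 2000,
                    .min_burst = 16,
                    .max_burst = 512,
            },
    };

    /* ...then from the driver's devlink init path: */
    err = devlink_trap_policers_register(devlink, my_trap_policers,
                                         ARRAY_SIZE(my_trap_policers));
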
static void __devlink_compat_running_version(struct devlink *devlink,
char *buf, size_t len)
{
diff --git a/net/core/filter.c b/net/core/filter.c
index 96350a743539..7628b947dbc3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2642,6 +2642,19 @@ static const struct bpf_func_proto bpf_msg_pop_data_proto = {
.arg4_type = ARG_ANYTHING,
};
+#ifdef CONFIG_CGROUP_NET_CLASSID
+BPF_CALL_0(bpf_get_cgroup_classid_curr)
+{
+ return __task_get_classid(current);
+}
+
+static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
+ .func = bpf_get_cgroup_classid_curr,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+#endif
+
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
return task_get_classid(skb);
@@ -4117,6 +4130,18 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
+{
+ return sock_gen_cookie(ctx);
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
+ .func = bpf_get_socket_cookie_sock,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
{
return sock_gen_cookie(ctx->sk);
@@ -4129,6 +4154,39 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+static u64 __bpf_get_netns_cookie(struct sock *sk)
+{
+#ifdef CONFIG_NET_NS
+ return net_gen_cookie(sk ? sk->sk_net.net : &init_net);
+#else
+ return 0;
+#endif
+}
+
+BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = {
+ .func = bpf_get_netns_cookie_sock,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
+ .func = bpf_get_netns_cookie_sock_addr,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
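
On the BPF side these land as bpf_get_netns_cookie(ctx), callable with a NULL context as well (ARG_PTR_TO_CTX_OR_NULL). A minimal cgroup/connect4 program, assuming a libbpf-style build:

    // SPDX-License-Identifier: GPL-2.0
    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    SEC("cgroup/connect4")
    int tag_by_netns(struct bpf_sock_addr *ctx)
    {
            /* Stable, per-network-namespace cookie; never 0 for an
             * initialized namespace.
             */
            __u64 cookie = bpf_get_netns_cookie(ctx);

            /* e.g. look up a per-netns policy map keyed by 'cookie' here */
            bpf_printk("netns cookie %llu\n", cookie);

            return 1;       /* allow the connect() */
    }

    char _license[] SEC("license") = "GPL";
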
BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
{
struct sock *sk = sk_to_full_sk(skb->sk);
@@ -4147,8 +4205,8 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock,
- struct bpf_map *, map, u64, flags, void *, data, u64, size)
+BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags,
+ void *, data, u64, size)
{
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
return -EINVAL;
@@ -4156,8 +4214,8 @@ BPF_CALL_5(bpf_sockopt_event_output, struct bpf_sock_ops_kern *, bpf_sock,
return bpf_event_output(map, flags, data, size, NULL, 0, NULL);
}
-static const struct bpf_func_proto bpf_sockopt_event_output_proto = {
- .func = bpf_sockopt_event_output,
+static const struct bpf_func_proto bpf_event_output_data_proto = {
+ .func = bpf_event_output_data,
.gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
@@ -5343,8 +5401,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
{
- /* Only full sockets have sk->sk_flags. */
- if (!sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE))
+ if (sk_is_refcounted(sk))
sock_gen_put(sk);
return 0;
}
@@ -5860,6 +5917,36 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
.arg5_type = ARG_CONST_SIZE,
};
+BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
+{
+ if (flags != 0)
+ return -EINVAL;
+ if (!skb_at_tc_ingress(skb))
+ return -EOPNOTSUPP;
+ if (unlikely(dev_net(skb->dev) != sock_net(sk)))
+ return -ENETUNREACH;
+ if (unlikely(sk->sk_reuseport))
+ return -ESOCKTNOSUPPORT;
+ if (sk_is_refcounted(sk) &&
+ unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
+ return -ENOENT;
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = sock_pfree;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_assign_proto = {
+ .func = bpf_sk_assign,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_SOCK_COMMON,
+ .arg3_type = ARG_ANYTHING,
+};
+
#endif /* CONFIG_INET */
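
Taken together with sock_pfree() below, the new helper enables TPROXY-like steering from tc. A hedged sketch of a clsact ingress program that assigns flows to a local listener (address and port are placeholders; error handling trimmed):

    // SPDX-License-Identifier: GPL-2.0
    #include <linux/bpf.h>
    #include <linux/pkt_cls.h>
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_endian.h>

    SEC("tc")
    int steer(struct __sk_buff *skb)
    {
            struct bpf_sock_tuple tuple = {
                    .ipv4.daddr = bpf_htonl(0x7f000001),    /* 127.0.0.1 */
                    .ipv4.dport = bpf_htons(1234),
            };
            struct bpf_sock *sk;

            sk = bpf_skc_lookup_tcp(skb, &tuple, sizeof(tuple.ipv4),
                                    BPF_F_CURRENT_NETNS, 0);
            if (!sk)
                    return TC_ACT_OK;

            /* flags must be 0; the kernel refuses the assignment unless
             * we run at tc ingress and sk lives in the skb's netns.
             */
            bpf_sk_assign(skb, sk, 0);
            bpf_sk_release(sk);

            return TC_ACT_OK;
    }

    char _license[] SEC("license") = "GPL";
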
bool bpf_helper_changes_pkt_data(void *func)
@@ -5954,6 +6041,26 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_sock_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
+#ifdef CONFIG_CGROUPS
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
+#endif
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
default:
return bpf_base_func_proto(func_id);
}
@@ -5978,8 +6085,26 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_sock_addr_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_addr_proto;
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_get_current_pid_tgid:
+ return &bpf_get_current_pid_tgid_proto;
+ case BPF_FUNC_get_current_comm:
+ return &bpf_get_current_comm_proto;
+#ifdef CONFIG_CGROUPS
+ case BPF_FUNC_get_current_cgroup_id:
+ return &bpf_get_current_cgroup_id_proto;
+ case BPF_FUNC_get_current_ancestor_cgroup_id:
+ return &bpf_get_current_ancestor_cgroup_id_proto;
+#endif
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_get_cgroup_classid:
+ return &bpf_get_cgroup_classid_curr_proto;
+#endif
#ifdef CONFIG_INET
case BPF_FUNC_sk_lookup_tcp:
return &bpf_sock_addr_sk_lookup_tcp_proto;
@@ -6153,6 +6278,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_ecn_set_ce_proto;
case BPF_FUNC_tcp_gen_syncookie:
return &bpf_tcp_gen_syncookie_proto;
+ case BPF_FUNC_sk_assign:
+ return &bpf_sk_assign_proto;
#endif
default:
return bpf_base_func_proto(func_id);
@@ -6222,7 +6349,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
case BPF_FUNC_perf_event_output:
- return &bpf_sockopt_event_output_proto;
+ return &bpf_event_output_data_proto;
case BPF_FUNC_sk_storage_get:
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 7440e6117c81..e951b743bed3 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -511,7 +511,8 @@ EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister);
void flow_indr_block_call(struct net_device *dev,
struct flow_block_offload *bo,
- enum flow_block_command command)
+ enum flow_block_command command,
+ enum tc_setup_type type)
{
struct flow_indr_block_cb *indr_block_cb;
struct flow_indr_block_dev *indr_dev;
@@ -521,8 +522,7 @@ void flow_indr_block_call(struct net_device *dev,
return;
list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list)
- indr_block_cb->cb(dev, indr_block_cb->cb_priv, TC_SETUP_BLOCK,
- bo);
+ indr_block_cb->cb(dev, indr_block_cb->cb_priv, type, bo);
}
EXPORT_SYMBOL_GPL(flow_indr_block_call);
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 99a6de52b21d..7d3438215f32 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -367,7 +367,7 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
[LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
};
-static int bpf_build_state(struct nlattr *nla,
+static int bpf_build_state(struct net *net, struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 2f9c0de533c7..8ec7d13d2860 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -41,6 +41,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
return "BPF";
case LWTUNNEL_ENCAP_SEG6_LOCAL:
return "SEG6LOCAL";
+ case LWTUNNEL_ENCAP_RPL:
+ return "RPL";
case LWTUNNEL_ENCAP_IP6:
case LWTUNNEL_ENCAP_IP:
case LWTUNNEL_ENCAP_NONE:
@@ -98,7 +100,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
}
EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops);
-int lwtunnel_build_state(u16 encap_type,
+int lwtunnel_build_state(struct net *net, u16 encap_type,
struct nlattr *encap, unsigned int family,
const void *cfg, struct lwtunnel_state **lws,
struct netlink_ext_ack *extack)
@@ -122,7 +124,7 @@ int lwtunnel_build_state(u16 encap_type,
rcu_read_unlock();
if (found) {
- ret = ops->build_state(encap, family, cfg, lws, extack);
+ ret = ops->build_state(net, encap, family, cfg, lws, extack);
if (ret)
module_put(ops->owner);
} else {
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 5bf8d22a47ec..39d37d0ef575 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1065,11 +1065,12 @@ static void neigh_timer_handler(struct timer_list *t)
neigh->updated = jiffies;
atomic_set(&neigh->probes, 0);
notify = 1;
- next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
+ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
+ HZ/100);
}
} else {
/* NUD_PROBE|NUD_INCOMPLETE */
- next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
+ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100);
}
if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
@@ -1125,7 +1126,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
neigh->nud_state = NUD_INCOMPLETE;
neigh->updated = now;
next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
- HZ/2);
+ HZ/100);
neigh_add_timer(neigh, next);
immediate_probe = true;
} else {
@@ -1427,7 +1428,8 @@ void __neigh_set_probe_once(struct neighbour *neigh)
neigh->nud_state = NUD_INCOMPLETE;
atomic_set(&neigh->probes, neigh_max_probes(neigh));
neigh_add_timer(neigh,
- jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME));
+ jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
+ HZ/100));
}
EXPORT_SYMBOL(__neigh_set_probe_once);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 757cc1d084e7..190ca66a383b 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -69,6 +69,20 @@ EXPORT_SYMBOL_GPL(pernet_ops_rwsem);
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
+static atomic64_t cookie_gen;
+
+u64 net_gen_cookie(struct net *net)
+{
+ while (1) {
+ u64 res = atomic64_read(&net->net_cookie);
+
+ if (res)
+ return res;
+ res = atomic64_inc_return(&cookie_gen);
+ atomic64_cmpxchg(&net->net_cookie, 0, res);
+ }
+}
+
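
The loop above is a lock-free assign-once idiom. Consider two racing callers A and B that both read 0: A draws 1 from cookie_gen and its cmpxchg(0 -> 1) succeeds; B draws 2, but its cmpxchg fails because the cookie is already 1, so B loops, reads 1 and returns it. At worst a few generator values are skipped; a namespace's cookie is never 0 and never changes once assigned, which is exactly what the BPF netns-cookie helpers above rely on.
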
static struct net_generic *net_alloc_generic(void)
{
struct net_generic *ng;
@@ -1087,6 +1101,7 @@ static int __init net_ns_init(void)
panic("Could not allocate generic netns");
rcu_assign_pointer(init_net.gen, ng);
+ net_gen_cookie(&init_net);
down_write(&pernet_ops_rwsem);
if (setup_net(&init_net, &init_user_ns))
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 626db912fce4..ef98372facf6 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -43,9 +43,11 @@ static int page_pool_init(struct page_pool *pool,
* DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
* which is the XDP_TX use-case.
*/
- if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
- (pool->p.dma_dir != DMA_BIDIRECTIONAL))
- return -EINVAL;
+ if (pool->p.flags & PP_FLAG_DMA_MAP) {
+ if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
+ (pool->p.dma_dir != DMA_BIDIRECTIONAL))
+ return -EINVAL;
+ }
if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
/* In order to request DMA-sync-for-device the page
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index f2b3d8dd40f4..08e2811b5274 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3362,7 +3362,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
/* skb was 'freed' by stack, so clean few
* bits and reuse it
*/
- skb_reset_tc(skb);
+ skb_reset_redirect(skb);
} while (--burst > 0);
goto out; /* Skips xmit_mode M_START_XMIT */
} else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 14e6ea21c378..709ebbf8ab5b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1872,7 +1872,9 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
};
static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
+ [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD },
[IFLA_XDP_FD] = { .type = NLA_S32 },
+ [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 },
[IFLA_XDP_ATTACHED] = { .type = NLA_U8 },
[IFLA_XDP_FLAGS] = { .type = NLA_U32 },
[IFLA_XDP_PROG_ID] = { .type = NLA_U32 },
@@ -2799,8 +2801,20 @@ static int do_setlink(const struct sk_buff *skb,
}
if (xdp[IFLA_XDP_FD]) {
+ int expected_fd = -1;
+
+ if (xdp_flags & XDP_FLAGS_REPLACE) {
+ if (!xdp[IFLA_XDP_EXPECTED_FD]) {
+ err = -EINVAL;
+ goto errout;
+ }
+ expected_fd =
+ nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]);
+ }
+
err = dev_change_xdp_fd(dev, extack,
nla_get_s32(xdp[IFLA_XDP_FD]),
+ expected_fd,
xdp_flags);
if (err)
goto errout;
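
Userspace drives the replace handshake by setting XDP_FLAGS_REPLACE and passing the descriptor of the program it believes is attached in IFLA_XDP_EXPECTED_FD; the kernel refuses the swap when the attached program differs (EEXIST in this series). With a libbpf that has the opts variant (added alongside this kernel change; verify against your libbpf version), an atomic swap looks roughly like:

    #include <bpf/libbpf.h>
    #include <linux/if_link.h>

    static int xdp_replace(int ifindex, int new_prog_fd, int old_prog_fd)
    {
            DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts,
                                .old_fd = old_prog_fd);

            /* Fails if the program attached to ifindex is no longer the
             * one behind old_prog_fd.
             */
            return bpf_set_link_xdp_fd_opts(ifindex, new_prog_fd,
                                            XDP_FLAGS_REPLACE, &opts);
    }
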
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 621b4479fee1..7e29590482ce 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3668,6 +3668,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb,
skb_push(nskb, -skb_network_offset(nskb) + offset);
+ skb_release_head_state(nskb);
__copy_skb_header(nskb, skb);
skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
diff --git a/net/core/sock.c b/net/core/sock.c
index 0fc8937a7ff4..ce1d8dce9b7a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -574,7 +574,7 @@ static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
/* Sorry... */
ret = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_RAW))
+ if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
goto out;
ret = -EINVAL;
@@ -2071,6 +2071,18 @@ void sock_efree(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_efree);
+/* Buffer destructor for prefetch/receive path where reference count may
+ * not be held, e.g. for listen sockets.
+ */
+#ifdef CONFIG_INET
+void sock_pfree(struct sk_buff *skb)
+{
+ if (sk_is_refcounted(skb->sk))
+ sock_gen_put(skb->sk);
+}
+EXPORT_SYMBOL(sock_pfree);
+#endif /* CONFIG_INET */
+
kuid_t sock_i_uid(struct sock *sk)
{
kuid_t uid;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index a7075b3b4489..b08dfae10f88 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -299,8 +299,11 @@ static void sock_map_free(struct bpf_map *map)
struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
int i;
+ /* After the sync no updates or deletes will be in-flight so it
+ * is safe to walk the map and remove entries without risking a race
+ * in the EEXIST update case.
+ */
synchronize_rcu();
- raw_spin_lock_bh(&stab->lock);
for (i = 0; i < stab->map.max_entries; i++) {
struct sock **psk = &stab->sks[i];
struct sock *sk;
@@ -314,7 +317,6 @@ static void sock_map_free(struct bpf_map *map)
release_sock(sk);
}
}
- raw_spin_unlock_bh(&stab->lock);
/* wait for psock readers accessing its map link */
synchronize_rcu();
@@ -1008,10 +1010,13 @@ static void sock_hash_free(struct bpf_map *map)
struct hlist_node *node;
int i;
+ /* After the sync no updates or deletes will be in-flight so it
+ * is safe to walk the map and remove entries without risking a race
+ * in the EEXIST update case.
+ */
synchronize_rcu();
for (i = 0; i < htab->buckets_num; i++) {
bucket = sock_hash_select_bucket(htab, i);
- raw_spin_lock_bh(&bucket->lock);
hlist_for_each_entry_safe(elem, node, &bucket->head, node) {
hlist_del_rcu(&elem->node);
lock_sock(elem->sk);
@@ -1020,7 +1025,6 @@ static void sock_hash_free(struct bpf_map *map)
rcu_read_unlock();
release_sock(elem->sk);
}
- raw_spin_unlock_bh(&bucket->lock);
}
/* wait for psock readers accessing its map link */
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 3e1a90669006..ad53eb31d40f 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -302,7 +302,7 @@ static void dns_resolver_describe(const struct key *key, struct seq_file *m)
* - the key's semaphore is read-locked
*/
static long dns_resolver_read(const struct key *key,
- char __user *buffer, size_t buflen)
+ char *buffer, size_t buflen)
{
int err = PTR_ERR(key->payload.data[dns_key_error]);
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 17281fec710c..ee2610c4d46a 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -88,13 +88,9 @@ const struct dsa_device_ops *dsa_tag_driver_get(int tag_protocol)
{
struct dsa_tag_driver *dsa_tag_driver;
const struct dsa_device_ops *ops;
- char module_name[128];
bool found = false;
- snprintf(module_name, 127, "%s%d", DSA_TAG_DRIVER_ALIAS,
- tag_protocol);
-
- request_module(module_name);
+ request_module("%s%d", DSA_TAG_DRIVER_ALIAS, tag_protocol);
mutex_lock(&dsa_tag_drivers_lock);
list_for_each_entry(dsa_tag_driver, &dsa_tag_drivers_list, list) {
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index e7c30b472034..9a271a58a41d 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -18,8 +18,8 @@
#include "dsa_priv.h"
-static LIST_HEAD(dsa_tree_list);
static DEFINE_MUTEX(dsa2_mutex);
+LIST_HEAD(dsa_tree_list);
static const struct devlink_ops dsa_devlink_ops = {
};
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 760e6ea3178a..904cc7c9b882 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -22,6 +22,7 @@ enum {
DSA_NOTIFIER_MDB_DEL,
DSA_NOTIFIER_VLAN_ADD,
DSA_NOTIFIER_VLAN_DEL,
+ DSA_NOTIFIER_MTU,
};
/* DSA_NOTIFIER_AGEING_TIME */
@@ -61,6 +62,14 @@ struct dsa_notifier_vlan_info {
int port;
};
+/* DSA_NOTIFIER_MTU */
+struct dsa_notifier_mtu_info {
+ bool propagate_upstream;
+ int sw_index;
+ int port;
+ int mtu;
+};
+
struct dsa_slave_priv {
/* Copy of CPU port xmit for faster access in slave transmit hot path */
struct sk_buff * (*xmit)(struct sk_buff *skb,
@@ -127,6 +136,8 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
struct switchdev_trans *trans);
int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock,
struct switchdev_trans *trans);
+int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu,
+ bool propagate_upstream);
int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
u16 vid);
int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
@@ -183,4 +194,8 @@ dsa_slave_to_master(const struct net_device *dev)
/* switch.c */
int dsa_switch_register_notifier(struct dsa_switch *ds);
void dsa_switch_unregister_notifier(struct dsa_switch *ds);
+
+/* dsa2.c */
+extern struct list_head dsa_tree_list;
+
#endif
diff --git a/net/dsa/master.c b/net/dsa/master.c
index bd44bde272f4..b5c535af63a3 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -314,20 +314,6 @@ static const struct attribute_group dsa_group = {
.attrs = dsa_slave_attrs,
};
-static void dsa_master_set_mtu(struct net_device *dev, struct dsa_port *cpu_dp)
-{
- unsigned int mtu = ETH_DATA_LEN + cpu_dp->tag_ops->overhead;
- int err;
-
- rtnl_lock();
- if (mtu <= dev->max_mtu) {
- err = dev_set_mtu(dev, mtu);
- if (err)
- netdev_dbg(dev, "Unable to set MTU to include for DSA overheads\n");
- }
- rtnl_unlock();
-}
-
static void dsa_master_reset_mtu(struct net_device *dev)
{
int err;
@@ -344,7 +330,12 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
{
int ret;
- dsa_master_set_mtu(dev, cpu_dp);
+ rtnl_lock();
+ ret = dev_set_mtu(dev, ETH_DATA_LEN + cpu_dp->tag_ops->overhead);
+ rtnl_unlock();
+ if (ret)
+ netdev_warn(dev, "error %d setting MTU to include DSA overhead\n",
+ ret);
/* If we use a tagging format that doesn't have an ethertype
* field, make sure that all packets from this point on get
diff --git a/net/dsa/port.c b/net/dsa/port.c
index a18e65a474a5..231b2d494f1c 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -297,6 +297,19 @@ int dsa_port_mrouter(struct dsa_port *dp, bool mrouter,
return ds->ops->port_egress_floods(ds, port, true, mrouter);
}
+int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu,
+ bool propagate_upstream)
+{
+ struct dsa_notifier_mtu_info info = {
+ .sw_index = dp->ds->index,
+ .propagate_upstream = propagate_upstream,
+ .port = dp->index,
+ .mtu = new_mtu,
+ };
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_MTU, &info);
+}
+
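
The notifier gives per-port MTU programming a single entry point; drivers opt in by implementing port_change_mtu (and optionally port_max_mtu, which caps what the slave netdev advertises as max_mtu). A hedged sketch for a hypothetical switch with 9K jumbo support (my_switch and my_write_max_frame_len are placeholders):

    static int my_port_change_mtu(struct dsa_switch *ds, int port, int new_mtu)
    {
            struct my_switch *priv = ds->priv;

            /* Program the per-port maximum frame length, accounting for
             * the L2 header and FCS on top of the MTU.
             */
            return my_write_max_frame_len(priv, port,
                                          new_mtu + ETH_HLEN + ETH_FCS_LEN);
    }

    static int my_port_max_mtu(struct dsa_switch *ds, int port)
    {
            return 9000 - ETH_HLEN - ETH_FCS_LEN;
    }

Setting ds->mtu_enforcement_ingress = true additionally opts the switch into the bridge-wide MTU normalization done by dsa_bridge_mtu_normalization() in slave.c further down.
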
int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
u16 vid)
{
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 5f782fa3029f..e94eb1aac602 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -842,63 +842,137 @@ dsa_slave_mall_tc_entry_find(struct net_device *dev, unsigned long cookie)
return NULL;
}
-static int dsa_slave_add_cls_matchall(struct net_device *dev,
- struct tc_cls_matchall_offload *cls,
- bool ingress)
+static int
+dsa_slave_add_cls_matchall_mirred(struct net_device *dev,
+ struct tc_cls_matchall_offload *cls,
+ bool ingress)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_mall_mirror_tc_entry *mirror;
struct dsa_mall_tc_entry *mall_tc_entry;
- __be16 protocol = cls->common.protocol;
struct dsa_switch *ds = dp->ds;
struct flow_action_entry *act;
struct dsa_port *to_dp;
- int err = -EOPNOTSUPP;
+ int err;
+
+ act = &cls->rule->action.entries[0];
if (!ds->ops->port_mirror_add)
- return err;
+ return -EOPNOTSUPP;
- if (!flow_offload_has_one_action(&cls->rule->action))
- return err;
+ if (!act->dev)
+ return -EINVAL;
if (!flow_action_basic_hw_stats_check(&cls->rule->action,
cls->common.extack))
- return err;
+ return -EOPNOTSUPP;
act = &cls->rule->action.entries[0];
- if (act->id == FLOW_ACTION_MIRRED && protocol == htons(ETH_P_ALL)) {
- struct dsa_mall_mirror_tc_entry *mirror;
+ if (!dsa_slave_dev_check(act->dev))
+ return -EOPNOTSUPP;
+
+ mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
+ if (!mall_tc_entry)
+ return -ENOMEM;
- if (!act->dev)
- return -EINVAL;
+ mall_tc_entry->cookie = cls->cookie;
+ mall_tc_entry->type = DSA_PORT_MALL_MIRROR;
+ mirror = &mall_tc_entry->mirror;
- if (!dsa_slave_dev_check(act->dev))
- return -EOPNOTSUPP;
+ to_dp = dsa_slave_to_port(act->dev);
+
+ mirror->to_local_port = to_dp->index;
+ mirror->ingress = ingress;
- mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
- if (!mall_tc_entry)
- return -ENOMEM;
+ err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress);
+ if (err) {
+ kfree(mall_tc_entry);
+ return err;
+ }
- mall_tc_entry->cookie = cls->cookie;
- mall_tc_entry->type = DSA_PORT_MALL_MIRROR;
- mirror = &mall_tc_entry->mirror;
+ list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
+
+ return err;
+}
- to_dp = dsa_slave_to_port(act->dev);
+static int
+dsa_slave_add_cls_matchall_police(struct net_device *dev,
+ struct tc_cls_matchall_offload *cls,
+ bool ingress)
+{
+ struct netlink_ext_ack *extack = cls->common.extack;
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_mall_policer_tc_entry *policer;
+ struct dsa_mall_tc_entry *mall_tc_entry;
+ struct dsa_switch *ds = dp->ds;
+ struct flow_action_entry *act;
+ int err;
- mirror->to_local_port = to_dp->index;
- mirror->ingress = ingress;
+ if (!ds->ops->port_policer_add) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Policing offload not implemented\n");
+ return -EOPNOTSUPP;
+ }
- err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress);
- if (err) {
- kfree(mall_tc_entry);
- return err;
+ if (!ingress) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Only supported on ingress qdisc\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (!flow_action_basic_hw_stats_check(&cls->rule->action,
+ cls->common.extack))
+ return -EOPNOTSUPP;
+
+ list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) {
+ if (mall_tc_entry->type == DSA_PORT_MALL_POLICER) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Only one port policer allowed\n");
+ return -EEXIST;
}
+ }
+
+ act = &cls->rule->action.entries[0];
+
+ mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
+ if (!mall_tc_entry)
+ return -ENOMEM;
+
+ mall_tc_entry->cookie = cls->cookie;
+ mall_tc_entry->type = DSA_PORT_MALL_POLICER;
+ policer = &mall_tc_entry->policer;
+ policer->rate_bytes_per_sec = act->police.rate_bytes_ps;
+ policer->burst = act->police.burst;
- list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
+ err = ds->ops->port_policer_add(ds, dp->index, policer);
+ if (err) {
+ kfree(mall_tc_entry);
+ return err;
}
- return 0;
+ list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
+
+ return err;
+}
+
+static int dsa_slave_add_cls_matchall(struct net_device *dev,
+ struct tc_cls_matchall_offload *cls,
+ bool ingress)
+{
+ int err = -EOPNOTSUPP;
+
+ if (cls->common.protocol == htons(ETH_P_ALL) &&
+ flow_offload_has_one_action(&cls->rule->action) &&
+ cls->rule->action.entries[0].id == FLOW_ACTION_MIRRED)
+ err = dsa_slave_add_cls_matchall_mirred(dev, cls, ingress);
+ else if (flow_offload_has_one_action(&cls->rule->action) &&
+ cls->rule->action.entries[0].id == FLOW_ACTION_POLICE)
+ err = dsa_slave_add_cls_matchall_police(dev, cls, ingress);
+
+ return err;
}
static void dsa_slave_del_cls_matchall(struct net_device *dev,
@@ -908,9 +982,6 @@ static void dsa_slave_del_cls_matchall(struct net_device *dev,
struct dsa_mall_tc_entry *mall_tc_entry;
struct dsa_switch *ds = dp->ds;
- if (!ds->ops->port_mirror_del)
- return;
-
mall_tc_entry = dsa_slave_mall_tc_entry_find(dev, cls->cookie);
if (!mall_tc_entry)
return;
@@ -919,7 +990,13 @@ static void dsa_slave_del_cls_matchall(struct net_device *dev,
switch (mall_tc_entry->type) {
case DSA_PORT_MALL_MIRROR:
- ds->ops->port_mirror_del(ds, dp->index, &mall_tc_entry->mirror);
+ if (ds->ops->port_mirror_del)
+ ds->ops->port_mirror_del(ds, dp->index,
+ &mall_tc_entry->mirror);
+ break;
+ case DSA_PORT_MALL_POLICER:
+ if (ds->ops->port_policer_del)
+ ds->ops->port_policer_del(ds, dp->index);
break;
default:
WARN_ON(1);
@@ -1218,6 +1295,208 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
return dsa_port_vid_del(dp, vid);
}
+struct dsa_hw_port {
+ struct list_head list;
+ struct net_device *dev;
+ int old_mtu;
+};
+
+static int dsa_hw_port_list_set_mtu(struct list_head *hw_port_list, int mtu)
+{
+ const struct dsa_hw_port *p;
+ int err;
+
+ list_for_each_entry(p, hw_port_list, list) {
+ if (p->dev->mtu == mtu)
+ continue;
+
+ err = dev_set_mtu(p->dev, mtu);
+ if (err)
+ goto rollback;
+ }
+
+ return 0;
+
+rollback:
+ list_for_each_entry_continue_reverse(p, hw_port_list, list) {
+ if (p->dev->mtu == p->old_mtu)
+ continue;
+
+ if (dev_set_mtu(p->dev, p->old_mtu))
+ netdev_err(p->dev, "Failed to restore MTU\n");
+ }
+
+ return err;
+}
+
+static void dsa_hw_port_list_free(struct list_head *hw_port_list)
+{
+ struct dsa_hw_port *p, *n;
+
+ list_for_each_entry_safe(p, n, hw_port_list, list)
+ kfree(p);
+}
+
+/* Make the hardware datapath to/from @dev limited to a common MTU */
+static void dsa_bridge_mtu_normalization(struct dsa_port *dp)
+{
+ struct list_head hw_port_list;
+ struct dsa_switch_tree *dst;
+ int min_mtu = ETH_MAX_MTU;
+ struct dsa_port *other_dp;
+ int err;
+
+ if (!dp->ds->mtu_enforcement_ingress)
+ return;
+
+ if (!dp->bridge_dev)
+ return;
+
+ INIT_LIST_HEAD(&hw_port_list);
+
+ /* Populate the list of ports that are part of the same bridge
+ * as the newly added/modified port
+ */
+ list_for_each_entry(dst, &dsa_tree_list, list) {
+ list_for_each_entry(other_dp, &dst->ports, list) {
+ struct dsa_hw_port *hw_port;
+ struct net_device *slave;
+
+ if (other_dp->type != DSA_PORT_TYPE_USER)
+ continue;
+
+ if (other_dp->bridge_dev != dp->bridge_dev)
+ continue;
+
+ if (!other_dp->ds->mtu_enforcement_ingress)
+ continue;
+
+ slave = other_dp->slave;
+
+ if (min_mtu > slave->mtu)
+ min_mtu = slave->mtu;
+
+ hw_port = kzalloc(sizeof(*hw_port), GFP_KERNEL);
+ if (!hw_port)
+ goto out;
+
+ hw_port->dev = slave;
+ hw_port->old_mtu = slave->mtu;
+
+ list_add(&hw_port->list, &hw_port_list);
+ }
+ }
+
+ /* Attempt to configure the entire hardware bridge to the newly added
+ * interface's MTU first, regardless of whether the intention of the
+ * user was to raise or lower it.
+ */
+ err = dsa_hw_port_list_set_mtu(&hw_port_list, dp->slave->mtu);
+ if (!err)
+ goto out;
+
+ /* Clearly that didn't work out so well, so just set the minimum MTU on
+ * all hardware bridge ports now. If this fails too, the ports are
+ * simply rolled back to their old MTU anyway.
+ */
+ dsa_hw_port_list_set_mtu(&hw_port_list, min_mtu);
+
+out:
+ dsa_hw_port_list_free(&hw_port_list);
+}
+
+static int dsa_slave_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct net_device *master = dsa_slave_to_master(dev);
+ struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->dp->ds;
+ struct dsa_port *cpu_dp;
+ int port = p->dp->index;
+ int largest_mtu = 0;
+ int new_master_mtu;
+ int old_master_mtu;
+ int mtu_limit;
+ int cpu_mtu;
+ int err, i;
+
+ if (!ds->ops->port_change_mtu)
+ return -EOPNOTSUPP;
+
+ for (i = 0; i < ds->num_ports; i++) {
+ int slave_mtu;
+
+ if (!dsa_is_user_port(ds, i))
+ continue;
+
+ /* During probe, this function will be called for each slave
+ * device, while not all of them have been allocated yet. That's
+ * ok: a missing slave doesn't change what the maximum is, so skip it.
+ */
+ if (!dsa_to_port(ds, i)->slave)
+ continue;
+
+ /* Pretend that we already applied the setting, which we
+ * actually haven't (still haven't done all integrity checks)
+ */
+ if (i == port)
+ slave_mtu = new_mtu;
+ else
+ slave_mtu = dsa_to_port(ds, i)->slave->mtu;
+
+ if (largest_mtu < slave_mtu)
+ largest_mtu = slave_mtu;
+ }
+
+ cpu_dp = dsa_to_port(ds, port)->cpu_dp;
+
+ mtu_limit = min_t(int, master->max_mtu, dev->max_mtu);
+ old_master_mtu = master->mtu;
+ new_master_mtu = largest_mtu + cpu_dp->tag_ops->overhead;
+ if (new_master_mtu > mtu_limit)
+ return -ERANGE;
+
+ /* If the master MTU isn't over limit, there's no need to check the CPU
+ * MTU, since that surely isn't either.
+ */
+ cpu_mtu = largest_mtu;
+
+ /* Start applying stuff */
+ if (new_master_mtu != old_master_mtu) {
+ err = dev_set_mtu(master, new_master_mtu);
+ if (err < 0)
+ goto out_master_failed;
+
+ /* We only need to propagate the MTU of the CPU port to
+ * upstream switches.
+ */
+ err = dsa_port_mtu_change(cpu_dp, cpu_mtu, true);
+ if (err)
+ goto out_cpu_failed;
+ }
+
+ err = dsa_port_mtu_change(dp, new_mtu, false);
+ if (err)
+ goto out_port_failed;
+
+ dev->mtu = new_mtu;
+
+ dsa_bridge_mtu_normalization(dp);
+
+ return 0;
+
+out_port_failed:
+ if (new_master_mtu != old_master_mtu)
+ dsa_port_mtu_change(cpu_dp, old_master_mtu -
+ cpu_dp->tag_ops->overhead,
+ true);
+out_cpu_failed:
+ if (new_master_mtu != old_master_mtu)
+ dev_set_mtu(master, old_master_mtu);
+out_master_failed:
+ return err;
+}
+
static const struct ethtool_ops dsa_slave_ethtool_ops = {
.get_drvinfo = dsa_slave_get_drvinfo,
.get_regs_len = dsa_slave_get_regs_len,
@@ -1295,6 +1574,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = {
.ndo_vlan_rx_add_vid = dsa_slave_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = dsa_slave_vlan_rx_kill_vid,
.ndo_get_devlink_port = dsa_slave_get_devlink_port,
+ .ndo_change_mtu = dsa_slave_change_mtu,
};
static struct device_type dsa_type = {
@@ -1305,7 +1585,8 @@ void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
{
const struct dsa_port *dp = dsa_to_port(ds, port);
- phylink_mac_change(dp->pl, up);
+ if (dp->pl)
+ phylink_mac_change(dp->pl, up);
}
EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change);
@@ -1465,7 +1746,10 @@ int dsa_slave_create(struct dsa_port *port)
slave_dev->priv_flags |= IFF_NO_QUEUE;
slave_dev->netdev_ops = &dsa_slave_netdev_ops;
slave_dev->min_mtu = 0;
- slave_dev->max_mtu = ETH_MAX_MTU;
+ if (ds->ops->port_max_mtu)
+ slave_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index);
+ else
+ slave_dev->max_mtu = ETH_MAX_MTU;
SET_NETDEV_DEVTYPE(slave_dev, &dsa_type);
SET_NETDEV_DEV(slave_dev, port->ds->dev);
@@ -1483,6 +1767,15 @@ int dsa_slave_create(struct dsa_port *port)
p->xmit = cpu_dp->tag_ops->xmit;
port->slave = slave_dev;
+ rtnl_lock();
+ ret = dsa_slave_change_mtu(slave_dev, ETH_DATA_LEN);
+ rtnl_unlock();
+ if (ret && ret != -EOPNOTSUPP) {
+ dev_err(ds->dev, "error %d setting MTU on port %d\n",
+ ret, port->index);
+ goto out_free;
+ }
+
netif_carrier_off(slave_dev);
ret = dsa_slave_phy_setup(slave_dev);
@@ -1545,6 +1838,8 @@ static int dsa_slave_changeupper(struct net_device *dev,
if (netif_is_bridge_master(info->upper_dev)) {
if (info->linking) {
err = dsa_port_bridge_join(dp, info->upper_dev);
+ if (!err)
+ dsa_bridge_mtu_normalization(dp);
err = notifier_from_errno(err);
} else {
dsa_port_bridge_leave(dp, info->upper_dev);
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index df4abe897ed6..f3c32ff552b3 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -52,6 +52,40 @@ static int dsa_switch_ageing_time(struct dsa_switch *ds,
return 0;
}
+static bool dsa_switch_mtu_match(struct dsa_switch *ds, int port,
+ struct dsa_notifier_mtu_info *info)
+{
+ if (ds->index == info->sw_index)
+ return (port == info->port) || dsa_is_dsa_port(ds, port);
+
+ if (!info->propagate_upstream)
+ return false;
+
+ if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port))
+ return true;
+
+ return false;
+}
+
+static int dsa_switch_mtu(struct dsa_switch *ds,
+ struct dsa_notifier_mtu_info *info)
+{
+ int port, ret;
+
+ if (!ds->ops->port_change_mtu)
+ return -EOPNOTSUPP;
+
+ for (port = 0; port < ds->num_ports; port++) {
+ if (dsa_switch_mtu_match(ds, port, info)) {
+ ret = ds->ops->port_change_mtu(ds, port, info->mtu);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
static int dsa_switch_bridge_join(struct dsa_switch *ds,
struct dsa_notifier_bridge_info *info)
{
@@ -328,6 +362,9 @@ static int dsa_switch_event(struct notifier_block *nb,
case DSA_NOTIFIER_VLAN_DEL:
err = dsa_switch_vlan_del(ds, info);
break;
+ case DSA_NOTIFIER_MTU:
+ err = dsa_switch_mtu(ds, info);
+ break;
default:
err = -EOPNOTSUPP;
break;
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 2fb6c26294b5..b97ad93d1c1a 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -298,47 +298,4 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
}
EXPORT_SYMBOL_GPL(dsa_8021q_xmit);
-/* In the DSA packet_type handler, skb->data points in the middle of the VLAN
- * tag, after tpid and before tci. This is because so far, ETH_HLEN
- * (DMAC, SMAC, EtherType) bytes were pulled.
- * There are 2 bytes of VLAN tag left in skb->data, and upper
- * layers expect the 'real' EtherType to be consumed as well.
- * Coincidentally, a VLAN header is also of the same size as
- * the number of bytes that need to be pulled.
- *
- * skb_mac_header skb->data
- * | |
- * v v
- * | | | | | | | | | | | | | | | | | | |
- * +-----------------------+-----------------------+-------+-------+-------+
- * | Destination MAC | Source MAC | TPID | TCI | EType |
- * +-----------------------+-----------------------+-------+-------+-------+
- * ^ | |
- * |<--VLAN_HLEN-->to <---VLAN_HLEN--->
- * from |
- * >>>>>>> v
- * >>>>>>> | | | | | | | | | | | | | | |
- * >>>>>>> +-----------------------+-----------------------+-------+
- * >>>>>>> | Destination MAC | Source MAC | EType |
- * +-----------------------+-----------------------+-------+
- * ^ ^
- * (now part of | |
- * skb->head) skb_mac_header skb->data
- */
-struct sk_buff *dsa_8021q_remove_header(struct sk_buff *skb)
-{
- u8 *from = skb_mac_header(skb);
- u8 *dest = from + VLAN_HLEN;
-
- memmove(dest, from, ETH_HLEN - VLAN_HLEN);
- skb_pull(skb, VLAN_HLEN);
- skb_push(skb, ETH_HLEN);
- skb_reset_mac_header(skb);
- skb_reset_mac_len(skb);
- skb_pull_rcsum(skb, ETH_HLEN);
-
- return skb;
-}
-EXPORT_SYMBOL_GPL(dsa_8021q_remove_header);
-
MODULE_LICENSE("GPL v2");
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index 0d3f796d14a3..cc8512b5f9e2 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -140,6 +140,8 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb,
/* Remove Broadcom tag and update checksum */
skb_pull_rcsum(skb, BRCM_TAG_LEN);
+ skb->offload_fwd_mark = 1;
+
return skb;
}
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 5366ea430349..d553bf36bd41 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -250,14 +250,14 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
{
struct sja1105_meta meta = {0};
int source_port, switch_id;
- struct vlan_ethhdr *hdr;
+ struct ethhdr *hdr;
u16 tpid, vid, tci;
bool is_link_local;
bool is_tagged;
bool is_meta;
- hdr = vlan_eth_hdr(skb);
- tpid = ntohs(hdr->h_vlan_proto);
+ hdr = eth_hdr(skb);
+ tpid = ntohs(hdr->h_proto);
is_tagged = (tpid == ETH_P_SJA1105);
is_link_local = sja1105_is_link_local(skb);
is_meta = sja1105_is_meta_frame(skb);
@@ -266,7 +266,12 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
if (is_tagged) {
/* Normal traffic path. */
- tci = ntohs(hdr->h_vlan_TCI);
+ skb_push_rcsum(skb, ETH_HLEN);
+ __skb_vlan_pop(skb, &tci);
+ skb_pull_rcsum(skb, ETH_HLEN);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+
vid = tci & VLAN_VID_MASK;
source_port = dsa_8021q_rx_source_port(vid);
switch_id = dsa_8021q_rx_switch_id(vid);
@@ -295,12 +300,6 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
return NULL;
}
- /* Delete/overwrite fake VLAN header, DSA expects to not find
- * it there, see dsa_switch_rcv: skb_push(skb, ETH_HLEN).
- */
- if (is_tagged)
- skb = dsa_8021q_remove_header(skb);
-
return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local,
is_meta);
}
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
index b0bd3decad02..6c360c9c9370 100644
--- a/net/ethtool/Makefile
+++ b/net/ethtool/Makefile
@@ -6,4 +6,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o
ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \
linkstate.o debug.o wol.o features.o privflags.o rings.o \
- channels.o
+ channels.o coalesce.o pause.o eee.o tsinfo.o
diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c
new file mode 100644
index 000000000000..6afd99042d67
--- /dev/null
+++ b/net/ethtool/coalesce.c
@@ -0,0 +1,353 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+
+struct coalesce_req_info {
+ struct ethnl_req_info base;
+};
+
+struct coalesce_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_coalesce coalesce;
+ u32 supported_params;
+};
+
+#define COALESCE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct coalesce_reply_data, base)
+
+#define __SUPPORTED_OFFSET ETHTOOL_A_COALESCE_RX_USECS
+static u32 attr_to_mask(unsigned int attr_type)
+{
+ return BIT(attr_type - __SUPPORTED_OFFSET);
+}
+
+/* build time check that indices in ethtool_ops::supported_coalesce_params
+ * match corresponding attribute types with an offset
+ */
+#define __CHECK_SUPPORTED_OFFSET(x) \
+ static_assert((ETHTOOL_ ## x) == \
+ BIT((ETHTOOL_A_ ## x) - __SUPPORTED_OFFSET))
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_IRQ);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_IRQ);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_IRQ);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_IRQ);
+__CHECK_SUPPORTED_OFFSET(COALESCE_STATS_BLOCK_USECS);
+__CHECK_SUPPORTED_OFFSET(COALESCE_USE_ADAPTIVE_RX);
+__CHECK_SUPPORTED_OFFSET(COALESCE_USE_ADAPTIVE_TX);
+__CHECK_SUPPORTED_OFFSET(COALESCE_PKT_RATE_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_PKT_RATE_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RATE_SAMPLE_INTERVAL);
+
+static const struct nla_policy
+coalesce_get_policy[ETHTOOL_A_COALESCE_MAX + 1] = {
+ [ETHTOOL_A_COALESCE_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_COALESCE_RX_USECS] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_RX_USECS_IRQ] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_TX_USECS] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_TX_USECS_IRQ] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_STATS_BLOCK_USECS] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_PKT_RATE_LOW] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_RX_USECS_LOW] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_TX_USECS_LOW] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_PKT_RATE_HIGH] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_RX_USECS_HIGH] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_TX_USECS_HIGH] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_REJECT },
+};
+
+static int coalesce_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ struct genl_info *info)
+{
+ struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ if (!dev->ethtool_ops->get_coalesce)
+ return -EOPNOTSUPP;
+ data->supported_params = dev->ethtool_ops->supported_coalesce_params;
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ ret = dev->ethtool_ops->get_coalesce(dev, &data->coalesce);
+ ethnl_ops_complete(dev);
+
+ return ret;
+}
+
+static int coalesce_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ return nla_total_size(sizeof(u32)) + /* _RX_USECS */
+ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES */
+ nla_total_size(sizeof(u32)) + /* _RX_USECS_IRQ */
+ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_IRQ */
+ nla_total_size(sizeof(u32)) + /* _TX_USECS */
+ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES */
+ nla_total_size(sizeof(u32)) + /* _TX_USECS_IRQ */
+ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_IRQ */
+ nla_total_size(sizeof(u32)) + /* _STATS_BLOCK_USECS */
+ nla_total_size(sizeof(u8)) + /* _USE_ADAPTIVE_RX */
+ nla_total_size(sizeof(u8)) + /* _USE_ADAPTIVE_TX */
+ nla_total_size(sizeof(u32)) + /* _PKT_RATE_LOW */
+ nla_total_size(sizeof(u32)) + /* _RX_USECS_LOW */
+ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_LOW */
+ nla_total_size(sizeof(u32)) + /* _TX_USECS_LOW */
+ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_LOW */
+ nla_total_size(sizeof(u32)) + /* _PKT_RATE_HIGH */
+ nla_total_size(sizeof(u32)) + /* _RX_USECS_HIGH */
+ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_HIGH */
+ nla_total_size(sizeof(u32)) + /* _TX_USECS_HIGH */
+ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_HIGH */
+ nla_total_size(sizeof(u32)); /* _RATE_SAMPLE_INTERVAL */
+}
+
+static bool coalesce_put_u32(struct sk_buff *skb, u16 attr_type, u32 val,
+ u32 supported_params)
+{
+ if (!val && !(supported_params & attr_to_mask(attr_type)))
+ return false;
+ return nla_put_u32(skb, attr_type, val);
+}
+
+static bool coalesce_put_bool(struct sk_buff *skb, u16 attr_type, u32 val,
+ u32 supported_params)
+{
+ if (!val && !(supported_params & attr_to_mask(attr_type)))
+ return false;
+ return nla_put_u8(skb, attr_type, !!val);
+}
+
+static int coalesce_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base);
+ const struct ethtool_coalesce *coal = &data->coalesce;
+ u32 supported = data->supported_params;
+
+ if (coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS,
+ coal->rx_coalesce_usecs, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES,
+ coal->rx_max_coalesced_frames, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_IRQ,
+ coal->rx_coalesce_usecs_irq, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ,
+ coal->rx_max_coalesced_frames_irq, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS,
+ coal->tx_coalesce_usecs, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES,
+ coal->tx_max_coalesced_frames, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_IRQ,
+ coal->tx_coalesce_usecs_irq, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ,
+ coal->tx_max_coalesced_frames_irq, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_STATS_BLOCK_USECS,
+ coal->stats_block_coalesce_usecs, supported) ||
+ coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX,
+ coal->use_adaptive_rx_coalesce, supported) ||
+ coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX,
+ coal->use_adaptive_tx_coalesce, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_PKT_RATE_LOW,
+ coal->pkt_rate_low, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_LOW,
+ coal->rx_coalesce_usecs_low, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW,
+ coal->rx_max_coalesced_frames_low, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_LOW,
+ coal->tx_coalesce_usecs_low, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW,
+ coal->tx_max_coalesced_frames_low, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_PKT_RATE_HIGH,
+ coal->pkt_rate_high, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_HIGH,
+ coal->rx_coalesce_usecs_high, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH,
+ coal->rx_max_coalesced_frames_high, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_HIGH,
+ coal->tx_coalesce_usecs_high, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH,
+ coal->tx_max_coalesced_frames_high, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL,
+ coal->rate_sample_interval, supported))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+const struct ethnl_request_ops ethnl_coalesce_request_ops = {
+ .request_cmd = ETHTOOL_MSG_COALESCE_GET,
+ .reply_cmd = ETHTOOL_MSG_COALESCE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_COALESCE_HEADER,
+ .max_attr = ETHTOOL_A_COALESCE_MAX,
+ .req_info_size = sizeof(struct coalesce_req_info),
+ .reply_data_size = sizeof(struct coalesce_reply_data),
+ .request_policy = coalesce_get_policy,
+
+ .prepare_data = coalesce_prepare_data,
+ .reply_size = coalesce_reply_size,
+ .fill_reply = coalesce_fill_reply,
+};
+
+/* COALESCE_SET */
+
+static const struct nla_policy
+coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1] = {
+ [ETHTOOL_A_COALESCE_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_COALESCE_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_COALESCE_RX_USECS] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_USECS_IRQ] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_USECS] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_USECS_IRQ] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_STATS_BLOCK_USECS] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX] = { .type = NLA_U8 },
+ [ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX] = { .type = NLA_U8 },
+ [ETHTOOL_A_COALESCE_PKT_RATE_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_USECS_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_USECS_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_PKT_RATE_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_USECS_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_USECS_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_U32 },
+};
+
+int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *tb[ETHTOOL_A_COALESCE_MAX + 1];
+ struct ethtool_coalesce coalesce = {};
+ struct ethnl_req_info req_info = {};
+ const struct ethtool_ops *ops;
+ struct net_device *dev;
+ u32 supported_params;
+ bool mod = false;
+ int ret;
+ u16 a;
+
+ ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb,
+ ETHTOOL_A_COALESCE_MAX, coalesce_set_policy,
+ info->extack);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_COALESCE_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+ dev = req_info.dev;
+ ops = dev->ethtool_ops;
+ ret = -EOPNOTSUPP;
+ if (!ops->get_coalesce || !ops->set_coalesce)
+ goto out_dev;
+
+ /* make sure that only supported parameters are present */
+ supported_params = ops->supported_coalesce_params;
+ for (a = ETHTOOL_A_COALESCE_RX_USECS; a < __ETHTOOL_A_COALESCE_CNT; a++)
+ if (tb[a] && !(supported_params & attr_to_mask(a))) {
+ ret = -EINVAL;
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[a],
+ "cannot modify an unsupported parameter");
+ goto out_dev;
+ }
+
+ rtnl_lock();
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_rtnl;
+ ret = ops->get_coalesce(dev, &coalesce);
+ if (ret < 0)
+ goto out_ops;
+
+ ethnl_update_u32(&coalesce.rx_coalesce_usecs,
+ tb[ETHTOOL_A_COALESCE_RX_USECS], &mod);
+ ethnl_update_u32(&coalesce.rx_max_coalesced_frames,
+ tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES], &mod);
+ ethnl_update_u32(&coalesce.rx_coalesce_usecs_irq,
+ tb[ETHTOOL_A_COALESCE_RX_USECS_IRQ], &mod);
+ ethnl_update_u32(&coalesce.rx_max_coalesced_frames_irq,
+ tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ], &mod);
+ ethnl_update_u32(&coalesce.tx_coalesce_usecs,
+ tb[ETHTOOL_A_COALESCE_TX_USECS], &mod);
+ ethnl_update_u32(&coalesce.tx_max_coalesced_frames,
+ tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES], &mod);
+ ethnl_update_u32(&coalesce.tx_coalesce_usecs_irq,
+ tb[ETHTOOL_A_COALESCE_TX_USECS_IRQ], &mod);
+ ethnl_update_u32(&coalesce.tx_max_coalesced_frames_irq,
+ tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ], &mod);
+ ethnl_update_u32(&coalesce.stats_block_coalesce_usecs,
+ tb[ETHTOOL_A_COALESCE_STATS_BLOCK_USECS], &mod);
+ ethnl_update_bool32(&coalesce.use_adaptive_rx_coalesce,
+ tb[ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX], &mod);
+ ethnl_update_bool32(&coalesce.use_adaptive_tx_coalesce,
+ tb[ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX], &mod);
+ ethnl_update_u32(&coalesce.pkt_rate_low,
+ tb[ETHTOOL_A_COALESCE_PKT_RATE_LOW], &mod);
+ ethnl_update_u32(&coalesce.rx_coalesce_usecs_low,
+ tb[ETHTOOL_A_COALESCE_RX_USECS_LOW], &mod);
+ ethnl_update_u32(&coalesce.rx_max_coalesced_frames_low,
+ tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW], &mod);
+ ethnl_update_u32(&coalesce.tx_coalesce_usecs_low,
+ tb[ETHTOOL_A_COALESCE_TX_USECS_LOW], &mod);
+ ethnl_update_u32(&coalesce.tx_max_coalesced_frames_low,
+ tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW], &mod);
+ ethnl_update_u32(&coalesce.pkt_rate_high,
+ tb[ETHTOOL_A_COALESCE_PKT_RATE_HIGH], &mod);
+ ethnl_update_u32(&coalesce.rx_coalesce_usecs_high,
+ tb[ETHTOOL_A_COALESCE_RX_USECS_HIGH], &mod);
+ ethnl_update_u32(&coalesce.rx_max_coalesced_frames_high,
+ tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH], &mod);
+ ethnl_update_u32(&coalesce.tx_coalesce_usecs_high,
+ tb[ETHTOOL_A_COALESCE_TX_USECS_HIGH], &mod);
+ ethnl_update_u32(&coalesce.tx_max_coalesced_frames_high,
+ tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH], &mod);
+ ethnl_update_u32(&coalesce.rate_sample_interval,
+ tb[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL], &mod);
+ ret = 0;
+ if (!mod)
+ goto out_ops;
+
+ ret = dev->ethtool_ops->set_coalesce(dev, &coalesce);
+ if (ret < 0)
+ goto out_ops;
+ ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL);
+
+out_ops:
+ ethnl_ops_complete(dev);
+out_rtnl:
+ rtnl_unlock();
+out_dev:
+ dev_put(dev);
+ return ret;
+}
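
The attr_to_mask() trick above lets one driver-provided bitmap (ethtool_ops::supported_coalesce_params) police both directions: GET suppresses zero-valued attributes the driver does not support, and SET rejects any attribute outside the mask. The correspondence holds because the maskable attribute types are consecutive, which the __CHECK_SUPPORTED_OFFSET() asserts prove at build time. A standalone sketch with invented values:

#include <assert.h>
#include <stdint.h>

/* Attribute types RX_USECS.. are consecutive, so subtracting the first
 * one yields the bit position of the matching ETHTOOL_COALESCE_* flag.
 * Values below are illustrative, not copied from the uapi headers.
 */
enum {
	DEMO_A_COALESCE_RX_USECS = 2,	/* first maskable attribute */
	DEMO_A_COALESCE_RX_MAX_FRAMES,
	DEMO_A_COALESCE_RX_USECS_IRQ,
};

#define DEMO_COALESCE_RX_USECS		(1u << 0)
#define DEMO_COALESCE_RX_MAX_FRAMES	(1u << 1)
#define DEMO_COALESCE_RX_USECS_IRQ	(1u << 2)

static uint32_t attr_to_mask(unsigned int attr_type)
{
	return 1u << (attr_type - DEMO_A_COALESCE_RX_USECS);
}

int main(void)
{
	/* the kernel's build-time checks prove this for every parameter;
	 * one instance is enough to show the offset trick
	 */
	assert(attr_to_mask(DEMO_A_COALESCE_RX_MAX_FRAMES) ==
	       DEMO_COALESCE_RX_MAX_FRAMES);
	assert(attr_to_mask(DEMO_A_COALESCE_RX_USECS_IRQ) ==
	       DEMO_COALESCE_RX_USECS_IRQ);
	return 0;
}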
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index dab047eec943..423e640e3876 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -1,5 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/net_tstamp.h>
+#include <linux/phy.h>
+
#include "common.h"
const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
@@ -60,6 +63,7 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
[NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload",
[NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
[NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
+ [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
};
const char
@@ -203,6 +207,53 @@ const char wol_mode_names[][ETH_GSTRING_LEN] = {
};
static_assert(ARRAY_SIZE(wol_mode_names) == WOL_MODE_COUNT);
+const char sof_timestamping_names[][ETH_GSTRING_LEN] = {
+ [const_ilog2(SOF_TIMESTAMPING_TX_HARDWARE)] = "hardware-transmit",
+ [const_ilog2(SOF_TIMESTAMPING_TX_SOFTWARE)] = "software-transmit",
+ [const_ilog2(SOF_TIMESTAMPING_RX_HARDWARE)] = "hardware-receive",
+ [const_ilog2(SOF_TIMESTAMPING_RX_SOFTWARE)] = "software-receive",
+ [const_ilog2(SOF_TIMESTAMPING_SOFTWARE)] = "software-system-clock",
+ [const_ilog2(SOF_TIMESTAMPING_SYS_HARDWARE)] = "hardware-legacy-clock",
+ [const_ilog2(SOF_TIMESTAMPING_RAW_HARDWARE)] = "hardware-raw-clock",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_ID)] = "option-id",
+ [const_ilog2(SOF_TIMESTAMPING_TX_SCHED)] = "sched-transmit",
+ [const_ilog2(SOF_TIMESTAMPING_TX_ACK)] = "ack-transmit",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_CMSG)] = "option-cmsg",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_TSONLY)] = "option-tsonly",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_STATS)] = "option-stats",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_PKTINFO)] = "option-pktinfo",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_TX_SWHW)] = "option-tx-swhw",
+};
+static_assert(ARRAY_SIZE(sof_timestamping_names) == __SOF_TIMESTAMPING_CNT);
+
+const char ts_tx_type_names[][ETH_GSTRING_LEN] = {
+ [HWTSTAMP_TX_OFF] = "off",
+ [HWTSTAMP_TX_ON] = "on",
+ [HWTSTAMP_TX_ONESTEP_SYNC] = "onestep-sync",
+ [HWTSTAMP_TX_ONESTEP_P2P] = "onestep-p2p",
+};
+static_assert(ARRAY_SIZE(ts_tx_type_names) == __HWTSTAMP_TX_CNT);
+
+const char ts_rx_filter_names[][ETH_GSTRING_LEN] = {
+ [HWTSTAMP_FILTER_NONE] = "none",
+ [HWTSTAMP_FILTER_ALL] = "all",
+ [HWTSTAMP_FILTER_SOME] = "some",
+ [HWTSTAMP_FILTER_PTP_V1_L4_EVENT] = "ptpv1-l4-event",
+ [HWTSTAMP_FILTER_PTP_V1_L4_SYNC] = "ptpv1-l4-sync",
+ [HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ] = "ptpv1-l4-delay-req",
+ [HWTSTAMP_FILTER_PTP_V2_L4_EVENT] = "ptpv2-l4-event",
+ [HWTSTAMP_FILTER_PTP_V2_L4_SYNC] = "ptpv2-l4-sync",
+ [HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ] = "ptpv2-l4-delay-req",
+ [HWTSTAMP_FILTER_PTP_V2_L2_EVENT] = "ptpv2-l2-event",
+ [HWTSTAMP_FILTER_PTP_V2_L2_SYNC] = "ptpv2-l2-sync",
+ [HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ] = "ptpv2-l2-delay-req",
+ [HWTSTAMP_FILTER_PTP_V2_EVENT] = "ptpv2-event",
+ [HWTSTAMP_FILTER_PTP_V2_SYNC] = "ptpv2-sync",
+ [HWTSTAMP_FILTER_PTP_V2_DELAY_REQ] = "ptpv2-delay-req",
+ [HWTSTAMP_FILTER_NTP_ALL] = "ntp-all",
+};
+static_assert(ARRAY_SIZE(ts_rx_filter_names) == __HWTSTAMP_FILTER_CNT);
+
/* return false if legacy contained non-0 deprecated fields
* maxtxpkt/maxrxpkt. rest of ksettings always updated
*/
@@ -300,3 +351,23 @@ int ethtool_check_ops(const struct ethtool_ops *ops)
*/
return 0;
}
+
+int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct phy_device *phydev = dev->phydev;
+
+ memset(info, 0, sizeof(*info));
+ info->cmd = ETHTOOL_GET_TS_INFO;
+
+ if (phy_has_tsinfo(phydev))
+ return phy_ts_info(phydev, info);
+ if (ops->get_ts_info)
+ return ops->get_ts_info(dev, info);
+
+ info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+ info->phc_index = -1;
+
+ return 0;
+}
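
The new string tables index each name by the bit position of its flag via const_ilog2(), and the static_assert() lines catch a table that falls out of sync when a flag is added. A self-contained sketch of the same pattern, with invented flags and a hand-rolled ilog2 wide enough for the demo:

#include <stdio.h>

/* Invented flags; demo_const_ilog2() is a constant-expression stand-in
 * for the kernel's const_ilog2(), sufficient for three bits.
 */
#define DEMO_FLAG_TX_HW	(1u << 0)
#define DEMO_FLAG_TX_SW	(1u << 1)
#define DEMO_FLAG_RX_HW	(1u << 2)
#define DEMO_FLAG_CNT	3

#define demo_const_ilog2(v) ((v) & 4 ? 2 : (v) & 2 ? 1 : 0)

static const char *demo_names[] = {
	[demo_const_ilog2(DEMO_FLAG_TX_HW)] = "hardware-transmit",
	[demo_const_ilog2(DEMO_FLAG_TX_SW)] = "software-transmit",
	[demo_const_ilog2(DEMO_FLAG_RX_HW)] = "hardware-receive",
};

/* fails to compile if a new flag is added without a name */
_Static_assert(sizeof(demo_names) / sizeof(demo_names[0]) == DEMO_FLAG_CNT,
	       "name table out of sync with flags");

int main(void)
{
	unsigned int flags = DEMO_FLAG_TX_SW | DEMO_FLAG_RX_HW;

	for (unsigned int i = 0; i < DEMO_FLAG_CNT; i++)
		if (flags & (1u << i))
			printf("%s\n", demo_names[i]);
	return 0;
}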
diff --git a/net/ethtool/common.h b/net/ethtool/common.h
index 03946e16e623..a62f68ccc43a 100644
--- a/net/ethtool/common.h
+++ b/net/ethtool/common.h
@@ -12,6 +12,8 @@
#define ETHTOOL_LINK_MODE(speed, type, duplex) \
ETHTOOL_LINK_MODE_ ## speed ## base ## type ## _ ## duplex ## _BIT
+#define __SOF_TIMESTAMPING_CNT (const_ilog2(SOF_TIMESTAMPING_LAST) + 1)
+
extern const char
netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN];
extern const char
@@ -23,6 +25,9 @@ phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN];
extern const char link_mode_names[][ETH_GSTRING_LEN];
extern const char netif_msg_class_names[][ETH_GSTRING_LEN];
extern const char wol_mode_names[][ETH_GSTRING_LEN];
+extern const char sof_timestamping_names[][ETH_GSTRING_LEN];
+extern const char ts_tx_type_names[][ETH_GSTRING_LEN];
+extern const char ts_rx_filter_names[][ETH_GSTRING_LEN];
int __ethtool_get_link(struct net_device *dev);
@@ -30,5 +35,6 @@ bool convert_legacy_settings_to_link_ksettings(
struct ethtool_link_ksettings *link_ksettings,
const struct ethtool_cmd *legacy_settings);
int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max);
+int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info);
#endif /* _ETHTOOL_COMMON_H */
diff --git a/net/ethtool/debug.c b/net/ethtool/debug.c
index 87f288ee20c8..1bd026a29f3f 100644
--- a/net/ethtool/debug.c
+++ b/net/ethtool/debug.c
@@ -109,8 +109,9 @@ int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info)
if (ret < 0)
return ret;
dev = req_info.dev;
+ ret = -EOPNOTSUPP;
if (!dev->ethtool_ops->get_msglevel || !dev->ethtool_ops->set_msglevel)
- return -EOPNOTSUPP;
+ goto out_dev;
rtnl_lock();
ret = ethnl_ops_begin(dev);
@@ -131,6 +132,7 @@ out_ops:
ethnl_ops_complete(dev);
out_rtnl:
rtnl_unlock();
+out_dev:
dev_put(dev);
return ret;
}
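
This debug.c hunk (and the identical ones in linkinfo.c, linkmodes.c, privflags.c and wol.c below) fixes a netdev reference leak: ethnl_parse_header_dev_get() returns with a reference held, so bailing out with a direct return skips dev_put(). A userspace analogue of the single-exit idiom being applied:

#include <stdio.h>
#include <stdlib.h>

/* Once a reference is taken (here: an allocation), every failure path
 * must travel through the label that releases it; an early return after
 * the reference was taken is exactly the leak "goto out_dev" closes.
 */
struct device { int refs; };

static struct device *get_device(void)
{
	struct device *d = calloc(1, sizeof(*d));

	if (d)
		d->refs = 1;
	return d;
}

static void put_device(struct device *d)
{
	if (d && --d->refs == 0)
		free(d);
}

static int do_set(int supported)
{
	struct device *dev = get_device();
	int ret;

	if (!dev)
		return -1;

	ret = -95;	/* -EOPNOTSUPP */
	if (!supported)
		goto out_dev;	/* NOT "return": dev is still referenced */

	ret = 0;	/* ... do the work ... */
out_dev:
	put_device(dev);
	return ret;
}

int main(void)
{
	printf("%d %d\n", do_set(0), do_set(1));
	return 0;
}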
diff --git a/net/ethtool/eee.c b/net/ethtool/eee.c
new file mode 100644
index 000000000000..94aa19cff22f
--- /dev/null
+++ b/net/ethtool/eee.c
@@ -0,0 +1,206 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+#define EEE_MODES_COUNT \
+ (sizeof_field(struct ethtool_eee, supported) * BITS_PER_BYTE)
+
+struct eee_req_info {
+ struct ethnl_req_info base;
+};
+
+struct eee_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_eee eee;
+};
+
+#define EEE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct eee_reply_data, base)
+
+static const struct nla_policy
+eee_get_policy[ETHTOOL_A_EEE_MAX + 1] = {
+ [ETHTOOL_A_EEE_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_EEE_MODES_OURS] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_MODES_PEER] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_ACTIVE] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_ENABLED] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_TX_LPI_ENABLED] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_TX_LPI_TIMER] = { .type = NLA_REJECT },
+};
+
+static int eee_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ struct genl_info *info)
+{
+ struct eee_reply_data *data = EEE_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ if (!dev->ethtool_ops->get_eee)
+ return -EOPNOTSUPP;
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ ret = dev->ethtool_ops->get_eee(dev, &data->eee);
+ ethnl_ops_complete(dev);
+
+ return ret;
+}
+
+static int eee_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct eee_reply_data *data = EEE_REPDATA(reply_base);
+ const struct ethtool_eee *eee = &data->eee;
+ int len = 0;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(eee->advertised) * BITS_PER_BYTE !=
+ EEE_MODES_COUNT);
+ BUILD_BUG_ON(sizeof(eee->lp_advertised) * BITS_PER_BYTE !=
+ EEE_MODES_COUNT);
+
+ /* MODES_OURS */
+ ret = ethnl_bitset32_size(&eee->advertised, &eee->supported,
+ EEE_MODES_COUNT, link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+ /* MODES_PEER */
+ ret = ethnl_bitset32_size(&eee->lp_advertised, NULL,
+ EEE_MODES_COUNT, link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret;
+
+ len += nla_total_size(sizeof(u8)) + /* _EEE_ACTIVE */
+ nla_total_size(sizeof(u8)) + /* _EEE_ENABLED */
+ nla_total_size(sizeof(u8)) + /* _EEE_TX_LPI_ENABLED */
+ nla_total_size(sizeof(u32)); /* _EEE_TX_LPI_TIMER */
+
+ return len;
+}
+
+static int eee_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct eee_reply_data *data = EEE_REPDATA(reply_base);
+ const struct ethtool_eee *eee = &data->eee;
+ int ret;
+
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_EEE_MODES_OURS,
+ &eee->advertised, &eee->supported,
+ EEE_MODES_COUNT, link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_EEE_MODES_PEER,
+ &eee->lp_advertised, NULL, EEE_MODES_COUNT,
+ link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+
+ if (nla_put_u8(skb, ETHTOOL_A_EEE_ACTIVE, !!eee->eee_active) ||
+ nla_put_u8(skb, ETHTOOL_A_EEE_ENABLED, !!eee->eee_enabled) ||
+ nla_put_u8(skb, ETHTOOL_A_EEE_TX_LPI_ENABLED,
+ !!eee->tx_lpi_enabled) ||
+ nla_put_u32(skb, ETHTOOL_A_EEE_TX_LPI_TIMER, eee->tx_lpi_timer))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+const struct ethnl_request_ops ethnl_eee_request_ops = {
+ .request_cmd = ETHTOOL_MSG_EEE_GET,
+ .reply_cmd = ETHTOOL_MSG_EEE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_EEE_HEADER,
+ .max_attr = ETHTOOL_A_EEE_MAX,
+ .req_info_size = sizeof(struct eee_req_info),
+ .reply_data_size = sizeof(struct eee_reply_data),
+ .request_policy = eee_get_policy,
+
+ .prepare_data = eee_prepare_data,
+ .reply_size = eee_reply_size,
+ .fill_reply = eee_fill_reply,
+};
+
+/* EEE_SET */
+
+static const struct nla_policy
+eee_set_policy[ETHTOOL_A_EEE_MAX + 1] = {
+ [ETHTOOL_A_EEE_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_EEE_MODES_OURS] = { .type = NLA_NESTED },
+ [ETHTOOL_A_EEE_MODES_PEER] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_ACTIVE] = { .type = NLA_REJECT },
+ [ETHTOOL_A_EEE_ENABLED] = { .type = NLA_U8 },
+ [ETHTOOL_A_EEE_TX_LPI_ENABLED] = { .type = NLA_U8 },
+ [ETHTOOL_A_EEE_TX_LPI_TIMER] = { .type = NLA_U32 },
+};
+
+int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *tb[ETHTOOL_A_EEE_MAX + 1];
+ struct ethtool_eee eee = {};
+ struct ethnl_req_info req_info = {};
+ const struct ethtool_ops *ops;
+ struct net_device *dev;
+ bool mod = false;
+ int ret;
+
+ ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_EEE_MAX,
+ eee_set_policy, info->extack);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_EEE_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+ dev = req_info.dev;
+ ops = dev->ethtool_ops;
+ ret = -EOPNOTSUPP;
+ if (!ops->get_eee || !ops->set_eee)
+ goto out_dev;
+
+ rtnl_lock();
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_rtnl;
+ ret = ops->get_eee(dev, &eee);
+ if (ret < 0)
+ goto out_ops;
+
+ ret = ethnl_update_bitset32(&eee.advertised, EEE_MODES_COUNT,
+ tb[ETHTOOL_A_EEE_MODES_OURS],
+ link_mode_names, info->extack, &mod);
+ if (ret < 0)
+ goto out_ops;
+ ethnl_update_bool32(&eee.eee_enabled, tb[ETHTOOL_A_EEE_ENABLED], &mod);
+ ethnl_update_bool32(&eee.tx_lpi_enabled,
+ tb[ETHTOOL_A_EEE_TX_LPI_ENABLED], &mod);
+ ethnl_update_u32(&eee.tx_lpi_timer, tb[ETHTOOL_A_EEE_TX_LPI_TIMER],
+ &mod);
+ ret = 0;
+ if (!mod)
+ goto out_ops;
+
+ ret = dev->ethtool_ops->set_eee(dev, &eee);
+ if (ret < 0)
+ goto out_ops;
+ ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF, NULL);
+
+out_ops:
+ ethnl_ops_complete(dev);
+out_rtnl:
+ rtnl_unlock();
+out_dev:
+ dev_put(dev);
+ return ret;
+}
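
ethnl_set_eee() relies on the ethnl_update_*() helpers' contract: an absent attribute leaves the field untouched, a present-but-equal value changes nothing, and only a real change raises mod, which gates both the ->set_eee() call and the EEE_NTF notification. (Note tx_lpi_timer is a u32 microsecond value with an NLA_U32 policy, hence ethnl_update_u32() rather than the bool helper.) The contract, paraphrased in standalone form with a plain pointer standing in for struct nlattr:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static void demo_update_u32(uint32_t *dst, const uint32_t *attr, bool *mod)
{
	if (!attr || *dst == *attr)
		return;		/* absent or unchanged: leave mod alone */
	*dst = *attr;
	*mod = true;
}

int main(void)
{
	uint32_t timer = 100, same = 100, newval = 250;
	bool mod = false;

	demo_update_u32(&timer, NULL, &mod);	/* attribute absent */
	demo_update_u32(&timer, &same, &mod);	/* present, unchanged */
	assert(!mod);				/* no ->set_eee() call */

	demo_update_u32(&timer, &newval, &mod);
	assert(mod && timer == 250);		/* driver will be called */
	return 0;
}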
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 10d929abdf6a..89d0b1827aaf 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1354,6 +1354,7 @@ static int ethtool_get_eee(struct net_device *dev, char __user *useraddr)
static int ethtool_set_eee(struct net_device *dev, char __user *useraddr)
{
struct ethtool_eee edata;
+ int ret;
if (!dev->ethtool_ops->set_eee)
return -EOPNOTSUPP;
@@ -1361,7 +1362,10 @@ static int ethtool_set_eee(struct net_device *dev, char __user *useraddr)
if (copy_from_user(&edata, useraddr, sizeof(edata)))
return -EFAULT;
- return dev->ethtool_ops->set_eee(dev, &edata);
+ ret = dev->ethtool_ops->set_eee(dev, &edata);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF, NULL);
+ return ret;
}
static int ethtool_nway_reset(struct net_device *dev)
@@ -1571,6 +1575,7 @@ static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev,
void __user *useraddr)
{
struct ethtool_coalesce coalesce;
+ int ret;
if (!dev->ethtool_ops->set_coalesce)
return -EOPNOTSUPP;
@@ -1581,7 +1586,10 @@ static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev,
if (!ethtool_set_coalesce_supported(dev, &coalesce))
return -EOPNOTSUPP;
- return dev->ethtool_ops->set_coalesce(dev, &coalesce);
+ ret = dev->ethtool_ops->set_coalesce(dev, &coalesce);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL);
+ return ret;
}
static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
@@ -1701,6 +1709,7 @@ static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)
static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
{
struct ethtool_pauseparam pauseparam;
+ int ret;
if (!dev->ethtool_ops->set_pauseparam)
return -EOPNOTSUPP;
@@ -1708,7 +1717,10 @@ static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam)))
return -EFAULT;
- return dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
+ ret = dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF, NULL);
+ return ret;
}
static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
@@ -2128,32 +2140,17 @@ out:
static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr)
{
- int err = 0;
struct ethtool_ts_info info;
- const struct ethtool_ops *ops = dev->ethtool_ops;
- struct phy_device *phydev = dev->phydev;
-
- memset(&info, 0, sizeof(info));
- info.cmd = ETHTOOL_GET_TS_INFO;
-
- if (phy_has_tsinfo(phydev)) {
- err = phy_ts_info(phydev, &info);
- } else if (ops->get_ts_info) {
- err = ops->get_ts_info(dev, &info);
- } else {
- info.so_timestamping =
- SOF_TIMESTAMPING_RX_SOFTWARE |
- SOF_TIMESTAMPING_SOFTWARE;
- info.phc_index = -1;
- }
+ int err;
+ err = __ethtool_get_ts_info(dev, &info);
if (err)
return err;
if (copy_to_user(useraddr, &info, sizeof(info)))
- err = -EFAULT;
+ return -EFAULT;
- return err;
+ return 0;
}
static int __ethtool_get_module_info(struct net_device *dev,
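
The refactored ioctl path above still serves the classic ETHTOOL_GET_TS_INFO query, now routed through the shared __ethtool_get_ts_info(). For reference, the userspace side looks like this ("eth0" is a placeholder interface name):

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
	struct ethtool_ts_info info = { .cmd = ETHTOOL_GET_TS_INFO };
	struct ifreq ifr = {0};
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);	/* placeholder */
	ifr.ifr_data = (char *)&info;
	if (fd < 0 || ioctl(fd, SIOCETHTOOL, &ifr) < 0) {
		perror("ETHTOOL_GET_TS_INFO");
		return 1;
	}
	printf("so_timestamping %#x phc_index %d\n",
	       info.so_timestamping, info.phc_index);
	return 0;
}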
diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c
index 2df420068cbb..677068deb68c 100644
--- a/net/ethtool/linkinfo.c
+++ b/net/ethtool/linkinfo.c
@@ -128,9 +128,10 @@ int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info)
if (ret < 0)
return ret;
dev = req_info.dev;
+ ret = -EOPNOTSUPP;
if (!dev->ethtool_ops->get_link_ksettings ||
!dev->ethtool_ops->set_link_ksettings)
- return -EOPNOTSUPP;
+ goto out_dev;
rtnl_lock();
ret = ethnl_ops_begin(dev);
@@ -164,6 +165,7 @@ out_ops:
ethnl_ops_complete(dev);
out_rtnl:
rtnl_unlock();
+out_dev:
dev_put(dev);
return ret;
}
diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c
index cb29cc8c5960..452608c6d856 100644
--- a/net/ethtool/linkmodes.c
+++ b/net/ethtool/linkmodes.c
@@ -341,9 +341,10 @@ int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info)
if (ret < 0)
return ret;
dev = req_info.dev;
+ ret = -EOPNOTSUPP;
if (!dev->ethtool_ops->get_link_ksettings ||
!dev->ethtool_ops->set_link_ksettings)
- return -EOPNOTSUPP;
+ goto out_dev;
rtnl_lock();
ret = ethnl_ops_begin(dev);
@@ -373,6 +374,7 @@ out_ops:
ethnl_ops_complete(dev);
out_rtnl:
rtnl_unlock();
+out_dev:
dev_put(dev);
return ret;
}
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index 55c8ce4019d9..0c772318c023 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -40,6 +40,7 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
struct nlattr *tb[ETHTOOL_A_HEADER_MAX + 1];
const struct nlattr *devname_attr;
struct net_device *dev = NULL;
+ u32 flags = 0;
int ret;
if (!header) {
@@ -50,8 +51,17 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
ethnl_header_policy, extack);
if (ret < 0)
return ret;
- devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME];
+ if (tb[ETHTOOL_A_HEADER_FLAGS]) {
+ flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]);
+ if (flags & ~ETHTOOL_FLAG_ALL) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_HEADER_FLAGS],
+ "unrecognized request flags");
+ nl_set_extack_cookie_u32(extack, ETHTOOL_FLAG_ALL);
+ return -EOPNOTSUPP;
+ }
+ }
+ devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME];
if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) {
u32 ifindex = nla_get_u32(tb[ETHTOOL_A_HEADER_DEV_INDEX]);
@@ -90,9 +100,7 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
}
req_info->dev = dev;
- if (tb[ETHTOOL_A_HEADER_FLAGS])
- req_info->flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]);
-
+ req_info->flags = flags;
return 0;
}
@@ -219,6 +227,10 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
[ETHTOOL_MSG_PRIVFLAGS_GET] = &ethnl_privflags_request_ops,
[ETHTOOL_MSG_RINGS_GET] = &ethnl_rings_request_ops,
[ETHTOOL_MSG_CHANNELS_GET] = &ethnl_channels_request_ops,
+ [ETHTOOL_MSG_COALESCE_GET] = &ethnl_coalesce_request_ops,
+ [ETHTOOL_MSG_PAUSE_GET] = &ethnl_pause_request_ops,
+ [ETHTOOL_MSG_EEE_GET] = &ethnl_eee_request_ops,
+ [ETHTOOL_MSG_TSINFO_GET] = &ethnl_tsinfo_request_ops,
};
static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
@@ -535,6 +547,9 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = {
[ETHTOOL_MSG_PRIVFLAGS_NTF] = &ethnl_privflags_request_ops,
[ETHTOOL_MSG_RINGS_NTF] = &ethnl_rings_request_ops,
[ETHTOOL_MSG_CHANNELS_NTF] = &ethnl_channels_request_ops,
+ [ETHTOOL_MSG_COALESCE_NTF] = &ethnl_coalesce_request_ops,
+ [ETHTOOL_MSG_PAUSE_NTF] = &ethnl_pause_request_ops,
+ [ETHTOOL_MSG_EEE_NTF] = &ethnl_eee_request_ops,
};
/* default notification handler */
@@ -624,6 +639,9 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = {
[ETHTOOL_MSG_PRIVFLAGS_NTF] = ethnl_default_notify,
[ETHTOOL_MSG_RINGS_NTF] = ethnl_default_notify,
[ETHTOOL_MSG_CHANNELS_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_COALESCE_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_PAUSE_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify,
};
void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data)
@@ -778,6 +796,49 @@ static const struct genl_ops ethtool_genl_ops[] = {
.flags = GENL_UNS_ADMIN_PERM,
.doit = ethnl_set_channels,
},
+ {
+ .cmd = ETHTOOL_MSG_COALESCE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
+ {
+ .cmd = ETHTOOL_MSG_COALESCE_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_set_coalesce,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PAUSE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PAUSE_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_set_pause,
+ },
+ {
+ .cmd = ETHTOOL_MSG_EEE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
+ {
+ .cmd = ETHTOOL_MSG_EEE_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_set_eee,
+ },
+ {
+ .cmd = ETHTOOL_MSG_TSINFO_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ },
};
static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
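
All the *_NTF commands registered above are broadcast on the ethtool family's monitor multicast group, so netlink SET requests and (after the ioctl.c change above) legacy ioctl setters become observable events. A minimal libnl-3 listener, offered as a sketch (link with -lnl-3 -lnl-genl-3; family "ethtool" and group "monitor" per the uapi names):

#include <stdio.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>

static int on_notification(struct nl_msg *msg, void *arg)
{
	struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg));

	printf("ethtool notification, cmd %u\n", gnlh->cmd);
	return NL_OK;
}

int main(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	int grp;

	if (!sk || genl_connect(sk))
		return 1;
	grp = genl_ctrl_resolve_grp(sk, "ethtool", "monitor");
	if (grp < 0 || nl_socket_add_membership(sk, grp))
		return 1;
	nl_socket_disable_seq_check(sk);  /* notifications are unsolicited */
	nl_socket_modify_cb(sk, NL_CB_VALID, NL_CB_CUSTOM,
			    on_notification, NULL);
	for (;;)
		nl_recvmsgs_default(sk);
}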
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 45aad99a6021..81b8fa020bcb 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -341,6 +341,10 @@ extern const struct ethnl_request_ops ethnl_features_request_ops;
extern const struct ethnl_request_ops ethnl_privflags_request_ops;
extern const struct ethnl_request_ops ethnl_rings_request_ops;
extern const struct ethnl_request_ops ethnl_channels_request_ops;
+extern const struct ethnl_request_ops ethnl_coalesce_request_ops;
+extern const struct ethnl_request_ops ethnl_pause_request_ops;
+extern const struct ethnl_request_ops ethnl_eee_request_ops;
+extern const struct ethnl_request_ops ethnl_tsinfo_request_ops;
int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info);
@@ -350,5 +354,8 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info);
int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info);
+int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info);
+int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info);
+int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info);
#endif /* _NET_ETHTOOL_NETLINK_H */
diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c
new file mode 100644
index 000000000000..7aea35d1e8a5
--- /dev/null
+++ b/net/ethtool/pause.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+
+struct pause_req_info {
+ struct ethnl_req_info base;
+};
+
+struct pause_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_pauseparam pauseparam;
+};
+
+#define PAUSE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct pause_reply_data, base)
+
+static const struct nla_policy
+pause_get_policy[ETHTOOL_A_PAUSE_MAX + 1] = {
+ [ETHTOOL_A_PAUSE_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_PAUSE_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_REJECT },
+ [ETHTOOL_A_PAUSE_RX] = { .type = NLA_REJECT },
+ [ETHTOOL_A_PAUSE_TX] = { .type = NLA_REJECT },
+};
+
+static int pause_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ struct genl_info *info)
+{
+ struct pause_reply_data *data = PAUSE_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ if (!dev->ethtool_ops->get_pauseparam)
+ return -EOPNOTSUPP;
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam);
+ ethnl_ops_complete(dev);
+
+ return 0;
+}
+
+static int pause_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ return nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */
+ nla_total_size(sizeof(u8)) + /* _PAUSE_RX */
+ nla_total_size(sizeof(u8)); /* _PAUSE_TX */
+}
+
+static int pause_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct pause_reply_data *data = PAUSE_REPDATA(reply_base);
+ const struct ethtool_pauseparam *pauseparam = &data->pauseparam;
+
+ if (nla_put_u8(skb, ETHTOOL_A_PAUSE_AUTONEG, !!pauseparam->autoneg) ||
+ nla_put_u8(skb, ETHTOOL_A_PAUSE_RX, !!pauseparam->rx_pause) ||
+ nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pauseparam->tx_pause))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+const struct ethnl_request_ops ethnl_pause_request_ops = {
+ .request_cmd = ETHTOOL_MSG_PAUSE_GET,
+ .reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_PAUSE_HEADER,
+ .max_attr = ETHTOOL_A_PAUSE_MAX,
+ .req_info_size = sizeof(struct pause_req_info),
+ .reply_data_size = sizeof(struct pause_reply_data),
+ .request_policy = pause_get_policy,
+
+ .prepare_data = pause_prepare_data,
+ .reply_size = pause_reply_size,
+ .fill_reply = pause_fill_reply,
+};
+
+/* PAUSE_SET */
+
+static const struct nla_policy
+pause_set_policy[ETHTOOL_A_PAUSE_MAX + 1] = {
+ [ETHTOOL_A_PAUSE_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_PAUSE_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 },
+ [ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 },
+ [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 },
+};
+
+int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *tb[ETHTOOL_A_PAUSE_MAX + 1];
+ struct ethtool_pauseparam params = {};
+ struct ethnl_req_info req_info = {};
+ const struct ethtool_ops *ops;
+ struct net_device *dev;
+ bool mod = false;
+ int ret;
+
+ ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, ETHTOOL_A_PAUSE_MAX,
+ pause_set_policy, info->extack);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_PAUSE_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+ dev = req_info.dev;
+ ops = dev->ethtool_ops;
+ ret = -EOPNOTSUPP;
+ if (!ops->get_pauseparam || !ops->set_pauseparam)
+ goto out_dev;
+
+ rtnl_lock();
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_rtnl;
+ ops->get_pauseparam(dev, &params);
+
+ ethnl_update_bool32(&params.autoneg, tb[ETHTOOL_A_PAUSE_AUTONEG], &mod);
+ ethnl_update_bool32(&params.rx_pause, tb[ETHTOOL_A_PAUSE_RX], &mod);
+ ethnl_update_bool32(&params.tx_pause, tb[ETHTOOL_A_PAUSE_TX], &mod);
+ ret = 0;
+ if (!mod)
+ goto out_ops;
+
+ ret = dev->ethtool_ops->set_pauseparam(dev, &params);
+ if (ret < 0)
+ goto out_ops;
+ ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF, NULL);
+
+out_ops:
+ ethnl_ops_complete(dev);
+out_rtnl:
+ rtnl_unlock();
+out_dev:
+ dev_put(dev);
+ return ret;
+}
diff --git a/net/ethtool/privflags.c b/net/ethtool/privflags.c
index e8f03b33db9b..77447dceb109 100644
--- a/net/ethtool/privflags.c
+++ b/net/ethtool/privflags.c
@@ -175,9 +175,10 @@ int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info)
return ret;
dev = req_info.dev;
ops = dev->ethtool_ops;
+ ret = -EOPNOTSUPP;
if (!ops->get_priv_flags || !ops->set_priv_flags ||
!ops->get_sset_count || !ops->get_strings)
- return -EOPNOTSUPP;
+ goto out_dev;
rtnl_lock();
ret = ethnl_ops_begin(dev);
@@ -204,6 +205,7 @@ out_ops:
ethnl_ops_complete(dev);
out_rtnl:
rtnl_unlock();
+out_dev:
dev_put(dev);
return ret;
}
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
index 8e5911887b4c..95eae5c68a52 100644
--- a/net/ethtool/strset.c
+++ b/net/ethtool/strset.c
@@ -60,6 +60,21 @@ static const struct strset_info info_template[] = {
.count = WOL_MODE_COUNT,
.strings = wol_mode_names,
},
+ [ETH_SS_SOF_TIMESTAMPING] = {
+ .per_dev = false,
+ .count = __SOF_TIMESTAMPING_CNT,
+ .strings = sof_timestamping_names,
+ },
+ [ETH_SS_TS_TX_TYPES] = {
+ .per_dev = false,
+ .count = __HWTSTAMP_TX_CNT,
+ .strings = ts_tx_type_names,
+ },
+ [ETH_SS_TS_RX_FILTERS] = {
+ .per_dev = false,
+ .count = __HWTSTAMP_FILTER_CNT,
+ .strings = ts_rx_filter_names,
+ },
};
struct strset_req_info {
diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c
new file mode 100644
index 000000000000..7cb5b512b77c
--- /dev/null
+++ b/net/ethtool/tsinfo.c
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/net_tstamp.h>
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct tsinfo_req_info {
+ struct ethnl_req_info base;
+};
+
+struct tsinfo_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_ts_info ts_info;
+};
+
+#define TSINFO_REPDATA(__reply_base) \
+ container_of(__reply_base, struct tsinfo_reply_data, base)
+
+static const struct nla_policy
+tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1] = {
+ [ETHTOOL_A_TSINFO_UNSPEC] = { .type = NLA_REJECT },
+ [ETHTOOL_A_TSINFO_HEADER] = { .type = NLA_NESTED },
+ [ETHTOOL_A_TSINFO_TIMESTAMPING] = { .type = NLA_REJECT },
+ [ETHTOOL_A_TSINFO_TX_TYPES] = { .type = NLA_REJECT },
+ [ETHTOOL_A_TSINFO_RX_FILTERS] = { .type = NLA_REJECT },
+ [ETHTOOL_A_TSINFO_PHC_INDEX] = { .type = NLA_REJECT },
+};
+
+static int tsinfo_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ struct genl_info *info)
+{
+ struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ ret = __ethtool_get_ts_info(dev, &data->ts_info);
+ ethnl_ops_complete(dev);
+
+ return ret;
+}
+
+static int tsinfo_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct ethtool_ts_info *ts_info = &data->ts_info;
+ int len = 0;
+ int ret;
+
+ BUILD_BUG_ON(__SOF_TIMESTAMPING_CNT > 32);
+ BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32);
+ BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32);
+
+ if (ts_info->so_timestamping) {
+ ret = ethnl_bitset32_size(&ts_info->so_timestamping, NULL,
+ __SOF_TIMESTAMPING_CNT,
+ sof_timestamping_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSINFO_TIMESTAMPING */
+ }
+ if (ts_info->tx_types) {
+ ret = ethnl_bitset32_size(&ts_info->tx_types, NULL,
+ __HWTSTAMP_TX_CNT,
+ ts_tx_type_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSINFO_TX_TYPES */
+ }
+ if (ts_info->rx_filters) {
+ ret = ethnl_bitset32_size(&ts_info->rx_filters, NULL,
+ __HWTSTAMP_FILTER_CNT,
+ ts_rx_filter_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSINFO_RX_FILTERS */
+ }
+ if (ts_info->phc_index >= 0)
+ len += nla_total_size(sizeof(u32)); /* _TSINFO_PHC_INDEX */
+
+ return len;
+}
+
+static int tsinfo_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct ethtool_ts_info *ts_info = &data->ts_info;
+ int ret;
+
+ if (ts_info->so_timestamping) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TIMESTAMPING,
+ &ts_info->so_timestamping, NULL,
+ __SOF_TIMESTAMPING_CNT,
+ sof_timestamping_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+ if (ts_info->tx_types) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TX_TYPES,
+ &ts_info->tx_types, NULL,
+ __HWTSTAMP_TX_CNT,
+ ts_tx_type_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+ if (ts_info->rx_filters) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_RX_FILTERS,
+ &ts_info->rx_filters, NULL,
+ __HWTSTAMP_FILTER_CNT,
+ ts_rx_filter_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+ if (ts_info->phc_index >= 0 &&
+ nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX, ts_info->phc_index))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+const struct ethnl_request_ops ethnl_tsinfo_request_ops = {
+ .request_cmd = ETHTOOL_MSG_TSINFO_GET,
+ .reply_cmd = ETHTOOL_MSG_TSINFO_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_TSINFO_HEADER,
+ .max_attr = ETHTOOL_A_TSINFO_MAX,
+ .req_info_size = sizeof(struct tsinfo_req_info),
+ .reply_data_size = sizeof(struct tsinfo_reply_data),
+ .request_policy = tsinfo_get_policy,
+
+ .prepare_data = tsinfo_prepare_data,
+ .reply_size = tsinfo_reply_size,
+ .fill_reply = tsinfo_fill_reply,
+};
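
tsinfo_reply_size() and tsinfo_fill_reply() deliberately mirror each other branch for branch: an empty bitset or a negative phc_index is neither counted nor emitted, so the reply buffer is sized exactly. The shape of that pattern, reduced to a runnable toy:

#include <assert.h>
#include <stdint.h>
#include <string.h>

struct demo_info { uint32_t tx_types; int32_t phc_index; };

static size_t demo_size(const struct demo_info *i)
{
	size_t len = 0;

	if (i->tx_types)
		len += sizeof(uint32_t);
	if (i->phc_index >= 0)
		len += sizeof(uint32_t);
	return len;
}

static size_t demo_fill(const struct demo_info *i, uint8_t *buf)
{
	uint8_t *p = buf;

	/* same predicates as demo_size(), so the result always fits */
	if (i->tx_types) {
		memcpy(p, &i->tx_types, sizeof(uint32_t));
		p += sizeof(uint32_t);
	}
	if (i->phc_index >= 0) {
		memcpy(p, &i->phc_index, sizeof(uint32_t));
		p += sizeof(uint32_t);
	}
	return (size_t)(p - buf);
}

int main(void)
{
	struct demo_info info = { .tx_types = 0x3, .phc_index = -1 };
	uint8_t buf[8];

	assert(demo_fill(&info, buf) == demo_size(&info));	/* 4, not 8 */
	return 0;
}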
diff --git a/net/ethtool/wol.c b/net/ethtool/wol.c
index 1d2bcabee554..1798421e9f1c 100644
--- a/net/ethtool/wol.c
+++ b/net/ethtool/wol.c
@@ -129,8 +129,9 @@ int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info)
if (ret < 0)
return ret;
dev = req_info.dev;
+ ret = -EOPNOTSUPP;
if (!dev->ethtool_ops->get_wol || !dev->ethtool_ops->set_wol)
- return -EOPNOTSUPP;
+ goto out_dev;
rtnl_lock();
ret = ethnl_ops_begin(dev);
@@ -173,6 +174,7 @@ out_ops:
ethnl_ops_complete(dev);
out_rtnl:
rtnl_unlock();
+out_dev:
dev_put(dev);
return ret;
}
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index d46d22c7105c..03b891904314 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -483,12 +483,9 @@ int hsr_get_node_data(struct hsr_priv *hsr,
struct hsr_port *port;
unsigned long tdiff;
- rcu_read_lock();
node = find_node_by_addr_A(&hsr->node_db, addr);
- if (!node) {
- rcu_read_unlock();
- return -ENOENT; /* No such entry */
- }
+ if (!node)
+ return -ENOENT;
ether_addr_copy(addr_b, node->macaddress_B);
@@ -523,7 +520,5 @@ int hsr_get_node_data(struct hsr_priv *hsr,
*addr_b_ifindex = -1;
}
- rcu_read_unlock();
-
return 0;
}
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index 64d39c1e93a2..5465a395da04 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -250,15 +250,16 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info)
if (!na)
goto invalid;
- hsr_dev = __dev_get_by_index(genl_info_net(info),
- nla_get_u32(info->attrs[HSR_A_IFINDEX]));
+ rcu_read_lock();
+ hsr_dev = dev_get_by_index_rcu(genl_info_net(info),
+ nla_get_u32(info->attrs[HSR_A_IFINDEX]));
if (!hsr_dev)
- goto invalid;
+ goto rcu_unlock;
if (!is_hsr_master(hsr_dev))
- goto invalid;
+ goto rcu_unlock;
/* Send reply */
- skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb_out) {
res = -ENOMEM;
goto fail;
@@ -312,12 +313,10 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info)
res = nla_put_u16(skb_out, HSR_A_IF1_SEQ, hsr_node_if1_seq);
if (res < 0)
goto nla_put_failure;
- rcu_read_lock();
port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
if (port)
res = nla_put_u32(skb_out, HSR_A_IF1_IFINDEX,
port->dev->ifindex);
- rcu_read_unlock();
if (res < 0)
goto nla_put_failure;
@@ -327,20 +326,22 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info)
res = nla_put_u16(skb_out, HSR_A_IF2_SEQ, hsr_node_if2_seq);
if (res < 0)
goto nla_put_failure;
- rcu_read_lock();
port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
if (port)
res = nla_put_u32(skb_out, HSR_A_IF2_IFINDEX,
port->dev->ifindex);
- rcu_read_unlock();
if (res < 0)
goto nla_put_failure;
+ rcu_read_unlock();
+
genlmsg_end(skb_out, msg_head);
genlmsg_unicast(genl_info_net(info), skb_out, info->snd_portid);
return 0;
+rcu_unlock:
+ rcu_read_unlock();
invalid:
netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL);
return 0;
@@ -350,6 +351,7 @@ nla_put_failure:
/* Fall through */
fail:
+ rcu_read_unlock();
return res;
}
@@ -357,16 +359,14 @@ fail:
*/
static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info)
{
- /* For receiving */
- struct nlattr *na;
+ unsigned char addr[ETH_ALEN];
struct net_device *hsr_dev;
-
- /* For sending */
struct sk_buff *skb_out;
- void *msg_head;
struct hsr_priv *hsr;
- void *pos;
- unsigned char addr[ETH_ALEN];
+ bool restart = false;
+ struct nlattr *na;
+ void *pos = NULL;
+ void *msg_head;
int res;
if (!info)
@@ -376,15 +376,17 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info)
if (!na)
goto invalid;
- hsr_dev = __dev_get_by_index(genl_info_net(info),
- nla_get_u32(info->attrs[HSR_A_IFINDEX]));
+ rcu_read_lock();
+ hsr_dev = dev_get_by_index_rcu(genl_info_net(info),
+ nla_get_u32(info->attrs[HSR_A_IFINDEX]));
if (!hsr_dev)
- goto invalid;
+ goto rcu_unlock;
if (!is_hsr_master(hsr_dev))
- goto invalid;
+ goto rcu_unlock;
+restart:
/* Send reply */
- skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb_out = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_ATOMIC);
if (!skb_out) {
res = -ENOMEM;
goto fail;
@@ -398,18 +400,26 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info)
goto nla_put_failure;
}
- res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex);
- if (res < 0)
- goto nla_put_failure;
+ if (!restart) {
+ res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+ }
hsr = netdev_priv(hsr_dev);
- rcu_read_lock();
- pos = hsr_get_next_node(hsr, NULL, addr);
+ if (!pos)
+ pos = hsr_get_next_node(hsr, NULL, addr);
while (pos) {
res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN, addr);
if (res < 0) {
- rcu_read_unlock();
+ if (res == -EMSGSIZE) {
+ genlmsg_end(skb_out, msg_head);
+ genlmsg_unicast(genl_info_net(info), skb_out,
+ info->snd_portid);
+ restart = true;
+ goto restart;
+ }
goto nla_put_failure;
}
pos = hsr_get_next_node(hsr, pos, addr);
@@ -421,15 +431,18 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info)
return 0;
+rcu_unlock:
+ rcu_read_unlock();
invalid:
netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL);
return 0;
nla_put_failure:
- kfree_skb(skb_out);
+ nlmsg_free(skb_out);
/* Fall through */
fail:
+ rcu_read_unlock();
return res;
}
@@ -456,6 +469,7 @@ static struct genl_family hsr_genl_family __ro_after_init = {
.version = 1,
.maxattr = HSR_A_MAX,
.policy = hsr_genl_policy,
+ .netnsok = true,
.module = THIS_MODULE,
.ops = hsr_ops,
.n_ops = ARRAY_SIZE(hsr_ops),
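
The restart logic added to hsr_get_node_list() turns a single over-large reply into a sequence of unicast messages: on -EMSGSIZE the partial message is finalized and sent, and filling resumes at the current node rather than failing. The control flow, modelled as a standalone toy that packs fixed-size records into bounded buffers:

#include <stdio.h>

#define MSG_CAP 32
#define REC_LEN 10

static int put_record(char *msg, size_t *used, int rec)
{
	if (*used + REC_LEN > MSG_CAP)
		return -90;	/* -EMSGSIZE */
	*used += snprintf(msg + *used, REC_LEN, "node-%03d ", rec);
	return 0;
}

int main(void)
{
	char msg[MSG_CAP + 1] = "";
	size_t used = 0;
	int rec = 0, nrec = 7, msgs = 0;

	while (rec < nrec) {
		if (put_record(msg, &used, rec) == -90) {
			printf("msg %d: %s\n", ++msgs, msg); /* "unicast" */
			used = 0;	/* restart: fresh message,     */
			msg[0] = '\0';	/* keep the current position   */
			continue;	/* retry the same record       */
		}
		rec++;
	}
	if (used)
		printf("msg %d: %s\n", ++msgs, msg);
	return 0;
}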
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index d3547e8c6d5b..f4b9f7a3ce51 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -151,16 +151,16 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
if (!port)
return -ENOMEM;
+ port->hsr = hsr;
+ port->dev = dev;
+ port->type = type;
+
if (type != HSR_PT_MASTER) {
res = hsr_portdev_setup(hsr, dev, port, extack);
if (res)
goto fail_dev_setup;
}
- port->hsr = hsr;
- port->dev = dev;
- port->type = type;
-
list_add_tail_rcu(&port->port_list, &hsr->ports);
synchronize_rcu();
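
The hsr_slave.c reordering enforces the usual publish-after-init rule: hsr_portdev_setup() can make the port reachable (via the rx handler) before the old code had filled in hsr/dev/type. The same rule in portable userspace form, with a release store standing in for the kernel's publication points:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct port { int type; void *dev; };

static _Atomic(struct port *) published;

static void add_port(int type, void *dev)
{
	struct port *p = malloc(sizeof(*p));

	if (!p)
		return;
	p->type = type;		/* initialize fully first ... */
	p->dev = dev;
	atomic_store_explicit(&published, p, memory_order_release);
				/* ... then make it visible */
}

int main(void)
{
	int dummy;

	add_port(1, &dummy);
	printf("type %d\n", atomic_load_explicit(&published,
						 memory_order_acquire)->type);
	return 0;
}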
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index f96bd489b362..25a8888826b8 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -129,7 +129,7 @@ config IP_PNP_DHCP
If unsure, say Y. Note that if you want to use DHCP, a DHCP server
must be operating on your network. Read
- <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+ <file:Documentation/admin-guide/nfs/nfsroot.rst> for details.
config IP_PNP_BOOTP
bool "IP: BOOTP support"
@@ -144,7 +144,7 @@ config IP_PNP_BOOTP
does BOOTP itself, providing all necessary information on the kernel
command line, you can say N here. If unsure, say Y. Note that if you
want to use BOOTP, a BOOTP server must be operating on your network.
- Read <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+ Read <file:Documentation/admin-guide/nfs/nfsroot.rst> for details.
config IP_PNP_RARP
bool "IP: RARP support"
@@ -157,7 +157,7 @@ config IP_PNP_RARP
older protocol which is being obsoleted by BOOTP and DHCP), say Y
here. Note that if you want to use RARP, a RARP server must be
operating on your network. Read
- <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+ <file:Documentation/admin-guide/nfs/nfsroot.rst> for details.
config NET_IPIP
tristate "IP: tunneling"
@@ -303,6 +303,7 @@ config SYN_COOKIES
config NET_IPVTI
tristate "Virtual (secure) IP: tunneling"
+ depends on IPV6 || IPV6=n
select INET_TUNNEL
select NET_IP_TUNNEL
select XFRM
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index bd7b4e92e07f..cf58e29cf746 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1793,6 +1793,10 @@ static __net_exit void ipv4_mib_exit_net(struct net *net)
free_percpu(net->mib.net_statistics);
free_percpu(net->mib.ip_statistics);
free_percpu(net->mib.tcp_statistics);
+#ifdef CONFIG_MPTCP
+ /* allocated on demand, see mptcp_init_sock() */
+ free_percpu(net->mib.mptcp_statistics);
+#endif
}
static __net_initdata struct pernet_operations ipv4_mib_ops = {
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 574972bc7299..e3939f76b024 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -7,6 +7,7 @@
#include <linux/btf.h>
#include <linux/filter.h>
#include <net/tcp.h>
+#include <net/bpf_sk_storage.h>
static u32 optional_ops[] = {
offsetof(struct tcp_congestion_ops, init),
@@ -27,6 +28,27 @@ static u32 unsupported_ops[] = {
static const struct btf_type *tcp_sock_type;
static u32 tcp_sock_id, sock_id;
+static int btf_sk_storage_get_ids[5];
+static struct bpf_func_proto btf_sk_storage_get_proto __read_mostly;
+
+static int btf_sk_storage_delete_ids[5];
+static struct bpf_func_proto btf_sk_storage_delete_proto __read_mostly;
+
+static void convert_sk_func_proto(struct bpf_func_proto *to, int *to_btf_ids,
+ const struct bpf_func_proto *from)
+{
+ int i;
+
+ *to = *from;
+ to->btf_id = to_btf_ids;
+ for (i = 0; i < ARRAY_SIZE(to->arg_type); i++) {
+ if (to->arg_type[i] == ARG_PTR_TO_SOCKET) {
+ to->arg_type[i] = ARG_PTR_TO_BTF_ID;
+ to->btf_id[i] = tcp_sock_id;
+ }
+ }
+}
+
static int bpf_tcp_ca_init(struct btf *btf)
{
s32 type_id;
@@ -42,6 +64,13 @@ static int bpf_tcp_ca_init(struct btf *btf)
tcp_sock_id = type_id;
tcp_sock_type = btf_type_by_id(btf, tcp_sock_id);
+ convert_sk_func_proto(&btf_sk_storage_get_proto,
+ btf_sk_storage_get_ids,
+ &bpf_sk_storage_get_proto);
+ convert_sk_func_proto(&btf_sk_storage_delete_proto,
+ btf_sk_storage_delete_ids,
+ &bpf_sk_storage_delete_proto);
+
return 0;
}
@@ -167,6 +196,10 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
switch (func_id) {
case BPF_FUNC_tcp_send_ack:
return &bpf_tcp_send_ack_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &btf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &btf_sk_storage_delete_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -184,7 +217,6 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t,
{
const struct tcp_congestion_ops *utcp_ca;
struct tcp_congestion_ops *tcp_ca;
- size_t tcp_ca_name_len;
int prog_fd;
u32 moff;
@@ -199,13 +231,11 @@ static int bpf_tcp_ca_init_member(const struct btf_type *t,
tcp_ca->flags = utcp_ca->flags;
return 1;
case offsetof(struct tcp_congestion_ops, name):
- tcp_ca_name_len = strnlen(utcp_ca->name, sizeof(utcp_ca->name));
- if (!tcp_ca_name_len ||
- tcp_ca_name_len == sizeof(utcp_ca->name))
+ if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name,
+ sizeof(tcp_ca->name)) <= 0)
return -EINVAL;
if (tcp_ca_find(utcp_ca->name))
return -EEXIST;
- memcpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name));
return 1;
}
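
With the converted protos above, a struct_ops congestion control may pass its tcp_sock (a PTR_TO_BTF_ID) where bpf_sk_storage_get()/_delete() normally take a socket. A BPF-side sketch modelled on the kernel selftests for this feature; vmlinux.h, the section name and all demo_* identifiers are assumptions of the sketch, not a complete congestion control:

/* clang -O2 -target bpf -c demo_ca_stg.c (sketch) */
#include "vmlinux.h"		/* assumed: BTF-generated kernel types */
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct sk_stg { __u32 invoked; };

struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct sk_stg);
} sk_stg_map SEC(".maps");

SEC("struct_ops/demo_cong_avoid")
void BPF_PROG(demo_cong_avoid, struct sock *sk, __u32 ack, __u32 acked)
{
	struct tcp_sock *tp = (struct tcp_sock *)sk;
	struct sk_stg *stg;

	/* tp is verified as PTR_TO_BTF_ID of tcp_sock, matching the
	 * rewritten argument type of the helper proto above
	 */
	stg = bpf_sk_storage_get(&sk_stg_map, tp, NULL,
				 BPF_SK_STORAGE_GET_F_CREATE);
	if (stg)
		stg->invoked++;
}

char _license[] SEC("license") = "GPL";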
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 103c7d599a3c..8b07f3a4f2db 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -341,22 +341,6 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
esp_output_done(base, err);
}
-static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
-{
- /* Fill padding... */
- if (tfclen) {
- memset(tail, 0, tfclen);
- tail += tfclen;
- }
- do {
- int i;
- for (i = 0; i < plen - 2; i++)
- tail[i] = i + 1;
- } while (0);
- tail[plen - 2] = plen - 2;
- tail[plen - 1] = proto;
-}
-
static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb,
int encap_type,
struct esp_info *esp,
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index e2e219c7854a..731022cff600 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -132,6 +132,36 @@ static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x,
return segs;
}
+static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ const struct net_offload *ops;
+ int proto = xo->proto;
+
+ skb->transport_header += x->props.header_len;
+
+ if (proto == IPPROTO_BEETPH) {
+ struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data;
+
+ skb->transport_header += ph->hdrlen * 8;
+ proto = ph->nexthdr;
+ } else if (x->sel.family != AF_INET6) {
+ skb->transport_header -= IPV4_BEET_PHMAXLEN;
+ } else if (proto == IPPROTO_TCP) {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+ }
+
+ __skb_pull(skb, skb_transport_offset(skb));
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ return segs;
+}
+
static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x,
struct sk_buff *skb,
netdev_features_t features)
@@ -141,6 +171,8 @@ static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x,
return xfrm4_tunnel_gso_segment(x, skb, features);
case XFRM_MODE_TRANSPORT:
return xfrm4_transport_gso_segment(x, skb, features);
+ case XFRM_MODE_BEET:
+ return xfrm4_beet_gso_segment(x, skb, features);
}
return ERR_PTR(-EOPNOTSUPP);
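
The BEET branch above leans on the pseudo header layout: ip_beet_phdr's hdrlen is expressed in 8-byte units, and the GSO path advances the inner transport header by hdrlen * 8 accordingly. A standalone restatement of that arithmetic (struct layout as in include/uapi/linux/ip.h):

#include <assert.h>
#include <stdint.h>

struct ip_beet_phdr {
	uint8_t nexthdr;
	uint8_t hdrlen;
	uint8_t padlen;
	uint8_t reserved;
};

int main(void)
{
	struct ip_beet_phdr ph = { .nexthdr = 6 /* TCP */, .hdrlen = 1 };
	unsigned int transport_off = 0;

	/* mirrors: skb->transport_header += ph->hdrlen * 8; */
	transport_off += ph.hdrlen * 8;
	assert(transport_off == 8 && ph.nexthdr == 6);
	return 0;
}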
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 577db1d50a24..213be9c050ad 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -997,7 +997,9 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
return -ENOENT;
}
+ rcu_read_lock();
err = fib_table_dump(tb, skb, cb, &filter);
+ rcu_read_unlock();
return skb->len ? : err;
}
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index c092e9a55790..818916b2a04d 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -35,7 +35,7 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
void fib_release_info(struct fib_info *);
struct fib_info *fib_create_info(struct fib_config *cfg,
struct netlink_ext_ack *extack);
-int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
+int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
struct netlink_ext_ack *extack);
bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi);
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e4c62b8f57a8..6ed8c9317179 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -570,8 +570,9 @@ static int fib_detect_death(struct fib_info *fi, int order,
return 1;
}
-int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *encap,
- u16 encap_type, void *cfg, gfp_t gfp_flags,
+int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc,
+ struct nlattr *encap, u16 encap_type,
+ void *cfg, gfp_t gfp_flags,
struct netlink_ext_ack *extack)
{
int err;
@@ -589,8 +590,9 @@ int fib_nh_common_init(struct fib_nh_common *nhc, struct nlattr *encap,
err = -EINVAL;
goto lwt_failure;
}
- err = lwtunnel_build_state(encap_type, encap, nhc->nhc_family,
- cfg, &lwtstate, extack);
+ err = lwtunnel_build_state(net, encap_type, encap,
+ nhc->nhc_family, cfg, &lwtstate,
+ extack);
if (err)
goto lwt_failure;
@@ -614,7 +616,7 @@ int fib_nh_init(struct net *net, struct fib_nh *nh,
nh->fib_nh_family = AF_INET;
- err = fib_nh_common_init(&nh->nh_common, cfg->fc_encap,
+ err = fib_nh_common_init(net, &nh->nh_common, cfg->fc_encap,
cfg->fc_encap_type, cfg, GFP_KERNEL, extack);
if (err)
return err;
@@ -814,7 +816,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
-static int fib_encap_match(u16 encap_type,
+static int fib_encap_match(struct net *net, u16 encap_type,
struct nlattr *encap,
const struct fib_nh *nh,
const struct fib_config *cfg,
@@ -826,7 +828,7 @@ static int fib_encap_match(u16 encap_type,
if (encap_type == LWTUNNEL_ENCAP_NONE)
return 0;
- ret = lwtunnel_build_state(encap_type, encap, AF_INET,
+ ret = lwtunnel_build_state(net, encap_type, encap, AF_INET,
cfg, &lwtstate, extack);
if (!ret) {
result = lwtunnel_cmp_encap(lwtstate, nh->fib_nh_lws);
@@ -836,7 +838,7 @@ static int fib_encap_match(u16 encap_type,
return result;
}
-int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
+int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
struct netlink_ext_ack *extack)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -857,8 +859,8 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
struct fib_nh *nh = fib_info_nh(fi, 0);
if (cfg->fc_encap) {
- if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap,
- nh, cfg, extack))
+ if (fib_encap_match(net, cfg->fc_encap_type,
+ cfg->fc_encap, nh, cfg, extack))
return 1;
}
#ifdef CONFIG_IP_ROUTE_CLASSID
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index f4c2ac445b3f..4f334b425538 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1679,7 +1679,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
fi->fib_prefsrc == cfg->fc_prefsrc) &&
(!cfg->fc_protocol ||
fi->fib_protocol == cfg->fc_protocol) &&
- fib_nh_match(cfg, fi, extack) == 0 &&
+ fib_nh_match(net, cfg, fi, extack) == 0 &&
fib_metrics_match(cfg, fi)) {
fa_to_delete = fa;
break;
@@ -2572,6 +2572,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
" %zd bytes, size of tnode: %zd bytes.\n",
LEAF_SIZE, TNODE_SIZE(0));
+ rcu_read_lock();
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
struct fib_table *tb;
@@ -2591,7 +2592,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
trie_show_usage(seq, t->stats);
#endif
}
+ cond_resched_rcu();
}
+ rcu_read_unlock();
return 0;
}
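
The pairing added above — rcu_read_lock() around the whole walk plus cond_resched_rcu() per bucket — is the standard pattern for long RCU-protected traversals. A kernel-context sketch, where my_table, MY_TABLE_HASHSZ and walk_one_bucket() are hypothetical stand-ins:

/* cond_resched_rcu() expands to rcu_read_unlock(); cond_resched();
 * rcu_read_lock(), so no RCU-protected pointer may be held across it.
 */
static void walk_all_buckets(struct my_table *tbl)
{
	unsigned int h;

	rcu_read_lock();
	for (h = 0; h < MY_TABLE_HASHSZ; h++) {
		walk_one_bucket(tbl, h);	/* must not cache entries */
		cond_resched_rcu();
	}
	rcu_read_unlock();
}
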
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 8274f98c511c..029b24eeafba 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1153,6 +1153,24 @@ static int ipgre_netlink_parms(struct net_device *dev,
if (data[IFLA_GRE_FWMARK])
*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
+ return 0;
+}
+
+static int erspan_netlink_parms(struct net_device *dev,
+ struct nlattr *data[],
+ struct nlattr *tb[],
+ struct ip_tunnel_parm *parms,
+ __u32 *fwmark)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err;
+
+ err = ipgre_netlink_parms(dev, data, tb, parms, fwmark);
+ if (err)
+ return err;
+ if (!data)
+ return 0;
+
if (data[IFLA_GRE_ERSPAN_VER]) {
t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
@@ -1276,45 +1294,70 @@ static void ipgre_tap_setup(struct net_device *dev)
ip_tunnel_setup(dev, gre_tap_net_id);
}
-static int ipgre_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
- struct netlink_ext_ack *extack)
+static int
+ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[])
{
- struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
- __u32 fwmark = 0;
- int err;
if (ipgre_netlink_encap_parms(data, &ipencap)) {
struct ip_tunnel *t = netdev_priv(dev);
- err = ip_tunnel_encap_setup(t, &ipencap);
+ int err = ip_tunnel_encap_setup(t, &ipencap);
if (err < 0)
return err;
}
+ return 0;
+}
+
+static int ipgre_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip_tunnel_parm p;
+ __u32 fwmark = 0;
+ int err;
+
+ err = ipgre_newlink_encap_setup(dev, data);
+ if (err)
+ return err;
+
err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
if (err < 0)
return err;
return ip_tunnel_newlink(dev, tb, &p, fwmark);
}
+static int erspan_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip_tunnel_parm p;
+ __u32 fwmark = 0;
+ int err;
+
+ err = ipgre_newlink_encap_setup(dev, data);
+ if (err)
+ return err;
+
+ err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
+ if (err)
+ return err;
+ return ip_tunnel_newlink(dev, tb, &p, fwmark);
+}
+
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[],
struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct ip_tunnel_encap ipencap;
__u32 fwmark = t->fwmark;
struct ip_tunnel_parm p;
int err;
- if (ipgre_netlink_encap_parms(data, &ipencap)) {
- err = ip_tunnel_encap_setup(t, &ipencap);
-
- if (err < 0)
- return err;
- }
+ err = ipgre_newlink_encap_setup(dev, data);
+ if (err)
+ return err;
err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
if (err < 0)
@@ -1327,8 +1370,34 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
t->parms.i_flags = p.i_flags;
t->parms.o_flags = p.o_flags;
- if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
- ipgre_link_update(dev, !tb[IFLA_MTU]);
+ ipgre_link_update(dev, !tb[IFLA_MTU]);
+
+ return 0;
+}
+
+static int erspan_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ __u32 fwmark = t->fwmark;
+ struct ip_tunnel_parm p;
+ int err;
+
+ err = ipgre_newlink_encap_setup(dev, data);
+ if (err)
+ return err;
+
+ err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
+ if (err < 0)
+ return err;
+
+ err = ip_tunnel_changelink(dev, tb, &p, fwmark);
+ if (err < 0)
+ return err;
+
+ t->parms.i_flags = p.i_flags;
+ t->parms.o_flags = p.o_flags;
return 0;
}
@@ -1519,8 +1588,8 @@ static struct rtnl_link_ops erspan_link_ops __read_mostly = {
.priv_size = sizeof(struct ip_tunnel),
.setup = erspan_setup,
.validate = erspan_validate,
- .newlink = ipgre_newlink,
- .changelink = ipgre_changelink,
+ .newlink = erspan_newlink,
+ .changelink = erspan_changelink,
.dellink = ip_tunnel_dellink,
.get_size = ipgre_get_size,
.fill_info = ipgre_fill_info,
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index aa438c6758a7..b0c244af1e4d 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -509,7 +509,8 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
IPCB(skb)->iif = skb->skb_iif;
/* Must drop socket now because of tproxy. */
- skb_orphan(skb);
+ if (!skb_sk_is_prefetched(skb))
+ skb_orphan(skb);
return skb;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index aaaaf907e0d8..090d3097ee15 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -263,7 +263,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
* insufficient MTU.
*/
features = netif_skb_features(skb);
- BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
+ BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET);
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs)) {
kfree_skb(skb);
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 74e1d964a615..cd4b84310d92 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -142,11 +142,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
cand = t;
}
- if (flags & TUNNEL_NO_KEY)
- goto skip_key_lookup;
-
hlist_for_each_entry_rcu(t, head, hash_node) {
- if (t->parms.i_key != key ||
+ if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
t->parms.iph.saddr != 0 ||
t->parms.iph.daddr != 0 ||
!(t->dev->flags & IFF_UP))
@@ -158,7 +155,6 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
cand = t;
}
-skip_key_lookup:
if (cand)
return cand;
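
The rewrite above folds the TUNNEL_NO_KEY special case into the loop condition instead of jumping over the keyed lookup, so keyless tunnels are still matched against the remaining criteria (wildcard saddr/daddr, IFF_UP). A userspace sketch of the folded predicate; the flag value is illustrative, not the uapi one:

#include <stdbool.h>
#include <stdint.h>

#define TUNNEL_NO_KEY_SKETCH 0x01	/* illustrative flag value */

/* true when the entry must be skipped because its key does not match */
static bool key_mismatch(uint16_t flags, uint32_t entry_key, uint32_t key)
{
	return !(flags & TUNNEL_NO_KEY_SKETCH) && entry_key != key;
}
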
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 47f8b947eef1..181b7a2a0247 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -432,7 +432,7 @@ static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info,
return ip_tun_parse_opts(attr, info, extack);
}
-static int ip_tun_build_state(struct nlattr *attr,
+static int ip_tun_build_state(struct net *net, struct nlattr *attr,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
@@ -719,7 +719,7 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
[LWTUNNEL_IP6_OPTS] = { .type = NLA_NESTED },
};
-static int ip6_tun_build_state(struct nlattr *attr,
+static int ip6_tun_build_state(struct net *net, struct nlattr *attr,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 37cddd18f282..1b4e6f298648 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -187,17 +187,39 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
int mtu;
if (!dst) {
- struct rtable *rt;
-
- fl->u.ip4.flowi4_oif = dev->ifindex;
- fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
- rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
- if (IS_ERR(rt)) {
+ switch (skb->protocol) {
+ case htons(ETH_P_IP): {
+ struct rtable *rt;
+
+ fl->u.ip4.flowi4_oif = dev->ifindex;
+ fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
+ rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
+ if (IS_ERR(rt)) {
+ dev->stats.tx_carrier_errors++;
+ goto tx_error_icmp;
+ }
+ dst = &rt->dst;
+ skb_dst_set(skb, dst);
+ break;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ fl->u.ip6.flowi6_oif = dev->ifindex;
+ fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
+ dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
+ if (dst->error) {
+ dst_release(dst);
+ dst = NULL;
+ dev->stats.tx_carrier_errors++;
+ goto tx_error_icmp;
+ }
+ skb_dst_set(skb, dst);
+ break;
+#endif
+ default:
dev->stats.tx_carrier_errors++;
goto tx_error_icmp;
}
- dst = &rt->dst;
- skb_dst_set(skb, dst);
}
dst_hold(dst);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 4438f6b12335..561f15b5a944 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -1621,7 +1621,7 @@ late_initcall(ip_auto_config);
/*
* Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel
- * command line parameter. See Documentation/filesystems/nfs/nfsroot.txt.
+ * command line parameter. See Documentation/admin-guide/nfs/nfsroot.rst.
*/
static int __init ic_proto_name(char *name)
{
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 2580303249e2..75545a829a2b 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -32,6 +32,7 @@
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/tcp.h>
+#include <net/mptcp.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <linux/bottom_half.h>
@@ -485,6 +486,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
offsetof(struct ipstats_mib, syncp)));
seq_putc(seq, '\n');
+ mptcp_seq_show(seq);
return 0;
}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5c57850fab4b..6d87de434377 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2948,8 +2948,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
err = -EPERM;
else if (tp->repair_queue == TCP_SEND_QUEUE)
WRITE_ONCE(tp->write_seq, val);
- else if (tp->repair_queue == TCP_RECV_QUEUE)
+ else if (tp->repair_queue == TCP_RECV_QUEUE) {
WRITE_ONCE(tp->rcv_nxt, val);
+ WRITE_ONCE(tp->copied_seq, val);
+ }
else
err = -EINVAL;
break;
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index fe7b4fbc31c1..5a05327f97c1 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -10,38 +10,6 @@
#include <net/inet_common.h>
#include <net/tls.h>
-static bool tcp_bpf_stream_read(const struct sock *sk)
-{
- struct sk_psock *psock;
- bool empty = true;
-
- rcu_read_lock();
- psock = sk_psock(sk);
- if (likely(psock))
- empty = list_empty(&psock->ingress_msg);
- rcu_read_unlock();
- return !empty;
-}
-
-static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
- int flags, long timeo, int *err)
-{
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
- int ret = 0;
-
- if (!timeo)
- return ret;
-
- add_wait_queue(sk_sleep(sk), &wait);
- sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- ret = sk_wait_event(sk, &timeo,
- !list_empty(&psock->ingress_msg) ||
- !skb_queue_empty(&sk->sk_receive_queue), &wait);
- sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- remove_wait_queue(sk_sleep(sk), &wait);
- return ret;
-}
-
int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
struct msghdr *msg, int len, int flags)
{
@@ -115,49 +83,6 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
}
EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
-int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int nonblock, int flags, int *addr_len)
-{
- struct sk_psock *psock;
- int copied, ret;
-
- psock = sk_psock_get(sk);
- if (unlikely(!psock))
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
- if (unlikely(flags & MSG_ERRQUEUE))
- return inet_recv_error(sk, msg, len, addr_len);
- if (!skb_queue_empty(&sk->sk_receive_queue) &&
- sk_psock_queue_empty(psock))
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
- lock_sock(sk);
-msg_bytes_ready:
- copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
- if (!copied) {
- int data, err = 0;
- long timeo;
-
- timeo = sock_rcvtimeo(sk, nonblock);
- data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
- if (data) {
- if (!sk_psock_queue_empty(psock))
- goto msg_bytes_ready;
- release_sock(sk);
- sk_psock_put(sk, psock);
- return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
- }
- if (err) {
- ret = err;
- goto out;
- }
- copied = -EAGAIN;
- }
- ret = copied;
-out:
- release_sock(sk);
- sk_psock_put(sk, psock);
- return ret;
-}
-
static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg, u32 apply_bytes, int flags)
{
@@ -298,6 +223,82 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
}
EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
+#ifdef CONFIG_BPF_STREAM_PARSER
+static bool tcp_bpf_stream_read(const struct sock *sk)
+{
+ struct sk_psock *psock;
+ bool empty = true;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock))
+ empty = list_empty(&psock->ingress_msg);
+ rcu_read_unlock();
+ return !empty;
+}
+
+static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
+ int flags, long timeo, int *err)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ int ret = 0;
+
+ if (!timeo)
+ return ret;
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ ret = sk_wait_event(sk, &timeo,
+ !list_empty(&psock->ingress_msg) ||
+ !skb_queue_empty(&sk->sk_receive_queue), &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return ret;
+}
+
+static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int nonblock, int flags, int *addr_len)
+{
+ struct sk_psock *psock;
+ int copied, ret;
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+ if (!skb_queue_empty(&sk->sk_receive_queue) &&
+ sk_psock_queue_empty(psock))
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ lock_sock(sk);
+msg_bytes_ready:
+ copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags);
+ if (!copied) {
+ int data, err = 0;
+ long timeo;
+
+ timeo = sock_rcvtimeo(sk, nonblock);
+ data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
+ if (data) {
+ if (!sk_psock_queue_empty(psock))
+ goto msg_bytes_ready;
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+ }
+ if (err) {
+ ret = err;
+ goto out;
+ }
+ copied = -EAGAIN;
+ }
+ ret = copied;
+out:
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return ret;
+}
+
static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
struct sk_msg *msg, int *copied, int flags)
{
@@ -528,7 +529,6 @@ out_err:
return copied ? copied : err;
}
-#ifdef CONFIG_BPF_STREAM_PARSER
enum {
TCP_BPF_IPV4,
TCP_BPF_IPV6,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 03af7c3e75ef..7e40322cc5ec 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -774,6 +774,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (!child)
goto listen_overflow;
+ if (own_req && sk_is_mptcp(child) && mptcp_sk_is_subflow(child)) {
+ reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
+ return child;
+ }
+
sock_rps_save_rxhash(child, skb);
tcp_synack_rtt_meas(child, req);
*req_stolen = !own_req;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 306e25d743e8..2f45cde168c4 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1109,6 +1109,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
if (unlikely(!skb))
return -ENOBUFS;
+ /* retransmitted skbs might have a non-zero value in skb->dev
+ * because skb->dev is aliased with skb->rbnode.rb_left.
+ */
+ skb->dev = NULL;
}
inet = inet_sk(sk);
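
The comment above refers to storage sharing in struct sk_buff: the queue pointers (next/prev/dev) sit in a union with the rb-tree node used by the TCP retransmit queue, so dev overlays rbnode.rb_left. A simplified, hypothetical layout showing why dev must be cleared before the skb is handed back to the transmit path:

struct rb_node_sketch {
	unsigned long parent_color;		/* overlays skb->next */
	struct rb_node_sketch *rb_right;	/* overlays skb->prev */
	struct rb_node_sketch *rb_left;		/* overlays skb->dev */
};

struct skb_head_sketch {
	union {
		struct {
			void *next;
			void *prev;
			void *dev;	/* stale rb_left after rtx queue */
		};
		struct rb_node_sketch rbnode;
	};
};
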
@@ -3037,8 +3041,12 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
tcp_skb_tsorted_save(skb) {
nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
- err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
- -ENOBUFS;
+ if (nskb) {
+ nskb->dev = NULL;
+ err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
+ } else {
+ err = -ENOBUFS;
+ }
} tcp_skb_tsorted_restore(skb);
if (!err) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 2633fc231593..32564b350823 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2109,7 +2109,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
if (likely(!udp_unexpected_gso(sk, skb)))
return udp_queue_rcv_one_skb(sk, skb);
- BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET);
+ BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET);
__skb_push(skb, -skb_mac_offset(skb));
segs = udp_rcv_segment(sk, skb, true);
skb_list_walk_safe(segs, skb, next) {
@@ -2288,6 +2288,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
struct rtable *rt = skb_rtable(skb);
__be32 saddr, daddr;
struct net *net = dev_net(skb->dev);
+ bool refcounted;
/*
* Validate the packet.
@@ -2313,7 +2314,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (udp4_csum_init(skb, uh, proto))
goto csum_error;
- sk = skb_steal_sock(skb);
+ sk = skb_steal_sock(skb, &refcounted);
if (sk) {
struct dst_entry *dst = skb_dst(skb);
int ret;
@@ -2322,7 +2323,8 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
udp_sk_rx_dst_set(sk, dst);
ret = udp_unicast_rcv_skb(sk, skb, uh);
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return ret;
}
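
skb_steal_sock() now reports through the new refcounted out-parameter whether it handed over a socket reference — sockets prefetched by BPF may be reference-less — so the receive path only drops a reference it actually owns. A kernel-context sketch of the calling convention; handle_unicast() and lookup_and_deliver() are hypothetical:

static int rcv_sketch(struct sk_buff *skb)
{
	bool refcounted;
	struct sock *sk = skb_steal_sock(skb, &refcounted);

	if (sk) {
		int ret = handle_unicast(sk, skb);

		if (refcounted)	/* only put a reference we were given */
			sock_put(sk);
		return ret;
	}
	return lookup_and_deliver(skb);	/* hypothetical slow path */
}
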
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 1a98583a79f4..e67a66fbf27b 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -453,6 +453,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
unsigned int off = skb_gro_offset(skb);
int flush = 1;
+ NAPI_GRO_CB(skb)->is_flist = 0;
if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled: 1;
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index ae1344e4cec5..2ccaee98fddb 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -303,4 +303,14 @@ config IPV6_SEG6_BPF
depends on IPV6_SEG6_LWTUNNEL
depends on IPV6 = y
+config IPV6_RPL_LWTUNNEL
+ bool "IPv6: RPL Source Routing Header support"
+ depends on IPV6
+ select LWTUNNEL
+ ---help---
+ Support for RFC6554 RPL Source Routing Header using the lightweight
+ tunnels mechanism.
+
+ If unsure, say N.
+
endif # IPV6
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 8ccf35514015..cf7b47bdb9b3 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -10,7 +10,7 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
- udp_offload.o seg6.o fib6_notifier.o
+ udp_offload.o seg6.o fib6_notifier.o rpl.o
ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o
@@ -26,6 +26,7 @@ ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
ipv6-$(CONFIG_NETLABEL) += calipso.o
ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o seg6_local.o
ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o
+ipv6-$(CONFIG_IPV6_RPL_LWTUNNEL) += rpl_iptunnel.o
ipv6-objs += $(ipv6-y)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 5b9de773ce73..24e319dfb510 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -236,6 +236,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.enhanced_dad = 1,
.addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64,
.disable_policy = 0,
+ .rpl_seg_enabled = 0,
};
static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -290,6 +291,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.enhanced_dad = 1,
.addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64,
.disable_policy = 0,
+ .rpl_seg_enabled = 0,
};
/* Check if link is ready: is it up and is a valid qdisc available */
@@ -1355,7 +1357,7 @@ retry:
regen_advance = idev->cnf.regen_max_retry *
idev->cnf.dad_transmits *
- NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ;
+ max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ;
/* recalculate max_desync_factor each time and update
* idev->desync_factor if it's larger
@@ -3296,6 +3298,10 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
if (netif_is_l3_master(idev->dev))
return;
+ /* no link local addresses on devices flagged as slaves */
+ if (idev->dev->flags & IFF_SLAVE)
+ return;
+
ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
switch (idev->cnf.addr_gen_mode) {
@@ -4115,7 +4121,8 @@ static void addrconf_dad_work(struct work_struct *w)
ifp->dad_probes--;
addrconf_mod_dad_work(ifp,
- NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME));
+ max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME),
+ HZ/100));
spin_unlock(&ifp->lock);
write_unlock_bh(&idev->lock);
@@ -4398,6 +4405,59 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr)
}
#endif
+/* RFC6554 describes an algorithm to avoid loops in segment routing by
+ * checking whether the segment list contains any local interface address
+ * more than once.
+ *
+ * Quote:
+ *
+ * To detect loops in the SRH, a router MUST determine if the SRH
+ * includes multiple addresses assigned to any interface on that router.
+ * If such addresses appear more than once and are separated by at least
+ * one address not assigned to that router, the router MUST drop the
+ * packet.
+ */
+int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs,
+ unsigned char nsegs)
+{
+ const struct in6_addr *addr;
+ int i, ret = 0, found = 0;
+ struct inet6_ifaddr *ifp;
+ bool separated = false;
+ unsigned int hash;
+ bool hash_found;
+
+ rcu_read_lock();
+ for (i = 0; i < nsegs; i++) {
+ addr = &segs[i];
+ hash = inet6_addr_hash(net, addr);
+
+ hash_found = false;
+ hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
+ if (!net_eq(dev_net(ifp->idev->dev), net))
+ continue;
+
+ if (ipv6_addr_equal(&ifp->addr, addr)) {
+ hash_found = true;
+ break;
+ }
+ }
+
+ if (hash_found) {
+ if (found > 1 && separated) {
+ ret = 1;
+ break;
+ }
+
+ separated = false;
+ found++;
+ } else {
+ separated = true;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
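
A self-contained userspace sketch of the scan above, with the per-address hash lookup replaced by a precomputed boolean array; it mirrors the exact found/separated bookkeeping of ipv6_chk_rpl_srh_loop():

#include <stdbool.h>

static bool srh_has_loop(const bool *seg_is_local, int nsegs)
{
	bool separated = false;
	int found = 0, i;

	for (i = 0; i < nsegs; i++) {
		if (seg_is_local[i]) {
			if (found > 1 && separated)
				return true;	/* repeated local address */
			separated = false;
			found++;
		} else {
			separated = true;
		}
	}
	return false;
}
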
/*
* Periodic address status verification
*/
@@ -4468,7 +4528,7 @@ restart:
!(ifp->flags&IFA_F_TENTATIVE)) {
unsigned long regen_advance = ifp->idev->cnf.regen_max_retry *
ifp->idev->cnf.dad_transmits *
- NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME) / HZ;
+ max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), HZ/100) / HZ;
if (age >= ifp->prefered_lft - regen_advance) {
struct inet6_ifaddr *ifpub = ifp->ifpub;
@@ -5467,6 +5527,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode;
array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy;
array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass;
+ array[DEVCONF_RPL_SEG_ENABLED] = cnf->rpl_seg_enabled;
}
static inline size_t inet6_ifla6_size(void)
@@ -6848,6 +6909,13 @@ static const struct ctl_table addrconf_sysctl[] = {
.extra2 = (void *)&two_five_five,
},
{
+ .procname = "rpl_seg_enabled",
+ .data = &ipv6_devconf.rpl_seg_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
/* sentinel */
}
};
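
Usage sketch for the knob registered above: addrconf sysctls appear under /proc/sys/net/ipv6/conf/<dev>/, so enabling RPL segment processing on one interface amounts to writing that file (the interface name is an example). Note the receive path takes the minimum of conf/all and the per-device value, so both must be enabled.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv6/conf/eth0/rpl_seg_enabled", "w");

	if (!f)
		return 1;
	fputs("1\n", f);
	return fclose(f) ? 1 : 0;
}
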
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d727c3b41495..345baa0a754f 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -59,6 +59,7 @@
#endif
#include <net/calipso.h>
#include <net/seg6.h>
+#include <net/rpl.h>
#include <linux/uaccess.h>
#include <linux/mroute6.h>
@@ -1114,6 +1115,10 @@ static int __init inet6_init(void)
if (err)
goto seg6_fail;
+ err = rpl_init();
+ if (err)
+ goto rpl_fail;
+
err = igmp6_late_init();
if (err)
goto igmp6_late_err;
@@ -1136,6 +1141,8 @@ sysctl_fail:
igmp6_late_cleanup();
#endif
igmp6_late_err:
+ rpl_exit();
+rpl_fail:
seg6_exit();
seg6_fail:
calipso_exit();
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index a3b403ba8f8f..11143d039f16 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -207,22 +207,6 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
esp_output_done(base, err);
}
-static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
-{
- /* Fill padding... */
- if (tfclen) {
- memset(tail, 0, tfclen);
- tail += tfclen;
- }
- do {
- int i;
- for (i = 0; i < plen - 2; i++)
- tail[i] = i + 1;
- } while (0);
- tail[plen - 2] = plen - 2;
- tail[plen - 1] = proto;
-}
-
int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
{
u8 *tail;
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index fd535053245b..8eab2c869d61 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -159,6 +159,40 @@ static struct sk_buff *xfrm6_transport_gso_segment(struct xfrm_state *x,
return segs;
}
+static struct sk_buff *xfrm6_beet_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ const struct net_offload *ops;
+ int proto = xo->proto;
+
+ skb->transport_header += x->props.header_len;
+
+ if (proto == IPPROTO_BEETPH) {
+ struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data;
+
+ skb->transport_header += ph->hdrlen * 8;
+ proto = ph->nexthdr;
+ }
+
+ if (x->sel.family != AF_INET6) {
+ skb->transport_header -=
+ (sizeof(struct ipv6hdr) - sizeof(struct iphdr));
+
+ if (proto == IPPROTO_TCP)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
+ }
+
+ __skb_pull(skb, skb_transport_offset(skb));
+ ops = rcu_dereference(inet6_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ return segs;
+}
+
static struct sk_buff *xfrm6_outer_mode_gso_segment(struct xfrm_state *x,
struct sk_buff *skb,
netdev_features_t features)
@@ -168,6 +202,8 @@ static struct sk_buff *xfrm6_outer_mode_gso_segment(struct xfrm_state *x,
return xfrm6_tunnel_gso_segment(x, skb, features);
case XFRM_MODE_TRANSPORT:
return xfrm6_transport_gso_segment(x, skb, features);
+ case XFRM_MODE_BEET:
+ return xfrm6_beet_gso_segment(x, skb, features);
}
return ERR_PTR(-EOPNOTSUPP);
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index bcb9f5e62808..5a8bbcdcaf2b 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -48,6 +48,7 @@
#ifdef CONFIG_IPV6_SEG6_HMAC
#include <net/seg6_hmac.h>
#endif
+#include <net/rpl.h>
#include <linux/uaccess.h>
@@ -468,6 +469,195 @@ looped_back:
return -1;
}
+static int ipv6_rpl_srh_rcv(struct sk_buff *skb)
+{
+ struct ipv6_rpl_sr_hdr *hdr, *ohdr, *chdr;
+ struct inet6_skb_parm *opt = IP6CB(skb);
+ struct net *net = dev_net(skb->dev);
+ struct inet6_dev *idev;
+ struct ipv6hdr *oldhdr;
+ struct in6_addr addr;
+ unsigned char *buf;
+ int accept_rpl_seg;
+ int i, err;
+ u64 n = 0;
+ u32 r;
+
+ idev = __in6_dev_get(skb->dev);
+
+ accept_rpl_seg = net->ipv6.devconf_all->rpl_seg_enabled;
+ if (accept_rpl_seg > idev->cnf.rpl_seg_enabled)
+ accept_rpl_seg = idev->cnf.rpl_seg_enabled;
+
+ if (!accept_rpl_seg) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+looped_back:
+ hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb);
+
+ if (hdr->segments_left == 0) {
+ if (hdr->nexthdr == NEXTHDR_IPV6) {
+ int offset = (hdr->hdrlen + 1) << 3;
+
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ skb_network_header_len(skb));
+
+ if (!pskb_pull(skb, offset)) {
+ kfree_skb(skb);
+ return -1;
+ }
+ skb_postpull_rcsum(skb, skb_transport_header(skb),
+ offset);
+
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb->encapsulation = 0;
+
+ __skb_tunnel_rx(skb, skb->dev, net);
+
+ netif_rx(skb);
+ return -1;
+ }
+
+ opt->srcrt = skb_network_header_len(skb);
+ opt->lastopt = opt->srcrt;
+ skb->transport_header += (hdr->hdrlen + 1) << 3;
+ opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb);
+
+ return 1;
+ }
+
+ if (!pskb_may_pull(skb, sizeof(*hdr))) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ n = (hdr->hdrlen << 3) - hdr->pad - (16 - hdr->cmpre);
+ r = do_div(n, (16 - hdr->cmpri));
+ /* check that the division left no remainder and that n fits into
+ * the unsigned char segments_left field; it must not be larger.
+ */
+ if (r || (n + 1) > 255) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ if (hdr->segments_left > n + 1) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+ ((&hdr->segments_left) -
+ skb_network_header(skb)));
+ return -1;
+ }
+
+ if (skb_cloned(skb)) {
+ if (pskb_expand_head(skb, IPV6_RPL_SRH_WORST_SWAP_SIZE, 0,
+ GFP_ATOMIC)) {
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTDISCARDS);
+ kfree_skb(skb);
+ return -1;
+ }
+ } else {
+ err = skb_cow_head(skb, IPV6_RPL_SRH_WORST_SWAP_SIZE);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return -1;
+ }
+ }
+
+ hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb);
+
+ if (!pskb_may_pull(skb, ipv6_rpl_srh_size(n, hdr->cmpri,
+ hdr->cmpre))) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ hdr->segments_left--;
+ i = n - hdr->segments_left;
+
+ buf = kzalloc(ipv6_rpl_srh_alloc_size(n + 1) * 2, GFP_ATOMIC);
+ if (unlikely(!buf)) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ ohdr = (struct ipv6_rpl_sr_hdr *)buf;
+ ipv6_rpl_srh_decompress(ohdr, hdr, &ipv6_hdr(skb)->daddr, n);
+ chdr = (struct ipv6_rpl_sr_hdr *)(buf + ((ohdr->hdrlen + 1) << 3));
+
+ if ((ipv6_addr_type(&ipv6_hdr(skb)->daddr) & IPV6_ADDR_MULTICAST) ||
+ (ipv6_addr_type(&ohdr->rpl_segaddr[i]) & IPV6_ADDR_MULTICAST)) {
+ kfree_skb(skb);
+ kfree(buf);
+ return -1;
+ }
+
+ err = ipv6_chk_rpl_srh_loop(net, ohdr->rpl_segaddr, n + 1);
+ if (err) {
+ icmpv6_send(skb, ICMPV6_PARAMPROB, 0, 0);
+ kfree_skb(skb);
+ kfree(buf);
+ return -1;
+ }
+
+ addr = ipv6_hdr(skb)->daddr;
+ ipv6_hdr(skb)->daddr = ohdr->rpl_segaddr[i];
+ ohdr->rpl_segaddr[i] = addr;
+
+ ipv6_rpl_srh_compress(chdr, ohdr, &ipv6_hdr(skb)->daddr, n);
+
+ oldhdr = ipv6_hdr(skb);
+
+ skb_pull(skb, ((hdr->hdrlen + 1) << 3));
+ skb_postpull_rcsum(skb, oldhdr,
+ sizeof(struct ipv6hdr) + ((hdr->hdrlen + 1) << 3));
+ skb_push(skb, ((chdr->hdrlen + 1) << 3) + sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ memmove(ipv6_hdr(skb), oldhdr, sizeof(struct ipv6hdr));
+ memcpy(skb_transport_header(skb), chdr, (chdr->hdrlen + 1) << 3);
+
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+ skb_postpush_rcsum(skb, ipv6_hdr(skb),
+ sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3));
+
+ kfree(buf);
+
+ skb_dst_drop(skb);
+
+ ip6_route_input(skb);
+
+ if (skb_dst(skb)->error) {
+ dst_input(skb);
+ return -1;
+ }
+
+ if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
+ if (ipv6_hdr(skb)->hop_limit <= 1) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ icmpv6_send(skb, ICMPV6_TIME_EXCEED,
+ ICMPV6_EXC_HOPLIMIT, 0);
+ kfree_skb(skb);
+ return -1;
+ }
+ ipv6_hdr(skb)->hop_limit--;
+
+ skb_pull(skb, sizeof(struct ipv6hdr));
+ goto looped_back;
+ }
+
+ dst_input(skb);
+
+ return -1;
+}
+
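
The hdrlen arithmetic near the top of ipv6_rpl_srh_rcv() recovers the number of compressed middle segments: the segment bytes are hdrlen * 8 minus pad minus the (16 - cmpre) bytes of the final entry, divided by (16 - cmpri) per middle entry. A userspace sketch, assuming cmpri < 16; for example hdrlen = 2, pad = 0, cmpri = cmpre = 8 gives (16 - 0 - 8) / 8 = 1 middle segment, i.e. two addresses in total:

#include <stdint.h>

static int rpl_num_mid_segments(uint8_t hdrlen, uint8_t pad,
				uint8_t cmpri, uint8_t cmpre)
{
	unsigned int bytes = (hdrlen << 3) - pad - (16 - cmpre);
	unsigned int per = 16 - cmpri;	/* assumes cmpri < 16 */

	if (bytes % per)
		return -1;		/* malformed: remainder left over */
	if (bytes / per + 1 > 255)
		return -1;		/* would not fit segments_left */
	return bytes / per;		/* n; header holds n + 1 addresses */
}
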
/********************************
Routing header.
********************************/
@@ -506,9 +696,16 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
return -1;
}
- /* segment routing */
- if (hdr->type == IPV6_SRCRT_TYPE_4)
+ switch (hdr->type) {
+ case IPV6_SRCRT_TYPE_4:
+ /* segment routing */
return ipv6_srh_rcv(skb);
+ case IPV6_SRCRT_TYPE_3:
+ /* rpl segment routing */
+ return ipv6_rpl_srh_rcv(skb);
+ default:
+ break;
+ }
looped_back:
if (hdr->segments_left == 0) {
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 422dcc691f71..8c1ce78956ba 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -125,7 +125,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
[ILA_ATTR_HOOK_TYPE] = { .type = NLA_U8, },
};
-static int ila_build_state(struct nlattr *nla,
+static int ila_build_state(struct net *net, struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 7b089d0ac8cd..e96304d8a4a7 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -285,7 +285,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
rcu_read_unlock();
/* Must drop socket now because of tproxy. */
- skb_orphan(skb);
+ if (!skb_sk_is_prefetched(skb))
+ skb_orphan(skb);
return skb;
err:
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 524006aa0d78..cc6180e08a4f 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -311,7 +311,7 @@ static int vti6_rcv(struct sk_buff *skb)
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
rcu_read_unlock();
- return 0;
+ goto discard;
}
ipv6h = ipv6_hdr(skb);
@@ -450,15 +450,33 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
int mtu;
if (!dst) {
- fl->u.ip6.flowi6_oif = dev->ifindex;
- fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
- dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
- if (dst->error) {
- dst_release(dst);
- dst = NULL;
+ switch (skb->protocol) {
+ case htons(ETH_P_IP): {
+ struct rtable *rt;
+
+ fl->u.ip4.flowi4_oif = dev->ifindex;
+ fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
+ rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
+ if (IS_ERR(rt))
+ goto tx_err_link_failure;
+ dst = &rt->dst;
+ skb_dst_set(skb, dst);
+ break;
+ }
+ case htons(ETH_P_IPV6):
+ fl->u.ip6.flowi6_oif = dev->ifindex;
+ fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
+ dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
+ if (dst->error) {
+ dst_release(dst);
+ dst = NULL;
+ goto tx_err_link_failure;
+ }
+ skb_dst_set(skb, dst);
+ break;
+ default:
goto tx_err_link_failure;
}
- skb_dst_set(skb, dst);
}
dst_hold(dst);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 4a3feccd5b10..1ecd4e9b0bdf 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -197,6 +197,7 @@ static inline int ndisc_is_useropt(const struct net_device *dev,
return opt->nd_opt_type == ND_OPT_RDNSS ||
opt->nd_opt_type == ND_OPT_DNSSL ||
opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL ||
+ opt->nd_opt_type == ND_OPT_PREF64 ||
ndisc_ops_is_useropt(dev, opt->nd_opt_type);
}
@@ -1358,8 +1359,8 @@ skip_defrtr:
if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) {
rtime = (rtime*HZ)/1000;
- if (rtime < HZ/10)
- rtime = HZ/10;
+ if (rtime < HZ/100)
+ rtime = HZ/100;
NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime);
in6_dev->tstamp = jiffies;
send_ifinfo_notify = true;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index afcde55d537c..310cbddaa533 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3471,7 +3471,7 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
!netif_carrier_ok(dev))
fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
- err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
+ err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap,
cfg->fc_encap_type, cfg, gfp_flags, extack);
if (err)
goto out;
diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c
new file mode 100644
index 000000000000..d38b476fc7f2
--- /dev/null
+++ b/net/ipv6/rpl.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Authors:
+ * (C) 2020 Alexander Aring <alex.aring@gmail.com>
+ */
+
+#include <net/ipv6.h>
+#include <net/rpl.h>
+
+#define IPV6_PFXTAIL_LEN(x) (sizeof(struct in6_addr) - (x))
+
+static void ipv6_rpl_addr_decompress(struct in6_addr *dst,
+ const struct in6_addr *daddr,
+ const void *post, unsigned char pfx)
+{
+ memcpy(dst, daddr, pfx);
+ memcpy(&dst->s6_addr[pfx], post, IPV6_PFXTAIL_LEN(pfx));
+}
+
+static void ipv6_rpl_addr_compress(void *dst, const struct in6_addr *addr,
+ unsigned char pfx)
+{
+ memcpy(dst, &addr->s6_addr[pfx], IPV6_PFXTAIL_LEN(pfx));
+}
+
+static void *ipv6_rpl_segdata_pos(const struct ipv6_rpl_sr_hdr *hdr, int i)
+{
+ return (void *)&hdr->rpl_segdata[i * IPV6_PFXTAIL_LEN(hdr->cmpri)];
+}
+
+size_t ipv6_rpl_srh_size(unsigned char n, unsigned char cmpri,
+ unsigned char cmpre)
+{
+ return (n * IPV6_PFXTAIL_LEN(cmpri)) + IPV6_PFXTAIL_LEN(cmpre);
+}
+
+void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr,
+ const struct ipv6_rpl_sr_hdr *inhdr,
+ const struct in6_addr *daddr, unsigned char n)
+{
+ int i;
+
+ outhdr->nexthdr = inhdr->nexthdr;
+ outhdr->hdrlen = (((n + 1) * sizeof(struct in6_addr)) >> 3);
+ outhdr->pad = 0;
+ outhdr->type = inhdr->type;
+ outhdr->segments_left = inhdr->segments_left;
+ outhdr->cmpri = 0;
+ outhdr->cmpre = 0;
+
+ for (i = 0; i < n; i++)
+ ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[i], daddr,
+ ipv6_rpl_segdata_pos(inhdr, i),
+ inhdr->cmpri);
+
+ ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[n], daddr,
+ ipv6_rpl_segdata_pos(inhdr, n),
+ inhdr->cmpre);
+}
+
+static unsigned char ipv6_rpl_srh_calc_cmpri(const struct ipv6_rpl_sr_hdr *inhdr,
+ const struct in6_addr *daddr,
+ unsigned char n)
+{
+ unsigned char plen;
+ int i;
+
+ for (plen = 0; plen < sizeof(*daddr); plen++) {
+ for (i = 0; i < n; i++) {
+ if (daddr->s6_addr[plen] !=
+ inhdr->rpl_segaddr[i].s6_addr[plen])
+ return plen;
+ }
+ }
+
+ return plen;
+}
+
+static unsigned char ipv6_rpl_srh_calc_cmpre(const struct in6_addr *daddr,
+ const struct in6_addr *last_segment)
+{
+ unsigned int plen;
+
+ for (plen = 0; plen < sizeof(*daddr); plen++) {
+ if (daddr->s6_addr[plen] != last_segment->s6_addr[plen])
+ break;
+ }
+
+ return plen;
+}
+
+void ipv6_rpl_srh_compress(struct ipv6_rpl_sr_hdr *outhdr,
+ const struct ipv6_rpl_sr_hdr *inhdr,
+ const struct in6_addr *daddr, unsigned char n)
+{
+ unsigned char cmpri, cmpre;
+ size_t seglen;
+ int i;
+
+ cmpri = ipv6_rpl_srh_calc_cmpri(inhdr, daddr, n);
+ cmpre = ipv6_rpl_srh_calc_cmpre(daddr, &inhdr->rpl_segaddr[n]);
+
+ outhdr->nexthdr = inhdr->nexthdr;
+ seglen = (n * IPV6_PFXTAIL_LEN(cmpri)) + IPV6_PFXTAIL_LEN(cmpre);
+ outhdr->hdrlen = seglen >> 3;
+ if (seglen & 0x7) {
+ outhdr->hdrlen++;
+ outhdr->pad = 8 - (seglen & 0x7);
+ } else {
+ outhdr->pad = 0;
+ }
+ outhdr->type = inhdr->type;
+ outhdr->segments_left = inhdr->segments_left;
+ outhdr->cmpri = cmpri;
+ outhdr->cmpre = cmpre;
+
+ for (i = 0; i < n; i++)
+ ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, i),
+ &inhdr->rpl_segaddr[i], cmpri);
+
+ ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, n),
+ &inhdr->rpl_segaddr[n], cmpre);
+}
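
cmpri and cmpre above are plain byte counts of the prefix each segment shares with the destination address. A standalone sketch of the comparison, plus a worked example: fe80::1 and fe80::2 agree on the first 15 bytes, so a segment compressed with cmpri = 15 stores only the single byte 0x02.

#include <stdint.h>

static unsigned int common_prefix_bytes(const uint8_t a[16],
					const uint8_t b[16])
{
	unsigned int plen;

	for (plen = 0; plen < 16; plen++)
		if (a[plen] != b[plen])
			break;
	return plen;
}
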
diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c
new file mode 100644
index 000000000000..c3ececd7cfc1
--- /dev/null
+++ b/net/ipv6/rpl_iptunnel.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Authors:
+ * (C) 2020 Alexander Aring <alex.aring@gmail.com>
+ */
+
+#include <linux/rpl_iptunnel.h>
+
+#include <net/dst_cache.h>
+#include <net/ip6_route.h>
+#include <net/lwtunnel.h>
+#include <net/ipv6.h>
+#include <net/rpl.h>
+
+struct rpl_iptunnel_encap {
+ struct ipv6_rpl_sr_hdr srh[0];
+};
+
+struct rpl_lwt {
+ struct dst_cache cache;
+ struct rpl_iptunnel_encap tuninfo;
+};
+
+static inline struct rpl_lwt *rpl_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return (struct rpl_lwt *)lwt->data;
+}
+
+static inline struct rpl_iptunnel_encap *
+rpl_encap_lwtunnel(struct lwtunnel_state *lwt)
+{
+ return &rpl_lwt_lwtunnel(lwt)->tuninfo;
+}
+
+static const struct nla_policy rpl_iptunnel_policy[RPL_IPTUNNEL_MAX + 1] = {
+ [RPL_IPTUNNEL_SRH] = { .type = NLA_BINARY },
+};
+
+static bool rpl_validate_srh(struct net *net, struct ipv6_rpl_sr_hdr *srh,
+ size_t seglen)
+{
+ int err;
+
+ if ((srh->hdrlen << 3) != seglen)
+ return false;
+
+ /* check that there is at least one segment and that seglen matches segments_left */
+ if (!srh->segments_left ||
+ (srh->segments_left * sizeof(struct in6_addr)) != seglen)
+ return false;
+
+ if (srh->cmpri || srh->cmpre)
+ return false;
+
+ err = ipv6_chk_rpl_srh_loop(net, srh->rpl_segaddr,
+ srh->segments_left);
+ if (err)
+ return false;
+
+ if (ipv6_addr_type(&srh->rpl_segaddr[srh->segments_left - 1]) &
+ IPV6_ADDR_MULTICAST)
+ return false;
+
+ return true;
+}
+
+static int rpl_build_state(struct net *net, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[RPL_IPTUNNEL_MAX + 1];
+ struct lwtunnel_state *newts;
+ struct ipv6_rpl_sr_hdr *srh;
+ struct rpl_lwt *rlwt;
+ int err, srh_len;
+
+ if (family != AF_INET6)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, RPL_IPTUNNEL_MAX, nla,
+ rpl_iptunnel_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[RPL_IPTUNNEL_SRH])
+ return -EINVAL;
+
+ srh = nla_data(tb[RPL_IPTUNNEL_SRH]);
+ srh_len = nla_len(tb[RPL_IPTUNNEL_SRH]);
+
+ if (srh_len < sizeof(*srh))
+ return -EINVAL;
+
+ /* verify that SRH is consistent */
+ if (!rpl_validate_srh(net, srh, srh_len - sizeof(*srh)))
+ return -EINVAL;
+
+ newts = lwtunnel_state_alloc(srh_len + sizeof(*rlwt));
+ if (!newts)
+ return -ENOMEM;
+
+ rlwt = rpl_lwt_lwtunnel(newts);
+
+ err = dst_cache_init(&rlwt->cache, GFP_ATOMIC);
+ if (err) {
+ kfree(newts);
+ return err;
+ }
+
+ memcpy(&rlwt->tuninfo.srh, srh, srh_len);
+
+ newts->type = LWTUNNEL_ENCAP_RPL;
+ newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
+ *ts = newts;
+
+ return 0;
+}
+
+static void rpl_destroy_state(struct lwtunnel_state *lwt)
+{
+ dst_cache_destroy(&rpl_lwt_lwtunnel(lwt)->cache);
+}
+
+static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt,
+ const struct ipv6_rpl_sr_hdr *srh)
+{
+ struct ipv6_rpl_sr_hdr *isrh, *csrh;
+ const struct ipv6hdr *oldhdr;
+ struct ipv6hdr *hdr;
+ unsigned char *buf;
+ size_t hdrlen;
+ int err;
+
+ oldhdr = ipv6_hdr(skb);
+
+ buf = kzalloc(ipv6_rpl_srh_alloc_size(srh->segments_left - 1) * 2,
+ GFP_ATOMIC);
+ if (!buf)
+ return -ENOMEM;
+
+ isrh = (struct ipv6_rpl_sr_hdr *)buf;
+ csrh = (struct ipv6_rpl_sr_hdr *)(buf + ((srh->hdrlen + 1) << 3));
+
+ memcpy(isrh, srh, sizeof(*isrh));
+ memcpy(isrh->rpl_segaddr, &srh->rpl_segaddr[1],
+ (srh->segments_left - 1) * 16);
+ isrh->rpl_segaddr[srh->segments_left - 1] = oldhdr->daddr;
+
+ ipv6_rpl_srh_compress(csrh, isrh, &srh->rpl_segaddr[0],
+ isrh->segments_left - 1);
+
+ hdrlen = ((csrh->hdrlen + 1) << 3);
+
+ err = skb_cow_head(skb, hdrlen + skb->mac_len);
+ if (unlikely(err)) {
+ kfree(buf);
+ return err;
+ }
+
+ skb_pull(skb, sizeof(struct ipv6hdr));
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ sizeof(struct ipv6hdr));
+
+ skb_push(skb, sizeof(struct ipv6hdr) + hdrlen);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
+ hdr = ipv6_hdr(skb);
+ memmove(hdr, oldhdr, sizeof(*hdr));
+ isrh = (void *)hdr + sizeof(*hdr);
+ memcpy(isrh, csrh, hdrlen);
+
+ isrh->nexthdr = hdr->nexthdr;
+ hdr->nexthdr = NEXTHDR_ROUTING;
+ hdr->daddr = srh->rpl_segaddr[0];
+
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen);
+
+ kfree(buf);
+
+ return 0;
+}
+
+static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct rpl_iptunnel_encap *tinfo;
+ int err = 0;
+
+ if (skb->protocol != htons(ETH_P_IPV6))
+ return -EINVAL;
+
+ tinfo = rpl_encap_lwtunnel(dst->lwtstate);
+
+ err = rpl_do_srh_inline(skb, rlwt, tinfo->srh);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct dst_entry *dst = NULL;
+ struct rpl_lwt *rlwt;
+ int err;
+
+ rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate);
+
+ err = rpl_do_srh(skb, rlwt);
+ if (unlikely(err))
+ goto drop;
+
+ preempt_disable();
+ dst = dst_cache_get(&rlwt->cache);
+ preempt_enable();
+
+ if (unlikely(!dst)) {
+ struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.daddr = hdr->daddr;
+ fl6.saddr = hdr->saddr;
+ fl6.flowlabel = ip6_flowinfo(hdr);
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_proto = hdr->nexthdr;
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ err = dst->error;
+ dst_release(dst);
+ goto drop;
+ }
+
+ preempt_disable();
+ dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr);
+ preempt_enable();
+ }
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+ if (unlikely(err))
+ goto drop;
+
+ return dst_output(net, sk, skb);
+
+drop:
+ kfree_skb(skb);
+ return err;
+}
+
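
rpl_output() above follows the usual lwtunnel dst_cache discipline: the cache is per-CPU, so both lookup and store run under preempt_disable(), and a miss falls back to a full ip6_route_output() whose result seeds the cache. A condensed kernel-context sketch; route_sketch() is a hypothetical stand-in for the flowi6 setup and route lookup:

static struct dst_entry *get_cached_dst(struct rpl_lwt *rlwt,
					struct net *net, struct sk_buff *skb)
{
	struct dst_entry *dst;
	struct flowi6 fl6;

	preempt_disable();			/* dst_cache is per-CPU */
	dst = dst_cache_get(&rlwt->cache);
	preempt_enable();
	if (dst)
		return dst;

	route_sketch(net, skb, &fl6, &dst);	/* hypothetical full lookup */
	preempt_disable();
	dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr);
	preempt_enable();
	return dst;
}
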
+static int rpl_input(struct sk_buff *skb)
+{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct dst_entry *dst = NULL;
+ struct rpl_lwt *rlwt;
+ int err;
+
+ rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate);
+
+ err = rpl_do_srh(skb, rlwt);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ preempt_disable();
+ dst = dst_cache_get(&rlwt->cache);
+ preempt_enable();
+
+ skb_dst_drop(skb);
+
+ if (!dst) {
+ ip6_route_input(skb);
+ dst = skb_dst(skb);
+ if (!dst->error) {
+ preempt_disable();
+ dst_cache_set_ip6(&rlwt->cache, dst,
+ &ipv6_hdr(skb)->saddr);
+ preempt_enable();
+ }
+ } else {
+ skb_dst_set(skb, dst);
+ }
+
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+ if (unlikely(err))
+ return err;
+
+ return dst_input(skb);
+}
+
+static int nla_put_rpl_srh(struct sk_buff *skb, int attrtype,
+ struct rpl_iptunnel_encap *tuninfo)
+{
+ struct rpl_iptunnel_encap *data;
+ struct nlattr *nla;
+ int len;
+
+ len = RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh);
+
+ nla = nla_reserve(skb, attrtype, len);
+ if (!nla)
+ return -EMSGSIZE;
+
+ data = nla_data(nla);
+ memcpy(data, tuninfo->srh, len);
+
+ return 0;
+}
+
+static int rpl_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{
+ struct rpl_iptunnel_encap *tuninfo = rpl_encap_lwtunnel(lwtstate);
+
+ if (nla_put_rpl_srh(skb, RPL_IPTUNNEL_SRH, tuninfo))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int rpl_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+ struct rpl_iptunnel_encap *tuninfo = rpl_encap_lwtunnel(lwtstate);
+
+ return nla_total_size(RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh));
+}
+
+static int rpl_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+ struct rpl_iptunnel_encap *a_hdr = rpl_encap_lwtunnel(a);
+ struct rpl_iptunnel_encap *b_hdr = rpl_encap_lwtunnel(b);
+ int len = RPL_IPTUNNEL_SRH_SIZE(a_hdr->srh);
+
+ if (len != RPL_IPTUNNEL_SRH_SIZE(b_hdr->srh))
+ return 1;
+
+ return memcmp(a_hdr, b_hdr, len);
+}
+
+static const struct lwtunnel_encap_ops rpl_ops = {
+ .build_state = rpl_build_state,
+ .destroy_state = rpl_destroy_state,
+ .output = rpl_output,
+ .input = rpl_input,
+ .fill_encap = rpl_fill_encap_info,
+ .get_encap_size = rpl_encap_nlsize,
+ .cmp_encap = rpl_encap_cmp,
+ .owner = THIS_MODULE,
+};
+
+int __init rpl_init(void)
+{
+ int err;
+
+ err = lwtunnel_encap_add_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL);
+ if (err)
+ goto out;
+
+ pr_info("RPL Segment Routing with IPv6\n");
+
+ return 0;
+
+out:
+ return err;
+}
+
+void rpl_exit(void)
+{
+ lwtunnel_encap_del_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL);
+}
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index ac837afb9040..c7cbfeae94f5 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -376,7 +376,7 @@ drop:
return err;
}
-static int seg6_build_state(struct nlattr *nla,
+static int seg6_build_state(struct net *net, struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 8165802d8e05..52493423f329 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -970,8 +970,9 @@ static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
return 0;
}
-static int seg6_local_build_state(struct nlattr *nla, unsigned int family,
- const void *cfg, struct lwtunnel_state **ts,
+static int seg6_local_build_state(struct net *net, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[SEG6_LOCAL_MAX + 1];
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5dc439a391fe..7d4151747340 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -843,6 +843,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
struct net *net = dev_net(skb->dev);
struct udphdr *uh;
struct sock *sk;
+ bool refcounted;
u32 ulen = 0;
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
@@ -879,7 +880,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
goto csum_error;
/* Check if the socket is already available, e.g. due to early demux */
- sk = skb_steal_sock(skb);
+ sk = skb_steal_sock(skb, &refcounted);
if (sk) {
struct dst_entry *dst = skb_dst(skb);
int ret;
@@ -888,12 +889,14 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
udp6_sk_rx_dst_set(sk, dst);
if (!uh->check && !udp_sk(sk)->no_check6_rx) {
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
goto report_csum_error;
}
ret = udp6_unicast_rcv_skb(sk, skb, uh);
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return ret;
}
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index e11bdb0aaa15..25b7ebda2fab 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -78,7 +78,7 @@ static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(struct net *net, const
hlist_for_each_entry_rcu(x6spi,
&xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)],
- list_byaddr) {
+ list_byaddr, lockdep_is_held(&xfrm6_tunnel_spi_lock)) {
if (xfrm6_addr_equal(&x6spi->addr, saddr))
return x6spi;
}
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 266d63819415..829dcad69c2c 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -5,7 +5,7 @@
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright(c) 2016 Intel Deutschland GmbH
- * Copyright (C) 2018 - 2019 Intel Corporation
+ * Copyright (C) 2018 - 2020 Intel Corporation
*/
#include <linux/debugfs.h>
@@ -78,6 +78,7 @@ static const char * const sta_flag_names[] = {
FLAG(MPSP_OWNER),
FLAG(MPSP_RECIPIENT),
FLAG(PS_DELIVER),
+ FLAG(USES_ENCRYPTION),
#undef FLAG
};
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 6354491c5a09..8f403c1bb908 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -6,7 +6,7 @@
* Copyright 2007-2008 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright 2015-2017 Intel Deutschland GmbH
- * Copyright 2018-2019 Intel Corporation
+ * Copyright 2018-2020 Intel Corporation
*/
#include <linux/if_ether.h>
@@ -277,22 +277,29 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key)
sta ? sta->sta.addr : bcast_addr, ret);
}
-int ieee80211_set_tx_key(struct ieee80211_key *key)
+static int _ieee80211_set_tx_key(struct ieee80211_key *key, bool force)
{
struct sta_info *sta = key->sta;
struct ieee80211_local *local = key->local;
assert_key_lock(local);
+ set_sta_flag(sta, WLAN_STA_USES_ENCRYPTION);
+
sta->ptk_idx = key->conf.keyidx;
- if (!ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT))
+ if (force || !ieee80211_hw_check(&local->hw, AMPDU_KEYBORDER_SUPPORT))
clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
ieee80211_check_fast_xmit(sta);
return 0;
}
+int ieee80211_set_tx_key(struct ieee80211_key *key)
+{
+ return _ieee80211_set_tx_key(key, false);
+}
+
static void ieee80211_pairwise_rekey(struct ieee80211_key *old,
struct ieee80211_key *new)
{
@@ -481,11 +488,8 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
if (pairwise) {
rcu_assign_pointer(sta->ptk[idx], new);
if (new &&
- !(new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX)) {
- sta->ptk_idx = idx;
- clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
- ieee80211_check_fast_xmit(sta);
- }
+ !(new->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX))
+ _ieee80211_set_tx_key(new, true);
} else {
rcu_assign_pointer(sta->gtk[idx], new);
}
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 8989a94cfe3f..f8d5c2515829 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -1049,6 +1049,11 @@ static void __sta_info_destroy_part2(struct sta_info *sta)
might_sleep();
lockdep_assert_held(&local->sta_mtx);
+ while (sta->sta_state == IEEE80211_STA_AUTHORIZED) {
+ ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC);
+ WARN_ON_ONCE(ret);
+ }
+
/* now keys can no longer be reached */
ieee80211_free_sta_keys(local, sta);
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 364a35414d05..36f1abaab9ff 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -98,6 +98,7 @@ enum ieee80211_sta_info_flags {
WLAN_STA_MPSP_OWNER,
WLAN_STA_MPSP_RECIPIENT,
WLAN_STA_PS_DELIVER,
+ WLAN_STA_USES_ENCRYPTION,
NUM_WLAN_STA_FLAGS,
};
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 49d35936cc9d..82846aca86d9 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -590,10 +590,13 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb);
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)tx->skb->data;
- if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT))
+ if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT)) {
tx->key = NULL;
- else if (tx->sta &&
- (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx])))
+ return TX_CONTINUE;
+ }
+
+ if (tx->sta &&
+ (key = rcu_dereference(tx->sta->ptk[tx->sta->ptk_idx])))
tx->key = key;
else if (ieee80211_is_group_privacy_action(tx->skb) &&
(key = rcu_dereference(tx->sdata->default_multicast_key)))
@@ -654,6 +657,9 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
if (!skip_hw && tx->key &&
tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE)
info->control.hw_key = &tx->key->conf;
+ } else if (!ieee80211_is_mgmt(hdr->frame_control) && tx->sta &&
+ test_sta_flag(tx->sta, WLAN_STA_USES_ENCRYPTION)) {
+ return TX_DROP;
}
return TX_CONTINUE;
@@ -3599,8 +3605,26 @@ begin:
tx.skb = skb;
tx.sdata = vif_to_sdata(info->control.vif);
- if (txq->sta)
+ if (txq->sta) {
tx.sta = container_of(txq->sta, struct sta_info, sta);
+ /*
+ * Drop unicast frames to unauthorised stations unless they are
+ * EAPOL frames from the local station.
+ */
+ if (unlikely(ieee80211_is_data(hdr->frame_control) &&
+ !ieee80211_vif_is_mesh(&tx.sdata->vif) &&
+ tx.sdata->vif.type != NL80211_IFTYPE_OCB &&
+ !is_multicast_ether_addr(hdr->addr1) &&
+ !test_sta_flag(tx.sta, WLAN_STA_AUTHORIZED) &&
+ (!(info->control.flags &
+ IEEE80211_TX_CTRL_PORT_CTRL_PROTO) ||
+ !ether_addr_equal(tx.sdata->vif.addr,
+ hdr->addr2)))) {
+ I802_DEBUG_INC(local->tx_handlers_drop_unauth_port);
+ ieee80211_free_txskb(&local->hw, skb);
+ goto begin;
+ }
+ }
/*
* The key can be removed while the packet was queued, so need to call
@@ -5318,6 +5342,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
struct ieee80211_local *local = sdata->local;
struct sk_buff *skb;
struct ethhdr *ehdr;
+ u32 ctrl_flags = 0;
u32 flags;
/* Only accept CONTROL_PORT_PROTOCOL configured in CONNECT/ASSOCIATE
@@ -5327,6 +5352,9 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
proto != cpu_to_be16(ETH_P_PREAUTH))
return -EINVAL;
+ if (proto == sdata->control_port_protocol)
+ ctrl_flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO;
+
if (unencrypted)
flags = IEEE80211_TX_INTFL_DONT_ENCRYPT;
else
@@ -5352,7 +5380,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev,
skb_reset_mac_header(skb);
local_bh_disable();
- __ieee80211_subif_start_xmit(skb, skb->dev, flags, 0);
+ __ieee80211_subif_start_xmit(skb, skb->dev, flags, ctrl_flags);
local_bh_enable();
return 0;
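
Taken together, the tx.c hunks gate every queued unicast data frame on port authorization: once a station has ever used encryption, unprotected non-management frames are dropped, and the dequeue path refuses data to not-yet-authorized peers. A minimal userspace sketch of the drop predicate added in the dequeue hunk (the struct and field names here are illustrative flags, not the mac80211 structures):

#include <stdbool.h>

struct frame {
        bool is_data;
        bool is_mesh_or_ocb_iface;
        bool is_multicast_dest;
        bool sta_authorized;
        bool is_port_ctrl_proto;        /* e.g. a locally generated EAPOL */
        bool src_is_local_vif;
};

static bool must_drop_unauth(const struct frame *f)
{
        return f->is_data &&
               !f->is_mesh_or_ocb_iface &&
               !f->is_multicast_dest &&
               !f->sta_authorized &&
               /* allow only port-control frames sent by the local vif */
               !(f->is_port_ctrl_proto && f->src_is_local_vif);
}

EAPOL frames sent by the local interface are the one exception, which is what the new IEEE80211_TX_CTRL_PORT_CTRL_PROTO flag set in ieee80211_tx_control_port() conveys, so the 4-way handshake itself can still complete.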
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 44b675016393..2def85718d94 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -162,7 +162,7 @@ drop:
return -EINVAL;
}
-static int mpls_build_state(struct nlattr *nla,
+static int mpls_build_state(struct net *net, struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile
index 4e98d9edfd0a..baa0640527c7 100644
--- a/net/mptcp/Makefile
+++ b/net/mptcp/Makefile
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_MPTCP) += mptcp.o
-mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o
+mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
+ mib.o pm_netlink.o
diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c
index 40d1bb18fd60..c151628bd416 100644
--- a/net/mptcp/crypto.c
+++ b/net/mptcp/crypto.c
@@ -44,8 +44,7 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn)
*idsn = be64_to_cpu(*((__be64 *)&mptcp_hashed_key[6]));
}
-void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
- void *hmac)
+void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)
{
u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE];
__be32 mptcp_hashed_key[SHA256_DIGEST_WORDS];
@@ -55,6 +54,9 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
u8 key2be[8];
int i;
+ if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE))
+ len = SHA256_DIGEST_SIZE;
+
put_unaligned_be64(key1, key1be);
put_unaligned_be64(key2, key2be);
@@ -65,11 +67,10 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
for (i = 0; i < 8; i++)
input[i + 8] ^= key2be[i];
- put_unaligned_be32(nonce1, &input[SHA256_BLOCK_SIZE]);
- put_unaligned_be32(nonce2, &input[SHA256_BLOCK_SIZE + 4]);
+ memcpy(&input[SHA256_BLOCK_SIZE], msg, len);
sha256_init(&state);
- sha256_update(&state, input, SHA256_BLOCK_SIZE + 8);
+ sha256_update(&state, input, SHA256_BLOCK_SIZE + len);
/* emit sha256(K1 || msg) on the second input block, so we can
* reuse 'input' for the last hashing
@@ -125,6 +126,7 @@ static int __init test_mptcp_crypto(void)
char hmac[20], hmac_hex[41];
u32 nonce1, nonce2;
u64 key1, key2;
+ u8 msg[8];
int i, j;
for (i = 0; i < ARRAY_SIZE(tests); ++i) {
@@ -134,7 +136,10 @@ static int __init test_mptcp_crypto(void)
nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0]));
nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4]));
- mptcp_crypto_hmac_sha(key1, key2, nonce1, nonce2, hmac);
+ put_unaligned_be32(nonce1, &msg[0]);
+ put_unaligned_be32(nonce2, &msg[4]);
+
+ mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
for (j = 0; j < 20; ++j)
sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff);
hmac_hex[40] = 0;
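
The signature change moves message formatting to the callers: anything up to SHA256_DIGEST_SIZE bytes can now be authenticated, not just a pair of 32-bit nonces. A compilable sketch of how a caller reproduces the old behaviour against the new prototype (hmac_two_nonces is a hypothetical helper; mptcp_crypto_hmac_sha() is the kernel function patched above, declared here only for illustration):

#include <stdint.h>

void mptcp_crypto_hmac_sha(uint64_t k1, uint64_t k2,
                           uint8_t *msg, int len, void *hmac);

static void hmac_two_nonces(uint64_t k1, uint64_t k2,
                            uint32_t nonce1, uint32_t nonce2, void *hmac)
{
        uint8_t msg[8];

        /* big-endian packing, as the old put_unaligned_be32() calls did */
        msg[0] = nonce1 >> 24; msg[1] = nonce1 >> 16;
        msg[2] = nonce1 >> 8;  msg[3] = nonce1;
        msg[4] = nonce2 >> 24; msg[5] = nonce2 >> 16;
        msg[6] = nonce2 >> 8;  msg[7] = nonce2;

        mptcp_crypto_hmac_sha(k1, k2, msg, 8, hmac);
}

This is exactly the translation the updated self-test performs with its put_unaligned_be32() calls; the generalization is what lets ADD_ADDR later feed an id-plus-address message into the same primitive.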
diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c
new file mode 100644
index 000000000000..a536586742f2
--- /dev/null
+++ b/net/mptcp/diag.c
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: GPL-2.0
+/* MPTCP socket monitoring support
+ *
+ * Copyright (c) 2019 Red Hat
+ *
+ * Author: Davide Caratti <dcaratti@redhat.com>
+ */
+
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/inet_diag.h>
+#include <net/netlink.h>
+#include <uapi/linux/mptcp.h>
+#include "protocol.h"
+
+static int subflow_get_info(const struct sock *sk, struct sk_buff *skb)
+{
+ struct mptcp_subflow_context *sf;
+ struct nlattr *start;
+ u32 flags = 0;
+ int err;
+
+ start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP);
+ if (!start)
+ return -EMSGSIZE;
+
+ rcu_read_lock();
+ sf = rcu_dereference(inet_csk(sk)->icsk_ulp_data);
+ if (!sf) {
+ err = 0;
+ goto nla_failure;
+ }
+
+ if (sf->mp_capable)
+ flags |= MPTCP_SUBFLOW_FLAG_MCAP_REM;
+ if (sf->request_mptcp)
+ flags |= MPTCP_SUBFLOW_FLAG_MCAP_LOC;
+ if (sf->mp_join)
+ flags |= MPTCP_SUBFLOW_FLAG_JOIN_REM;
+ if (sf->request_join)
+ flags |= MPTCP_SUBFLOW_FLAG_JOIN_LOC;
+ if (sf->backup)
+ flags |= MPTCP_SUBFLOW_FLAG_BKUP_REM;
+ if (sf->request_bkup)
+ flags |= MPTCP_SUBFLOW_FLAG_BKUP_LOC;
+ if (sf->fully_established)
+ flags |= MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED;
+ if (sf->conn_finished)
+ flags |= MPTCP_SUBFLOW_FLAG_CONNECTED;
+ if (sf->map_valid)
+ flags |= MPTCP_SUBFLOW_FLAG_MAPVALID;
+
+ if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_REM, sf->remote_token) ||
+ nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_LOC, sf->token) ||
+ nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ,
+ sf->rel_write_seq) ||
+ nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, sf->map_seq,
+ MPTCP_SUBFLOW_ATTR_PAD) ||
+ nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ,
+ sf->map_subflow_seq) ||
+ nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, sf->ssn_offset) ||
+ nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN,
+ sf->map_data_len) ||
+ nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_FLAGS, flags) ||
+ nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_REM, sf->remote_id) ||
+ nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, sf->local_id)) {
+ err = -EMSGSIZE;
+ goto nla_failure;
+ }
+
+ rcu_read_unlock();
+ nla_nest_end(skb, start);
+ return 0;
+
+nla_failure:
+ rcu_read_unlock();
+ nla_nest_cancel(skb, start);
+ return err;
+}
+
+static size_t subflow_get_info_size(const struct sock *sk)
+{
+ size_t size = 0;
+
+ size += nla_total_size(0) + /* INET_ULP_INFO_MPTCP */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_REM */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_LOC */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */
+ nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */
+ nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */
+ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_FLAGS */
+ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_REM */
+ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_LOC */
+ 0;
+ return size;
+}
+
+void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops)
+{
+ ops->get_info = subflow_get_info;
+ ops->get_info_size = subflow_get_info_size;
+}
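
These two callbacks plug into the subflow's tcp_ulp_ops so that the inet_diag machinery (ss -i and friends) can render per-subflow MPTCP state. The wiring itself lives in subflow.c rather than in this diff; roughly like the following (a hedged sketch of the registration, not code from this patch):

static struct tcp_ulp_ops subflow_ulp_ops = {
        .name  = "mptcp",
        .owner = THIS_MODULE,
        /* .init, .clone, .release, ... */
};

void __init mptcp_subflow_init(void)
{
        /* fills .get_info and .get_info_size with the functions above */
        mptcp_diag_subflow_init(&subflow_ulp_ops);

        if (tcp_register_ulp(&subflow_ulp_ops) != 0)
                panic("MPTCP: failed to register subflow ULP\n");
}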
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
new file mode 100644
index 000000000000..0a6a15f3456d
--- /dev/null
+++ b/net/mptcp/mib.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/seq_file.h>
+#include <net/ip.h>
+#include <net/mptcp.h>
+#include <net/snmp.h>
+#include <net/net_namespace.h>
+
+#include "mib.h"
+
+static const struct snmp_mib mptcp_snmp_list[] = {
+ SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE),
+ SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK),
+ SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK),
+ SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK),
+ SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS),
+ SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN),
+ SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX),
+ SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX),
+ SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC),
+ SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX),
+ SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC),
+ SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH),
+ SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX),
+ SNMP_MIB_SENTINEL
+};
+
+/* mptcp_mib_alloc - allocate percpu mib counters
+ *
+ * These are allocated when the first mptcp socket is created so
+ * we do not waste percpu memory if mptcp isn't in use.
+ */
+bool mptcp_mib_alloc(struct net *net)
+{
+ struct mptcp_mib __percpu *mib = alloc_percpu(struct mptcp_mib);
+
+ if (!mib)
+ return false;
+
+ if (cmpxchg(&net->mib.mptcp_statistics, NULL, mib))
+ free_percpu(mib);
+
+ return true;
+}
+
+void mptcp_seq_show(struct seq_file *seq)
+{
+ struct net *net = seq->private;
+ int i;
+
+ seq_puts(seq, "MPTcpExt:");
+ for (i = 0; mptcp_snmp_list[i].name; i++)
+ seq_printf(seq, " %s", mptcp_snmp_list[i].name);
+
+ seq_puts(seq, "\nMPTcpExt:");
+
+ if (!net->mib.mptcp_statistics) {
+ for (i = 0; mptcp_snmp_list[i].name; i++)
+ seq_puts(seq, " 0");
+
+ return;
+ }
+
+ for (i = 0; mptcp_snmp_list[i].name; i++)
+ seq_printf(seq, " %lu",
+ snmp_fold_field(net->mib.mptcp_statistics,
+ mptcp_snmp_list[i].entry));
+ seq_putc(seq, '\n');
+}
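
mptcp_mib_alloc() resolves allocation races with cmpxchg(): two CPUs may both allocate, one pointer wins, the loser frees its copy, and both report success. The same allocate-once idiom in portable C11 (a userspace analogue, not kernel code):

#include <stdatomic.h>
#include <stdlib.h>

struct mib { unsigned long ctr[16]; };

static _Atomic(struct mib *) mib_ptr;

static struct mib *mib_get_or_alloc(void)
{
        struct mib *expected = NULL;
        struct mib *fresh = calloc(1, sizeof(*fresh));

        if (!fresh)
                return atomic_load(&mib_ptr);  /* a racer may have set it */

        if (!atomic_compare_exchange_strong(&mib_ptr, &expected, fresh))
                free(fresh);                   /* lost the race, use winner's */

        return atomic_load(&mib_ptr);
}

The lazy allocation is the point: namespaces that never create an MPTCP socket pay no percpu memory, and mptcp_seq_show() prints all-zero counters when the pointer was never populated.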
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
new file mode 100644
index 000000000000..d7de340fc997
--- /dev/null
+++ b/net/mptcp/mib.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+enum linux_mptcp_mib_field {
+ MPTCP_MIB_NUM = 0,
+ MPTCP_MIB_MPCAPABLEPASSIVE, /* Received SYN with MP_CAPABLE */
+ MPTCP_MIB_MPCAPABLEPASSIVEACK, /* Received third ACK with MP_CAPABLE */
+ MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */
+ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */
+ MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */
+ MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */
+ MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */
+ MPTCP_MIB_JOINSYNACKRX, /* Received a SYN/ACK + MP_JOIN */
+ MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */
+ MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */
+ MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */
+ MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */
+ MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */
+ __MPTCP_MIB_MAX
+};
+
+#define LINUX_MIB_MPTCP_MAX __MPTCP_MIB_MAX
+struct mptcp_mib {
+ unsigned long mibs[LINUX_MIB_MPTCP_MAX];
+};
+
+static inline void MPTCP_INC_STATS(struct net *net,
+ enum linux_mptcp_mib_field field)
+{
+ if (likely(net->mib.mptcp_statistics))
+ SNMP_INC_STATS(net->mib.mptcp_statistics, field);
+}
+
+static inline void __MPTCP_INC_STATS(struct net *net,
+ enum linux_mptcp_mib_field field)
+{
+ if (likely(net->mib.mptcp_statistics))
+ __SNMP_INC_STATS(net->mib.mptcp_statistics, field);
+}
+
+bool mptcp_mib_alloc(struct net *net);
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index aea1a62d9999..faf57585b892 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -4,6 +4,8 @@
* Copyright (c) 2017 - 2019, Intel Corporation.
*/
+#define pr_fmt(fmt) "MPTCP: " fmt
+
#include <linux/kernel.h>
#include <net/tcp.h>
#include <net/mptcp.h>
@@ -96,6 +98,38 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,
mp_opt->rcvr_key, mp_opt->data_len);
break;
+ case MPTCPOPT_MP_JOIN:
+ mp_opt->mp_join = 1;
+ if (opsize == TCPOLEN_MPTCP_MPJ_SYN) {
+ mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
+ mp_opt->join_id = *ptr++;
+ mp_opt->token = get_unaligned_be32(ptr);
+ ptr += 4;
+ mp_opt->nonce = get_unaligned_be32(ptr);
+ ptr += 4;
+ pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u",
+ mp_opt->backup, mp_opt->join_id,
+ mp_opt->token, mp_opt->nonce);
+ } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) {
+ mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
+ mp_opt->join_id = *ptr++;
+ mp_opt->thmac = get_unaligned_be64(ptr);
+ ptr += 8;
+ mp_opt->nonce = get_unaligned_be32(ptr);
+ ptr += 4;
+ pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u",
+ mp_opt->backup, mp_opt->join_id,
+ mp_opt->thmac, mp_opt->nonce);
+ } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) {
+ ptr += 2;
+ memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN);
+ pr_debug("MP_JOIN hmac");
+ } else {
+ pr_warn("MP_JOIN bad option size");
+ mp_opt->mp_join = 0;
+ }
+ break;
+
case MPTCPOPT_DSS:
pr_debug("DSS");
ptr++;
@@ -178,6 +212,71 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr,
break;
+ case MPTCPOPT_ADD_ADDR:
+ mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO;
+ if (!mp_opt->echo) {
+ if (opsize == TCPOLEN_MPTCP_ADD_ADDR ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT)
+ mp_opt->family = MPTCP_ADDR_IPVERSION_4;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT)
+ mp_opt->family = MPTCP_ADDR_IPVERSION_6;
+#endif
+ else
+ break;
+ } else {
+ if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT)
+ mp_opt->family = MPTCP_ADDR_IPVERSION_4;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT)
+ mp_opt->family = MPTCP_ADDR_IPVERSION_6;
+#endif
+ else
+ break;
+ }
+
+ mp_opt->add_addr = 1;
+ mp_opt->port = 0;
+ mp_opt->addr_id = *ptr++;
+ pr_debug("ADD_ADDR: id=%d", mp_opt->addr_id);
+ if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) {
+ memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4);
+ ptr += 4;
+ if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) {
+ mp_opt->port = get_unaligned_be16(ptr);
+ ptr += 2;
+ }
+ }
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else {
+ memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16);
+ ptr += 16;
+ if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT ||
+ opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) {
+ mp_opt->port = get_unaligned_be16(ptr);
+ ptr += 2;
+ }
+ }
+#endif
+ if (!mp_opt->echo) {
+ mp_opt->ahmac = get_unaligned_be64(ptr);
+ ptr += 8;
+ }
+ break;
+
+ case MPTCPOPT_RM_ADDR:
+ if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE)
+ break;
+
+ mp_opt->rm_addr = 1;
+ mp_opt->rm_id = *ptr++;
+ pr_debug("RM_ADDR: id=%d", mp_opt->rm_id);
+ break;
+
default:
break;
}
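
For reference while reading the parser above, the three MP_JOIN layouts it accepts, expressed as packed structs (illustrative only; the kernel deliberately parses byte-by-byte to avoid alignment assumptions). The sizes are what TCPOLEN_MPTCP_MPJ_SYN/SYNACK/ACK encode: 12, 16 and 24 bytes. The B (backup) bit shares the third byte with the subtype nibble, hence the `*ptr++ & MPTCPOPT_BACKUP` above.

#include <stdint.h>

struct mpj_syn {                /* SYN: 12 bytes */
        uint8_t  kind, len;     /* TCP option header */
        uint8_t  subtype_backup;/* subtype nibble + B flag */
        uint8_t  join_id;
        uint32_t token;         /* receiver's token, big endian */
        uint32_t nonce;         /* sender's random nonce */
} __attribute__((packed));

struct mpj_synack {             /* SYN/ACK: 16 bytes */
        uint8_t  kind, len, subtype_backup, join_id;
        uint64_t thmac;         /* truncated HMAC */
        uint32_t nonce;
} __attribute__((packed));

struct mpj_ack {                /* 3rd ACK: 24 bytes */
        uint8_t  kind, len, subtype, rsv;
        uint8_t  hmac[20];      /* HMAC-SHA256 truncated to 160 bits */
} __attribute__((packed));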
@@ -231,6 +330,16 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
opts->sndr_key = subflow->local_key;
*size = TCPOLEN_MPTCP_MPC_SYN;
return true;
+ } else if (subflow->request_join) {
+ pr_debug("remote_token=%u, nonce=%u", subflow->remote_token,
+ subflow->local_nonce);
+ opts->suboptions = OPTION_MPTCP_MPJ_SYN;
+ opts->join_id = subflow->local_id;
+ opts->token = subflow->remote_token;
+ opts->nonce = subflow->local_nonce;
+ opts->backup = subflow->request_bkup;
+ *size = TCPOLEN_MPTCP_MPJ_SYN;
+ return true;
}
return false;
}
@@ -240,16 +349,55 @@ void mptcp_rcv_synsent(struct sock *sk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct tcp_sock *tp = tcp_sk(sk);
- pr_debug("subflow=%p", subflow);
if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) {
subflow->mp_capable = 1;
subflow->can_ack = 1;
subflow->remote_key = tp->rx_opt.mptcp.sndr_key;
- } else {
+ pr_debug("subflow=%p, remote_key=%llu", subflow,
+ subflow->remote_key);
+ } else if (subflow->request_join && tp->rx_opt.mptcp.mp_join) {
+ subflow->mp_join = 1;
+ subflow->thmac = tp->rx_opt.mptcp.thmac;
+ subflow->remote_nonce = tp->rx_opt.mptcp.nonce;
+ pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow,
+ subflow->thmac, subflow->remote_nonce);
+ } else if (subflow->request_mptcp) {
tcp_sk(sk)->is_mptcp = 0;
}
}
+/* An MP_JOIN client subflow must wait for the 4th ack before sending any data:
+ * TCP can't schedule the delack timer before the subflow is fully established.
+ * MPTCP uses the delack timer to do the 3rd ack retransmissions
+ */
+static void schedule_3rdack_retransmission(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long timeout;
+
+ /* (re)schedule with a timeout above the RTT, as we only need to detect a drop */
+ if (tp->srtt_us)
+ timeout = tp->srtt_us << 1;
+ else
+ timeout = TCP_TIMEOUT_INIT;
+
+ WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER);
+ icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+ icsk->icsk_ack.timeout = timeout;
+ sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
+}
+
+static void clear_3rdack_retransmission(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk_stop_timer(sk, &icsk->icsk_delack_timer);
+ icsk->icsk_ack.timeout = 0;
+ icsk->icsk_ack.ato = 0;
+ icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER);
+}
+
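
The timeout choice above in isolation: anything comfortably above one RTT works, since the timer only needs to notice that the 3rd ack was lost (a sketch; units are whatever the caller's clock uses, jiffies in the kernel):

#include <stdint.h>

#define TCP_TIMEOUT_INIT 1000   /* stand-in for the kernel constant */

static uint64_t third_ack_timeout(uint32_t srtt)
{
        /* twice the smoothed RTT when a sample exists, else the
         * initial TCP retransmission timeout
         */
        return srtt ? (uint64_t)srtt << 1 : TCP_TIMEOUT_INIT;
}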
static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
unsigned int *size,
unsigned int remaining,
@@ -259,17 +407,21 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
struct mptcp_ext *mpext;
unsigned int data_len;
- pr_debug("subflow=%p fully established=%d seq=%x:%x remaining=%d",
- subflow, subflow->fully_established, subflow->snd_isn,
- skb ? TCP_SKB_CB(skb)->seq : 0, remaining);
+ /* When skb is not available, we better over-estimate the emitted
+ * options len. A full DSS option (28 bytes) is longer than
+ * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so
+ * tell the caller to defer the estimate to
+ * mptcp_established_options_dss(), which will reserve enough space.
+ */
+ if (!skb)
+ return false;
+
+ /* MPC/MPJ needed only on 3rd ack packet */
+ if (subflow->fully_established ||
+ subflow->snd_isn != TCP_SKB_CB(skb)->seq)
+ return false;
- if (subflow->mp_capable && !subflow->fully_established && skb &&
- subflow->snd_isn == TCP_SKB_CB(skb)->seq) {
- /* When skb is not available, we better over-estimate the
- * emitted options len. A full DSS option is longer than
- * TCPOLEN_MPTCP_MPC_ACK_DATA, so let's the caller try to fit
- * that.
- */
+ if (subflow->mp_capable) {
mpext = mptcp_get_ext(skb);
data_len = mpext ? mpext->data_len : 0;
@@ -297,6 +449,14 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
data_len);
return true;
+ } else if (subflow->mp_join) {
+ opts->suboptions = OPTION_MPTCP_MPJ_ACK;
+ memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN);
+ *size = TCPOLEN_MPTCP_MPJ_ACK;
+ pr_debug("subflow=%p", subflow);
+
+ schedule_3rdack_retransmission(sk);
+ return true;
}
return false;
}
@@ -386,6 +546,83 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
return true;
}
+static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,
+ struct in_addr *addr)
+{
+ u8 hmac[MPTCP_ADDR_HMAC_LEN];
+ u8 msg[7];
+
+ msg[0] = addr_id;
+ memcpy(&msg[1], &addr->s_addr, 4);
+ msg[5] = 0;
+ msg[6] = 0;
+
+ mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac);
+
+ return get_unaligned_be64(hmac);
+}
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
+ struct in6_addr *addr)
+{
+ u8 hmac[MPTCP_ADDR_HMAC_LEN];
+ u8 msg[19];
+
+ msg[0] = addr_id;
+ memcpy(&msg[1], &addr->s6_addr, 16);
+ msg[17] = 0;
+ msg[18] = 0;
+
+ mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac);
+
+ return get_unaligned_be64(hmac);
+}
+#endif
+
+static bool mptcp_established_options_addr(struct sock *sk,
+ unsigned int *size,
+ unsigned int remaining,
+ struct mptcp_out_options *opts)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ struct mptcp_addr_info saddr;
+ int len;
+
+ if (!mptcp_pm_should_signal(msk) ||
+ !(mptcp_pm_addr_signal(msk, remaining, &saddr)))
+ return false;
+
+ len = mptcp_add_addr_len(saddr.family);
+ if (remaining < len)
+ return false;
+
+ *size = len;
+ opts->addr_id = saddr.id;
+ if (saddr.family == AF_INET) {
+ opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
+ opts->addr = saddr.addr;
+ opts->ahmac = add_addr_generate_hmac(msk->local_key,
+ msk->remote_key,
+ opts->addr_id,
+ &opts->addr);
+ }
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (saddr.family == AF_INET6) {
+ opts->suboptions |= OPTION_MPTCP_ADD_ADDR6;
+ opts->addr6 = saddr.addr6;
+ opts->ahmac = add_addr6_generate_hmac(msk->local_key,
+ msk->remote_key,
+ opts->addr_id,
+ &opts->addr6);
+ }
+#endif
+ pr_debug("addr_id=%d, ahmac=%llu", opts->addr_id, opts->ahmac);
+
+ return true;
+}
+
bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
unsigned int *size, unsigned int remaining,
struct mptcp_out_options *opts)
@@ -393,6 +630,8 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
unsigned int opt_size = 0;
bool ret = false;
+ opts->suboptions = 0;
+
if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts))
ret = true;
else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining,
@@ -407,6 +646,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
*size += opt_size;
remaining -= opt_size;
+ if (mptcp_established_options_addr(sk, &opt_size, remaining, opts)) {
+ *size += opt_size;
+ remaining -= opt_size;
+ ret = true;
+ }
return ret;
}
@@ -423,54 +667,194 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
pr_debug("subflow_req=%p, local_key=%llu",
subflow_req, subflow_req->local_key);
return true;
+ } else if (subflow_req->mp_join) {
+ opts->suboptions = OPTION_MPTCP_MPJ_SYNACK;
+ opts->backup = subflow_req->backup;
+ opts->join_id = subflow_req->local_id;
+ opts->thmac = subflow_req->thmac;
+ opts->nonce = subflow_req->local_nonce;
+ pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u",
+ subflow_req, opts->backup, opts->join_id,
+ opts->thmac, opts->nonce);
+ *size = TCPOLEN_MPTCP_MPJ_SYNACK;
+ return true;
}
return false;
}
-static bool check_fully_established(struct mptcp_subflow_context *subflow,
+static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk,
+ struct mptcp_subflow_context *subflow,
struct sk_buff *skb,
struct mptcp_options_received *mp_opt)
{
/* here we can process OoO, in-window pkts, only in-sequence 4th ack
- * are relevant
+ * will make the subflow fully established
*/
- if (likely(subflow->fully_established ||
- TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1))
- return true;
+ if (likely(subflow->fully_established)) {
+ /* on passive sockets, check for 3rd ack retransmission
+ * note that msk is always set by subflow_syn_recv_sock()
+ * for mp_join subflows
+ */
+ if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 &&
+ TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq &&
+ subflow->mp_join && mp_opt->mp_join &&
+ READ_ONCE(msk->pm.server_side))
+ tcp_send_ack(sk);
+ goto fully_established;
+ }
- if (mp_opt->use_ack)
+ /* we should process OoO packets before the first subflow is fully
+ * established, though that is not expected for MP_JOIN subflows
+ */
+ if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)
+ return subflow->mp_capable;
+
+ if (mp_opt->use_ack) {
+ /* subflows are fully established as soon as we get any
+ * additional ack.
+ */
subflow->fully_established = 1;
+ goto fully_established;
+ }
- if (subflow->can_ack)
- return true;
+ WARN_ON_ONCE(subflow->can_ack);
/* If the first established packet does not contain MP_CAPABLE + data
* then fallback to TCP
*/
if (!mp_opt->mp_capable) {
subflow->mp_capable = 0;
- tcp_sk(mptcp_subflow_tcp_sock(subflow))->is_mptcp = 0;
+ tcp_sk(sk)->is_mptcp = 0;
return false;
}
+
+ subflow->fully_established = 1;
subflow->remote_key = mp_opt->sndr_key;
subflow->can_ack = 1;
+
+fully_established:
+ if (likely(subflow->pm_notified))
+ return true;
+
+ subflow->pm_notified = 1;
+ if (subflow->mp_join) {
+ clear_3rdack_retransmission(sk);
+ mptcp_pm_subflow_established(msk, subflow);
+ } else {
+ mptcp_pm_fully_established(msk);
+ }
return true;
}
+static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
+{
+ u32 old_ack32, cur_ack32;
+
+ if (use_64bit)
+ return cur_ack;
+
+ old_ack32 = (u32)old_ack;
+ cur_ack32 = (u32)cur_ack;
+ cur_ack = (old_ack & GENMASK_ULL(63, 32)) + cur_ack32;
+ if (unlikely(before(cur_ack32, old_ack32)))
+ return cur_ack + (1LL << 32);
+ return cur_ack;
+}
+
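
A compilable demo of the expansion helper exactly as written above. Note the deliberately conservative design the next function's comment spells out: a serially-older 32-bit ack is expanded upward and then typically discarded by the after64(new_snd_una, write_seq) check in update_una(), since wrongly acking future data would be worse than ignoring an ack.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static bool before(uint32_t seq1, uint32_t seq2)
{
        return (int32_t)(seq1 - seq2) < 0;      /* serial-number compare */
}

static uint64_t expand_ack(uint64_t old_ack, uint64_t cur_ack, bool use_64bit)
{
        uint32_t old32 = (uint32_t)old_ack, cur32 = (uint32_t)cur_ack;

        if (use_64bit)
                return cur_ack;

        cur_ack = (old_ack & 0xffffffff00000000ULL) + cur32;
        if (before(cur32, old32))
                return cur_ack + (1ULL << 32);
        return cur_ack;
}

int main(void)
{
        /* 64 bit acks pass straight through */
        assert(expand_ack(0x100000010ULL, 0x200000000ULL, true) == 0x200000000ULL);
        /* 32 bit ack advancing inside the current 4G window */
        assert(expand_ack(0x100000010ULL, 0x20, false) == 0x100000020ULL);
        /* serially-older 32 bit ack gets pushed one window up; update_una()
         * then drops it when it lands beyond write_seq
         */
        assert(expand_ack(0x100000010ULL, 0xfffffff0, false) == 0x2fffffff0ULL);
        return 0;
}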
+static void update_una(struct mptcp_sock *msk,
+ struct mptcp_options_received *mp_opt)
+{
+ u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una);
+ u64 write_seq = READ_ONCE(msk->write_seq);
+
+ /* avoid ack expansion on update conflict, to reduce the risk of
+ * wrongly expanding to a future ack sequence number, which is way
+ * more dangerous than missing an ack
+ */
+ new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
+
+ /* ACK for data not even sent yet? Ignore. */
+ if (after64(new_snd_una, write_seq))
+ new_snd_una = old_snd_una;
+
+ while (after64(new_snd_una, old_snd_una)) {
+ snd_una = old_snd_una;
+ old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una,
+ new_snd_una);
+ if (old_snd_una == snd_una) {
+ mptcp_data_acked((struct sock *)msk);
+ break;
+ }
+ }
+}
+
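
The cmpxchg loop above in portable form (a userspace analogue using C11 atomics; the kernel version additionally compares with after64(), so sequence wrap-around is handled, and wakes senders via mptcp_data_acked() on success):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static _Atomic uint64_t snd_una;

/* returns true if we advanced the counter and should wake the sender */
static bool advance_snd_una(uint64_t new_val)
{
        uint64_t old = atomic_load(&snd_una);

        while (new_val > old) {
                if (atomic_compare_exchange_weak(&snd_una, &old, new_val))
                        return true;
                /* 'old' now holds the racing writer's value; retry or stop */
        }
        return false;
}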
+static bool add_addr_hmac_valid(struct mptcp_sock *msk,
+ struct mptcp_options_received *mp_opt)
+{
+ u64 hmac = 0;
+
+ if (mp_opt->echo)
+ return true;
+
+ if (mp_opt->family == MPTCP_ADDR_IPVERSION_4)
+ hmac = add_addr_generate_hmac(msk->remote_key,
+ msk->local_key,
+ mp_opt->addr_id, &mp_opt->addr);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else
+ hmac = add_addr6_generate_hmac(msk->remote_key,
+ msk->local_key,
+ mp_opt->addr_id, &mp_opt->addr6);
+#endif
+
+ pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
+ msk, (unsigned long long)hmac,
+ (unsigned long long)mp_opt->ahmac);
+
+ return hmac == mp_opt->ahmac;
+}
+
void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
struct tcp_options_received *opt_rx)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
struct mptcp_options_received *mp_opt;
struct mptcp_ext *mpext;
mp_opt = &opt_rx->mptcp;
- if (!check_fully_established(subflow, skb, mp_opt))
+ if (!check_fully_established(msk, sk, subflow, skb, mp_opt))
return;
+ if (mp_opt->add_addr && add_addr_hmac_valid(msk, mp_opt)) {
+ struct mptcp_addr_info addr;
+
+ addr.port = htons(mp_opt->port);
+ addr.id = mp_opt->addr_id;
+ if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) {
+ addr.family = AF_INET;
+ addr.addr = mp_opt->addr;
+ }
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (mp_opt->family == MPTCP_ADDR_IPVERSION_6) {
+ addr.family = AF_INET6;
+ addr.addr6 = mp_opt->addr6;
+ }
+#endif
+ if (!mp_opt->echo)
+ mptcp_pm_add_addr_received(msk, &addr);
+ mp_opt->add_addr = 0;
+ }
+
if (!mp_opt->dss)
return;
+ /* we can't wait for recvmsg() to update the ack_seq, otherwise
+ * unidirectional flows would get stuck
+ */
+ if (mp_opt->use_ack)
+ update_una(msk, mp_opt);
+
mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
if (!mpext)
return;
@@ -497,12 +881,6 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb,
mpext->use_map = 1;
}
- if (mp_opt->use_ack) {
- mpext->data_ack = mp_opt->data_ack;
- mpext->use_ack = 1;
- mpext->ack64 = mp_opt->ack64;
- }
-
mpext->data_fin = mp_opt->data_fin;
}
@@ -521,10 +899,9 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
else
len = TCPOLEN_MPTCP_MPC_ACK;
- *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) |
- (MPTCPOPT_MP_CAPABLE << 12) |
- (MPTCP_SUPPORTED_VERSION << 8) |
- MPTCP_CAP_HMAC_SHA256);
+ *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len,
+ MPTCP_SUPPORTED_VERSION,
+ MPTCP_CAP_HMAC_SHA256);
if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
opts->suboptions))
@@ -546,6 +923,77 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts)
}
mp_capable_done:
+ if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
+ if (opts->ahmac)
+ *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
+ TCPOLEN_MPTCP_ADD_ADDR, 0,
+ opts->addr_id);
+ else
+ *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
+ TCPOLEN_MPTCP_ADD_ADDR_BASE,
+ MPTCP_ADDR_ECHO,
+ opts->addr_id);
+ memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4);
+ ptr += 1;
+ if (opts->ahmac) {
+ put_unaligned_be64(opts->ahmac, ptr);
+ ptr += 2;
+ }
+ }
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) {
+ if (opts->ahmac)
+ *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
+ TCPOLEN_MPTCP_ADD_ADDR6, 0,
+ opts->addr_id);
+ else
+ *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
+ TCPOLEN_MPTCP_ADD_ADDR6_BASE,
+ MPTCP_ADDR_ECHO,
+ opts->addr_id);
+ memcpy((u8 *)ptr, opts->addr6.s6_addr, 16);
+ ptr += 4;
+ if (opts->ahmac) {
+ put_unaligned_be64(opts->ahmac, ptr);
+ ptr += 2;
+ }
+ }
+#endif
+
+ if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
+ *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR,
+ TCPOLEN_MPTCP_RM_ADDR_BASE,
+ 0, opts->rm_id);
+ }
+
+ if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
+ *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+ TCPOLEN_MPTCP_MPJ_SYN,
+ opts->backup, opts->join_id);
+ put_unaligned_be32(opts->token, ptr);
+ ptr += 1;
+ put_unaligned_be32(opts->nonce, ptr);
+ ptr += 1;
+ }
+
+ if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
+ *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+ TCPOLEN_MPTCP_MPJ_SYNACK,
+ opts->backup, opts->join_id);
+ put_unaligned_be64(opts->thmac, ptr);
+ ptr += 2;
+ put_unaligned_be32(opts->nonce, ptr);
+ ptr += 1;
+ }
+
+ if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) {
+ *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
+ TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
+ memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
+ ptr += 5;
+ }
+
if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
struct mptcp_ext *mpext = &opts->ext_copy;
u8 len = TCPOLEN_MPTCP_DSS_BASE;
@@ -567,10 +1015,7 @@ mp_capable_done:
flags |= MPTCP_DSS_DATA_FIN;
}
- *ptr++ = htonl((TCPOPT_MPTCP << 24) |
- (len << 16) |
- (MPTCPOPT_DSS << 12) |
- (flags));
+ *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);
if (mpext->use_ack) {
put_unaligned_be64(mpext->data_ack, ptr);
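
All of the new writers above share the same one-word option header; the open-coded htonl() visible in the removed MP_CAPABLE and DSS lines is what the helper folds away. A standalone rendering that matches the packing shown in those removed lines (the real helper lives in protocol.h, not in this diff):

#include <arpa/inet.h>
#include <stdint.h>

#define TCPOPT_MPTCP 30         /* TCP option kind assigned to MPTCP */

/* kind | length | 4-bit subtype | 4-bit nibble | 8-bit field */
static uint32_t mptcp_option(uint8_t subopt, uint8_t len,
                             uint8_t nib, uint8_t field)
{
        return htonl(((uint32_t)TCPOPT_MPTCP << 24) | ((uint32_t)len << 16) |
                     ((uint32_t)subopt << 12) | ((uint32_t)(nib & 0xf) << 8) |
                     field);
}

For MP_CAPABLE the nibble carries the protocol version and the field the crypto flags; for DSS the nibble is zero and the field holds the DSS flags, exactly as the two replaced htonl() expressions did.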
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
new file mode 100644
index 000000000000..977d9c8b1453
--- /dev/null
+++ b/net/mptcp/pm.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2019, Intel Corporation.
+ */
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/kernel.h>
+#include <net/tcp.h>
+#include <net/mptcp.h>
+#include "protocol.h"
+
+static struct workqueue_struct *pm_wq;
+
+/* path manager command handlers */
+
+int mptcp_pm_announce_addr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
+{
+ pr_debug("msk=%p, local_id=%d", msk, addr->id);
+
+ msk->pm.local = *addr;
+ WRITE_ONCE(msk->pm.addr_signal, true);
+ return 0;
+}
+
+int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id)
+{
+ return -ENOTSUPP;
+}
+
+int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id)
+{
+ return -ENOTSUPP;
+}
+
+/* path manager event handlers */
+
+void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side);
+
+ WRITE_ONCE(pm->server_side, server_side);
+}
+
+bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+ int ret;
+
+ pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows,
+ pm->subflows_max, READ_ONCE(pm->accept_subflow));
+
+ /* try to avoid acquiring the lock below */
+ if (!READ_ONCE(pm->accept_subflow))
+ return false;
+
+ spin_lock_bh(&pm->lock);
+ ret = pm->subflows < pm->subflows_max;
+ if (ret && ++pm->subflows == pm->subflows_max)
+ WRITE_ONCE(pm->accept_subflow, false);
+ spin_unlock_bh(&pm->lock);
+
+ return ret;
+}
+
+/* return true if the new status bit is currently cleared, that is, this event
+ * can be served, possibly by an already scheduled work
+ */
+static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
+ enum mptcp_pm_status new_status)
+{
+ pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status,
+ BIT(new_status));
+ if (msk->pm.status & BIT(new_status))
+ return false;
+
+ msk->pm.status |= BIT(new_status);
+ if (queue_work(pm_wq, &msk->pm.work))
+ sock_hold((struct sock *)msk);
+ return true;
+}
+
+void mptcp_pm_fully_established(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ pr_debug("msk=%p", msk);
+
+ /* try to avoid acquiring the lock below */
+ if (!READ_ONCE(pm->work_pending))
+ return;
+
+ spin_lock_bh(&pm->lock);
+
+ if (READ_ONCE(pm->work_pending))
+ mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED);
+
+ spin_unlock_bh(&pm->lock);
+}
+
+void mptcp_pm_connection_closed(struct mptcp_sock *msk)
+{
+ pr_debug("msk=%p", msk);
+}
+
+void mptcp_pm_subflow_established(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ pr_debug("msk=%p", msk);
+
+ if (!READ_ONCE(pm->work_pending))
+ return;
+
+ spin_lock_bh(&pm->lock);
+
+ if (READ_ONCE(pm->work_pending))
+ mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);
+
+ spin_unlock_bh(&pm->lock);
+}
+
+void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id)
+{
+ pr_debug("msk=%p", msk);
+}
+
+void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+
+ pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id,
+ READ_ONCE(pm->accept_addr));
+
+ /* avoid acquiring the lock if there is no room for further addresses */
+ if (!READ_ONCE(pm->accept_addr))
+ return;
+
+ spin_lock_bh(&pm->lock);
+
+ /* re-check under the PM lock to be sure there is still something to signal */
+ if (READ_ONCE(pm->accept_addr) &&
+ mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED))
+ pm->remote = *addr;
+
+ spin_unlock_bh(&pm->lock);
+}
+
+/* path manager helpers */
+
+bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
+ struct mptcp_addr_info *saddr)
+{
+ int ret = false;
+
+ spin_lock_bh(&msk->pm.lock);
+
+ /* double check after the lock is acquired */
+ if (!mptcp_pm_should_signal(msk))
+ goto out_unlock;
+
+ if (remaining < mptcp_add_addr_len(msk->pm.local.family))
+ goto out_unlock;
+
+ *saddr = msk->pm.local;
+ WRITE_ONCE(msk->pm.addr_signal, false);
+ ret = true;
+
+out_unlock:
+ spin_unlock_bh(&msk->pm.lock);
+ return ret;
+}
+
+int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
+{
+ return mptcp_pm_nl_get_local_id(msk, skc);
+}
+
+static void pm_worker(struct work_struct *work)
+{
+ struct mptcp_pm_data *pm = container_of(work, struct mptcp_pm_data,
+ work);
+ struct mptcp_sock *msk = container_of(pm, struct mptcp_sock, pm);
+ struct sock *sk = (struct sock *)msk;
+
+ lock_sock(sk);
+ spin_lock_bh(&msk->pm.lock);
+
+ pr_debug("msk=%p status=%x", msk, pm->status);
+ if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) {
+ pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED);
+ mptcp_pm_nl_add_addr_received(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) {
+ pm->status &= ~BIT(MPTCP_PM_ESTABLISHED);
+ mptcp_pm_nl_fully_established(msk);
+ }
+ if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) {
+ pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED);
+ mptcp_pm_nl_subflow_established(msk);
+ }
+
+ spin_unlock_bh(&msk->pm.lock);
+ release_sock(sk);
+ sock_put(sk);
+}
+
+void mptcp_pm_data_init(struct mptcp_sock *msk)
+{
+ msk->pm.add_addr_signaled = 0;
+ msk->pm.add_addr_accepted = 0;
+ msk->pm.local_addr_used = 0;
+ msk->pm.subflows = 0;
+ WRITE_ONCE(msk->pm.work_pending, false);
+ WRITE_ONCE(msk->pm.addr_signal, false);
+ WRITE_ONCE(msk->pm.accept_addr, false);
+ WRITE_ONCE(msk->pm.accept_subflow, false);
+ msk->pm.status = 0;
+
+ spin_lock_init(&msk->pm.lock);
+ INIT_WORK(&msk->pm.work, pm_worker);
+
+ mptcp_pm_nl_data_init(msk);
+}
+
+void mptcp_pm_close(struct mptcp_sock *msk)
+{
+ if (cancel_work_sync(&msk->pm.work))
+ sock_put((struct sock *)msk);
+}
+
+void mptcp_pm_init(void)
+{
+ pm_wq = alloc_workqueue("pm_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8);
+ if (!pm_wq)
+ panic("Failed to allocate workqueue");
+
+ mptcp_pm_nl_init();
+}
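
The event model here is worth restating: producers set a status bit and queue the worker at most once, and the sock reference taken on a successful queue_work() is the one released by sock_put() at the end of pm_worker(). A userspace analogue of that invariant (the kernel does the bit manipulation under pm->lock; the atomics below are merely illustrative):

#include <stdatomic.h>
#include <stdbool.h>

struct pm {
        atomic_uint status;     /* pending event bits */
        atomic_int  refs;       /* stand-in for the socket refcount */
        atomic_bool queued;     /* stand-in for the workqueue's own flag */
};

/* producer side: exactly one reference per in-flight work item */
static void pm_schedule(struct pm *pm, unsigned int bit)
{
        if (atomic_load(&pm->status) & bit)
                return;                 /* worker already owes us a pass */
        atomic_fetch_or(&pm->status, bit);

        bool was_queued = atomic_exchange(&pm->queued, true);
        if (!was_queued)
                atomic_fetch_add(&pm->refs, 1); /* dropped by the worker */
}

/* worker side: consume all bits, then drop the reference */
static void pm_work(struct pm *pm)
{
        atomic_store(&pm->queued, false);
        unsigned int ev = atomic_exchange(&pm->status, 0);

        (void)ev;                       /* dispatch on the bits here */
        atomic_fetch_sub(&pm->refs, 1);
}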
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
new file mode 100644
index 000000000000..86d61ab34c7c
--- /dev/null
+++ b/net/mptcp/pm_netlink.c
@@ -0,0 +1,859 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Multipath TCP
+ *
+ * Copyright (c) 2020, Red Hat, Inc.
+ */
+
+#define pr_fmt(fmt) "MPTCP: " fmt
+
+#include <linux/inet.h>
+#include <linux/kernel.h>
+#include <net/tcp.h>
+#include <net/netns/generic.h>
+#include <net/mptcp.h>
+#include <net/genetlink.h>
+#include <uapi/linux/mptcp.h>
+
+#include "protocol.h"
+
+/* forward declaration */
+static struct genl_family mptcp_genl_family;
+
+static int pm_nl_pernet_id;
+
+struct mptcp_pm_addr_entry {
+ struct list_head list;
+ unsigned int flags;
+ int ifindex;
+ struct mptcp_addr_info addr;
+ struct rcu_head rcu;
+};
+
+struct pm_nl_pernet {
+ /* protects pernet updates */
+ spinlock_t lock;
+ struct list_head local_addr_list;
+ unsigned int addrs;
+ unsigned int add_addr_signal_max;
+ unsigned int add_addr_accept_max;
+ unsigned int local_addr_max;
+ unsigned int subflows_max;
+ unsigned int next_id;
+};
+
+#define MPTCP_PM_ADDR_MAX 8
+
+static bool addresses_equal(const struct mptcp_addr_info *a,
+ struct mptcp_addr_info *b, bool use_port)
+{
+ bool addr_equals = false;
+
+ if (a->family != b->family)
+ return false;
+
+ if (a->family == AF_INET)
+ addr_equals = a->addr.s_addr == b->addr.s_addr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else
+ addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6);
+#endif
+
+ if (!addr_equals)
+ return false;
+ if (!use_port)
+ return true;
+
+ return a->port == b->port;
+}
+
+static void local_address(const struct sock_common *skc,
+ struct mptcp_addr_info *addr)
+{
+ addr->port = 0;
+ addr->family = skc->skc_family;
+ if (addr->family == AF_INET)
+ addr->addr.s_addr = skc->skc_rcv_saddr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (addr->family == AF_INET6)
+ addr->addr6 = skc->skc_v6_rcv_saddr;
+#endif
+}
+
+static void remote_address(const struct sock_common *skc,
+ struct mptcp_addr_info *addr)
+{
+ addr->family = skc->skc_family;
+ addr->port = skc->skc_dport;
+ if (addr->family == AF_INET)
+ addr->addr.s_addr = skc->skc_daddr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (addr->family == AF_INET6)
+ addr->addr6 = skc->skc_v6_daddr;
+#endif
+}
+
+static bool lookup_subflow_by_saddr(const struct list_head *list,
+ struct mptcp_addr_info *saddr)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_addr_info cur;
+ struct sock_common *skc;
+
+ list_for_each_entry(subflow, list, node) {
+ skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow);
+
+ local_address(skc, &cur);
+ if (addresses_equal(&cur, saddr, false))
+ return true;
+ }
+
+ return false;
+}
+
+static struct mptcp_pm_addr_entry *
+select_local_address(const struct pm_nl_pernet *pernet,
+ struct mptcp_sock *msk)
+{
+ struct mptcp_pm_addr_entry *entry, *ret = NULL;
+
+ rcu_read_lock();
+ spin_lock_bh(&msk->join_list_lock);
+ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+ if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
+ continue;
+
+ /* avoid any address already in use by subflows and
+ * pending join
+ */
+ if (entry->addr.family == ((struct sock *)msk)->sk_family &&
+ !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) &&
+ !lookup_subflow_by_saddr(&msk->join_list, &entry->addr)) {
+ ret = entry;
+ break;
+ }
+ }
+ spin_unlock_bh(&msk->join_list_lock);
+ rcu_read_unlock();
+ return ret;
+}
+
+static struct mptcp_pm_addr_entry *
+select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos)
+{
+ struct mptcp_pm_addr_entry *entry, *ret = NULL;
+ int i = 0;
+
+ rcu_read_lock();
+ /* do not keep any additional per socket state, just signal
+ * the address list in order.
+ * Note: removal from the local address list during the msk life-cycle
+ * can lead to additional addresses not being announced.
+ */
+ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+ if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL))
+ continue;
+ if (i++ == pos) {
+ ret = entry;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static void check_work_pending(struct mptcp_sock *msk)
+{
+ if (msk->pm.add_addr_signaled == msk->pm.add_addr_signal_max &&
+ (msk->pm.local_addr_used == msk->pm.local_addr_max ||
+ msk->pm.subflows == msk->pm.subflows_max))
+ WRITE_ONCE(msk->pm.work_pending, false);
+}
+
+static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_pm_addr_entry *local;
+ struct mptcp_addr_info remote;
+ struct pm_nl_pernet *pernet;
+
+ pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+
+ pr_debug("local %d:%d signal %d:%d subflows %d:%d\n",
+ msk->pm.local_addr_used, msk->pm.local_addr_max,
+ msk->pm.add_addr_signaled, msk->pm.add_addr_signal_max,
+ msk->pm.subflows, msk->pm.subflows_max);
+
+ /* check first for announce */
+ if (msk->pm.add_addr_signaled < msk->pm.add_addr_signal_max) {
+ local = select_signal_address(pernet,
+ msk->pm.add_addr_signaled);
+
+ if (local) {
+ msk->pm.add_addr_signaled++;
+ mptcp_pm_announce_addr(msk, &local->addr);
+ } else {
+ /* pick failed, avoid further attempts later */
+ msk->pm.local_addr_used = msk->pm.add_addr_signal_max;
+ }
+
+ check_work_pending(msk);
+ }
+
+ /* check if should create a new subflow */
+ if (msk->pm.local_addr_used < msk->pm.local_addr_max &&
+ msk->pm.subflows < msk->pm.subflows_max) {
+ remote_address((struct sock_common *)sk, &remote);
+
+ local = select_local_address(pernet, msk);
+ if (local) {
+ msk->pm.local_addr_used++;
+ msk->pm.subflows++;
+ check_work_pending(msk);
+ spin_unlock_bh(&msk->pm.lock);
+ __mptcp_subflow_connect(sk, local->ifindex,
+ &local->addr, &remote);
+ spin_lock_bh(&msk->pm.lock);
+ return;
+ }
+
+ /* lookup failed, avoid further attempts later */
+ msk->pm.local_addr_used = msk->pm.local_addr_max;
+ check_work_pending(msk);
+ }
+}
+
+void mptcp_pm_nl_fully_established(struct mptcp_sock *msk)
+{
+ mptcp_pm_create_subflow_or_signal_addr(msk);
+}
+
+void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk)
+{
+ mptcp_pm_create_subflow_or_signal_addr(msk);
+}
+
+void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
+{
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_addr_info remote;
+ struct mptcp_addr_info local;
+
+ pr_debug("accepted %d:%d remote family %d",
+ msk->pm.add_addr_accepted, msk->pm.add_addr_accept_max,
+ msk->pm.remote.family);
+ msk->pm.add_addr_accepted++;
+ msk->pm.subflows++;
+ if (msk->pm.add_addr_accepted >= msk->pm.add_addr_accept_max ||
+ msk->pm.subflows >= msk->pm.subflows_max)
+ WRITE_ONCE(msk->pm.accept_addr, false);
+
+ /* connect to the specified remote address, using whatever
+ * local address the routing configuration will pick.
+ */
+ remote = msk->pm.remote;
+ if (!remote.port)
+ remote.port = sk->sk_dport;
+ memset(&local, 0, sizeof(local));
+ local.family = remote.family;
+
+ spin_unlock_bh(&msk->pm.lock);
+ __mptcp_subflow_connect((struct sock *)msk, 0, &local, &remote);
+ spin_lock_bh(&msk->pm.lock);
+}
+
+static bool address_use_port(struct mptcp_pm_addr_entry *entry)
+{
+ return (entry->flags &
+ (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) ==
+ MPTCP_PM_ADDR_FLAG_SIGNAL;
+}
+
+static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct mptcp_pm_addr_entry *cur;
+ int ret = -EINVAL;
+
+ spin_lock_bh(&pernet->lock);
+ /* to keep the code simple, don't do IDR-like allocation for address ID,
+ * just bail when we exceed limits
+ */
+ if (pernet->next_id > 255)
+ goto out;
+ if (pernet->addrs >= MPTCP_PM_ADDR_MAX)
+ goto out;
+
+ /* do not insert duplicate address, differentiate on port only
+ * for signaled addresses
+ */
+ list_for_each_entry(cur, &pernet->local_addr_list, list) {
+ if (addresses_equal(&cur->addr, &entry->addr,
+ address_use_port(entry) &&
+ address_use_port(cur)))
+ goto out;
+ }
+
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
+ pernet->add_addr_signal_max++;
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
+ pernet->local_addr_max++;
+
+ entry->addr.id = pernet->next_id++;
+ pernet->addrs++;
+ list_add_tail_rcu(&entry->list, &pernet->local_addr_list);
+ ret = entry->addr.id;
+
+out:
+ spin_unlock_bh(&pernet->lock);
+ return ret;
+}
+
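
The allocation policy above, runnable standalone: IDs are handed out sequentially with no reuse (the comment's "no IDR-like allocation"), and the per-namespace endpoint table is capped at MPTCP_PM_ADDR_MAX entries:

#include <assert.h>

#define MPTCP_PM_ADDR_MAX 8

struct pernet { unsigned int addrs, next_id; };

static int append_addr(struct pernet *p)
{
        if (p->next_id > 255 || p->addrs >= MPTCP_PM_ADDR_MAX)
                return -1;      /* -EINVAL in the kernel */
        p->addrs++;
        return p->next_id++;
}

int main(void)
{
        struct pernet p = { .addrs = 0, .next_id = 1 };

        for (int i = 1; i <= MPTCP_PM_ADDR_MAX; i++)
                assert(append_addr(&p) == i);
        assert(append_addr(&p) == -1);  /* table full */
        return 0;
}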
+int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
+{
+ struct mptcp_pm_addr_entry *entry;
+ struct mptcp_addr_info skc_local;
+ struct mptcp_addr_info msk_local;
+ struct pm_nl_pernet *pernet;
+ int ret = -1;
+
+ if (WARN_ON_ONCE(!msk))
+ return -1;
+
+ /* The 0 ID mapping is defined by the first subflow, copied into the msk
+ * addr
+ */
+ local_address((struct sock_common *)msk, &msk_local);
+ local_address((struct sock_common *)skc, &skc_local);
+ if (addresses_equal(&msk_local, &skc_local, false))
+ return 0;
+
+ pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+ if (addresses_equal(&entry->addr, &skc_local, false)) {
+ ret = entry->addr.id;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (ret >= 0)
+ return ret;
+
+ /* address not found, add to local list */
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->flags = 0;
+ entry->addr = skc_local;
+ ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
+ if (ret < 0)
+ kfree(entry);
+
+ return ret;
+}
+
+void mptcp_pm_nl_data_init(struct mptcp_sock *msk)
+{
+ struct mptcp_pm_data *pm = &msk->pm;
+ struct pm_nl_pernet *pernet;
+ bool subflows;
+
+ pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id);
+
+ pm->add_addr_signal_max = READ_ONCE(pernet->add_addr_signal_max);
+ pm->add_addr_accept_max = READ_ONCE(pernet->add_addr_accept_max);
+ pm->local_addr_max = READ_ONCE(pernet->local_addr_max);
+ pm->subflows_max = READ_ONCE(pernet->subflows_max);
+ subflows = !!pm->subflows_max;
+ WRITE_ONCE(pm->work_pending, (!!pm->local_addr_max && subflows) ||
+ !!pm->add_addr_signal_max);
+ WRITE_ONCE(pm->accept_addr, !!pm->add_addr_accept_max && subflows);
+ WRITE_ONCE(pm->accept_subflow, subflows);
+}
+
+#define MPTCP_PM_CMD_GRP_OFFSET 0
+
+static const struct genl_multicast_group mptcp_pm_mcgrps[] = {
+ [MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, },
+};
+
+static const struct nla_policy
+mptcp_pm_addr_policy[MPTCP_PM_ADDR_ATTR_MAX + 1] = {
+ [MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, },
+ [MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, },
+ [MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, },
+ [MPTCP_PM_ADDR_ATTR_ADDR6] = { .type = NLA_EXACT_LEN,
+ .len = sizeof(struct in6_addr), },
+ [MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16 },
+ [MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32 },
+ [MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32 },
+};
+
+static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = {
+ [MPTCP_PM_ATTR_ADDR] =
+ NLA_POLICY_NESTED(mptcp_pm_addr_policy),
+ [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, },
+ [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, },
+};
+
+static int mptcp_pm_family_to_addr(int family)
+{
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (family == AF_INET6)
+ return MPTCP_PM_ADDR_ATTR_ADDR6;
+#endif
+ return MPTCP_PM_ADDR_ATTR_ADDR4;
+}
+
+static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info,
+ bool require_family,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1];
+ int err, addr_addr;
+
+ if (!attr) {
+ GENL_SET_ERR_MSG(info, "missing address info");
+ return -EINVAL;
+ }
+
+ /* no validation needed - was already done via nested policy */
+ err = nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr,
+ mptcp_pm_addr_policy, info->extack);
+ if (err)
+ return err;
+
+ memset(entry, 0, sizeof(*entry));
+ if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) {
+ if (!require_family)
+ goto skip_family;
+
+ NL_SET_ERR_MSG_ATTR(info->extack, attr,
+ "missing family");
+ return -EINVAL;
+ }
+
+ entry->addr.family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]);
+ if (entry->addr.family != AF_INET
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ && entry->addr.family != AF_INET6
+#endif
+ ) {
+ NL_SET_ERR_MSG_ATTR(info->extack, attr,
+ "unknown address family");
+ return -EINVAL;
+ }
+ addr_addr = mptcp_pm_family_to_addr(entry->addr.family);
+ if (!tb[addr_addr]) {
+ NL_SET_ERR_MSG_ATTR(info->extack, attr,
+ "missing address data");
+ return -EINVAL;
+ }
+
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (entry->addr.family == AF_INET6)
+ entry->addr.addr6 = nla_get_in6_addr(tb[addr_addr]);
+ else
+#endif
+ entry->addr.addr.s_addr = nla_get_in_addr(tb[addr_addr]);
+
+skip_family:
+ if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX])
+ entry->ifindex = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
+
+ if (tb[MPTCP_PM_ADDR_ATTR_ID])
+ entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]);
+
+ if (tb[MPTCP_PM_ADDR_ATTR_FLAGS])
+ entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]);
+
+ return 0;
+}
+
+static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info)
+{
+ return net_generic(genl_info_net(info), pm_nl_pernet_id);
+}
+
+static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct mptcp_pm_addr_entry addr, *entry;
+ int ret;
+
+ ret = mptcp_pm_parse_addr(attr, info, true, &addr);
+ if (ret < 0)
+ return ret;
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry) {
+ GENL_SET_ERR_MSG(info, "can't allocate addr");
+ return -ENOMEM;
+ }
+
+ *entry = addr;
+ ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
+ if (ret < 0) {
+ GENL_SET_ERR_MSG(info, "too many addresses or duplicate one");
+ kfree(entry);
+ return ret;
+ }
+
+ return 0;
+}
+
+static struct mptcp_pm_addr_entry *
+__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
+{
+ struct mptcp_pm_addr_entry *entry;
+
+ list_for_each_entry(entry, &pernet->local_addr_list, list) {
+ if (entry->addr.id == id)
+ return entry;
+ }
+ return NULL;
+}
+
+static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct mptcp_pm_addr_entry addr, *entry;
+ int ret;
+
+ ret = mptcp_pm_parse_addr(attr, info, false, &addr);
+ if (ret < 0)
+ return ret;
+
+ spin_lock_bh(&pernet->lock);
+ entry = __lookup_addr_by_id(pernet, addr.addr.id);
+ if (!entry) {
+ GENL_SET_ERR_MSG(info, "address not found");
+ ret = -EINVAL;
+ goto out;
+ }
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)
+ pernet->add_addr_signal_max--;
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
+ pernet->local_addr_max--;
+
+ pernet->addrs--;
+ list_del_rcu(&entry->list);
+ kfree_rcu(entry, rcu);
+out:
+ spin_unlock_bh(&pernet->lock);
+ return ret;
+}
+
+static void __flush_addrs(struct pm_nl_pernet *pernet)
+{
+ while (!list_empty(&pernet->local_addr_list)) {
+ struct mptcp_pm_addr_entry *cur;
+
+ cur = list_entry(pernet->local_addr_list.next,
+ struct mptcp_pm_addr_entry, list);
+ list_del_rcu(&cur->list);
+ kfree_rcu(cur, rcu);
+ }
+}
+
+static void __reset_counters(struct pm_nl_pernet *pernet)
+{
+ pernet->add_addr_signal_max = 0;
+ pernet->add_addr_accept_max = 0;
+ pernet->local_addr_max = 0;
+ pernet->addrs = 0;
+}
+
+static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+
+ spin_lock_bh(&pernet->lock);
+ __flush_addrs(pernet);
+ __reset_counters(pernet);
+ spin_unlock_bh(&pernet->lock);
+ return 0;
+}
+
+static int mptcp_nl_fill_addr(struct sk_buff *skb,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct mptcp_addr_info *addr = &entry->addr;
+ struct nlattr *attr;
+
+ attr = nla_nest_start(skb, MPTCP_PM_ATTR_ADDR);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_FAMILY, addr->family))
+ goto nla_put_failure;
+ if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags))
+ goto nla_put_failure;
+ if (entry->ifindex &&
+ nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex))
+ goto nla_put_failure;
+
+ if (addr->family == AF_INET)
+ nla_put_in_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR4,
+ addr->addr.s_addr);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (addr->family == AF_INET6)
+ nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6);
+#endif
+ nla_nest_end(skb, attr);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, attr);
+ return -EMSGSIZE;
+}
+
+static int mptcp_nl_cmd_get_addr(struct sk_buff *skb, struct genl_info *info)
+{
+ struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct mptcp_pm_addr_entry addr, *entry;
+ struct sk_buff *msg;
+ void *reply;
+ int ret;
+
+ ret = mptcp_pm_parse_addr(attr, info, false, &addr);
+ if (ret < 0)
+ return ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0,
+ info->genlhdr->cmd);
+ if (!reply) {
+ GENL_SET_ERR_MSG(info, "not enough space in Netlink message");
+ ret = -EMSGSIZE;
+ goto fail;
+ }
+
+ spin_lock_bh(&pernet->lock);
+ entry = __lookup_addr_by_id(pernet, addr.addr.id);
+ if (!entry) {
+ GENL_SET_ERR_MSG(info, "address not found");
+ ret = -EINVAL;
+ goto unlock_fail;
+ }
+
+ ret = mptcp_nl_fill_addr(msg, entry);
+ if (ret)
+ goto unlock_fail;
+
+ genlmsg_end(msg, reply);
+ ret = genlmsg_reply(msg, info);
+ spin_unlock_bh(&pernet->lock);
+ return ret;
+
+unlock_fail:
+ spin_unlock_bh(&pernet->lock);
+
+fail:
+ nlmsg_free(msg);
+ return ret;
+}
+
+static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct net *net = sock_net(msg->sk);
+ struct mptcp_pm_addr_entry *entry;
+ struct pm_nl_pernet *pernet;
+ int id = cb->args[0];
+ void *hdr;
+
+ pernet = net_generic(net, pm_nl_pernet_id);
+
+ spin_lock_bh(&pernet->lock);
+ list_for_each_entry(entry, &pernet->local_addr_list, list) {
+ if (entry->addr.id <= id)
+ continue;
+
+ hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, &mptcp_genl_family,
+ NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR);
+ if (!hdr)
+ break;
+
+ if (mptcp_nl_fill_addr(msg, entry) < 0) {
+ genlmsg_cancel(msg, hdr);
+ break;
+ }
+
+ id = entry->addr.id;
+ genlmsg_end(msg, hdr);
+ }
+ spin_unlock_bh(&pernet->lock);
+
+ cb->args[0] = id;
+ return msg->len;
+}
+
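
The dump callback is resumable in the standard netlink way: the only iterator state is the last emitted id, written back to cb->args[0], so a restarted dump skips everything at or below it. The same scheme standalone (budget plays the role of the skb filling up):

#include <stdio.h>

struct entry { int id; };

static int dump(const struct entry *tbl, int n, int *resume_id, int budget)
{
        int emitted = 0;

        for (int i = 0; i < n; i++) {
                if (tbl[i].id <= *resume_id)
                        continue;
                if (emitted == budget)  /* message full: stop, resume later */
                        break;
                printf("id %d\n", tbl[i].id);
                *resume_id = tbl[i].id;
                emitted++;
        }
        return emitted;
}

This works because ids are assigned in increasing order and never reused within a namespace, so "greater than the last one seen" is a stable cursor even if entries are deleted between dump passes.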
+static int parse_limit(struct genl_info *info, int id, unsigned int *limit)
+{
+ struct nlattr *attr = info->attrs[id];
+
+ if (!attr)
+ return 0;
+
+ *limit = nla_get_u32(attr);
+ if (*limit > MPTCP_PM_ADDR_MAX) {
+ GENL_SET_ERR_MSG(info, "limit greater than maximum");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int
+mptcp_nl_cmd_set_limits(struct sk_buff *skb, struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ unsigned int rcv_addrs, subflows;
+ int ret;
+
+ spin_lock_bh(&pernet->lock);
+ rcv_addrs = pernet->add_addr_accept_max;
+ ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs);
+ if (ret)
+ goto unlock;
+
+ subflows = pernet->subflows_max;
+ ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows);
+ if (ret)
+ goto unlock;
+
+ WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs);
+ WRITE_ONCE(pernet->subflows_max, subflows);
+
+unlock:
+ spin_unlock_bh(&pernet->lock);
+ return ret;
+}
+
+static int
+mptcp_nl_cmd_get_limits(struct sk_buff *skb, struct genl_info *info)
+{
+ struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
+ struct sk_buff *msg;
+ void *reply;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0,
+ MPTCP_PM_CMD_GET_LIMITS);
+ if (!reply)
+ goto fail;
+
+ if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS,
+ READ_ONCE(pernet->add_addr_accept_max)))
+ goto fail;
+
+ if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS,
+ READ_ONCE(pernet->subflows_max)))
+ goto fail;
+
+ genlmsg_end(msg, reply);
+ return genlmsg_reply(msg, info);
+
+fail:
+ GENL_SET_ERR_MSG(info, "not enough space in Netlink message");
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
+static struct genl_ops mptcp_pm_ops[] = {
+ {
+ .cmd = MPTCP_PM_CMD_ADD_ADDR,
+ .doit = mptcp_nl_cmd_add_addr,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_DEL_ADDR,
+ .doit = mptcp_nl_cmd_del_addr,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_FLUSH_ADDRS,
+ .doit = mptcp_nl_cmd_flush_addrs,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_GET_ADDR,
+ .doit = mptcp_nl_cmd_get_addr,
+ .dumpit = mptcp_nl_cmd_dump_addrs,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_SET_LIMITS,
+ .doit = mptcp_nl_cmd_set_limits,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = MPTCP_PM_CMD_GET_LIMITS,
+ .doit = mptcp_nl_cmd_get_limits,
+ },
+};
+
+static struct genl_family mptcp_genl_family __ro_after_init = {
+ .name = MPTCP_PM_NAME,
+ .version = MPTCP_PM_VER,
+ .maxattr = MPTCP_PM_ATTR_MAX,
+ .policy = mptcp_pm_policy,
+ .netnsok = true,
+ .module = THIS_MODULE,
+ .ops = mptcp_pm_ops,
+ .n_ops = ARRAY_SIZE(mptcp_pm_ops),
+ .mcgrps = mptcp_pm_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps),
+};
+
+static int __net_init pm_nl_init_net(struct net *net)
+{
+ struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id);
+
+ INIT_LIST_HEAD_RCU(&pernet->local_addr_list);
+ __reset_counters(pernet);
+ pernet->next_id = 1;
+ spin_lock_init(&pernet->lock);
+ return 0;
+}
+
+static void __net_exit pm_nl_exit_net(struct list_head *net_list)
+{
+ struct net *net;
+
+ list_for_each_entry(net, net_list, exit_list) {
+ /* net is removed from namespace list, can't race with
+ * other modifiers
+ */
+ __flush_addrs(net_generic(net, pm_nl_pernet_id));
+ }
+}
+
+static struct pernet_operations mptcp_pm_pernet_ops = {
+ .init = pm_nl_init_net,
+ .exit_batch = pm_nl_exit_net,
+ .id = &pm_nl_pernet_id,
+ .size = sizeof(struct pm_nl_pernet),
+};
+
+void mptcp_pm_nl_init(void)
+{
+ if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0)
+ panic("Failed to register MPTCP PM pernet subsystem.\n");
+
+ if (genl_register_family(&mptcp_genl_family))
+ panic("Failed to register MPTCP PM netlink family\n");
+}
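The generic netlink family registered above is the userspace-facing half of the in-kernel path manager. A usage sketch (editorial; assumes iproute2's "ip mptcp" front-end, which wraps these MPTCP_PM_CMD_* operations):
/* ip mptcp endpoint add 10.0.2.1 id 1 signal          - ADD_ADDR
 * ip mptcp limits set add_addr_accepted 2 subflows 2  - SET_LIMITS
 * ip mptcp limits show                                - GET_LIMITS
 */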
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 92d5382e71f4..939a5045181a 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -21,6 +21,7 @@
#endif
#include <net/mptcp.h>
#include "protocol.h"
+#include "mib.h"
#define MPTCP_SAME_STATE TCP_MAX_STATES
@@ -37,6 +38,8 @@ struct mptcp_skb_cb {
#define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0]))
+static struct percpu_counter mptcp_sockets_allocated;
+
/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not
* completed yet or has failed, return the subflow socket.
* Otherwise return NULL.
@@ -54,10 +57,43 @@ static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk)
return msk->first && !sk_is_mptcp(msk->first);
}
+static struct socket *mptcp_is_tcpsk(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (sock->sk != sk)
+ return NULL;
+
+ if (unlikely(sk->sk_prot == &tcp_prot)) {
+ /* we are being invoked after mptcp_accept() has
+ * accepted a non-mp-capable flow: sk is a tcp_sk,
+ * not an mptcp one.
+ *
+ * Hand the socket over to tcp so all further socket ops
+ * bypass mptcp.
+ */
+ sock->ops = &inet_stream_ops;
+ return sock;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ } else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
+ sock->ops = &inet6_stream_ops;
+ return sock;
+#endif
+ }
+
+ return NULL;
+}
+
static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
+ struct socket *sock;
+
sock_owned_by_me((const struct sock *)msk);
+ sock = mptcp_is_tcpsk((struct sock *)msk);
+ if (unlikely(sock))
+ return sock;
+
if (likely(!__mptcp_needs_tcp_fallback(msk)))
return NULL;
@@ -81,6 +117,10 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state)
struct socket *ssock;
int err;
+ ssock = __mptcp_tcp_fallback(msk);
+ if (unlikely(ssock))
+ return ssock;
+
ssock = __mptcp_nmpc_socket(msk);
if (ssock)
goto set_state;
@@ -104,19 +144,6 @@ set_state:
return ssock;
}
-static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk)
-{
- struct mptcp_subflow_context *subflow;
-
- sock_owned_by_me((const struct sock *)msk);
-
- mptcp_for_each_subflow(msk, subflow) {
- return mptcp_subflow_tcp_sock(subflow);
- }
-
- return NULL;
-}
-
static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
struct sk_buff *skb,
unsigned int offset, size_t copy_len)
@@ -131,6 +158,27 @@ static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
MPTCP_SKB_CB(skb)->offset = offset;
}
+/* both sockets must be locked */
+static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk,
+ struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ u64 dsn = mptcp_subflow_get_mapped_dsn(subflow);
+
+ /* revalidate data sequence number.
+ *
+ * mptcp_subflow_data_available() is usually called
+ * without the msk lock. It's unlikely (but possible)
+ * that msk->ack_seq has been advanced since the last
+ * call found in-sequence data.
+ */
+ if (likely(dsn == msk->ack_seq))
+ return true;
+
+ subflow->data_avail = 0;
+ return mptcp_subflow_data_available(ssk);
+}
+
static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
struct sock *ssk,
unsigned int *bytes)
@@ -142,6 +190,11 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
struct tcp_sock *tp;
bool done = false;
+ if (!mptcp_subflow_dsn_valid(msk, ssk)) {
+ *bytes = 0;
+ return false;
+ }
+
if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf);
@@ -254,6 +307,69 @@ wake:
sk->sk_data_ready(sk);
}
+static void __mptcp_flush_join_list(struct mptcp_sock *msk)
+{
+ if (likely(list_empty(&msk->join_list)))
+ return;
+
+ spin_lock_bh(&msk->join_list_lock);
+ list_splice_tail_init(&msk->join_list, &msk->conn_list);
+ spin_unlock_bh(&msk->join_list_lock);
+}
+
+static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
+{
+ long tout = ssk && inet_csk(ssk)->icsk_pending ?
+ inet_csk(ssk)->icsk_timeout - jiffies : 0;
+
+ if (tout <= 0)
+ tout = mptcp_sk(sk)->timer_ival;
+ mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
+}
+
+static bool mptcp_timer_pending(struct sock *sk)
+{
+ return timer_pending(&inet_csk(sk)->icsk_retransmit_timer);
+}
+
+static void mptcp_reset_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned long tout;
+
+ /* should never be called with mptcp level timer cleared */
+ tout = READ_ONCE(mptcp_sk(sk)->timer_ival);
+ if (WARN_ON_ONCE(!tout))
+ tout = TCP_RTO_MIN;
+ sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout);
+}
+
+void mptcp_data_acked(struct sock *sk)
+{
+ mptcp_reset_timer(sk);
+
+ if (!sk_stream_is_writeable(sk) &&
+ schedule_work(&mptcp_sk(sk)->work))
+ sock_hold(sk);
+}
+
+void mptcp_subflow_eof(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) &&
+ schedule_work(&msk->work))
+ sock_hold(sk);
+}
+
+static void mptcp_stop_timer(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ mptcp_sk(sk)->timer_ival = 0;
+}
+
static bool mptcp_ext_cache_refill(struct mptcp_sock *msk)
{
if (!msk->cached_ext)
@@ -277,41 +393,149 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk)
return NULL;
}
-static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk,
- const struct sk_buff *skb,
- const struct mptcp_ext *mpext)
+static bool mptcp_skb_can_collapse_to(u64 write_seq,
+ const struct sk_buff *skb,
+ const struct mptcp_ext *mpext)
{
if (!tcp_skb_can_collapse_to(skb))
return false;
/* can collapse only if MPTCP level sequence is in order */
- return mpext && mpext->data_seq + mpext->data_len == msk->write_seq;
+ return mpext && mpext->data_seq + mpext->data_len == write_seq;
+}
+
+static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
+ const struct page_frag *pfrag,
+ const struct mptcp_data_frag *df)
+{
+ return df && pfrag->page == df->page &&
+ df->data_seq + df->data_len == msk->write_seq;
+}
+
+static void dfrag_uncharge(struct sock *sk, int len)
+{
+ sk_mem_uncharge(sk, len);
+ sk_wmem_queued_add(sk, -len);
+}
+
+static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag)
+{
+ int len = dfrag->data_len + dfrag->overhead;
+
+ list_del(&dfrag->list);
+ dfrag_uncharge(sk, len);
+ put_page(dfrag->page);
+}
+
+static void mptcp_clean_una(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_data_frag *dtmp, *dfrag;
+ u64 snd_una = atomic64_read(&msk->snd_una);
+ bool cleaned = false;
+
+ list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) {
+ if (after64(dfrag->data_seq + dfrag->data_len, snd_una))
+ break;
+
+ dfrag_clear(sk, dfrag);
+ cleaned = true;
+ }
+
+ dfrag = mptcp_rtx_head(sk);
+ if (dfrag && after64(snd_una, dfrag->data_seq)) {
+ u64 delta = snd_una - dfrag->data_seq;
+
+ dfrag->data_seq += delta;
+ dfrag->offset += delta;
+ dfrag->data_len -= delta;
+
+ dfrag_uncharge(sk, delta);
+ cleaned = true;
+ }
+
+ if (cleaned) {
+ sk_mem_reclaim_partial(sk);
+
+ /* Only wake up writers if a subflow is ready */
+ if (test_bit(MPTCP_SEND_SPACE, &msk->flags))
+ sk_stream_write_space(sk);
+ }
+}
+
+/* ensure we get enough memory for the frag hdr, beyond some minimal amount of
+ * data
+ */
+static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
+{
+ if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag),
+ pfrag, sk->sk_allocation)))
+ return true;
+
+ sk->sk_prot->enter_memory_pressure(sk);
+ sk_stream_moderate_sndbuf(sk);
+ return false;
+}
+
+static struct mptcp_data_frag *
+mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag,
+ int orig_offset)
+{
+ int offset = ALIGN(orig_offset, sizeof(long));
+ struct mptcp_data_frag *dfrag;
+
+ dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset);
+ dfrag->data_len = 0;
+ dfrag->data_seq = msk->write_seq;
+ dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag);
+ dfrag->offset = offset + sizeof(struct mptcp_data_frag);
+ dfrag->page = pfrag->page;
+
+ return dfrag;
}
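For orientation, the page layout produced by mptcp_carve_data_frag(), reconstructed from the assignments above (editorial sketch, not patch content):
/*
 *  orig_offset     offset = ALIGN(orig_offset, sizeof(long))
 *       |             |
 *       v             v
 *  -----+--(padding)--+--[ struct mptcp_data_frag ]--+--[ payload ]---
 *                                                    ^
 *                                             dfrag->offset
 *
 *  dfrag->overhead = padding + sizeof(struct mptcp_data_frag)
 */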
static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
- struct msghdr *msg, long *timeo, int *pmss_now,
+ struct msghdr *msg, struct mptcp_data_frag *dfrag,
+ long *timeo, int *pmss_now,
int *ps_goal)
{
- int mss_now, avail_size, size_goal, ret;
+ int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0;
+ bool dfrag_collapsed, can_collapse = false;
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_ext *mpext = NULL;
+ bool retransmission = !!dfrag;
struct sk_buff *skb, *tail;
- bool can_collapse = false;
struct page_frag *pfrag;
+ struct page *page;
+ u64 *write_seq;
size_t psize;
/* use the mptcp page cache so that we can easily move the data
* from one substream to another, but do per subflow memory accounting
+ * Note: pfrag is used only when !retransmission, but the compiler is
+ * fooled into a warning if we don't init it here
*/
pfrag = sk_page_frag(sk);
- while (!sk_page_frag_refill(ssk, pfrag) ||
+ while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) ||
!mptcp_ext_cache_refill(msk)) {
ret = sk_stream_wait_memory(ssk, timeo);
if (ret)
return ret;
+
+ /* if sk_stream_wait_memory() sleeps, snd_una can change
+ * significantly; refresh the rtx queue
+ */
+ mptcp_clean_una(sk);
+
if (unlikely(__mptcp_needs_tcp_fallback(msk)))
return 0;
}
+ if (!retransmission) {
+ write_seq = &msk->write_seq;
+ page = pfrag->page;
+ } else {
+ write_seq = &dfrag->data_seq;
+ page = dfrag->page;
+ }
/* compute copy limit */
mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags);
@@ -329,32 +553,74 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
* SSN association set here
*/
can_collapse = (size_goal - skb->len > 0) &&
- mptcp_skb_can_collapse_to(msk, skb, mpext);
+ mptcp_skb_can_collapse_to(*write_seq, skb, mpext);
if (!can_collapse)
TCP_SKB_CB(skb)->eor = 1;
else
avail_size = size_goal - skb->len;
}
- psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size);
-
- /* Copy to page */
- pr_debug("left=%zu", msg_data_left(msg));
- psize = copy_page_from_iter(pfrag->page, pfrag->offset,
- min_t(size_t, msg_data_left(msg), psize),
- &msg->msg_iter);
- pr_debug("left=%zu", msg_data_left(msg));
- if (!psize)
- return -EINVAL;
+
+ if (!retransmission) {
+ /* reuse tail pfrag, if possible, or carve a new one from the
+ * page allocator
+ */
+ dfrag = mptcp_rtx_tail(sk);
+ offset = pfrag->offset;
+ dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag);
+ if (!dfrag_collapsed) {
+ dfrag = mptcp_carve_data_frag(msk, pfrag, offset);
+ offset = dfrag->offset;
+ frag_truesize = dfrag->overhead;
+ }
+ psize = min_t(size_t, pfrag->size - offset, avail_size);
+
+ /* Copy to page */
+ pr_debug("left=%zu", msg_data_left(msg));
+ psize = copy_page_from_iter(pfrag->page, offset,
+ min_t(size_t, msg_data_left(msg),
+ psize),
+ &msg->msg_iter);
+ pr_debug("left=%zu", msg_data_left(msg));
+ if (!psize)
+ return -EINVAL;
+
+ if (!sk_wmem_schedule(sk, psize + dfrag->overhead))
+ return -ENOMEM;
+ } else {
+ offset = dfrag->offset;
+ psize = min_t(size_t, dfrag->data_len, avail_size);
+ }
/* tell the TCP stack to delay the push so that we can safely
* access the skb after the sendpages call
*/
- ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize,
+ ret = do_tcp_sendpages(ssk, page, offset, psize,
msg->msg_flags | MSG_SENDPAGE_NOTLAST);
if (ret <= 0)
return ret;
- if (unlikely(ret < psize))
- iov_iter_revert(&msg->msg_iter, psize - ret);
+
+ frag_truesize += ret;
+ if (!retransmission) {
+ if (unlikely(ret < psize))
+ iov_iter_revert(&msg->msg_iter, psize - ret);
+
+ /* send successful, keep track of sent data for mptcp-level
+ * retransmission
+ */
+ dfrag->data_len += ret;
+ if (!dfrag_collapsed) {
+ get_page(dfrag->page);
+ list_add_tail(&dfrag->list, &msk->rtx_queue);
+ sk_wmem_queued_add(sk, frag_truesize);
+ } else {
+ sk_wmem_queued_add(sk, ret);
+ }
+
+ /* charge data on mptcp rtx queue to the master socket
+ * Note: we charge such data both to sk and ssk
+ */
+ sk->sk_forward_alloc -= frag_truesize;
+ }
/* if the tail skb extension is still the cached one, collapsing
* really happened. Note: we can't check for 'same skb' as the sk_buff
@@ -373,7 +639,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
msk->cached_ext = NULL;
memset(mpext, 0, sizeof(*mpext));
- mpext->data_seq = msk->write_seq;
+ mpext->data_seq = *write_seq;
mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq;
mpext->data_len = ret;
mpext->use_map = 1;
@@ -384,13 +650,51 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
mpext->dsn64);
out:
- pfrag->offset += ret;
- msk->write_seq += ret;
+ if (!retransmission)
+ pfrag->offset += frag_truesize;
+ *write_seq += ret;
mptcp_subflow_ctx(ssk)->rel_write_seq += ret;
return ret;
}
+static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *backup = NULL;
+
+ sock_owned_by_me((const struct sock *)msk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if (!sk_stream_memory_free(ssk)) {
+ struct socket *sock = ssk->sk_socket;
+
+ if (sock) {
+ clear_bit(MPTCP_SEND_SPACE, &msk->flags);
+ smp_mb__after_atomic();
+
+ /* enables sk->write_space() callbacks */
+ set_bit(SOCK_NOSPACE, &sock->flags);
+ }
+
+ return NULL;
+ }
+
+ if (subflow->backup) {
+ if (!backup)
+ backup = ssk;
+
+ continue;
+ }
+
+ return ssk;
+ }
+
+ return backup;
+}
+
static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk)
{
struct socket *sock;
@@ -438,17 +742,29 @@ fallback:
return ret >= 0 ? ret + copied : (copied ? copied : ret);
}
- ssk = mptcp_subflow_get(msk);
- if (!ssk) {
- release_sock(sk);
- return -ENOTCONN;
+ mptcp_clean_una(sk);
+
+ __mptcp_flush_join_list(msk);
+ ssk = mptcp_subflow_get_send(msk);
+ while (!sk_stream_memory_free(sk) || !ssk) {
+ ret = sk_stream_wait_memory(sk, &timeo);
+ if (ret)
+ goto out;
+
+ mptcp_clean_una(sk);
+
+ ssk = mptcp_subflow_get_send(msk);
+ if (list_empty(&msk->conn_list)) {
+ ret = -ENOTCONN;
+ goto out;
+ }
}
pr_debug("conn_list->subflow=%p", ssk);
lock_sock(ssk);
while (msg_data_left(msg)) {
- ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now,
+ ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now,
&size_goal);
if (ret < 0)
break;
@@ -461,10 +777,15 @@ fallback:
copied += ret;
}
+ mptcp_set_timeout(sk, ssk);
if (copied) {
ret = copied;
tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle,
size_goal);
+
+ /* start the timer, if it's not pending */
+ if (!mptcp_timer_pending(sk))
+ mptcp_reset_timer(sk);
}
ssk_check_wmem(msk, ssk);
@@ -572,6 +893,7 @@ fallback:
len = min_t(size_t, len, INT_MAX);
target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+ __mptcp_flush_join_list(msk);
while (len > (size_t)copied) {
int bytes_read;
@@ -651,6 +973,69 @@ out_err:
return copied;
}
+static void mptcp_retransmit_handler(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (atomic64_read(&msk->snd_una) == msk->write_seq) {
+ mptcp_stop_timer(sk);
+ } else {
+ set_bit(MPTCP_WORK_RTX, &msk->flags);
+ if (schedule_work(&msk->work))
+ sock_hold(sk);
+ }
+}
+
+static void mptcp_retransmit_timer(struct timer_list *t)
+{
+ struct inet_connection_sock *icsk = from_timer(icsk, t,
+ icsk_retransmit_timer);
+ struct sock *sk = &icsk->icsk_inet.sk;
+
+ bh_lock_sock(sk);
+ if (!sock_owned_by_user(sk)) {
+ mptcp_retransmit_handler(sk);
+ } else {
+ /* delegate our work to tcp_release_cb() */
+ if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED,
+ &sk->sk_tsq_flags))
+ sock_hold(sk);
+ }
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+/* Find an idle subflow. Return NULL if there is unacked data at tcp
+ * level.
+ *
+ * A backup subflow is returned only if that is the only kind available.
+ */
+static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *backup = NULL;
+
+ sock_owned_by_me((const struct sock *)msk);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ /* still data outstanding at TCP level? Don't retransmit. */
+ if (!tcp_write_queue_empty(ssk))
+ return NULL;
+
+ if (subflow->backup) {
+ if (!backup)
+ backup = ssk;
+ continue;
+ }
+
+ return ssk;
+ }
+
+ return backup;
+}
+
/* subflow sockets can be either outgoing (connect) or incoming
* (accept).
*
@@ -681,13 +1066,90 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
return 0;
}
+static void mptcp_check_for_eof(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ int receivers = 0;
+
+ mptcp_for_each_subflow(msk, subflow)
+ receivers += !subflow->rx_eof;
+
+ if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
+ /* hopefully temporary hack: propagate shutdown status
+ * to msk when all subflows agree on it
+ */
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+
+ smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
+ set_bit(MPTCP_DATA_READY, &msk->flags);
+ sk->sk_data_ready(sk);
+ }
+}
+
static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
- struct sock *sk = &msk->sk.icsk_inet.sk;
+ struct sock *ssk, *sk = &msk->sk.icsk_inet.sk;
+ int orig_len, orig_offset, ret, mss_now = 0, size_goal = 0;
+ struct mptcp_data_frag *dfrag;
+ u64 orig_write_seq;
+ size_t copied = 0;
+ struct msghdr msg;
+ long timeo = 0;
lock_sock(sk);
+ mptcp_clean_una(sk);
+ __mptcp_flush_join_list(msk);
__mptcp_move_skbs(msk);
+
+ if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
+ mptcp_check_for_eof(msk);
+
+ if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
+ goto unlock;
+
+ dfrag = mptcp_rtx_head(sk);
+ if (!dfrag)
+ goto unlock;
+
+ ssk = mptcp_subflow_get_retrans(msk);
+ if (!ssk)
+ goto reset_unlock;
+
+ lock_sock(ssk);
+
+ msg.msg_flags = MSG_DONTWAIT;
+ orig_len = dfrag->data_len;
+ orig_offset = dfrag->offset;
+ orig_write_seq = dfrag->data_seq;
+ while (dfrag->data_len > 0) {
+ ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, &mss_now,
+ &size_goal);
+ if (ret < 0)
+ break;
+
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
+ copied += ret;
+ dfrag->data_len -= ret;
+ dfrag->offset += ret;
+ }
+ if (copied)
+ tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle,
+ size_goal);
+
+ dfrag->data_seq = orig_write_seq;
+ dfrag->offset = orig_offset;
+ dfrag->data_len = orig_len;
+
+ mptcp_set_timeout(sk, ssk);
+ release_sock(ssk);
+
+reset_unlock:
+ if (!mptcp_timer_pending(sk))
+ mptcp_reset_timer(sk);
+
+unlock:
release_sock(sk);
sock_put(sk);
}
@@ -696,22 +1158,55 @@ static int __mptcp_init_sock(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
+ spin_lock_init(&msk->join_list_lock);
+
INIT_LIST_HEAD(&msk->conn_list);
+ INIT_LIST_HEAD(&msk->join_list);
+ INIT_LIST_HEAD(&msk->rtx_queue);
__set_bit(MPTCP_SEND_SPACE, &msk->flags);
INIT_WORK(&msk->work, mptcp_worker);
msk->first = NULL;
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
+ mptcp_pm_data_init(msk);
+
+ /* re-use the csk retrans timer for MPTCP-level retrans */
+ timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
+
return 0;
}
static int mptcp_init_sock(struct sock *sk)
{
- if (!mptcp_is_enabled(sock_net(sk)))
+ struct net *net = sock_net(sk);
+ int ret;
+
+ if (!mptcp_is_enabled(net))
return -ENOPROTOOPT;
- return __mptcp_init_sock(sk);
+ if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
+ return -ENOMEM;
+
+ ret = __mptcp_init_sock(sk);
+ if (ret)
+ return ret;
+
+ sk_sockets_allocated_inc(sk);
+ sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2];
+
+ return 0;
+}
+
+static void __mptcp_clear_xmit(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_data_frag *dtmp, *dfrag;
+
+ sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer);
+
+ list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
+ dfrag_clear(sk, dfrag);
}
static void mptcp_cancel_work(struct sock *sk)
@@ -767,10 +1262,14 @@ static void mptcp_close(struct sock *sk, long timeout)
mptcp_token_destroy(msk->token);
inet_sk_state_store(sk, TCP_CLOSE);
+ __mptcp_flush_join_list(msk);
+
list_splice_init(&msk->conn_list, &conn_list);
data_fin_tx_seq = msk->write_seq;
+ __mptcp_clear_xmit(sk);
+
release_sock(sk);
list_for_each_entry_safe(subflow, tmp, &conn_list, node) {
@@ -782,6 +1281,7 @@ static void mptcp_close(struct sock *sk, long timeout)
}
mptcp_cancel_work(sk);
+ mptcp_pm_close(msk);
__skb_queue_purge(&sk->sk_receive_queue);
@@ -811,6 +1311,15 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk)
inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr;
}
+static int mptcp_disconnect(struct sock *sk, int flags)
+{
+ lock_sock(sk);
+ __mptcp_clear_xmit(sk);
+ release_sock(sk);
+ mptcp_cancel_work(sk);
+ return tcp_disconnect(sk, flags);
+}
+
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk)
{
@@ -854,6 +1363,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req)
}
msk->write_seq = subflow_req->idsn + 1;
+ atomic64_set(&msk->snd_una, msk->write_seq);
if (subflow_req->remote_key_valid) {
msk->can_ack = true;
msk->remote_key = subflow_req->remote_key;
@@ -920,7 +1430,12 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err,
list_add(&subflow->node, &msk->conn_list);
bh_unlock_sock(new_mptcp_sock);
+
+ __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK);
local_bh_enable();
+ } else {
+ MPTCP_INC_STATS(sock_net(sk),
+ MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK);
}
return newsk;
@@ -932,6 +1447,8 @@ static void mptcp_destroy(struct sock *sk)
if (msk->cached_ext)
__skb_ext_put(msk->cached_ext);
+
+ sk_sockets_allocated_dec(sk);
}
static int mptcp_setsockopt(struct sock *sk, int level, int optname,
@@ -984,7 +1501,8 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname,
return -EOPNOTSUPP;
}
-#define MPTCP_DEFERRED_ALL TCPF_DELACK_TIMER_DEFERRED
+#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \
+ TCPF_WRITE_TIMER_DEFERRED)
/* this is very alike tcp_release_cb() but we must handle differently a
* different set of events
@@ -1000,6 +1518,8 @@ static void mptcp_release_cb(struct sock *sk)
nflags = flags & ~MPTCP_DEFERRED_ALL;
} while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
+ sock_release_ownership(sk);
+
if (flags & TCPF_DELACK_TIMER_DEFERRED) {
struct mptcp_sock *msk = mptcp_sk(sk);
struct sock *ssk;
@@ -1008,6 +1528,11 @@ static void mptcp_release_cb(struct sock *sk)
if (!ssk || !schedule_work(&msk->work))
__sock_put(sk);
}
+
+ if (flags & TCPF_WRITE_TIMER_DEFERRED) {
+ mptcp_retransmit_handler(sk);
+ __sock_put(sk);
+ }
}
static int mptcp_get_port(struct sock *sk, unsigned short snum)
@@ -1031,13 +1556,15 @@ void mptcp_finish_connect(struct sock *ssk)
u64 ack_seq;
subflow = mptcp_subflow_ctx(ssk);
-
- if (!subflow->mp_capable)
- return;
-
sk = subflow->conn;
msk = mptcp_sk(sk);
+ if (!subflow->mp_capable) {
+ MPTCP_INC_STATS(sock_net(sk),
+ MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
+ return;
+ }
+
pr_debug("msk=%p, token=%u", sk, subflow->token);
mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq);
@@ -1055,6 +1582,9 @@ void mptcp_finish_connect(struct sock *ssk)
WRITE_ONCE(msk->write_seq, subflow->idsn + 1);
WRITE_ONCE(msk->ack_seq, ack_seq);
WRITE_ONCE(msk->can_ack, 1);
+ atomic64_set(&msk->snd_una, msk->write_seq);
+
+ mptcp_pm_new_connection(msk, 0);
}
static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
@@ -1066,6 +1596,46 @@ static void mptcp_sock_graft(struct sock *sk, struct socket *parent)
write_unlock_bh(&sk->sk_callback_lock);
}
+bool mptcp_finish_join(struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+ struct mptcp_sock *msk = mptcp_sk(subflow->conn);
+ struct sock *parent = (void *)msk;
+ struct socket *parent_sock;
+ bool ret;
+
+ pr_debug("msk=%p, subflow=%p", msk, subflow);
+
+ /* mptcp socket already closing? */
+ if (inet_sk_state_load(parent) != TCP_ESTABLISHED)
+ return false;
+
+ if (!msk->pm.server_side)
+ return true;
+
+ /* passive connection, attach to msk socket */
+ parent_sock = READ_ONCE(parent->sk_socket);
+ if (parent_sock && !sk->sk_socket)
+ mptcp_sock_graft(sk, parent_sock);
+
+ ret = mptcp_pm_allow_new_subflow(msk);
+ if (ret) {
+ /* active connections are already on conn_list */
+ spin_lock_bh(&msk->join_list_lock);
+ if (!WARN_ON_ONCE(!list_empty(&subflow->node)))
+ list_add_tail(&subflow->node, &msk->join_list);
+ spin_unlock_bh(&msk->join_list_lock);
+ }
+ return ret;
+}
+
+bool mptcp_sk_is_subflow(const struct sock *sk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+
+ return subflow->mp_join == 1;
+}
+
static bool mptcp_memory_free(const struct sock *sk, int wake)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1077,6 +1647,7 @@ static struct proto mptcp_prot = {
.name = "MPTCP",
.owner = THIS_MODULE,
.init = mptcp_init_sock,
+ .disconnect = mptcp_disconnect,
.close = mptcp_close,
.accept = mptcp_accept,
.setsockopt = mptcp_setsockopt,
@@ -1089,7 +1660,12 @@ static struct proto mptcp_prot = {
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = mptcp_get_port,
+ .sockets_allocated = &mptcp_sockets_allocated,
+ .memory_allocated = &tcp_memory_allocated,
+ .memory_pressure = &tcp_memory_pressure,
.stream_memory_free = mptcp_memory_free,
+ .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
+ .sysctl_mem = sysctl_tcp_mem,
.obj_size = sizeof(struct mptcp_sock),
.no_autobind = true,
};
@@ -1245,6 +1821,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
/* set ssk->sk_socket of accept()ed flows to mptcp socket.
* This is needed so NOSPACE flag can be set from tcp stack.
*/
+ __mptcp_flush_join_list(msk);
list_for_each_entry(subflow, &msk->conn_list, node) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -1271,7 +1848,9 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
msk = mptcp_sk(sk);
lock_sock(sk);
- ssock = __mptcp_nmpc_socket(msk);
+ ssock = __mptcp_tcp_fallback(msk);
+ if (!ssock)
+ ssock = __mptcp_nmpc_socket(msk);
if (ssock) {
mask = ssock->ops->poll(file, ssock, wait);
release_sock(sk);
@@ -1281,9 +1860,6 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
release_sock(sk);
sock_poll_wait(file, sock, wait);
lock_sock(sk);
- ssock = __mptcp_tcp_fallback(msk);
- if (unlikely(ssock))
- return ssock->ops->poll(file, ssock, NULL);
if (test_bit(MPTCP_DATA_READY, &msk->flags))
mask = EPOLLIN | EPOLLRDNORM;
@@ -1302,11 +1878,17 @@ static int mptcp_shutdown(struct socket *sock, int how)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
struct mptcp_subflow_context *subflow;
+ struct socket *ssock;
int ret = 0;
pr_debug("sk=%p, how=%d", msk, how);
lock_sock(sock->sk);
+ ssock = __mptcp_tcp_fallback(msk);
+ if (ssock) {
+ release_sock(sock->sk);
+ return inet_shutdown(ssock, how);
+ }
if (how == SHUT_WR || how == SHUT_RDWR)
inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
@@ -1326,6 +1908,7 @@ static int mptcp_shutdown(struct socket *sock, int how)
sock->state = SS_CONNECTED;
}
+ __mptcp_flush_join_list(msk);
mptcp_for_each_subflow(msk, subflow) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
@@ -1376,7 +1959,11 @@ void mptcp_proto_init(void)
{
mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo;
+ if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
+ panic("Failed to allocate MPTCP pcpu counter\n");
+
mptcp_subflow_init();
+ mptcp_pm_init();
if (proto_register(&mptcp_prot, 1) != 0)
panic("Failed to register MPTCP proto.\n");
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index eb3f65264a40..67448002a2d7 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -17,6 +17,12 @@
#define OPTION_MPTCP_MPC_SYN BIT(0)
#define OPTION_MPTCP_MPC_SYNACK BIT(1)
#define OPTION_MPTCP_MPC_ACK BIT(2)
+#define OPTION_MPTCP_MPJ_SYN BIT(3)
+#define OPTION_MPTCP_MPJ_SYNACK BIT(4)
+#define OPTION_MPTCP_MPJ_ACK BIT(5)
+#define OPTION_MPTCP_ADD_ADDR BIT(6)
+#define OPTION_MPTCP_ADD_ADDR6 BIT(7)
+#define OPTION_MPTCP_RM_ADDR BIT(8)
/* MPTCP option subtypes */
#define MPTCPOPT_MP_CAPABLE 0
@@ -33,12 +39,30 @@
#define TCPOLEN_MPTCP_MPC_SYNACK 12
#define TCPOLEN_MPTCP_MPC_ACK 20
#define TCPOLEN_MPTCP_MPC_ACK_DATA 22
+#define TCPOLEN_MPTCP_MPJ_SYN 12
+#define TCPOLEN_MPTCP_MPJ_SYNACK 16
+#define TCPOLEN_MPTCP_MPJ_ACK 24
#define TCPOLEN_MPTCP_DSS_BASE 4
#define TCPOLEN_MPTCP_DSS_ACK32 4
#define TCPOLEN_MPTCP_DSS_ACK64 8
#define TCPOLEN_MPTCP_DSS_MAP32 10
#define TCPOLEN_MPTCP_DSS_MAP64 14
#define TCPOLEN_MPTCP_DSS_CHECKSUM 2
+#define TCPOLEN_MPTCP_ADD_ADDR 16
+#define TCPOLEN_MPTCP_ADD_ADDR_PORT 18
+#define TCPOLEN_MPTCP_ADD_ADDR_BASE 8
+#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10
+#define TCPOLEN_MPTCP_ADD_ADDR6 28
+#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30
+#define TCPOLEN_MPTCP_ADD_ADDR6_BASE 20
+#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22
+#define TCPOLEN_MPTCP_PORT_LEN 2
+#define TCPOLEN_MPTCP_RM_ADDR_BASE 4
+
+/* MPTCP MP_JOIN flags */
+#define MPTCPOPT_BACKUP BIT(0)
+#define MPTCPOPT_HMAC_LEN 20
+#define MPTCPOPT_THMAC_LEN 8
/* MPTCP MP_CAPABLE flags */
#define MPTCP_VERSION_MASK (0x0F)
@@ -55,9 +79,76 @@
#define MPTCP_DSS_HAS_ACK BIT(0)
#define MPTCP_DSS_FLAG_MASK (0x1F)
+/* MPTCP ADD_ADDR flags */
+#define MPTCP_ADDR_ECHO BIT(0)
+#define MPTCP_ADDR_HMAC_LEN 20
+#define MPTCP_ADDR_IPVERSION_4 4
+#define MPTCP_ADDR_IPVERSION_6 6
+
/* MPTCP socket flags */
#define MPTCP_DATA_READY 0
#define MPTCP_SEND_SPACE 1
+#define MPTCP_WORK_RTX 2
+#define MPTCP_WORK_EOF 3
+
+static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)
+{
+ return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) |
+ ((nib & 0xF) << 8) | field);
+}
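A worked example of the packing (editorial; TCPOPT_MPTCP is 30 in the TCP option space, MPTCPOPT_MP_CAPABLE is 0 per the defines above):
/*
 * mptcp_option(MPTCPOPT_MP_CAPABLE, 4, 1, 0)
 *   == htonl((30 << 24) | (4 << 16) | (0 << 12) | (1 << 8) | 0)
 *   == htonl(0x1e040100)
 * i.e. kind 30, length 4, subtype MP_CAPABLE, version nibble 1, no flags.
 */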
+
+#define MPTCP_PM_MAX_ADDR 4
+
+struct mptcp_addr_info {
+ sa_family_t family;
+ __be16 port;
+ u8 id;
+ union {
+ struct in_addr addr;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ struct in6_addr addr6;
+#endif
+ };
+};
+
+enum mptcp_pm_status {
+ MPTCP_PM_ADD_ADDR_RECEIVED,
+ MPTCP_PM_ESTABLISHED,
+ MPTCP_PM_SUBFLOW_ESTABLISHED,
+};
+
+struct mptcp_pm_data {
+ struct mptcp_addr_info local;
+ struct mptcp_addr_info remote;
+
+ spinlock_t lock; /* protects the whole PM data */
+
+ bool addr_signal;
+ bool server_side;
+ bool work_pending;
+ bool accept_addr;
+ bool accept_subflow;
+ u8 add_addr_signaled;
+ u8 add_addr_accepted;
+ u8 local_addr_used;
+ u8 subflows;
+ u8 add_addr_signal_max;
+ u8 add_addr_accept_max;
+ u8 local_addr_max;
+ u8 subflows_max;
+ u8 status;
+
+ struct work_struct work;
+};
+
+struct mptcp_data_frag {
+ struct list_head list;
+ u64 data_seq;
+ int data_len;
+ int offset;
+ int overhead;
+ struct page *page;
+};
/* MPTCP connection sock */
struct mptcp_sock {
@@ -67,14 +158,20 @@ struct mptcp_sock {
u64 remote_key;
u64 write_seq;
u64 ack_seq;
+ atomic64_t snd_una;
+ unsigned long timer_ival;
u32 token;
unsigned long flags;
bool can_ack;
+ spinlock_t join_list_lock;
struct work_struct work;
struct list_head conn_list;
+ struct list_head rtx_queue;
+ struct list_head join_list;
struct skb_ext *cached_ext; /* for the next sendmsg */
struct socket *subflow; /* outgoing connect/listener/!mp_capable */
struct sock *first;
+ struct mptcp_pm_data pm;
};
#define mptcp_for_each_subflow(__msk, __subflow) \
@@ -85,17 +182,42 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk)
return (struct mptcp_sock *)sk;
}
+static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (list_empty(&msk->rtx_queue))
+ return NULL;
+
+ return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
+}
+
+static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (list_empty(&msk->rtx_queue))
+ return NULL;
+
+ return list_first_entry(&msk->rtx_queue, struct mptcp_data_frag, list);
+}
+
struct mptcp_subflow_request_sock {
struct tcp_request_sock sk;
u16 mp_capable : 1,
mp_join : 1,
backup : 1,
remote_key_valid : 1;
+ u8 local_id;
+ u8 remote_id;
u64 local_key;
u64 remote_key;
u64 idsn;
u32 token;
u32 ssn_offset;
+ u64 thmac;
+ u32 local_nonce;
+ u32 remote_nonce;
};
static inline struct mptcp_subflow_request_sock *
@@ -118,16 +240,28 @@ struct mptcp_subflow_context {
u32 ssn_offset;
u32 map_data_len;
u32 request_mptcp : 1, /* send MP_CAPABLE */
+ request_join : 1, /* send MP_JOIN */
+ request_bkup : 1,
mp_capable : 1, /* remote is MPTCP capable */
+ mp_join : 1, /* remote is JOINing */
fully_established : 1, /* path validated */
+ pm_notified : 1, /* PM hook called for established status */
conn_finished : 1,
map_valid : 1,
mpc_map : 1,
+ backup : 1,
data_avail : 1,
rx_eof : 1,
data_fin_tx_enable : 1,
can_ack : 1; /* only after processing the remote key */
u64 data_fin_tx_seq;
+ u32 remote_nonce;
+ u64 thmac;
+ u32 local_nonce;
+ u32 remote_token;
+ u8 hmac[MPTCPOPT_HMAC_LEN];
+ u8 local_id;
+ u8 remote_id;
struct sock *tcp_sock; /* tcp sk backpointer */
struct sock *conn; /* parent mptcp_sock */
@@ -171,6 +305,11 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow)
int mptcp_is_enabled(struct net *net);
bool mptcp_subflow_data_available(struct sock *sk);
void mptcp_subflow_init(void);
+
+/* called with sk socket lock held */
+int __mptcp_subflow_connect(struct sock *sk, int ifindex,
+ const struct mptcp_addr_info *loc,
+ const struct mptcp_addr_info *remote);
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock);
static inline void mptcp_subflow_tcp_fallback(struct sock *sk,
@@ -199,11 +338,15 @@ void mptcp_get_options(const struct sk_buff *skb,
void mptcp_finish_connect(struct sock *sk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
+bool mptcp_finish_join(struct sock *sk);
+void mptcp_data_acked(struct sock *sk);
+void mptcp_subflow_eof(struct sock *sk);
int mptcp_token_new_request(struct request_sock *req);
void mptcp_token_destroy_request(u32 token);
int mptcp_token_new_connect(struct sock *sk);
int mptcp_token_new_accept(u32 token, struct sock *conn);
+struct mptcp_sock *mptcp_token_get_sock(u32 token);
void mptcp_token_destroy(u32 token);
void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn);
@@ -219,8 +362,48 @@ static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn)
mptcp_crypto_key_sha(*key, token, idsn);
}
-void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
- void *hash_out);
+void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac);
+
+void mptcp_pm_init(void);
+void mptcp_pm_data_init(struct mptcp_sock *msk);
+void mptcp_pm_close(struct mptcp_sock *msk);
+void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side);
+void mptcp_pm_fully_established(struct mptcp_sock *msk);
+bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk);
+void mptcp_pm_connection_closed(struct mptcp_sock *msk);
+void mptcp_pm_subflow_established(struct mptcp_sock *msk,
+ struct mptcp_subflow_context *subflow);
+void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id);
+void mptcp_pm_add_addr_received(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr);
+
+int mptcp_pm_announce_addr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr);
+int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id);
+int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id);
+
+static inline bool mptcp_pm_should_signal(struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->pm.addr_signal);
+}
+
+static inline unsigned int mptcp_add_addr_len(int family)
+{
+ if (family == AF_INET)
+ return TCPOLEN_MPTCP_ADD_ADDR;
+ return TCPOLEN_MPTCP_ADD_ADDR6;
+}
+
+bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
+ struct mptcp_addr_info *saddr);
+int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
+
+void mptcp_pm_nl_init(void);
+void mptcp_pm_nl_data_init(struct mptcp_sock *msk);
+void mptcp_pm_nl_fully_established(struct mptcp_sock *msk);
+void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk);
+void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk);
+int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc);
static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb)
{
@@ -234,4 +417,6 @@ static inline bool before64(__u64 seq1, __u64 seq2)
#define after64(seq2, seq1) before64(seq1, seq2)
+void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops);
+
#endif /* __MPTCP_PROTOCOL_H */
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 5bae12da2769..50a8bea987c6 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -9,6 +9,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
+#include <crypto/algapi.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
@@ -19,17 +20,42 @@
#endif
#include <net/mptcp.h>
#include "protocol.h"
+#include "mib.h"
+
+static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
+ enum linux_mptcp_mib_field field)
+{
+ MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
+}
static int subflow_rebuild_header(struct sock *sk)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
- int err = 0;
+ int local_id, err = 0;
if (subflow->request_mptcp && !subflow->token) {
pr_debug("subflow=%p", sk);
err = mptcp_token_new_connect(sk);
+ } else if (subflow->request_join && !subflow->local_nonce) {
+ struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn;
+
+ pr_debug("subflow=%p", sk);
+
+ do {
+ get_random_bytes(&subflow->local_nonce, sizeof(u32));
+ } while (!subflow->local_nonce);
+
+ if (subflow->local_id)
+ goto out;
+
+ local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
+ if (local_id < 0)
+ return -EINVAL;
+
+ subflow->local_id = local_id;
}
+out:
if (err)
return err;
@@ -47,6 +73,51 @@ static void subflow_req_destructor(struct request_sock *req)
tcp_request_sock_ops.destructor(req);
}
+static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
+ void *hmac)
+{
+ u8 msg[8];
+
+ put_unaligned_be32(nonce1, &msg[0]);
+ put_unaligned_be32(nonce2, &msg[4]);
+
+ mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
+}
+
+/* validate received token and create truncated hmac and nonce for SYN-ACK */
+static bool subflow_token_join_request(struct request_sock *req,
+ const struct sk_buff *skb)
+{
+ struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
+ u8 hmac[MPTCPOPT_HMAC_LEN];
+ struct mptcp_sock *msk;
+ int local_id;
+
+ msk = mptcp_token_get_sock(subflow_req->token);
+ if (!msk) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
+ return false;
+ }
+
+ local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
+ if (local_id < 0) {
+ sock_put((struct sock *)msk);
+ return false;
+ }
+ subflow_req->local_id = local_id;
+
+ get_random_bytes(&subflow_req->local_nonce, sizeof(u32));
+
+ subflow_generate_hmac(msk->local_key, msk->remote_key,
+ subflow_req->local_nonce,
+ subflow_req->remote_nonce, hmac);
+
+ subflow_req->thmac = get_unaligned_be64(hmac);
+
+ sock_put((struct sock *)msk);
+ return true;
+}
+
static void subflow_init_req(struct request_sock *req,
const struct sock *sk_listener,
struct sk_buff *skb)
@@ -61,6 +132,7 @@ static void subflow_init_req(struct request_sock *req,
mptcp_get_options(skb, &rx_opt);
subflow_req->mp_capable = 0;
+ subflow_req->mp_join = 0;
subflow_req->remote_key_valid = 0;
#ifdef CONFIG_TCP_MD5SIG
@@ -71,6 +143,15 @@ static void subflow_init_req(struct request_sock *req,
return;
#endif
+ if (rx_opt.mptcp.mp_capable) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
+
+ if (rx_opt.mptcp.mp_join)
+ return;
+ } else if (rx_opt.mptcp.mp_join) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
+ }
+
if (rx_opt.mptcp.mp_capable && listener->request_mptcp) {
int err;
@@ -79,6 +160,19 @@ static void subflow_init_req(struct request_sock *req,
subflow_req->mp_capable = 1;
subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
+ } else if (rx_opt.mptcp.mp_join && listener->request_mptcp) {
+ subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
+ subflow_req->mp_join = 1;
+ subflow_req->backup = rx_opt.mptcp.backup;
+ subflow_req->remote_id = rx_opt.mptcp.join_id;
+ subflow_req->token = rx_opt.mptcp.token;
+ subflow_req->remote_nonce = rx_opt.mptcp.nonce;
+ pr_debug("token=%u, remote_nonce=%u", subflow_req->token,
+ subflow_req->remote_nonce);
+ if (!subflow_token_join_request(req, skb)) {
+ subflow_req->mp_join = 0;
+ /* @@ need to trigger RST */
+ }
}
}
@@ -106,6 +200,25 @@ static void subflow_v6_init_req(struct request_sock *req,
}
#endif
+/* validate received truncated hmac and create hmac for third ACK */
+static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
+{
+ u8 hmac[MPTCPOPT_HMAC_LEN];
+ u64 thmac;
+
+ subflow_generate_hmac(subflow->remote_key, subflow->local_key,
+ subflow->remote_nonce, subflow->local_nonce,
+ hmac);
+
+ thmac = get_unaligned_be64(hmac);
+ pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
+ subflow, subflow->token,
+ (unsigned long long)thmac,
+ (unsigned long long)subflow->thmac);
+
+ return thmac == subflow->thmac;
+}
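The key/nonce ordering used here and in subflow_hmac_valid() below follows the RFC 6824 MP_JOIN exchange; schematically (editorial summary):
/*
 *  initiator (A)                          listener (B)
 *  SYN + MP_JOIN(token-B, R-A)       --->
 *                                    <--- SYN/ACK + MP_JOIN(thmac, R-B)
 *        thmac = trunc64(HMAC(key-B || key-A, R-B || R-A))
 *  ACK + MP_JOIN(HMAC(key-A || key-B, R-A || R-B))  --->
 */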
+
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
@@ -118,7 +231,10 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
parent->sk_state_change(parent);
}
- if (!subflow->conn_finished) {
+ if (subflow->conn_finished || !tcp_sk(sk)->is_mptcp)
+ return;
+
+ if (subflow->mp_capable) {
pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
subflow->remote_key);
mptcp_finish_connect(sk);
@@ -128,6 +244,33 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
}
+ } else if (subflow->mp_join) {
+ pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u",
+ subflow, subflow->thmac,
+ subflow->remote_nonce);
+ if (!subflow_thmac_valid(subflow)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
+ subflow->mp_join = 0;
+ goto do_reset;
+ }
+
+ subflow_generate_hmac(subflow->local_key, subflow->remote_key,
+ subflow->local_nonce,
+ subflow->remote_nonce,
+ subflow->hmac);
+
+ if (skb)
+ subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
+
+ if (!mptcp_finish_join(sk))
+ goto do_reset;
+
+ subflow->conn_finished = 1;
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);
+ } else {
+do_reset:
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_done(sk);
}
}
@@ -178,6 +321,32 @@ drop:
}
#endif
+/* validate hmac received in third ACK */
+static bool subflow_hmac_valid(const struct request_sock *req,
+ const struct tcp_options_received *rx_opt)
+{
+ const struct mptcp_subflow_request_sock *subflow_req;
+ u8 hmac[MPTCPOPT_HMAC_LEN];
+ struct mptcp_sock *msk;
+ bool ret;
+
+ subflow_req = mptcp_subflow_rsk(req);
+ msk = mptcp_token_get_sock(subflow_req->token);
+ if (!msk)
+ return false;
+
+ subflow_generate_hmac(msk->remote_key, msk->local_key,
+ subflow_req->remote_nonce,
+ subflow_req->local_nonce, hmac);
+
+ ret = true;
+ if (crypto_memneq(hmac, rx_opt->mptcp.hmac, sizeof(hmac)))
+ ret = false;
+
+ sock_put((struct sock *)msk);
+ return ret;
+}
+
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
struct sk_buff *skb,
struct request_sock *req,
@@ -188,6 +357,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
struct mptcp_subflow_request_sock *subflow_req;
struct tcp_options_received opt_rx;
+ bool fallback_is_fatal = false;
struct sock *new_msk = NULL;
struct sock *child;
@@ -221,6 +391,15 @@ create_msk:
new_msk = mptcp_sk_clone(listener->conn, req);
if (!new_msk)
subflow_req->mp_capable = 0;
+ } else if (subflow_req->mp_join) {
+ fallback_is_fatal = true;
+ opt_rx.mptcp.mp_join = 0;
+ mptcp_get_options(skb, &opt_rx);
+ if (!opt_rx.mptcp.mp_join ||
+ !subflow_hmac_valid(req, &opt_rx)) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
+ return NULL;
+ }
}
create_child:
@@ -230,20 +409,35 @@ create_child:
if (child && *own_req) {
struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);
- /* we have null ctx on TCP fallback, not fatal on MPC
- * handshake
+ /* we have null ctx on TCP fallback, which is fatal on
+ * MPJ handshake
*/
- if (!ctx)
+ if (!ctx) {
+ if (fallback_is_fatal)
+ goto close_child;
goto out;
+ }
if (ctx->mp_capable) {
/* new mpc subflow takes ownership of the newly
* created mptcp socket
*/
- inet_sk_state_store((struct sock *)new_msk,
- TCP_ESTABLISHED);
+ inet_sk_state_store(new_msk, TCP_ESTABLISHED);
+ mptcp_pm_new_connection(mptcp_sk(new_msk), 1);
ctx->conn = new_msk;
new_msk = NULL;
+ } else if (ctx->mp_join) {
+ struct mptcp_sock *owner;
+
+ owner = mptcp_token_get_sock(ctx->token);
+ if (!owner)
+ goto close_child;
+
+ ctx->conn = (struct sock *)owner;
+ if (!mptcp_finish_join(child))
+ goto close_child;
+
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
}
}
@@ -252,6 +446,12 @@ out:
if (unlikely(new_msk))
sock_put(new_msk);
return child;
+
+close_child:
+ tcp_send_active_reset(child, GFP_ATOMIC);
+ inet_csk_prepare_forced_close(child);
+ tcp_done(child);
+ return NULL;
}
static struct inet_connection_sock_af_ops subflow_specific;
@@ -353,6 +553,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
data_len = mpext->data_len;
if (data_len == 0) {
pr_err("Infinite mapping not handled");
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
return MAPPING_INVALID;
}
@@ -396,8 +597,10 @@ static enum mapping_status get_mapping_status(struct sock *ssk)
/* If this skb's data is fully covered by the current mapping,
* the new map would need caching, which is not supported
*/
- if (skb_is_fully_mapped(ssk, skb))
+ if (skb_is_fully_mapped(ssk, skb)) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
return MAPPING_INVALID;
+ }
/* will validate the next map after consuming the current one */
return MAPPING_OK;
@@ -566,7 +769,7 @@ static void subflow_data_ready(struct sock *sk)
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
struct sock *parent = subflow->conn;
- if (!subflow->mp_capable) {
+ if (!subflow->mp_capable && !subflow->mp_join) {
subflow->tcp_data_ready(sk);
parent->sk_data_ready(parent);
@@ -621,6 +824,85 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
}
#endif
+static void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
+ struct sockaddr_storage *addr)
+{
+ memset(addr, 0, sizeof(*addr));
+ addr->ss_family = info->family;
+ if (addr->ss_family == AF_INET) {
+ struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;
+
+ in_addr->sin_addr = info->addr;
+ in_addr->sin_port = info->port;
+ }
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ else if (addr->ss_family == AF_INET6) {
+ struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;
+
+ in6_addr->sin6_addr = info->addr6;
+ in6_addr->sin6_port = info->port;
+ }
+#endif
+}
+
+int __mptcp_subflow_connect(struct sock *sk, int ifindex,
+ const struct mptcp_addr_info *loc,
+ const struct mptcp_addr_info *remote)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct mptcp_subflow_context *subflow;
+ struct sockaddr_storage addr;
+ struct socket *sf;
+ u32 remote_token;
+ int addrlen;
+ int err;
+
+ if (sk->sk_state != TCP_ESTABLISHED)
+ return -ENOTCONN;
+
+ err = mptcp_subflow_create_socket(sk, &sf);
+ if (err)
+ return err;
+
+ subflow = mptcp_subflow_ctx(sf->sk);
+ subflow->remote_key = msk->remote_key;
+ subflow->local_key = msk->local_key;
+ subflow->token = msk->token;
+ mptcp_info2sockaddr(loc, &addr);
+
+ addrlen = sizeof(struct sockaddr_in);
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ if (loc->family == AF_INET6)
+ addrlen = sizeof(struct sockaddr_in6);
+#endif
+ sf->sk->sk_bound_dev_if = ifindex;
+ err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
+ if (err)
+ goto failed;
+
+ mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
+ pr_debug("msk=%p remote_token=%u", msk, remote_token);
+ subflow->remote_token = remote_token;
+ subflow->local_id = loc->id;
+ subflow->request_join = 1;
+ subflow->request_bkup = 1;
+ mptcp_info2sockaddr(remote, &addr);
+
+ err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
+ if (err && err != -EINPROGRESS)
+ goto failed;
+
+ spin_lock_bh(&msk->join_list_lock);
+ list_add_tail(&subflow->node, &msk->join_list);
+ spin_unlock_bh(&msk->join_list_lock);
+
+ return err;
+
+failed:
+ sock_release(sf);
+ return err;
+}
+
int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
{
struct mptcp_subflow_context *subflow;
@@ -712,8 +994,7 @@ static void subflow_state_change(struct sock *sk)
if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&
!subflow->rx_eof && subflow_is_done(sk)) {
subflow->rx_eof = 1;
- parent->sk_shutdown |= RCV_SHUTDOWN;
- __subflow_state_change(parent);
+ mptcp_subflow_eof(parent);
}
}
@@ -785,7 +1066,8 @@ static void subflow_ulp_clone(const struct request_sock *req,
struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
struct mptcp_subflow_context *new_ctx;
- if (!tcp_rsk(req)->is_mptcp || !subflow_req->mp_capable) {
+ if (!tcp_rsk(req)->is_mptcp ||
+ (!subflow_req->mp_capable && !subflow_req->mp_join)) {
subflow_ulp_fallback(newsk, old_ctx);
return;
}
@@ -796,9 +1078,6 @@ static void subflow_ulp_clone(const struct request_sock *req,
return;
}
- /* see comments in subflow_syn_recv_sock(), MPTCP connection is fully
- * established only after we receive the remote key
- */
new_ctx->conn_finished = 1;
new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
@@ -807,14 +1086,27 @@ static void subflow_ulp_clone(const struct request_sock *req,
new_ctx->rel_write_seq = 1;
new_ctx->tcp_sock = newsk;
- new_ctx->mp_capable = 1;
- new_ctx->fully_established = subflow_req->remote_key_valid;
- new_ctx->can_ack = subflow_req->remote_key_valid;
- new_ctx->remote_key = subflow_req->remote_key;
- new_ctx->local_key = subflow_req->local_key;
- new_ctx->token = subflow_req->token;
- new_ctx->ssn_offset = subflow_req->ssn_offset;
- new_ctx->idsn = subflow_req->idsn;
+ if (subflow_req->mp_capable) {
+ /* see comments in subflow_syn_recv_sock(), MPTCP connection
+ * is fully established only after we receive the remote key
+ */
+ new_ctx->mp_capable = 1;
+ new_ctx->fully_established = subflow_req->remote_key_valid;
+ new_ctx->can_ack = subflow_req->remote_key_valid;
+ new_ctx->remote_key = subflow_req->remote_key;
+ new_ctx->local_key = subflow_req->local_key;
+ new_ctx->token = subflow_req->token;
+ new_ctx->ssn_offset = subflow_req->ssn_offset;
+ new_ctx->idsn = subflow_req->idsn;
+ } else if (subflow_req->mp_join) {
+ new_ctx->ssn_offset = subflow_req->ssn_offset;
+ new_ctx->mp_join = 1;
+ new_ctx->fully_established = 1;
+ new_ctx->backup = subflow_req->backup;
+ new_ctx->local_id = subflow_req->local_id;
+ new_ctx->token = subflow_req->token;
+ new_ctx->thmac = subflow_req->thmac;
+ }
}
static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
@@ -876,6 +1168,8 @@ void mptcp_subflow_init(void)
subflow_v6m_specific.net_frag_header_len = 0;
#endif
+ mptcp_diag_subflow_init(&subflow_ulp_ops);
+
if (tcp_register_ulp(&subflow_ulp_ops) != 0)
panic("MPTCP: failed to register subflows to ULP\n");
}
diff --git a/net/mptcp/token.c b/net/mptcp/token.c
index b71b53c0ac8d..33352dd99d4d 100644
--- a/net/mptcp/token.c
+++ b/net/mptcp/token.c
@@ -40,7 +40,7 @@ static int token_used __read_mostly;
/**
* mptcp_token_new_request - create new key/idsn/token for subflow_request
- * @req - the request socket
+ * @req: the request socket
*
* This function is called when a new mptcp connection is coming in.
*
@@ -80,7 +80,7 @@ int mptcp_token_new_request(struct request_sock *req)
/**
* mptcp_token_new_connect - create new key/idsn/token for subflow
- * @sk - the socket that will initiate a connection
+ * @sk: the socket that will initiate a connection
*
* This function is called when a new outgoing mptcp connection is
* initiated.
@@ -125,6 +125,7 @@ int mptcp_token_new_connect(struct sock *sk)
/**
* mptcp_token_new_accept - insert token for later processing
* @token: the token to insert to the tree
+ * @conn: the just cloned socket linked to the new connection
*
* Called when a SYN packet creates a new logical connection, i.e.
* is not a join request.
@@ -141,8 +142,35 @@ int mptcp_token_new_accept(u32 token, struct sock *conn)
}
/**
+ * mptcp_token_get_sock - retrieve mptcp connection sock using its token
+ * @token: token of the mptcp connection to retrieve
+ *
+ * This function returns the mptcp connection structure with the given token.
+ * A reference count on the mptcp socket returned is taken.
+ *
+ * returns NULL if no connection with the given token value exists.
+ */
+struct mptcp_sock *mptcp_token_get_sock(u32 token)
+{
+ struct sock *conn;
+
+ spin_lock_bh(&token_tree_lock);
+ conn = radix_tree_lookup(&token_tree, token);
+ if (conn) {
+ /* token still reserved? */
+ if (conn == (struct sock *)&token_used)
+ conn = NULL;
+ else
+ sock_hold(conn);
+ }
+ spin_unlock_bh(&token_tree_lock);
+
+ return mptcp_sk(conn);
+}
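The lookup hands its reference to the caller; a minimal usage sketch (mirroring subflow_token_join_request() earlier in this patch):
struct mptcp_sock *msk = mptcp_token_get_sock(token);

if (!msk)
        return false;   /* unknown token, or slot still only reserved */
/* ... read msk->local_key / msk->remote_key under this reference ... */
sock_put((struct sock *)msk);   /* drop the reference taken by the lookup */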
+
+/**
* mptcp_token_destroy_request - remove mptcp connection/token
- * @token - token of mptcp connection to remove
+ * @token: token of mptcp connection to remove
*
* Remove not-yet-fully-established incoming connection identified
* by @token.
@@ -156,7 +184,7 @@ void mptcp_token_destroy_request(u32 token)
/**
* mptcp_token_destroy - remove mptcp connection/token
- * @token - token of mptcp connection to remove
+ * @token: token of mptcp connection to remove
*
* Remove the connection identified by @token.
*/
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 292e71dc7ba4..0e0ded87e27b 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -83,7 +83,7 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nft_set_pipapo.o
ifdef CONFIG_X86_64
-ifneq (,$(findstring -DCONFIG_AS_AVX2=1,$(KBUILD_CFLAGS)))
+ifndef CONFIG_UML
nf_tables-objs += nft_set_pipapo_avx2.o
endif
endif
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 512259f579d7..aa6a603a2425 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1661,8 +1661,9 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
unsigned int offset, offset2, ihl, verdict;
- bool ipip, new_cp = false;
+ bool tunnel, new_cp = false;
union nf_inet_addr *raddr;
+ char *outer_proto = "IPIP";
*related = 1;
@@ -1703,8 +1704,8 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
return NF_ACCEPT; /* The packet looks wrong, ignore */
raddr = (union nf_inet_addr *)&cih->daddr;
- /* Special case for errors for IPIP packets */
- ipip = false;
+ /* Special case for errors for IPIP/UDP/GRE tunnel packets */
+ tunnel = false;
if (cih->protocol == IPPROTO_IPIP) {
struct ip_vs_dest *dest;
@@ -1721,7 +1722,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
if (cih == NULL)
return NF_ACCEPT; /* The packet looks wrong, ignore */
- ipip = true;
+ tunnel = true;
} else if ((cih->protocol == IPPROTO_UDP || /* Can be UDP encap */
cih->protocol == IPPROTO_GRE) && /* Can be GRE encap */
/* Error for our tunnel must arrive at LOCAL_IN */
@@ -1729,16 +1730,19 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
__u8 iproto;
int ulen;
- /* Non-first fragment has no UDP header */
+ /* Non-first fragment has no UDP/GRE header */
if (unlikely(cih->frag_off & htons(IP_OFFSET)))
return NF_ACCEPT;
offset2 = offset + cih->ihl * 4;
- if (cih->protocol == IPPROTO_UDP)
+ if (cih->protocol == IPPROTO_UDP) {
ulen = ipvs_udp_decap(ipvs, skb, offset2, AF_INET,
raddr, &iproto);
- else
+ outer_proto = "UDP";
+ } else {
ulen = ipvs_gre_decap(ipvs, skb, offset2, AF_INET,
raddr, &iproto);
+ outer_proto = "GRE";
+ }
if (ulen > 0) {
/* Skip IP and UDP/GRE tunnel headers */
offset = offset2 + ulen;
@@ -1747,7 +1751,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
&_ciph);
if (cih && cih->version == 4 && cih->ihl >= 5 &&
iproto == IPPROTO_IPIP)
- ipip = true;
+ tunnel = true;
else
return NF_ACCEPT;
}
@@ -1767,11 +1771,11 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
"Checking incoming ICMP for");
offset2 = offset;
- ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !ipip, &ciph);
+ ip_vs_fill_iph_skb_icmp(AF_INET, skb, offset, !tunnel, &ciph);
offset = ciph.len;
/* The embedded headers contain source and dest in reverse order.
- * For IPIP this is error for request, not for reply.
+ * For IPIP/UDP/GRE tunnel this is error for request, not for reply.
*/
cp = INDIRECT_CALL_1(pp->conn_in_get, ip_vs_conn_in_get_proto,
ipvs, AF_INET, skb, &ciph);
@@ -1779,7 +1783,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
if (!cp) {
int v;
- if (ipip || !sysctl_schedule_icmp(ipvs))
+ if (tunnel || !sysctl_schedule_icmp(ipvs))
return NF_ACCEPT;
if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph))
@@ -1797,7 +1801,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
goto out;
}
- if (ipip) {
+ if (tunnel) {
__be32 info = ic->un.gateway;
__u8 type = ic->type;
__u8 code = ic->code;
@@ -1809,17 +1813,18 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
u32 mtu = ntohs(ic->un.frag.mtu);
__be16 frag_off = cih->frag_off;
- /* Strip outer IP and ICMP, go to IPIP header */
+ /* Strip outer IP and ICMP, go to IPIP/UDP/GRE header */
if (pskb_pull(skb, ihl + sizeof(_icmph)) == NULL)
- goto ignore_ipip;
+ goto ignore_tunnel;
offset2 -= ihl + sizeof(_icmph);
skb_reset_network_header(skb);
- IP_VS_DBG(12, "ICMP for IPIP %pI4->%pI4: mtu=%u\n",
- &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, mtu);
+ IP_VS_DBG(12, "ICMP for %s %pI4->%pI4: mtu=%u\n",
+ outer_proto, &ip_hdr(skb)->saddr,
+ &ip_hdr(skb)->daddr, mtu);
ipv4_update_pmtu(skb, ipvs->net, mtu, 0, 0);
/* Client uses PMTUD? */
if (!(frag_off & htons(IP_DF)))
- goto ignore_ipip;
+ goto ignore_tunnel;
/* Prefer the resulting PMTU */
if (dest) {
struct ip_vs_dest_dst *dest_dst;
@@ -1832,11 +1837,11 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
mtu -= sizeof(struct iphdr);
info = htonl(mtu);
}
- /* Strip outer IP, ICMP and IPIP, go to IP header of
+ /* Strip outer IP, ICMP and IPIP/UDP/GRE, go to IP header of
* original request.
*/
if (pskb_pull(skb, offset2) == NULL)
- goto ignore_ipip;
+ goto ignore_tunnel;
skb_reset_network_header(skb);
IP_VS_DBG(12, "Sending ICMP for %pI4->%pI4: t=%u, c=%u, i=%u\n",
&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
@@ -1845,7 +1850,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
/* ICMP can be shorter but anyways, account it */
ip_vs_out_stats(cp, skb);
-ignore_ipip:
+ignore_tunnel:
consume_skb(skb);
verdict = NF_STOLEN;
goto out;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index a18f8fe728e3..c4582eb71766 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -143,6 +143,7 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
}
static void nf_conntrack_all_lock(void)
+ __acquires(&nf_conntrack_locks_all_lock)
{
int i;
@@ -162,6 +163,7 @@ static void nf_conntrack_all_lock(void)
}
static void nf_conntrack_all_unlock(void)
+ __releases(&nf_conntrack_locks_all_lock)
{
/* All prior stores must be complete before we clear
* 'nf_conntrack_locks_all'. Otherwise nf_conntrack_lock()
@@ -863,9 +865,8 @@ out:
}
EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
-static inline void nf_ct_acct_update(struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int len)
+void nf_ct_acct_add(struct nf_conn *ct, u32 dir, unsigned int packets,
+ unsigned int bytes)
{
struct nf_conn_acct *acct;
@@ -873,10 +874,11 @@ static inline void nf_ct_acct_update(struct nf_conn *ct,
if (acct) {
struct nf_conn_counter *counter = acct->counter;
- atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets);
- atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes);
+ atomic64_add(packets, &counter[dir].packets);
+ atomic64_add(bytes, &counter[dir].bytes);
}
}
+EXPORT_SYMBOL_GPL(nf_ct_acct_add);
static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
const struct nf_conn *loser_ct)
@@ -890,7 +892,7 @@ static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
/* u32 should be fine since we must have seen one packet. */
bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes);
- nf_ct_acct_update(ct, ctinfo, bytes);
+ nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
}
}
@@ -1931,7 +1933,7 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
WRITE_ONCE(ct->timeout, extra_jiffies);
acct:
if (do_acct)
- nf_ct_acct_update(ct, ctinfo, skb->len);
+ nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
@@ -1939,7 +1941,7 @@ bool nf_ct_kill_acct(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
const struct sk_buff *skb)
{
- nf_ct_acct_update(ct, ctinfo, skb->len);
+ nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
return nf_ct_delete(ct, 0, 0);
}
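
Note that the callers left in this file still use a three-argument nf_ct_acct_update(); that wrapper presumably moves to the nf_conntrack_acct.h header (outside this net/ diffstat), along these lines:

	/* Assumed header-side wrapper: account one packet of @bytes in @dir. */
	static inline void nf_ct_acct_update(struct nf_conn *ct, u32 dir,
					     unsigned int bytes)
	{
		nf_ct_acct_add(ct, dir, 1, bytes);
	}

This keeps the common per-packet paths unchanged while letting the flowtable offload code further down report batched packet/byte deltas through nf_ct_acct_add() directly.
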
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 6a1c8f1f6171..9ddfcd002d3b 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -860,7 +860,7 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family)
struct ctnetlink_filter *filter;
#ifndef CONFIG_NF_CONNTRACK_MARK
- if (cda[CTA_MARK] && cda[CTA_MARK_MASK])
+ if (cda[CTA_MARK] || cda[CTA_MARK_MASK])
return ERR_PTR(-EOPNOTSUPP);
#endif
@@ -1533,6 +1533,7 @@ static int
ctnetlink_parse_nat_setup(struct nf_conn *ct,
enum nf_nat_manip_type manip,
const struct nlattr *attr)
+ __must_hold(RCU)
{
struct nf_nat_hook *nat_hook;
int err;
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 9a477bd563b7..c0cb79495c35 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -392,7 +392,7 @@ int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table,
struct flow_block_cb *block_cb;
int err = 0;
- mutex_lock(&flow_table->flow_block_lock);
+ down_write(&flow_table->flow_block_lock);
block_cb = flow_block_cb_lookup(block, cb, cb_priv);
if (block_cb) {
err = -EEXIST;
@@ -408,7 +408,7 @@ int nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table,
list_add_tail(&block_cb->list, &block->cb_list);
unlock:
- mutex_unlock(&flow_table->flow_block_lock);
+ up_write(&flow_table->flow_block_lock);
return err;
}
EXPORT_SYMBOL_GPL(nf_flow_table_offload_add_cb);
@@ -419,13 +419,13 @@ void nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table,
struct flow_block *block = &flow_table->flow_block;
struct flow_block_cb *block_cb;
- mutex_lock(&flow_table->flow_block_lock);
+ down_write(&flow_table->flow_block_lock);
block_cb = flow_block_cb_lookup(block, cb, cb_priv);
if (block_cb)
list_del(&block_cb->list);
else
WARN_ON(true);
- mutex_unlock(&flow_table->flow_block_lock);
+ up_write(&flow_table->flow_block_lock);
}
EXPORT_SYMBOL_GPL(nf_flow_table_offload_del_cb);
@@ -551,7 +551,7 @@ int nf_flow_table_init(struct nf_flowtable *flowtable)
INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
flow_block_init(&flowtable->flow_block);
- mutex_init(&flowtable->flow_block_lock);
+ init_rwsem(&flowtable->flow_block_lock);
err = rhashtable_init(&flowtable->rhashtable,
&nf_flow_offload_rhash_params);
@@ -613,8 +613,10 @@ void nf_flow_table_free(struct nf_flowtable *flow_table)
nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, flow_table);
nf_flow_table_offload_flush(flow_table);
+ if (nf_flowtable_hw_offload(flow_table))
+ nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step,
+ flow_table);
rhashtable_destroy(&flow_table->rhashtable);
- mutex_destroy(&flow_table->flow_block_lock);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);
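
The mutex-to-rw_semaphore conversion pays off because callback registration is rare while the offload path walks the callback list for every request; splitting the lock lets those walks run in parallel. A condensed sketch of the two sides of this change (registration here, traversal in nf_flow_table_offload.c below):

	/* Writer side: registration and removal mutate the list. */
	down_write(&flow_table->flow_block_lock);
	list_add_tail(&block_cb->list, &block->cb_list);
	up_write(&flow_table->flow_block_lock);

	/* Reader side: concurrent walkers only need shared access. */
	down_read(&flow_table->flow_block_lock);
	list_for_each_entry(block_cb, &block->cb_list, list)
		block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv);
	up_read(&flow_table->flow_block_lock);
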
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 5272721080f8..a3bca758b849 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -12,6 +12,7 @@
#include <net/ip6_route.h>
#include <net/neighbour.h>
#include <net/netfilter/nf_flow_table.h>
+#include <net/netfilter/nf_conntrack_acct.h>
/* For layer 4 checksum field offset. */
#include <linux/tcp.h>
#include <linux/udp.h>
@@ -146,11 +147,13 @@ static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
(nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
+ nf_flow_snat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
return -1;
+
+ iph = ip_hdr(skb);
if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
(nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
+ nf_flow_dnat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
return -1;
return 0;
@@ -189,6 +192,7 @@ static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
return -1;
+ iph = ip_hdr(skb);
ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
tuple->src_v4.s_addr = iph->saddr;
@@ -286,6 +290,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
ip_decrease_ttl(iph);
skb->tstamp = 0;
+ if (flow_table->flags & NF_FLOWTABLE_COUNTER)
+ nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
+
if (unlikely(dst_xfrm(&rt->dst))) {
memset(skb->cb, 0, sizeof(struct inet_skb_parm));
IPCB(skb)->iif = skb->dev->ifindex;
@@ -417,11 +424,13 @@ static int nf_flow_nat_ipv6(const struct flow_offload *flow,
if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
(nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
+ nf_flow_snat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0))
return -1;
+
+ ip6h = ipv6_hdr(skb);
if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
(nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
+ nf_flow_dnat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0))
return -1;
return 0;
@@ -450,6 +459,7 @@ static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
return -1;
+ ip6h = ipv6_hdr(skb);
ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
tuple->src_v6 = ip6h->saddr;
@@ -516,6 +526,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
ip6h->hop_limit--;
skb->tstamp = 0;
+ if (flow_table->flags & NF_FLOWTABLE_COUNTER)
+ nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
+
if (unlikely(dst_xfrm(&rt->dst))) {
memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
IP6CB(skb)->iif = skb->dev->ifindex;
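
All of the ip_hdr()/ipv6_hdr() reloads above follow one rule: helpers such as pskb_may_pull() or the NAT mangling routines may reallocate skb data, invalidating previously cached header pointers. The pattern, condensed (IPv4 shown; the IPv6 hunks are symmetric):

	struct iphdr *iph = ip_hdr(skb);

	if (!pskb_may_pull(skb, thoff + sizeof(*ports)))
		return -1;
	iph = ip_hdr(skb);		/* skb->data may have moved */

	if (nf_flow_snat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0)
		return -1;
	iph = ip_hdr(skb);		/* and again after NAT mangling */
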
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index ad549317af30..e3b099c14eff 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -9,12 +9,11 @@
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_tuple.h>
-static struct work_struct nf_flow_offload_work;
-static DEFINE_SPINLOCK(flow_offload_pending_list_lock);
-static LIST_HEAD(flow_offload_pending_list);
+static struct workqueue_struct *nf_flow_offload_wq;
struct flow_offload_work {
struct list_head list;
@@ -22,6 +21,7 @@ struct flow_offload_work {
int priority;
struct nf_flowtable *flowtable;
struct flow_offload *flow;
+ struct work_struct work;
};
#define NF_FLOW_DISSECTOR(__match, __type, __field) \
@@ -92,7 +92,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp);
NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp);
- if (other_dst->lwtstate) {
+ if (other_dst && other_dst->lwtstate) {
tun_info = lwt_tun_info(other_dst->lwtstate);
nf_flow_rule_lwt_match(match, tun_info);
}
@@ -120,6 +120,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
default:
return -EOPNOTSUPP;
}
+ mask->control.addr_type = 0xffff;
match->dissector.used_keys |= BIT(key->control.addr_type);
mask->basic.n_proto = 0xffff;
@@ -483,7 +484,7 @@ static void flow_offload_encap_tunnel(const struct flow_offload *flow,
struct dst_entry *dst;
dst = flow->tuplehash[dir].tuple.dst_cache;
- if (dst->lwtstate) {
+ if (dst && dst->lwtstate) {
struct ip_tunnel_info *tun_info;
tun_info = lwt_tun_info(dst->lwtstate);
@@ -503,7 +504,7 @@ static void flow_offload_decap_tunnel(const struct flow_offload *flow,
struct dst_entry *dst;
dst = flow->tuplehash[!dir].tuple.dst_cache;
- if (dst->lwtstate) {
+ if (dst && dst->lwtstate) {
struct ip_tunnel_info *tun_info;
tun_info = lwt_tun_info(dst->lwtstate);
@@ -691,7 +692,7 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable,
if (cmd == FLOW_CLS_REPLACE)
cls_flow.rule = flow_rule->rule;
- mutex_lock(&flowtable->flow_block_lock);
+ down_read(&flowtable->flow_block_lock);
list_for_each_entry(block_cb, block_cb_list, list) {
err = block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow,
block_cb->cb_priv);
@@ -700,7 +701,7 @@ static int nf_flow_offload_tuple(struct nf_flowtable *flowtable,
i++;
}
- mutex_unlock(&flowtable->flow_block_lock);
+ up_read(&flowtable->flow_block_lock);
if (cmd == FLOW_CLS_STATS)
memcpy(stats, &cls_flow.stats, sizeof(*stats));
@@ -784,19 +785,25 @@ static void flow_offload_work_stats(struct flow_offload_work *offload)
lastused = max_t(u64, stats[0].lastused, stats[1].lastused);
offload->flow->timeout = max_t(u64, offload->flow->timeout,
lastused + NF_FLOW_TIMEOUT);
+
+ if (offload->flowtable->flags & NF_FLOWTABLE_COUNTER) {
+ if (stats[0].pkts)
+ nf_ct_acct_add(offload->flow->ct,
+ FLOW_OFFLOAD_DIR_ORIGINAL,
+ stats[0].pkts, stats[0].bytes);
+ if (stats[1].pkts)
+ nf_ct_acct_add(offload->flow->ct,
+ FLOW_OFFLOAD_DIR_REPLY,
+ stats[1].pkts, stats[1].bytes);
+ }
}
static void flow_offload_work_handler(struct work_struct *work)
{
- struct flow_offload_work *offload, *next;
- LIST_HEAD(offload_pending_list);
-
- spin_lock_bh(&flow_offload_pending_list_lock);
- list_replace_init(&flow_offload_pending_list, &offload_pending_list);
- spin_unlock_bh(&flow_offload_pending_list_lock);
+ struct flow_offload_work *offload;
- list_for_each_entry_safe(offload, next, &offload_pending_list, list) {
- switch (offload->cmd) {
+ offload = container_of(work, struct flow_offload_work, work);
+ switch (offload->cmd) {
case FLOW_CLS_REPLACE:
flow_offload_work_add(offload);
break;
@@ -808,19 +815,14 @@ static void flow_offload_work_handler(struct work_struct *work)
break;
default:
WARN_ON_ONCE(1);
- }
- list_del(&offload->list);
- kfree(offload);
}
+
+ kfree(offload);
}
static void flow_offload_queue_work(struct flow_offload_work *offload)
{
- spin_lock_bh(&flow_offload_pending_list_lock);
- list_add_tail(&offload->list, &flow_offload_pending_list);
- spin_unlock_bh(&flow_offload_pending_list_lock);
-
- schedule_work(&nf_flow_offload_work);
+ queue_work(nf_flow_offload_wq, &offload->work);
}
static struct flow_offload_work *
@@ -837,6 +839,7 @@ nf_flow_offload_work_alloc(struct nf_flowtable *flowtable,
offload->flow = flow;
offload->priority = flowtable->priority;
offload->flowtable = flowtable;
+ INIT_WORK(&offload->work, flow_offload_work_handler);
return offload;
}
@@ -887,7 +890,7 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable,
void nf_flow_table_offload_flush(struct nf_flowtable *flowtable)
{
if (nf_flowtable_hw_offload(flowtable))
- flush_work(&nf_flow_offload_work);
+ flush_workqueue(nf_flow_offload_wq);
}
static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
@@ -938,7 +941,7 @@ static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo,
{
nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable,
extack);
- flow_indr_block_call(dev, bo, cmd);
+ flow_indr_block_call(dev, bo, cmd, TC_SETUP_FT);
if (list_empty(&bo->cb_list))
return -EOPNOTSUPP;
@@ -1052,7 +1055,10 @@ static struct flow_indr_block_entry block_ing_entry = {
int nf_flow_table_offload_init(void)
{
- INIT_WORK(&nf_flow_offload_work, flow_offload_work_handler);
+ nf_flow_offload_wq = alloc_workqueue("nf_flow_table_offload",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
+ if (!nf_flow_offload_wq)
+ return -ENOMEM;
flow_indr_add_block_cb(&block_ing_entry);
@@ -1061,15 +1067,6 @@ int nf_flow_table_offload_init(void)
void nf_flow_table_offload_exit(void)
{
- struct flow_offload_work *offload, *next;
- LIST_HEAD(offload_pending_list);
-
flow_indr_del_block_cb(&block_ing_entry);
-
- cancel_work_sync(&nf_flow_offload_work);
-
- list_for_each_entry_safe(offload, next, &offload_pending_list, list) {
- list_del(&offload->list);
- kfree(offload);
- }
+ destroy_workqueue(nf_flow_offload_wq);
}
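
Stripped of the diff noise, the new offload scheduling is the standard one-work-item-per-object workqueue pattern; a condensed sketch using the names from this patch:

	/* Each request embeds its own work item ... */
	INIT_WORK(&offload->work, flow_offload_work_handler);
	queue_work(nf_flow_offload_wq, &offload->work);

	/* ... and the handler recovers and frees exactly one request. */
	static void flow_offload_work_handler(struct work_struct *work)
	{
		struct flow_offload_work *offload =
			container_of(work, struct flow_offload_work, work);

		/* dispatch on offload->cmd, then: */
		kfree(offload);
	}

With WQ_UNBOUND, independent requests can now run concurrently, and nf_flow_table_offload_flush() only needs flush_workqueue() instead of draining a hand-rolled pending list under a spinlock.
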
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index f8f52ff99cfb..bbd1209694b8 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -46,25 +46,7 @@ void nf_unregister_queue_handler(struct net *net)
}
EXPORT_SYMBOL(nf_unregister_queue_handler);
-static void nf_queue_entry_release_br_nf_refs(struct sk_buff *skb)
-{
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
-
- if (nf_bridge) {
- struct net_device *physdev;
-
- physdev = nf_bridge_get_physindev(skb);
- if (physdev)
- dev_put(physdev);
- physdev = nf_bridge_get_physoutdev(skb);
- if (physdev)
- dev_put(physdev);
- }
-#endif
-}
-
-void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
+static void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
{
struct nf_hook_state *state = &entry->state;
@@ -76,24 +58,34 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
if (state->sk)
sock_put(state->sk);
- nf_queue_entry_release_br_nf_refs(entry->skb);
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (entry->physin)
+ dev_put(entry->physin);
+ if (entry->physout)
+ dev_put(entry->physout);
+#endif
+}
+
+void nf_queue_entry_free(struct nf_queue_entry *entry)
+{
+ nf_queue_entry_release_refs(entry);
+ kfree(entry);
}
-EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs);
+EXPORT_SYMBOL_GPL(nf_queue_entry_free);
-static void nf_queue_entry_get_br_nf_refs(struct sk_buff *skb)
+static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry)
{
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ const struct sk_buff *skb = entry->skb;
+ struct nf_bridge_info *nf_bridge;
+ nf_bridge = nf_bridge_info_get(skb);
if (nf_bridge) {
- struct net_device *physdev;
-
- physdev = nf_bridge_get_physindev(skb);
- if (physdev)
- dev_hold(physdev);
- physdev = nf_bridge_get_physoutdev(skb);
- if (physdev)
- dev_hold(physdev);
+ entry->physin = nf_bridge_get_physindev(skb);
+ entry->physout = nf_bridge_get_physoutdev(skb);
+ } else {
+ entry->physin = NULL;
+ entry->physout = NULL;
}
#endif
}
@@ -110,7 +102,12 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
if (state->sk)
sock_hold(state->sk);
- nf_queue_entry_get_br_nf_refs(entry->skb);
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ if (entry->physin)
+ dev_hold(entry->physin);
+ if (entry->physout)
+ dev_hold(entry->physout);
+#endif
}
EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
@@ -158,18 +155,16 @@ static void nf_ip6_saveroute(const struct sk_buff *skb,
static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
unsigned int index, unsigned int queuenum)
{
- int status = -ENOENT;
struct nf_queue_entry *entry = NULL;
const struct nf_queue_handler *qh;
struct net *net = state->net;
unsigned int route_key_size;
+ int status;
/* QUEUE == DROP if no one is waiting, to be safe. */
qh = rcu_dereference(net->nf.queue_handler);
- if (!qh) {
- status = -ESRCH;
- goto err;
- }
+ if (!qh)
+ return -ESRCH;
switch (state->pf) {
case AF_INET:
@@ -184,14 +179,12 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
}
entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC);
- if (!entry) {
- status = -ENOMEM;
- goto err;
- }
+ if (!entry)
+ return -ENOMEM;
if (skb_dst(skb) && !skb_dst_force(skb)) {
- status = -ENETDOWN;
- goto err;
+ kfree(entry);
+ return -ENETDOWN;
}
*entry = (struct nf_queue_entry) {
@@ -201,6 +194,8 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
.size = sizeof(*entry) + route_key_size,
};
+ __nf_queue_entry_init_physdevs(entry);
+
nf_queue_entry_get_refs(entry);
switch (entry->state.pf) {
@@ -213,17 +208,12 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
}
status = qh->outfn(entry, queuenum);
-
if (status < 0) {
- nf_queue_entry_release_refs(entry);
- goto err;
+ nf_queue_entry_free(entry);
+ return status;
}
return 0;
-
-err:
- kfree(entry);
- return status;
}
/* Packets leaving via this function must come back through nf_reinject(). */
@@ -304,12 +294,10 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
hooks = nf_hook_entries_head(net, pf, entry->state.hook);
- nf_queue_entry_release_refs(entry);
-
i = entry->hook_index;
if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) {
kfree_skb(skb);
- kfree(entry);
+ nf_queue_entry_free(entry);
return;
}
@@ -348,6 +336,6 @@ next_hook:
kfree_skb(skb);
}
- kfree(entry);
+ nf_queue_entry_free(entry);
}
EXPORT_SYMBOL(nf_reinject);
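
Taken together, the nf_queue.c changes give a queued entry a symmetric life cycle, condensed below; caching physin/physout in the entry (rather than re-deriving them from the skb at release time) appears intended to decouple the release path from the skb's bridge state:

	entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC);
	/* ... fill *entry ... */
	__nf_queue_entry_init_physdevs(entry);	/* cache physin/physout */
	nf_queue_entry_get_refs(entry);		/* hold indev/outdev/sk/physdevs */

	/* on queueing failure and on reinject completion alike: */
	nf_queue_entry_free(entry);		/* drop all refs + kfree in one place */
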
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index f92fb6003745..4471393da6d8 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -520,7 +520,8 @@ static struct nft_table *nft_table_lookup(const struct net *net,
if (nla == NULL)
return ERR_PTR(-EINVAL);
- list_for_each_entry_rcu(table, &net->nft.tables, list) {
+ list_for_each_entry_rcu(table, &net->nft.tables, list,
+ lockdep_is_held(&net->nft.commit_mutex)) {
if (!nla_strcmp(nla, table->name) &&
table->family == family &&
nft_active_genmask(table, genmask))
@@ -2557,6 +2558,24 @@ err1:
return ERR_PTR(err);
}
+int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src)
+{
+ int err;
+
+ if (src->ops->clone) {
+ dst->ops = src->ops;
+ err = src->ops->clone(dst, src);
+ if (err < 0)
+ return err;
+ } else {
+ memcpy(dst, src, src->ops->size);
+ }
+
+ __module_get(src->ops->type->owner);
+
+ return 0;
+}
+
void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
{
nf_tables_expr_destroy(ctx, expr);
@@ -3272,7 +3291,7 @@ static const struct nft_set_type *nft_set_types[] = {
&nft_set_rhash_type,
&nft_set_bitmap_type,
&nft_set_rbtree_type,
-#if defined(CONFIG_X86_64) && defined(CONFIG_AS_AVX2)
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
&nft_set_pipapo_avx2_type,
#endif
&nft_set_pipapo_type,
@@ -3376,6 +3395,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
.len = NFT_USERDATA_MAXLEN },
[NFTA_SET_OBJ_TYPE] = { .type = NLA_U32 },
[NFTA_SET_HANDLE] = { .type = NLA_U64 },
+ [NFTA_SET_EXPR] = { .type = NLA_NESTED },
};
static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
@@ -3579,8 +3599,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
{
struct nfgenmsg *nfmsg;
struct nlmsghdr *nlh;
- struct nlattr *desc;
u32 portid = ctx->portid;
+ struct nlattr *nest;
u32 seq = ctx->seq;
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
@@ -3636,9 +3656,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
if (nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata))
goto nla_put_failure;
- desc = nla_nest_start_noflag(skb, NFTA_SET_DESC);
-
- if (desc == NULL)
+ nest = nla_nest_start_noflag(skb, NFTA_SET_DESC);
+ if (!nest)
goto nla_put_failure;
if (set->size &&
nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size)))
@@ -3648,7 +3667,15 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
nf_tables_fill_set_concat(skb, set))
goto nla_put_failure;
- nla_nest_end(skb, desc);
+ nla_nest_end(skb, nest);
+
+ if (set->expr) {
+ nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR);
+ if (nf_tables_fill_expr_info(skb, set->expr) < 0)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ }
nlmsg_end(skb, nlh);
return 0;
@@ -3895,6 +3922,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
u8 genmask = nft_genmask_next(net);
int family = nfmsg->nfgen_family;
const struct nft_set_ops *ops;
+ struct nft_expr *expr = NULL;
struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
@@ -4004,6 +4032,9 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
return err;
}
+ if (nla[NFTA_SET_EXPR])
+ desc.expr = true;
+
table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
@@ -4051,13 +4082,21 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL);
if (!name) {
err = -ENOMEM;
- goto err2;
+ goto err_set_name;
}
err = nf_tables_set_alloc_name(&ctx, set, name);
kfree(name);
if (err < 0)
- goto err2;
+ goto err_set_alloc_name;
+
+ if (nla[NFTA_SET_EXPR]) {
+ expr = nft_set_elem_expr_alloc(&ctx, set, nla[NFTA_SET_EXPR]);
+ if (IS_ERR(expr)) {
+ err = PTR_ERR(expr);
+ goto err_set_alloc_name;
+ }
+ }
udata = NULL;
if (udlen) {
@@ -4074,6 +4113,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
set->dtype = dtype;
set->objtype = objtype;
set->dlen = desc.dlen;
+ set->expr = expr;
set->flags = flags;
set->size = desc.size;
set->policy = policy;
@@ -4089,30 +4129,36 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
err = ops->init(set, &desc, nla);
if (err < 0)
- goto err3;
+ goto err_set_init;
err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set);
if (err < 0)
- goto err4;
+ goto err_set_trans;
list_add_tail_rcu(&set->list, &table->sets);
table->use++;
return 0;
-err4:
+err_set_trans:
ops->destroy(set);
-err3:
+err_set_init:
+ if (expr)
+ nft_expr_destroy(&ctx, expr);
+err_set_alloc_name:
kfree(set->name);
-err2:
+err_set_name:
kvfree(set);
return err;
}
-static void nft_set_destroy(struct nft_set *set)
+static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
{
if (WARN_ON(set->use > 0))
return;
+ if (set->expr)
+ nft_expr_destroy(ctx, set->expr);
+
set->ops->destroy(set);
kfree(set->name);
kvfree(set);
@@ -4253,7 +4299,7 @@ EXPORT_SYMBOL_GPL(nf_tables_deactivate_set);
void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set)
{
if (list_empty(&set->bindings) && nft_set_is_anonymous(set))
- nft_set_destroy(set);
+ nft_set_destroy(ctx, set);
}
EXPORT_SYMBOL_GPL(nf_tables_destroy_set);
@@ -4840,6 +4886,17 @@ void *nft_set_elem_init(const struct nft_set *set,
return elem;
}
+static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
+ struct nft_expr *expr)
+{
+ if (expr->ops->destroy_clone) {
+ expr->ops->destroy_clone(ctx, expr);
+ module_put(expr->ops->type->owner);
+ } else {
+ nf_tables_expr_destroy(ctx, expr);
+ }
+}
+
void nft_set_elem_destroy(const struct nft_set *set, void *elem,
bool destroy_expr)
{
@@ -4852,16 +4909,9 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
nft_data_release(nft_set_ext_data(ext), set->dtype);
- if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) {
- struct nft_expr *expr = nft_set_ext_expr(ext);
+ if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
+ nft_set_elem_expr_destroy(&ctx, nft_set_ext_expr(ext));
- if (expr->ops->destroy_clone) {
- expr->ops->destroy_clone(&ctx, expr);
- module_put(expr->ops->type->owner);
- } else {
- nf_tables_expr_destroy(&ctx, expr);
- }
- }
if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
(*nft_set_ext_obj(ext))->use--;
kfree(elem);
@@ -4877,7 +4927,8 @@ static void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
struct nft_set_ext *ext = nft_set_elem_ext(set, elem);
if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR))
- nf_tables_expr_destroy(ctx, nft_set_ext_expr(ext));
+ nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext));
+
kfree(elem);
}
@@ -4964,6 +5015,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
nla[NFTA_SET_ELEM_EXPR]);
if (IS_ERR(expr))
return PTR_ERR(expr);
+
+ err = -EOPNOTSUPP;
+ if (set->expr && set->expr->ops != expr->ops)
+ goto err_set_elem_expr;
+ } else if (set->expr) {
+ expr = kzalloc(set->expr->ops->size, GFP_KERNEL);
+ if (!expr)
+ return -ENOMEM;
+
+ err = nft_expr_clone(expr, set->expr);
+ if (err < 0)
+ goto err_set_elem_expr;
}
err = nft_setelem_parse_key(ctx, set, &elem.key.val,
@@ -5079,6 +5142,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (expr) {
memcpy(nft_set_ext_expr(ext), expr, expr->ops->size);
kfree(expr);
+ expr = NULL;
}
trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
@@ -5106,6 +5170,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
err = -EBUSY;
else if (!(nlmsg_flags & NLM_F_EXCL))
err = 0;
+ } else if (err == -ENOTEMPTY) {
+ /* ENOTEMPTY reports an overlap between this element
+ * and an existing one.
+ */
+ err = -EEXIST;
}
goto err_element_clash;
}
@@ -5127,7 +5196,8 @@ err_element_clash:
err_trans:
if (obj)
obj->use--;
- kfree(elem.priv);
+
+ nf_tables_set_elem_destroy(ctx, set, elem.priv);
err_parse_data:
if (nla[NFTA_SET_ELEM_DATA] != NULL)
nft_data_release(&data, desc.type);
@@ -6314,7 +6384,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
if (nla[NFTA_FLOWTABLE_FLAGS]) {
flowtable->data.flags =
ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
- if (flowtable->data.flags & ~NF_FLOWTABLE_HW_OFFLOAD)
+ if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK)
goto err3;
}
@@ -7002,7 +7072,7 @@ static void nft_commit_release(struct nft_trans *trans)
nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
break;
case NFT_MSG_DELSET:
- nft_set_destroy(nft_trans_set(trans));
+ nft_set_destroy(&trans->ctx, nft_trans_set(trans));
break;
case NFT_MSG_DELSETELEM:
nf_tables_set_elem_destroy(&trans->ctx,
@@ -7433,7 +7503,7 @@ static void nf_tables_abort_release(struct nft_trans *trans)
nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans));
break;
case NFT_MSG_NEWSET:
- nft_set_destroy(nft_trans_set(trans));
+ nft_set_destroy(&trans->ctx, nft_trans_set(trans));
break;
case NFT_MSG_NEWSETELEM:
nft_set_elem_destroy(nft_trans_elem_set(trans),
@@ -8159,7 +8229,7 @@ static void __nft_release_tables(struct net *net)
list_for_each_entry_safe(set, ns, &table->sets, list) {
list_del(&set->list);
table->use--;
- nft_set_destroy(set);
+ nft_set_destroy(&ctx, set);
}
list_for_each_entry_safe(obj, ne, &table->objects, list) {
nft_obj_del(obj);
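
The newly exported nft_expr_clone() is what lets every added element carry its own private copy of a set-level stateful expression; the element path in nft_add_set_elem() above reduces to this pattern (as done when the element has no NFTA_SET_ELEM_EXPR of its own):

	expr = kzalloc(set->expr->ops->size, GFP_KERNEL);
	if (!expr)
		return -ENOMEM;

	err = nft_expr_clone(expr, set->expr);
	if (err < 0)
		goto err_set_elem_expr;	/* error path releases the clone */

Expressions with a ->clone op (e.g. counters, which keep per-cpu state) get deep-copied; everything else is a plain memcpy() plus a module reference.
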
diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c
index 2bb28483af22..954bccb7f32a 100644
--- a/net/netfilter/nf_tables_offload.c
+++ b/net/netfilter/nf_tables_offload.c
@@ -313,7 +313,7 @@ static int nft_indr_block_offload_cmd(struct nft_base_chain *chain,
nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack);
- flow_indr_block_call(dev, &bo, cmd);
+ flow_indr_block_call(dev, &bo, cmd, TC_SETUP_BLOCK);
if (list_empty(&bo.cb_list))
return -EOPNOTSUPP;
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 76535fd9278c..3243a31f6e82 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -737,12 +737,6 @@ static void nf_bridge_adjust_segmented_data(struct sk_buff *skb)
#define nf_bridge_adjust_segmented_data(s) do {} while (0)
#endif
-static void free_entry(struct nf_queue_entry *entry)
-{
- nf_queue_entry_release_refs(entry);
- kfree(entry);
-}
-
static int
__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
struct sk_buff *skb, struct nf_queue_entry *entry)
@@ -768,7 +762,7 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
entry_seg->skb = skb;
ret = __nfqnl_enqueue_packet(net, queue, entry_seg);
if (ret)
- free_entry(entry_seg);
+ nf_queue_entry_free(entry_seg);
}
return ret;
}
@@ -827,7 +821,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
if (queued) {
if (err) /* some segments are already queued */
- free_entry(entry);
+ nf_queue_entry_free(entry);
kfree_skb(skb);
return 0;
}
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 46ab28ec4b53..64ca13a1885b 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -24,23 +24,6 @@ struct nft_dynset {
struct nft_set_binding binding;
};
-static int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src)
-{
- int err;
-
- if (src->ops->clone) {
- dst->ops = src->ops;
- err = src->ops->clone(dst, src);
- if (err < 0)
- return err;
- } else {
- memcpy(dst, src, src->ops->size);
- }
-
- __module_get(src->ops->type->owner);
- return 0;
-}
-
static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
struct nft_regs *regs)
{
@@ -204,6 +187,11 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
tb[NFTA_DYNSET_EXPR]);
if (IS_ERR(priv->expr))
return PTR_ERR(priv->expr);
+
+ if (set->expr && set->expr->ops != priv->expr->ops) {
+ err = -EOPNOTSUPP;
+ goto err_expr_free;
+ }
}
nft_set_ext_prepare(&priv->tmpl);
@@ -222,7 +210,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
err = nf_tables_bind_set(ctx, set, &priv->binding);
if (err < 0)
- goto err1;
+ goto err_expr_free;
if (set->size == 0)
set->size = 0xffff;
@@ -230,7 +218,7 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
priv->set = set;
return 0;
-err1:
+err_expr_free:
if (priv->expr != NULL)
nft_expr_destroy(ctx, priv->expr);
return err;
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index a5e8469859e3..07782836fad6 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -228,7 +228,6 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
unsigned int i, optl, tcphdr_len, offset;
struct tcphdr *tcph;
u8 *opt;
- u32 src;
tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len);
if (!tcph)
@@ -237,7 +236,6 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
opt = (u8 *)tcph;
for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) {
union {
- u8 octet;
__be16 v16;
__be32 v32;
} old, new;
@@ -259,13 +257,13 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
if (!tcph)
return;
- src = regs->data[priv->sreg];
offset = i + priv->offset;
switch (priv->len) {
case 2:
old.v16 = get_unaligned((u16 *)(opt + offset));
- new.v16 = src;
+ new.v16 = (__force __be16)nft_reg_load16(
+ &regs->data[priv->sreg]);
switch (priv->type) {
case TCPOPT_MSS:
@@ -283,7 +281,7 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr,
old.v16, new.v16, false);
break;
case 4:
- new.v32 = src;
+ new.v32 = regs->data[priv->sreg];
old.v32 = get_unaligned((u32 *)(opt + offset));
if (old.v32 == new.v32)
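
The two-byte case needs nft_reg_load16() because `new.v16 = src` truncates a 32-bit register to its numerically low 16 bits, which are the *first* two register bytes only on little-endian machines. A self-contained userspace illustration (plain C, no kernel headers):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* A 4-byte nft register whose first two bytes hold the
		 * 16-bit MSS value 0x05b4 (1460), as a 2-byte load writes it. */
		uint8_t raw[4] = { 0x05, 0xb4, 0x00, 0x00 };
		uint32_t reg;
		uint16_t truncated, first_two;

		memcpy(&reg, raw, sizeof(reg));
		truncated = (uint16_t)reg;	/* old code: endian-dependent */
		memcpy(&first_two, &reg, 2);	/* nft_reg_load16(): bytes 0-1 */

		printf("truncated=%#x first_two=%#x\n", truncated, first_two);
		return 0;
	}

On little-endian both values come out as 0xb405; on big-endian the truncated value degenerates to 0, while the explicit two-byte load still yields 0x05b4.
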
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index aba11c2333f3..3087e23297db 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -28,6 +28,9 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr,
struct nft_fwd_netdev *priv = nft_expr_priv(expr);
int oif = regs->data[priv->sreg_dev];
+ /* This is used by ifb only. */
+ skb_set_redirected(pkt->skb, true);
+
nf_fwd_netdev_egress(pkt, oif);
regs->verdict.code = NF_STOLEN;
}
@@ -190,6 +193,13 @@ nla_put_failure:
return -1;
}
+static int nft_fwd_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+{
+ return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS));
+}
+
static struct nft_expr_type nft_fwd_netdev_type;
static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = {
.type = &nft_fwd_netdev_type,
@@ -197,6 +207,7 @@ static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = {
.eval = nft_fwd_neigh_eval,
.init = nft_fwd_neigh_init,
.dump = nft_fwd_neigh_dump,
+ .validate = nft_fwd_validate,
};
static const struct nft_expr_ops nft_fwd_netdev_ops = {
@@ -205,6 +216,7 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = {
.eval = nft_fwd_netdev_eval,
.init = nft_fwd_netdev_init,
.dump = nft_fwd_netdev_dump,
+ .validate = nft_fwd_validate,
.offload = nft_fwd_netdev_offload,
};
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 1cb2e67e6e03..32f0fc8be3a4 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -81,6 +81,7 @@ static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
u32 idx, off;
nft_bitmap_location(set, key, &idx, &off);
+ *ext = NULL;
return nft_bitmap_active(priv->bitmap, idx, off, genmask);
}
@@ -285,6 +286,8 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
/* Make sure we don't get bitmaps larger than 16 Kbytes. */
if (desc->klen > 2)
return false;
+ else if (desc->expr)
+ return false;
est->size = nft_bitmap_total_size(desc->klen);
est->lookup = NFT_SET_CLASS_O_1;
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index c1afb6c94edc..8b5acc6910fd 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -1164,21 +1164,41 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
struct nft_pipapo_field *f;
int i, bsize_max, err = 0;
+ if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
+ end = (const u8 *)nft_set_ext_key_end(ext)->data;
+ else
+ end = start;
+
dup = pipapo_get(net, set, start, genmask);
- if (PTR_ERR(dup) == -ENOENT) {
- if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) {
- end = (const u8 *)nft_set_ext_key_end(ext)->data;
- dup = pipapo_get(net, set, end, nft_genmask_next(net));
- } else {
- end = start;
+ if (!IS_ERR(dup)) {
+ /* Check if we already have the same exact entry */
+ const struct nft_data *dup_key, *dup_end;
+
+ dup_key = nft_set_ext_key(&dup->ext);
+ if (nft_set_ext_exists(&dup->ext, NFT_SET_EXT_KEY_END))
+ dup_end = nft_set_ext_key_end(&dup->ext);
+ else
+ dup_end = dup_key;
+
+ if (!memcmp(start, dup_key->data, sizeof(*dup_key->data)) &&
+ !memcmp(end, dup_end->data, sizeof(*dup_end->data))) {
+ *ext2 = &dup->ext;
+ return -EEXIST;
}
+
+ return -ENOTEMPTY;
+ }
+
+ if (PTR_ERR(dup) == -ENOENT) {
+ /* Look for partially overlapping entries */
+ dup = pipapo_get(net, set, end, nft_genmask_next(net));
}
if (PTR_ERR(dup) != -ENOENT) {
if (IS_ERR(dup))
return PTR_ERR(dup);
*ext2 = &dup->ext;
- return -EEXIST;
+ return -ENOTEMPTY;
}
/* Validate */
@@ -2181,7 +2201,7 @@ const struct nft_set_type nft_set_pipapo_type = {
},
};
-#if defined(CONFIG_X86_64) && defined(CONFIG_AS_AVX2)
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
const struct nft_set_type nft_set_pipapo_avx2_type = {
.features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT |
NFT_SET_TIMEOUT,
diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h
index 396caf7bfca8..394bcb704db7 100644
--- a/net/netfilter/nft_set_pipapo_avx2.h
+++ b/net/netfilter/nft_set_pipapo_avx2.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _NFT_SET_PIPAPO_AVX2_H
-#ifdef CONFIG_AS_AVX2
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
#include <asm/fpu/xstate.h>
#define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE)
@@ -9,6 +9,6 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
const u32 *key, const struct nft_set_ext **ext);
bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
struct nft_set_estimate *est);
-#endif /* CONFIG_AS_AVX2 */
+#endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */
#endif /* _NFT_SET_PIPAPO_AVX2_H */
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 172ef8189f99..3a5552e14f75 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -33,6 +33,11 @@ static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
(*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
}
+static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
+{
+ return !nft_rbtree_interval_end(rbe);
+}
+
static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
const struct nft_rbtree_elem *interval)
{
@@ -64,7 +69,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
if (interval &&
nft_rbtree_equal(set, this, interval) &&
nft_rbtree_interval_end(rbe) &&
- !nft_rbtree_interval_end(interval))
+ nft_rbtree_interval_start(interval))
continue;
interval = rbe;
} else if (d > 0)
@@ -89,7 +94,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
nft_set_elem_active(&interval->ext, genmask) &&
- !nft_rbtree_interval_end(interval)) {
+ nft_rbtree_interval_start(interval)) {
*ext = &interval->ext;
return true;
}
@@ -208,8 +213,43 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
u8 genmask = nft_genmask_next(net);
struct nft_rbtree_elem *rbe;
struct rb_node *parent, **p;
+ bool overlap = false;
int d;
+ /* Detect overlaps as we descend the tree. Set the flag in these cases:
+ *
+ * a1. |__ _ _? >|__ _ _ (insert start after existing start)
+ * a2. _ _ __>| ?_ _ __| (insert end before existing end)
+ * a3. _ _ ___| ?_ _ _>| (insert end after existing end)
+ * a4. >|__ _ _ _ _ __| (insert start before existing end)
+ *
+ * and clear it later on, as we eventually reach the points indicated by
+ * '?' above, in the cases described below. We'll always meet these
+ * later, locally, due to tree ordering, and overlaps for the intervals
+ * that are the closest together are always evaluated last.
+ *
+ * b1. |__ _ _! >|__ _ _ (insert start after existing end)
+ * b2. _ _ __>| !_ _ __| (insert end before existing start)
+ * b3. !_____>| (insert end after existing start)
+ *
+ * Case a4. resolves to b1.:
+ * - if the inserted start element is the leftmost, because the '0'
+ * element in the tree serves as end element
+ * - otherwise, if an existing end is found. Note that end elements are
+ * always inserted after corresponding start elements.
+ *
+ * For a new, rightmost pair of elements, we'll hit cases b1. and b3.,
+ * in that order.
+ *
+ * The flag is also cleared in two special cases:
+ *
+ * b4. |__ _ _!|<_ _ _ (insert start right before existing end)
+ * b5. |__ _ >|!__ _ _ (insert end right after existing start)
+ *
+ * which always happen as last step and imply that no further
+ * overlapping is possible.
+ */
+
parent = NULL;
p = &priv->root.rb_node;
while (*p != NULL) {
@@ -218,17 +258,42 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
d = memcmp(nft_set_ext_key(&rbe->ext),
nft_set_ext_key(&new->ext),
set->klen);
- if (d < 0)
+ if (d < 0) {
p = &parent->rb_left;
- else if (d > 0)
+
+ if (nft_rbtree_interval_start(new)) {
+ overlap = nft_rbtree_interval_start(rbe) &&
+ nft_set_elem_active(&rbe->ext,
+ genmask);
+ } else {
+ overlap = nft_rbtree_interval_end(rbe) &&
+ nft_set_elem_active(&rbe->ext,
+ genmask);
+ }
+ } else if (d > 0) {
p = &parent->rb_right;
- else {
+
+ if (nft_rbtree_interval_end(new)) {
+ overlap = nft_rbtree_interval_end(rbe) &&
+ nft_set_elem_active(&rbe->ext,
+ genmask);
+ } else if (nft_rbtree_interval_end(rbe) &&
+ nft_set_elem_active(&rbe->ext, genmask)) {
+ overlap = true;
+ }
+ } else {
if (nft_rbtree_interval_end(rbe) &&
- !nft_rbtree_interval_end(new)) {
+ nft_rbtree_interval_start(new)) {
p = &parent->rb_left;
- } else if (!nft_rbtree_interval_end(rbe) &&
+
+ if (nft_set_elem_active(&rbe->ext, genmask))
+ overlap = false;
+ } else if (nft_rbtree_interval_start(rbe) &&
nft_rbtree_interval_end(new)) {
p = &parent->rb_right;
+
+ if (nft_set_elem_active(&rbe->ext, genmask))
+ overlap = false;
} else if (nft_set_elem_active(&rbe->ext, genmask)) {
*ext = &rbe->ext;
return -EEXIST;
@@ -237,6 +302,10 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
}
}
}
+
+ if (overlap)
+ return -ENOTEMPTY;
+
rb_link_node_rcu(&new->node, parent, p);
rb_insert_color(&new->node, &priv->root);
return 0;
@@ -317,10 +386,10 @@ static void *nft_rbtree_deactivate(const struct net *net,
parent = parent->rb_right;
else {
if (nft_rbtree_interval_end(rbe) &&
- !nft_rbtree_interval_end(this)) {
+ nft_rbtree_interval_start(this)) {
parent = parent->rb_left;
continue;
- } else if (!nft_rbtree_interval_end(rbe) &&
+ } else if (nft_rbtree_interval_start(rbe) &&
nft_rbtree_interval_end(this)) {
parent = parent->rb_right;
continue;
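
To make the new overlap detection concrete, a worked example with hypothetical keys, all elements active in the next generation:

	/*
	 * Existing set: interval [10, 20], stored as start element 10
	 * and end element 20.
	 *
	 * Insert [15, 25]: descending with the new start 15, the walk
	 * meets start 10 (case a1) and end 20 (case a4); both set the
	 * overlap flag and no b-case clears it before the leaf is
	 * reached, so __nft_rbtree_insert() returns -ENOTEMPTY, which
	 * nft_add_set_elem() reports to userspace as -EEXIST.
	 *
	 * Insert the disjoint [30, 40] instead: the new start 30
	 * finally meets end 20 (nearest-key elements are evaluated
	 * last), i.e. case b1 "insert start after existing end", which
	 * clears the flag and lets the insertion succeed.
	 */
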
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index ed77c75bf63f..5ded01ca8b20 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2392,19 +2392,14 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
if (nlk_has_extack && extack && extack->_msg)
tlvlen += nla_total_size(strlen(extack->_msg) + 1);
- if (err) {
- if (!(nlk->flags & NETLINK_F_CAP_ACK))
- payload += nlmsg_len(nlh);
- else
- flags |= NLM_F_CAPPED;
- if (nlk_has_extack && extack && extack->bad_attr)
- tlvlen += nla_total_size(sizeof(u32));
- } else {
+ if (err && !(nlk->flags & NETLINK_F_CAP_ACK))
+ payload += nlmsg_len(nlh);
+ else
flags |= NLM_F_CAPPED;
-
- if (nlk_has_extack && extack && extack->cookie_len)
- tlvlen += nla_total_size(extack->cookie_len);
- }
+ if (err && nlk_has_extack && extack && extack->bad_attr)
+ tlvlen += nla_total_size(sizeof(u32));
+ if (nlk_has_extack && extack && extack->cookie_len)
+ tlvlen += nla_total_size(extack->cookie_len);
if (tlvlen)
flags |= NLM_F_ACK_TLVS;
@@ -2427,20 +2422,16 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG,
extack->_msg));
}
- if (err) {
- if (extack->bad_attr &&
- !WARN_ON((u8 *)extack->bad_attr < in_skb->data ||
- (u8 *)extack->bad_attr >= in_skb->data +
- in_skb->len))
- WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS,
- (u8 *)extack->bad_attr -
- (u8 *)nlh));
- } else {
- if (extack->cookie_len)
- WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE,
- extack->cookie_len,
- extack->cookie));
- }
+ if (err && extack->bad_attr &&
+ !WARN_ON((u8 *)extack->bad_attr < in_skb->data ||
+ (u8 *)extack->bad_attr >= in_skb->data +
+ in_skb->len))
+ WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS,
+ (u8 *)extack->bad_attr -
+ (u8 *)nlh));
+ if (extack->cookie_len)
+ WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE,
+ extack->cookie_len, extack->cookie));
}
nlmsg_end(skb, rep);
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 07a7dd185995..d8ae541d22a8 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -305,7 +305,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
struct sk_buff *segs, *nskb;
int err;
- BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET);
+ BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_GSO_CB_OFFSET);
segs = __skb_gso_segment(skb, NETIF_F_SG, false);
if (IS_ERR(segs))
return PTR_ERR(segs);
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index fd8a01ca7a2d..2398d7238300 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -462,12 +462,14 @@ static void flow_table_copy_flows(struct table_instance *old,
struct hlist_head *head = &old->buckets[i];
if (ufid)
- hlist_for_each_entry(flow, head,
- ufid_table.node[old_ver])
+ hlist_for_each_entry_rcu(flow, head,
+ ufid_table.node[old_ver],
+ lockdep_ovsl_is_held())
ufid_table_instance_insert(new, flow);
else
- hlist_for_each_entry(flow, head,
- flow_table.node[old_ver])
+ hlist_for_each_entry_rcu(flow, head,
+ flow_table.node[old_ver],
+ lockdep_ovsl_is_held())
table_instance_insert(new, flow);
}
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index e5b0986215d2..29bd405adbbd 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2173,6 +2173,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
struct timespec64 ts;
__u32 ts_status;
bool is_drop_n_account = false;
+ unsigned int slot_id = 0;
bool do_vnet = false;
/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
@@ -2275,6 +2276,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
if (!h.raw)
goto drop_n_account;
+ if (po->tp_version <= TPACKET_V2) {
+ slot_id = po->rx_ring.head;
+ if (test_bit(slot_id, po->rx_ring.rx_owner_map))
+ goto drop_n_account;
+ __set_bit(slot_id, po->rx_ring.rx_owner_map);
+ }
+
if (do_vnet &&
virtio_net_hdr_from_skb(skb, h.raw + macoff -
sizeof(struct virtio_net_hdr),
@@ -2380,7 +2388,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
#endif
if (po->tp_version <= TPACKET_V2) {
+ spin_lock(&sk->sk_receive_queue.lock);
__packet_set_status(po, h.raw, status);
+ __clear_bit(slot_id, po->rx_ring.rx_owner_map);
+ spin_unlock(&sk->sk_receive_queue.lock);
sk->sk_data_ready(sk);
} else {
prb_clear_blk_fill_status(&po->rx_ring);
@@ -4277,6 +4288,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
{
struct pgv *pg_vec = NULL;
struct packet_sock *po = pkt_sk(sk);
+ unsigned long *rx_owner_map = NULL;
int was_running, order = 0;
struct packet_ring_buffer *rb;
struct sk_buff_head *rb_queue;
@@ -4362,6 +4374,12 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
}
break;
default:
+ if (!tx_ring) {
+ rx_owner_map = bitmap_alloc(req->tp_frame_nr,
+ GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
+ if (!rx_owner_map)
+ goto out_free_pg_vec;
+ }
break;
}
}
@@ -4391,6 +4409,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
err = 0;
spin_lock_bh(&rb_queue->lock);
swap(rb->pg_vec, pg_vec);
+ if (po->tp_version <= TPACKET_V2)
+ swap(rb->rx_owner_map, rx_owner_map);
rb->frame_max = (req->tp_frame_nr - 1);
rb->head = 0;
rb->frame_size = req->tp_frame_size;
@@ -4422,6 +4442,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
}
out_free_pg_vec:
+ bitmap_free(rx_owner_map);
if (pg_vec)
free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 82fb2b10f790..907f4cd2a718 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -70,7 +70,10 @@ struct packet_ring_buffer {
unsigned int __percpu *pending_refcnt;
- struct tpacket_kbdq_core prb_bdqc;
+ union {
+ unsigned long *rx_owner_map;
+ struct tpacket_kbdq_core prb_bdqc;
+ };
};
extern struct mutex fanout_mutex;
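
Pulling the scattered af_packet.c hunks together: rx_owner_map is a claim/release bitmap that closes the window in which userspace hands a TPACKET_V1/V2 slot back while the kernel is still writing into it. The life cycle of one slot, condensed from the hunks above:

	/* claim: the producer skips slots it has not finished yet */
	slot_id = po->rx_ring.head;
	if (test_bit(slot_id, po->rx_ring.rx_owner_map))
		goto drop_n_account;
	__set_bit(slot_id, po->rx_ring.rx_owner_map);

	/* ... copy the frame into the slot ... */

	/* release: only after the status is published to userspace */
	spin_lock(&sk->sk_receive_queue.lock);
	__packet_set_status(po, h.raw, status);
	__clear_bit(slot_id, po->rx_ring.rx_owner_map);
	spin_unlock(&sk->sk_receive_queue.lock);

The union with tpacket_kbdq_core in packet/internal.h works because TPACKET_V3 uses the block-descriptor machinery and never touches rx_owner_map.
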
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index fe42f986cd94..15ee92d79581 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -285,7 +285,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
gfp_t gfp,
rxrpc_notify_rx_t notify_rx,
bool upgrade,
- bool intr,
+ enum rxrpc_interruptibility interruptibility,
unsigned int debug_id)
{
struct rxrpc_conn_parameters cp;
@@ -310,7 +310,7 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
memset(&p, 0, sizeof(p));
p.user_call_ID = user_call_ID;
p.tx_total_len = tx_total_len;
- p.intr = intr;
+ p.interruptibility = interruptibility;
memset(&cp, 0, sizeof(cp));
cp.local = rx->local;
@@ -371,45 +371,18 @@ EXPORT_SYMBOL(rxrpc_kernel_end_call);
* rxrpc_kernel_check_life - Check to see whether a call is still alive
* @sock: The socket the call is on
* @call: The call to check
- * @_life: Where to store the life value
*
- * Allow a kernel service to find out whether a call is still alive - ie. we're
- * getting ACKs from the server. Passes back in *_life a number representing
- * the life state which can be compared to that returned by a previous call and
- * return true if the call is still alive.
- *
- * If the life state stalls, rxrpc_kernel_probe_life() should be called and
- * then 2RTT waited.
+ * Allow a kernel service to find out whether a call is still alive -
+ * ie. whether it has not yet completed.
*/
bool rxrpc_kernel_check_life(const struct socket *sock,
- const struct rxrpc_call *call,
- u32 *_life)
+ const struct rxrpc_call *call)
{
- *_life = call->acks_latest;
return call->state != RXRPC_CALL_COMPLETE;
}
EXPORT_SYMBOL(rxrpc_kernel_check_life);
/**
- * rxrpc_kernel_probe_life - Poke the peer to see if it's still alive
- * @sock: The socket the call is on
- * @call: The call to check
- *
- * In conjunction with rxrpc_kernel_check_life(), allow a kernel service to
- * find out whether a call is still alive by pinging it. This should cause the
- * life state to be bumped in about 2*RTT.
- *
- * The must be called in TASK_RUNNING state on pain of might_sleep() objecting.
- */
-void rxrpc_kernel_probe_life(struct socket *sock, struct rxrpc_call *call)
-{
- rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, false,
- rxrpc_propose_ack_ping_for_check_life);
- rxrpc_send_ack_packet(call, true, NULL);
-}
-EXPORT_SYMBOL(rxrpc_kernel_probe_life);
-
-/**
* rxrpc_kernel_get_epoch - Retrieve the epoch value from a call.
* @sock: The socket the call is on
* @call: The call to query
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 7d730c438404..3eb1ab40ca5c 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -489,7 +489,6 @@ enum rxrpc_call_flag {
RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */
RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */
RXRPC_CALL_RX_UNDERRUN, /* Got data underrun */
- RXRPC_CALL_IS_INTR, /* The call is interruptible */
RXRPC_CALL_DISCONNECTED, /* The call has been disconnected */
};
@@ -598,6 +597,7 @@ struct rxrpc_call {
atomic_t usage;
u16 service_id; /* service ID */
u8 security_ix; /* Security type */
+ enum rxrpc_interruptibility interruptibility; /* At what point call may be interrupted */
u32 call_id; /* call ID on connection */
u32 cid; /* connection ID plus channel index */
int debug_id; /* debug ID for printks */
@@ -675,7 +675,6 @@ struct rxrpc_call {
/* transmission-phase ACK management */
ktime_t acks_latest_ts; /* Timestamp of latest ACK received */
- rxrpc_serial_t acks_latest; /* serial number of latest ACK received */
rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */
rxrpc_seq_t acks_lost_top; /* tx_top at the time lost-ack ping sent */
rxrpc_serial_t acks_lost_ping; /* Serial number of probe ACK */
@@ -721,7 +720,7 @@ struct rxrpc_call_params {
u32 normal; /* Max time since last call packet (msec) */
} timeouts;
u8 nr_timeouts; /* Number of timeouts specified */
- bool intr; /* The call is interruptible */
+ enum rxrpc_interruptibility interruptibility; /* How interruptible is the call? */
};
struct rxrpc_send_params {
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index c9f34b0a11df..f07970207b54 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -237,8 +237,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
return call;
}
- if (p->intr)
- __set_bit(RXRPC_CALL_IS_INTR, &call->flags);
+ call->interruptibility = p->interruptibility;
call->tx_total_len = p->tx_total_len;
trace_rxrpc_call(call->debug_id, rxrpc_call_new_client,
atomic_read(&call->usage),
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index ea7d4c21f889..f2a1a5dbb5a7 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -655,13 +655,20 @@ static int rxrpc_wait_for_channel(struct rxrpc_call *call, gfp_t gfp)
add_wait_queue_exclusive(&call->waitq, &myself);
for (;;) {
- if (test_bit(RXRPC_CALL_IS_INTR, &call->flags))
+ switch (call->interruptibility) {
+ case RXRPC_INTERRUPTIBLE:
+ case RXRPC_PREINTERRUPTIBLE:
set_current_state(TASK_INTERRUPTIBLE);
- else
+ break;
+ case RXRPC_UNINTERRUPTIBLE:
+ default:
set_current_state(TASK_UNINTERRUPTIBLE);
+ break;
+ }
if (call->call_id)
break;
- if (test_bit(RXRPC_CALL_IS_INTR, &call->flags) &&
+ if ((call->interruptibility == RXRPC_INTERRUPTIBLE ||
+ call->interruptibility == RXRPC_PREINTERRUPTIBLE) &&
signal_pending(current)) {
ret = -ERESTARTSYS;
break;
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index ef10fbf71b15..69e09d69c896 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -882,7 +882,6 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
before(prev_pkt, call->ackr_prev_seq))
goto out;
call->acks_latest_ts = skb->tstamp;
- call->acks_latest = sp->hdr.serial;
call->ackr_first_seq = first_soft_ack;
call->ackr_prev_seq = prev_pkt;
diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c
index 6c3f35fac42d..0c98313dd7a8 100644
--- a/net/rxrpc/key.c
+++ b/net/rxrpc/key.c
@@ -31,7 +31,7 @@ static void rxrpc_free_preparse_s(struct key_preparsed_payload *);
static void rxrpc_destroy(struct key *);
static void rxrpc_destroy_s(struct key *);
static void rxrpc_describe(const struct key *, struct seq_file *);
-static long rxrpc_read(const struct key *, char __user *, size_t);
+static long rxrpc_read(const struct key *, char *, size_t);
/*
* rxrpc defined keys take an arbitrary string as the description and an
@@ -1042,12 +1042,12 @@ EXPORT_SYMBOL(rxrpc_get_null_key);
* - this returns the result in XDR form
*/
static long rxrpc_read(const struct key *key,
- char __user *buffer, size_t buflen)
+ char *buffer, size_t buflen)
{
const struct rxrpc_key_token *token;
const struct krb5_principal *princ;
size_t size;
- __be32 __user *xdr, *oldxdr;
+ __be32 *xdr, *oldxdr;
u32 cnlen, toksize, ntoks, tok, zero;
u16 toksizes[AFSTOKEN_MAX];
int loop;
@@ -1124,30 +1124,25 @@ static long rxrpc_read(const struct key *key,
if (!buffer || buflen < size)
return size;
- xdr = (__be32 __user *) buffer;
+ xdr = (__be32 *)buffer;
zero = 0;
#define ENCODE(x) \
do { \
- __be32 y = htonl(x); \
- if (put_user(y, xdr++) < 0) \
- goto fault; \
+ *xdr++ = htonl(x); \
} while(0)
#define ENCODE_DATA(l, s) \
do { \
u32 _l = (l); \
ENCODE(l); \
- if (copy_to_user(xdr, (s), _l) != 0) \
- goto fault; \
- if (_l & 3 && \
- copy_to_user((u8 __user *)xdr + _l, &zero, 4 - (_l & 3)) != 0) \
- goto fault; \
+ memcpy(xdr, (s), _l); \
+ if (_l & 3) \
+ memcpy((u8 *)xdr + _l, &zero, 4 - (_l & 3)); \
xdr += (_l + 3) >> 2; \
} while(0)
#define ENCODE64(x) \
do { \
__be64 y = cpu_to_be64(x); \
- if (copy_to_user(xdr, &y, 8) != 0) \
- goto fault; \
+ memcpy(xdr, &y, 8); \
xdr += 8 >> 2; \
} while(0)
#define ENCODE_STR(s) \
@@ -1238,8 +1233,4 @@ static long rxrpc_read(const struct key *key,
ASSERTCMP((char __user *) xdr - buffer, ==, size);
_leave(" = %zu", size);
return size;
-
-fault:
- _leave(" = -EFAULT");
- return -EFAULT;
}
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 813fd6888142..0fcf157aa09f 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -18,6 +18,21 @@
#include "ar-internal.h"
/*
+ * Return true if there's sufficient Tx queue space.
+ */
+static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win)
+{
+ unsigned int win_size =
+ min_t(unsigned int, call->tx_winsize,
+ call->cong_cwnd + call->cong_extra);
+ rxrpc_seq_t tx_win = READ_ONCE(call->tx_hard_ack);
+
+ if (_tx_win)
+ *_tx_win = tx_win;
+ return call->tx_top - tx_win < win_size;
+}
+
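A worked example of the window arithmetic above, with hypothetical values: tx_winsize = 64, cong_cwnd = 10 and cong_extra = 0 give win_size = 10, so once tx_top - tx_hard_ack reaches 10 the queue is full, rxrpc_check_tx_space() returns false, and the sender must wait for tx_hard_ack to advance.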
+/*
* Wait for space to appear in the Tx queue or a signal to occur.
*/
static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx,
@@ -26,9 +41,7 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx,
{
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- if (call->tx_top - call->tx_hard_ack <
- min_t(unsigned int, call->tx_winsize,
- call->cong_cwnd + call->cong_extra))
+ if (rxrpc_check_tx_space(call, NULL))
return 0;
if (call->state >= RXRPC_CALL_COMPLETE)
@@ -49,7 +62,7 @@ static int rxrpc_wait_for_tx_window_intr(struct rxrpc_sock *rx,
* Wait for space to appear in the Tx queue uninterruptibly, but with
* a timeout of 2*RTT if no progress was made and a signal occurred.
*/
-static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx,
+static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx,
struct rxrpc_call *call)
{
rxrpc_seq_t tx_start, tx_win;
@@ -58,8 +71,8 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx,
rtt = READ_ONCE(call->peer->rtt);
rtt2 = nsecs_to_jiffies64(rtt) * 2;
- if (rtt2 < 1)
- rtt2 = 1;
+ if (rtt2 < 2)
+ rtt2 = 2;
timeout = rtt2;
tx_start = READ_ONCE(call->tx_hard_ack);
@@ -68,16 +81,13 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx,
set_current_state(TASK_UNINTERRUPTIBLE);
tx_win = READ_ONCE(call->tx_hard_ack);
- if (call->tx_top - tx_win <
- min_t(unsigned int, call->tx_winsize,
- call->cong_cwnd + call->cong_extra))
+ if (rxrpc_check_tx_space(call, &tx_win))
return 0;
if (call->state >= RXRPC_CALL_COMPLETE)
return call->error;
- if (test_bit(RXRPC_CALL_IS_INTR, &call->flags) &&
- timeout == 0 &&
+ if (timeout == 0 &&
tx_win == tx_start && signal_pending(current))
return -EINTR;
@@ -92,6 +102,26 @@ static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx,
}
/*
+ * Wait for space to appear in the Tx queue uninterruptibly.
+ */
+static int rxrpc_wait_for_tx_window_nonintr(struct rxrpc_sock *rx,
+ struct rxrpc_call *call,
+ long *timeo)
+{
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (rxrpc_check_tx_space(call, NULL))
+ return 0;
+
+ if (call->state >= RXRPC_CALL_COMPLETE)
+ return call->error;
+
+ trace_rxrpc_transmit(call, rxrpc_transmit_wait);
+ *timeo = schedule_timeout(*timeo);
+ }
+}
+
+/*
* wait for space to appear in the transmit/ACK window
* - caller holds the socket locked
*/
@@ -108,10 +138,19 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
add_wait_queue(&call->waitq, &myself);
- if (waitall)
- ret = rxrpc_wait_for_tx_window_nonintr(rx, call);
- else
- ret = rxrpc_wait_for_tx_window_intr(rx, call, timeo);
+ switch (call->interruptibility) {
+ case RXRPC_INTERRUPTIBLE:
+ if (waitall)
+ ret = rxrpc_wait_for_tx_window_waitall(rx, call);
+ else
+ ret = rxrpc_wait_for_tx_window_intr(rx, call, timeo);
+ break;
+ case RXRPC_PREINTERRUPTIBLE:
+ case RXRPC_UNINTERRUPTIBLE:
+ default:
+ ret = rxrpc_wait_for_tx_window_nonintr(rx, call, timeo);
+ break;
+ }
remove_wait_queue(&call->waitq, &myself);
set_current_state(TASK_RUNNING);
@@ -302,9 +341,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
_debug("alloc");
- if (call->tx_top - call->tx_hard_ack >=
- min_t(unsigned int, call->tx_winsize,
- call->cong_cwnd + call->cong_extra)) {
+ if (!rxrpc_check_tx_space(call, NULL)) {
ret = -EAGAIN;
if (msg->msg_flags & MSG_DONTWAIT)
goto maybe_error;
@@ -619,7 +656,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len)
.call.tx_total_len = -1,
.call.user_call_ID = 0,
.call.nr_timeouts = 0,
- .call.intr = true,
+ .call.interruptibility = RXRPC_INTERRUPTIBLE,
.abort_code = 0,
.command = RXRPC_CMD_SEND_DATA,
.exclusive = false,
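Taken together, the three interruptibility modes used above behave as follows (a summary of this patch's dispatch logic, not code from it):

	/*
	 * RXRPC_INTERRUPTIBLE    - waits are TASK_INTERRUPTIBLE; a signal
	 *                          aborts them (-ERESTARTSYS or -EINTR).
	 * RXRPC_PREINTERRUPTIBLE - interruptible while waiting for a call
	 *                          channel, uninterruptible once in the
	 *                          Tx-window wait.
	 * RXRPC_UNINTERRUPTIBLE  - TASK_UNINTERRUPTIBLE throughout.
	 */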
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 861a831b0ef7..df4560909157 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -789,23 +789,20 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
}
rcu_read_unlock();
- if (a->hw_stats != TCA_ACT_HW_STATS_ANY) {
- struct nla_bitfield32 hw_stats = {
- a->hw_stats,
- TCA_ACT_HW_STATS_ANY,
- };
-
- if (nla_put(skb, TCA_ACT_HW_STATS, sizeof(hw_stats), &hw_stats))
- goto nla_put_failure;
- }
+ if (a->hw_stats != TCA_ACT_HW_STATS_ANY &&
+ nla_put_bitfield32(skb, TCA_ACT_HW_STATS,
+ a->hw_stats, TCA_ACT_HW_STATS_ANY))
+ goto nla_put_failure;
- if (a->tcfa_flags) {
- struct nla_bitfield32 flags = { a->tcfa_flags,
- a->tcfa_flags, };
+ if (a->used_hw_stats_valid &&
+ nla_put_bitfield32(skb, TCA_ACT_USED_HW_STATS,
+ a->used_hw_stats, TCA_ACT_HW_STATS_ANY))
+ goto nla_put_failure;
- if (nla_put(skb, TCA_ACT_FLAGS, sizeof(flags), &flags))
- goto nla_put_failure;
- }
+ if (a->tcfa_flags &&
+ nla_put_bitfield32(skb, TCA_ACT_FLAGS,
+ a->tcfa_flags, a->tcfa_flags))
+ goto nla_put_failure;
nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (nest == NULL)
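nla_put_bitfield32() is the generic netlink helper these conversions lean on; per include/net/netlink.h it is roughly the struct-and-nla_put() pattern being deleted:

	static inline int nla_put_bitfield32(struct sk_buff *skb, int attrtype,
					     __u32 value, __u32 selector)
	{
		struct nla_bitfield32 tmp = { value, selector, };

		return nla_put(skb, attrtype, sizeof(tmp), &tmp);
	}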
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 46f47e58b3be..54d5652cfe6c 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -12,6 +12,7 @@
#include <linux/bpf.h>
#include <net/netlink.h>
+#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
@@ -53,6 +54,8 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act,
bpf_compute_data_pointers(skb);
filter_res = BPF_PROG_RUN(filter, skb);
}
+ if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK)
+ skb_orphan(skb);
rcu_read_unlock();
/* A BPF program may overwrite the default action opcode.
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 56b66d215a89..1a766393be62 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -1273,7 +1273,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
if (goto_ch)
tcf_chain_put_by_act(goto_ch);
if (params)
- kfree_rcu(params, rcu);
+ call_rcu(&params->rcu, tcf_ct_params_free);
if (res == ACT_P_CREATED)
tcf_idr_insert(tn, *a);
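kfree_rcu() can only free the object itself after a grace period; switching to call_rcu() runs a real callback first, so state the params structure references can be torn down before the memory goes away. The general shape, with the teardown left as a comment:

	/* Sketch of a call_rcu() callback that cleans up before freeing. */
	static void params_free_rcu_sketch(struct rcu_head *head)
	{
		struct tcf_ct_params *params =
			container_of(head, struct tcf_ct_params, rcu);

		/* ... release the flow table and other referenced state ... */
		kfree(params);
	}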
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 1ad300e6dbc0..83dd82fc9f40 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -284,10 +284,8 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
/* mirror is always swallowed */
if (is_redirect) {
- skb2->tc_redirected = 1;
- skb2->tc_from_ingress = skb2->tc_at_ingress;
- if (skb2->tc_from_ingress)
- skb2->tstamp = 0;
+ skb_set_redirected(skb2, skb2->tc_at_ingress);
+
/* let the caller reinsert the packet, if possible */
if (use_reinsert) {
res->ingress = want_ingress;
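skb_set_redirected() folds the three open-coded assignments into one helper; its effect is roughly:

	/* Approximate expansion of skb_set_redirected(skb, from_ingress) */
	skb->redirected = 1;
	skb->from_ingress = from_ingress;
	if (skb->from_ingress)
		skb->tstamp = 0;	/* ingress timestamps must not leak to Tx */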
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 3ad718576304..d41d6200d9de 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -409,6 +409,16 @@ done:
return p->tcf_action;
}
+static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u32 packets,
+ u64 lastuse, bool hw)
+{
+ struct tcf_pedit *d = to_pedit(a);
+ struct tcf_t *tm = &d->tcf_tm;
+
+ tcf_action_update_stats(a, bytes, packets, false, hw);
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse);
+}
+
static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,
int bind, int ref)
{
@@ -485,6 +495,7 @@ static struct tc_action_ops act_pedit_ops = {
.id = TCA_ID_PEDIT,
.owner = THIS_MODULE,
.act = tcf_pedit_act,
+ .stats_update = tcf_pedit_stats_update,
.dump = tcf_pedit_dump,
.cleanup = tcf_pedit_cleanup,
.init = tcf_pedit_init,
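With .stats_update wired up, counters gathered by offloading drivers reach the pedit action through tcf_exts_stats_update(); the dispatch is essentially a loop over the filter's actions (simplified sketch):

	/* Simplified: propagate HW counters to every action of a filter */
	for (i = 0; i < exts->nr_actions; i++) {
		struct tc_action *a = exts->actions[i];

		if (a->ops->stats_update)
			a->ops->stats_update(a, bytes, packets, lastuse, true);
	}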
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index e857424c387c..b125b2be4467 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -73,6 +73,16 @@ err:
return TC_ACT_SHOT;
}
+static void tcf_skbedit_stats_update(struct tc_action *a, u64 bytes,
+ u32 packets, u64 lastuse, bool hw)
+{
+ struct tcf_skbedit *d = to_skbedit(a);
+ struct tcf_t *tm = &d->tcf_tm;
+
+ tcf_action_update_stats(a, bytes, packets, false, hw);
+ tm->lastuse = max_t(u64, tm->lastuse, lastuse);
+}
+
static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
[TCA_SKBEDIT_PARMS] = { .len = sizeof(struct tc_skbedit) },
[TCA_SKBEDIT_PRIORITY] = { .len = sizeof(u32) },
@@ -323,6 +333,7 @@ static struct tc_action_ops act_skbedit_ops = {
.id = TCA_ID_SKBEDIT,
.owner = THIS_MODULE,
.act = tcf_skbedit_act,
+ .stats_update = tcf_skbedit_stats_update,
.dump = tcf_skbedit_dump,
.init = tcf_skbedit_init,
.cleanup = tcf_skbedit_cleanup,
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 91a5de0bf628..f6a3b969ead0 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -708,7 +708,7 @@ static void tc_indr_block_call(struct tcf_block *block,
};
INIT_LIST_HEAD(&bo.cb_list);
- flow_indr_block_call(dev, &bo, command);
+ flow_indr_block_call(dev, &bo, command, TC_SETUP_BLOCK);
tcf_block_setup(block, &bo);
}
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 258dc45ab7e3..74a0febcafb8 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -492,7 +492,9 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f,
tcf_exts_stats_update(&f->exts, cls_flower.stats.bytes,
cls_flower.stats.pkts,
- cls_flower.stats.lastused);
+ cls_flower.stats.lastused,
+ cls_flower.stats.used_hw_stats,
+ cls_flower.stats.used_hw_stats_valid);
}
static void __fl_put(struct cls_fl_filter *f)
@@ -738,7 +740,8 @@ static void fl_set_key_val(struct nlattr **tb,
}
static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key,
- struct fl_flow_key *mask)
+ struct fl_flow_key *mask,
+ struct netlink_ext_ack *extack)
{
fl_set_key_val(tb, &key->tp_range.tp_min.dst,
TCA_FLOWER_KEY_PORT_DST_MIN, &mask->tp_range.tp_min.dst,
@@ -753,20 +756,30 @@ static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key,
TCA_FLOWER_KEY_PORT_SRC_MAX, &mask->tp_range.tp_max.src,
TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.src));
- if ((mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst &&
- htons(key->tp_range.tp_max.dst) <=
- htons(key->tp_range.tp_min.dst)) ||
- (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src &&
- htons(key->tp_range.tp_max.src) <=
- htons(key->tp_range.tp_min.src)))
+ if (mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst &&
+ htons(key->tp_range.tp_max.dst) <=
+ htons(key->tp_range.tp_min.dst)) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_PORT_DST_MIN],
+ "Invalid destination port range (min must be strictly smaller than max)");
return -EINVAL;
+ }
+ if (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src &&
+ htons(key->tp_range.tp_max.src) <=
+ htons(key->tp_range.tp_min.src)) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_PORT_SRC_MIN],
+ "Invalid source port range (min must be strictly smaller than max)");
+ return -EINVAL;
+ }
return 0;
}
static int fl_set_key_mpls(struct nlattr **tb,
struct flow_dissector_key_mpls *key_val,
- struct flow_dissector_key_mpls *key_mask)
+ struct flow_dissector_key_mpls *key_mask,
+ struct netlink_ext_ack *extack)
{
if (tb[TCA_FLOWER_KEY_MPLS_TTL]) {
key_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]);
@@ -775,24 +788,36 @@ static int fl_set_key_mpls(struct nlattr **tb,
if (tb[TCA_FLOWER_KEY_MPLS_BOS]) {
u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_BOS]);
- if (bos & ~MPLS_BOS_MASK)
+ if (bos & ~MPLS_BOS_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_BOS],
+ "Bottom Of Stack (BOS) must be 0 or 1");
return -EINVAL;
+ }
key_val->mpls_bos = bos;
key_mask->mpls_bos = MPLS_BOS_MASK;
}
if (tb[TCA_FLOWER_KEY_MPLS_TC]) {
u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TC]);
- if (tc & ~MPLS_TC_MASK)
+ if (tc & ~MPLS_TC_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_TC],
+ "Traffic Class (TC) must be between 0 and 7");
return -EINVAL;
+ }
key_val->mpls_tc = tc;
key_mask->mpls_tc = MPLS_TC_MASK;
}
if (tb[TCA_FLOWER_KEY_MPLS_LABEL]) {
u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_LABEL]);
- if (label & ~MPLS_LABEL_MASK)
+ if (label & ~MPLS_LABEL_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[TCA_FLOWER_KEY_MPLS_LABEL],
+ "Label must be between 0 and 1048575");
return -EINVAL;
+ }
key_val->mpls_label = label;
key_mask->mpls_label = MPLS_LABEL_MASK;
}
@@ -833,14 +858,16 @@ static void fl_set_key_flag(u32 flower_key, u32 flower_mask,
}
}
-static int fl_set_key_flags(struct nlattr **tb,
- u32 *flags_key, u32 *flags_mask)
+static int fl_set_key_flags(struct nlattr **tb, u32 *flags_key,
+ u32 *flags_mask, struct netlink_ext_ack *extack)
{
u32 key, mask;
/* mask is mandatory for flags */
- if (!tb[TCA_FLOWER_KEY_FLAGS_MASK])
+ if (!tb[TCA_FLOWER_KEY_FLAGS_MASK]) {
+ NL_SET_ERR_MSG(extack, "Missing flags mask");
return -EINVAL;
+ }
key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS]));
mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
@@ -1364,7 +1391,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
sizeof(key->icmp.code));
} else if (key->basic.n_proto == htons(ETH_P_MPLS_UC) ||
key->basic.n_proto == htons(ETH_P_MPLS_MC)) {
- ret = fl_set_key_mpls(tb, &key->mpls, &mask->mpls);
+ ret = fl_set_key_mpls(tb, &key->mpls, &mask->mpls, extack);
if (ret)
return ret;
} else if (key->basic.n_proto == htons(ETH_P_ARP) ||
@@ -1389,7 +1416,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
if (key->basic.ip_proto == IPPROTO_TCP ||
key->basic.ip_proto == IPPROTO_UDP ||
key->basic.ip_proto == IPPROTO_SCTP) {
- ret = fl_set_key_port_range(tb, key, mask);
+ ret = fl_set_key_port_range(tb, key, mask, extack);
if (ret)
return ret;
}
@@ -1451,7 +1478,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
return ret;
if (tb[TCA_FLOWER_KEY_FLAGS])
- ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
+ ret = fl_set_key_flags(tb, &key->control.flags,
+ &mask->control.flags, extack);
return ret;
}
diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c
index a34b36adb9b7..8d39dbcf1746 100644
--- a/net/sched/cls_matchall.c
+++ b/net/sched/cls_matchall.c
@@ -338,7 +338,9 @@ static void mall_stats_hw_filter(struct tcf_proto *tp,
tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false, true);
tcf_exts_stats_update(&head->exts, cls_mall.stats.bytes,
- cls_mall.stats.pkts, cls_mall.stats.lastused);
+ cls_mall.stats.pkts, cls_mall.stats.lastused,
+ cls_mall.stats.used_hw_stats,
+ cls_mall.stats.used_hw_stats_valid);
}
static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh,
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 6f8786b06bde..5efa3e7ace15 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -534,8 +534,8 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
fp = &b->ht[h];
for (pfp = rtnl_dereference(*fp); pfp;
fp = &pfp->next, pfp = rtnl_dereference(*fp)) {
- if (pfp == f) {
- *fp = f->next;
+ if (pfp == fold) {
+ rcu_assign_pointer(*fp, fold->next);
break;
}
}
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 09b7dc5fe7e0..61e95029c18f 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -11,6 +11,7 @@
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/slab.h>
+#include <linux/refcount.h>
#include <net/act_api.h>
#include <net/netlink.h>
#include <net/pkt_cls.h>
@@ -26,9 +27,12 @@
#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */
+struct tcindex_data;
+
struct tcindex_filter_result {
struct tcf_exts exts;
struct tcf_result res;
+ struct tcindex_data *p;
struct rcu_work rwork;
};
@@ -49,6 +53,7 @@ struct tcindex_data {
u32 hash; /* hash table size; 0 if undefined */
u32 alloc_hash; /* allocated size */
u32 fall_through; /* 0: only classify if explicit match */
+ refcount_t refcnt; /* a temporary refcnt for perfect hash */
struct rcu_work rwork;
};
@@ -57,6 +62,20 @@ static inline int tcindex_filter_is_set(struct tcindex_filter_result *r)
return tcf_exts_has_actions(&r->exts) || r->res.classid;
}
+static void tcindex_data_get(struct tcindex_data *p)
+{
+ refcount_inc(&p->refcnt);
+}
+
+static void tcindex_data_put(struct tcindex_data *p)
+{
+ if (refcount_dec_and_test(&p->refcnt)) {
+ kfree(p->perfect);
+ kfree(p->h);
+ kfree(p);
+ }
+}
+
static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p,
u16 key)
{
@@ -132,6 +151,7 @@ static int tcindex_init(struct tcf_proto *tp)
p->mask = 0xffff;
p->hash = DEFAULT_HASH_SIZE;
p->fall_through = 1;
+ refcount_set(&p->refcnt, 1); /* Paired with tcindex_destroy_work() */
rcu_assign_pointer(tp->root, p);
return 0;
@@ -141,6 +161,7 @@ static void __tcindex_destroy_rexts(struct tcindex_filter_result *r)
{
tcf_exts_destroy(&r->exts);
tcf_exts_put_net(&r->exts);
+ tcindex_data_put(r->p);
}
static void tcindex_destroy_rexts_work(struct work_struct *work)
@@ -212,6 +233,8 @@ found:
else
__tcindex_destroy_fexts(f);
} else {
+ tcindex_data_get(p);
+
if (tcf_exts_get_net(&r->exts))
tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
else
@@ -228,9 +251,7 @@ static void tcindex_destroy_work(struct work_struct *work)
struct tcindex_data,
rwork);
- kfree(p->perfect);
- kfree(p->h);
- kfree(p);
+ tcindex_data_put(p);
}
static inline int
@@ -248,9 +269,11 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
};
static int tcindex_filter_result_init(struct tcindex_filter_result *r,
+ struct tcindex_data *p,
struct net *net)
{
memset(r, 0, sizeof(*r));
+ r->p = p;
return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT,
TCA_TCINDEX_POLICE);
}
@@ -261,8 +284,10 @@ static void tcindex_partial_destroy_work(struct work_struct *work)
struct tcindex_data,
rwork);
+ rtnl_lock();
kfree(p->perfect);
kfree(p);
+ rtnl_unlock();
}
static void tcindex_free_perfect_hash(struct tcindex_data *cp)
@@ -288,6 +313,7 @@ static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp)
TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
if (err < 0)
goto errout;
+ cp->perfect[i].p = cp;
}
return 0;
@@ -332,6 +358,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
cp->alloc_hash = p->alloc_hash;
cp->fall_through = p->fall_through;
cp->tp = tp;
+ refcount_set(&cp->refcnt, 1); /* Paired with tcindex_destroy_work() */
if (tb[TCA_TCINDEX_HASH])
cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
@@ -357,13 +384,14 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
if (tcindex_alloc_perfect_hash(net, cp) < 0)
goto errout;
+ cp->alloc_hash = cp->hash;
for (i = 0; i < min(cp->hash, p->hash); i++)
cp->perfect[i].res = p->perfect[i].res;
balloc = 1;
}
cp->h = p->h;
- err = tcindex_filter_result_init(&new_filter_result, net);
+ err = tcindex_filter_result_init(&new_filter_result, cp, net);
if (err < 0)
goto errout_alloc;
if (old_r)
@@ -431,7 +459,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
goto errout_alloc;
f->key = handle;
f->next = NULL;
- err = tcindex_filter_result_init(&f->result, net);
+ err = tcindex_filter_result_init(&f->result, cp, net);
if (err < 0) {
kfree(f);
goto errout_alloc;
@@ -444,7 +472,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
}
if (old_r && old_r != r) {
- err = tcindex_filter_result_init(old_r, net);
+ err = tcindex_filter_result_init(old_r, cp, net);
if (err < 0) {
kfree(f);
goto errout_alloc;
@@ -568,6 +596,14 @@ static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held,
for (i = 0; i < p->hash; i++) {
struct tcindex_filter_result *r = p->perfect + i;
+ /* tcf_queue_work() does not guarantee the ordering we
+ * want, so we have to take this refcnt temporarily to
+ * ensure 'p' is freed after all tcindex_filter_result
+ * entries here. Imperfect hash does not need this, because it
+ * uses linked lists rather than an array.
+ */
+ tcindex_data_get(p);
+
tcf_unbind_filter(tp, &r->res);
if (tcf_exts_get_net(&r->exts))
tcf_queue_work(&r->rwork,
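The ordering problem in miniature: each queued tcindex_destroy_rexts_work() may run after tcindex_destroy_work(), so 'p' must stay allocated until every filter result has dropped its reference:

	/*
	 * refcount_set(&p->refcnt, 1)  - owner reference, dropped by
	 *                                tcindex_destroy_work()
	 * tcindex_data_get(p)          - one per queued filter result
	 * tcindex_data_put(p)          - from each destroy_rexts work item;
	 *                                the final put frees p->perfect,
	 *                                p->h and p itself
	 */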
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index b2905b03a432..2eaac2ff380f 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -181,6 +181,11 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
s64 credits;
int len;
+ /* The previous packet is still being sent */
+ if (now < q->last) {
+ qdisc_watchdog_schedule_ns(&q->watchdog, q->last);
+ return NULL;
+ }
if (q->credits < 0) {
credits = timediff_to_credits(now - q->last, q->idleslope);
@@ -212,7 +217,12 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
credits += q->credits;
q->credits = max_t(s64, credits, q->locredit);
- q->last = now;
+ /* Estimated transmission time of the last byte of the packet, in ns */
+ if (unlikely(atomic64_read(&q->port_rate) == 0))
+ q->last = now;
+ else
+ q->last = now + div64_s64(len * NSEC_PER_SEC,
+ atomic64_read(&q->port_rate));
return skb;
}
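With q->port_rate in bytes per second, the new q->last is an estimate of the frame's serialization delay. For a 1500-byte frame on a 1 Gb/s port (125,000,000 B/s):

	last = now + div64_s64(1500 * NSEC_PER_SEC, 125000000)
	     = now + 12000 ns

so the next dequeue is held back, via the watchdog check added at the top of cbs_dequeue_soft(), until the previous frame's last byte has left the port.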
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 3ef0a4f7399b..c7de47c942e3 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -349,10 +349,6 @@ static int red_dump_offload_stats(struct Qdisc *sch)
static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct red_sched_data *q = qdisc_priv(sch);
- struct nla_bitfield32 flags_bf = {
- .selector = red_supported_flags,
- .value = q->flags,
- };
struct nlattr *opts = NULL;
struct tc_red_qopt opt = {
.limit = q->limit,
@@ -375,7 +371,8 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb)
goto nla_put_failure;
if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) ||
nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) ||
- nla_put(skb, TCA_RED_FLAGS, sizeof(flags_bf), &flags_bf))
+ nla_put_bitfield32(skb, TCA_RED_FLAGS,
+ q->flags, red_supported_flags))
goto nla_put_failure;
return nla_nest_end(skb, opts);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index bc734cfaa29e..c87af430107a 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -228,7 +228,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
{
struct sctp_association *asoc = t->asoc;
struct dst_entry *dst = NULL;
- struct flowi6 *fl6 = &fl->u.ip6;
+ struct flowi _fl;
+ struct flowi6 *fl6 = &_fl.u.ip6;
struct sctp_bind_addr *bp;
struct ipv6_pinfo *np = inet6_sk(sk);
struct sctp_sockaddr_entry *laddr;
@@ -238,7 +239,7 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
enum sctp_scope scope;
__u8 matchlen = 0;
- memset(fl6, 0, sizeof(struct flowi6));
+ memset(&_fl, 0, sizeof(_fl));
fl6->daddr = daddr->v6.sin6_addr;
fl6->fl6_dport = daddr->v6.sin6_port;
fl6->flowi6_proto = IPPROTO_SCTP;
@@ -276,8 +277,11 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
rcu_read_unlock();
dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
- if (!asoc || saddr)
+ if (!asoc || saddr) {
+ t->dst = dst;
+ memcpy(fl, &_fl, sizeof(_fl));
goto out;
+ }
bp = &asoc->base.bind_addr;
scope = sctp_scope(daddr);
@@ -300,6 +304,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
if ((laddr->a.sa.sa_family == AF_INET6) &&
(sctp_v6_cmp_addr(&dst_saddr, &laddr->a))) {
rcu_read_unlock();
+ t->dst = dst;
+ memcpy(fl, &_fl, sizeof(_fl));
goto out;
}
}
@@ -338,6 +344,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
if (!IS_ERR_OR_NULL(dst))
dst_release(dst);
dst = bdst;
+ t->dst = dst;
+ memcpy(fl, &_fl, sizeof(_fl));
break;
}
@@ -351,6 +359,8 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
dst_release(dst);
dst = bdst;
matchlen = bmatchlen;
+ t->dst = dst;
+ memcpy(fl, &_fl, sizeof(_fl));
}
rcu_read_unlock();
@@ -359,14 +369,12 @@ out:
struct rt6_info *rt;
rt = (struct rt6_info *)dst;
- t->dst = dst;
t->dst_cookie = rt6_get_cookie(rt);
pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n",
&rt->rt6i_dst.addr, rt->rt6i_dst.plen,
- &fl6->saddr);
+ &fl->u.ip6.saddr);
} else {
t->dst = NULL;
-
pr_debug("no route\n");
}
}
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 78af2fcf90cc..092d1afdee0d 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -409,7 +409,8 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
{
struct sctp_association *asoc = t->asoc;
struct rtable *rt;
- struct flowi4 *fl4 = &fl->u.ip4;
+ struct flowi _fl;
+ struct flowi4 *fl4 = &_fl.u.ip4;
struct sctp_bind_addr *bp;
struct sctp_sockaddr_entry *laddr;
struct dst_entry *dst = NULL;
@@ -419,7 +420,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
if (t->dscp & SCTP_DSCP_SET_MASK)
tos = t->dscp & SCTP_DSCP_VAL_MASK;
- memset(fl4, 0x0, sizeof(struct flowi4));
+ memset(&_fl, 0x0, sizeof(_fl));
fl4->daddr = daddr->v4.sin_addr.s_addr;
fl4->fl4_dport = daddr->v4.sin_port;
fl4->flowi4_proto = IPPROTO_SCTP;
@@ -438,8 +439,11 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
&fl4->saddr);
rt = ip_route_output_key(sock_net(sk), fl4);
- if (!IS_ERR(rt))
+ if (!IS_ERR(rt)) {
dst = &rt->dst;
+ t->dst = dst;
+ memcpy(fl, &_fl, sizeof(_fl));
+ }
/* If there is no association or if a source address is passed, no
* more validation is required.
@@ -502,27 +506,33 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr,
false);
if (!odev || odev->ifindex != fl4->flowi4_oif) {
- if (!dst)
+ if (!dst) {
dst = &rt->dst;
- else
+ t->dst = dst;
+ memcpy(fl, &_fl, sizeof(_fl));
+ } else {
dst_release(&rt->dst);
+ }
continue;
}
dst_release(dst);
dst = &rt->dst;
+ t->dst = dst;
+ memcpy(fl, &_fl, sizeof(_fl));
break;
}
out_unlock:
rcu_read_unlock();
out:
- t->dst = dst;
- if (dst)
+ if (dst) {
pr_debug("rt_dst:%pI4, rt_src:%pI4\n",
- &fl4->daddr, &fl4->saddr);
- else
+ &fl->u.ip4.daddr, &fl->u.ip4.saddr);
+ } else {
+ t->dst = NULL;
pr_debug("no route\n");
+ }
}
/* For v4, the source address is cached in the route entry (dst). So no need
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index fed26a1e9518..827a9903ee28 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -147,29 +147,44 @@ static void sctp_clear_owner_w(struct sctp_chunk *chunk)
skb_orphan(chunk->skb);
}
+#define traverse_and_process() \
+do { \
+ msg = chunk->msg; \
+ if (msg == prev_msg) \
+ continue; \
+ list_for_each_entry(c, &msg->chunks, frag_list) { \
+ if ((clear && asoc->base.sk == c->skb->sk) || \
+ (!clear && asoc->base.sk != c->skb->sk)) \
+ cb(c); \
+ } \
+ prev_msg = msg; \
+} while (0)
+
static void sctp_for_each_tx_datachunk(struct sctp_association *asoc,
+ bool clear,
void (*cb)(struct sctp_chunk *))
{
+ struct sctp_datamsg *msg, *prev_msg = NULL;
struct sctp_outq *q = &asoc->outqueue;
+ struct sctp_chunk *chunk, *c;
struct sctp_transport *t;
- struct sctp_chunk *chunk;
list_for_each_entry(t, &asoc->peer.transport_addr_list, transports)
list_for_each_entry(chunk, &t->transmitted, transmitted_list)
- cb(chunk);
+ traverse_and_process();
list_for_each_entry(chunk, &q->retransmit, transmitted_list)
- cb(chunk);
+ traverse_and_process();
list_for_each_entry(chunk, &q->sacked, transmitted_list)
- cb(chunk);
+ traverse_and_process();
list_for_each_entry(chunk, &q->abandoned, transmitted_list)
- cb(chunk);
+ traverse_and_process();
list_for_each_entry(chunk, &q->out_chunk_list, list)
- cb(chunk);
+ traverse_and_process();
}
static void sctp_for_each_rx_skb(struct sctp_association *asoc, struct sock *sk,
@@ -9574,9 +9589,9 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
* paths won't try to lock it and then oldsk.
*/
lock_sock_nested(newsk, SINGLE_DEPTH_NESTING);
- sctp_for_each_tx_datachunk(assoc, sctp_clear_owner_w);
+ sctp_for_each_tx_datachunk(assoc, true, sctp_clear_owner_w);
sctp_assoc_migrate(assoc, newsk);
- sctp_for_each_tx_datachunk(assoc, sctp_set_owner_w);
+ sctp_for_each_tx_datachunk(assoc, false, sctp_set_owner_w);
/* If the association on the newsk is already closed before accept()
* is called, set RCV_SHUTDOWN flag.
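The new boolean selects direction: clear == true matches chunks still owned by the old socket and orphans them, clear == false matches chunks not yet owned by the new socket and attaches them. The prev_msg guard in the macro exists because fragments of one datamsg appear on several of the walked lists; condensed, the dedup step is:

	if (chunk->msg == prev_msg)	/* this datamsg was already walked */
		continue;
	prev_msg = chunk->msg;		/* walk its frag_list exactly once */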
diff --git a/net/socket.c b/net/socket.c
index b79a05de7c6e..2dd739fba866 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1707,7 +1707,8 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog)
int __sys_accept4_file(struct file *file, unsigned file_flags,
struct sockaddr __user *upeer_sockaddr,
- int __user *upeer_addrlen, int flags)
+ int __user *upeer_addrlen, int flags,
+ unsigned long nofile)
{
struct socket *sock, *newsock;
struct file *newfile;
@@ -1738,7 +1739,7 @@ int __sys_accept4_file(struct file *file, unsigned file_flags,
*/
__module_get(newsock->ops->owner);
- newfd = get_unused_fd_flags(flags);
+ newfd = __get_unused_fd_flags(flags, nofile);
if (unlikely(newfd < 0)) {
err = newfd;
sock_release(newsock);
@@ -1807,7 +1808,8 @@ int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
f = fdget(fd);
if (f.file) {
ret = __sys_accept4_file(f.file, 0, upeer_sockaddr,
- upeer_addrlen, flags);
+ upeer_addrlen, flags,
+ rlimit(RLIMIT_NOFILE));
if (f.flags)
fput(f.file);
}
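Threading nofile through lets a caller supply a descriptor ceiling other than the current task's: the plain syscall path passes rlimit(RLIMIT_NOFILE) as above, while io_uring's accept path captures the submitting task's limit at request-prep time because the request may later run from a worker thread with different rlimits. A hypothetical non-syscall caller:

	ret = __sys_accept4_file(file, file_flags, addr, addrlen, flags,
				 captured_nofile /* snapshot, hypothetical */);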
@@ -2226,10 +2228,10 @@ struct used_address {
unsigned int name_len;
};
-static int copy_msghdr_from_user(struct msghdr *kmsg,
- struct user_msghdr __user *umsg,
- struct sockaddr __user **save_addr,
- struct iovec **iov)
+int __copy_msghdr_from_user(struct msghdr *kmsg,
+ struct user_msghdr __user *umsg,
+ struct sockaddr __user **save_addr,
+ struct iovec __user **uiov, size_t *nsegs)
{
struct user_msghdr msg;
ssize_t err;
@@ -2271,6 +2273,23 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
return -EMSGSIZE;
kmsg->msg_iocb = NULL;
+ *uiov = msg.msg_iov;
+ *nsegs = msg.msg_iovlen;
+ return 0;
+}
+
+static int copy_msghdr_from_user(struct msghdr *kmsg,
+ struct user_msghdr __user *umsg,
+ struct sockaddr __user **save_addr,
+ struct iovec **iov)
+{
+ struct user_msghdr msg;
+ ssize_t err;
+
+ err = __copy_msghdr_from_user(kmsg, umsg, save_addr, &msg.msg_iov,
+ &msg.msg_iovlen);
+ if (err)
+ return err;
err = import_iovec(save_addr ? READ : WRITE,
msg.msg_iov, msg.msg_iovlen,
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 24ca861815b1..25fbd8d9de74 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -20,6 +20,7 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/auth.h>
#include <linux/sunrpc/auth_gss.h>
+#include <linux/sunrpc/gss_krb5.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/gss_err.h>
#include <linux/workqueue.h>
@@ -1050,7 +1051,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
goto err_put_mech;
auth = &gss_auth->rpc_auth;
auth->au_cslack = GSS_CRED_SLACK >> 2;
- auth->au_rslack = GSS_VERF_SLACK >> 2;
+ auth->au_rslack = GSS_KRB5_MAX_SLACK_NEEDED >> 2;
auth->au_verfsize = GSS_VERF_SLACK >> 2;
auth->au_ralign = GSS_VERF_SLACK >> 2;
auth->au_flags = 0;
@@ -1724,8 +1725,9 @@ bad_mic:
goto out;
}
-static int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
- struct rpc_task *task, struct xdr_stream *xdr)
+static noinline_for_stack int
+gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+ struct rpc_task *task, struct xdr_stream *xdr)
{
struct rpc_rqst *rqstp = task->tk_rqstp;
struct xdr_buf integ_buf, *snd_buf = &rqstp->rq_snd_buf;
@@ -1816,8 +1818,9 @@ out:
return -EAGAIN;
}
-static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
- struct rpc_task *task, struct xdr_stream *xdr)
+static noinline_for_stack int
+gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+ struct rpc_task *task, struct xdr_stream *xdr)
{
struct rpc_rqst *rqstp = task->tk_rqstp;
struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
@@ -1877,7 +1880,7 @@ static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
else
iov = snd_buf->head;
p = iov->iov_base + iov->iov_len;
- pad = 3 - ((snd_buf->len - offset - 1) & 3);
+ pad = xdr_pad_size(snd_buf->len - offset);
memset(p, 0, pad);
iov->iov_len += pad;
snd_buf->len += pad;
@@ -1934,35 +1937,69 @@ gss_unwrap_resp_auth(struct rpc_cred *cred)
return 0;
}
-static int
+/*
+ * RFC 2203, Section 5.3.2.2
+ *
+ * struct rpc_gss_integ_data {
+ * opaque databody_integ<>;
+ * opaque checksum<>;
+ * };
+ *
+ * struct rpc_gss_data_t {
+ * unsigned int seq_num;
+ * proc_req_arg_t arg;
+ * };
+ */
+static noinline_for_stack int
gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
struct xdr_stream *xdr)
{
- struct xdr_buf integ_buf, *rcv_buf = &rqstp->rq_rcv_buf;
- u32 data_offset, mic_offset, integ_len, maj_stat;
+ struct xdr_buf gss_data, *rcv_buf = &rqstp->rq_rcv_buf;
struct rpc_auth *auth = cred->cr_auth;
+ u32 len, offset, seqno, maj_stat;
struct xdr_netobj mic;
- __be32 *p;
+ int ret;
- p = xdr_inline_decode(xdr, 2 * sizeof(*p));
- if (unlikely(!p))
+ ret = -EIO;
+ mic.data = NULL;
+
+ /* opaque databody_integ<>; */
+ if (xdr_stream_decode_u32(xdr, &len))
goto unwrap_failed;
- integ_len = be32_to_cpup(p++);
- if (integ_len & 3)
+ if (len & 3)
goto unwrap_failed;
- data_offset = (u8 *)(p) - (u8 *)rcv_buf->head[0].iov_base;
- mic_offset = integ_len + data_offset;
- if (mic_offset > rcv_buf->len)
+ offset = rcv_buf->len - xdr_stream_remaining(xdr);
+ if (xdr_stream_decode_u32(xdr, &seqno))
goto unwrap_failed;
- if (be32_to_cpup(p) != rqstp->rq_seqno)
+ if (seqno != rqstp->rq_seqno)
goto bad_seqno;
+ if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len))
+ goto unwrap_failed;
- if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len))
+ /*
+ * The xdr_stream now points to the beginning of the
+ * upper layer payload, to be passed below to
+ * rpcauth_unwrap_resp_decode(). The checksum, which
+ * follows the upper layer payload in @rcv_buf, is
+ * located and parsed without updating the xdr_stream.
+ */
+
+ /* opaque checksum<>; */
+ offset += len;
+ if (xdr_decode_word(rcv_buf, offset, &len))
+ goto unwrap_failed;
+ offset += sizeof(__be32);
+ if (offset + len > rcv_buf->len)
goto unwrap_failed;
- if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset))
+ mic.len = len;
+ mic.data = kmalloc(len, GFP_NOFS);
+ if (!mic.data)
+ goto unwrap_failed;
+ if (read_bytes_from_xdr_buf(rcv_buf, offset, mic.data, mic.len))
goto unwrap_failed;
- maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
+
+ maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &gss_data, &mic);
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
if (maj_stat != GSS_S_COMPLETE)
@@ -1970,19 +2007,24 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
auth->au_rslack = auth->au_verfsize + 2 + 1 + XDR_QUADLEN(mic.len);
auth->au_ralign = auth->au_verfsize + 2;
- return 0;
+ ret = 0;
+
+out:
+ kfree(mic.data);
+ return ret;
+
unwrap_failed:
trace_rpcgss_unwrap_failed(task);
- return -EIO;
+ goto out;
bad_seqno:
- trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(p));
- return -EIO;
+ trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, seqno);
+ goto out;
bad_mic:
trace_rpcgss_verify_mic(task, maj_stat);
- return -EIO;
+ goto out;
}
-static int
+static noinline_for_stack int
gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred,
struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
struct xdr_stream *xdr)
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 65b67b257302..54ae5be62f6a 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -55,10 +55,6 @@
#include "gss_rpc_upcall.h"
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_AUTH
-#endif
-
/* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests
* into replies.
*
@@ -184,6 +180,11 @@ static struct cache_head *rsi_alloc(void)
return NULL;
}
+static int rsi_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall_timeout(cd, h);
+}
+
static void rsi_request(struct cache_detail *cd,
struct cache_head *h,
char **bpp, int *blen)
@@ -282,6 +283,7 @@ static const struct cache_detail rsi_cache_template = {
.hash_size = RSI_HASHMAX,
.name = "auth.rpcsec.init",
.cache_put = rsi_put,
+ .cache_upcall = rsi_upcall,
.cache_request = rsi_request,
.cache_parse = rsi_parse,
.match = rsi_match,
@@ -428,6 +430,11 @@ rsc_alloc(void)
return NULL;
}
+static int rsc_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return -EINVAL;
+}
+
static int rsc_parse(struct cache_detail *cd,
char *mesg, int mlen)
{
@@ -554,6 +561,7 @@ static const struct cache_detail rsc_cache_template = {
.hash_size = RSC_HASHMAX,
.name = "auth.rpcsec.context",
.cache_put = rsc_put,
+ .cache_upcall = rsc_upcall,
.cache_parse = rsc_parse,
.match = rsc_match,
.init = rsc_init,
@@ -713,14 +721,12 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci,
}
if (gc->gc_seq > MAXSEQ) {
- dprintk("RPC: svcauth_gss: discarding request with "
- "large sequence number %d\n", gc->gc_seq);
+ trace_rpcgss_svc_large_seqno(rqstp->rq_xid, gc->gc_seq);
*authp = rpcsec_gsserr_ctxproblem;
return SVC_DENIED;
}
if (!gss_check_seq_num(rsci, gc->gc_seq)) {
- dprintk("RPC: svcauth_gss: discarding request with "
- "old sequence number %d\n", gc->gc_seq);
+ trace_rpcgss_svc_old_seqno(rqstp->rq_xid, gc->gc_seq);
return SVC_DROP;
}
return SVC_OK;
@@ -961,7 +967,7 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs
/* XXX: This is very inefficient. It would be better to either do
* this while we encrypt, or maybe in the receive code, if we can peek
* ahead and work out the service and mechanism there. */
- offset = buf->head[0].iov_len % 4;
+ offset = xdr_pad_size(buf->head[0].iov_len);
if (offset) {
buf->buflen = RPCSVC_MAXPAYLOAD;
xdr_shift_buf(buf, offset);
@@ -1245,7 +1251,6 @@ static int gss_proxy_save_rsc(struct cache_detail *cd,
if (!ud->found_creds) {
/* userspace seems buggy, we should always get at least a
* mapping to nobody */
- dprintk("RPC: No creds found!\n");
goto out;
} else {
struct timespec64 boot;
@@ -1311,8 +1316,8 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
if (status)
goto out;
- trace_rpcgss_accept_upcall(rqstp->rq_xid, ud.major_status,
- ud.minor_status);
+ trace_rpcgss_svc_accept_upcall(rqstp->rq_xid, ud.major_status,
+ ud.minor_status);
switch (ud.major_status) {
case GSS_S_CONTINUE_NEEDED:
@@ -1320,31 +1325,23 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
break;
case GSS_S_COMPLETE:
status = gss_proxy_save_rsc(sn->rsc_cache, &ud, &handle);
- if (status) {
- pr_info("%s: gss_proxy_save_rsc failed (%d)\n",
- __func__, status);
+ if (status)
goto out;
- }
cli_handle.data = (u8 *)&handle;
cli_handle.len = sizeof(handle);
break;
default:
- ret = SVC_CLOSE;
goto out;
}
/* Got an answer to the upcall; use it: */
if (gss_write_init_verf(sn->rsc_cache, rqstp,
- &cli_handle, &ud.major_status)) {
- pr_info("%s: gss_write_init_verf failed\n", __func__);
+ &cli_handle, &ud.major_status))
goto out;
- }
if (gss_write_resv(resv, PAGE_SIZE,
&cli_handle, &ud.out_token,
- ud.major_status, ud.minor_status)) {
- pr_info("%s: gss_write_resv failed\n", __func__);
+ ud.major_status, ud.minor_status))
goto out;
- }
ret = SVC_COMPLETE;
out:
@@ -1495,8 +1492,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
int ret;
struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
- dprintk("RPC: svcauth_gss: argv->iov_len = %zd\n",
- argv->iov_len);
+ trace_rpcgss_svc_accept(rqstp->rq_xid, argv->iov_len);
*authp = rpc_autherr_badcred;
if (!svcdata)
@@ -1680,7 +1676,8 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp)
goto out;
integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base;
integ_len = resbuf->len - integ_offset;
- BUG_ON(integ_len % 4);
+ if (integ_len & 3)
+ goto out;
*p++ = htonl(integ_len);
*p++ = htonl(gc->gc_seq);
if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, integ_len)) {
@@ -1704,7 +1701,8 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp)
resv->iov_len += XDR_QUADLEN(mic.len) << 2;
/* not strictly required: */
resbuf->len += XDR_QUADLEN(mic.len) << 2;
- BUG_ON(resv->iov_len > PAGE_SIZE);
+ if (resv->iov_len > PAGE_SIZE)
+ goto out_err;
out:
stat = 0;
out_err:
@@ -1740,9 +1738,11 @@ svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp)
* both the head and tail.
*/
if (resbuf->tail[0].iov_base) {
- BUG_ON(resbuf->tail[0].iov_base >= resbuf->head[0].iov_base
- + PAGE_SIZE);
- BUG_ON(resbuf->tail[0].iov_base < resbuf->head[0].iov_base);
+ if (resbuf->tail[0].iov_base >=
+ resbuf->head[0].iov_base + PAGE_SIZE)
+ return -EINVAL;
+ if (resbuf->tail[0].iov_base < resbuf->head[0].iov_base)
+ return -EINVAL;
if (resbuf->tail[0].iov_len + resbuf->head[0].iov_len
+ 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE)
return -ENOMEM;
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index bd843a81afa0..af0ddd28b081 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -32,13 +32,13 @@
#include <linux/sunrpc/cache.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/rpc_pipe_fs.h>
+#include <trace/events/sunrpc.h>
#include "netns.h"
#define RPCDBG_FACILITY RPCDBG_CACHE
static bool cache_defer_req(struct cache_req *req, struct cache_head *item);
static void cache_revisit_request(struct cache_head *item);
-static bool cache_listeners_exist(struct cache_detail *detail);
static void cache_init(struct cache_head *h, struct cache_detail *detail)
{
@@ -65,13 +65,14 @@ static struct cache_head *sunrpc_cache_find_rcu(struct cache_detail *detail,
rcu_read_lock();
hlist_for_each_entry_rcu(tmp, head, cache_list) {
- if (detail->match(tmp, key)) {
- if (cache_is_expired(detail, tmp))
- continue;
- tmp = cache_get_rcu(tmp);
- rcu_read_unlock();
- return tmp;
- }
+ if (!detail->match(tmp, key))
+ continue;
+ if (test_bit(CACHE_VALID, &tmp->flags) &&
+ cache_is_expired(detail, tmp))
+ continue;
+ tmp = cache_get_rcu(tmp);
+ rcu_read_unlock();
+ return tmp;
}
rcu_read_unlock();
return NULL;
@@ -113,18 +114,21 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail,
spin_lock(&detail->hash_lock);
/* check if entry appeared while we slept */
- hlist_for_each_entry_rcu(tmp, head, cache_list) {
- if (detail->match(tmp, key)) {
- if (cache_is_expired(detail, tmp)) {
- sunrpc_begin_cache_remove_entry(tmp, detail);
- freeme = tmp;
- break;
- }
- cache_get(tmp);
- spin_unlock(&detail->hash_lock);
- cache_put(new, detail);
- return tmp;
+ hlist_for_each_entry_rcu(tmp, head, cache_list,
+ lockdep_is_held(&detail->hash_lock)) {
+ if (!detail->match(tmp, key))
+ continue;
+ if (test_bit(CACHE_VALID, &tmp->flags) &&
+ cache_is_expired(detail, tmp)) {
+ sunrpc_begin_cache_remove_entry(tmp, detail);
+ trace_cache_entry_expired(detail, tmp);
+ freeme = tmp;
+ break;
}
+ cache_get(tmp);
+ spin_unlock(&detail->hash_lock);
+ cache_put(new, detail);
+ return tmp;
}
hlist_add_head_rcu(&new->cache_list, head);
@@ -174,6 +178,25 @@ static void cache_fresh_unlocked(struct cache_head *head,
}
}
+static void cache_make_negative(struct cache_detail *detail,
+ struct cache_head *h)
+{
+ set_bit(CACHE_NEGATIVE, &h->flags);
+ trace_cache_entry_make_negative(detail, h);
+}
+
+static void cache_entry_update(struct cache_detail *detail,
+ struct cache_head *h,
+ struct cache_head *new)
+{
+ if (!test_bit(CACHE_NEGATIVE, &new->flags)) {
+ detail->update(h, new);
+ trace_cache_entry_update(detail, h);
+ } else {
+ cache_make_negative(detail, h);
+ }
+}
+
struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
struct cache_head *new, struct cache_head *old, int hash)
{
@@ -186,10 +209,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
if (!test_bit(CACHE_VALID, &old->flags)) {
spin_lock(&detail->hash_lock);
if (!test_bit(CACHE_VALID, &old->flags)) {
- if (test_bit(CACHE_NEGATIVE, &new->flags))
- set_bit(CACHE_NEGATIVE, &old->flags);
- else
- detail->update(old, new);
+ cache_entry_update(detail, old, new);
cache_fresh_locked(old, new->expiry_time, detail);
spin_unlock(&detail->hash_lock);
cache_fresh_unlocked(old, detail);
@@ -207,10 +227,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
detail->init(tmp, old);
spin_lock(&detail->hash_lock);
- if (test_bit(CACHE_NEGATIVE, &new->flags))
- set_bit(CACHE_NEGATIVE, &tmp->flags);
- else
- detail->update(tmp, new);
+ cache_entry_update(detail, tmp, new);
hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]);
detail->entries++;
cache_get(tmp);
@@ -224,13 +241,6 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
}
EXPORT_SYMBOL_GPL(sunrpc_cache_update);
-static int cache_make_upcall(struct cache_detail *cd, struct cache_head *h)
-{
- if (cd->cache_upcall)
- return cd->cache_upcall(cd, h);
- return sunrpc_cache_pipe_upcall(cd, h);
-}
-
static inline int cache_is_valid(struct cache_head *h)
{
if (!test_bit(CACHE_VALID, &h->flags))
@@ -259,7 +269,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h
spin_lock(&detail->hash_lock);
rv = cache_is_valid(h);
if (rv == -EAGAIN) {
- set_bit(CACHE_NEGATIVE, &h->flags);
+ cache_make_negative(detail, h);
cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY,
detail);
rv = -ENOENT;
@@ -303,17 +313,14 @@ int cache_check(struct cache_detail *detail,
(h->expiry_time != 0 && age > refresh_age/2)) {
dprintk("RPC: Want update, refage=%lld, age=%lld\n",
refresh_age, age);
- if (!test_and_set_bit(CACHE_PENDING, &h->flags)) {
- switch (cache_make_upcall(detail, h)) {
- case -EINVAL:
- rv = try_to_negate_entry(detail, h);
- break;
- case -EAGAIN:
- cache_fresh_unlocked(h, detail);
- break;
- }
- } else if (!cache_listeners_exist(detail))
+ switch (detail->cache_upcall(detail, h)) {
+ case -EINVAL:
rv = try_to_negate_entry(detail, h);
+ break;
+ case -EAGAIN:
+ cache_fresh_unlocked(h, detail);
+ break;
+ }
}
if (rv == -EAGAIN) {
@@ -468,6 +475,7 @@ static int cache_clean(void)
continue;
sunrpc_begin_cache_remove_entry(ch, current_detail);
+ trace_cache_entry_expired(current_detail, ch);
rv = 1;
break;
}
@@ -1195,20 +1203,12 @@ static bool cache_listeners_exist(struct cache_detail *detail)
*
* Each request is at most one page long.
*/
-int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h)
+static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h)
{
-
char *buf;
struct cache_request *crq;
int ret = 0;
- if (!detail->cache_request)
- return -EINVAL;
-
- if (!cache_listeners_exist(detail)) {
- warn_no_listener(detail);
- return -EINVAL;
- }
if (test_bit(CACHE_CLEANED, &h->flags))
/* Too late to make an upcall */
return -EAGAIN;
@@ -1231,6 +1231,7 @@ int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h)
if (test_bit(CACHE_PENDING, &h->flags)) {
crq->item = cache_get(h);
list_add_tail(&crq->q.list, &detail->queue);
+ trace_cache_entry_upcall(detail, h);
} else
/* Lost a race, no longer PENDING, so don't enqueue */
ret = -EAGAIN;
@@ -1242,8 +1243,27 @@ int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h)
}
return ret;
}
+
+int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h)
+{
+ if (test_and_set_bit(CACHE_PENDING, &h->flags))
+ return 0;
+ return cache_pipe_upcall(detail, h);
+}
EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall);
+int sunrpc_cache_pipe_upcall_timeout(struct cache_detail *detail,
+ struct cache_head *h)
+{
+ if (!cache_listeners_exist(detail)) {
+ warn_no_listener(detail);
+ trace_cache_entry_no_listener(detail, h);
+ return -EINVAL;
+ }
+ return sunrpc_cache_pipe_upcall(detail, h);
+}
+EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall_timeout);
+
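The split separates queueing an upcall from deciding whether one is worthwhile: sunrpc_cache_pipe_upcall() now owns the CACHE_PENDING transition, while the _timeout variant additionally refuses when no userspace listener exists. A cache whose requests may legitimately wait for a listener wires the latter in as its .cache_upcall method, exactly as rsi_upcall() does earlier in this series:

	static int example_upcall(struct cache_detail *cd, struct cache_head *h)
	{
		return sunrpc_cache_pipe_upcall_timeout(cd, h);
	}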
/*
* parse a message from user-space and pass it
* to an appropriate cache
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 7324b21f923e..325a0858700f 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1099,8 +1099,9 @@ rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
task->tk_msg.rpc_proc = msg->rpc_proc;
task->tk_msg.rpc_argp = msg->rpc_argp;
task->tk_msg.rpc_resp = msg->rpc_resp;
- if (msg->rpc_cred != NULL)
- task->tk_msg.rpc_cred = get_cred(msg->rpc_cred);
+ task->tk_msg.rpc_cred = msg->rpc_cred;
+ if (!(task->tk_flags & RPC_TASK_CRED_NOREF))
+ get_cred(task->tk_msg.rpc_cred);
}
}
@@ -1126,6 +1127,9 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
task = rpc_new_task(task_setup_data);
+ if (!RPC_IS_ASYNC(task))
+ task->tk_flags |= RPC_TASK_CRED_NOREF;
+
rpc_task_set_client(task, task_setup_data->rpc_client);
rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
@@ -2509,6 +2513,7 @@ call_decode(struct rpc_task *task)
goto out;
req->rq_rcv_buf.len = req->rq_private_buf.len;
+ trace_xprt_recvfrom(&req->rq_rcv_buf);
/* Check that the softirq receive buffer is valid */
WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf,
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 55e900255b0c..7eba20a88438 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -204,10 +204,6 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue,
struct rpc_task *task,
unsigned char queue_priority)
{
- WARN_ON_ONCE(RPC_IS_QUEUED(task));
- if (RPC_IS_QUEUED(task))
- return;
-
INIT_LIST_HEAD(&task->u.tk_wait.timer_list);
if (RPC_IS_PRIORITY(queue))
__rpc_add_wait_queue_priority(queue, task, queue_priority);
@@ -382,7 +378,7 @@ static void rpc_make_runnable(struct workqueue_struct *wq,
* NB: An RPC task will only receive interrupt-driven events as long
* as it's on a wait queue.
*/
-static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
+static void __rpc_do_sleep_on_priority(struct rpc_wait_queue *q,
struct rpc_task *task,
unsigned char queue_priority)
{
@@ -395,12 +391,23 @@ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
}
+static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
+ struct rpc_task *task,
+ unsigned char queue_priority)
+{
+ if (WARN_ON_ONCE(RPC_IS_QUEUED(task)))
+ return;
+ __rpc_do_sleep_on_priority(q, task, queue_priority);
+}
+
static void __rpc_sleep_on_priority_timeout(struct rpc_wait_queue *q,
struct rpc_task *task, unsigned long timeout,
unsigned char queue_priority)
{
+ if (WARN_ON_ONCE(RPC_IS_QUEUED(task)))
+ return;
if (time_is_after_jiffies(timeout)) {
- __rpc_sleep_on_priority(q, task, queue_priority);
+ __rpc_do_sleep_on_priority(q, task, queue_priority);
__rpc_add_timer(q, task, timeout);
} else
task->tk_status = -ETIMEDOUT;
@@ -1162,7 +1169,8 @@ static void rpc_release_resources_task(struct rpc_task *task)
{
xprt_release(task);
if (task->tk_msg.rpc_cred) {
- put_cred(task->tk_msg.rpc_cred);
+ if (!(task->tk_flags & RPC_TASK_CRED_NOREF))
+ put_cred(task->tk_msg.rpc_cred);
task->tk_msg.rpc_cred = NULL;
}
rpc_task_release_client(task);
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index 1a864f1ed119..3fc8af8bb961 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -14,9 +14,24 @@
#include <linux/types.h>
#include <linux/pagemap.h>
#include <linux/udp.h>
+#include <linux/sunrpc/msg_prot.h>
#include <linux/sunrpc/xdr.h>
#include <linux/export.h>
+#include "socklib.h"
+
+/*
+ * Helper structure for copying from an sk_buff.
+ */
+struct xdr_skb_reader {
+ struct sk_buff *skb;
+ unsigned int offset;
+ size_t count;
+ __wsum csum;
+};
+
+typedef size_t (*xdr_skb_read_actor)(struct xdr_skb_reader *desc, void *to,
+ size_t len);
/**
* xdr_skb_read_bits - copy some data bits from skb to internal buffer
@@ -186,3 +201,129 @@ no_checksum:
return 0;
}
EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr);
+
+static inline int xprt_sendmsg(struct socket *sock, struct msghdr *msg,
+ size_t seek)
+{
+ if (seek)
+ iov_iter_advance(&msg->msg_iter, seek);
+ return sock_sendmsg(sock, msg);
+}
+
+static int xprt_send_kvec(struct socket *sock, struct msghdr *msg,
+ struct kvec *vec, size_t seek)
+{
+ iov_iter_kvec(&msg->msg_iter, WRITE, vec, 1, vec->iov_len);
+ return xprt_sendmsg(sock, msg, seek);
+}
+
+static int xprt_send_pagedata(struct socket *sock, struct msghdr *msg,
+ struct xdr_buf *xdr, size_t base)
+{
+ int err;
+
+ err = xdr_alloc_bvec(xdr, GFP_KERNEL);
+ if (err < 0)
+ return err;
+
+ iov_iter_bvec(&msg->msg_iter, WRITE, xdr->bvec, xdr_buf_pagecount(xdr),
+ xdr->page_len + xdr->page_base);
+ return xprt_sendmsg(sock, msg, base + xdr->page_base);
+}
+
+/* Common case:
+ * - stream transport
+ * - sending from byte 0 of the message
+ * - the message is wholly contained in @xdr's head iovec
+ */
+static int xprt_send_rm_and_kvec(struct socket *sock, struct msghdr *msg,
+ rpc_fraghdr marker, struct kvec *vec,
+ size_t base)
+{
+ struct kvec iov[2] = {
+ [0] = {
+ .iov_base = &marker,
+ .iov_len = sizeof(marker)
+ },
+ [1] = *vec,
+ };
+ size_t len = iov[0].iov_len + iov[1].iov_len;
+
+ iov_iter_kvec(&msg->msg_iter, WRITE, iov, 2, len);
+ return xprt_sendmsg(sock, msg, base);
+}
+
+/**
+ * xprt_sock_sendmsg - write an xdr_buf directly to a socket
+ * @sock: open socket to send on
+ * @msg: socket message metadata
+ * @xdr: xdr_buf containing this request
+ * @base: starting position in the buffer
+ * @marker: stream record marker field
+ * @sent_p: return the total number of bytes successfully queued for sending
+ *
+ * Return values:
+ * On success, returns zero and fills in @sent_p.
+ * %-ENOTSOCK if @sock is not a struct socket.
+ */
+int xprt_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ struct xdr_buf *xdr, unsigned int base,
+ rpc_fraghdr marker, unsigned int *sent_p)
+{
+ unsigned int rmsize = marker ? sizeof(marker) : 0;
+ unsigned int remainder = rmsize + xdr->len - base;
+ unsigned int want;
+ int err = 0;
+
+ *sent_p = 0;
+
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ msg->msg_flags |= MSG_MORE;
+ want = xdr->head[0].iov_len + rmsize;
+ if (base < want) {
+ unsigned int len = want - base;
+
+ remainder -= len;
+ if (remainder == 0)
+ msg->msg_flags &= ~MSG_MORE;
+ if (rmsize)
+ err = xprt_send_rm_and_kvec(sock, msg, marker,
+ &xdr->head[0], base);
+ else
+ err = xprt_send_kvec(sock, msg, &xdr->head[0], base);
+ if (remainder == 0 || err != len)
+ goto out;
+ *sent_p += err;
+ base = 0;
+ } else {
+ base -= want;
+ }
+
+ if (base < xdr->page_len) {
+ unsigned int len = xdr->page_len - base;
+
+ remainder -= len;
+ if (remainder == 0)
+ msg->msg_flags &= ~MSG_MORE;
+ err = xprt_send_pagedata(sock, msg, xdr, base);
+ if (remainder == 0 || err != len)
+ goto out;
+ *sent_p += err;
+ base = 0;
+ } else {
+ base -= xdr->page_len;
+ }
+
+ if (base >= xdr->tail[0].iov_len)
+ return 0;
+ msg->msg_flags &= ~MSG_MORE;
+ err = xprt_send_kvec(sock, msg, &xdr->tail[0], base);
+out:
+ if (err > 0) {
+ *sent_p += err;
+ err = 0;
+ }
+ return err;
+}
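
The new xprt_sock_sendmsg() walks the three sections of an xdr_buf (head kvec, page array, tail kvec), seeking past the @base bytes already transmitted and clearing MSG_MORE only when the final section is reached, so the stack can coalesce the earlier ones. A minimal user-space sketch of the same seek-and-send walk follows; all names are hypothetical, and send_fn() stands in for sock_sendmsg():

#include <stddef.h>
#include <sys/types.h>

struct region {
	const void *base;
	size_t len;
};

/*
 * Illustrative only: seek past @base bytes already sent, then send
 * each remaining region, accumulating into *sent. A short write
 * stops the walk so the caller can retry from the new offset.
 */
static ssize_t send_xdr_regions(const struct region *regions, int nregions,
				size_t base, size_t *sent,
				ssize_t (*send_fn)(const void *, size_t))
{
	ssize_t n;
	int i;

	*sent = 0;
	for (i = 0; i < nregions; i++) {
		if (base >= regions[i].len) {
			base -= regions[i].len;	/* wholly sent already */
			continue;
		}
		n = send_fn((const char *)regions[i].base + base,
			    regions[i].len - base);
		if (n < 0)
			return n;
		*sent += n;
		if ((size_t)n < regions[i].len - base)
			break;			/* short write */
		base = 0;
	}
	return 0;
}

Returning zero with a short *sent total mirrors how the function above reports a partial send: the caller keeps the byte count and can resume from the new offset.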
diff --git a/net/sunrpc/socklib.h b/net/sunrpc/socklib.h
new file mode 100644
index 000000000000..c48114ad6f00
--- /dev/null
+++ b/net/sunrpc/socklib.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de>
+ * Copyright (C) 2020, Oracle.
+ */
+
+#ifndef _NET_SUNRPC_SOCKLIB_H_
+#define _NET_SUNRPC_SOCKLIB_H_
+
+int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb);
+int xprt_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+ struct xdr_buf *xdr, unsigned int base,
+ rpc_fraghdr marker, unsigned int *sent_p);
+
+#endif /* _NET_SUNRPC_SOCKLIB_H_ */
diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h
index c9bacb3c930f..47a756503d11 100644
--- a/net/sunrpc/sunrpc.h
+++ b/net/sunrpc/sunrpc.h
@@ -50,10 +50,6 @@ static inline int sock_is_loopback(struct sock *sk)
return loopback;
}
-int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
- struct page *headpage, unsigned long headoffset,
- struct page *tailpage, unsigned long tailoffset);
-
int rpc_clients_notifier_register(void);
void rpc_clients_notifier_unregister(void);
#endif /* _NET_SUNRPC_SUNRPC_H */
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 187dd4e73d64..9ed3126600ce 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1529,10 +1529,6 @@ svc_process(struct svc_rqst *rqstp)
goto out_drop;
}
- /* Reserve space for the record marker */
- if (rqstp->rq_prot == IPPROTO_TCP)
- svc_putnl(resv, 0);
-
/* Returns 1 for send, 0 for drop */
if (likely(svc_process_common(rqstp, argv, resv)))
return svc_send(rqstp);
@@ -1637,6 +1633,22 @@ u32 svc_max_payload(const struct svc_rqst *rqstp)
EXPORT_SYMBOL_GPL(svc_max_payload);
/**
+ * svc_encode_read_payload - mark a range of bytes as a READ payload
+ * @rqstp: svc_rqst to operate on
+ * @offset: payload's byte offset in rqstp->rq_res
+ * @length: size of payload, in bytes
+ *
+ * Returns zero on success, or a negative errno if a permanent
+ * error occurred.
+ */
+int svc_encode_read_payload(struct svc_rqst *rqstp, unsigned int offset,
+ unsigned int length)
+{
+ return rqstp->rq_xprt->xpt_ops->xpo_read_payload(rqstp, offset, length);
+}
+EXPORT_SYMBOL_GPL(svc_encode_read_payload);
+
+/**
* svc_fill_write_vector - Construct data argument for VFS write call
* @rqstp: svc_rqst to operate on
* @pages: list of pages containing data payload
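
svc_encode_read_payload() is the generic entry point for marking a byte range of rq_res as bulk data: RDMA transports can convey that range directly via a Write chunk, while the socket transports' svc_sock_read_payload() (added later in this patch) is deliberately a no-op. A hedged sketch of a caller; the helper name and call site are hypothetical, since the NFSD-side changes are not part of this diff:

/*
 * Hypothetical caller: after XDR-encoding a READ result into
 * rqstp->rq_res, tell the transport where the file data sits.
 * @offset and @length are relative to the start of rq_res.
 */
static int mark_read_payload(struct svc_rqst *rqstp,
			     unsigned int offset, unsigned int length)
{
	return svc_encode_read_payload(rqstp, offset, length);
}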
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index de3c077733a7..e27e3532ec75 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -104,8 +104,17 @@ void svc_unreg_xprt_class(struct svc_xprt_class *xcl)
}
EXPORT_SYMBOL_GPL(svc_unreg_xprt_class);
-/*
- * Format the transport list for printing
+/**
+ * svc_print_xprts - Format the transport list for printing
+ * @buf: target buffer for the formatted transport list
+ * @maxlen: length of target buffer
+ *
+ * Fills in @buf with a string containing a list of transport names, each name
+ * terminated with '\n'. If the buffer is too small, some entries may be
+ * missing, but it is guaranteed that all lines in the output buffer are
+ * complete.
+ *
+ * Returns positive length of the filled-in string.
*/
int svc_print_xprts(char *buf, int maxlen)
{
@@ -118,9 +127,9 @@ int svc_print_xprts(char *buf, int maxlen)
list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) {
int slen;
- sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload);
- slen = strlen(tmpstr);
- if (len + slen > maxlen)
+ slen = snprintf(tmpstr, sizeof(tmpstr), "%s %d\n",
+ xcl->xcl_name, xcl->xcl_max_payload);
+ if (slen >= sizeof(tmpstr) || len + slen >= maxlen)
break;
len += slen;
strcat(buf, tmpstr);
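
The rewritten loop relies on two snprintf() guarantees: the return value is the length the fully formatted string would have had, and the output is always NUL-terminated within the buffer. Testing slen >= sizeof(tmpstr) therefore catches truncation before the entry is appended, so @buf only ever receives complete lines. The same pattern in a standalone sketch with illustrative names:

#include <stdio.h>
#include <string.h>

/* Append only complete "name value\n" entries to @buf. */
static int format_entries(char *buf, int maxlen, const char *names[],
			  const int values[], int count)
{
	char tmp[64];
	int len = 0, slen, i;

	buf[0] = '\0';
	for (i = 0; i < count; i++) {
		slen = snprintf(tmp, sizeof(tmp), "%s %d\n",
				names[i], values[i]);
		/* stop rather than emit a truncated or overlong line */
		if (slen >= (int)sizeof(tmp) || len + slen >= maxlen)
			break;
		memcpy(buf + len, tmp, slen + 1);
		len += slen;
	}
	return len;
}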
@@ -802,6 +811,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt)
len = svc_deferred_recv(rqstp);
else
len = xprt->xpt_ops->xpo_recvfrom(rqstp);
+ if (len > 0)
+ trace_svc_recvfrom(&rqstp->rq_arg);
rqstp->rq_stime = ktime_get();
rqstp->rq_reserved = serv->sv_max_mesg;
atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
@@ -905,6 +916,7 @@ int svc_send(struct svc_rqst *rqstp)
xb->len = xb->head[0].iov_len +
xb->page_len +
xb->tail[0].iov_len;
+ trace_svc_sendto(xb);
/* Grab mutex to serialize outgoing data. */
mutex_lock(&xprt->xpt_mutex);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 04aa80a2d752..6c8f802c4261 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -148,6 +148,11 @@ static struct cache_head *ip_map_alloc(void)
return NULL;
}
+static int ip_map_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall(cd, h);
+}
+
static void ip_map_request(struct cache_detail *cd,
struct cache_head *h,
char **bpp, int *blen)
@@ -467,6 +472,11 @@ static struct cache_head *unix_gid_alloc(void)
return NULL;
}
+static int unix_gid_upcall(struct cache_detail *cd, struct cache_head *h)
+{
+ return sunrpc_cache_pipe_upcall_timeout(cd, h);
+}
+
static void unix_gid_request(struct cache_detail *cd,
struct cache_head *h,
char **bpp, int *blen)
@@ -584,6 +594,7 @@ static const struct cache_detail unix_gid_cache_template = {
.hash_size = GID_HASHMAX,
.name = "auth.unix.gid",
.cache_put = unix_gid_put,
+ .cache_upcall = unix_gid_upcall,
.cache_request = unix_gid_request,
.cache_parse = unix_gid_parse,
.cache_show = unix_gid_show,
@@ -881,6 +892,7 @@ static const struct cache_detail ip_map_cache_template = {
.hash_size = IP_HASHMAX,
.name = "auth.unix.ip",
.cache_put = ip_map_put,
+ .cache_upcall = ip_map_upcall,
.cache_request = ip_map_request,
.cache_parse = ip_map_parse,
.cache_show = ip_map_show,
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 2934dd711715..519cf9c4f8fd 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -55,6 +55,7 @@
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/xprt.h>
+#include "socklib.h"
#include "sunrpc.h"
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
@@ -174,109 +175,10 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
}
}
-/*
- * send routine intended to be shared by the fore- and back-channel
- */
-int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
- struct page *headpage, unsigned long headoffset,
- struct page *tailpage, unsigned long tailoffset)
-{
- int result;
- int size;
- struct page **ppage = xdr->pages;
- size_t base = xdr->page_base;
- unsigned int pglen = xdr->page_len;
- unsigned int flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
- int slen;
- int len = 0;
-
- slen = xdr->len;
-
- /* send head */
- if (slen == xdr->head[0].iov_len)
- flags = 0;
- len = kernel_sendpage(sock, headpage, headoffset,
- xdr->head[0].iov_len, flags);
- if (len != xdr->head[0].iov_len)
- goto out;
- slen -= xdr->head[0].iov_len;
- if (slen == 0)
- goto out;
-
- /* send page data */
- size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen;
- while (pglen > 0) {
- if (slen == size)
- flags = 0;
- result = kernel_sendpage(sock, *ppage, base, size, flags);
- if (result > 0)
- len += result;
- if (result != size)
- goto out;
- slen -= size;
- pglen -= size;
- size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen;
- base = 0;
- ppage++;
- }
-
- /* send tail */
- if (xdr->tail[0].iov_len) {
- result = kernel_sendpage(sock, tailpage, tailoffset,
- xdr->tail[0].iov_len, 0);
- if (result > 0)
- len += result;
- }
-
-out:
- return len;
-}
-
-
-/*
- * Generic sendto routine
- */
-static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
+static int svc_sock_read_payload(struct svc_rqst *rqstp, unsigned int offset,
+ unsigned int length)
{
- struct svc_sock *svsk =
- container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt);
- struct socket *sock = svsk->sk_sock;
- union {
- struct cmsghdr hdr;
- long all[SVC_PKTINFO_SPACE / sizeof(long)];
- } buffer;
- struct cmsghdr *cmh = &buffer.hdr;
- int len = 0;
- unsigned long tailoff;
- unsigned long headoff;
- RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
-
- if (rqstp->rq_prot == IPPROTO_UDP) {
- struct msghdr msg = {
- .msg_name = &rqstp->rq_addr,
- .msg_namelen = rqstp->rq_addrlen,
- .msg_control = cmh,
- .msg_controllen = sizeof(buffer),
- .msg_flags = MSG_MORE,
- };
-
- svc_set_cmsg_data(rqstp, cmh);
-
- if (sock_sendmsg(sock, &msg) < 0)
- goto out;
- }
-
- tailoff = ((unsigned long)xdr->tail[0].iov_base) & (PAGE_SIZE-1);
- headoff = 0;
- len = svc_send_common(sock, xdr, rqstp->rq_respages[0], headoff,
- rqstp->rq_respages[0], tailoff);
-
-out:
- dprintk("svc: socket %p sendto([%p %zu... ], %d) = %d (addr %s)\n",
- svsk, xdr->head[0].iov_base, xdr->head[0].iov_len,
- xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf)));
-
- return len;
+ return 0;
}
/*
@@ -600,17 +502,43 @@ out_free:
return 0;
}
-static int
-svc_udp_sendto(struct svc_rqst *rqstp)
+/**
+ * svc_udp_sendto - Send out a reply on a UDP socket
+ * @rqstp: completed svc_rqst
+ *
+ * Returns the number of bytes sent, or a negative errno.
+ */
+static int svc_udp_sendto(struct svc_rqst *rqstp)
{
- int error;
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ struct xdr_buf *xdr = &rqstp->rq_res;
+ union {
+ struct cmsghdr hdr;
+ long all[SVC_PKTINFO_SPACE / sizeof(long)];
+ } buffer;
+ struct cmsghdr *cmh = &buffer.hdr;
+ struct msghdr msg = {
+ .msg_name = &rqstp->rq_addr,
+ .msg_namelen = rqstp->rq_addrlen,
+ .msg_control = cmh,
+ .msg_controllen = sizeof(buffer),
+ };
+ unsigned int uninitialized_var(sent);
+ int err;
- error = svc_sendto(rqstp, &rqstp->rq_res);
- if (error == -ECONNREFUSED)
- /* ICMP error on earlier request. */
- error = svc_sendto(rqstp, &rqstp->rq_res);
+ svc_set_cmsg_data(rqstp, cmh);
- return error;
+ err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent);
+ xdr_free_bvec(xdr);
+ if (err == -ECONNREFUSED) {
+ /* ICMP error on earlier request. */
+ err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent);
+ xdr_free_bvec(xdr);
+ }
+ if (err < 0)
+ return err;
+ return sent;
}
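
The single retry on -ECONNREFUSED is not a loop: a UDP socket can latch an ICMP error triggered by an earlier datagram and report it on the next send, so one resend consumes the stale error and then actually transmits the reply. The same behaviour in a user-space sketch:

#include <errno.h>
#include <sys/socket.h>
#include <sys/types.h>

/*
 * One retry on ECONNREFUSED: the first sendto() may merely report a
 * latched ICMP error from an earlier datagram; the second actually
 * transmits. Mirrors the logic in svc_udp_sendto() above.
 */
static ssize_t send_reply(int fd, const void *buf, size_t len,
			  const struct sockaddr *to, socklen_t tolen)
{
	ssize_t n = sendto(fd, buf, len, 0, to, tolen);

	if (n < 0 && errno == ECONNREFUSED)
		n = sendto(fd, buf, len, 0, to, tolen);
	return n;
}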
static int svc_udp_has_wspace(struct svc_xprt *xprt)
@@ -653,6 +581,7 @@ static const struct svc_xprt_ops svc_udp_ops = {
.xpo_create = svc_udp_create,
.xpo_recvfrom = svc_udp_recvfrom,
.xpo_sendto = svc_udp_sendto,
+ .xpo_read_payload = svc_sock_read_payload,
.xpo_release_rqst = svc_release_udp_skb,
.xpo_detach = svc_sock_detach,
.xpo_free = svc_sock_free,
@@ -1128,35 +1057,39 @@ err_noclose:
return 0; /* record not complete */
}
-/*
- * Send out data on TCP socket.
+/**
+ * svc_tcp_sendto - Send out a reply on a TCP socket
+ * @rqstp: completed svc_rqst
+ *
+ * Returns the number of bytes sent, or a negative errno.
*/
static int svc_tcp_sendto(struct svc_rqst *rqstp)
{
- struct xdr_buf *xbufp = &rqstp->rq_res;
- int sent;
- __be32 reclen;
+ struct svc_xprt *xprt = rqstp->rq_xprt;
+ struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ struct xdr_buf *xdr = &rqstp->rq_res;
+ rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT |
+ (u32)xdr->len);
+ struct msghdr msg = {
+ .msg_flags = 0,
+ };
+ unsigned int uninitialized_var(sent);
+ int err;
- /* Set up the first element of the reply kvec.
- * Any other kvecs that may be in use have been taken
- * care of by the server implementation itself.
- */
- reclen = htonl(0x80000000|((xbufp->len ) - 4));
- memcpy(xbufp->head[0].iov_base, &reclen, 4);
-
- sent = svc_sendto(rqstp, &rqstp->rq_res);
- if (sent != xbufp->len) {
- printk(KERN_NOTICE
- "rpc-srv/tcp: %s: %s %d when sending %d bytes "
- "- shutting down socket\n",
- rqstp->rq_xprt->xpt_server->sv_name,
- (sent<0)?"got error":"sent only",
- sent, xbufp->len);
- set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags);
- svc_xprt_enqueue(rqstp->rq_xprt);
- sent = -EAGAIN;
- }
+ err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent);
+ xdr_free_bvec(xdr);
+ if (err < 0 || sent != (xdr->len + sizeof(marker)))
+ goto out_close;
return sent;
+
+out_close:
+ pr_notice("rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n",
+ xprt->xpt_server->sv_name,
+ (err < 0) ? "got error" : "sent",
+ (err < 0) ? err : sent, xdr->len);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ svc_xprt_enqueue(xprt);
+ return -EAGAIN;
}
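
The marker follows RFC 1831 record marking: the high bit flags the last fragment of a record and the remaining 31 bits carry the fragment length, in network byte order. Since xprt_sock_sendmsg() now transmits the marker as its own kvec, svc_process() no longer reserves a placeholder word at the head of every TCP reply (see the hunk removing that svc_putnl() call earlier in this patch). Computing the marker in isolation:

#include <stdint.h>
#include <arpa/inet.h>	/* htonl() */

#define RPC_LAST_STREAM_FRAGMENT 0x80000000U

/* RFC 1831 record marker for the final fragment of a @len-byte record. */
static uint32_t rpc_record_marker(uint32_t len)
{
	return htonl(RPC_LAST_STREAM_FRAGMENT | len);
}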
static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
@@ -1171,6 +1104,7 @@ static const struct svc_xprt_ops svc_tcp_ops = {
.xpo_create = svc_tcp_create,
.xpo_recvfrom = svc_tcp_recvfrom,
.xpo_sendto = svc_tcp_sendto,
+ .xpo_read_payload = svc_sock_read_payload,
.xpo_release_rqst = svc_release_skb,
.xpo_detach = svc_tcp_sock_detach,
.xpo_free = svc_sock_free,
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index e5497dc2475b..15b58c5144f9 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1235,61 +1235,6 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
}
EXPORT_SYMBOL_GPL(xdr_encode_word);
-/**
- * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf
- * @buf: pointer to buffer containing a mic
- * @mic: on success, returns the address of the mic
- * @offset: the offset in buf where mic may be found
- *
- * This function may modify the xdr buf if the mic is found to be straddling
- * a boundary between head, pages, and tail. On success the mic can be read
- * from the address returned. There is no need to free the mic.
- *
- * Return: Success returns 0, otherwise an integer error.
- */
-int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset)
-{
- struct xdr_buf subbuf;
- unsigned int boundary;
-
- if (xdr_decode_word(buf, offset, &mic->len))
- return -EFAULT;
- offset += 4;
-
- /* Is the mic partially in the head? */
- boundary = buf->head[0].iov_len;
- if (offset < boundary && (offset + mic->len) > boundary)
- xdr_shift_buf(buf, boundary - offset);
-
- /* Is the mic partially in the pages? */
- boundary += buf->page_len;
- if (offset < boundary && (offset + mic->len) > boundary)
- xdr_shrink_pagelen(buf, boundary - offset);
-
- if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len))
- return -EFAULT;
-
- /* Is the mic contained entirely in the head? */
- mic->data = subbuf.head[0].iov_base;
- if (subbuf.head[0].iov_len == mic->len)
- return 0;
- /* ..or is the mic contained entirely in the tail? */
- mic->data = subbuf.tail[0].iov_base;
- if (subbuf.tail[0].iov_len == mic->len)
- return 0;
-
- /* Find a contiguous area in @buf to hold all of @mic */
- if (mic->len > buf->buflen - buf->len)
- return -ENOMEM;
- if (buf->tail[0].iov_len != 0)
- mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len;
- else
- mic->data = buf->head[0].iov_base + buf->head[0].iov_len;
- __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len);
- return 0;
-}
-EXPORT_SYMBOL_GPL(xdr_buf_read_mic);
-
/* Returns 0 on success, or else a negative error code. */
static int
xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 1aafe8d3f3f4..493a30a296fc 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1117,8 +1117,6 @@ void xprt_complete_rqst(struct rpc_task *task, int copied)
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
- dprintk("RPC: %5u xid %08x complete (%d bytes received)\n",
- task->tk_pid, ntohl(req->rq_xid), copied);
trace_xprt_complete_rqst(xprt, req->rq_xid, copied);
xprt->stat.recvs++;
@@ -1462,6 +1460,7 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task)
*/
req->rq_ntrans++;
+ trace_xprt_sendto(&req->rq_snd_buf);
connect_cookie = xprt->connect_cookie;
status = xprt->ops->send_request(req);
if (status != 0) {
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 1a0ae0c61353..c92c1aac270a 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -44,10 +44,10 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
size_t maxmsg;
- maxmsg = min_t(unsigned int, ep->rep_inline_send, ep->rep_inline_recv);
+ maxmsg = min_t(unsigned int, ep->re_inline_send, ep->re_inline_recv);
maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE);
return maxmsg - RPCRDMA_HDRLEN_MIN;
}
@@ -115,7 +115,7 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst)
if (rc < 0)
goto failed_marshal;
- if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+ if (rpcrdma_post_sends(r_xprt, req))
goto drop_connection;
return 0;
@@ -190,7 +190,7 @@ create_req:
if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS)
return NULL;
- size = min_t(size_t, r_xprt->rx_ep.rep_inline_recv, PAGE_SIZE);
+ size = min_t(size_t, r_xprt->rx_ep->re_inline_recv, PAGE_SIZE);
req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL);
if (!req)
return NULL;
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 125297c9aa3e..ef997880e17a 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -52,7 +52,7 @@
/**
* frwr_release_mr - Destroy one MR
- * @mr: MR allocated by frwr_init_mr
+ * @mr: MR allocated by frwr_mr_init
*
*/
void frwr_release_mr(struct rpcrdma_mr *mr)
@@ -74,7 +74,7 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr)
if (mr->mr_dir != DMA_NONE) {
trace_xprtrdma_mr_unmap(mr);
- ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device,
+ ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
mr->mr_sg, mr->mr_nents, mr->mr_dir);
mr->mr_dir = DMA_NONE;
}
@@ -106,21 +106,22 @@ void frwr_reset(struct rpcrdma_req *req)
}
/**
- * frwr_init_mr - Initialize one MR
- * @ia: interface adapter
+ * frwr_mr_init - Initialize one MR
+ * @r_xprt: controlling transport instance
* @mr: generic MR to prepare for FRWR
*
* Returns zero if successful. Otherwise a negative errno
* is returned.
*/
-int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
+int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
{
- unsigned int depth = ia->ri_max_frwr_depth;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ unsigned int depth = ep->re_max_fr_depth;
struct scatterlist *sg;
struct ib_mr *frmr;
int rc;
- frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
+ frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth);
if (IS_ERR(frmr))
goto out_mr_err;
@@ -128,6 +129,7 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
if (!sg)
goto out_list_err;
+ mr->mr_xprt = r_xprt;
mr->frwr.fr_mr = frmr;
mr->mr_dir = DMA_NONE;
INIT_LIST_HEAD(&mr->mr_list);
@@ -149,29 +151,24 @@ out_list_err:
/**
* frwr_query_device - Prepare a transport for use with FRWR
- * @r_xprt: controlling transport instance
+ * @ep: endpoint to fill in
* @device: RDMA device to query
*
* On success, sets:
- * ep->rep_attr
- * ep->rep_max_requests
- * ia->ri_max_rdma_segs
- *
- * And these FRWR-related fields:
- * ia->ri_max_frwr_depth
- * ia->ri_mrtype
+ * ep->re_attr
+ * ep->re_max_requests
+ * ep->re_max_rdma_segs
+ * ep->re_max_fr_depth
+ * ep->re_mrtype
*
* Return values:
* On success, returns zero.
* %-EINVAL - the device does not support FRWR memory registration
* %-ENOMEM - the device is not sufficiently capable for NFS/RDMA
*/
-int frwr_query_device(struct rpcrdma_xprt *r_xprt,
- const struct ib_device *device)
+int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
{
const struct ib_device_attr *attrs = &device->attrs;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
int max_qp_wr, depth, delta;
unsigned int max_sge;
@@ -188,23 +185,23 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt,
pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge);
return -ENOMEM;
}
- ep->rep_attr.cap.max_send_sge = max_sge;
- ep->rep_attr.cap.max_recv_sge = 1;
+ ep->re_attr.cap.max_send_sge = max_sge;
+ ep->re_attr.cap.max_recv_sge = 1;
- ia->ri_mrtype = IB_MR_TYPE_MEM_REG;
+ ep->re_mrtype = IB_MR_TYPE_MEM_REG;
if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
- ia->ri_mrtype = IB_MR_TYPE_SG_GAPS;
+ ep->re_mrtype = IB_MR_TYPE_SG_GAPS;
/* Quirk: Some devices advertise a large max_fast_reg_page_list_len
* capability, but perform optimally when the MRs are not larger
* than a page.
*/
if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS)
- ia->ri_max_frwr_depth = attrs->max_sge_rd;
+ ep->re_max_fr_depth = attrs->max_sge_rd;
else
- ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len;
- if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS)
- ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS;
+ ep->re_max_fr_depth = attrs->max_fast_reg_page_list_len;
+ if (ep->re_max_fr_depth > RPCRDMA_MAX_DATA_SEGS)
+ ep->re_max_fr_depth = RPCRDMA_MAX_DATA_SEGS;
/* Add room for frwr register and invalidate WRs.
* 1. FRWR reg WR for head
@@ -220,11 +217,11 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt,
/* Calculate N if the device max FRWR depth is smaller than
* RPCRDMA_MAX_DATA_SEGS.
*/
- if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) {
- delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth;
+ if (ep->re_max_fr_depth < RPCRDMA_MAX_DATA_SEGS) {
+ delta = RPCRDMA_MAX_DATA_SEGS - ep->re_max_fr_depth;
do {
depth += 2; /* FRWR reg + invalidate */
- delta -= ia->ri_max_frwr_depth;
+ delta -= ep->re_max_fr_depth;
} while (delta > 0);
}
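
The loop sizes the send queue for payloads that one MR cannot cover: each additional MR costs one FastReg WR plus one LocalInv WR. For instance, with re_max_fr_depth of 64 and RPCRDMA_MAX_DATA_SEGS of 256, delta starts at 192 and the loop runs three times, adding six WRs. The arithmetic as a standalone helper, assuming the caller has already checked max_fr_depth < max_data_segs as the guard above does:

static unsigned int frwr_extra_depth(unsigned int depth,
				     unsigned int max_fr_depth,
				     unsigned int max_data_segs)
{
	int delta = max_data_segs - max_fr_depth;

	do {
		depth += 2;		/* one FastReg WR + one LocalInv WR */
		delta -= max_fr_depth;
	} while (delta > 0);
	return depth;
}
/* frwr_extra_depth(d, 64, 256) == d + 6: three extra MRs needed */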
@@ -233,34 +230,34 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt,
max_qp_wr -= 1;
if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE)
return -ENOMEM;
- if (ep->rep_max_requests > max_qp_wr)
- ep->rep_max_requests = max_qp_wr;
- ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
- if (ep->rep_attr.cap.max_send_wr > max_qp_wr) {
- ep->rep_max_requests = max_qp_wr / depth;
- if (!ep->rep_max_requests)
+ if (ep->re_max_requests > max_qp_wr)
+ ep->re_max_requests = max_qp_wr;
+ ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
+ if (ep->re_attr.cap.max_send_wr > max_qp_wr) {
+ ep->re_max_requests = max_qp_wr / depth;
+ if (!ep->re_max_requests)
return -ENOMEM;
- ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
+ ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
}
- ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
- ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
- ep->rep_attr.cap.max_recv_wr = ep->rep_max_requests;
- ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
- ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
-
- ia->ri_max_rdma_segs =
- DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth);
+ ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
+ ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
+ ep->re_attr.cap.max_recv_wr = ep->re_max_requests;
+ ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+ ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
+
+ ep->re_max_rdma_segs =
+ DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ep->re_max_fr_depth);
/* Reply chunks require segments for head and tail buffers */
- ia->ri_max_rdma_segs += 2;
- if (ia->ri_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS)
- ia->ri_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS;
+ ep->re_max_rdma_segs += 2;
+ if (ep->re_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS)
+ ep->re_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS;
/* Ensure the underlying device is capable of conveying the
* largest r/wsize NFS will ask for. This guarantees that
* failing over from one RDMA device to another will not
* break NFS I/O.
*/
- if ((ia->ri_max_rdma_segs * ia->ri_max_frwr_depth) < RPCRDMA_MAX_SEGS)
+ if ((ep->re_max_rdma_segs * ep->re_max_fr_depth) < RPCRDMA_MAX_SEGS)
return -ENOMEM;
return 0;
@@ -286,14 +283,14 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
int nsegs, bool writing, __be32 xid,
struct rpcrdma_mr *mr)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
struct ib_reg_wr *reg_wr;
int i, n, dma_nents;
struct ib_mr *ibmr;
u8 key;
- if (nsegs > ia->ri_max_frwr_depth)
- nsegs = ia->ri_max_frwr_depth;
+ if (nsegs > ep->re_max_fr_depth)
+ nsegs = ep->re_max_fr_depth;
for (i = 0; i < nsegs;) {
if (seg->mr_page)
sg_set_page(&mr->mr_sg[i],
@@ -306,7 +303,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
++seg;
++i;
- if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS)
+ if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS)
continue;
if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
@@ -315,7 +312,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
mr->mr_dir = rpcrdma_data_dir(writing);
mr->mr_nents = i;
- dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, mr->mr_nents,
+ dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents,
mr->mr_dir);
if (!dma_nents)
goto out_dmamap_err;
@@ -356,8 +353,8 @@ out_mapmr_err:
/**
* frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed FastReg WR
*
*/
static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
@@ -369,20 +366,25 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_fastreg(wc, frwr);
/* The MR will get recycled when the associated req is retransmitted */
+
+ rpcrdma_flush_disconnect(cq, wc);
}
/**
- * frwr_send - post Send WR containing the RPC Call message
- * @ia: interface adapter
- * @req: Prepared RPC Call
+ * frwr_send - post Send WRs containing the RPC Call message
+ * @r_xprt: controlling transport instance
+ * @req: prepared RPC Call
*
* For FRWR, chain any FastReg WRs to the Send WR. Only a
* single ib_post_send call is needed to register memory
* and then post the Send WR.
*
- * Returns the result of ib_post_send.
+ * Returns the return code from ib_post_send.
+ *
+ * Caller must hold the transport send lock to ensure that the
+ * pointers to the transport's rdma_cm_id and QP are stable.
*/
-int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *post_wr;
struct rpcrdma_mr *mr;
@@ -403,7 +405,7 @@ int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
post_wr = &frwr->fr_regwr.wr;
}
- return ib_post_send(ia->ri_id->qp, post_wr, NULL);
+ return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL);
}
/**
@@ -419,7 +421,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
list_for_each_entry(mr, mrs, mr_list)
if (mr->mr_handle == rep->rr_inv_rkey) {
list_del_init(&mr->mr_list);
- trace_xprtrdma_mr_remoteinv(mr);
+ trace_xprtrdma_mr_reminv(mr);
rpcrdma_mr_put(mr);
break; /* only one invalidated MR per RPC */
}
@@ -435,8 +437,8 @@ static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
/**
* frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
*
*/
static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
@@ -449,12 +451,14 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_li(wc, frwr);
__frwr_release_mr(wc, mr);
+
+ rpcrdma_flush_disconnect(cq, wc);
}
/**
* frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
*
* Awaken anyone waiting for an MR to finish being fenced.
*/
@@ -469,6 +473,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
trace_xprtrdma_wc_li_wake(wc, frwr);
__frwr_release_mr(wc, mr);
complete(&frwr->fr_linv_done);
+
+ rpcrdma_flush_disconnect(cq, wc);
}
/**
@@ -526,10 +532,10 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* Transport disconnect drains the receive CQ before it
* replaces the QP. The RPC reply handler won't call us
- * unless ri_id->qp is a valid pointer.
+ * unless re_id->qp is a valid pointer.
*/
bad_wr = NULL;
- rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+ rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
/* The final LOCAL_INV WR in the chain is supposed to
* do the wake. If it was never posted, the wake will
@@ -556,8 +562,8 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/**
* frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
*
*/
static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
@@ -575,6 +581,8 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
/* Ensure @rep is generated before __frwr_release_mr */
smp_rmb();
rpcrdma_complete_rqst(rep);
+
+ rpcrdma_flush_disconnect(cq, wc);
}
/**
@@ -629,10 +637,10 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* Transport disconnect drains the receive CQ before it
* replaces the QP. The RPC reply handler won't call us
- * unless ri_id->qp is a valid pointer.
+ * unless re_id->qp is a valid pointer.
*/
bad_wr = NULL;
- rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+ rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
if (!rc)
return;
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 28020ec104d4..4a81e6995d3e 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -103,21 +103,20 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
/**
* rpcrdma_set_max_header_sizes - Initialize inline payload sizes
- * @r_xprt: transport instance to initialize
+ * @ep: endpoint to initialize
*
* The max_inline fields contain the maximum size of an RPC message
* so the marshaling code doesn't have to repeat this calculation
* for every RPC.
*/
-void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep)
{
- unsigned int maxsegs = r_xprt->rx_ia.ri_max_rdma_segs;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ unsigned int maxsegs = ep->re_max_rdma_segs;
- ep->rep_max_inline_send =
- ep->rep_inline_send - rpcrdma_max_call_header_size(maxsegs);
- ep->rep_max_inline_recv =
- ep->rep_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
+ ep->re_max_inline_send =
+ ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs);
+ ep->re_max_inline_recv =
+ ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
}
/* The client can send a request inline as long as the RPCRDMA header
@@ -132,9 +131,10 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
struct rpc_rqst *rqst)
{
struct xdr_buf *xdr = &rqst->rq_snd_buf;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
unsigned int count, remaining, offset;
- if (xdr->len > r_xprt->rx_ep.rep_max_inline_send)
+ if (xdr->len > ep->re_max_inline_send)
return false;
if (xdr->page_len) {
@@ -145,7 +145,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
remaining -= min_t(unsigned int,
PAGE_SIZE - offset, remaining);
offset = 0;
- if (++count > r_xprt->rx_ep.rep_attr.cap.max_send_sge)
+ if (++count > ep->re_attr.cap.max_send_sge)
return false;
}
}
@@ -162,7 +162,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
struct rpc_rqst *rqst)
{
- return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.rep_max_inline_recv;
+ return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv;
}
/* The client is required to provide a Reply chunk if the maximum
@@ -176,7 +176,7 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt,
const struct xdr_buf *buf = &rqst->rq_rcv_buf;
return (buf->head[0].iov_len + buf->tail[0].iov_len) <
- r_xprt->rx_ep.rep_max_inline_recv;
+ r_xprt->rx_ep->re_max_inline_recv;
}
/* Split @vec on page boundaries into SGEs. FMR registers pages, not
@@ -255,7 +255,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
/* When encoding a Read chunk, the tail iovec contains an
* XDR pad and may be omitted.
*/
- if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup)
+ if (type == rpcrdma_readch && r_xprt->rx_ep->re_implicit_roundup)
goto out;
/* When encoding a Write chunk, some servers need to see an
@@ -263,7 +263,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
* layer provides space in the tail iovec that may be used
* for this purpose.
*/
- if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup)
+ if (type == rpcrdma_writech && r_xprt->rx_ep->re_implicit_roundup)
goto out;
if (xdrbuf->tail[0].iov_len)
@@ -275,32 +275,6 @@ out:
return n;
}
-static inline int
-encode_item_present(struct xdr_stream *xdr)
-{
- __be32 *p;
-
- p = xdr_reserve_space(xdr, sizeof(*p));
- if (unlikely(!p))
- return -EMSGSIZE;
-
- *p = xdr_one;
- return 0;
-}
-
-static inline int
-encode_item_not_present(struct xdr_stream *xdr)
-{
- __be32 *p;
-
- p = xdr_reserve_space(xdr, sizeof(*p));
- if (unlikely(!p))
- return -EMSGSIZE;
-
- *p = xdr_zero;
- return 0;
-}
-
static void
xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr)
{
@@ -414,7 +388,7 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
} while (nsegs);
done:
- return encode_item_not_present(xdr);
+ return xdr_stream_encode_item_absent(xdr);
}
/* Register and XDR encode the Write list. Supports encoding a list
@@ -453,7 +427,7 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
if (nsegs < 0)
return nsegs;
- if (encode_item_present(xdr) < 0)
+ if (xdr_stream_encode_item_present(xdr) < 0)
return -EMSGSIZE;
segcount = xdr_reserve_space(xdr, sizeof(*segcount));
if (unlikely(!segcount))
@@ -480,7 +454,7 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
*segcount = cpu_to_be32(nchunks);
done:
- return encode_item_not_present(xdr);
+ return xdr_stream_encode_item_absent(xdr);
}
/* Register and XDR encode the Reply chunk. Supports encoding an array
@@ -507,14 +481,14 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
__be32 *segcount;
if (wtype != rpcrdma_replych)
- return encode_item_not_present(xdr);
+ return xdr_stream_encode_item_absent(xdr);
seg = req->rl_segments;
nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
if (nsegs < 0)
return nsegs;
- if (encode_item_present(xdr) < 0)
+ if (xdr_stream_encode_item_present(xdr) < 0)
return -EMSGSIZE;
segcount = xdr_reserve_space(xdr, sizeof(*segcount));
if (unlikely(!segcount))
@@ -1476,8 +1450,8 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
if (credits == 0)
credits = 1; /* don't deadlock */
- else if (credits > r_xprt->rx_ep.rep_max_requests)
- credits = r_xprt->rx_ep.rep_max_requests;
+ else if (credits > r_xprt->rx_ep->re_max_requests)
+ credits = r_xprt->rx_ep->re_max_requests;
if (buf->rb_credits != credits)
rpcrdma_update_cwnd(r_xprt, credits);
rpcrdma_post_recvs(r_xprt, false);
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index 908e78bb87c6..d510a3a15d4b 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -117,7 +117,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
{
int ret;
- ret = svc_rdma_map_reply_msg(rdma, ctxt, &rqst->rq_snd_buf, NULL);
+ ret = svc_rdma_map_reply_msg(rdma, ctxt, NULL, &rqst->rq_snd_buf);
if (ret < 0)
return -EIO;
@@ -181,7 +181,9 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
if (!ctxt)
goto drop_connection;
- p = ctxt->sc_xprt_buf;
+ p = xdr_reserve_space(&ctxt->sc_stream, RPCRDMA_HDRLEN_MIN);
+ if (!p)
+ goto put_ctxt;
*p++ = rqst->rq_xid;
*p++ = rpcrdma_version;
*p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
@@ -189,7 +191,6 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
*p++ = xdr_zero;
*p++ = xdr_zero;
*p = xdr_zero;
- svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_MIN);
#ifdef SVCRDMA_BACKCHANNEL_DEBUG
pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer);
@@ -197,12 +198,13 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
rqst->rq_xtime = ktime_get();
rc = svc_rdma_bc_sendto(rdma, rqst, ctxt);
- if (rc) {
- svc_rdma_send_ctxt_put(rdma, ctxt);
- goto drop_connection;
- }
+ if (rc)
+ goto put_ctxt;
return 0;
+put_ctxt:
+ svc_rdma_send_ctxt_put(rdma, ctxt);
+
drop_connection:
dprintk("svcrdma: failed to send bc call\n");
return -ENOTCONN;
@@ -250,6 +252,7 @@ xprt_rdma_bc_put(struct rpc_xprt *xprt)
{
dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
+ xprt_rdma_free_addresses(xprt);
xprt_free(xprt);
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 96bccd398469..54469b72b25f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -193,6 +193,7 @@ svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma)
out:
ctxt->rc_page_count = 0;
+ ctxt->rc_read_payload_length = 0;
return ctxt;
out_empty:
@@ -357,15 +358,14 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp,
arg->len = ctxt->rc_byte_len;
}
-/* This accommodates the largest possible Write chunk,
- * in one segment.
+/* This accommodates the largest possible Write chunk.
*/
-#define MAX_BYTES_WRITE_SEG ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT))
+#define MAX_BYTES_WRITE_CHUNK ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT))
/* This accommodates the largest possible Position-Zero
- * Read chunk or Reply chunk, in one segment.
+ * Read chunk or Reply chunk.
*/
-#define MAX_BYTES_SPECIAL_SEG ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT))
+#define MAX_BYTES_SPECIAL_CHUNK ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT))
/* Sanity check the Read list.
*
@@ -373,7 +373,7 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp,
* - This implementation supports only one Read chunk.
*
* Sanity checks:
- * - Read list does not overflow buffer.
+ * - Read list does not overflow Receive buffer.
* - Segment size limited by largest NFS data payload.
*
* The segment count is limited to how many segments can
@@ -381,30 +381,44 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp,
* buffer. That's about 40 Read segments for a 1KB inline
* threshold.
*
- * Returns pointer to the following Write list.
+ * Return values:
+ * %true: Read list is valid. @rctxt's xdr_stream is updated
+ * to point to the first byte past the Read list.
+ * %false: Read list is corrupt. @rctxt's xdr_stream is left
+ * in an unknown state.
*/
-static __be32 *xdr_check_read_list(__be32 *p, const __be32 *end)
+static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt)
{
- u32 position;
+ u32 position, len;
bool first;
+ __be32 *p;
+
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+ if (!p)
+ return false;
+ len = 0;
first = true;
- while (*p++ != xdr_zero) {
+ while (*p != xdr_zero) {
+ p = xdr_inline_decode(&rctxt->rc_stream,
+ rpcrdma_readseg_maxsz * sizeof(*p));
+ if (!p)
+ return false;
+
if (first) {
- position = be32_to_cpup(p++);
+ position = be32_to_cpup(p);
first = false;
- } else if (be32_to_cpup(p++) != position) {
- return NULL;
+ } else if (be32_to_cpup(p) != position) {
+ return false;
}
- p++; /* handle */
- if (be32_to_cpup(p++) > MAX_BYTES_SPECIAL_SEG)
- return NULL;
- p += 2; /* offset */
+ p += 2;
+ len += be32_to_cpup(p);
- if (p > end)
- return NULL;
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+ if (!p)
+ return false;
}
- return p;
+ return len <= MAX_BYTES_SPECIAL_CHUNK;
}
/* The segment count is limited to how many segments can
@@ -412,67 +426,100 @@ static __be32 *xdr_check_read_list(__be32 *p, const __be32 *end)
* buffer. That's about 60 Write segments for a 1KB inline
* threshold.
*/
-static __be32 *xdr_check_write_chunk(__be32 *p, const __be32 *end,
- u32 maxlen)
+static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen)
{
- u32 i, segcount;
+ u32 i, segcount, total;
+ __be32 *p;
+
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+ if (!p)
+ return false;
+ segcount = be32_to_cpup(p);
- segcount = be32_to_cpup(p++);
+ total = 0;
for (i = 0; i < segcount; i++) {
- p++; /* handle */
- if (be32_to_cpup(p++) > maxlen)
- return NULL;
- p += 2; /* offset */
+ u32 handle, length;
+ u64 offset;
- if (p > end)
- return NULL;
- }
+ p = xdr_inline_decode(&rctxt->rc_stream,
+ rpcrdma_segment_maxsz * sizeof(*p));
+ if (!p)
+ return false;
+
+ handle = be32_to_cpup(p++);
+ length = be32_to_cpup(p++);
+ xdr_decode_hyper(p, &offset);
+ trace_svcrdma_decode_wseg(handle, length, offset);
- return p;
+ total += length;
+ }
+ return total <= maxlen;
}
/* Sanity check the Write list.
*
* Implementation limits:
- * - This implementation supports only one Write chunk.
+ * - This implementation currently supports only one Write chunk.
*
* Sanity checks:
- * - Write list does not overflow buffer.
- * - Segment size limited by largest NFS data payload.
- *
- * Returns pointer to the following Reply chunk.
+ * - Write list does not overflow Receive buffer.
+ * - Chunk size limited by largest NFS data payload.
+ *
+ * Return values:
+ * %true: Write list is valid. @rctxt's xdr_stream is updated
+ * to point to the first byte past the Write list.
+ * %false: Write list is corrupt. @rctxt's xdr_stream is left
+ * in an unknown state.
*/
-static __be32 *xdr_check_write_list(__be32 *p, const __be32 *end)
+static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt)
{
- u32 chcount;
+ u32 chcount = 0;
+ __be32 *p;
- chcount = 0;
- while (*p++ != xdr_zero) {
- p = xdr_check_write_chunk(p, end, MAX_BYTES_WRITE_SEG);
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+ if (!p)
+ return false;
+ rctxt->rc_write_list = p;
+ while (*p != xdr_zero) {
+ if (!xdr_check_write_chunk(rctxt, MAX_BYTES_WRITE_CHUNK))
+ return false;
+ ++chcount;
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
if (!p)
- return NULL;
- if (chcount++ > 1)
- return NULL;
+ return false;
}
- return p;
+ if (!chcount)
+ rctxt->rc_write_list = NULL;
+ return chcount < 2;
}
/* Sanity check the Reply chunk.
*
* Sanity checks:
- * - Reply chunk does not overflow buffer.
- * - Segment size limited by largest NFS data payload.
- *
- * Returns pointer to the following RPC header.
+ * - Reply chunk does not overflow Receive buffer.
+ * - Chunk size limited by largest NFS data payload.
+ *
+ * Return values:
+ * %true: Reply chunk is valid. @rctxt's xdr_stream is updated
+ * to point to the first byte past the Reply chunk.
+ * %false: Reply chunk is corrupt. @rctxt's xdr_stream is left
+ * in an unknown state.
*/
-static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end)
+static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt)
{
- if (*p++ != xdr_zero) {
- p = xdr_check_write_chunk(p, end, MAX_BYTES_SPECIAL_SEG);
- if (!p)
- return NULL;
+ __be32 *p;
+
+ p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p));
+ if (!p)
+ return false;
+ rctxt->rc_reply_chunk = p;
+ if (*p != xdr_zero) {
+ if (!xdr_check_write_chunk(rctxt, MAX_BYTES_SPECIAL_CHUNK))
+ return false;
+ } else {
+ rctxt->rc_reply_chunk = NULL;
}
- return p;
+ return true;
}
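
All three header checks now share one discipline: every read goes through xdr_inline_decode(), which returns NULL rather than letting the cursor run past the Receive buffer, and validation bounds the summed payload length of a chunk instead of each segment individually. A compact user-space analogue of that streaming walk; all names are hypothetical:

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <arpa/inet.h>

struct cursor {
	const uint8_t *p, *end;
};

/* Analogue of xdr_inline_decode(): fail instead of overrunning. */
static const uint8_t *cursor_decode(struct cursor *c, size_t nbytes)
{
	const uint8_t *p = c->p;

	if (nbytes > (size_t)(c->end - c->p))
		return NULL;
	c->p += nbytes;
	return p;
}

static uint32_t get_u32(const uint8_t *p)
{
	uint32_t v;

	memcpy(&v, p, sizeof(v));	/* avoid unaligned access */
	return ntohl(v);
}

/* Walk a counted array of 4-word segments (handle, length, offset),
 * summing payload lengths, as xdr_check_write_chunk() now does.
 */
static int check_chunk(struct cursor *c, uint32_t maxlen)
{
	const uint8_t *p;
	uint32_t i, segcount, total = 0;

	p = cursor_decode(c, 4);
	if (!p)
		return 0;
	segcount = get_u32(p);
	for (i = 0; i < segcount; i++) {
		p = cursor_decode(c, 16);
		if (!p)
			return 0;
		total += get_u32(p + 4);	/* segment length field */
	}
	return total <= maxlen;
}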
/* RPC-over-RDMA Version One private extension: Remote Invalidation.
@@ -537,60 +584,61 @@ static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma,
ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey);
}
-/* On entry, xdr->head[0].iov_base points to first byte in the
- * RPC-over-RDMA header.
+/**
+ * svc_rdma_xdr_decode_req - Decode the transport header
+ * @rq_arg: xdr_buf containing ingress RPC/RDMA message
+ * @rctxt: state of decoding
+ *
+ * On entry, xdr->head[0].iov_base points to first byte of the
+ * RPC-over-RDMA transport header.
*
* On successful exit, head[0] points to first byte past the
* RPC-over-RDMA header. For RDMA_MSG, this is the RPC message.
+ *
* The length of the RPC-over-RDMA header is returned.
*
* Assumptions:
* - The transport header is entirely contained in the head iovec.
*/
-static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg)
+static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg,
+ struct svc_rdma_recv_ctxt *rctxt)
{
- __be32 *p, *end, *rdma_argp;
+ __be32 *p, *rdma_argp;
unsigned int hdr_len;
- /* Verify that there's enough bytes for header + something */
- if (rq_arg->len <= RPCRDMA_HDRLEN_ERR)
- goto out_short;
-
rdma_argp = rq_arg->head[0].iov_base;
- if (*(rdma_argp + 1) != rpcrdma_version)
- goto out_version;
+ xdr_init_decode(&rctxt->rc_stream, rq_arg, rdma_argp, NULL);
- switch (*(rdma_argp + 3)) {
+ p = xdr_inline_decode(&rctxt->rc_stream,
+ rpcrdma_fixed_maxsz * sizeof(*p));
+ if (unlikely(!p))
+ goto out_short;
+ p++;
+ if (*p != rpcrdma_version)
+ goto out_version;
+ p += 2;
+ switch (*p) {
case rdma_msg:
break;
case rdma_nomsg:
break;
-
case rdma_done:
goto out_drop;
-
case rdma_error:
goto out_drop;
-
default:
goto out_proc;
}
- end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len);
- p = xdr_check_read_list(rdma_argp + 4, end);
- if (!p)
- goto out_inval;
- p = xdr_check_write_list(p, end);
- if (!p)
+ if (!xdr_check_read_list(rctxt))
goto out_inval;
- p = xdr_check_reply_chunk(p, end);
- if (!p)
+ if (!xdr_check_write_list(rctxt))
goto out_inval;
- if (p > end)
+ if (!xdr_check_reply_chunk(rctxt))
goto out_inval;
- rq_arg->head[0].iov_base = p;
- hdr_len = (unsigned long)p - (unsigned long)rdma_argp;
+ rq_arg->head[0].iov_base = rctxt->rc_stream.p;
+ hdr_len = xdr_stream_pos(&rctxt->rc_stream);
rq_arg->head[0].iov_len -= hdr_len;
rq_arg->len -= hdr_len;
trace_svcrdma_decode_rqst(rdma_argp, hdr_len);
@@ -650,7 +698,6 @@ static void svc_rdma_send_error(struct svcxprt_rdma *xprt,
__be32 *rdma_argp, int status)
{
struct svc_rdma_send_ctxt *ctxt;
- unsigned int length;
__be32 *p;
int ret;
@@ -658,29 +705,46 @@ static void svc_rdma_send_error(struct svcxprt_rdma *xprt,
if (!ctxt)
return;
- p = ctxt->sc_xprt_buf;
+ p = xdr_reserve_space(&ctxt->sc_stream,
+ rpcrdma_fixed_maxsz * sizeof(*p));
+ if (!p)
+ goto put_ctxt;
+
*p++ = *rdma_argp;
*p++ = *(rdma_argp + 1);
*p++ = xprt->sc_fc_credits;
- *p++ = rdma_error;
+ *p = rdma_error;
+
switch (status) {
case -EPROTONOSUPPORT:
+ p = xdr_reserve_space(&ctxt->sc_stream, 3 * sizeof(*p));
+ if (!p)
+ goto put_ctxt;
+
*p++ = err_vers;
*p++ = rpcrdma_version;
- *p++ = rpcrdma_version;
+ *p = rpcrdma_version;
trace_svcrdma_err_vers(*rdma_argp);
break;
default:
- *p++ = err_chunk;
+ p = xdr_reserve_space(&ctxt->sc_stream, sizeof(*p));
+ if (!p)
+ goto put_ctxt;
+
+ *p = err_chunk;
trace_svcrdma_err_chunk(*rdma_argp);
}
- length = (unsigned long)p - (unsigned long)ctxt->sc_xprt_buf;
- svc_rdma_sync_reply_hdr(xprt, ctxt, length);
+ ctxt->sc_send_wr.num_sge = 1;
ctxt->sc_send_wr.opcode = IB_WR_SEND;
+ ctxt->sc_sges[0].length = ctxt->sc_hdrbuf.len;
ret = svc_rdma_send(xprt, &ctxt->sc_send_wr);
if (ret)
- svc_rdma_send_ctxt_put(xprt, ctxt);
+ goto put_ctxt;
+ return;
+
+put_ctxt:
+ svc_rdma_send_ctxt_put(xprt, ctxt);
}
/* By convention, backchannel calls arrive via rdma_msg type
@@ -785,7 +849,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
rqstp->rq_next_page = rqstp->rq_respages;
p = (__be32 *)rqstp->rq_arg.head[0].iov_base;
- ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg);
+ ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt);
if (ret < 0)
goto out_err;
if (ret == 0)
diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
index 48fe3b16b0d9..bd7c195d872e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
@@ -41,7 +41,7 @@ struct svc_rdma_rw_ctxt {
struct rdma_rw_ctx rw_ctx;
int rw_nents;
struct sg_table rw_sg_table;
- struct scatterlist rw_first_sgl[0];
+ struct scatterlist rw_first_sgl[];
};
static inline struct svc_rdma_rw_ctxt *
@@ -439,7 +439,8 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info,
if (ret < 0)
goto out_initerr;
- trace_svcrdma_encode_wseg(seg_handle, write_len, seg_offset);
+ trace_svcrdma_send_wseg(seg_handle, write_len, seg_offset);
+
list_add(&ctxt->rw_list, &cc->cc_rwctxts);
cc->cc_sqecount += ret;
if (write_len == seg_length - info->wi_seg_off) {
@@ -482,18 +483,19 @@ static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info,
vec->iov_len);
}
-/* Send an xdr_buf's page list by itself. A Write chunk is
- * just the page list. a Reply chunk is the head, page list,
- * and tail. This function is shared between the two types
- * of chunk.
+/* Send an xdr_buf's page list by itself. A Write chunk is just
+ * the page list. A Reply chunk is @xdr's head, page list, and
+ * tail. This function is shared between the two types of chunk.
*/
static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
- struct xdr_buf *xdr)
+ struct xdr_buf *xdr,
+ unsigned int offset,
+ unsigned long length)
{
info->wi_xdr = xdr;
- info->wi_next_off = 0;
+ info->wi_next_off = offset - xdr->head[0].iov_len;
return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
- xdr->page_len);
+ length);
}
/**
@@ -501,6 +503,8 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
* @rdma: controlling RDMA transport
* @wr_ch: Write chunk provided by client
* @xdr: xdr_buf containing the data payload
+ * @offset: payload's byte offset in @xdr
+ * @length: size of payload, in bytes
*
* Returns a non-negative number of bytes the chunk consumed, or
* %-E2BIG if the payload was larger than the Write chunk,
@@ -510,19 +514,20 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
* %-EIO if rdma_rw initialization failed (DMA mapping, etc).
*/
int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
- struct xdr_buf *xdr)
+ struct xdr_buf *xdr,
+ unsigned int offset, unsigned long length)
{
struct svc_rdma_write_info *info;
int ret;
- if (!xdr->page_len)
+ if (!length)
return 0;
info = svc_rdma_write_info_alloc(rdma, wr_ch);
if (!info)
return -ENOMEM;
- ret = svc_rdma_send_xdr_pagelist(info, xdr);
+ ret = svc_rdma_send_xdr_pagelist(info, xdr, offset, length);
if (ret < 0)
goto out_err;
@@ -530,8 +535,8 @@ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
if (ret < 0)
goto out_err;
- trace_svcrdma_encode_write(xdr->page_len);
- return xdr->page_len;
+ trace_svcrdma_send_write_chunk(xdr->page_len);
+ return length;
out_err:
svc_rdma_write_info_free(info);
@@ -541,8 +546,7 @@ out_err:
/**
* svc_rdma_send_reply_chunk - Write all segments in the Reply chunk
* @rdma: controlling RDMA transport
- * @rp_ch: Reply chunk provided by client
- * @writelist: true if client provided a Write list
+ * @rctxt: Write and Reply chunks from client
* @xdr: xdr_buf containing an RPC Reply
*
* Returns a non-negative number of bytes the chunk consumed, or
@@ -552,13 +556,14 @@ out_err:
* %-ENOTCONN if posting failed (connection is lost),
* %-EIO if rdma_rw initialization failed (DMA mapping, etc).
*/
-int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch,
- bool writelist, struct xdr_buf *xdr)
+int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
+ const struct svc_rdma_recv_ctxt *rctxt,
+ struct xdr_buf *xdr)
{
struct svc_rdma_write_info *info;
int consumed, ret;
- info = svc_rdma_write_info_alloc(rdma, rp_ch);
+ info = svc_rdma_write_info_alloc(rdma, rctxt->rc_reply_chunk);
if (!info)
return -ENOMEM;
@@ -570,8 +575,10 @@ int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch,
/* Send the page list in the Reply chunk only if the
* client did not provide Write chunks.
*/
- if (!writelist && xdr->page_len) {
- ret = svc_rdma_send_xdr_pagelist(info, xdr);
+ if (!rctxt->rc_write_list && xdr->page_len) {
+ ret = svc_rdma_send_xdr_pagelist(info, xdr,
+ xdr->head[0].iov_len,
+ xdr->page_len);
if (ret < 0)
goto out_err;
consumed += xdr->page_len;
@@ -588,7 +595,7 @@ int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch,
if (ret < 0)
goto out_err;
- trace_svcrdma_encode_reply(consumed);
+ trace_svcrdma_send_reply_chunk(consumed);
return consumed;
out_err:
@@ -691,7 +698,7 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp,
if (ret < 0)
break;
- trace_svcrdma_encode_rseg(rs_handle, rs_length, rs_offset);
+ trace_svcrdma_send_rseg(rs_handle, rs_length, rs_offset);
info->ri_chunklen += rs_length;
}
@@ -722,7 +729,7 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp,
if (ret < 0)
goto out;
- trace_svcrdma_encode_read(info->ri_chunklen, info->ri_position);
+ trace_svcrdma_send_read_chunk(info->ri_chunklen, info->ri_position);
head->rc_hdr_count = 0;
@@ -778,7 +785,7 @@ static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp,
if (ret < 0)
goto out;
- trace_svcrdma_encode_pzr(info->ri_chunklen);
+ trace_svcrdma_send_pzr(info->ri_chunklen);
head->rc_arg.len += info->ri_chunklen;
head->rc_arg.buflen += info->ri_chunklen;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index f3f108090aa4..90cba3058f04 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -151,6 +151,8 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma)
ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED;
ctxt->sc_cqe.done = svc_rdma_wc_send;
ctxt->sc_xprt_buf = buffer;
+ xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf,
+ rdma->sc_max_req_size);
ctxt->sc_sges[0].addr = addr;
for (i = 0; i < rdma->sc_max_send_sges; i++)
@@ -204,6 +206,10 @@ struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma)
spin_unlock(&rdma->sc_send_lock);
out:
+ rpcrdma_set_xdrlen(&ctxt->sc_hdrbuf, 0);
+ xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf,
+ ctxt->sc_xprt_buf, NULL);
+
ctxt->sc_send_wr.num_sge = 0;
ctxt->sc_cur_sge_no = 0;
ctxt->sc_page_count = 0;
@@ -295,6 +301,12 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr)
might_sleep();
+ /* Sync the transport header buffer */
+ ib_dma_sync_single_for_device(rdma->sc_pd->device,
+ wr->sg_list[0].addr,
+ wr->sg_list[0].length,
+ DMA_TO_DEVICE);
+
/* If the SQ is full, wait until an SQ entry is available */
while (1) {
if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) {
@@ -322,166 +334,173 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr)
return ret;
}
-static u32 xdr_padsize(u32 len)
+/**
+ * svc_rdma_encode_read_list - Encode RPC Reply's Read chunk list
+ * @sctxt: Send context for the RPC Reply
+ *
+ * Return values:
+ * On success, returns length in bytes of the Reply XDR buffer
+ * that was consumed by the Reply Read list
+ * %-EMSGSIZE on XDR buffer overflow
+ */
+static ssize_t svc_rdma_encode_read_list(struct svc_rdma_send_ctxt *sctxt)
{
- return (len & 3) ? (4 - (len & 3)) : 0;
+ /* RPC-over-RDMA version 1 replies never have a Read list. */
+ return xdr_stream_encode_item_absent(&sctxt->sc_stream);
}
-/* Returns length of transport header, in bytes.
+/**
+ * svc_rdma_encode_write_segment - Encode one Write segment
+ * @src: matching Write chunk in the RPC Call header
+ * @sctxt: Send context for the RPC Reply
+ * @remaining: remaining bytes of the payload left in the Write chunk
+ *
+ * Return values:
+ * On success, returns length in bytes of the Reply XDR buffer
+ * that was consumed by the Write segment
+ * %-EMSGSIZE on XDR buffer overflow
*/
-static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp)
+static ssize_t svc_rdma_encode_write_segment(__be32 *src,
+ struct svc_rdma_send_ctxt *sctxt,
+ unsigned int *remaining)
{
- unsigned int nsegs;
__be32 *p;
-
- p = rdma_resp;
-
- /* RPC-over-RDMA V1 replies never have a Read list. */
- p += rpcrdma_fixed_maxsz + 1;
-
- /* Skip Write list. */
- while (*p++ != xdr_zero) {
- nsegs = be32_to_cpup(p++);
- p += nsegs * rpcrdma_segment_maxsz;
- }
-
- /* Skip Reply chunk. */
- if (*p++ != xdr_zero) {
- nsegs = be32_to_cpup(p++);
- p += nsegs * rpcrdma_segment_maxsz;
+ const size_t len = rpcrdma_segment_maxsz * sizeof(*p);
+ u32 handle, length;
+ u64 offset;
+
+ p = xdr_reserve_space(&sctxt->sc_stream, len);
+ if (!p)
+ return -EMSGSIZE;
+
+ handle = be32_to_cpup(src++);
+ length = be32_to_cpup(src++);
+ xdr_decode_hyper(src, &offset);
+
+ *p++ = cpu_to_be32(handle);
+ if (*remaining < length) {
+ /* segment only partly filled */
+ length = *remaining;
+ *remaining = 0;
+ } else {
+ /* entire segment was consumed */
+ *remaining -= length;
}
+ *p++ = cpu_to_be32(length);
+ xdr_encode_hyper(p, offset);
- return (unsigned long)p - (unsigned long)rdma_resp;
+ trace_svcrdma_encode_wseg(handle, length, offset);
+ return len;
}
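
The reservation above is sized from rpcrdma_segment_maxsz because an RPC-over-RDMA version 1 segment is exactly four XDR words (RFC 8166): a handle, a length, and a 64-bit offset. Spelled out with illustrative names:

	enum {
		ex_segment_words = 1 + 1 + 2,	/* handle, length, offset (hyper) */
		ex_segment_bytes = ex_segment_words * sizeof(__be32),	/* 16 bytes reserved per segment */
	};
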
-/* One Write chunk is copied from Call transport header to Reply
- * transport header. Each segment's length field is updated to
- * reflect number of bytes consumed in the segment.
- *
- * Returns number of segments in this chunk.
+/**
+ * svc_rdma_encode_write_chunk - Encode one Write chunk
+ * @src: matching Write chunk in the RPC Call header
+ * @sctxt: Send context for the RPC Reply
+ * @remaining: size in bytes of the payload in the Write chunk
+ *
+ * Copy a Write chunk from the Call transport header to the
+ * Reply transport header. Update each segment's length field
+ * to reflect the number of bytes written in that segment.
+ *
+ * Return values:
+ * On success, returns length in bytes of the Reply XDR buffer
+ * that was consumed by the Write chunk
+ * %-EMSGSIZE on XDR buffer overflow
*/
-static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src,
+static ssize_t svc_rdma_encode_write_chunk(__be32 *src,
+ struct svc_rdma_send_ctxt *sctxt,
unsigned int remaining)
{
unsigned int i, nsegs;
- u32 seg_len;
+ ssize_t len, ret;
- /* Write list discriminator */
- *dst++ = *src++;
+ len = 0;
+ trace_svcrdma_encode_write_chunk(remaining);
- /* number of segments in this chunk */
- nsegs = be32_to_cpup(src);
- *dst++ = *src++;
+ src++;
+ ret = xdr_stream_encode_item_present(&sctxt->sc_stream);
+ if (ret < 0)
+ return -EMSGSIZE;
+ len += ret;
- for (i = nsegs; i; i--) {
- /* segment's RDMA handle */
- *dst++ = *src++;
-
- /* bytes returned in this segment */
- seg_len = be32_to_cpu(*src);
- if (remaining >= seg_len) {
- /* entire segment was consumed */
- *dst = *src;
- remaining -= seg_len;
- } else {
- /* segment only partly filled */
- *dst = cpu_to_be32(remaining);
- remaining = 0;
- }
- dst++; src++;
+ nsegs = be32_to_cpup(src++);
+ ret = xdr_stream_encode_u32(&sctxt->sc_stream, nsegs);
+ if (ret < 0)
+ return -EMSGSIZE;
+ len += ret;
- /* segment's RDMA offset */
- *dst++ = *src++;
- *dst++ = *src++;
+ for (i = nsegs; i; i--) {
+ ret = svc_rdma_encode_write_segment(src, sctxt, &remaining);
+ if (ret < 0)
+ return -EMSGSIZE;
+ src += rpcrdma_segment_maxsz;
+ len += ret;
}
- return nsegs;
+ return len;
}
-/* The client provided a Write list in the Call message. Fill in
- * the segments in the first Write chunk in the Reply's transport
+/**
+ * svc_rdma_encode_write_list - Encode RPC Reply's Write chunk list
+ * @rctxt: Reply context with information about the RPC Call
+ * @sctxt: Send context for the RPC Reply
+ * @length: size in bytes of the payload in the first Write chunk
+ *
+ * The client provides a Write chunk list in the Call message. Fill
+ * in the segments in the first Write chunk in the Reply's transport
* header with the number of bytes consumed in each segment.
* Remaining chunks are returned unused.
*
* Assumptions:
* - Client has provided only one Write chunk
- */
-static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch,
- unsigned int consumed)
-{
- unsigned int nsegs;
- __be32 *p, *q;
-
- /* RPC-over-RDMA V1 replies never have a Read list. */
- p = rdma_resp + rpcrdma_fixed_maxsz + 1;
-
- q = wr_ch;
- while (*q != xdr_zero) {
- nsegs = xdr_encode_write_chunk(p, q, consumed);
- q += 2 + nsegs * rpcrdma_segment_maxsz;
- p += 2 + nsegs * rpcrdma_segment_maxsz;
- consumed = 0;
- }
-
- /* Terminate Write list */
- *p++ = xdr_zero;
-
- /* Reply chunk discriminator; may be replaced later */
- *p = xdr_zero;
-}
-
-/* The client provided a Reply chunk in the Call message. Fill in
- * the segments in the Reply chunk in the Reply message with the
- * number of bytes consumed in each segment.
*
- * Assumptions:
- * - Reply can always fit in the provided Reply chunk
+ * Return values:
+ * On success, returns length in bytes of the Reply XDR buffer
+ * that was consumed by the Reply's Write list
+ * %-EMSGSIZE on XDR buffer overflow
*/
-static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch,
- unsigned int consumed)
+static ssize_t
+svc_rdma_encode_write_list(const struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_send_ctxt *sctxt,
+ unsigned int length)
{
- __be32 *p;
+ ssize_t len, ret;
- /* Find the Reply chunk in the Reply's xprt header.
- * RPC-over-RDMA V1 replies never have a Read list.
- */
- p = rdma_resp + rpcrdma_fixed_maxsz + 1;
+ ret = svc_rdma_encode_write_chunk(rctxt->rc_write_list, sctxt, length);
+ if (ret < 0)
+ return ret;
+ len = ret;
- /* Skip past Write list */
- while (*p++ != xdr_zero)
- p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz;
+ /* Terminate the Write list */
+ ret = xdr_stream_encode_item_absent(&sctxt->sc_stream);
+ if (ret < 0)
+ return ret;
- xdr_encode_write_chunk(p, rp_ch, consumed);
+ return len + ret;
}
-/* Parse the RPC Call's transport header.
+/**
+ * svc_rdma_encode_reply_chunk - Encode RPC Reply's Reply chunk
+ * @rctxt: Reply context with information about the RPC Call
+ * @sctxt: Send context for the RPC Reply
+ * @length: size in bytes of the payload in the Reply chunk
+ *
+ * Assumptions:
+ * - Reply can always fit in the client-provided Reply chunk
+ *
+ * Return values:
+ * On success, returns length in bytes of the Reply XDR buffer
+ * that was consumed by the Reply's Reply chunk
+ * %-EMSGSIZE on XDR buffer overflow
*/
-static void svc_rdma_get_write_arrays(__be32 *rdma_argp,
- __be32 **write, __be32 **reply)
+static ssize_t
+svc_rdma_encode_reply_chunk(const struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rdma_send_ctxt *sctxt,
+ unsigned int length)
{
- __be32 *p;
-
- p = rdma_argp + rpcrdma_fixed_maxsz;
-
- /* Read list */
- while (*p++ != xdr_zero)
- p += 5;
-
- /* Write list */
- if (*p != xdr_zero) {
- *write = p;
- while (*p++ != xdr_zero)
- p += 1 + be32_to_cpu(*p) * 4;
- } else {
- *write = NULL;
- p++;
- }
-
- /* Reply chunk */
- if (*p != xdr_zero)
- *reply = p;
- else
- *reply = NULL;
+ return svc_rdma_encode_write_chunk(rctxt->rc_reply_chunk, sctxt,
+ length);
}
static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma,
@@ -520,38 +539,36 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
}
/**
- * svc_rdma_sync_reply_hdr - DMA sync the transport header buffer
+ * svc_rdma_pull_up_needed - Determine whether to use pull-up
* @rdma: controlling transport
- * @ctxt: send_ctxt for the Send WR
- * @len: length of transport header
+ * @sctxt: send_ctxt for the Send WR
+ * @rctxt: Write and Reply chunks provided by client
+ * @xdr: xdr_buf containing RPC message to transmit
*
- */
-void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- unsigned int len)
-{
- ctxt->sc_sges[0].length = len;
- ctxt->sc_send_wr.num_sge++;
- ib_dma_sync_single_for_device(rdma->sc_pd->device,
- ctxt->sc_sges[0].addr, len,
- DMA_TO_DEVICE);
-}
-
-/* If the xdr_buf has more elements than the device can
- * transmit in a single RDMA Send, then the reply will
- * have to be copied into a bounce buffer.
+ * Returns:
+ * %true if pull-up must be used
+ * %false otherwise
*/
static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
- struct xdr_buf *xdr,
- __be32 *wr_lst)
+ struct svc_rdma_send_ctxt *sctxt,
+ const struct svc_rdma_recv_ctxt *rctxt,
+ struct xdr_buf *xdr)
{
int elements;
+ /* For small messages, copying bytes is cheaper than DMA mapping.
+ */
+ if (sctxt->sc_hdrbuf.len + xdr->len < RPCRDMA_PULLUP_THRESH)
+ return true;
+
+ /* Check whether the xdr_buf has more elements than can
+ * fit in a single RDMA Send.
+ */
/* xdr->head */
elements = 1;
/* xdr->pages */
- if (!wr_lst) {
+ if (!rctxt || !rctxt->rc_write_list) {
unsigned int remaining;
unsigned long pageoff;
@@ -573,29 +590,36 @@ static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
return elements >= rdma->sc_max_send_sges;
}
-/* The device is not capable of sending the reply directly.
- * Assemble the elements of @xdr into the transport header
- * buffer.
+/**
+ * svc_rdma_pull_up_reply_msg - Copy Reply into a single buffer
+ * @rdma: controlling transport
+ * @sctxt: send_ctxt for the Send WR; xprt hdr is already prepared
+ * @rctxt: Write and Reply chunks provided by client
+ * @xdr: prepared xdr_buf containing RPC message
+ *
+ * The device is not capable of sending the reply directly.
+ * Assemble the elements of @xdr into the transport header buffer.
+ *
+ * Returns zero on success, or a negative errno on failure.
*/
static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- struct xdr_buf *xdr, __be32 *wr_lst)
+ struct svc_rdma_send_ctxt *sctxt,
+ const struct svc_rdma_recv_ctxt *rctxt,
+ const struct xdr_buf *xdr)
{
unsigned char *dst, *tailbase;
unsigned int taillen;
- dst = ctxt->sc_xprt_buf;
- dst += ctxt->sc_sges[0].length;
-
+ dst = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len;
memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len);
dst += xdr->head[0].iov_len;
tailbase = xdr->tail[0].iov_base;
taillen = xdr->tail[0].iov_len;
- if (wr_lst) {
+ if (rctxt && rctxt->rc_write_list) {
u32 xdrpad;
- xdrpad = xdr_padsize(xdr->page_len);
+ xdrpad = xdr_pad_size(xdr->page_len);
if (taillen && xdrpad) {
tailbase += xdrpad;
taillen -= xdrpad;
@@ -621,29 +645,26 @@ static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
if (taillen)
memcpy(dst, tailbase, taillen);
- ctxt->sc_sges[0].length += xdr->len;
- ib_dma_sync_single_for_device(rdma->sc_pd->device,
- ctxt->sc_sges[0].addr,
- ctxt->sc_sges[0].length,
- DMA_TO_DEVICE);
-
+ sctxt->sc_sges[0].length += xdr->len;
+ trace_svcrdma_send_pullup(sctxt->sc_sges[0].length);
return 0;
}
-/* svc_rdma_map_reply_msg - Map the buffer holding RPC message
+/* svc_rdma_map_reply_msg - DMA map the buffer holding RPC message
* @rdma: controlling transport
- * @ctxt: send_ctxt for the Send WR
+ * @sctxt: send_ctxt for the Send WR
+ * @rctxt: Write and Reply chunks provided by client
* @xdr: prepared xdr_buf containing RPC message
- * @wr_lst: pointer to Call header's Write list, or NULL
*
* Load the xdr_buf into the ctxt's sge array, and DMA map each
- * element as it is added.
+ * element as it is added. The Send WR's num_sge field is set.
*
* Returns zero on success, or a negative errno on failure.
*/
int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
- struct svc_rdma_send_ctxt *ctxt,
- struct xdr_buf *xdr, __be32 *wr_lst)
+ struct svc_rdma_send_ctxt *sctxt,
+ const struct svc_rdma_recv_ctxt *rctxt,
+ struct xdr_buf *xdr)
{
unsigned int len, remaining;
unsigned long page_off;
@@ -652,11 +673,24 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
u32 xdr_pad;
int ret;
- if (svc_rdma_pull_up_needed(rdma, xdr, wr_lst))
- return svc_rdma_pull_up_reply_msg(rdma, ctxt, xdr, wr_lst);
+ /* Set up the (persistently-mapped) transport header SGE. */
+ sctxt->sc_send_wr.num_sge = 1;
+ sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len;
- ++ctxt->sc_cur_sge_no;
- ret = svc_rdma_dma_map_buf(rdma, ctxt,
+ /* If there is a Reply chunk, nothing follows the transport
+ * header, and we're done here.
+ */
+ if (rctxt && rctxt->rc_reply_chunk)
+ return 0;
+
+ /* For pull-up, svc_rdma_send() will sync the transport header.
+ * No additional DMA mapping is necessary.
+ */
+ if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr))
+ return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr);
+
+ ++sctxt->sc_cur_sge_no;
+ ret = svc_rdma_dma_map_buf(rdma, sctxt,
xdr->head[0].iov_base,
xdr->head[0].iov_len);
if (ret < 0)
@@ -667,10 +701,10 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
* have added XDR padding in the tail buffer, and that
* should not be included inline.
*/
- if (wr_lst) {
+ if (rctxt && rctxt->rc_write_list) {
base = xdr->tail[0].iov_base;
len = xdr->tail[0].iov_len;
- xdr_pad = xdr_padsize(xdr->page_len);
+ xdr_pad = xdr_pad_size(xdr->page_len);
if (len && xdr_pad) {
base += xdr_pad;
@@ -686,8 +720,8 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
while (remaining) {
len = min_t(u32, PAGE_SIZE - page_off, remaining);
- ++ctxt->sc_cur_sge_no;
- ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++,
+ ++sctxt->sc_cur_sge_no;
+ ret = svc_rdma_dma_map_page(rdma, sctxt, *ppages++,
page_off, len);
if (ret < 0)
return ret;
@@ -700,8 +734,8 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
len = xdr->tail[0].iov_len;
tail:
if (len) {
- ++ctxt->sc_cur_sge_no;
- ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len);
+ ++sctxt->sc_cur_sge_no;
+ ret = svc_rdma_dma_map_buf(rdma, sctxt, base, len);
if (ret < 0)
return ret;
}
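
The switch from the file-local xdr_padsize() to the generic xdr_pad_size() is behavior-preserving: both compute the 0-3 bytes needed to round a length up to the XDR 4-byte boundary. An equivalent one-liner, for reference:

	static inline size_t ex_xdr_pad_size(size_t n)
	{
		return -n & 3;	/* 0..3 bytes to the next XDR quad boundary */
	}
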
@@ -748,18 +782,14 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
*/
static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *sctxt,
- struct svc_rdma_recv_ctxt *rctxt,
- struct svc_rqst *rqstp,
- __be32 *wr_lst, __be32 *rp_ch)
+ const struct svc_rdma_recv_ctxt *rctxt,
+ struct svc_rqst *rqstp)
{
int ret;
- if (!rp_ch) {
- ret = svc_rdma_map_reply_msg(rdma, sctxt,
- &rqstp->rq_res, wr_lst);
- if (ret < 0)
- return ret;
- }
+ ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqstp->rq_res);
+ if (ret < 0)
+ return ret;
svc_rdma_save_io_pages(rqstp, sctxt);
@@ -769,8 +799,6 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma,
} else {
sctxt->sc_send_wr.opcode = IB_WR_SEND;
}
- dprintk("svcrdma: posting Send WR with %u sge(s)\n",
- sctxt->sc_send_wr.num_sge);
return svc_rdma_send(rdma, &sctxt->sc_send_wr);
}
@@ -785,26 +813,31 @@ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
struct svc_rdma_send_ctxt *ctxt,
struct svc_rqst *rqstp)
{
+ struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
+ __be32 *rdma_argp = rctxt->rc_recv_buf;
__be32 *p;
- int ret;
- p = ctxt->sc_xprt_buf;
- trace_svcrdma_err_chunk(*p);
- p += 3;
+ rpcrdma_set_xdrlen(&ctxt->sc_hdrbuf, 0);
+ xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf, ctxt->sc_xprt_buf,
+ NULL);
+
+ p = xdr_reserve_space(&ctxt->sc_stream, RPCRDMA_HDRLEN_ERR);
+ if (!p)
+ return -ENOMSG;
+
+ *p++ = *rdma_argp;
+ *p++ = *(rdma_argp + 1);
+ *p++ = rdma->sc_fc_credits;
*p++ = rdma_error;
*p = err_chunk;
- svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_ERR);
+ trace_svcrdma_err_chunk(*rdma_argp);
svc_rdma_save_io_pages(rqstp, ctxt);
+ ctxt->sc_send_wr.num_sge = 1;
ctxt->sc_send_wr.opcode = IB_WR_SEND;
- ret = svc_rdma_send(rdma, &ctxt->sc_send_wr);
- if (ret) {
- svc_rdma_send_ctxt_put(rdma, ctxt);
- return ret;
- }
-
- return 0;
+ ctxt->sc_sges[0].length = ctxt->sc_hdrbuf.len;
+ return svc_rdma_send(rdma, &ctxt->sc_send_wr);
}
/**
@@ -825,14 +858,14 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
struct svcxprt_rdma *rdma =
container_of(xprt, struct svcxprt_rdma, sc_xprt);
struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
- __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch;
+ __be32 *rdma_argp = rctxt->rc_recv_buf;
+ __be32 *wr_lst = rctxt->rc_write_list;
+ __be32 *rp_ch = rctxt->rc_reply_chunk;
struct xdr_buf *xdr = &rqstp->rq_res;
struct svc_rdma_send_ctxt *sctxt;
+ __be32 *p;
int ret;
- rdma_argp = rctxt->rc_recv_buf;
- svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch);
-
/* Create the RDMA response header. xprt->xpt_mutex,
* acquired in svc_send(), serializes RPC replies. The
* code path below that inserts the credit grant value
@@ -843,36 +876,52 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
sctxt = svc_rdma_send_ctxt_get(rdma);
if (!sctxt)
goto err0;
- rdma_resp = sctxt->sc_xprt_buf;
- p = rdma_resp;
+ p = xdr_reserve_space(&sctxt->sc_stream,
+ rpcrdma_fixed_maxsz * sizeof(*p));
+ if (!p)
+ goto err0;
*p++ = *rdma_argp;
*p++ = *(rdma_argp + 1);
*p++ = rdma->sc_fc_credits;
- *p++ = rp_ch ? rdma_nomsg : rdma_msg;
-
- /* Start with empty chunks */
- *p++ = xdr_zero;
- *p++ = xdr_zero;
- *p = xdr_zero;
+ *p = rp_ch ? rdma_nomsg : rdma_msg;
+ if (svc_rdma_encode_read_list(sctxt) < 0)
+ goto err0;
if (wr_lst) {
/* XXX: Presume the client sent only one Write chunk */
- ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr);
+ unsigned long offset;
+ unsigned int length;
+
+ if (rctxt->rc_read_payload_length) {
+ offset = rctxt->rc_read_payload_offset;
+ length = rctxt->rc_read_payload_length;
+ } else {
+ offset = xdr->head[0].iov_len;
+ length = xdr->page_len;
+ }
+ ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr, offset,
+ length);
if (ret < 0)
goto err2;
- svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret);
+ if (svc_rdma_encode_write_list(rctxt, sctxt, length) < 0)
+ goto err0;
+ } else {
+ if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0)
+ goto err0;
}
if (rp_ch) {
- ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr);
+ ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res);
if (ret < 0)
goto err2;
- svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret);
+ if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0)
+ goto err0;
+ } else {
+ if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0)
+ goto err0;
}
- svc_rdma_sync_reply_hdr(rdma, sctxt, svc_rdma_reply_hdr_len(rdma_resp));
- ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp,
- wr_lst, rp_ch);
+ ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp);
if (ret < 0)
goto err1;
ret = 0;
@@ -900,3 +949,30 @@ out:
ret = -ENOTCONN;
goto out;
}
+
+/**
+ * svc_rdma_read_payload - special processing for a READ payload
+ * @rqstp: svc_rqst to operate on
+ * @offset: payload's byte offset in rqstp->rq_res
+ * @length: size of payload, in bytes
+ *
+ * Returns zero on success.
+ *
+ * For the moment, just record the xdr_buf location of the READ
+ * payload. svc_rdma_sendto will use that location later when
+ * we actually send the payload.
+ */
+int svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int offset,
+ unsigned int length)
+{
+ struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
+
+ /* XXX: Just one READ payload slot for now, since our
+ * transport implementation currently supports only one
+ * Write chunk.
+ */
+ rctxt->rc_read_payload_offset = offset;
+ rctxt->rc_read_payload_length = length;
+
+ return 0;
+}
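
Upper layers are expected to report a READ payload through the new method as soon as they encode one. A hypothetical caller in a server's encode path (names illustrative, not taken from this diff) might look like:

	static int ex_report_read_payload(struct svc_rqst *rqstp,
					  unsigned int offset,
					  unsigned int length)
	{
		const struct svc_xprt_ops *ops = rqstp->rq_xprt->xpt_ops;

		return ops->xpo_read_payload(rqstp, offset, length);
	}
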
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 145a3615c319..8bb99980ae85 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -82,6 +82,7 @@ static const struct svc_xprt_ops svc_rdma_ops = {
.xpo_create = svc_rdma_create,
.xpo_recvfrom = svc_rdma_recvfrom,
.xpo_sendto = svc_rdma_sendto,
+ .xpo_read_payload = svc_rdma_read_payload,
.xpo_release_rqst = svc_rdma_release_rqst,
.xpo_detach = svc_rdma_detach,
.xpo_free = svc_rdma_free,
@@ -240,10 +241,6 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id,
static int rdma_listen_handler(struct rdma_cm_id *cma_id,
struct rdma_cm_event *event)
{
- struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.src_addr;
-
- trace_svcrdma_cm_event(event, sap);
-
switch (event->event) {
case RDMA_CM_EVENT_CONNECT_REQUEST:
dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
@@ -265,12 +262,9 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
static int rdma_cma_handler(struct rdma_cm_id *cma_id,
struct rdma_cm_event *event)
{
- struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.dst_addr;
struct svcxprt_rdma *rdma = cma_id->context;
struct svc_xprt *xprt = &rdma->sc_xprt;
- trace_svcrdma_cm_event(event, sap);
-
switch (event->event) {
case RDMA_CM_EVENT_ESTABLISHED:
/* Accept complete */
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 3cfeba68ee9a..659da37020a4 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -240,9 +240,10 @@ xprt_rdma_connect_worker(struct work_struct *work)
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
int rc;
- rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+ rc = rpcrdma_xprt_connect(r_xprt);
xprt_clear_connecting(xprt);
- if (r_xprt->rx_ep.rep_connected > 0) {
+ if (r_xprt->rx_ep && r_xprt->rx_ep->re_connect_status > 0) {
+ xprt->connect_cookie++;
xprt->stat.connect_count++;
xprt->stat.connect_time += (long)jiffies -
xprt->stat.connect_start;
@@ -265,7 +266,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
trace_xprtrdma_op_inject_dsc(r_xprt);
- rdma_disconnect(r_xprt->rx_ia.ri_id);
+ rdma_disconnect(r_xprt->rx_ep->re_id);
}
/**
@@ -284,9 +285,8 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
- rpcrdma_ep_destroy(r_xprt);
+ rpcrdma_xprt_disconnect(r_xprt);
rpcrdma_buffer_destroy(&r_xprt->rx_buf);
- rpcrdma_ia_close(&r_xprt->rx_ia);
xprt_rdma_free_addresses(xprt);
xprt_free(xprt);
@@ -316,10 +316,15 @@ xprt_setup_rdma(struct xprt_create *args)
if (args->addrlen > sizeof(xprt->addr))
return ERR_PTR(-EBADF);
+ if (!try_module_get(THIS_MODULE))
+ return ERR_PTR(-EIO);
+
xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0,
xprt_rdma_slot_table_entries);
- if (!xprt)
+ if (!xprt) {
+ module_put(THIS_MODULE);
return ERR_PTR(-ENOMEM);
+ }
xprt->timeout = &xprt_rdma_default_timeout;
xprt->connect_timeout = xprt->timeout->to_initval;
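
Taking the module reference before the first allocation means every later failure path in xprt_setup_rdma() can unwind with a single module_put(), instead of the multi-label cleanup removed below. The general shape of the pattern, sketched with a hypothetical object:

	if (!try_module_get(THIS_MODULE))
		return ERR_PTR(-EIO);

	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	if (!obj) {
		module_put(THIS_MODULE);	/* undo the pin on failure */
		return ERR_PTR(-ENOMEM);
	}
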
@@ -347,23 +352,17 @@ xprt_setup_rdma(struct xprt_create *args)
xprt_rdma_format_addresses(xprt, sap);
new_xprt = rpcx_to_rdmax(xprt);
- rc = rpcrdma_ia_open(new_xprt);
- if (rc)
- goto out1;
-
- rc = rpcrdma_ep_create(new_xprt);
- if (rc)
- goto out2;
-
rc = rpcrdma_buffer_create(new_xprt);
- if (rc)
- goto out3;
-
- if (!try_module_get(THIS_MODULE))
- goto out4;
+ if (rc) {
+ xprt_rdma_free_addresses(xprt);
+ xprt_free(xprt);
+ module_put(THIS_MODULE);
+ return ERR_PTR(rc);
+ }
INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
xprt_rdma_connect_worker);
+
xprt->max_payload = RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
dprintk("RPC: %s: %s:%s\n", __func__,
@@ -371,19 +370,6 @@ xprt_setup_rdma(struct xprt_create *args)
xprt->address_strings[RPC_DISPLAY_PORT]);
trace_xprtrdma_create(new_xprt);
return xprt;
-
-out4:
- rpcrdma_buffer_destroy(&new_xprt->rx_buf);
- rc = -ENODEV;
-out3:
- rpcrdma_ep_destroy(new_xprt);
-out2:
- rpcrdma_ia_close(&new_xprt->rx_ia);
-out1:
- trace_xprtrdma_op_destroy(new_xprt);
- xprt_rdma_free_addresses(xprt);
- xprt_free(xprt);
- return ERR_PTR(rc);
}
/**
@@ -398,26 +384,11 @@ out1:
void xprt_rdma_close(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
- might_sleep();
trace_xprtrdma_op_close(r_xprt);
- /* Prevent marshaling and sending of new requests */
- xprt_clear_connected(xprt);
-
- if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) {
- rpcrdma_ia_remove(ia);
- goto out;
- }
-
- if (ep->rep_connected == -ENODEV)
- return;
- rpcrdma_ep_disconnect(ep, ia);
+ rpcrdma_xprt_disconnect(r_xprt);
-out:
xprt->reestablish_timeout = 0;
++xprt->connect_cookie;
xprt_disconnect_done(xprt);
@@ -517,10 +488,11 @@ static void
xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
unsigned long delay;
delay = 0;
- if (r_xprt->rx_ep.rep_connected != 0) {
+ if (ep && ep->re_connect_status != 0) {
delay = xprt_reconnect_delay(xprt);
xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO);
}
@@ -694,7 +666,7 @@ xprt_rdma_send_request(struct rpc_rqst *rqst)
goto drop_connection;
rqst->rq_xtime = ktime_get();
- if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+ if (rpcrdma_post_sends(r_xprt, req))
goto drop_connection;
rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 353f61ac8d51..cdd84c09df10 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -84,6 +84,7 @@ static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep);
static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt);
+static int rpcrdma_ep_destroy(struct rpcrdma_ep *ep);
static struct rpcrdma_regbuf *
rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction,
gfp_t flags);
@@ -96,17 +97,17 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb);
*/
static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rdma_cm_id *id = r_xprt->rx_ep->re_id;
/* Flush Receives, then wait for deferred Reply work
* to complete.
*/
- ib_drain_rq(ia->ri_id->qp);
+ ib_drain_rq(id->qp);
/* Deferred Reply processing might have scheduled
* local invalidations.
*/
- ib_drain_sq(ia->ri_id->qp);
+ ib_drain_sq(id->qp);
}
/**
@@ -115,26 +116,43 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
* @context: ep that owns QP where event occurred
*
* Called from the RDMA provider (device driver) possibly in an interrupt
- * context.
+ * context. The QP is always destroyed before the ID, so the ID will be
+ * reliably available when this handler is invoked.
*/
-static void
-rpcrdma_qp_event_handler(struct ib_event *event, void *context)
+static void rpcrdma_qp_event_handler(struct ib_event *event, void *context)
{
struct rpcrdma_ep *ep = context;
- struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
- rx_ep);
- trace_xprtrdma_qp_event(r_xprt, event);
+ trace_xprtrdma_qp_event(ep, event);
+}
+
+/**
+ * rpcrdma_flush_disconnect - Disconnect on flushed completion
+ * @cq: completion queue
+ * @wc: work completion entry
+ *
+ * Must be called in process context.
+ */
+void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct rpcrdma_xprt *r_xprt = cq->cq_context;
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+ if (wc->status != IB_WC_SUCCESS &&
+ r_xprt->rx_ep->re_connect_status == 1) {
+ r_xprt->rx_ep->re_connect_status = -ECONNABORTED;
+ trace_xprtrdma_flush_dct(r_xprt, wc->status);
+ xprt_force_disconnect(xprt);
+ }
}
/**
* rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC
* @cq: completion queue
- * @wc: completed WR
+ * @wc: WCE for a completed Send WR
*
*/
-static void
-rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
+static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
struct rpcrdma_sendctx *sc =
@@ -143,25 +161,25 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_send(sc, wc);
rpcrdma_sendctx_put_locked((struct rpcrdma_xprt *)cq->cq_context, sc);
+ rpcrdma_flush_disconnect(cq, wc);
}
/**
* rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed Receive WR
*
*/
-static void
-rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
+static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
rr_cqe);
- struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
+ struct rpcrdma_xprt *r_xprt = cq->cq_context;
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_receive(wc);
- --r_xprt->rx_ep.rep_receive_count;
+ --r_xprt->rx_ep->re_receive_count;
if (wc->status != IB_WC_SUCCESS)
goto out_flushed;
@@ -178,35 +196,35 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
return;
out_flushed:
+ rpcrdma_flush_disconnect(cq, wc);
rpcrdma_rep_destroy(rep);
}
-static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt,
+static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep,
struct rdma_conn_param *param)
{
const struct rpcrdma_connect_private *pmsg = param->private_data;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
unsigned int rsize, wsize;
/* Default settings for RPC-over-RDMA Version One */
- r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
+ ep->re_implicit_roundup = xprt_rdma_pad_optimize;
rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
if (pmsg &&
pmsg->cp_magic == rpcrdma_cmp_magic &&
pmsg->cp_version == RPCRDMA_CMP_VERSION) {
- r_xprt->rx_ia.ri_implicit_roundup = true;
+ ep->re_implicit_roundup = true;
rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
}
- if (rsize < ep->rep_inline_recv)
- ep->rep_inline_recv = rsize;
- if (wsize < ep->rep_inline_send)
- ep->rep_inline_send = wsize;
+ if (rsize < ep->re_inline_recv)
+ ep->re_inline_recv = rsize;
+ if (wsize < ep->re_inline_send)
+ ep->re_inline_send = wsize;
- rpcrdma_set_max_header_sizes(r_xprt);
+ rpcrdma_set_max_header_sizes(ep);
}
/**
@@ -220,116 +238,103 @@ static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt,
static int
rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
- struct rpcrdma_xprt *r_xprt = id->context;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+ struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr;
+ struct rpcrdma_ep *ep = id->context;
+ struct rpc_xprt *xprt = ep->re_xprt;
might_sleep();
- trace_xprtrdma_cm_event(r_xprt, event);
switch (event->event) {
case RDMA_CM_EVENT_ADDR_RESOLVED:
case RDMA_CM_EVENT_ROUTE_RESOLVED:
- ia->ri_async_rc = 0;
- complete(&ia->ri_done);
+ ep->re_async_rc = 0;
+ complete(&ep->re_done);
return 0;
case RDMA_CM_EVENT_ADDR_ERROR:
- ia->ri_async_rc = -EPROTO;
- complete(&ia->ri_done);
+ ep->re_async_rc = -EPROTO;
+ complete(&ep->re_done);
return 0;
case RDMA_CM_EVENT_ROUTE_ERROR:
- ia->ri_async_rc = -ENETUNREACH;
- complete(&ia->ri_done);
+ ep->re_async_rc = -ENETUNREACH;
+ complete(&ep->re_done);
return 0;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
- pr_info("rpcrdma: removing device %s for %s:%s\n",
- ia->ri_id->device->name,
- rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt));
-#endif
- init_completion(&ia->ri_remove_done);
- set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
- ep->rep_connected = -ENODEV;
+ pr_info("rpcrdma: removing device %s for %pISpc\n",
+ ep->re_id->device->name, sap);
+ /* fall through */
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ ep->re_connect_status = -ENODEV;
xprt_force_disconnect(xprt);
- wait_for_completion(&ia->ri_remove_done);
-
- ia->ri_id = NULL;
- /* Return 1 to ensure the core destroys the id. */
- return 1;
+ goto disconnected;
case RDMA_CM_EVENT_ESTABLISHED:
- ++xprt->connect_cookie;
- ep->rep_connected = 1;
- rpcrdma_update_cm_private(r_xprt, &event->param.conn);
- trace_xprtrdma_inline_thresh(r_xprt);
- wake_up_all(&ep->rep_connect_wait);
+ kref_get(&ep->re_kref);
+ ep->re_connect_status = 1;
+ rpcrdma_update_cm_private(ep, &event->param.conn);
+ trace_xprtrdma_inline_thresh(ep);
+ wake_up_all(&ep->re_connect_wait);
break;
case RDMA_CM_EVENT_CONNECT_ERROR:
- ep->rep_connected = -ENOTCONN;
+ ep->re_connect_status = -ENOTCONN;
goto disconnected;
case RDMA_CM_EVENT_UNREACHABLE:
- ep->rep_connected = -ENETUNREACH;
+ ep->re_connect_status = -ENETUNREACH;
goto disconnected;
case RDMA_CM_EVENT_REJECTED:
- dprintk("rpcrdma: connection to %s:%s rejected: %s\n",
- rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
- rdma_reject_msg(id, event->status));
- ep->rep_connected = -ECONNREFUSED;
+ dprintk("rpcrdma: connection to %pISpc rejected: %s\n",
+ sap, rdma_reject_msg(id, event->status));
+ ep->re_connect_status = -ECONNREFUSED;
if (event->status == IB_CM_REJ_STALE_CONN)
- ep->rep_connected = -EAGAIN;
+ ep->re_connect_status = -EAGAIN;
goto disconnected;
case RDMA_CM_EVENT_DISCONNECTED:
- ep->rep_connected = -ECONNABORTED;
+ ep->re_connect_status = -ECONNABORTED;
disconnected:
- xprt_force_disconnect(xprt);
- wake_up_all(&ep->rep_connect_wait);
- break;
+ return rpcrdma_ep_destroy(ep);
default:
break;
}
- dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__,
- rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
- ia->ri_id->device->name, rdma_event_msg(event->event));
+ dprintk("RPC: %s: %pISpc on %s/frwr: %s\n", __func__, sap,
+ ep->re_id->device->name, rdma_event_msg(event->event));
return 0;
}
-static struct rdma_cm_id *
-rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia)
+static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_ep *ep)
{
unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
struct rdma_cm_id *id;
int rc;
- init_completion(&ia->ri_done);
+ init_completion(&ep->re_done);
- id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler,
- xprt, RDMA_PS_TCP, IB_QPT_RC);
+ id = rdma_create_id(xprt->xprt_net, rpcrdma_cm_event_handler, ep,
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(id))
return id;
- ia->ri_async_rc = -ETIMEDOUT;
- rc = rdma_resolve_addr(id, NULL,
- (struct sockaddr *)&xprt->rx_xprt.addr,
+ ep->re_async_rc = -ETIMEDOUT;
+ rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr,
RDMA_RESOLVE_TIMEOUT);
if (rc)
goto out;
- rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+ rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout);
if (rc < 0)
goto out;
- rc = ia->ri_async_rc;
+ rc = ep->re_async_rc;
if (rc)
goto out;
- ia->ri_async_rc = -ETIMEDOUT;
+ ep->re_async_rc = -ETIMEDOUT;
rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
if (rc)
goto out;
- rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+ rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout);
if (rc < 0)
goto out;
- rc = ia->ri_async_rc;
+ rc = ep->re_async_rc;
if (rc)
goto out;
@@ -340,356 +345,181 @@ out:
return ERR_PTR(rc);
}
-/*
- * Exported functions.
- */
-
-/**
- * rpcrdma_ia_open - Open and initialize an Interface Adapter.
- * @xprt: transport with IA to (re)initialize
- *
- * Returns 0 on success, negative errno if an appropriate
- * Interface Adapter could not be found and opened.
- */
-int
-rpcrdma_ia_open(struct rpcrdma_xprt *xprt)
+static void rpcrdma_ep_put(struct kref *kref)
{
- struct rpcrdma_ia *ia = &xprt->rx_ia;
- int rc;
+ struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref);
- ia->ri_id = rpcrdma_create_id(xprt, ia);
- if (IS_ERR(ia->ri_id)) {
- rc = PTR_ERR(ia->ri_id);
- goto out_err;
+ if (ep->re_id->qp) {
+ rdma_destroy_qp(ep->re_id);
+ ep->re_id->qp = NULL;
}
- ia->ri_pd = ib_alloc_pd(ia->ri_id->device, 0);
- if (IS_ERR(ia->ri_pd)) {
- rc = PTR_ERR(ia->ri_pd);
- pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
- goto out_err;
- }
+ if (ep->re_attr.recv_cq)
+ ib_free_cq(ep->re_attr.recv_cq);
+ ep->re_attr.recv_cq = NULL;
+ if (ep->re_attr.send_cq)
+ ib_free_cq(ep->re_attr.send_cq);
+ ep->re_attr.send_cq = NULL;
- return 0;
+ if (ep->re_pd)
+ ib_dealloc_pd(ep->re_pd);
+ ep->re_pd = NULL;
-out_err:
- rpcrdma_ia_close(ia);
- return rc;
+ kfree(ep);
+ module_put(THIS_MODULE);
}
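
Endpoint lifetime is now managed by a kref: creation holds one reference, a successful connection takes another, and each teardown path drops its own; the final put runs rpcrdma_ep_put() above. The underlying idiom, reduced to a self-contained sketch:

	#include <linux/kref.h>
	#include <linux/slab.h>

	struct ex_obj {
		struct kref ref;
	};

	static void ex_release(struct kref *kref)
	{
		struct ex_obj *obj = container_of(kref, struct ex_obj, ref);

		kfree(obj);	/* hardware resources would be released first */
	}

	/* Returns 1 when the object was freed -- the same convention
	 * rpcrdma_ep_destroy() uses to decide whether the CM ID must
	 * also be destroyed.
	 */
	static int ex_put(struct ex_obj *obj)
	{
		return kref_put(&obj->ref, ex_release);
	}
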
-/**
- * rpcrdma_ia_remove - Handle device driver unload
- * @ia: interface adapter being removed
- *
- * Divest transport H/W resources associated with this adapter,
- * but allow it to be restored later.
- *
- * Caller must hold the transport send lock.
+/* Returns:
+ * %0 if @ep still has a positive kref count, or
+ * %1 if @ep was destroyed successfully.
*/
-void
-rpcrdma_ia_remove(struct rpcrdma_ia *ia)
+static int rpcrdma_ep_destroy(struct rpcrdma_ep *ep)
{
- struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
- rx_ia);
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-
- /* This is similar to rpcrdma_ep_destroy, but:
- * - Don't cancel the connect worker.
- * - Don't call rpcrdma_ep_disconnect, which waits
- * for another conn upcall, which will deadlock.
- * - rdma_disconnect is unneeded, the underlying
- * connection is already gone.
- */
- if (ia->ri_id->qp) {
- rpcrdma_xprt_drain(r_xprt);
- rdma_destroy_qp(ia->ri_id);
- ia->ri_id->qp = NULL;
- }
- ib_free_cq(ep->rep_attr.recv_cq);
- ep->rep_attr.recv_cq = NULL;
- ib_free_cq(ep->rep_attr.send_cq);
- ep->rep_attr.send_cq = NULL;
-
- /* The ULP is responsible for ensuring all DMA
- * mappings and MRs are gone.
- */
- rpcrdma_reps_unmap(r_xprt);
- rpcrdma_reqs_reset(r_xprt);
- rpcrdma_mrs_destroy(r_xprt);
- rpcrdma_sendctxs_destroy(r_xprt);
- ib_dealloc_pd(ia->ri_pd);
- ia->ri_pd = NULL;
-
- /* Allow waiters to continue */
- complete(&ia->ri_remove_done);
-
- trace_xprtrdma_remove(r_xprt);
-}
-
-/**
- * rpcrdma_ia_close - Clean up/close an IA.
- * @ia: interface adapter to close
- *
- */
-void
-rpcrdma_ia_close(struct rpcrdma_ia *ia)
-{
- if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
- if (ia->ri_id->qp)
- rdma_destroy_qp(ia->ri_id);
- rdma_destroy_id(ia->ri_id);
- }
- ia->ri_id = NULL;
-
- /* If the pd is still busy, xprtrdma missed freeing a resource */
- if (ia->ri_pd && !IS_ERR(ia->ri_pd))
- ib_dealloc_pd(ia->ri_pd);
- ia->ri_pd = NULL;
+ return kref_put(&ep->re_kref, rpcrdma_ep_put);
}
-/**
- * rpcrdma_ep_create - Create unconnected endpoint
- * @r_xprt: transport to instantiate
- *
- * Returns zero on success, or a negative errno.
- */
-int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
+static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
- struct ib_cq *sendcq, *recvcq;
+ struct rpcrdma_connect_private *pmsg;
+ struct ib_device *device;
+ struct rdma_cm_id *id;
+ struct rpcrdma_ep *ep;
int rc;
- ep->rep_max_requests = r_xprt->rx_xprt.max_reqs;
- ep->rep_inline_send = xprt_rdma_max_inline_write;
- ep->rep_inline_recv = xprt_rdma_max_inline_read;
+ ep = kzalloc(sizeof(*ep), GFP_NOFS);
+ if (!ep)
+ return -EAGAIN;
+ ep->re_xprt = &r_xprt->rx_xprt;
+ kref_init(&ep->re_kref);
- rc = frwr_query_device(r_xprt, ia->ri_id->device);
+ id = rpcrdma_create_id(r_xprt, ep);
+ if (IS_ERR(id)) {
+ rc = PTR_ERR(id);
+ goto out_free;
+ }
+ __module_get(THIS_MODULE);
+ device = id->device;
+ ep->re_id = id;
+
+ ep->re_max_requests = r_xprt->rx_xprt.max_reqs;
+ ep->re_inline_send = xprt_rdma_max_inline_write;
+ ep->re_inline_recv = xprt_rdma_max_inline_read;
+ rc = frwr_query_device(ep, device);
if (rc)
- return rc;
- r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->rep_max_requests);
+ goto out_destroy;
+
+ r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->re_max_requests);
- ep->rep_attr.event_handler = rpcrdma_qp_event_handler;
- ep->rep_attr.qp_context = ep;
- ep->rep_attr.srq = NULL;
- ep->rep_attr.cap.max_inline_data = 0;
- ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
- ep->rep_attr.qp_type = IB_QPT_RC;
- ep->rep_attr.port_num = ~0;
+ ep->re_attr.event_handler = rpcrdma_qp_event_handler;
+ ep->re_attr.qp_context = ep;
+ ep->re_attr.srq = NULL;
+ ep->re_attr.cap.max_inline_data = 0;
+ ep->re_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ ep->re_attr.qp_type = IB_QPT_RC;
+ ep->re_attr.port_num = ~0;
dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
"iovs: send %d recv %d\n",
__func__,
- ep->rep_attr.cap.max_send_wr,
- ep->rep_attr.cap.max_recv_wr,
- ep->rep_attr.cap.max_send_sge,
- ep->rep_attr.cap.max_recv_sge);
-
- ep->rep_send_batch = ep->rep_max_requests >> 3;
- ep->rep_send_count = ep->rep_send_batch;
- init_waitqueue_head(&ep->rep_connect_wait);
- ep->rep_receive_count = 0;
-
- sendcq = ib_alloc_cq_any(ia->ri_id->device, r_xprt,
- ep->rep_attr.cap.max_send_wr + 1,
- IB_POLL_WORKQUEUE);
- if (IS_ERR(sendcq)) {
- rc = PTR_ERR(sendcq);
- goto out1;
+ ep->re_attr.cap.max_send_wr,
+ ep->re_attr.cap.max_recv_wr,
+ ep->re_attr.cap.max_send_sge,
+ ep->re_attr.cap.max_recv_sge);
+
+ ep->re_send_batch = ep->re_max_requests >> 3;
+ ep->re_send_count = ep->re_send_batch;
+ init_waitqueue_head(&ep->re_connect_wait);
+
+ ep->re_attr.send_cq = ib_alloc_cq_any(device, r_xprt,
+ ep->re_attr.cap.max_send_wr,
+ IB_POLL_WORKQUEUE);
+ if (IS_ERR(ep->re_attr.send_cq)) {
+ rc = PTR_ERR(ep->re_attr.send_cq);
+ goto out_destroy;
}
- recvcq = ib_alloc_cq_any(ia->ri_id->device, NULL,
- ep->rep_attr.cap.max_recv_wr + 1,
- IB_POLL_WORKQUEUE);
- if (IS_ERR(recvcq)) {
- rc = PTR_ERR(recvcq);
- goto out2;
+ ep->re_attr.recv_cq = ib_alloc_cq_any(device, r_xprt,
+ ep->re_attr.cap.max_recv_wr,
+ IB_POLL_WORKQUEUE);
+ if (IS_ERR(ep->re_attr.recv_cq)) {
+ rc = PTR_ERR(ep->re_attr.recv_cq);
+ goto out_destroy;
}
-
- ep->rep_attr.send_cq = sendcq;
- ep->rep_attr.recv_cq = recvcq;
+ ep->re_receive_count = 0;
/* Initialize cma parameters */
- memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
+ memset(&ep->re_remote_cma, 0, sizeof(ep->re_remote_cma));
/* Prepare RDMA-CM private message */
+ pmsg = &ep->re_cm_private;
pmsg->cp_magic = rpcrdma_cmp_magic;
pmsg->cp_version = RPCRDMA_CMP_VERSION;
pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK;
- pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->rep_inline_send);
- pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->rep_inline_recv);
- ep->rep_remote_cma.private_data = pmsg;
- ep->rep_remote_cma.private_data_len = sizeof(*pmsg);
+ pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->re_inline_send);
+ pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->re_inline_recv);
+ ep->re_remote_cma.private_data = pmsg;
+ ep->re_remote_cma.private_data_len = sizeof(*pmsg);
/* Client offers RDMA Read but does not initiate */
- ep->rep_remote_cma.initiator_depth = 0;
- ep->rep_remote_cma.responder_resources =
- min_t(int, U8_MAX, ia->ri_id->device->attrs.max_qp_rd_atom);
+ ep->re_remote_cma.initiator_depth = 0;
+ ep->re_remote_cma.responder_resources =
+ min_t(int, U8_MAX, device->attrs.max_qp_rd_atom);
/* Limit transport retries so client can detect server
* GID changes quickly. RPC layer handles re-establishing
* transport connection and retransmission.
*/
- ep->rep_remote_cma.retry_count = 6;
+ ep->re_remote_cma.retry_count = 6;
/* RPC-over-RDMA handles its own flow control. In addition,
* make all RNR NAKs visible so we know that RPC-over-RDMA
* flow control is working correctly (no NAKs should be seen).
*/
- ep->rep_remote_cma.flow_control = 0;
- ep->rep_remote_cma.rnr_retry_count = 0;
+ ep->re_remote_cma.flow_control = 0;
+ ep->re_remote_cma.rnr_retry_count = 0;
- return 0;
-
-out2:
- ib_free_cq(sendcq);
-out1:
- return rc;
-}
-
-/**
- * rpcrdma_ep_destroy - Disconnect and destroy endpoint.
- * @r_xprt: transport instance to shut down
- *
- */
-void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
- if (ia->ri_id && ia->ri_id->qp) {
- rpcrdma_ep_disconnect(ep, ia);
- rdma_destroy_qp(ia->ri_id);
- ia->ri_id->qp = NULL;
- }
-
- if (ep->rep_attr.recv_cq)
- ib_free_cq(ep->rep_attr.recv_cq);
- if (ep->rep_attr.send_cq)
- ib_free_cq(ep->rep_attr.send_cq);
-}
-
-/* Re-establish a connection after a device removal event.
- * Unlike a normal reconnection, a fresh PD and a new set
- * of MRs and buffers is needed.
- */
-static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
- struct ib_qp_init_attr *qp_init_attr)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- int rc, err;
-
- trace_xprtrdma_reinsert(r_xprt);
-
- rc = -EHOSTUNREACH;
- if (rpcrdma_ia_open(r_xprt))
- goto out1;
-
- rc = -ENOMEM;
- err = rpcrdma_ep_create(r_xprt);
- if (err) {
- pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err);
- goto out2;
- }
- memcpy(qp_init_attr, &ep->rep_attr, sizeof(*qp_init_attr));
-
- rc = -ENETUNREACH;
- err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr);
- if (err) {
- pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
- goto out3;
- }
- return 0;
-
-out3:
- rpcrdma_ep_destroy(r_xprt);
-out2:
- rpcrdma_ia_close(ia);
-out1:
- return rc;
-}
-
-static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt,
- struct ib_qp_init_attr *qp_init_attr)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rdma_cm_id *id, *old;
- int err, rc;
-
- rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia);
-
- rc = -EHOSTUNREACH;
- id = rpcrdma_create_id(r_xprt, ia);
- if (IS_ERR(id))
- goto out;
-
- /* As long as the new ID points to the same device as the
- * old ID, we can reuse the transport's existing PD and all
- * previously allocated MRs. Also, the same device means
- * the transport's previous DMA mappings are still valid.
- *
- * This is a sanity check only. There should be no way these
- * point to two different devices here.
- */
- old = id;
- rc = -ENETUNREACH;
- if (ia->ri_id->device != id->device) {
- pr_err("rpcrdma: can't reconnect on different device!\n");
+ ep->re_pd = ib_alloc_pd(device, 0);
+ if (IS_ERR(ep->re_pd)) {
+ rc = PTR_ERR(ep->re_pd);
goto out_destroy;
}
- err = rdma_create_qp(id, ia->ri_pd, qp_init_attr);
- if (err)
+ rc = rdma_create_qp(id, ep->re_pd, &ep->re_attr);
+ if (rc)
goto out_destroy;
- /* Atomically replace the transport's ID and QP. */
- rc = 0;
- old = ia->ri_id;
- ia->ri_id = id;
- rdma_destroy_qp(old);
+ r_xprt->rx_ep = ep;
+ return 0;
out_destroy:
- rdma_destroy_id(old);
-out:
+ rpcrdma_ep_destroy(ep);
+ rdma_destroy_id(id);
+out_free:
+ kfree(ep);
+ r_xprt->rx_ep = NULL;
return rc;
}
-/*
- * Connect unconnected endpoint.
+/**
+ * rpcrdma_xprt_connect - Connect an unconnected transport
+ * @r_xprt: controlling transport instance
+ *
+ * Returns 0 on success or a negative errno.
*/
-int
-rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
- rx_ia);
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
- struct ib_qp_init_attr qp_init_attr;
+ struct rpcrdma_ep *ep;
int rc;
retry:
- memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr));
- switch (ep->rep_connected) {
- case 0:
- rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr);
- if (rc) {
- rc = -ENETUNREACH;
- goto out_noupdate;
- }
- break;
- case -ENODEV:
- rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr);
- if (rc)
- goto out_noupdate;
- break;
- default:
- rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr);
- if (rc)
- goto out;
- }
+ rpcrdma_xprt_disconnect(r_xprt);
+ rc = rpcrdma_ep_create(r_xprt);
+ if (rc)
+ return rc;
+ ep = r_xprt->rx_ep;
- ep->rep_connected = 0;
+ ep->re_connect_status = 0;
xprt_clear_connected(xprt);
rpcrdma_reset_cwnd(r_xprt);
@@ -699,64 +529,68 @@ retry:
if (rc)
goto out;
- rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
+ rc = rdma_connect(ep->re_id, &ep->re_remote_cma);
if (rc)
goto out;
if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
- wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
- if (ep->rep_connected <= 0) {
- if (ep->rep_connected == -EAGAIN)
+ wait_event_interruptible(ep->re_connect_wait,
+ ep->re_connect_status != 0);
+ if (ep->re_connect_status <= 0) {
+ if (ep->re_connect_status == -EAGAIN)
goto retry;
- rc = ep->rep_connected;
+ rc = ep->re_connect_status;
goto out;
}
rc = rpcrdma_reqs_setup(r_xprt);
if (rc) {
- rpcrdma_ep_disconnect(ep, ia);
+ rpcrdma_xprt_disconnect(r_xprt);
goto out;
}
rpcrdma_mrs_create(r_xprt);
out:
if (rc)
- ep->rep_connected = rc;
-
-out_noupdate:
+ ep->re_connect_status = rc;
trace_xprtrdma_connect(r_xprt, rc);
return rc;
}
/**
- * rpcrdma_ep_disconnect - Disconnect underlying transport
- * @ep: endpoint to disconnect
- * @ia: associated interface adapter
+ * rpcrdma_xprt_disconnect - Disconnect underlying transport
+ * @r_xprt: controlling transport instance
*
* Caller serializes. Either the transport send lock is held,
* or we're being called to destroy the transport.
+ *
+ * On return, @r_xprt is completely divested of all hardware
+ * resources and prepared for the next ->connect operation.
*/
-void
-rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
- rx_ep);
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ struct rdma_cm_id *id;
int rc;
- /* returns without wait if ID is not connected */
- rc = rdma_disconnect(ia->ri_id);
- if (!rc)
- wait_event_interruptible(ep->rep_connect_wait,
- ep->rep_connected != 1);
- else
- ep->rep_connected = rc;
+ if (!ep)
+ return;
+
+ id = ep->re_id;
+ rc = rdma_disconnect(id);
trace_xprtrdma_disconnect(r_xprt, rc);
rpcrdma_xprt_drain(r_xprt);
+ rpcrdma_reps_unmap(r_xprt);
rpcrdma_reqs_reset(r_xprt);
rpcrdma_mrs_destroy(r_xprt);
rpcrdma_sendctxs_destroy(r_xprt);
+
+ if (rpcrdma_ep_destroy(ep))
+ rdma_destroy_id(id);
+
+ r_xprt->rx_ep = NULL;
}
/* Fixed-size circular FIFO queue. This implementation is wait-free and
@@ -793,7 +627,7 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep)
{
struct rpcrdma_sendctx *sc;
- sc = kzalloc(struct_size(sc, sc_sges, ep->rep_attr.cap.max_send_sge),
+ sc = kzalloc(struct_size(sc, sc_sges, ep->re_attr.cap.max_send_sge),
GFP_KERNEL);
if (!sc)
return NULL;
@@ -813,14 +647,14 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
* the ->send_request call to fail temporarily before too many
* Sends are posted.
*/
- i = r_xprt->rx_ep.rep_max_requests + RPCRDMA_MAX_BC_REQUESTS;
+ i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS;
buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL);
if (!buf->rb_sc_ctxs)
return -ENOMEM;
buf->rb_sc_last = i - 1;
for (i = 0; i <= buf->rb_sc_last; i++) {
- sc = rpcrdma_sendctx_create(&r_xprt->rx_ep);
+ sc = rpcrdma_sendctx_create(r_xprt->rx_ep);
if (!sc)
return -ENOMEM;
@@ -924,10 +758,10 @@ static void
rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
unsigned int count;
- for (count = 0; count < ia->ri_max_rdma_segs; count++) {
+ for (count = 0; count < ep->re_max_rdma_segs; count++) {
struct rpcrdma_mr *mr;
int rc;
@@ -935,14 +769,12 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
if (!mr)
break;
- rc = frwr_init_mr(ia, mr);
+ rc = frwr_mr_init(r_xprt, mr);
if (rc) {
kfree(mr);
break;
}
- mr->mr_xprt = r_xprt;
-
spin_lock(&buf->rb_lock);
rpcrdma_mr_push(mr, &buf->rb_mrs);
list_add(&mr->mr_all, &buf->rb_all_mrs);
@@ -973,12 +805,12 @@ rpcrdma_mr_refresh_worker(struct work_struct *work)
void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
- /* If there is no underlying device, it's no use to
- * wake the refresh worker.
+ /* If there is no underlying connection, it's no use
+ * to wake the refresh worker.
*/
- if (ep->rep_connected != -ENODEV) {
+ if (ep->re_connect_status == 1) {
/* The work is scheduled on a WQ_MEM_RECLAIM
* workqueue in order to prevent MR allocation
* from recursing into NFS during direct reclaim.
@@ -1042,7 +874,7 @@ int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* Compute maximum header buffer size in bytes */
maxhdrsize = rpcrdma_fixed_maxsz + 3 +
- r_xprt->rx_ia.ri_max_rdma_segs * rpcrdma_readchunk_maxsz;
+ r_xprt->rx_ep->re_max_rdma_segs * rpcrdma_readchunk_maxsz;
maxhdrsize *= sizeof(__be32);
rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize),
DMA_TO_DEVICE, GFP_KERNEL);
@@ -1120,7 +952,7 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
if (rep == NULL)
goto out;
- rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.rep_inline_recv,
+ rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv,
DMA_FROM_DEVICE, GFP_KERNEL);
if (!rep->rr_rdmabuf)
goto out_free;
@@ -1345,7 +1177,7 @@ void rpcrdma_mr_put(struct rpcrdma_mr *mr)
if (mr->mr_dir != DMA_NONE) {
trace_xprtrdma_mr_unmap(mr);
- ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device,
+ ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
mr->mr_sg, mr->mr_nents, mr->mr_dir);
mr->mr_dir = DMA_NONE;
}
@@ -1463,7 +1295,7 @@ bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags)
bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_regbuf *rb)
{
- struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+ struct ib_device *device = r_xprt->rx_ep->re_id->device;
if (rb->rg_direction == DMA_NONE)
return false;
@@ -1476,7 +1308,7 @@ bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt,
}
rb->rg_device = device;
- rb->rg_iov.lkey = r_xprt->rx_ia.ri_pd->local_dma_lkey;
+ rb->rg_iov.lkey = r_xprt->rx_ep->re_pd->local_dma_lkey;
return true;
}
@@ -1502,31 +1334,28 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb)
}
/**
- * rpcrdma_ep_post - Post WRs to a transport's Send Queue
- * @ia: transport's device information
- * @ep: transport's RDMA endpoint information
+ * rpcrdma_post_sends - Post WRs to a transport's Send Queue
+ * @r_xprt: controlling transport instance
* @req: rpcrdma_req containing the Send WR to post
*
* Returns 0 if the post was successful, otherwise -ENOTCONN
* is returned.
*/
-int
-rpcrdma_ep_post(struct rpcrdma_ia *ia,
- struct rpcrdma_ep *ep,
- struct rpcrdma_req *req)
+int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *send_wr = &req->rl_wr;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
int rc;
- if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) {
+ if (!ep->re_send_count || kref_read(&req->rl_kref) > 1) {
send_wr->send_flags |= IB_SEND_SIGNALED;
- ep->rep_send_count = ep->rep_send_batch;
+ ep->re_send_count = ep->re_send_batch;
} else {
send_wr->send_flags &= ~IB_SEND_SIGNALED;
- --ep->rep_send_count;
+ --ep->re_send_count;
}
- rc = frwr_send(ia, req);
+ rc = frwr_send(r_xprt, req);
trace_xprtrdma_post_send(req, rc);
if (rc)
return -ENOTCONN;
@@ -1542,7 +1371,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
struct ib_recv_wr *wr, *bad_wr;
struct rpcrdma_rep *rep;
int needed, count, rc;
@@ -1551,9 +1380,9 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
count = 0;
needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
- if (likely(ep->rep_receive_count > needed))
+ if (likely(ep->re_receive_count > needed))
goto out;
- needed -= ep->rep_receive_count;
+ needed -= ep->re_receive_count;
if (!temp)
needed += RPCRDMA_MAX_RECV_BATCH;
@@ -1579,7 +1408,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
if (!wr)
goto out;
- rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr,
+ rc = ib_post_recv(ep->re_id->qp, wr,
(const struct ib_recv_wr **)&bad_wr);
out:
trace_xprtrdma_post_recvs(r_xprt, count, rc);
@@ -1593,6 +1422,6 @@ out:
--count;
}
}
- ep->rep_receive_count += count;
+ ep->re_receive_count += count;
return;
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 37d5080c250b..0a16fdb09b2c 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -65,43 +65,33 @@
#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
/*
- * Interface Adapter -- one per transport instance
+ * RDMA Endpoint -- connection endpoint details
*/
-struct rpcrdma_ia {
- struct rdma_cm_id *ri_id;
- struct ib_pd *ri_pd;
- int ri_async_rc;
- unsigned int ri_max_rdma_segs;
- unsigned int ri_max_frwr_depth;
- bool ri_implicit_roundup;
- enum ib_mr_type ri_mrtype;
- unsigned long ri_flags;
- struct completion ri_done;
- struct completion ri_remove_done;
-};
-
-enum {
- RPCRDMA_IAF_REMOVING = 0,
-};
-
-/*
- * RDMA Endpoint -- one per transport instance
- */
-
struct rpcrdma_ep {
- unsigned int rep_send_count;
- unsigned int rep_send_batch;
- unsigned int rep_max_inline_send;
- unsigned int rep_max_inline_recv;
- int rep_connected;
- struct ib_qp_init_attr rep_attr;
- wait_queue_head_t rep_connect_wait;
- struct rpcrdma_connect_private rep_cm_private;
- struct rdma_conn_param rep_remote_cma;
- unsigned int rep_max_requests; /* depends on device */
- unsigned int rep_inline_send; /* negotiated */
- unsigned int rep_inline_recv; /* negotiated */
- int rep_receive_count;
+ struct kref re_kref;
+ struct rdma_cm_id *re_id;
+ struct ib_pd *re_pd;
+ unsigned int re_max_rdma_segs;
+ unsigned int re_max_fr_depth;
+ bool re_implicit_roundup;
+ enum ib_mr_type re_mrtype;
+ struct completion re_done;
+ unsigned int re_send_count;
+ unsigned int re_send_batch;
+ unsigned int re_max_inline_send;
+ unsigned int re_max_inline_recv;
+ int re_async_rc;
+ int re_connect_status;
+ struct ib_qp_init_attr re_attr;
+ wait_queue_head_t re_connect_wait;
+ struct rpc_xprt *re_xprt;
+ struct rpcrdma_connect_private
+ re_cm_private;
+ struct rdma_conn_param re_remote_cma;
+ int re_receive_count;
+ unsigned int re_max_requests; /* depends on device */
+ unsigned int re_inline_send; /* negotiated */
+ unsigned int re_inline_recv; /* negotiated */
};
/* Pre-allocate extra Work Requests for handling backward receives
@@ -422,8 +412,7 @@ struct rpcrdma_stats {
*/
struct rpcrdma_xprt {
struct rpc_xprt rx_xprt;
- struct rpcrdma_ia rx_ia;
- struct rpcrdma_ep rx_ep;
+ struct rpcrdma_ep *rx_ep;
struct rpcrdma_buffer rx_buf;
struct delayed_work rx_connect_worker;
struct rpc_timeout rx_timeout;
@@ -455,22 +444,13 @@ extern int xprt_rdma_pad_optimize;
extern unsigned int xprt_rdma_memreg_strategy;
/*
- * Interface Adapter calls - xprtrdma/verbs.c
- */
-int rpcrdma_ia_open(struct rpcrdma_xprt *xprt);
-void rpcrdma_ia_remove(struct rpcrdma_ia *ia);
-void rpcrdma_ia_close(struct rpcrdma_ia *);
-
-/*
* Endpoint calls - xprtrdma/verbs.c
*/
-int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt);
-void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt);
-int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
-void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc);
+int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt);
+void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt);
-int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
- struct rpcrdma_req *);
+int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
/*
@@ -536,15 +516,14 @@ rpcrdma_data_dir(bool writing)
/* Memory registration calls xprtrdma/frwr_ops.c
*/
void frwr_reset(struct rpcrdma_req *req);
-int frwr_query_device(struct rpcrdma_xprt *r_xprt,
- const struct ib_device *device);
-int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr);
+int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device);
+int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr);
void frwr_release_mr(struct rpcrdma_mr *mr);
struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_mr_seg *seg,
int nsegs, bool writing, __be32 xid,
struct rpcrdma_mr *mr);
-int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req);
+int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
@@ -569,7 +548,7 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
enum rpcrdma_chunktype rtype);
void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc);
int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
-void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep);
void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt);
void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
void rpcrdma_reply_handler(struct rpcrdma_rep *rep);
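
Editor's note: taken together, these header changes fold struct rpcrdma_ia into struct rpcrdma_ep. The CM id, PD, device attributes, and connection state now live in one object, the transport holds a pointer to it rather than an embedded copy, and the paired ia/ep entry points collapse into rpcrdma_xprt_connect()/rpcrdma_xprt_disconnect(). The new re_kref exists because the endpoint can outlive a single connect attempt (an asynchronous CM handler may still hold it). A generic sketch of that ownership pattern, with hypothetical names:

/* Sketch: an endpoint whose lifetime is shared between a transport
 * and an asynchronous event handler. Hypothetical names throughout.
 */
#include <linux/kref.h>
#include <linux/slab.h>

struct my_ep {
	struct kref ref;
	/* ... ids, PD, negotiated parameters ... */
};

static struct my_ep *my_ep_alloc(void)
{
	struct my_ep *ep = kzalloc(sizeof(*ep), GFP_KERNEL);

	if (ep)
		kref_init(&ep->ref);	/* first reference: the creator */
	return ep;
}

static void my_ep_release(struct kref *ref)
{
	kfree(container_of(ref, struct my_ep, ref));
}

/* each holder (transport, CM handler) drops its own reference */
static void my_ep_put(struct my_ep *ep)
{
	kref_put(&ep->ref, my_ep_release);
}
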
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index d86c664ea6af..0bda8a73e8a8 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -54,6 +54,7 @@
#include <trace/events/sunrpc.h>
+#include "socklib.h"
#include "sunrpc.h"
static void xs_close(struct rpc_xprt *xprt);
@@ -749,125 +750,6 @@ xs_stream_start_connect(struct sock_xprt *transport)
#define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
-static int xs_sendmsg(struct socket *sock, struct msghdr *msg, size_t seek)
-{
- if (seek)
- iov_iter_advance(&msg->msg_iter, seek);
- return sock_sendmsg(sock, msg);
-}
-
-static int xs_send_kvec(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t seek)
-{
- iov_iter_kvec(&msg->msg_iter, WRITE, vec, 1, vec->iov_len);
- return xs_sendmsg(sock, msg, seek);
-}
-
-static int xs_send_pagedata(struct socket *sock, struct msghdr *msg, struct xdr_buf *xdr, size_t base)
-{
- int err;
-
- err = xdr_alloc_bvec(xdr, GFP_KERNEL);
- if (err < 0)
- return err;
-
- iov_iter_bvec(&msg->msg_iter, WRITE, xdr->bvec,
- xdr_buf_pagecount(xdr),
- xdr->page_len + xdr->page_base);
- return xs_sendmsg(sock, msg, base + xdr->page_base);
-}
-
-#define xs_record_marker_len() sizeof(rpc_fraghdr)
-
-/* Common case:
- * - stream transport
- * - sending from byte 0 of the message
- * - the message is wholly contained in @xdr's head iovec
- */
-static int xs_send_rm_and_kvec(struct socket *sock, struct msghdr *msg,
- rpc_fraghdr marker, struct kvec *vec, size_t base)
-{
- struct kvec iov[2] = {
- [0] = {
- .iov_base = &marker,
- .iov_len = sizeof(marker)
- },
- [1] = *vec,
- };
- size_t len = iov[0].iov_len + iov[1].iov_len;
-
- iov_iter_kvec(&msg->msg_iter, WRITE, iov, 2, len);
- return xs_sendmsg(sock, msg, base);
-}
-
-/**
- * xs_sendpages - write pages directly to a socket
- * @sock: socket to send on
- * @addr: UDP only -- address of destination
- * @addrlen: UDP only -- length of destination address
- * @xdr: buffer containing this request
- * @base: starting position in the buffer
- * @rm: stream record marker field
- * @sent_p: return the total number of bytes successfully queued for sending
- *
- */
-static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, rpc_fraghdr rm, int *sent_p)
-{
- struct msghdr msg = {
- .msg_name = addr,
- .msg_namelen = addrlen,
- .msg_flags = XS_SENDMSG_FLAGS | MSG_MORE,
- };
- unsigned int rmsize = rm ? sizeof(rm) : 0;
- unsigned int remainder = rmsize + xdr->len - base;
- unsigned int want;
- int err = 0;
-
- if (unlikely(!sock))
- return -ENOTSOCK;
-
- want = xdr->head[0].iov_len + rmsize;
- if (base < want) {
- unsigned int len = want - base;
- remainder -= len;
- if (remainder == 0)
- msg.msg_flags &= ~MSG_MORE;
- if (rmsize)
- err = xs_send_rm_and_kvec(sock, &msg, rm,
- &xdr->head[0], base);
- else
- err = xs_send_kvec(sock, &msg, &xdr->head[0], base);
- if (remainder == 0 || err != len)
- goto out;
- *sent_p += err;
- base = 0;
- } else
- base -= want;
-
- if (base < xdr->page_len) {
- unsigned int len = xdr->page_len - base;
- remainder -= len;
- if (remainder == 0)
- msg.msg_flags &= ~MSG_MORE;
- err = xs_send_pagedata(sock, &msg, xdr, base);
- if (remainder == 0 || err != len)
- goto out;
- *sent_p += err;
- base = 0;
- } else
- base -= xdr->page_len;
-
- if (base >= xdr->tail[0].iov_len)
- return 0;
- msg.msg_flags &= ~MSG_MORE;
- err = xs_send_kvec(sock, &msg, &xdr->tail[0], base);
-out:
- if (err > 0) {
- *sent_p += err;
- err = 0;
- }
- return err;
-}
-
/**
* xs_nospace - handle transmit was incomplete
* @req: pointer to RPC request
@@ -959,8 +841,11 @@ static int xs_local_send_request(struct rpc_rqst *req)
struct xdr_buf *xdr = &req->rq_snd_buf;
rpc_fraghdr rm = xs_stream_record_marker(xdr);
unsigned int msglen = rm ? req->rq_slen + sizeof(rm) : req->rq_slen;
+ struct msghdr msg = {
+ .msg_flags = XS_SENDMSG_FLAGS,
+ };
+ unsigned int uninitialized_var(sent);
int status;
- int sent = 0;
/* Close the stream if the previous transmission was incomplete */
if (xs_send_request_was_aborted(transport, req)) {
@@ -972,8 +857,8 @@ static int xs_local_send_request(struct rpc_rqst *req)
req->rq_svec->iov_base, req->rq_svec->iov_len);
req->rq_xtime = ktime_get();
- status = xs_sendpages(transport->sock, NULL, 0, xdr,
- transport->xmit.offset, rm, &sent);
+ status = xprt_sock_sendmsg(transport->sock, &msg, xdr,
+ transport->xmit.offset, rm, &sent);
dprintk("RPC: %s(%u) = %d\n",
__func__, xdr->len - transport->xmit.offset, status);
@@ -1025,7 +910,12 @@ static int xs_udp_send_request(struct rpc_rqst *req)
struct rpc_xprt *xprt = req->rq_xprt;
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
struct xdr_buf *xdr = &req->rq_snd_buf;
- int sent = 0;
+ struct msghdr msg = {
+ .msg_name = xs_addr(xprt),
+ .msg_namelen = xprt->addrlen,
+ .msg_flags = XS_SENDMSG_FLAGS,
+ };
+ unsigned int uninitialized_var(sent);
int status;
xs_pktdump("packet data:",
@@ -1039,8 +929,7 @@ static int xs_udp_send_request(struct rpc_rqst *req)
return -EBADSLT;
req->rq_xtime = ktime_get();
- status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen,
- xdr, 0, 0, &sent);
+ status = xprt_sock_sendmsg(transport->sock, &msg, xdr, 0, 0, &sent);
dprintk("RPC: xs_udp_send_request(%u) = %d\n",
xdr->len, status);
@@ -1106,9 +995,12 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
struct xdr_buf *xdr = &req->rq_snd_buf;
rpc_fraghdr rm = xs_stream_record_marker(xdr);
unsigned int msglen = rm ? req->rq_slen + sizeof(rm) : req->rq_slen;
+ struct msghdr msg = {
+ .msg_flags = XS_SENDMSG_FLAGS,
+ };
bool vm_wait = false;
+ unsigned int uninitialized_var(sent);
int status;
- int sent;
/* Close the stream if the previous transmission was incomplete */
if (xs_send_request_was_aborted(transport, req)) {
@@ -1129,9 +1021,8 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
* called sendmsg(). */
req->rq_xtime = ktime_get();
while (1) {
- sent = 0;
- status = xs_sendpages(transport->sock, NULL, 0, xdr,
- transport->xmit.offset, rm, &sent);
+ status = xprt_sock_sendmsg(transport->sock, &msg, xdr,
+ transport->xmit.offset, rm, &sent);
dprintk("RPC: xs_tcp_send_request(%u) = %d\n",
xdr->len - transport->xmit.offset, status);
@@ -1970,7 +1861,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
struct rpc_xprt *xprt = &transport->xprt;
struct file *filp;
struct socket *sock;
- int status = -EIO;
+ int status;
status = __sock_create(xprt->xprt_net, AF_LOCAL,
SOCK_STREAM, 0, &sock, 1);
@@ -2636,46 +2527,25 @@ static void bc_free(struct rpc_task *task)
free_page((unsigned long)buf);
}
-/*
- * Use the svc_sock to send the callback. Must be called with svsk->sk_mutex
- * held. Borrows heavily from svc_tcp_sendto and xs_tcp_send_request.
- */
static int bc_sendto(struct rpc_rqst *req)
{
- int len;
- struct xdr_buf *xbufp = &req->rq_snd_buf;
+ struct xdr_buf *xdr = &req->rq_snd_buf;
struct sock_xprt *transport =
container_of(req->rq_xprt, struct sock_xprt, xprt);
- unsigned long headoff;
- unsigned long tailoff;
- struct page *tailpage;
struct msghdr msg = {
- .msg_flags = MSG_MORE
+ .msg_flags = 0,
};
rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT |
- (u32)xbufp->len);
- struct kvec iov = {
- .iov_base = &marker,
- .iov_len = sizeof(marker),
- };
+ (u32)xdr->len);
+ unsigned int sent = 0;
+ int err;
req->rq_xtime = ktime_get();
-
- len = kernel_sendmsg(transport->sock, &msg, &iov, 1, iov.iov_len);
- if (len != iov.iov_len)
+ err = xprt_sock_sendmsg(transport->sock, &msg, xdr, 0, marker, &sent);
+ xdr_free_bvec(xdr);
+ if (err < 0 || sent != (xdr->len + sizeof(marker)))
return -EAGAIN;
-
- tailpage = NULL;
- if (xbufp->tail[0].iov_len)
- tailpage = virt_to_page(xbufp->tail[0].iov_base);
- tailoff = (unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK;
- headoff = (unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK;
- len = svc_send_common(transport->sock, xbufp,
- virt_to_page(xbufp->head[0].iov_base), headoff,
- tailpage, tailoff);
- if (len != xbufp->len)
- return -EAGAIN;
- return len;
+ return sent;
}
/*
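
Editor's note: the xprtsock.c hunks delete the transport-private xs_sendpages() machinery. Each send path now fills in only the msghdr fields it actually needs (destination address for UDP, flags otherwise) and delegates the record marker plus head/pages/tail iteration to xprt_sock_sendmsg(), the shared helper declared in the newly included "socklib.h". For reference, the heart of what moved is the marker-plus-head coalescing; a condensed sketch, assuming a connected stream socket:

/* Sketch: send a 4-byte RPC record marker and the head kvec with a
 * single sock_sendmsg() call, as the removed xs_send_rm_and_kvec()
 * did. Assumes sock is a connected SOCK_STREAM socket.
 */
static int send_marker_and_head(struct socket *sock, rpc_fraghdr marker,
				struct kvec *head)
{
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
	struct kvec iov[2] = {
		{ .iov_base = &marker, .iov_len = sizeof(marker) },
		*head,
	};

	iov_iter_kvec(&msg.msg_iter, WRITE, iov, 2,
		      iov[0].iov_len + iov[1].iov_len);
	return sock_sendmsg(sock, &msg);
}
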
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 6d466ebdb64f..871feadbbc19 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -394,6 +394,11 @@ static inline u32 msg_connected(struct tipc_msg *m)
return msg_type(m) == TIPC_CONN_MSG;
}
+static inline u32 msg_direct(struct tipc_msg *m)
+{
+ return msg_type(m) == TIPC_DIRECT_MSG;
+}
+
static inline u32 msg_errcode(struct tipc_msg *m)
{
return msg_bits(m, 1, 25, 0xf);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 0c88778c88b5..10292c942384 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1586,7 +1586,8 @@ static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list)
case TIPC_MEDIUM_IMPORTANCE:
case TIPC_HIGH_IMPORTANCE:
case TIPC_CRITICAL_IMPORTANCE:
- if (msg_connected(hdr) || msg_named(hdr)) {
+ if (msg_connected(hdr) || msg_named(hdr) ||
+ msg_direct(hdr)) {
tipc_loopback_trace(peer_net, list);
spin_lock_init(&list->lock);
tipc_sk_rcv(peer_net, list);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 693e8902161e..87466607097f 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1461,7 +1461,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen)
}
__skb_queue_head_init(&pkts);
- mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false);
+ mtu = tipc_node_get_mtu(net, dnode, tsk->portid, true);
rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts);
if (unlikely(rc != dlen))
return rc;
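
Editor's note: the TIPC changes teach the node-local container shortcut about connectionless unicast: msg_direct() recognizes TIPC_DIRECT_MSG headers so tipc_lxc_xmit() can loop them back, and __tipc_sendmsg() requests the matching MTU. The new predicate follows TIPC's usual accessor convention, a one-line wrapper comparing msg_type() against a message-type constant; a hypothetical addition would look identical:

/* Sketch: TIPC header predicates are thin wrappers over msg_type().
 * TIPC_FOO_MSG is hypothetical, for illustration only.
 */
static inline u32 msg_is_foo(struct tipc_msg *m)
{
	return msg_type(m) == TIPC_FOO_MSG;
}
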
diff --git a/net/wireless/.gitignore b/net/wireless/.gitignore
index 61cbc304a3d3..1a29cd69d6cf 100644
--- a/net/wireless/.gitignore
+++ b/net/wireless/.gitignore
@@ -1,2 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
shipped-certs.c
extra-certs.c
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index ad87e9db9a91..5fa402144cda 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -16790,7 +16790,7 @@ void cfg80211_sta_opmode_change_notify(struct net_device *dev, const u8 *mac,
goto nla_put_failure;
if ((sta_opmode->changed & STA_OPMODE_MAX_BW_CHANGED) &&
- nla_put_u8(msg, NL80211_ATTR_CHANNEL_WIDTH, sta_opmode->bw))
+ nla_put_u32(msg, NL80211_ATTR_CHANNEL_WIDTH, sta_opmode->bw))
goto nla_put_failure;
if ((sta_opmode->changed & STA_OPMODE_N_SS_CHANGED) &&
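
Editor's note: the nl80211 fix corrects an attribute-width mismatch. NL80211_ATTR_CHANNEL_WIDTH is a 32-bit attribute, so emitting it with nla_put_u8() produced a 1-byte payload that a consumer expecting nla_get_u32(), or an NLA_U32 policy under strict validation, would misread or reject. The general rule, sketched with a hypothetical attribute:

/* Sketch: the put width must match the attribute's declared type.
 * attrtype is assumed to be declared NLA_U32 in its policy.
 */
#include <net/netlink.h>

static int put_width(struct sk_buff *msg, int attrtype, u32 width)
{
	return nla_put_u32(msg, attrtype, width);	/* not nla_put_u8() */
}
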
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index dd41e41f9d26..4000382aef48 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -2019,7 +2019,11 @@ void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev,
spin_lock_bh(&rdev->bss_lock);
- if (WARN_ON(cbss->pub.channel == chan))
+	/*
+	 * Some APs also use CSA for bandwidth changes, i.e., without
+	 * actually changing the control channel, so there is no need to
+	 * update in that case.
+	 */
+ if (cbss->pub.channel == chan)
goto done;
/* use transmitting bss */
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 50f567a88f45..6cc7f7f1dd68 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -46,6 +46,25 @@ static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb,
pskb_pull(skb, skb->mac_len + x->props.header_len);
}
+static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb,
+ unsigned int hsize)
+{
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ int phlen = 0;
+
+ if (xo->flags & XFRM_GSO_SEGMENT)
+ skb->transport_header = skb->network_header + hsize;
+
+ skb_reset_mac_len(skb);
+ if (x->sel.family != AF_INET6) {
+ phlen = IPV4_BEET_PHMAXLEN;
+ if (x->outer_mode.family == AF_INET6)
+ phlen += sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+ }
+
+ pskb_pull(skb, skb->mac_len + hsize + (x->props.header_len - phlen));
+}
+
/* Adjust pointers into the packet when IPsec is done at layer2 */
static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb)
{
@@ -66,9 +85,16 @@ static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb)
return __xfrm_transport_prep(x, skb,
sizeof(struct ipv6hdr));
break;
+ case XFRM_MODE_BEET:
+ if (x->outer_mode.family == AF_INET)
+ return __xfrm_mode_beet_prep(x, skb,
+ sizeof(struct iphdr));
+ if (x->outer_mode.family == AF_INET6)
+ return __xfrm_mode_beet_prep(x, skb,
+ sizeof(struct ipv6hdr));
+ break;
case XFRM_MODE_ROUTEOPTIMIZATION:
case XFRM_MODE_IN_TRIGGER:
- case XFRM_MODE_BEET:
break;
}
}
@@ -78,8 +104,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
int err;
unsigned long flags;
struct xfrm_state *x;
- struct sk_buff *skb2, *nskb;
struct softnet_data *sd;
+ struct sk_buff *skb2, *nskb, *pskb = NULL;
netdev_features_t esp_features = features;
struct xfrm_offload *xo = xfrm_offload(skb);
struct sec_path *sp;
@@ -168,14 +194,14 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
} else {
if (skb == skb2)
skb = nskb;
-
- if (!skb)
- return NULL;
+ else
+ pskb->next = nskb;
continue;
}
skb_push(skb2, skb2->data - skb_mac_header(skb2));
+ pskb = skb2;
}
return skb;
@@ -383,6 +409,7 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void
return xfrm_dev_feat_change(dev);
case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
return xfrm_dev_down(dev);
}
return NOTIFY_DONE;
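
Editor's note: besides treating NETDEV_UNREGISTER like NETDEV_DOWN, this file carries two substantive fixes. First, layer-2 IPsec offload learns to adjust packet pointers for BEET mode: the pull skips the outer header (hsize) but gives back the pseudo-header allowance phlen, which applies only to IPv4 inner selectors (IPV4_BEET_PHMAXLEN, grown by the v6/v4 header-size delta for IPv4-in-IPv6). Second, validate_xmit_xfrm() now tracks pskb, the last segment it kept, so dropping one GSO segment no longer orphans the rest of the chain. That list repair is the subtle part; reduced to its essentials with hypothetical names:

/* Sketch: drop nodes from a singly linked skb chain while walking it.
 * keep_prev remembers the last kept node so its ->next can be
 * respliced around a dropped one. should_drop() is hypothetical.
 */
static bool should_drop(struct sk_buff *skb);	/* hypothetical policy */

static struct sk_buff *walk_and_filter(struct sk_buff *head)
{
	struct sk_buff *cur = head, *next, *keep_prev = NULL;

	while (cur) {
		bool was_head = (cur == head);

		next = cur->next;
		cur->next = NULL;
		if (should_drop(cur)) {
			kfree_skb(cur);
			if (was_head)
				head = next;	/* chain starts later */
			else
				keep_prev->next = next;	/* unlink */
		} else {
			keep_prev = cur;
		}
		cur = next;
	}
	return head;
}
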
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index fafc7aba705f..2fd3d990d992 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -535,8 +535,8 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb
{
struct sk_buff *segs, *nskb;
- BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
- BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_SGO_CB_OFFSET);
+ BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET);
+ BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_GSO_CB_OFFSET);
segs = skb_gso_segment(skb, 0);
kfree_skb(skb);
if (IS_ERR(segs))
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index d560d723b601..297b2fdb3c29 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -434,7 +434,9 @@ EXPORT_SYMBOL(xfrm_policy_destroy);
static void xfrm_policy_kill(struct xfrm_policy *policy)
{
+ write_lock_bh(&policy->lock);
policy->walk.dead = 1;
+ write_unlock_bh(&policy->lock);
atomic_inc(&policy->genid);
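
Editor's note: setting walk.dead under the policy's write lock synchronizes xfrm_policy_kill() with readers such as the policy timer, which test the flag under read_lock(&policy->lock); without the write lock a concurrent reader can miss the flag and start teardown work that the kill path then repeats. The pattern in isolation, with a hypothetical object and timer:

/* Sketch: publish a "dead" flag under a rwlock so lock-holding
 * readers cannot race the teardown. Hypothetical names throughout.
 */
#include <linux/spinlock.h>
#include <linux/timer.h>

struct my_obj {
	rwlock_t lock;
	int dead;
	struct timer_list timer;
};

static void my_obj_kill(struct my_obj *obj)
{
	write_lock_bh(&obj->lock);
	obj->dead = 1;		/* readers can no longer miss this */
	write_unlock_bh(&obj->lock);
	del_timer_sync(&obj->timer);
}

static void my_timer_fn(struct timer_list *t)
{
	struct my_obj *obj = from_timer(obj, t, timer);

	read_lock(&obj->lock);
	if (!obj->dead) {
		/* ... periodic work, safe against teardown ... */
	}
	read_unlock(&obj->lock);
}
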
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 170d6e7f31d3..8be2d926acc2 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -612,7 +612,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
{
struct xfrm_state *x;
- x = kmem_cache_alloc(xfrm_state_cache, GFP_ATOMIC | __GFP_ZERO);
+ x = kmem_cache_zalloc(xfrm_state_cache, GFP_ATOMIC);
if (x) {
write_pnet(&x->xs_net, net);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index b88ba45ff1ac..e6cfaa680ef3 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -110,7 +110,8 @@ static inline int verify_sec_ctx_len(struct nlattr **attrs)
return 0;
uctx = nla_data(rt);
- if (uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len))
+ if (uctx->len > nla_len(rt) ||
+ uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len))
return -EINVAL;
return 0;
@@ -2275,6 +2276,9 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
err = verify_newpolicy_info(&ua->policy);
if (err)
goto free_state;
+ err = verify_sec_ctx_len(attrs);
+ if (err)
+ goto free_state;
/* build an XP */
xp = xfrm_policy_construct(net, &ua->policy, attrs, &err);
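
Editor's note: both xfrm_user.c hunks harden XFRMA_SEC_CTX parsing. verify_sec_ctx_len() now also requires the structure's self-described length to fit inside the attribute payload, and xfrm_add_acquire() runs that check before building a policy from user-supplied attributes. The shape of the validation, restated as a standalone sketch:

/* Sketch: validate a length field nested inside a netlink attribute.
 * The inner structure's self-described length must fit inside the
 * attribute payload before anything past the header is trusted.
 * Field names mirror struct xfrm_user_sec_ctx; the helper itself is
 * illustrative, not the kernel's.
 */
#include <net/netlink.h>
#include <linux/xfrm.h>

static int check_nested_len(const struct nlattr *attr)
{
	const struct xfrm_user_sec_ctx *uctx = nla_data(attr);

	if (nla_len(attr) < sizeof(*uctx))
		return -EINVAL;		/* header itself truncated */
	if (uctx->len > nla_len(attr))
		return -EINVAL;		/* claims more than is carried */
	if (uctx->len != sizeof(*uctx) + uctx->ctx_len)
		return -EINVAL;		/* internally inconsistent */
	return 0;
}
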