Diffstat (limited to 'net')
730 files changed, 36546 insertions, 30623 deletions
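A recurring change throughout the diff below is the mechanical rename of the kernel timer API: del_timer() becomes timer_delete() and del_timer_sync() becomes timer_delete_sync(), with no behavioral change implied by the one-for-one substitutions in the hunks. A minimal sketch of the pattern, assuming only the rename (the demo_* names are illustrative, not from the patch):

#include <linux/jiffies.h>
#include <linux/timer.h>

struct demo_state {
	struct timer_list timer;
};

static void demo_timeout(struct timer_list *t)
{
	/* from_timer() recovers the containing struct, as in the lec.c hunks */
	struct demo_state *s = from_timer(s, t, timer);

	(void)s;	/* expiry handling elided */
}

static void demo_start(struct demo_state *s)
{
	timer_setup(&s->timer, demo_timeout, 0);
	mod_timer(&s->timer, jiffies + HZ);
}

static void demo_stop(struct demo_state *s)
{
	/* was del_timer_sync(); also waits for a concurrently running callback */
	timer_delete_sync(&s->timer);
}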
diff --git a/net/802/Makefile b/net/802/Makefile index bfed80221b8b..99abc29d537c 100644 --- a/net/802/Makefile +++ b/net/802/Makefile @@ -3,12 +3,11 @@ # Makefile for the Linux 802.x protocol layers. # -# Check the p8022 selections against net/core/Makefile. -obj-$(CONFIG_LLC) += p8022.o psnap.o +obj-$(CONFIG_LLC) += psnap.o obj-$(CONFIG_NET_FC) += fc.o obj-$(CONFIG_FDDI) += fddi.o obj-$(CONFIG_HIPPI) += hippi.o -obj-$(CONFIG_ATALK) += p8022.o psnap.o +obj-$(CONFIG_ATALK) += psnap.o obj-$(CONFIG_STP) += stp.o obj-$(CONFIG_GARP) += garp.o obj-$(CONFIG_MRP) += mrp.o diff --git a/net/802/p8022.c b/net/802/p8022.c deleted file mode 100644 index 78c25168d7c9..000000000000 --- a/net/802/p8022.c +++ /dev/null @@ -1,64 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * NET3: Support for 802.2 demultiplexing off Ethernet - * - * Demultiplex 802.2 encoded protocols. We match the entry by the - * SSAP/DSAP pair and then deliver to the registered datalink that - * matches. The control byte is ignored and handling of such items - * is up to the routine passed the frame. - * - * Unlike the 802.3 datalink we have a list of 802.2 entries as - * there are multiple protocols to demux. The list is currently - * short (3 or 4 entries at most). The current demux assumes this. - */ -#include <linux/module.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <linux/slab.h> -#include <net/datalink.h> -#include <linux/mm.h> -#include <linux/in.h> -#include <linux/init.h> -#include <net/llc.h> -#include <net/p8022.h> - -static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb, - const unsigned char *dest) -{ - llc_build_and_send_ui_pkt(dl->sap, skb, dest, dl->sap->laddr.lsap); - return 0; -} - -struct datalink_proto *register_8022_client(unsigned char type, - int (*func)(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt, - struct net_device *orig_dev)) -{ - struct datalink_proto *proto; - - proto = kmalloc(sizeof(*proto), GFP_ATOMIC); - if (proto) { - proto->type[0] = type; - proto->header_length = 3; - proto->request = p8022_request; - proto->sap = llc_sap_open(type, func); - if (!proto->sap) { - kfree(proto); - proto = NULL; - } - } - return proto; -} - -void unregister_8022_client(struct datalink_proto *proto) -{ - llc_sap_put(proto->sap); - kfree(proto); -} - -EXPORT_SYMBOL(register_8022_client); -EXPORT_SYMBOL(unregister_8022_client); - -MODULE_DESCRIPTION("Support for 802.2 demultiplexing off Ethernet"); -MODULE_LICENSE("GPL"); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index e45187b88220..06908e37c3d9 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -23,7 +23,6 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> -#include <net/p8022.h> #include <net/arp.h> #include <linux/rtnetlink.h> #include <linux/notifier.h> @@ -131,7 +130,8 @@ int vlan_check_real_dev(struct net_device *real_dev, { const char *name = real_dev->name; - if (real_dev->features & NETIF_F_VLAN_CHALLENGED) { + if (real_dev->features & NETIF_F_VLAN_CHALLENGED || + real_dev->type != ARPHRD_ETHER) { pr_info("VLANs not supported on %s\n", name); NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device"); return -EOPNOTSUPP; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 91d134961357..fbf296137b09 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -27,6 +27,7 @@ #include <linux/phy.h> #include <net/arp.h> #include <net/macsec.h> +#include <net/netdev_lock.h> #include "vlan.h" #include "vlanproc.h" @@ -273,17 +274,6 
@@ static int vlan_dev_open(struct net_device *dev) goto out; } - if (dev->flags & IFF_ALLMULTI) { - err = dev_set_allmulti(real_dev, 1); - if (err < 0) - goto del_unicast; - } - if (dev->flags & IFF_PROMISC) { - err = dev_set_promiscuity(real_dev, 1); - if (err < 0) - goto clear_allmulti; - } - ether_addr_copy(vlan->real_dev_addr, real_dev->dev_addr); if (vlan->flags & VLAN_FLAG_GVRP) @@ -297,12 +287,6 @@ static int vlan_dev_open(struct net_device *dev) netif_carrier_on(dev); return 0; -clear_allmulti: - if (dev->flags & IFF_ALLMULTI) - dev_set_allmulti(real_dev, -1); -del_unicast: - if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) - dev_uc_del(real_dev, dev->dev_addr); out: netif_carrier_off(dev); return err; @@ -315,10 +299,6 @@ static int vlan_dev_stop(struct net_device *dev) dev_mc_unsync(real_dev, dev); dev_uc_unsync(real_dev, dev); - if (dev->flags & IFF_ALLMULTI) - dev_set_allmulti(real_dev, -1); - if (dev->flags & IFF_PROMISC) - dev_set_promiscuity(real_dev, -1); if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr)) dev_uc_del(real_dev, dev->dev_addr); @@ -377,7 +357,6 @@ static int vlan_hwtstamp_set(struct net_device *dev, static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; - const struct net_device_ops *ops = real_dev->netdev_ops; struct ifreq ifrr; int err = -EOPNOTSUPP; @@ -388,8 +367,7 @@ static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCGMIIPHY: case SIOCGMIIREG: case SIOCSMIIREG: - if (netif_device_present(real_dev) && ops->ndo_eth_ioctl) - err = ops->ndo_eth_ioctl(real_dev, &ifrr, cmd); + err = dev_eth_ioctl(real_dev, &ifrr, cmd); break; } @@ -490,12 +468,10 @@ static void vlan_dev_change_rx_flags(struct net_device *dev, int change) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; - if (dev->flags & IFF_UP) { - if (change & IFF_ALLMULTI) - dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1); - if (change & IFF_PROMISC) - dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : -1); - } + if (change & IFF_ALLMULTI) + dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1); + if (change & IFF_PROMISC) + dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 
1 : -1); } static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 134419667d59..a000b1ef0520 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -135,11 +135,14 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], return 0; } -static int vlan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int vlan_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); struct vlan_dev_priv *vlan = vlan_dev_priv(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct net_device *real_dev; unsigned int max_mtu; __be16 proto; @@ -155,7 +158,7 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, return -EINVAL; } - real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + real_dev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (!real_dev) { NL_SET_ERR_MSG_MOD(extack, "link does not exist"); return -ENODEV; diff --git a/net/9p/client.c b/net/9p/client.c index 09f8ced9f8bb..5c1ca57ccd28 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -1548,7 +1548,8 @@ p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to, struct p9_client *clnt = fid->clnt; struct p9_req_t *req; int count = iov_iter_count(to); - int rsize, received, non_zc = 0; + u32 rsize, received; + bool non_zc = false; char *dataptr; *err = 0; @@ -1571,7 +1572,7 @@ p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to, 0, 11, "dqd", fid->fid, offset, rsize); } else { - non_zc = 1; + non_zc = true; req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset, rsize); } @@ -1592,11 +1593,13 @@ p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to, return 0; } if (rsize < received) { - pr_err("bogus RREAD count (%d > %d)\n", received, rsize); - received = rsize; + pr_err("bogus RREAD count (%u > %u)\n", received, rsize); + *err = -EIO; + p9_req_put(clnt, req); + return 0; } - p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", received); + p9_debug(P9_DEBUG_9P, "<<< RREAD count %u\n", received); if (non_zc) { int n = copy_to_iter(dataptr, received, to); @@ -1623,9 +1626,9 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) *err = 0; while (iov_iter_count(from)) { - int count = iov_iter_count(from); - int rsize = fid->iounit; - int written; + size_t count = iov_iter_count(from); + u32 rsize = fid->iounit; + u32 written; if (!rsize || rsize > clnt->msize - P9_IOHDRSZ) rsize = clnt->msize - P9_IOHDRSZ; @@ -1633,7 +1636,7 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) if (count < rsize) rsize = count; - p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %d (/%d)\n", + p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %u (/%zu)\n", fid->fid, offset, rsize, count); /* Don't bother zerocopy for small IO (< 1024) */ @@ -1659,11 +1662,14 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) break; } if (rsize < written) { - pr_err("bogus RWRITE count (%d > %d)\n", written, rsize); - written = rsize; + pr_err("bogus RWRITE count (%u > %u)\n", written, rsize); + *err = -EIO; + iov_iter_revert(from, count - iov_iter_count(from)); + p9_req_put(clnt, req); + break; } - p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", written); + p9_debug(P9_DEBUG_9P, "<<< 
RWRITE count %u\n", written); p9_req_put(clnt, req); iov_iter_revert(from, count - written - iov_iter_count(from)); @@ -1698,7 +1704,7 @@ p9_client_write_subreq(struct netfs_io_subrequest *subreq) start, len, &subreq->io_iter); } if (IS_ERR(req)) { - netfs_write_subrequest_terminated(subreq, PTR_ERR(req), false); + netfs_write_subrequest_terminated(subreq, PTR_ERR(req)); return; } @@ -1706,19 +1712,19 @@ p9_client_write_subreq(struct netfs_io_subrequest *subreq) if (err) { trace_9p_protocol_dump(clnt, &req->rc); p9_req_put(clnt, req); - netfs_write_subrequest_terminated(subreq, err, false); + netfs_write_subrequest_terminated(subreq, err); return; } if (written > len) { pr_err("bogus RWRITE count (%d > %u)\n", written, len); - written = len; + written = -EIO; } p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", len); p9_req_put(clnt, req); - netfs_write_subrequest_terminated(subreq, written, false); + netfs_write_subrequest_terminated(subreq, written); } EXPORT_SYMBOL(p9_client_write_subreq); @@ -2098,7 +2104,8 @@ EXPORT_SYMBOL_GPL(p9_client_xattrcreate); int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) { - int err, rsize, non_zc = 0; + int err, non_zc = 0; + u32 rsize; struct p9_client *clnt; struct p9_req_t *req; char *dataptr; @@ -2107,7 +2114,7 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) iov_iter_kvec(&to, ITER_DEST, &kv, 1, count); - p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %d\n", + p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %u\n", fid->fid, offset, count); clnt = fid->clnt; @@ -2142,11 +2149,12 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) goto free_and_error; } if (rsize < count) { - pr_err("bogus RREADDIR count (%d > %d)\n", count, rsize); - count = rsize; + pr_err("bogus RREADDIR count (%u > %u)\n", count, rsize); + err = -EIO; + goto free_and_error; } - p9_debug(P9_DEBUG_9P, "<<< RREADDIR count %d\n", count); + p9_debug(P9_DEBUG_9P, "<<< RREADDIR count %u\n", count); if (non_zc) memmove(data, dataptr, count); diff --git a/net/9p/error.c b/net/9p/error.c index 8da744494b68..8ba8afc91482 100644 --- a/net/9p/error.c +++ b/net/9p/error.c @@ -16,6 +16,7 @@ #include <linux/list.h> #include <linux/jhash.h> #include <linux/errno.h> +#include <linux/hashtable.h> #include <net/9p/9p.h> /** @@ -33,8 +34,8 @@ struct errormap { struct hlist_node list; }; -#define ERRHASHSZ 32 -static struct hlist_head hash_errmap[ERRHASHSZ]; +#define ERRHASH_BITS 5 +static DEFINE_HASHTABLE(hash_errmap, ERRHASH_BITS); /* FixMe - reduce to a reasonable size */ static struct errormap errmap[] = { @@ -176,18 +177,14 @@ static struct errormap errmap[] = { int p9_error_init(void) { struct errormap *c; - int bucket; - - /* initialize hash table */ - for (bucket = 0; bucket < ERRHASHSZ; bucket++) - INIT_HLIST_HEAD(&hash_errmap[bucket]); + u32 hash; /* load initial error map into hash table */ for (c = errmap; c->name; c++) { c->namelen = strlen(c->name); - bucket = jhash(c->name, c->namelen, 0) % ERRHASHSZ; + hash = jhash(c->name, c->namelen, 0); INIT_HLIST_NODE(&c->list); - hlist_add_head(&c->list, &hash_errmap[bucket]); + hash_add(hash_errmap, &c->list, hash); } return 1; @@ -205,12 +202,12 @@ int p9_errstr2errno(char *errstr, int len) { int errno; struct errormap *c; - int bucket; + u32 hash; errno = 0; c = NULL; - bucket = jhash(errstr, len, 0) % ERRHASHSZ; - hlist_for_each_entry(c, &hash_errmap[bucket], list) { + hash = jhash(errstr, len, 0); + hash_for_each_possible(hash_errmap, c, 
list, hash) { if (c->namelen == len && !memcmp(c->name, errstr, len)) { errno = c->val; break; diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 196060dc6138..339ec4e54778 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/in.h> +#include <linux/in6.h> #include <linux/module.h> #include <linux/net.h> #include <linux/ipv6.h> @@ -191,12 +192,13 @@ static void p9_conn_cancel(struct p9_conn *m, int err) spin_lock(&m->req_lock); - if (m->err) { + if (READ_ONCE(m->err)) { spin_unlock(&m->req_lock); return; } - m->err = err; + WRITE_ONCE(m->err, err); + ASSERT_EXCLUSIVE_WRITER(m->err); list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) { list_move(&req->req_list, &cancel_list); @@ -283,7 +285,7 @@ static void p9_read_work(struct work_struct *work) m = container_of(work, struct p9_conn, rq); - if (m->err < 0) + if (READ_ONCE(m->err) < 0) return; p9_debug(P9_DEBUG_TRANS, "start mux %p pos %zd\n", m, m->rc.offset); @@ -450,7 +452,7 @@ static void p9_write_work(struct work_struct *work) m = container_of(work, struct p9_conn, wq); - if (m->err < 0) { + if (READ_ONCE(m->err) < 0) { clear_bit(Wworksched, &m->wsched); return; } @@ -622,7 +624,7 @@ static void p9_poll_mux(struct p9_conn *m) __poll_t n; int err = -ECONNRESET; - if (m->err < 0) + if (READ_ONCE(m->err) < 0) return; n = p9_fd_poll(m->client, NULL, &err); @@ -665,6 +667,7 @@ static void p9_poll_mux(struct p9_conn *m) static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) { __poll_t n; + int err; struct p9_trans_fd *ts = client->trans; struct p9_conn *m = &ts->conn; @@ -673,9 +676,10 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) spin_lock(&m->req_lock); - if (m->err < 0) { + err = READ_ONCE(m->err); + if (err < 0) { spin_unlock(&m->req_lock); - return m->err; + return err; } WRITE_ONCE(req->status, REQ_STATUS_UNSENT); @@ -954,64 +958,55 @@ static void p9_fd_close(struct p9_client *client) kfree(ts); } -/* - * stolen from NFS - maybe should be made a generic function? 
- */ -static inline int valid_ipaddr4(const char *buf) -{ - int rc, count, in[4]; - - rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]); - if (rc != 4) - return -EINVAL; - for (count = 0; count < 4; count++) { - if (in[count] > 255) - return -EINVAL; - } - return 0; -} - static int p9_bind_privport(struct socket *sock) { - struct sockaddr_in cl; + struct sockaddr_storage stor = { 0 }; int port, err = -EINVAL; - memset(&cl, 0, sizeof(cl)); - cl.sin_family = AF_INET; - cl.sin_addr.s_addr = htonl(INADDR_ANY); + stor.ss_family = sock->ops->family; + if (stor.ss_family == AF_INET) + ((struct sockaddr_in *)&stor)->sin_addr.s_addr = htonl(INADDR_ANY); + else + ((struct sockaddr_in6 *)&stor)->sin6_addr = in6addr_any; for (port = p9_ipport_resv_max; port >= p9_ipport_resv_min; port--) { - cl.sin_port = htons((ushort)port); - err = kernel_bind(sock, (struct sockaddr *)&cl, sizeof(cl)); + if (stor.ss_family == AF_INET) + ((struct sockaddr_in *)&stor)->sin_port = htons((ushort)port); + else + ((struct sockaddr_in6 *)&stor)->sin6_port = htons((ushort)port); + err = kernel_bind(sock, (struct sockaddr *)&stor, sizeof(stor)); if (err != -EADDRINUSE) break; } return err; } - static int p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) { int err; + char port_str[6]; struct socket *csocket; - struct sockaddr_in sin_server; + struct sockaddr_storage stor = { 0 }; struct p9_fd_opts opts; err = parse_opts(args, &opts); if (err < 0) return err; - if (addr == NULL || valid_ipaddr4(addr) < 0) + if (!addr) return -EINVAL; + sprintf(port_str, "%u", opts.port); + err = inet_pton_with_scope(current->nsproxy->net_ns, AF_UNSPEC, addr, + port_str, &stor); + if (err < 0) + return err; + csocket = NULL; client->trans_opts.tcp.port = opts.port; client->trans_opts.tcp.privport = opts.privport; - sin_server.sin_family = AF_INET; - sin_server.sin_addr.s_addr = in_aton(addr); - sin_server.sin_port = htons(opts.port); - err = __sock_create(current->nsproxy->net_ns, PF_INET, + err = __sock_create(current->nsproxy->net_ns, stor.ss_family, SOCK_STREAM, IPPROTO_TCP, &csocket, 1); if (err) { pr_err("%s (%d): problem creating socket\n", @@ -1030,8 +1025,8 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) } err = READ_ONCE(csocket->ops)->connect(csocket, - (struct sockaddr *)&sin_server, - sizeof(struct sockaddr_in), 0); + (struct sockaddr *)&stor, + sizeof(stor), 0); if (err < 0) { pr_err("%s (%d): problem connecting socket to %s\n", __func__, task_pid_nr(current), addr); diff --git a/net/Kconfig b/net/Kconfig index c3fca69a7c83..ebc80a98fc91 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -68,13 +68,17 @@ config SKB_EXTENSIONS config NET_DEVMEM def_bool y + select GENERIC_ALLOCATOR depends on DMA_SHARED_BUFFER - depends on GENERIC_ALLOCATOR depends on PAGE_POOL config NET_SHAPER bool +config NET_CRC32C + bool + select CRC32 + menu "Networking options" source "net/packet/Kconfig" @@ -245,7 +249,6 @@ source "net/bridge/netfilter/Kconfig" endif -source "net/dccp/Kconfig" source "net/sctp/Kconfig" source "net/rds/Kconfig" source "net/tipc/Kconfig" diff --git a/net/Makefile b/net/Makefile index 60ed5190eda8..aac960c41db6 100644 --- a/net/Makefile +++ b/net/Makefile @@ -42,7 +42,6 @@ obj-$(CONFIG_PHONET) += phonet/ ifneq ($(CONFIG_VLAN_8021Q),) obj-y += 8021q/ endif -obj-$(CONFIG_IP_DCCP) += dccp/ obj-$(CONFIG_IP_SCTP) += sctp/ obj-$(CONFIG_RDS) += rds/ obj-$(CONFIG_WIRELESS) += wireless/ diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 9fa0b246902b..9c787e2e4b17 
100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -432,49 +432,18 @@ static struct atalk_addr *__aarp_proxy_find(struct net_device *dev, return a ? sa : NULL; } -/* - * Probe a Phase 1 device or a device that requires its Net:Node to - * be set via an ioctl. - */ -static void aarp_send_probe_phase1(struct atalk_iface *iface) -{ - struct ifreq atreq; - struct sockaddr_at *sa = (struct sockaddr_at *)&atreq.ifr_addr; - const struct net_device_ops *ops = iface->dev->netdev_ops; - - sa->sat_addr.s_node = iface->address.s_node; - sa->sat_addr.s_net = ntohs(iface->address.s_net); - - /* We pass the Net:Node to the drivers/cards by a Device ioctl. */ - if (!(ops->ndo_do_ioctl(iface->dev, &atreq, SIOCSIFADDR))) { - ops->ndo_do_ioctl(iface->dev, &atreq, SIOCGIFADDR); - if (iface->address.s_net != htons(sa->sat_addr.s_net) || - iface->address.s_node != sa->sat_addr.s_node) - iface->status |= ATIF_PROBE_FAIL; - - iface->address.s_net = htons(sa->sat_addr.s_net); - iface->address.s_node = sa->sat_addr.s_node; - } -} - - void aarp_probe_network(struct atalk_iface *atif) { - if (atif->dev->type == ARPHRD_LOCALTLK || - atif->dev->type == ARPHRD_PPP) - aarp_send_probe_phase1(atif); - else { - unsigned int count; + unsigned int count; - for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) { - aarp_send_probe(atif->dev, &atif->address); + for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) { + aarp_send_probe(atif->dev, &atif->address); - /* Defer 1/10th */ - msleep(100); + /* Defer 1/10th */ + msleep(100); - if (atif->status & ATIF_PROBE_FAIL) - break; - } + if (atif->status & ATIF_PROBE_FAIL) + break; } } @@ -887,7 +856,7 @@ int __init aarp_proto_init(void) add_timer(&aarp_timer); rc = register_netdevice_notifier(&aarp_notifier); if (rc) { - del_timer_sync(&aarp_timer); + timer_delete_sync(&aarp_timer); unregister_snap_client(aarp_dl); } return rc; @@ -1042,7 +1011,7 @@ const struct seq_operations aarp_seq_ops = { /* General module cleanup. Called from cleanup_module() in ddp.c. */ void aarp_cleanup_module(void) { - del_timer_sync(&aarp_timer); + timer_delete_sync(&aarp_timer); unregister_netdevice_notifier(&aarp_notifier); unregister_snap_client(aarp_dl); aarp_purge(); diff --git a/net/atm/clip.c b/net/atm/clip.c index 42b910cb4e8e..61b5b700817d 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -904,7 +904,7 @@ static void atm_clip_exit_noproc(void) /* First, stop the idle timer, so it stops banging * on the table. 
*/ - del_timer_sync(&idle_timer); + timer_delete_sync(&idle_timer); dev = clip_devs; while (dev) { diff --git a/net/atm/lec.c b/net/atm/lec.c index ffef658862db..ded2f0df2ee6 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -181,6 +181,7 @@ static void lec_send(struct atm_vcc *vcc, struct sk_buff *skb) { struct net_device *dev = skb->dev; + unsigned int len = skb->len; ATM_SKB(skb)->vcc = vcc; atm_account_tx(vcc, skb); @@ -191,7 +192,7 @@ lec_send(struct atm_vcc *vcc, struct sk_buff *skb) } dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; + dev->stats.tx_bytes += len; } static void lec_tx_timeout(struct net_device *dev, unsigned int txqueue) @@ -1301,7 +1302,7 @@ lec_arp_remove(struct lec_priv *priv, struct lec_arp_table *to_remove) return -1; hlist_del(&to_remove->next); - del_timer(&to_remove->timer); + timer_delete(&to_remove->timer); /* * If this is the only MAC connected to this VCC, @@ -1481,7 +1482,7 @@ static void lec_arp_destroy(struct lec_priv *priv) hlist_for_each_entry_safe(entry, next, &priv->lec_arp_empty_ones, next) { - del_timer_sync(&entry->timer); + timer_delete_sync(&entry->timer); lec_arp_clear_vccs(entry); hlist_del(&entry->next); lec_arp_put(entry); @@ -1490,7 +1491,7 @@ static void lec_arp_destroy(struct lec_priv *priv) hlist_for_each_entry_safe(entry, next, &priv->lec_no_forward, next) { - del_timer_sync(&entry->timer); + timer_delete_sync(&entry->timer); lec_arp_clear_vccs(entry); hlist_del(&entry->next); lec_arp_put(entry); @@ -1574,7 +1575,7 @@ static void lec_arp_expire_vcc(struct timer_list *t) struct lec_arp_table *to_remove = from_timer(to_remove, t, timer); struct lec_priv *priv = to_remove->priv; - del_timer(&to_remove->timer); + timer_delete(&to_remove->timer); pr_debug("%p %p: vpi:%d vci:%d\n", to_remove, priv, @@ -1842,16 +1843,16 @@ lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr, &priv->lec_arp_empty_ones, next) { if (memcmp(entry->atm_addr, atm_addr, ATM_ESA_LEN) == 0) { hlist_del(&entry->next); - del_timer(&entry->timer); + timer_delete(&entry->timer); tmp = lec_arp_find(priv, mac_addr); if (tmp) { - del_timer(&tmp->timer); + timer_delete(&tmp->timer); tmp->status = ESI_FORWARD_DIRECT; memcpy(tmp->atm_addr, atm_addr, ATM_ESA_LEN); tmp->vcc = entry->vcc; tmp->old_push = entry->old_push; tmp->last_used = jiffies; - del_timer(&entry->timer); + timer_delete(&entry->timer); lec_arp_put(entry); entry = tmp; } else { @@ -1882,7 +1883,7 @@ lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr, /* Temporary, changes before end of function */ } memcpy(entry->atm_addr, atm_addr, ATM_ESA_LEN); - del_timer(&entry->timer); + timer_delete(&entry->timer); for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { hlist_for_each_entry(tmp, &priv->lec_arp_tables[i], next) { @@ -1945,7 +1946,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, entry = make_entry(priv, bus_mac); if (entry == NULL) goto out; - del_timer(&entry->timer); + timer_delete(&entry->timer); memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); entry->recv_vcc = vcc; entry->old_recv_push = old_push; @@ -1987,7 +1988,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data, entry->recv_vcc ? 
entry->recv_vcc-> vci : 0); found_entry = 1; - del_timer(&entry->timer); + timer_delete(&entry->timer); entry->vcc = vcc; entry->old_push = old_push; if (entry->status == ESI_VC_PENDING) { @@ -2171,7 +2172,7 @@ static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc) &priv->lec_arp_empty_ones, next) { if (entry->vcc == vcc) { lec_arp_clear_vccs(entry); - del_timer(&entry->timer); + timer_delete(&entry->timer); hlist_del(&entry->next); lec_arp_put(entry); } @@ -2181,7 +2182,7 @@ static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc) &priv->lec_no_forward, next) { if (entry->recv_vcc == vcc) { lec_arp_clear_vccs(entry); - del_timer(&entry->timer); + timer_delete(&entry->timer); hlist_del(&entry->next); lec_arp_put(entry); } @@ -2214,7 +2215,7 @@ lec_arp_check_empties(struct lec_priv *priv, hlist_for_each_entry_safe(entry, next, &priv->lec_arp_empty_ones, next) { if (vcc == entry->vcc) { - del_timer(&entry->timer); + timer_delete(&entry->timer); ether_addr_copy(entry->mac_addr, src); entry->status = ESI_FORWARD_DIRECT; entry->last_used = jiffies; diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 324e3ab96bb3..f6b447bba329 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -804,7 +804,7 @@ static int atm_mpoa_mpoad_attach(struct atm_vcc *vcc, int arg) /* This lets us now how our LECs are doing */ err = register_netdevice_notifier(&mpoa_notifier); if (err < 0) { - del_timer(&mpc_timer); + timer_delete(&mpc_timer); return err; } } @@ -1314,6 +1314,8 @@ static void MPOA_cache_impos_rcvd(struct k_message *msg, holding_time = msg->content.eg_info.holding_time; dprintk("(%s) entry = %p, holding_time = %u\n", mpc->dev->name, entry, holding_time); + if (entry == NULL && !holding_time) + return; if (entry == NULL && holding_time) { entry = mpc->eg_ops->add_entry(msg, mpc); mpc->eg_ops->put(entry); @@ -1493,7 +1495,7 @@ static void __exit atm_mpoa_cleanup(void) mpc_proc_clean(); - del_timer_sync(&mpc_timer); + timer_delete_sync(&mpc_timer); unregister_netdevice_notifier(&mpoa_notifier); deregister_atm_ioctl(&atm_ioctl_ops); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index d6f9fae06a9d..b790bb92ed1c 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -467,7 +467,7 @@ einval_put: goto out_put; } -static void ax25_fillin_cb_from_dev(ax25_cb *ax25, ax25_dev *ax25_dev) +static void ax25_fillin_cb_from_dev(ax25_cb *ax25, const ax25_dev *ax25_dev) { ax25->rtt = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]) / 2; ax25->t1 = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]); @@ -677,22 +677,33 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; } - rtnl_lock(); - dev = __dev_get_by_name(&init_net, devname); + rcu_read_lock(); + dev = dev_get_by_name_rcu(&init_net, devname); if (!dev) { - rtnl_unlock(); + rcu_read_unlock(); res = -ENODEV; break; } + if (ax25->ax25_dev) { + if (dev == ax25->ax25_dev->dev) { + rcu_read_unlock(); + break; + } + netdev_put(ax25->ax25_dev->dev, &ax25->dev_tracker); + ax25_dev_put(ax25->ax25_dev); + } + ax25->ax25_dev = ax25_dev_ax25dev(dev); if (!ax25->ax25_dev) { - rtnl_unlock(); + rcu_read_unlock(); res = -ENODEV; break; } ax25_fillin_cb(ax25, ax25->ax25_dev); - rtnl_unlock(); + netdev_hold(dev, &ax25->dev_tracker, GFP_ATOMIC); + ax25_dev_hold(ax25->ax25_dev); + rcu_read_unlock(); break; default: @@ -1060,11 +1071,11 @@ static int ax25_release(struct socket *sock) } if (ax25_dev) { if (!ax25_dev->device_up) { - del_timer_sync(&ax25->timer); - del_timer_sync(&ax25->t1timer); - 
del_timer_sync(&ax25->t2timer); - del_timer_sync(&ax25->t3timer); - del_timer_sync(&ax25->idletimer); + timer_delete_sync(&ax25->timer); + timer_delete_sync(&ax25->t1timer); + timer_delete_sync(&ax25->t2timer); + timer_delete_sync(&ax25->t3timer); + timer_delete_sync(&ax25->idletimer); } netdev_put(ax25_dev->dev, &ax25->dev_tracker); ax25_dev_put(ax25_dev); @@ -1259,28 +1270,18 @@ static int __must_check ax25_connect(struct socket *sock, } } - /* - * Must bind first - autobinding in this may or may not work. If - * the socket is already bound, check to see if the device has - * been filled in, error if it hasn't. - */ + /* Must bind first - autobinding does not work. */ if (sock_flag(sk, SOCK_ZAPPED)) { - /* check if we can remove this feature. It is broken. */ - printk(KERN_WARNING "ax25_connect(): %s uses autobind, please contact jreuter@yaina.de\n", - current->comm); - if ((err = ax25_rt_autobind(ax25, &fsa->fsa_ax25.sax25_call)) < 0) { - kfree(digi); - goto out_release; - } + kfree(digi); + err = -EINVAL; + goto out_release; + } - ax25_fillin_cb(ax25, ax25->ax25_dev); - ax25_cb_add(ax25); - } else { - if (ax25->ax25_dev == NULL) { - kfree(digi); - err = -EHOSTUNREACH; - goto out_release; - } + /* Check to see if the device has been filled in, error if it hasn't. */ + if (ax25->ax25_dev == NULL) { + kfree(digi); + err = -EHOSTUNREACH; + goto out_release; } if (sk->sk_type == SOCK_SEQPACKET && diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 9efd6690b344..3733c0254a50 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -90,7 +90,7 @@ void ax25_dev_device_up(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); list_add(&ax25_dev->list, &ax25_dev_list); - dev->ax25_ptr = ax25_dev; + rcu_assign_pointer(dev->ax25_ptr, ax25_dev); spin_unlock_bh(&ax25_dev_lock); ax25_register_dev_sysctl(ax25_dev); @@ -125,7 +125,7 @@ void ax25_dev_device_down(struct net_device *dev) } } - dev->ax25_ptr = NULL; + RCU_INIT_POINTER(dev->ax25_ptr, NULL); spin_unlock_bh(&ax25_dev_lock); netdev_put(dev, &ax25_dev->dev_tracker); ax25_dev_put(ax25_dev); diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c index c4f8adbf8144..8d9fba069001 100644 --- a/net/ax25/ax25_ds_timer.c +++ b/net/ax25/ax25_ds_timer.c @@ -44,7 +44,7 @@ void ax25_ds_setup_timer(ax25_dev *ax25_dev) void ax25_ds_del_timer(ax25_dev *ax25_dev) { if (ax25_dev) - del_timer(&ax25_dev->dama.slave_timer); + timer_delete(&ax25_dev->dama.slave_timer); } void ax25_ds_set_timer(ax25_dev *ax25_dev) diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index 36249776c021..215d4ccf12b9 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -122,6 +122,7 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb) if (dev == NULL) dev = skb->dev; + rcu_read_lock(); if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) { kfree_skb(skb); goto put; @@ -202,7 +203,7 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb) ax25_queue_xmit(skb, dev); put: - + rcu_read_unlock(); ax25_route_lock_unuse(); return NETDEV_TX_OK; } diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c index 3db76d2470e9..8bca2ace98e5 100644 --- a/net/ax25/ax25_out.c +++ b/net/ax25/ax25_out.c @@ -39,10 +39,14 @@ ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, const ax25_address *sr * specified. 
*/ if (paclen == 0) { - if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + rcu_read_lock(); + ax25_dev = ax25_dev_ax25dev(dev); + if (!ax25_dev) { + rcu_read_unlock(); return NULL; - + } paclen = ax25_dev->values[AX25_VALUES_PACLEN]; + rcu_read_unlock(); } /* @@ -53,13 +57,19 @@ ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, const ax25_address *sr return ax25; /* It already existed */ } - if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + rcu_read_lock(); + ax25_dev = ax25_dev_ax25dev(dev); + if (!ax25_dev) { + rcu_read_unlock(); return NULL; + } - if ((ax25 = ax25_create_cb()) == NULL) + if ((ax25 = ax25_create_cb()) == NULL) { + rcu_read_unlock(); return NULL; - + } ax25_fillin_cb(ax25, ax25_dev); + rcu_read_unlock(); ax25->source_addr = *src; ax25->dest_addr = *dest; @@ -358,7 +368,9 @@ void ax25_queue_xmit(struct sk_buff *skb, struct net_device *dev) { unsigned char *ptr; + rcu_read_lock(); skb->protocol = ax25_type_trans(skb, ax25_fwd_dev(dev)); + rcu_read_unlock(); ptr = skb_push(skb, 1); *ptr = 0x00; /* KISS */ diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index b7c4d656a94b..10577434f40b 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -373,78 +373,6 @@ ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev) return ax25_rt; } -/* - * Adjust path: If you specify a default route and want to connect - * a target on the digipeater path but w/o having a special route - * set before, the path has to be truncated from your target on. - */ -static inline void ax25_adjust_path(ax25_address *addr, ax25_digi *digipeat) -{ - int k; - - for (k = 0; k < digipeat->ndigi; k++) { - if (ax25cmp(addr, &digipeat->calls[k]) == 0) - break; - } - - digipeat->ndigi = k; -} - - -/* - * Find which interface to use. 
- */ -int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) -{ - ax25_uid_assoc *user; - ax25_route *ax25_rt; - int err = 0; - - ax25_route_lock_use(); - ax25_rt = ax25_get_route(addr, NULL); - if (!ax25_rt) { - ax25_route_lock_unuse(); - return -EHOSTUNREACH; - } - if ((ax25->ax25_dev = ax25_dev_ax25dev(ax25_rt->dev)) == NULL) { - err = -EHOSTUNREACH; - goto put; - } - - user = ax25_findbyuid(current_euid()); - if (user) { - ax25->source_addr = user->call; - ax25_uid_put(user); - } else { - if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { - err = -EPERM; - goto put; - } - ax25->source_addr = *(ax25_address *)ax25->ax25_dev->dev->dev_addr; - } - - if (ax25_rt->digipeat != NULL) { - ax25->digipeat = kmemdup(ax25_rt->digipeat, sizeof(ax25_digi), - GFP_ATOMIC); - if (ax25->digipeat == NULL) { - err = -ENOMEM; - goto put; - } - ax25_adjust_path(addr, ax25->digipeat); - } - - if (ax25->sk != NULL) { - local_bh_disable(); - bh_lock_sock(ax25->sk); - sock_reset_flag(ax25->sk, SOCK_ZAPPED); - bh_unlock_sock(ax25->sk); - local_bh_enable(); - } - -put: - ax25_route_lock_unuse(); - return err; -} struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src, ax25_address *dest, ax25_digi *digi) diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c index 9ff98f46dc6b..bff4b203a893 100644 --- a/net/ax25/ax25_subr.c +++ b/net/ax25/ax25_subr.c @@ -262,11 +262,11 @@ void ax25_disconnect(ax25_cb *ax25, int reason) ax25_clear_queues(ax25); if (reason == ENETUNREACH) { - del_timer_sync(&ax25->timer); - del_timer_sync(&ax25->t1timer); - del_timer_sync(&ax25->t2timer); - del_timer_sync(&ax25->t3timer); - del_timer_sync(&ax25->idletimer); + timer_delete_sync(&ax25->timer); + timer_delete_sync(&ax25->t1timer); + timer_delete_sync(&ax25->t2timer); + timer_delete_sync(&ax25->t3timer); + timer_delete_sync(&ax25->idletimer); } else { if (ax25->sk && !sock_flag(ax25->sk, SOCK_DESTROY)) ax25_stop_heartbeat(ax25); diff --git a/net/ax25/ax25_timer.c b/net/ax25/ax25_timer.c index 9f7cb0a7c73f..3891a3923d6c 100644 --- a/net/ax25/ax25_timer.c +++ b/net/ax25/ax25_timer.c @@ -65,7 +65,7 @@ void ax25_start_t3timer(ax25_cb *ax25) if (ax25->t3 > 0) mod_timer(&ax25->t3timer, jiffies + ax25->t3); else - del_timer(&ax25->t3timer); + timer_delete(&ax25->t3timer); } void ax25_start_idletimer(ax25_cb *ax25) @@ -73,32 +73,32 @@ void ax25_start_idletimer(ax25_cb *ax25) if (ax25->idle > 0) mod_timer(&ax25->idletimer, jiffies + ax25->idle); else - del_timer(&ax25->idletimer); + timer_delete(&ax25->idletimer); } void ax25_stop_heartbeat(ax25_cb *ax25) { - del_timer(&ax25->timer); + timer_delete(&ax25->timer); } void ax25_stop_t1timer(ax25_cb *ax25) { - del_timer(&ax25->t1timer); + timer_delete(&ax25->t1timer); } void ax25_stop_t2timer(ax25_cb *ax25) { - del_timer(&ax25->t2timer); + timer_delete(&ax25->t2timer); } void ax25_stop_t3timer(ax25_cb *ax25) { - del_timer(&ax25->t3timer); + timer_delete(&ax25->t3timer); } void ax25_stop_idletimer(ax25_cb *ax25) { - del_timer(&ax25->idletimer); + timer_delete(&ax25->idletimer); } int ax25_t1timer_running(ax25_cb *ax25) diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index 860a0786bc1e..20b316207f9a 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -9,7 +9,7 @@ config BATMAN_ADV tristate "B.A.T.M.A.N. Advanced Meshing Protocol" - select LIBCRC32C + select CRC32 help B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is a routing protocol for multi-hop ad-hoc mesh networks. 
The diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index b51d8b071b56..1cc9be6de456 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -19,6 +19,7 @@ batman-adv-y += hard-interface.o batman-adv-y += hash.o batman-adv-$(CONFIG_BATMAN_ADV_DEBUG) += log.o batman-adv-y += main.o +batman-adv-y += mesh-interface.o batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast_forw.o batman-adv-y += netlink.o @@ -26,7 +27,6 @@ batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o batman-adv-y += originator.o batman-adv-y += routing.o batman-adv-y += send.o -batman-adv-y += soft-interface.o batman-adv-$(CONFIG_BATMAN_ADV_TRACING) += trace.o batman-adv-y += tp_meter.o batman-adv-y += translation-table.o diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index 4eee53d19eb0..c0c982b6f029 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -90,15 +90,15 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops) } /** - * batadv_algo_select() - Select algorithm of soft interface - * @bat_priv: the bat priv with all the soft interface information + * batadv_algo_select() - Select algorithm of mesh interface + * @bat_priv: the bat priv with all the mesh interface information * @name: name of the algorithm to select * - * The algorithm callbacks for the soft interface will be set when the algorithm + * The algorithm callbacks for the mesh interface will be set when the algorithm * with the correct name was found. Any previous selected algorithm will not be * deinitialized and the new selected algorithm will also not be initialized. * It is therefore not allowed to call batadv_algo_select outside the creation - * function of the soft interface. + * function of the mesh interface. 
* * Return: 0 on success or negative error number in case of failure */ diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 07ae5dd1f150..458879d21d66 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -23,6 +23,7 @@ #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> +#include <linux/minmax.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/netlink.h> @@ -129,7 +130,7 @@ static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) /** * batadv_iv_ogm_orig_get() - retrieve or create (if does not exist) an * originator - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @addr: mac address of the originator * * Return: the originator object corresponding to the passed mac address or NULL @@ -325,15 +326,14 @@ batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len, /* check if there is enough space for the optional TVLV */ next_buff_pos += ntohs(ogm_packet->tvlv_len); - return (next_buff_pos <= packet_len) && - (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES); + return next_buff_pos <= packet_len; } /* send a batman ogm to a given interface */ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, struct batadv_hard_iface *hard_iface) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); const char *fwd_str; u8 packet_num; s16 buff_pos; @@ -355,7 +355,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, /* we might have aggregated direct link packets with an * ordinary base packet */ - if (forw_packet->direct_link_flags & BIT(packet_num) && + if (test_bit(packet_num, forw_packet->direct_link_flags) && forw_packet->if_incoming == hard_iface) batadv_ogm_packet->flags |= BATADV_DIRECTLINK; else @@ -396,20 +396,20 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, /* send a batman ogm packet */ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) { - struct net_device *soft_iface; + struct net_device *mesh_iface; if (!forw_packet->if_incoming) { pr_err("Error - can't forward packet: incoming iface not specified\n"); return; } - soft_iface = forw_packet->if_incoming->soft_iface; + mesh_iface = forw_packet->if_incoming->mesh_iface; if (WARN_ON(!forw_packet->if_outgoing)) return; - if (forw_packet->if_outgoing->soft_iface != soft_iface) { - pr_warn("%s: soft interface switch for queued OGM\n", __func__); + if (forw_packet->if_outgoing->mesh_iface != mesh_iface) { + pr_warn("%s: mesh interface switch for queued OGM\n", __func__); return; } @@ -424,7 +424,7 @@ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet) * batadv_iv_ogm_can_aggregate() - find out if an OGM can be aggregated on an * existing forward packet * @new_bat_ogm_packet: OGM packet to be aggregated - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @packet_len: (total) length of the OGM * @send_time: timestamp (jiffies) when the packet is to be sent * @directlink: true if this is a direct link packet @@ -444,28 +444,37 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, const struct batadv_forw_packet *forw_packet) { struct batadv_ogm_packet *batadv_ogm_packet; - int aggregated_bytes = forw_packet->packet_len + packet_len; + unsigned int 
aggregated_bytes = forw_packet->packet_len + packet_len; struct batadv_hard_iface *primary_if = NULL; + u8 packet_num = forw_packet->num_packets; bool res = false; unsigned long aggregation_end_time; + unsigned int max_bytes; batadv_ogm_packet = (struct batadv_ogm_packet *)forw_packet->skb->data; aggregation_end_time = send_time; aggregation_end_time += msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS); + max_bytes = min_t(unsigned int, if_outgoing->net_dev->mtu, + BATADV_MAX_AGGREGATION_BYTES); + /* we can aggregate the current packet to this aggregated packet * if: * * - the send time is within our MAX_AGGREGATION_MS time * - the resulting packet won't be bigger than - * MAX_AGGREGATION_BYTES + * MAX_AGGREGATION_BYTES and MTU of the outgoing interface + * - the number of packets is lower than MAX_AGGREGATION_PACKETS * otherwise aggregation is not possible */ if (!time_before(send_time, forw_packet->send_time) || !time_after_eq(aggregation_end_time, forw_packet->send_time)) return false; - if (aggregated_bytes > BATADV_MAX_AGGREGATION_BYTES) + if (aggregated_bytes > max_bytes) + return false; + + if (packet_num >= BATADV_MAX_AGGREGATION_PACKETS) return false; /* packet is not leaving on the same interface. */ @@ -540,16 +549,16 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, struct batadv_hard_iface *if_outgoing, int own_packet) { - struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct batadv_forw_packet *forw_packet_aggr; struct sk_buff *skb; unsigned char *skb_buff; unsigned int skb_size; atomic_t *queue_left = own_packet ? NULL : &bat_priv->batman_queue_left; - if (atomic_read(&bat_priv->aggregated_ogms) && - packet_len < BATADV_MAX_AGGREGATION_BYTES) - skb_size = BATADV_MAX_AGGREGATION_BYTES; + if (atomic_read(&bat_priv->aggregated_ogms)) + skb_size = max_t(unsigned int, BATADV_MAX_AGGREGATION_BYTES, + packet_len); else skb_size = packet_len; @@ -574,12 +583,13 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, memcpy(skb_buff, packet_buff, packet_len); forw_packet_aggr->own = own_packet; - forw_packet_aggr->direct_link_flags = BATADV_NO_FLAGS; + bitmap_zero(forw_packet_aggr->direct_link_flags, + BATADV_MAX_AGGREGATION_PACKETS); forw_packet_aggr->send_time = send_time; /* save packet direct link flag status */ if (direct_link) - forw_packet_aggr->direct_link_flags |= 1; + set_bit(0, forw_packet_aggr->direct_link_flags); INIT_DELAYED_WORK(&forw_packet_aggr->delayed_work, batadv_iv_send_outstanding_bat_ogm_packet); @@ -592,22 +602,20 @@ static void batadv_iv_ogm_aggregate(struct batadv_forw_packet *forw_packet_aggr, const unsigned char *packet_buff, int packet_len, bool direct_link) { - unsigned long new_direct_link_flag; - skb_put_data(forw_packet_aggr->skb, packet_buff, packet_len); forw_packet_aggr->packet_len += packet_len; - forw_packet_aggr->num_packets++; /* save packet direct link flag status */ - if (direct_link) { - new_direct_link_flag = BIT(forw_packet_aggr->num_packets); - forw_packet_aggr->direct_link_flags |= new_direct_link_flag; - } + if (direct_link) + set_bit(forw_packet_aggr->num_packets, + forw_packet_aggr->direct_link_flags); + + forw_packet_aggr->num_packets++; } /** * batadv_iv_ogm_queue_add() - queue up an OGM for transmission - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @packet_buff: pointer to the OGM * @packet_len: (total) 
length of the OGM * @if_incoming: interface where the packet was received @@ -686,7 +694,7 @@ static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { - struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); u16 tvlv_len; if (batadv_ogm_packet->ttl <= 1) { @@ -739,7 +747,7 @@ static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node, static void batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; struct batadv_orig_node *orig_node; @@ -778,7 +786,7 @@ batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) */ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); unsigned char **ogm_buff = &hard_iface->bat_iv.ogm_buff; struct batadv_ogm_packet *batadv_ogm_packet; struct batadv_hard_iface *primary_if, *tmp_hard_iface; @@ -840,7 +848,7 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) */ rcu_read_lock(); list_for_each_entry_rcu(tmp_hard_iface, &batadv_hardif_list, list) { - if (tmp_hard_iface->soft_iface != hard_iface->soft_iface) + if (tmp_hard_iface->mesh_iface != hard_iface->mesh_iface) continue; if (!kref_get_unless_zero(&tmp_hard_iface->refcount)) @@ -901,7 +909,7 @@ static u8 batadv_iv_orig_ifinfo_sum(struct batadv_orig_node *orig_node, /** * batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an * originator - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: the orig node who originally emitted the ogm packet * @orig_ifinfo: ifinfo for the outgoing interface of the orig_node * @ethhdr: Ethernet header of the OGM @@ -1065,7 +1073,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { - struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; u8 total_count; @@ -1207,7 +1215,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, const struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { - struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct batadv_orig_node *orig_node; struct batadv_orig_ifinfo *orig_ifinfo = NULL; struct batadv_neigh_node *neigh_node; @@ -1309,7 +1317,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing) { - struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct batadv_hardif_neigh_node *hardif_neigh = NULL; struct batadv_neigh_node *router = NULL; struct batadv_neigh_node *router_router = NULL; @@ -1549,7 +1557,7 @@ 
static void batadv_iv_ogm_process_reply(struct batadv_ogm_packet *ogm_packet, static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) { - struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct batadv_orig_node *orig_neigh_node, *orig_node; struct batadv_hard_iface *hard_iface; struct batadv_ogm_packet *ogm_packet; @@ -1599,7 +1607,7 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; - if (hard_iface->soft_iface != if_incoming->soft_iface) + if (hard_iface->mesh_iface != if_incoming->mesh_iface) continue; if (batadv_compare_eth(ethhdr->h_source, @@ -1664,7 +1672,7 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; - if (hard_iface->soft_iface != bat_priv->soft_iface) + if (hard_iface->mesh_iface != bat_priv->mesh_iface) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) @@ -1690,7 +1698,7 @@ static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work) delayed_work = to_delayed_work(work); forw_packet = container_of(delayed_work, struct batadv_forw_packet, delayed_work); - bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); + bat_priv = netdev_priv(forw_packet->if_incoming->mesh_iface); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) { dropped = true; @@ -1721,7 +1729,7 @@ out: static int batadv_iv_ogm_receive(struct sk_buff *skb, struct batadv_hard_iface *if_incoming) { - struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct batadv_ogm_packet *ogm_packet; u8 *packet_pos; int ogm_offset; @@ -1800,7 +1808,7 @@ batadv_iv_ogm_neigh_get_tq_avg(struct batadv_neigh_node *neigh_node, * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @neigh_node: Single hops neighbour @@ -1863,7 +1871,7 @@ batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @sub_s: Number of sub entries to skip @@ -1925,7 +1933,7 @@ batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @head: Bucket to be dumped * @idx_s: Number of entries to be skipped @@ -1966,7 +1974,7 @@ batadv_iv_ogm_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, * batadv_iv_ogm_orig_dump() - Dump the originators into a message * @msg: Netlink message to dump into * @cb: 
Control block containing additional options - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @if_outgoing: Limit dump to entries with this outgoing interface */ static void @@ -2088,7 +2096,7 @@ batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @hard_iface: Hard interface to dump the neighbours for * @idx_s: Number of entries to skip * @@ -2125,7 +2133,7 @@ batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, * batadv_iv_ogm_neigh_dump() - Dump the neighbours into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @single_hardif: Limit dump to this hard interface */ static void @@ -2152,7 +2160,7 @@ batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, } else { list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->soft_iface != bat_priv->soft_iface) + if (hard_iface->mesh_iface != bat_priv->mesh_iface) continue; if (i_hardif++ < i_hardif_s) @@ -2236,7 +2244,7 @@ static void batadv_iv_iface_enabled(struct batadv_hard_iface *hard_iface) /** * batadv_iv_init_sel_class() - initialize GW selection class - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_iv_init_sel_class(struct batadv_priv *bat_priv) { @@ -2391,7 +2399,7 @@ out: * @msg: Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @gw_node: Gateway to be dumped * * Return: Error code, or 0 on success @@ -2466,7 +2474,7 @@ out: * batadv_iv_gw_dump() - Dump gateways into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information */ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv) diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index ac11f1f08db0..c16c2e60889d 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -43,7 +43,7 @@ static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); struct batadv_hard_iface *primary_if; primary_if = batadv_primary_if_get_selected(bat_priv); @@ -97,7 +97,7 @@ static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface) */ static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); struct batadv_hard_iface *primary_if; primary_if = batadv_primary_if_get_selected(bat_priv); @@ -113,8 +113,6 @@ static void 
batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) { ewma_throughput_init(&hardif_neigh->bat_v.throughput); - INIT_WORK(&hardif_neigh->bat_v.metric_work, - batadv_v_elp_throughput_metric_update); } /** @@ -168,7 +166,7 @@ batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq, * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @hard_iface: The hard interface to be dumped * @idx_s: Entries to be skipped * @@ -205,7 +203,7 @@ batadv_v_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq, * message * @msg: Netlink message to dump into * @cb: Control block containing additional options - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @single_hardif: Limit dumping to this hard interface */ static void @@ -230,7 +228,7 @@ batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, } } else { list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->soft_iface != bat_priv->soft_iface) + if (hard_iface->mesh_iface != bat_priv->mesh_iface) continue; if (i_hardif++ < i_hardif_s) @@ -256,7 +254,7 @@ batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb, * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @neigh_node: Single hops neighbour @@ -324,7 +322,7 @@ batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @orig_node: Originator to dump * @sub_s: Number of sub entries to skip @@ -376,7 +374,7 @@ batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @if_outgoing: Limit dump to entries with this outgoing interface * @head: Bucket to be dumped * @idx_s: Number of entries to be skipped @@ -416,7 +414,7 @@ batadv_v_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, * batadv_v_orig_dump() - Dump the originators into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @if_outgoing: Limit dump to entries with this outgoing interface */ static void @@ -504,7 +502,7 @@ err_ifinfo1: /** * batadv_v_init_sel_class() - initialize GW selection class - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_v_init_sel_class(struct batadv_priv 
*bat_priv) { @@ -555,7 +553,7 @@ out: /** * batadv_v_gw_get_best_gw_node() - retrieve the best GW node - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Return: the GW node having the best GW-metric, NULL if no GW is known */ @@ -592,7 +590,7 @@ next: /** * batadv_v_gw_is_eligible() - check if an originator would be selected as GW - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @curr_gw_orig: originator representing the currently selected GW * @orig_node: the originator representing the new candidate * @@ -649,7 +647,7 @@ out: * @msg: Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @gw_node: Gateway to be dumped * * Return: Error code, or 0 on success @@ -748,7 +746,7 @@ out: * batadv_v_gw_dump() - Dump gateways into a message * @msg: Netlink message to dump into * @cb: Control block containing additional options - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information */ static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb, struct batadv_priv *bat_priv) diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 1d704574e6bf..70d6778da0d7 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -18,6 +18,7 @@ #include <linux/if_ether.h> #include <linux/jiffies.h> #include <linux/kref.h> +#include <linux/list.h> #include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/nl80211.h> @@ -26,6 +27,7 @@ #include <linux/rcupdate.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> +#include <linux/slab.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> @@ -42,6 +44,18 @@ #include "send.h" /** + * struct batadv_v_metric_queue_entry - list of hardif neighbors which require + * a metric update + */ +struct batadv_v_metric_queue_entry { + /** @hardif_neigh: hardif neighbor scheduled for metric update */ + struct batadv_hardif_neigh_node *hardif_neigh; + + /** @list: list node for metric_queue */ + struct list_head list; +}; + +/** * batadv_v_elp_start_timer() - restart timer for ELP periodic work * @hard_iface: the interface for which the timer has to be reset */ @@ -59,25 +73,36 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) /** * batadv_v_elp_get_throughput() - get the throughput towards a neighbour * @neigh: the neighbour for which the throughput has to be obtained + * @pthroughput: calculated throughput towards the given neighbour in multiples + * of 100kbps (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc). - * Return: The throughput towards the given neighbour in multiples of 100kbps - * (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc).
+ * Return: true when value behind @pthroughput was set */ -static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) +static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh, + u32 *pthroughput) { struct batadv_hard_iface *hard_iface = neigh->if_incoming; + struct net_device *mesh_iface = hard_iface->mesh_iface; struct ethtool_link_ksettings link_settings; struct net_device *real_netdev; struct station_info sinfo; u32 throughput; int ret; + /* don't query throughput when no longer associated with any + * batman-adv interface + */ + if (!mesh_iface) + return false; + /* if the user specified a customised value for this interface, then * return it directly */ throughput = atomic_read(&hard_iface->bat_v.throughput_override); - if (throughput != 0) - return throughput; + if (throughput != 0) { + *pthroughput = throughput; + return true; + } /* if this is a wireless device, then ask its throughput through * cfg80211 API @@ -104,27 +129,39 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) * possible to delete this neighbor. For now set * the throughput metric to 0. */ - return 0; + *pthroughput = 0; + return true; } if (ret) goto default_throughput; - if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) - return sinfo.expected_throughput / 100; + if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) { + *pthroughput = sinfo.expected_throughput / 100; + return true; + } /* try to estimate the expected throughput based on reported tx * rates */ - if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) - return cfg80211_calculate_bitrate(&sinfo.txrate) / 3; + if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) { + *pthroughput = cfg80211_calculate_bitrate(&sinfo.txrate) / 3; + return true; + } goto default_throughput; } + /* only use rtnl_trylock because the elp worker will be cancelled while + * the rtnl_lock is held. the cancel_delayed_work_sync() would otherwise + * wait forever when the elp work_item was started and it is then also + * trying to rtnl_lock + */ + if (!rtnl_trylock()) + return false; + /* if not a wifi interface, check if this device provides data via * ethtool (e.g. an Ethernet adapter) */ - rtnl_lock(); ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings); rtnl_unlock(); if (ret == 0) { @@ -135,13 +172,15 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX; throughput = link_settings.base.speed; - if (throughput && throughput != SPEED_UNKNOWN) - return throughput * 10; + if (throughput && throughput != SPEED_UNKNOWN) { + *pthroughput = throughput * 10; + return true; + } } default_throughput: if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT)) { - batadv_info(hard_iface->soft_iface, + batadv_info(mesh_iface, "WiFi driver or ethtool info does not provide information about link speeds on interface %s, therefore defaulting to hardcoded throughput values of %u.%1u Mbps.
Consider overriding the throughput manually or checking your driver.\n", hard_iface->net_dev->name, BATADV_THROUGHPUT_DEFAULT_VALUE / 10, @@ -150,31 +189,26 @@ default_throughput: } /* if none of the above cases apply, return the base_throughput */ - return BATADV_THROUGHPUT_DEFAULT_VALUE; + *pthroughput = BATADV_THROUGHPUT_DEFAULT_VALUE; + return true; } /** * batadv_v_elp_throughput_metric_update() - worker updating the throughput * metric of a single hop neighbour - * @work: the work queue item + * @neigh: the neighbour to probe */ -void batadv_v_elp_throughput_metric_update(struct work_struct *work) +static void +batadv_v_elp_throughput_metric_update(struct batadv_hardif_neigh_node *neigh) { - struct batadv_hardif_neigh_node_bat_v *neigh_bat_v; - struct batadv_hardif_neigh_node *neigh; - - neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v, - metric_work); - neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node, - bat_v); + u32 throughput; + bool valid; - ewma_throughput_add(&neigh->bat_v.throughput, - batadv_v_elp_get_throughput(neigh)); + valid = batadv_v_elp_get_throughput(neigh, &throughput); + if (!valid) + return; - /* decrement refcounter to balance increment performed before scheduling - * this task - */ - batadv_hardif_neigh_put(neigh); + ewma_throughput_add(&neigh->bat_v.throughput, throughput); } /** @@ -192,7 +226,7 @@ static bool batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) { struct batadv_hard_iface *hard_iface = neigh->if_incoming; - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); unsigned long last_tx_diff; struct sk_buff *skb; int probe_len, i; @@ -248,18 +282,20 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) */ static void batadv_v_elp_periodic_work(struct work_struct *work) { + struct batadv_v_metric_queue_entry *metric_entry; + struct batadv_v_metric_queue_entry *metric_safe; struct batadv_hardif_neigh_node *hardif_neigh; struct batadv_hard_iface *hard_iface; struct batadv_hard_iface_bat_v *bat_v; struct batadv_elp_packet *elp_packet; + struct list_head metric_queue; struct batadv_priv *bat_priv; struct sk_buff *skb; u32 elp_interval; - bool ret; bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work); hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v); - bat_priv = netdev_priv(hard_iface->soft_iface); + bat_priv = netdev_priv(hard_iface->mesh_iface); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) goto out; @@ -291,6 +327,8 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) atomic_inc(&hard_iface->bat_v.elp_seqno); + INIT_LIST_HEAD(&metric_queue); + /* The throughput metric is updated on each sent packet. This way, if a * node is dead and no longer sends packets, batman-adv is still able to * react timely to its death. @@ -315,16 +353,28 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) /* Reading the estimated throughput from cfg80211 is a task that * may sleep and that is not allowed in an rcu protected - * context. Therefore schedule a task for that. + * context. Therefore add it to metric_queue and process it + * outside rcu protected context. 
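A condensed sketch of the pattern this hunk adopts (not part of the patch; the traversal of hard_iface->neigh_list is abbreviated, everything else mirrors calls visible in this diff): take kref'd references under rcu_read_lock(), park them on a local list with a GFP_ATOMIC allocation, and run the metric update, which may sleep, only after rcu_read_unlock().

	struct batadv_v_metric_queue_entry *entry, *tmp;
	struct batadv_hardif_neigh_node *hardif_neigh;
	LIST_HEAD(metric_queue);
	u32 throughput;

	rcu_read_lock();
	hlist_for_each_entry_rcu(hardif_neigh, &hard_iface->neigh_list, list) {
		if (!kref_get_unless_zero(&hardif_neigh->refcount))
			continue;

		/* GFP_ATOMIC: no sleeping inside the RCU read-side section */
		entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
		if (!entry) {
			batadv_hardif_neigh_put(hardif_neigh);
			continue;
		}

		entry->hardif_neigh = hardif_neigh;
		list_add(&entry->list, &metric_queue);
	}
	rcu_read_unlock();

	/* sleeping is allowed again: query cfg80211/ethtool, feed the EWMA */
	list_for_each_entry_safe(entry, tmp, &metric_queue, list) {
		if (batadv_v_elp_get_throughput(entry->hardif_neigh,
						&throughput))
			ewma_throughput_add(&entry->hardif_neigh->bat_v.throughput,
					    throughput);

		batadv_hardif_neigh_put(entry->hardif_neigh);
		list_del(&entry->list);
		kfree(entry);
	}

This also removes the need for the per-neighbour metric_work item, which is why the INIT_WORK() call and the workqueue entry point disappear in the surrounding hunks.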
*/ - ret = queue_work(batadv_event_workqueue, - &hardif_neigh->bat_v.metric_work); - - if (!ret) + metric_entry = kzalloc(sizeof(*metric_entry), GFP_ATOMIC); + if (!metric_entry) { batadv_hardif_neigh_put(hardif_neigh); + continue; + } + + metric_entry->hardif_neigh = hardif_neigh; + list_add(&metric_entry->list, &metric_queue); } rcu_read_unlock(); + list_for_each_entry_safe(metric_entry, metric_safe, &metric_queue, list) { + batadv_v_elp_throughput_metric_update(metric_entry->hardif_neigh); + + batadv_hardif_neigh_put(metric_entry->hardif_neigh); + list_del(&metric_entry->list); + kfree(metric_entry); + } + restart_timer: batadv_v_elp_start_timer(hard_iface); out: @@ -426,7 +476,7 @@ void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface) /* update orig field of every elp iface belonging to this mesh */ rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (primary_iface->soft_iface != hard_iface->soft_iface) + if (primary_iface->mesh_iface != hard_iface->mesh_iface) continue; batadv_v_elp_iface_activate(primary_iface, hard_iface); @@ -436,7 +486,7 @@ void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface) /** * batadv_v_elp_neigh_update() - update an ELP neighbour node - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @neigh_addr: the neighbour interface address * @if_incoming: the interface the packet was received through * @elp_packet: the received ELP packet @@ -502,7 +552,7 @@ orig_free: int batadv_v_elp_packet_recv(struct sk_buff *skb, struct batadv_hard_iface *if_incoming) { - struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct batadv_elp_packet *elp_packet; struct batadv_hard_iface *primary_if; struct ethhdr *ethhdr; diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index 9e2740195fa2..c9cb0a307100 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -10,7 +10,6 @@ #include "main.h" #include <linux/skbuff.h> -#include <linux/workqueue.h> int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface); void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface); @@ -19,6 +18,5 @@ void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface, void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface); int batadv_v_elp_packet_recv(struct sk_buff *skb, struct batadv_hard_iface *if_incoming); -void batadv_v_elp_throughput_metric_update(struct work_struct *work); #endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */ diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index e503ee0d896b..b86bb647da5b 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -45,7 +45,7 @@ /** * batadv_v_ogm_orig_get() - retrieve and possibly create an originator node - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @addr: the address of the originator * * Return: the orig_node corresponding to the specified address. 
If such an @@ -96,7 +96,7 @@ static void batadv_v_ogm_start_queue_timer(struct batadv_hard_iface *hard_iface) /** * batadv_v_ogm_start_timer() - restart the OGM sending timer - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) { @@ -121,7 +121,7 @@ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) static void batadv_v_ogm_send_to_if(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); if (hard_iface->if_status != BATADV_IF_ACTIVE) { kfree_skb(skb); @@ -239,7 +239,7 @@ static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface) static void batadv_v_ogm_queue_on_if(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); if (!atomic_read(&bat_priv->aggregated_ogms)) { batadv_v_ogm_send_to_if(skb, hard_iface); @@ -256,10 +256,10 @@ static void batadv_v_ogm_queue_on_if(struct sk_buff *skb, } /** - * batadv_v_ogm_send_softif() - periodic worker broadcasting the own OGM - * @bat_priv: the bat priv with all the soft interface information + * batadv_v_ogm_send_meshif() - periodic worker broadcasting the own OGM + * @bat_priv: the bat priv with all the mesh interface information */ -static void batadv_v_ogm_send_softif(struct batadv_priv *bat_priv) +static void batadv_v_ogm_send_meshif(struct batadv_priv *bat_priv) { struct batadv_hard_iface *hard_iface; struct batadv_ogm2_packet *ogm_packet; @@ -302,7 +302,7 @@ static void batadv_v_ogm_send_softif(struct batadv_priv *bat_priv) /* broadcast on every interface */ rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->soft_iface != bat_priv->soft_iface) + if (hard_iface->mesh_iface != bat_priv->mesh_iface) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) @@ -373,7 +373,7 @@ static void batadv_v_ogm_send(struct work_struct *work) bat_priv = container_of(bat_v, struct batadv_priv, bat_v); mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); - batadv_v_ogm_send_softif(bat_priv); + batadv_v_ogm_send_meshif(bat_priv); mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex); } @@ -408,7 +408,7 @@ void batadv_v_ogm_aggr_work(struct work_struct *work) */ int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); batadv_v_ogm_start_queue_timer(hard_iface); batadv_v_ogm_start_timer(bat_priv); @@ -435,7 +435,7 @@ void batadv_v_ogm_iface_disable(struct batadv_hard_iface *hard_iface) */ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface) { - struct batadv_priv *bat_priv = netdev_priv(primary_iface->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(primary_iface->mesh_iface); struct batadv_ogm2_packet *ogm_packet; mutex_lock(&bat_priv->bat_v.ogm_buff_mutex); @@ -452,7 +452,7 @@ unlock: /** * batadv_v_forward_penalty() - apply a penalty to the throughput metric * forwarded with B.A.T.M.A.N. 
V OGMs - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @if_incoming: the interface where the OGM has been received * @if_outgoing: the interface where the OGM has to be forwarded to * @throughput: the current throughput @@ -505,7 +505,7 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, /** * batadv_v_ogm_forward() - check conditions and forward an OGM to the given * outgoing interface - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @ogm_received: previously received OGM to be forwarded * @orig_node: the originator which has been updated * @neigh_node: the neigh_node through which the OGM has been received @@ -592,7 +592,7 @@ out: /** * batadv_v_ogm_metric_update() - update route metric based on OGM - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @ogm2: OGM2 structure * @orig_node: Originator structure for which the OGM has been received * @neigh_node: the neigh_node through which the OGM has been received @@ -675,7 +675,7 @@ out: /** * batadv_v_ogm_route_update() - update routes based on OGM - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @ethhdr: the Ethernet header of the OGM2 * @ogm2: OGM2 structure * @orig_node: Originator structure for which the OGM has been received @@ -770,7 +770,7 @@ out: /** * batadv_v_ogm_process_per_outif() - process a batman v OGM for an outgoing if - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @ethhdr: the Ethernet header of the OGM2 * @ogm2: OGM2 structure * @orig_node: Originator structure for which the OGM has been received @@ -839,8 +839,7 @@ batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, /* check if there is enough space for the optional TVLV */ next_buff_pos += ntohs(ogm2_packet->tvlv_len); - return (next_buff_pos <= packet_len) && - (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES); + return next_buff_pos <= packet_len; } /** @@ -852,7 +851,7 @@ batadv_v_ogm_aggr_packet(int buff_pos, int packet_len, static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_incoming) { - struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct ethhdr *ethhdr; struct batadv_orig_node *orig_node = NULL; struct batadv_hardif_neigh_node *hardif_neigh = NULL; @@ -926,7 +925,7 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; - if (hard_iface->soft_iface != bat_priv->soft_iface) + if (hard_iface->mesh_iface != bat_priv->mesh_iface) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) @@ -985,7 +984,7 @@ out: int batadv_v_ogm_packet_recv(struct sk_buff *skb, struct batadv_hard_iface *if_incoming) { - struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface); struct batadv_ogm2_packet *ogm_packet; struct ethhdr *ethhdr; int ogm_offset; @@ -1036,7 +1035,7 @@ free_skb: /** * batadv_v_ogm_init() - initialise the OGM2 engine - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the
bat priv with all the mesh interface information * * Return: 0 on success or a negative error code in case of failure */ @@ -1071,7 +1070,7 @@ int batadv_v_ogm_init(struct batadv_priv *bat_priv) /** * batadv_v_ogm_free() - free OGM private resources - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ void batadv_v_ogm_free(struct batadv_priv *bat_priv) { diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index 649c41f393e1..2c49b2711650 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -23,7 +23,7 @@ static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n) /** * batadv_bit_get_packet() - receive and process one packet within the sequence * number window - * @priv: the bat priv with all the soft interface information + * @priv: the bat priv with all the mesh interface information * @seq_bits: pointer to the sequence number receive packet * @seq_num_diff: difference between the current/received sequence number and * the last sequence number diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 449faf5a5487..747755647c6a 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -12,6 +12,7 @@ #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/crc16.h> +#include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> @@ -38,7 +39,6 @@ #include <net/arp.h> #include <net/genetlink.h> #include <net/netlink.h> -#include <net/sock.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> @@ -47,7 +47,6 @@ #include "log.h" #include "netlink.h" #include "originator.h" -#include "soft-interface.h" #include "translation-table.h" static const u8 batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05}; @@ -210,7 +209,7 @@ static void batadv_claim_put(struct batadv_bla_claim *claim) /** * batadv_claim_hash_find() - looks for a claim in the claim hash - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @data: search data (may be local/static data) * * Return: claim if found or NULL otherwise. @@ -249,7 +248,7 @@ batadv_claim_hash_find(struct batadv_priv *bat_priv, /** * batadv_backbone_hash_find() - looks for a backbone gateway in the hash - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @addr: the address of the originator * @vid: the VLAN ID * @@ -333,7 +332,7 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) /** * batadv_bla_send_claim() - sends a claim frame according to the provided info - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @mac: the mac address to be announced within the claim * @vid: the VLAN ID * @claimtype: the type of the claim (CLAIM, UNCLAIM, ANNOUNCE, ...) 
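An aside on the claim wire format (hedged; the struct below is quoted from memory of include/uapi/linux/batadv_packet.h, the header is authoritative): @claimtype is not sent as payload but encoded into the destination MAC address of the generated ARP frame, so receivers can recognise and demultiplex claim frames by MAC prefix alone.

	struct batadv_bla_claim_dst {
		__u8   magic[3];	/* magic claim prefix */
		__u8   type;		/* one of the bla_claimframe types */
		__be16 group;		/* claim group id */
	};

The hunk that follows stamps local_claim_dest.type = claimtype before the structure is copied over the frame's destination address.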
@@ -344,7 +343,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, const u8 *mac, struct sk_buff *skb; struct ethhdr *ethhdr; struct batadv_hard_iface *primary_if; - struct net_device *soft_iface; + struct net_device *mesh_iface; u8 *hw_src; struct batadv_bla_claim_dst local_claim_dest; __be32 zeroip = 0; @@ -357,12 +356,12 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, const u8 *mac, sizeof(local_claim_dest)); local_claim_dest.type = claimtype; - soft_iface = primary_if->soft_iface; + mesh_iface = primary_if->mesh_iface; skb = arp_create(ARPOP_REPLY, ETH_P_ARP, /* IP DST: 0.0.0.0 */ zeroip, - primary_if->soft_iface, + primary_if->mesh_iface, /* IP SRC: 0.0.0.0 */ zeroip, /* Ethernet DST: Broadcast */ @@ -440,7 +439,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, const u8 *mac, } skb_reset_mac_header(skb); - skb->protocol = eth_type_trans(skb, soft_iface); + skb->protocol = eth_type_trans(skb, mesh_iface); batadv_inc_counter(bat_priv, BATADV_CNT_RX); batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, skb->len + ETH_HLEN); @@ -467,7 +466,7 @@ static void batadv_bla_loopdetect_report(struct work_struct *work) report_work); bat_priv = backbone_gw->bat_priv; - batadv_info(bat_priv->soft_iface, + batadv_info(bat_priv->mesh_iface, "Possible loop on VLAN %d detected which can't be handled by BLA - please check your network setup!\n", batadv_print_vid(backbone_gw->vid)); snprintf(vid_str, sizeof(vid_str), "%d", @@ -482,7 +481,7 @@ static void batadv_bla_loopdetect_report(struct work_struct *work) /** * batadv_bla_get_backbone_gw() - finds or creates a backbone gateway - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig: the mac address of the originator * @vid: the VLAN ID * @own_backbone: set if the requested backbone is local @@ -555,7 +554,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, const u8 *orig, /** * batadv_bla_update_own_backbone_gw() - updates the own backbone gw for a VLAN - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the selected primary interface * @vid: VLAN identifier * @@ -581,7 +580,7 @@ batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv, /** * batadv_bla_answer_request() - answer a bla request by sending own claims - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @primary_if: interface where the request came on * @vid: the vid where the request came on * @@ -658,7 +657,7 @@ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw) /** * batadv_bla_send_announce() - Send an announcement frame - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @backbone_gw: our backbone gateway which should be announced */ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, @@ -679,7 +678,7 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, /** * batadv_bla_add_claim() - Adds a claim in the claim hash - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @mac: the mac address of the claim * @vid: the VLAN ID of the frame * @backbone_gw: the backbone gateway which claims it @@ -789,7 +788,7 @@ 
batadv_bla_claim_get_backbone_gw(struct batadv_bla_claim *claim) /** * batadv_bla_del_claim() - delete a claim from the claim hash - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @mac: mac address of the claim to be removed * @vid: VLAN id for the claim to be removed */ @@ -827,7 +826,7 @@ free_claim: /** * batadv_handle_announce() - check for ANNOUNCE frame - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @an_addr: announcement mac address (ARP Sender HW address) * @backbone_addr: originator address of the sender (Ethernet source MAC) * @vid: the VLAN ID of the frame @@ -885,8 +884,8 @@ static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, /** * batadv_handle_request() - check for REQUEST frame - * @bat_priv: the bat priv with all the soft interface information - * @primary_if: the primary hard interface of this batman soft interface + * @bat_priv: the bat priv with all the mesh interface information + * @primary_if: the primary hard interface of this batman mesh interface * @backbone_addr: backbone address to be requested (ARP sender HW MAC) * @ethhdr: ethernet header of a packet * @vid: the VLAN ID of the frame @@ -918,8 +917,8 @@ static bool batadv_handle_request(struct batadv_priv *bat_priv, /** * batadv_handle_unclaim() - check for UNCLAIM frame - * @bat_priv: the bat priv with all the soft interface information - * @primary_if: the primary hard interface of this batman soft interface + * @bat_priv: the bat priv with all the mesh interface information + * @primary_if: the primary hard interface of this batman mesh interface * @backbone_addr: originator address of the backbone (Ethernet source) * @claim_addr: Client to be unclaimed (ARP sender HW MAC) * @vid: the VLAN ID of the frame @@ -956,8 +955,8 @@ static bool batadv_handle_unclaim(struct batadv_priv *bat_priv, /** * batadv_handle_claim() - check for CLAIM frame - * @bat_priv: the bat priv with all the soft interface information - * @primary_if: the primary hard interface of this batman soft interface + * @bat_priv: the bat priv with all the mesh interface information + * @primary_if: the primary hard interface of this batman mesh interface * @backbone_addr: originator address of the backbone (Ethernet Source) * @claim_addr: client mac address to be claimed (ARP sender HW MAC) * @vid: the VLAN ID of the frame @@ -993,7 +992,7 @@ static bool batadv_handle_claim(struct batadv_priv *bat_priv, /** * batadv_check_claim_group() - check for claim group membership - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the primary interface of this batman interface * @hw_src: the Hardware source in the ARP Header * @hw_dst: the Hardware destination in the ARP Header @@ -1068,8 +1067,8 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv, /** * batadv_bla_process_claim() - Check if this is a claim frame, and process it - * @bat_priv: the bat priv with all the soft interface information - * @primary_if: the primary hard interface of this batman soft interface + * @bat_priv: the bat priv with all the mesh interface information + * @primary_if: the primary hard interface of this batman mesh interface * @skb: the frame to be checked * * Return: true if it was a claim frame, otherwise return false to @@ -1211,7 +1210,7 @@ static bool 
batadv_bla_process_claim(struct batadv_priv *bat_priv, /** * batadv_bla_purge_backbone_gw() - Remove backbone gateways after a timeout or * immediately - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @now: whether the whole hash shall be wiped now * * Check when we last heard from other nodes, and remove them in case of @@ -1263,7 +1262,7 @@ purge_now: /** * batadv_bla_purge_claims() - Remove claims after a timeout or immediately - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the selected primary interface, may be NULL if now is set * @now: whether the whole hash shall be wiped now * @@ -1322,7 +1321,7 @@ skip: /** * batadv_bla_update_orig_address() - Update the backbone gateways when the own * originator address changes - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the new selected primary_if * @oldif: the old primary interface, may be NULL */ @@ -1377,7 +1376,7 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, /** * batadv_bla_send_loopdetect() - send a loopdetect frame - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @backbone_gw: the backbone gateway for which a loop should be detected * * To detect loops that the bridge loop avoidance can't handle, send a loop @@ -1397,7 +1396,7 @@ batadv_bla_send_loopdetect(struct batadv_priv *bat_priv, /** * batadv_bla_status_update() - purge bla interfaces if necessary - * @net_dev: the soft interface net device + * @net_dev: the mesh interface net device */ void batadv_bla_status_update(struct net_device *net_dev) { @@ -1521,7 +1520,7 @@ static struct lock_class_key batadv_backbone_hash_lock_class_key; /** * batadv_bla_init() - initialize all bla structures - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success, < 0 on error. */ @@ -1587,7 +1586,7 @@ int batadv_bla_init(struct batadv_priv *bat_priv) /** * batadv_bla_check_duplist() - Check if a frame is in the broadcast dup. - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: contains the multicast packet to be checked * @payload_ptr: pointer to position inside the head buffer of the skb * marking the start of the data to be CRC'ed @@ -1681,7 +1680,7 @@ out: /** * batadv_bla_check_ucast_duplist() - Check if a frame is in the broadcast dup. - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: contains the multicast packet to be checked, decapsulated from a * unicast_packet * @@ -1699,7 +1698,7 @@ static bool batadv_bla_check_ucast_duplist(struct batadv_priv *bat_priv, /** * batadv_bla_check_bcast_duplist() - Check if a frame is in the broadcast dup. - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: contains the bcast_packet to be checked * * Check if it is on our broadcast list. 
Another gateway might have sent the @@ -1724,7 +1723,7 @@ bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, /** * batadv_bla_is_backbone_gw_orig() - Check if the originator is a gateway for * the VLAN identified by vid. - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig: originator mac address * @vid: VLAN identifier * @@ -1767,7 +1766,7 @@ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, * @orig_node: the orig_node of the frame * @hdr_size: maximum length of the frame * - * Return: true if the orig_node is also a gateway on the soft interface, + * Return: true if the orig_node is also a gateway on the mesh interface, * otherwise it returns false. */ bool batadv_bla_is_backbone_gw(struct sk_buff *skb, @@ -1797,9 +1796,9 @@ bool batadv_bla_is_backbone_gw(struct sk_buff *skb, /** * batadv_bla_free() - free all bla structures - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * - * for softinterface free or module unload + * for meshinterface free or module unload */ void batadv_bla_free(struct batadv_priv *bat_priv) { @@ -1823,7 +1822,7 @@ void batadv_bla_free(struct batadv_priv *bat_priv) /** * batadv_bla_loopdetect_check() - check and handle a detected loop - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the packet to check * @primary_if: interface where the request came on * @vid: the VLAN ID of the frame @@ -1878,7 +1877,7 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, /** * batadv_bla_rx() - check packets coming from the mesh. 
- * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame * @packet_type: the batman packet type this frame came in @@ -2011,7 +2010,7 @@ out: /** * batadv_bla_tx() - check packets going into the mesh - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame * @@ -2233,27 +2232,18 @@ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; int portid = NETLINK_CB(cb->skb).portid; - struct net *net = sock_net(cb->skb->sk); - struct net_device *soft_iface; + struct net_device *mesh_iface; struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; int idx = cb->args[1]; - int ifindex; int ret = 0; - ifindex = batadv_netlink_get_ifindex(cb->nlh, - BATADV_ATTR_MESH_IFINDEX); - if (!ifindex) - return -EINVAL; + mesh_iface = batadv_netlink_get_meshif(cb); + if (IS_ERR(mesh_iface)) + return PTR_ERR(mesh_iface); - soft_iface = dev_get_by_index(net, ifindex); - if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { - ret = -ENODEV; - goto out; - } - - bat_priv = netdev_priv(soft_iface); + bat_priv = netdev_priv(mesh_iface); hash = bat_priv->bla.claim_hash; primary_if = batadv_primary_if_get_selected(bat_priv); @@ -2277,7 +2267,7 @@ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) out: batadv_hardif_put(primary_if); - dev_put(soft_iface); + dev_put(mesh_iface); return ret; } @@ -2403,27 +2393,18 @@ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; int portid = NETLINK_CB(cb->skb).portid; - struct net *net = sock_net(cb->skb->sk); - struct net_device *soft_iface; + struct net_device *mesh_iface; struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; int idx = cb->args[1]; - int ifindex; int ret = 0; - ifindex = batadv_netlink_get_ifindex(cb->nlh, - BATADV_ATTR_MESH_IFINDEX); - if (!ifindex) - return -EINVAL; - - soft_iface = dev_get_by_index(net, ifindex); - if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { - ret = -ENODEV; - goto out; - } + mesh_iface = batadv_netlink_get_meshif(cb); + if (IS_ERR(mesh_iface)) + return PTR_ERR(mesh_iface); - bat_priv = netdev_priv(soft_iface); + bat_priv = netdev_priv(mesh_iface); hash = bat_priv->bla.backbone_hash; primary_if = batadv_primary_if_get_selected(bat_priv); @@ -2447,7 +2428,7 @@ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) out: batadv_hardif_put(primary_if); - dev_put(soft_iface); + dev_put(mesh_iface); return ret; } @@ -2456,7 +2437,7 @@ out: /** * batadv_bla_check_claim() - check if address is claimed * - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @addr: mac address of which the claim status is checked * @vid: the VLAN ID * diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 801eff8a40e5..8b8132eb0a79 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -7,11 +7,11 @@ #include "distributed-arp-table.h" #include "main.h" -#include <linux/unaligned.h> #include <linux/atomic.h> #include <linux/bitops.h> #include 
<linux/byteorder/generic.h> #include <linux/container_of.h> +#include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> @@ -32,11 +32,11 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/udp.h> +#include <linux/unaligned.h> #include <linux/workqueue.h> #include <net/arp.h> #include <net/genetlink.h> #include <net/netlink.h> -#include <net/sock.h> #include <uapi/linux/batman_adv.h> #include "bridge_loop_avoidance.h" @@ -46,7 +46,6 @@ #include "netlink.h" #include "originator.h" #include "send.h" -#include "soft-interface.h" #include "translation-table.h" #include "tvlv.h" @@ -97,7 +96,7 @@ static void batadv_dat_purge(struct work_struct *work); /** * batadv_dat_start_timer() - initialise the DAT periodic worker - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_dat_start_timer(struct batadv_priv *bat_priv) { @@ -146,7 +145,7 @@ static bool batadv_dat_to_purge(struct batadv_dat_entry *dat_entry) /** * __batadv_dat_purge() - delete entries from the DAT local storage - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @to_purge: function in charge of deciding whether an entry has to be purged or * not. This function takes the dat_entry as argument and has to * return a boolean value: true if the entry has to be deleted, @@ -316,7 +315,7 @@ static u32 batadv_hash_dat(const void *data, u32 size) /** * batadv_dat_entry_hash_find() - look for a given dat_entry in the local hash * table - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @ip: search key * @vid: VLAN identifier * @@ -358,7 +357,7 @@ batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip, /** * batadv_dat_entry_add() - add a new dat entry or update it if already exists - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @ip: ipv4 to add/edit * @mac_addr: mac address to assign to the given ipv4 * @vid: VLAN identifier @@ -415,7 +414,7 @@ out: /** * batadv_dbg_arp() - print a debug message containing all the ARP packet * details - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: ARP packet * @hdr_size: size of the possible header before the ARP packet * @msg: message to print together with the debugging information @@ -550,7 +549,7 @@ out: /** * batadv_choose_next_candidate() - select the next DHT candidate - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @cands: candidates array * @select: number of candidates already present in the array * @ip_key: key to look up in the DHT @@ -614,7 +613,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, /** * batadv_dat_select_candidates() - select the nodes which the DHT message has * to be sent to - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @ip_dst: ipv4 to look up in the DHT * @vid: VLAN identifier * @@ -659,7 +658,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst, /** * batadv_dat_forward_data() - copy and send payload to the selected
candidates - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: payload to send * @ip: the DHT key * @vid: VLAN identifier @@ -735,7 +734,7 @@ free_orig: /** * batadv_dat_tvlv_container_update() - update the dat tvlv container after dat * setting change - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_dat_tvlv_container_update(struct batadv_priv *bat_priv) { @@ -757,7 +756,7 @@ static void batadv_dat_tvlv_container_update(struct batadv_priv *bat_priv) /** * batadv_dat_status_update() - update the dat tvlv container after dat * setting change - * @net_dev: the soft interface net device + * @net_dev: the mesh interface net device */ void batadv_dat_status_update(struct net_device *net_dev) { @@ -768,7 +767,7 @@ void batadv_dat_status_update(struct net_device *net_dev) /** * batadv_dat_tvlv_ogm_handler_v1() - process incoming dat tvlv container - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the gateway data @@ -787,7 +786,7 @@ static void batadv_dat_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, /** * batadv_dat_hash_free() - free the local DAT hash table - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_dat_hash_free(struct batadv_priv *bat_priv) { @@ -803,7 +802,7 @@ static void batadv_dat_hash_free(struct batadv_priv *bat_priv) /** * batadv_dat_init() - initialise the DAT internals - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 in case of success, a negative error code otherwise */ @@ -829,7 +828,7 @@ int batadv_dat_init(struct batadv_priv *bat_priv) /** * batadv_dat_free() - free the DAT internals - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ void batadv_dat_free(struct batadv_priv *bat_priv) { @@ -937,27 +936,18 @@ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; int portid = NETLINK_CB(cb->skb).portid; - struct net *net = sock_net(cb->skb->sk); - struct net_device *soft_iface; + struct net_device *mesh_iface; struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; int idx = cb->args[1]; - int ifindex; int ret = 0; - ifindex = batadv_netlink_get_ifindex(cb->nlh, - BATADV_ATTR_MESH_IFINDEX); - if (!ifindex) - return -EINVAL; - - soft_iface = dev_get_by_index(net, ifindex); - if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { - ret = -ENODEV; - goto out; - } + mesh_iface = batadv_netlink_get_meshif(cb); + if (IS_ERR(mesh_iface)) + return PTR_ERR(mesh_iface); - bat_priv = netdev_priv(soft_iface); + bat_priv = netdev_priv(mesh_iface); hash = bat_priv->dat.hash; primary_if = batadv_primary_if_get_selected(bat_priv); @@ -983,14 +973,14 @@ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb) out: batadv_hardif_put(primary_if); - dev_put(soft_iface); + dev_put(mesh_iface); return ret; } /** * batadv_arp_get_type() - parse an ARP packet 
and get the type - @bat_priv: the bat priv with all the soft interface information + @bat_priv: the bat priv with all the mesh interface information * @skb: packet to analyse * @hdr_size: size of the possible header before the ARP packet in the skb * @@ -1090,7 +1080,7 @@ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size) /** * batadv_dat_arp_create_reply() - create an ARP Reply - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @ip_src: ARP sender IP * @ip_dst: ARP target IP * @hw_src: Ethernet source and ARP sender MAC @@ -1109,7 +1099,7 @@ batadv_dat_arp_create_reply(struct batadv_priv *bat_priv, __be32 ip_src, { struct sk_buff *skb; - skb = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_dst, bat_priv->soft_iface, + skb = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_dst, bat_priv->mesh_iface, ip_src, hw_dst, hw_src, hw_dst); if (!skb) return NULL; @@ -1126,7 +1116,7 @@ batadv_dat_arp_create_reply(struct batadv_priv *bat_priv, __be32 ip_src, /** * batadv_dat_snoop_outgoing_arp_request() - snoop the ARP request and try to * answer using DAT - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: packet to check * * Return: true if the message has been sent to the dht candidates, false @@ -1142,7 +1132,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, bool ret = false; struct batadv_dat_entry *dat_entry = NULL; struct sk_buff *skb_new; - struct net_device *soft_iface = bat_priv->soft_iface; + struct net_device *mesh_iface = bat_priv->mesh_iface; int hdr_size = 0; unsigned short vid; @@ -1172,7 +1162,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, * client will answer itself. DAT would only generate a * duplicate packet. * - * Moreover, if the mesh-interface is enslaved into a bridge, an + * Moreover, if the mesh-interface is enslaved into a bridge, an * additional DAT answer may trigger kernel warnings about * a packet coming from the wrong port.
*/ @@ -1201,7 +1191,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, if (!skb_new) goto out; - skb_new->protocol = eth_type_trans(skb_new, soft_iface); + skb_new->protocol = eth_type_trans(skb_new, mesh_iface); batadv_inc_counter(bat_priv, BATADV_CNT_RX); batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, @@ -1223,7 +1213,7 @@ out: /** * batadv_dat_snoop_incoming_arp_request() - snoop the ARP request and try to * answer using the local DAT storage - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: packet to check * @hdr_size: size of the encapsulation header * @@ -1291,7 +1281,7 @@ out: /** * batadv_dat_snoop_outgoing_arp_reply() - snoop the ARP reply and fill the DHT - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: packet to check */ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv, @@ -1334,7 +1324,7 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv, /** * batadv_dat_snoop_incoming_arp_reply() - snoop the ARP reply and fill the * local DAT storage only - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: packet to check * @hdr_size: size of the encapsulation header * @@ -1615,7 +1605,7 @@ static bool batadv_dat_get_dhcp_chaddr(struct sk_buff *skb, u8 *buf) /** * batadv_dat_put_dhcp() - puts addresses from a DHCP packet into the DHT and * DAT cache - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @chaddr: the DHCP client MAC address * @yiaddr: the DHCP client IP address * @hw_dst: the DHCP server MAC address @@ -1700,7 +1690,7 @@ batadv_dat_check_dhcp_ack(struct sk_buff *skb, __be16 proto, __be32 *ip_src, /** * batadv_dat_snoop_outgoing_dhcp_ack() - snoop DHCPACK and fill DAT with it - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the packet to snoop * @proto: ethernet protocol hint (behind a potential vlan) * @vid: VLAN identifier @@ -1733,7 +1723,7 @@ void batadv_dat_snoop_outgoing_dhcp_ack(struct batadv_priv *bat_priv, /** * batadv_dat_snoop_incoming_dhcp_ack() - snoop DHCPACK and fill DAT cache - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the packet to snoop * @hdr_size: header size, up to the tail of the batman-adv header * @@ -1781,7 +1771,7 @@ void batadv_dat_snoop_incoming_dhcp_ack(struct batadv_priv *bat_priv, /** * batadv_dat_drop_broadcast_packet() - check if an ARP request has to be * dropped (because the node has already obtained the reply via DAT) or not - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @forw_packet: the broadcast packet * * Return: true if the node can drop the packet, false otherwise. 
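The dump handlers touched in this file, like the BLA dumps above and the gateway dump further below, all converge on the same prologue: batadv_netlink_get_meshif() resolves BATADV_ATTR_MESH_IFINDEX from the netlink callback, validates the device, and returns it with a reference held, or an ERR_PTR() to propagate. A hedged sketch of the resulting shape (batadv_foo_dump and the commented table walk are placeholders, everything else follows the calls visible in this diff):

	static int batadv_foo_dump(struct sk_buff *msg,
				   struct netlink_callback *cb)
	{
		struct net_device *mesh_iface;
		struct batadv_priv *bat_priv;
		int ret = 0;

		/* resolves the mesh ifindex and takes a device reference */
		mesh_iface = batadv_netlink_get_meshif(cb);
		if (IS_ERR(mesh_iface))
			return PTR_ERR(mesh_iface);

		bat_priv = netdev_priv(mesh_iface);

		/* ... walk the per-mesh state and fill msg ... */

		dev_put(mesh_iface);
		return ret;
	}

Besides shortening every handler, this drops the separate -EINVAL/-ENODEV bookkeeping: the helper encodes the error in the returned pointer, so the callers lose their ifindex locals and early-out labels.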
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index bed7f3d20844..e7b75e82eb1d 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -56,7 +56,7 @@ batadv_dat_init_orig_node_addr(struct batadv_orig_node *orig_node) /** * batadv_dat_init_own_addr() - assign a DAT address to the node itself - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @primary_if: a pointer to the primary interface */ static inline void @@ -77,7 +77,7 @@ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb); /** * batadv_dat_inc_counter() - increment the correct DAT packet counter - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @subtype: the 4addr subtype of the packet to be counted * * Updates the ethtool statistics for the received packet if it is a DAT subtype diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 757c084ac2d1..cc14bc41381e 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -349,7 +349,7 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb, struct batadv_hard_iface *recv_if, struct batadv_orig_node *orig_node_src) { - struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface); struct batadv_neigh_node *neigh_node = NULL; struct batadv_frag_packet *packet; u16 total_size; diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 0ddd8b4b3f4c..7a11b245e9f4 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -10,6 +10,7 @@ #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> +#include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> @@ -31,7 +32,6 @@ #include <linux/sprintf.h> #include <linux/stddef.h> #include <linux/udp.h> -#include <net/sock.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> @@ -40,7 +40,6 @@ #include "netlink.h" #include "originator.h" #include "routing.h" -#include "soft-interface.h" #include "translation-table.h" /* These are the offsets of the "hw type" and "hw address length" in the dhcp @@ -72,7 +71,7 @@ void batadv_gw_node_release(struct kref *ref) /** * batadv_gw_get_selected_gw_node() - Get currently selected gateway - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Return: selected gateway (with increased refcnt), NULL on errors */ @@ -96,7 +95,7 @@ out: /** * batadv_gw_get_selected_orig() - Get originator of currently selected gateway - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Return: orig_node of selected gateway (with increased refcnt), NULL on errors */ @@ -145,7 +144,7 @@ static void batadv_gw_select(struct batadv_priv *bat_priv, /** * batadv_gw_reselect() - force a gateway reselection - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Set a flag to remind the GW component to perform a new gateway reselection. 
* However this function does not ensure that the current gateway is going to be @@ -161,7 +160,7 @@ void batadv_gw_reselect(struct batadv_priv *bat_priv) /** * batadv_gw_check_client_stop() - check if client mode has been switched off - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * This function assumes the caller has checked that the gw state *is actually * changing*. This function is not supposed to be called when there is no state @@ -193,7 +192,7 @@ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv) /** * batadv_gw_election() - Elect the best gateway - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ void batadv_gw_election(struct batadv_priv *bat_priv) { @@ -281,7 +280,7 @@ out: /** * batadv_gw_check_election() - Elect orig node as best gateway when eligible - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which is to be checked */ void batadv_gw_check_election(struct batadv_priv *bat_priv, @@ -315,7 +314,7 @@ out: /** * batadv_gw_node_add() - add gateway node to list of available gateways - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: originator announcing gateway capabilities * @gateway: announced bandwidth information * @@ -362,7 +361,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv, /** * batadv_gw_node_get() - retrieve gateway node from list of available gateways - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: originator announcing gateway capabilities * * Return: gateway node if found or NULL otherwise. 
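The getters documented here return counted references. A hedged reconstruction of the underlying idiom, mirroring the kref_get_unless_zero() calls visible elsewhere in this diff: look the object up under rcu_read_lock() and raise its refcount only if it is not already being released.

	struct batadv_gw_node *gw_node;

	rcu_read_lock();
	gw_node = rcu_dereference(bat_priv->gw.curr_gw);
	if (gw_node && !kref_get_unless_zero(&gw_node->refcount))
		gw_node = NULL;	/* concurrent release in flight */
	rcu_read_unlock();

	/* the caller now owns the reference and must put it when done */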
@@ -392,7 +391,7 @@ struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv, /** * batadv_gw_node_update() - update list of available gateways with changed * bandwidth information - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: originator announcing gateway capabilities * @gateway: announced bandwidth information */ @@ -459,7 +458,7 @@ out: /** * batadv_gw_node_delete() - Remove orig_node from gateway list - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which is currently in process of being removed */ void batadv_gw_node_delete(struct batadv_priv *bat_priv, @@ -474,8 +473,8 @@ void batadv_gw_node_delete(struct batadv_priv *bat_priv, } /** - * batadv_gw_node_free() - Free gateway information from soft interface - * @bat_priv: the bat priv with all the soft interface information + * batadv_gw_node_free() - Free gateway information from mesh interface + * @bat_priv: the bat priv with all the mesh interface information */ void batadv_gw_node_free(struct batadv_priv *bat_priv) { @@ -502,24 +501,15 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv) int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; - struct net *net = sock_net(cb->skb->sk); - struct net_device *soft_iface; + struct net_device *mesh_iface; struct batadv_priv *bat_priv; - int ifindex; int ret; - ifindex = batadv_netlink_get_ifindex(cb->nlh, - BATADV_ATTR_MESH_IFINDEX); - if (!ifindex) - return -EINVAL; + mesh_iface = batadv_netlink_get_meshif(cb); + if (IS_ERR(mesh_iface)) + return PTR_ERR(mesh_iface); - soft_iface = dev_get_by_index(net, ifindex); - if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { - ret = -ENODEV; - goto out; - } - - bat_priv = netdev_priv(soft_iface); + bat_priv = netdev_priv(mesh_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { @@ -538,7 +528,7 @@ int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb) out: batadv_hardif_put(primary_if); - dev_put(soft_iface); + dev_put(mesh_iface); return ret; } @@ -667,7 +657,7 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, /** * batadv_gw_out_of_range() - check if the dhcp request destination is the best * gateway - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the outgoing packet * * Check if the skb is a DHCP request and if it is sent to the current best GW diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 2dd36ef03c84..315fa90f0c94 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -20,7 +20,7 @@ /** * batadv_gw_tvlv_container_update() - update the gw tvlv container after * gateway setting change - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv) { @@ -48,7 +48,7 @@ void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv) /** * batadv_gw_tvlv_ogm_handler_v1() - process incoming gateway tvlv container - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface 
information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the gateway data @@ -89,7 +89,7 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, /** * batadv_gw_init() - initialise the gateway handling internals - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ void batadv_gw_init(struct batadv_priv *bat_priv) { @@ -105,7 +105,7 @@ void batadv_gw_init(struct batadv_priv *bat_priv) /** * batadv_gw_free() - free the gateway handling internals - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ void batadv_gw_free(struct batadv_priv *bat_priv) { diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 96a412beab2d..558d39dffc23 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -36,9 +36,9 @@ #include "distributed-arp-table.h" #include "gateway_client.h" #include "log.h" +#include "mesh-interface.h" #include "originator.h" #include "send.h" -#include "soft-interface.h" #include "translation-table.h" /** @@ -51,7 +51,7 @@ void batadv_hardif_release(struct kref *ref) struct batadv_hard_iface *hard_iface; hard_iface = container_of(ref, struct batadv_hard_iface, refcount); - dev_put(hard_iface->net_dev); + netdev_put(hard_iface->net_dev, &hard_iface->dev_tracker); kfree_rcu(hard_iface, rcu); } @@ -141,7 +141,7 @@ static bool batadv_mutual_parents(const struct net_device *dev1, * is important to prevent this new interface from being used to create a new * mesh network (this behaviour would lead to a batman-over-batman * configuration). This function recursively checks all the fathers of the - * device passed as argument looking for a batman-adv soft interface. + * device passed as argument looking for a batman-adv mesh interface. 
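
The recursion described in the kernel-doc above can be pictured with a short sketch. This is an illustration only, not the kernel function: it assumes RTNL is held and skips the cross-namespace lookups the real batadv_is_on_batman_iface() performs; batadv_meshif_is_valid() and dev_get_iflink() are used as in the hunks of this patch.

    #include <linux/netdevice.h>

    /* Illustrative only: walk the iflink chain of @net_dev and report
     * whether any ancestor is a batman-adv mesh interface.
     */
    static bool sketch_is_on_mesh_iface(const struct net_device *net_dev)
    {
            struct net_device *parent;
            int iflink;

            /* the device itself is already a batman-adv mesh interface */
            if (batadv_meshif_is_valid(net_dev))
                    return true;

            iflink = dev_get_iflink(net_dev);
            if (iflink == net_dev->ifindex)
                    return false;   /* physical device, end of the chain */

            parent = __dev_get_by_index(dev_net(net_dev), iflink);
            if (!parent)
                    return false;

            /* recurse towards the physical device */
            return sketch_is_on_mesh_iface(parent);
    }

Each stacked virtual device (VLAN, bridge, ...) reports its lower device via iflink, so the walk terminates at a physical device, whose iflink points at itself.
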
* * Return: true if the device is descendant of a batman-adv mesh interface (or * if it is a batman-adv interface itself), false otherwise @@ -155,7 +155,7 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev) bool ret; /* check if this is a batman-adv mesh interface */ - if (batadv_softif_is_valid(net_dev)) + if (batadv_meshif_is_valid(net_dev)) return true; iflink = dev_get_iflink(net_dev); @@ -233,10 +233,10 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) } hard_iface = batadv_hardif_get_by_netdev(netdev); - if (!hard_iface || !hard_iface->soft_iface) + if (!hard_iface || !hard_iface->mesh_iface) goto out; - net = dev_net(hard_iface->soft_iface); + net = dev_net(hard_iface->mesh_iface); real_net = batadv_getlink_net(netdev, net); /* iflink to itself, most likely physical device */ @@ -438,13 +438,13 @@ out: } static struct batadv_hard_iface * -batadv_hardif_get_active(const struct net_device *soft_iface) +batadv_hardif_get_active(const struct net_device *mesh_iface) { struct batadv_hard_iface *hard_iface; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->soft_iface != soft_iface) + if (hard_iface->mesh_iface != mesh_iface) continue; if (hard_iface->if_status == BATADV_IF_ACTIVE && @@ -506,35 +506,39 @@ batadv_hardif_is_iface_up(const struct batadv_hard_iface *hard_iface) return false; } -static void batadv_check_known_mac_addr(const struct net_device *net_dev) +static void batadv_check_known_mac_addr(const struct batadv_hard_iface *hard_iface) { - const struct batadv_hard_iface *hard_iface; + const struct net_device *mesh_iface = hard_iface->mesh_iface; + const struct batadv_hard_iface *tmp_hard_iface; - rcu_read_lock(); - list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->if_status != BATADV_IF_ACTIVE && - hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) + if (!mesh_iface) + return; + + list_for_each_entry(tmp_hard_iface, &batadv_hardif_list, list) { + if (tmp_hard_iface == hard_iface) + continue; + + if (tmp_hard_iface->mesh_iface != mesh_iface) continue; - if (hard_iface->net_dev == net_dev) + if (tmp_hard_iface->if_status == BATADV_IF_NOT_IN_USE) continue; - if (!batadv_compare_eth(hard_iface->net_dev->dev_addr, - net_dev->dev_addr)) + if (!batadv_compare_eth(tmp_hard_iface->net_dev->dev_addr, + hard_iface->net_dev->dev_addr)) continue; pr_warn("The newly added mac address (%pM) already exists on: %s\n", - net_dev->dev_addr, hard_iface->net_dev->name); + hard_iface->net_dev->dev_addr, tmp_hard_iface->net_dev->name); pr_warn("It is strongly recommended to keep mac addresses unique to avoid problems!\n"); } - rcu_read_unlock(); } /** * batadv_hardif_recalc_extra_skbroom() - Recalculate skbuff extra head/tailroom - * @soft_iface: netdev struct of the mesh interface + * @mesh_iface: netdev struct of the mesh interface */ -static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface) +static void batadv_hardif_recalc_extra_skbroom(struct net_device *mesh_iface) { const struct batadv_hard_iface *hard_iface; unsigned short lower_header_len = ETH_HLEN; @@ -547,7 +551,7 @@ static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface) if (hard_iface->if_status == BATADV_IF_NOT_IN_USE) continue; - if (hard_iface->soft_iface != soft_iface) + if (hard_iface->mesh_iface != mesh_iface) continue; lower_header_len = max_t(unsigned short, lower_header_len, @@ -567,20 +571,20 @@ static void batadv_hardif_recalc_extra_skbroom(struct 
net_device *soft_iface) /* fragmentation headers don't strip the unicast/... header */ needed_headroom += sizeof(struct batadv_frag_packet); - soft_iface->needed_headroom = needed_headroom; - soft_iface->needed_tailroom = lower_tailroom; + mesh_iface->needed_headroom = needed_headroom; + mesh_iface->needed_tailroom = lower_tailroom; } /** - * batadv_hardif_min_mtu() - Calculate maximum MTU for soft interface - * @soft_iface: netdev struct of the soft interface + * batadv_hardif_min_mtu() - Calculate maximum MTU for mesh interface + * @mesh_iface: netdev struct of the mesh interface * - * Return: MTU for the soft-interface (limited by the minimal MTU of all active + * Return: MTU for the mesh-interface (limited by the minimal MTU of all active * slave interfaces) */ -int batadv_hardif_min_mtu(struct net_device *soft_iface) +int batadv_hardif_min_mtu(struct net_device *mesh_iface) { - struct batadv_priv *bat_priv = netdev_priv(soft_iface); + struct batadv_priv *bat_priv = netdev_priv(mesh_iface); const struct batadv_hard_iface *hard_iface; int min_mtu = INT_MAX; @@ -590,7 +594,7 @@ int batadv_hardif_min_mtu(struct net_device *soft_iface) hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) continue; - if (hard_iface->soft_iface != soft_iface) + if (hard_iface->mesh_iface != mesh_iface) continue; min_mtu = min_t(int, hard_iface->net_dev->mtu, min_mtu); @@ -616,26 +620,24 @@ out: */ atomic_set(&bat_priv->packet_size_max, min_mtu); - /* the real soft-interface MTU is computed by removing the payload + /* the real mesh-interface MTU is computed by removing the payload * overhead from the maximum amount of bytes that was just computed. - * - * However batman-adv does not support MTUs bigger than ETH_DATA_LEN */ - return min_t(int, min_mtu - batadv_max_header_len(), ETH_DATA_LEN); + return min_t(int, min_mtu - batadv_max_header_len(), BATADV_MAX_MTU); } /** * batadv_update_min_mtu() - Adjusts the MTU if a new interface with a smaller * MTU appeared - * @soft_iface: netdev struct of the soft interface + * @mesh_iface: netdev struct of the mesh interface */ -void batadv_update_min_mtu(struct net_device *soft_iface) +void batadv_update_min_mtu(struct net_device *mesh_iface) { - struct batadv_priv *bat_priv = netdev_priv(soft_iface); + struct batadv_priv *bat_priv = netdev_priv(mesh_iface); int limit_mtu; int mtu; - mtu = batadv_hardif_min_mtu(soft_iface); + mtu = batadv_hardif_min_mtu(mesh_iface); if (bat_priv->mtu_set_by_user) limit_mtu = bat_priv->mtu_set_by_user; @@ -643,12 +645,12 @@ void batadv_update_min_mtu(struct net_device *soft_iface) limit_mtu = ETH_DATA_LEN; mtu = min(mtu, limit_mtu); - dev_set_mtu(soft_iface, mtu); + dev_set_mtu(mesh_iface, mtu); /* Check if the local translate table should be cleaned up to match a * new (and smaller) MTU. 
*/ - batadv_tt_local_resize_to_mtu(soft_iface); + batadv_tt_local_resize_to_mtu(mesh_iface); } static void @@ -660,7 +662,7 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface) if (hard_iface->if_status != BATADV_IF_INACTIVE) goto out; - bat_priv = netdev_priv(hard_iface->soft_iface); + bat_priv = netdev_priv(hard_iface->mesh_iface); bat_priv->algo_ops->iface.update_mac(hard_iface); hard_iface->if_status = BATADV_IF_TO_BE_ACTIVATED; @@ -672,10 +674,10 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface) if (!primary_if) batadv_primary_if_select(bat_priv, hard_iface); - batadv_info(hard_iface->soft_iface, "Interface activated: %s\n", + batadv_info(hard_iface->mesh_iface, "Interface activated: %s\n", hard_iface->net_dev->name); - batadv_update_min_mtu(hard_iface->soft_iface); + batadv_update_min_mtu(hard_iface->mesh_iface); if (bat_priv->algo_ops->iface.activate) bat_priv->algo_ops->iface.activate(hard_iface); @@ -693,21 +695,21 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) hard_iface->if_status = BATADV_IF_INACTIVE; - batadv_info(hard_iface->soft_iface, "Interface deactivated: %s\n", + batadv_info(hard_iface->mesh_iface, "Interface deactivated: %s\n", hard_iface->net_dev->name); - batadv_update_min_mtu(hard_iface->soft_iface); + batadv_update_min_mtu(hard_iface->mesh_iface); } /** - * batadv_hardif_enable_interface() - Enslave hard interface to soft interface - * @hard_iface: hard interface to add to soft interface - * @soft_iface: netdev struct of the mesh interface + * batadv_hardif_enable_interface() - Enslave hard interface to mesh interface + * @hard_iface: hard interface to add to mesh interface + * @mesh_iface: netdev struct of the mesh interface * * Return: 0 on success or negative error number in case of failure */ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, - struct net_device *soft_iface) + struct net_device *mesh_iface) { struct batadv_priv *bat_priv; __be16 ethertype = htons(ETH_P_BATMAN); @@ -717,7 +719,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, int ret; hardif_mtu = READ_ONCE(hard_iface->net_dev->mtu); - required_mtu = READ_ONCE(soft_iface->mtu) + max_header_len; + required_mtu = READ_ONCE(mesh_iface->mtu) + max_header_len; if (hardif_mtu < ETH_MIN_MTU + max_header_len) return -EINVAL; @@ -727,12 +729,12 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, kref_get(&hard_iface->refcount); - dev_hold(soft_iface); - hard_iface->soft_iface = soft_iface; - bat_priv = netdev_priv(hard_iface->soft_iface); + netdev_hold(mesh_iface, &hard_iface->meshif_dev_tracker, GFP_ATOMIC); + hard_iface->mesh_iface = mesh_iface; + bat_priv = netdev_priv(hard_iface->mesh_iface); ret = netdev_master_upper_dev_link(hard_iface->net_dev, - soft_iface, NULL, NULL, NULL); + mesh_iface, NULL, NULL, NULL); if (ret) goto err_dev; @@ -748,31 +750,33 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, hard_iface->batman_adv_ptype.dev = hard_iface->net_dev; dev_add_pack(&hard_iface->batman_adv_ptype); - batadv_info(hard_iface->soft_iface, "Adding interface: %s\n", + batadv_info(hard_iface->mesh_iface, "Adding interface: %s\n", hard_iface->net_dev->name); if (atomic_read(&bat_priv->fragmentation) && hardif_mtu < required_mtu) - batadv_info(hard_iface->soft_iface, + batadv_info(hard_iface->mesh_iface, "The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. 
Packets going over this interface will be fragmented on layer2 which could impact the performance. Setting the MTU to %i would solve the problem.\n", hard_iface->net_dev->name, hardif_mtu, required_mtu); if (!atomic_read(&bat_priv->fragmentation) && hardif_mtu < required_mtu) - batadv_info(hard_iface->soft_iface, + batadv_info(hard_iface->mesh_iface, "The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. If you experience problems getting traffic through try increasing the MTU to %i.\n", hard_iface->net_dev->name, hardif_mtu, required_mtu); + batadv_check_known_mac_addr(hard_iface); + if (batadv_hardif_is_iface_up(hard_iface)) batadv_hardif_activate_interface(hard_iface); else - batadv_err(hard_iface->soft_iface, + batadv_err(hard_iface->mesh_iface, "Not using interface %s (retrying later): interface not active\n", hard_iface->net_dev->name); - batadv_hardif_recalc_extra_skbroom(soft_iface); + batadv_hardif_recalc_extra_skbroom(mesh_iface); if (bat_priv->algo_ops->iface.enabled) bat_priv->algo_ops->iface.enabled(hard_iface); @@ -781,17 +785,17 @@ out: return 0; err_upper: - netdev_upper_dev_unlink(hard_iface->net_dev, soft_iface); + netdev_upper_dev_unlink(hard_iface->net_dev, mesh_iface); err_dev: - hard_iface->soft_iface = NULL; - dev_put(soft_iface); + hard_iface->mesh_iface = NULL; + netdev_put(mesh_iface, &hard_iface->meshif_dev_tracker); batadv_hardif_put(hard_iface); return ret; } /** - * batadv_hardif_cnt() - get number of interfaces enslaved to soft interface - * @soft_iface: soft interface to check + * batadv_hardif_cnt() - get number of interfaces enslaved to mesh interface + * @mesh_iface: mesh interface to check * * This function is only using RCU for locking - the result can therefore be * off when another function is modifying the list at the same time. 
The @@ -799,14 +803,14 @@ err_dev: * * Return: number of connected/enslaved hard interfaces */ -static size_t batadv_hardif_cnt(const struct net_device *soft_iface) +static size_t batadv_hardif_cnt(const struct net_device *mesh_iface) { struct batadv_hard_iface *hard_iface; size_t count = 0; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->soft_iface != soft_iface) + if (hard_iface->mesh_iface != mesh_iface) continue; count++; @@ -817,12 +821,12 @@ static size_t batadv_hardif_cnt(const struct net_device *soft_iface) } /** - * batadv_hardif_disable_interface() - Remove hard interface from soft interface + * batadv_hardif_disable_interface() - Remove hard interface from mesh interface * @hard_iface: hard interface to be removed */ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); struct batadv_hard_iface *primary_if = NULL; batadv_hardif_deactivate_interface(hard_iface); @@ -830,7 +834,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface) if (hard_iface->if_status != BATADV_IF_INACTIVE) goto out; - batadv_info(hard_iface->soft_iface, "Removing interface: %s\n", + batadv_info(hard_iface->mesh_iface, "Removing interface: %s\n", hard_iface->net_dev->name); dev_remove_pack(&hard_iface->batman_adv_ptype); batadv_hardif_put(hard_iface); @@ -839,7 +843,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface) if (hard_iface == primary_if) { struct batadv_hard_iface *new_if; - new_if = batadv_hardif_get_active(hard_iface->soft_iface); + new_if = batadv_hardif_get_active(hard_iface->mesh_iface); batadv_primary_if_select(bat_priv, new_if); batadv_hardif_put(new_if); @@ -851,16 +855,16 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface) /* delete all references to this hard_iface */ batadv_purge_orig_ref(bat_priv); batadv_purge_outstanding_packets(bat_priv, hard_iface); - dev_put(hard_iface->soft_iface); + netdev_put(hard_iface->mesh_iface, &hard_iface->meshif_dev_tracker); - netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface); - batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface); + netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->mesh_iface); + batadv_hardif_recalc_extra_skbroom(hard_iface->mesh_iface); /* nobody uses this interface anymore */ - if (batadv_hardif_cnt(hard_iface->soft_iface) <= 1) + if (batadv_hardif_cnt(hard_iface->mesh_iface) <= 1) batadv_gw_check_client_stop(bat_priv); - hard_iface->soft_iface = NULL; + hard_iface->mesh_iface = NULL; batadv_hardif_put(hard_iface); out: @@ -875,16 +879,16 @@ batadv_hardif_add_interface(struct net_device *net_dev) ASSERT_RTNL(); if (!batadv_is_valid_iface(net_dev)) - goto out; - - dev_hold(net_dev); + return NULL; hard_iface = kzalloc(sizeof(*hard_iface), GFP_ATOMIC); if (!hard_iface) - goto release_dev; + return NULL; + netdev_hold(net_dev, &hard_iface->dev_tracker, GFP_ATOMIC); hard_iface->net_dev = net_dev; - hard_iface->soft_iface = NULL; + + hard_iface->mesh_iface = NULL; hard_iface->if_status = BATADV_IF_NOT_IN_USE; INIT_LIST_HEAD(&hard_iface->list); @@ -903,17 +907,11 @@ batadv_hardif_add_interface(struct net_device *net_dev) batadv_v_hardif_init(hard_iface); - batadv_check_known_mac_addr(hard_iface->net_dev); kref_get(&hard_iface->refcount); list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list); 
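
The dev_hold()/dev_put() to netdev_hold()/netdev_put() conversions in the hunks above follow the kernel's tracked-reference pattern. A minimal sketch of that pattern follows; the holder struct and its names are illustrative, not part of this patch.

    #include <linux/netdevice.h>

    /* Pair every held net_device reference with a netdevice_tracker. */
    struct sketch_holder {
            struct net_device *dev;
            netdevice_tracker dev_tracker;
    };

    static void sketch_hold_dev(struct sketch_holder *h, struct net_device *dev)
    {
            netdev_hold(dev, &h->dev_tracker, GFP_ATOMIC);
            h->dev = dev;
    }

    static void sketch_put_dev(struct sketch_holder *h)
    {
            netdev_put(h->dev, &h->dev_tracker); /* must use the same tracker */
            h->dev = NULL;
    }

With CONFIG_NET_DEV_REFCNT_TRACKER enabled, the tracker ties each get to its matching put, so a leaked hard_iface reference is attributed to its owner instead of showing up as an anonymous refcount imbalance.
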
batadv_hardif_generation++; return hard_iface; - -release_dev: - dev_put(net_dev); -out: - return NULL; } static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface) @@ -932,13 +930,13 @@ static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface) } /** - * batadv_hard_if_event_softif() - Handle events for soft interfaces + * batadv_hard_if_event_meshif() - Handle events for mesh interfaces * @event: NETDEV_* event to handle * @net_dev: net_device which generated an event * * Return: NOTIFY_* result */ -static int batadv_hard_if_event_softif(unsigned long event, +static int batadv_hard_if_event_meshif(unsigned long event, struct net_device *net_dev) { struct batadv_priv *bat_priv; @@ -946,7 +944,7 @@ static int batadv_hard_if_event_softif(unsigned long event, switch (event) { case NETDEV_REGISTER: bat_priv = netdev_priv(net_dev); - batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS); + batadv_meshif_create_vlan(bat_priv, BATADV_NO_FLAGS); break; } @@ -961,8 +959,8 @@ static int batadv_hard_if_event(struct notifier_block *this, struct batadv_hard_iface *primary_if = NULL; struct batadv_priv *bat_priv; - if (batadv_softif_is_valid(net_dev)) - return batadv_hard_if_event_softif(event, net_dev); + if (batadv_meshif_is_valid(net_dev)) + return batadv_hard_if_event_meshif(event, net_dev); hard_iface = batadv_hardif_get_by_netdev(net_dev); if (!hard_iface && (event == NETDEV_REGISTER || @@ -988,16 +986,16 @@ static int batadv_hard_if_event(struct notifier_block *this, batadv_hardif_remove_interface(hard_iface); break; case NETDEV_CHANGEMTU: - if (hard_iface->soft_iface) - batadv_update_min_mtu(hard_iface->soft_iface); + if (hard_iface->mesh_iface) + batadv_update_min_mtu(hard_iface->mesh_iface); break; case NETDEV_CHANGEADDR: if (hard_iface->if_status == BATADV_IF_NOT_IN_USE) goto hardif_put; - batadv_check_known_mac_addr(hard_iface->net_dev); + batadv_check_known_mac_addr(hard_iface); - bat_priv = netdev_priv(hard_iface->soft_iface); + bat_priv = netdev_priv(hard_iface->mesh_iface); bat_priv->algo_ops->iface.update_mac(hard_iface); primary_if = batadv_primary_if_get_selected(bat_priv); diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 64f660dbbe54..262a78364742 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -23,12 +23,12 @@ enum batadv_hard_if_state { /** * @BATADV_IF_NOT_IN_USE: interface is not used as slave interface of a - * batman-adv soft interface + * batman-adv mesh interface */ BATADV_IF_NOT_IN_USE, /** - * @BATADV_IF_TO_BE_REMOVED: interface will be removed from soft + * @BATADV_IF_TO_BE_REMOVED: interface will be removed from mesh * interface */ BATADV_IF_TO_BE_REMOVED, @@ -74,10 +74,10 @@ bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface); struct batadv_hard_iface* batadv_hardif_get_by_netdev(const struct net_device *net_dev); int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, - struct net_device *soft_iface); + struct net_device *mesh_iface); void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface); -int batadv_hardif_min_mtu(struct net_device *soft_iface); -void batadv_update_min_mtu(struct net_device *soft_iface); +int batadv_hardif_min_mtu(struct net_device *mesh_iface); +void batadv_update_min_mtu(struct net_device *mesh_iface); void batadv_hardif_release(struct kref *ref); int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing, u8 *orig_addr, u8 *orig_neigh); @@ -97,7 +97,7 @@ static inline void 
batadv_hardif_put(struct batadv_hard_iface *hard_iface) /** * batadv_primary_if_get_selected() - Get reference to primary interface - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Return: primary interface (with increased refcnt), otherwise NULL */ diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index 7a93a1e94c40..c19d07eeb070 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -13,7 +13,7 @@ /** * batadv_debug_log() - Add debug log entry - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @fmt: format string * * Return: 0 on success or negative error number in case of failure diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 6717c965f0fa..567afaa8df99 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -71,7 +71,7 @@ __printf(2, 3); /** * _batadv_dbg() - Store debug output with(out) rate limiting * @type: type of debug message - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @ratelimited: whether output should be rate limited * @fmt: format string * @arg: variable arguments @@ -97,7 +97,7 @@ static inline void _batadv_dbg(int type __always_unused, /** * batadv_dbg() - Store debug output without rate limiting * @type: type of debug message - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @arg: format string and variable arguments */ #define batadv_dbg(type, bat_priv, arg...) \ @@ -106,7 +106,7 @@ static inline void _batadv_dbg(int type __always_unused, /** * batadv_dbg_ratelimited() - Store debug output with rate limiting * @type: type of debug message - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @arg: format string and variable arguments */ #define batadv_dbg_ratelimited(type, bat_priv, arg...) 
\ @@ -114,7 +114,7 @@ static inline void _batadv_dbg(int type __always_unused, /** * batadv_info() - Store message in debug buffer and print it to kmsg buffer - * @net_dev: the soft interface net device + * @net_dev: the mesh interface net device * @fmt: format string * @arg: variable arguments */ @@ -128,7 +128,7 @@ static inline void _batadv_dbg(int type __always_unused, /** * batadv_err() - Store error in debug buffer and print it to kmsg buffer - * @net_dev: the soft interface net device + * @net_dev: the mesh interface net device * @fmt: format string * @arg: variable arguments */ diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 8e0f44c71696..c0bc75513355 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -11,7 +11,7 @@ #include <linux/build_bug.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> -#include <linux/crc32c.h> +#include <linux/crc32.h> #include <linux/device.h> #include <linux/errno.h> #include <linux/gfp.h> @@ -51,13 +51,13 @@ #include "gateway_common.h" #include "hard-interface.h" #include "log.h" +#include "mesh-interface.h" #include "multicast.h" #include "netlink.h" #include "network-coding.h" #include "originator.h" #include "routing.h" #include "send.h" -#include "soft-interface.h" #include "tp_meter.h" #include "translation-table.h" @@ -69,8 +69,6 @@ unsigned int batadv_hardif_generation; static int (*batadv_rx_handler[256])(struct sk_buff *skb, struct batadv_hard_iface *recv_if); -unsigned char batadv_broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; - struct workqueue_struct *batadv_event_workqueue; static void batadv_recv_handler_init(void); @@ -143,14 +141,14 @@ static void __exit batadv_exit(void) } /** - * batadv_mesh_init() - Initialize soft interface - * @soft_iface: netdev struct of the soft interface + * batadv_mesh_init() - Initialize mesh interface + * @mesh_iface: netdev struct of the mesh interface * * Return: 0 on success or negative error number in case of failure */ -int batadv_mesh_init(struct net_device *soft_iface) +int batadv_mesh_init(struct net_device *mesh_iface) { - struct batadv_priv *bat_priv = netdev_priv(soft_iface); + struct batadv_priv *bat_priv = netdev_priv(mesh_iface); int ret; spin_lock_init(&bat_priv->forw_bat_list_lock); @@ -167,7 +165,7 @@ int batadv_mesh_init(struct net_device *soft_iface) #endif spin_lock_init(&bat_priv->tvlv.container_list_lock); spin_lock_init(&bat_priv->tvlv.handler_list_lock); - spin_lock_init(&bat_priv->softif_vlan_list_lock); + spin_lock_init(&bat_priv->meshif_vlan_list_lock); spin_lock_init(&bat_priv->tp_list_lock); INIT_HLIST_HEAD(&bat_priv->forw_bat_list); @@ -186,7 +184,7 @@ int batadv_mesh_init(struct net_device *soft_iface) #endif INIT_HLIST_HEAD(&bat_priv->tvlv.container_list); INIT_HLIST_HEAD(&bat_priv->tvlv.handler_list); - INIT_HLIST_HEAD(&bat_priv->softif_vlan_list); + INIT_HLIST_HEAD(&bat_priv->meshif_vlan_list); INIT_HLIST_HEAD(&bat_priv->tp_list); bat_priv->gw.generation = 0; @@ -253,12 +251,12 @@ err_orig: } /** - * batadv_mesh_free() - Deinitialize soft interface - * @soft_iface: netdev struct of the soft interface + * batadv_mesh_free() - Deinitialize mesh interface + * @mesh_iface: netdev struct of the mesh interface */ -void batadv_mesh_free(struct net_device *soft_iface) +void batadv_mesh_free(struct net_device *mesh_iface) { - struct batadv_priv *bat_priv = netdev_priv(soft_iface); + struct batadv_priv *bat_priv = netdev_priv(mesh_iface); atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); @@ -297,7 +295,7 @@ void 
batadv_mesh_free(struct net_device *soft_iface) /** * batadv_is_my_mac() - check if the given mac address belongs to any of the * real interfaces in the current mesh - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @addr: the address to check * * Return: 'true' if the mac address was found, false otherwise. @@ -312,7 +310,7 @@ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr) if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; - if (hard_iface->soft_iface != bat_priv->soft_iface) + if (hard_iface->mesh_iface != bat_priv->mesh_iface) continue; if (batadv_compare_eth(hard_iface->net_dev->dev_addr, addr)) { @@ -457,10 +455,10 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, if (unlikely(skb->mac_len != ETH_HLEN || !skb_mac_header(skb))) goto err_free; - if (!hard_iface->soft_iface) + if (!hard_iface->mesh_iface) goto err_free; - bat_priv = netdev_priv(hard_iface->soft_iface); + bat_priv = netdev_priv(hard_iface->mesh_iface); if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) goto err_free; @@ -637,6 +635,13 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) vhdr = (struct vlan_ethhdr *)(skb->data + header_len); vid = ntohs(vhdr->h_vlan_TCI) & VLAN_VID_MASK; + + /* VID 0 is only used to indicate "priority tag" frames which only + * contain priority information and no VID. + */ + if (vid == 0) + return BATADV_NO_FLAGS; + vid |= BATADV_VLAN_HAS_TAG; return vid; @@ -644,7 +649,7 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) /** * batadv_vlan_ap_isola_get() - return AP isolation status for the given vlan - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @vid: the VLAN identifier for which the AP isolation attributed as to be * looked up * @@ -654,15 +659,15 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len) bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) { bool ap_isolation_enabled = false; - struct batadv_softif_vlan *vlan; + struct batadv_meshif_vlan *vlan; /* if the AP isolation is requested on a VLAN, then check for its * setting in the proper VLAN private data structure */ - vlan = batadv_softif_vlan_get(bat_priv, vid); + vlan = batadv_meshif_vlan_get(bat_priv, vid); if (vlan) { ap_isolation_enabled = atomic_read(&vlan->ap_isolation); - batadv_softif_vlan_put(vlan); + batadv_meshif_vlan_put(vlan); } return ap_isolation_enabled; @@ -670,7 +675,7 @@ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid) /** * batadv_throw_uevent() - Send an uevent with batman-adv specific env data - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @type: subsystem type of event. Stored in uevent's BATTYPE * @action: action type of event. 
Stored in uevent's BATACTION * @data: string with additional information to the event (ignored for @@ -685,7 +690,7 @@ int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type, struct kobject *bat_kobj; char *uevent_env[4] = { NULL, NULL, NULL, NULL }; - bat_kobj = &bat_priv->soft_iface->dev.kobj; + bat_kobj = &bat_priv->mesh_iface->dev.kobj; uevent_env[0] = kasprintf(GFP_ATOMIC, "%s%s", BATADV_UEV_TYPE_VAR, diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 97ea71a052f8..692109be2210 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -7,13 +7,13 @@ #ifndef _NET_BATMAN_ADV_MAIN_H_ #define _NET_BATMAN_ADV_MAIN_H_ -#define BATADV_DRIVER_AUTHOR "Marek Lindner <mareklindner@neomailbox.ch>, " \ +#define BATADV_DRIVER_AUTHOR "Marek Lindner <marek.lindner@mailbox.org>, " \ "Simon Wunderlich <sw@simonwunderlich.de>" #define BATADV_DRIVER_DESC "B.A.T.M.A.N. advanced" #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2024.3" +#define BATADV_SOURCE_VERSION "2025.2" #endif /* B.A.T.M.A.N. parameters */ @@ -22,6 +22,8 @@ #define BATADV_THROUGHPUT_MAX_VALUE 0xFFFFFFFF #define BATADV_JITTER 20 +#define BATADV_MAX_MTU (ETH_MAX_MTU - batadv_max_header_len()) + /* Time To Live of broadcast messages */ #define BATADV_TTL 50 @@ -102,9 +104,7 @@ */ #define BATADV_TQ_SIMILARITY_THRESHOLD 50 -/* should not be bigger than 512 bytes or change the size of - * forw_packet->direct_link_flags - */ +#define BATADV_MAX_AGGREGATION_PACKETS 32 #define BATADV_MAX_AGGREGATION_BYTES 512 #define BATADV_MAX_AGGREGATION_MS 100 @@ -129,10 +129,10 @@ #define BATADV_TP_MAX_NUM 5 /** - * enum batadv_mesh_state - State of a soft interface + * enum batadv_mesh_state - State of a mesh interface */ enum batadv_mesh_state { - /** @BATADV_MESH_INACTIVE: soft interface is not yet running */ + /** @BATADV_MESH_INACTIVE: mesh interface is not yet running */ BATADV_MESH_INACTIVE, /** @BATADV_MESH_ACTIVE: interface is up and running */ @@ -235,11 +235,10 @@ static inline int batadv_print_vid(unsigned short vid) extern struct list_head batadv_hardif_list; extern unsigned int batadv_hardif_generation; -extern unsigned char batadv_broadcast_addr[]; extern struct workqueue_struct *batadv_event_workqueue; -int batadv_mesh_init(struct net_device *soft_iface); -void batadv_mesh_free(struct net_device *soft_iface); +int batadv_mesh_init(struct net_device *mesh_iface); +void batadv_mesh_free(struct net_device *mesh_iface); bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr); int batadv_max_header_len(void); void batadv_skb_set_priority(struct sk_buff *skb, int offset); @@ -345,8 +344,8 @@ static inline bool batadv_has_timed_out(unsigned long timestamp, #define batadv_seq_after(x, y) batadv_seq_before(y, x) /** - * batadv_add_counter() - Add to per cpu statistics counter of soft interface - * @bat_priv: the bat priv with all the soft interface information + * batadv_add_counter() - Add to per cpu statistics counter of mesh interface + * @bat_priv: the bat priv with all the mesh interface information * @idx: counter index which should be modified * @count: value to increase counter by * @@ -359,8 +358,8 @@ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx, } /** - * batadv_inc_counter() - Increase per cpu statistics counter of soft interface - * @b: the bat priv with all the soft interface information + * batadv_inc_counter() - Increase per cpu statistics counter of mesh interface + * @b: the bat priv 
with all the mesh interface information * @i: counter index which should be modified */ #define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1) diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/mesh-interface.c index 2758aba47a2f..5bbc366f974d 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/mesh-interface.c @@ -4,7 +4,7 @@ * Marek Lindner, Simon Wunderlich */ -#include "soft-interface.h" +#include "mesh-interface.h" #include "main.h" #include <linux/atomic.h> @@ -36,7 +36,6 @@ #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> -#include <net/net_namespace.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> @@ -77,21 +76,9 @@ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) return 0; } -static int batadv_interface_open(struct net_device *dev) -{ - netif_start_queue(dev); - return 0; -} - -static int batadv_interface_release(struct net_device *dev) -{ - netif_stop_queue(dev); - return 0; -} - /** * batadv_sum_counter() - Sum the cpu-local counters for index 'idx' - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @idx: index of counter to sum up * * Return: sum of all cpu-local counters @@ -125,7 +112,7 @@ static struct net_device_stats *batadv_interface_stats(struct net_device *dev) static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) { struct batadv_priv *bat_priv = netdev_priv(dev); - struct batadv_softif_vlan *vlan; + struct batadv_meshif_vlan *vlan; struct sockaddr *addr = p; u8 old_addr[ETH_ALEN]; @@ -140,7 +127,7 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) return 0; rcu_read_lock(); - hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { + hlist_for_each_entry_rcu(vlan, &bat_priv->meshif_vlan_list, list) { batadv_tt_local_remove(bat_priv, old_addr, vlan->vid, "mac address changed", false); batadv_tt_local_add(dev, addr->sa_data, vlan->vid, @@ -170,7 +157,7 @@ static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) * @dev: registered network device to modify * * We do not actually need to set any rx filters for the virtual batman - * soft interface. However a dummy handler enables a user to set static + * mesh interface. However a dummy handler enables a user to set static * multicast listeners for instance. 
 */ static void batadv_interface_set_rx_mode(struct net_device *dev) @@ -178,10 +165,10 @@ static void batadv_interface_set_rx_mode(struct net_device *dev) { } static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, - struct net_device *soft_iface) + struct net_device *mesh_iface) { struct ethhdr *ethhdr; - struct batadv_priv *bat_priv = netdev_priv(soft_iface); + struct batadv_priv *bat_priv = netdev_priv(mesh_iface); struct batadv_hard_iface *primary_if = NULL; struct batadv_bcast_packet *bcast_packet; static const u8 stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00, @@ -209,7 +196,7 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, /* reset control block to avoid leftovers from previous users */ memset(skb->cb, 0, sizeof(struct batadv_skb_cb)); - netif_trans_update(soft_iface); + netif_trans_update(mesh_iface); vid = batadv_get_vid(skb, 0); skb_reset_mac_header(skb); @@ -246,7 +233,7 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb, /* Register the client MAC in the transtable */ if (!is_multicast_ether_addr(ethhdr->h_source) && !batadv_bla_is_loopdetect_mac(ethhdr->h_source)) { - client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source, + client_added = batadv_tt_local_add(mesh_iface, ethhdr->h_source, vid, skb->skb_iif, skb->mark); if (!client_added) @@ -397,12 +384,12 @@ end: /** * batadv_interface_rx() - receive ethernet frame on local batman-adv interface - * @soft_iface: local interface which will receive the ethernet frame - * @skb: ethernet frame for @soft_iface + * @mesh_iface: local interface which will receive the ethernet frame + * @skb: ethernet frame for @mesh_iface * @hdr_size: size of already parsed batman-adv header * @orig_node: originator from which the batman-adv packet was sent * - * Sends an ethernet frame to the receive path of the local @soft_iface. + * Sends an ethernet frame to the receive path of the local @mesh_iface. * skb->data still points to the batman-adv header with the size @hdr_size. * The caller has to have parsed this header already and made sure that at least * @hdr_size bytes are still available for pull in @skb. @@ -412,12 +399,12 @@ end: * unicast packets will be dropped directly when sent between two * isolated clients. 
*/ -void batadv_interface_rx(struct net_device *soft_iface, +void batadv_interface_rx(struct net_device *mesh_iface, struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node) { struct batadv_bcast_packet *batadv_bcast_packet; - struct batadv_priv *bat_priv = netdev_priv(soft_iface); + struct batadv_priv *bat_priv = netdev_priv(mesh_iface); struct vlan_ethhdr *vhdr; struct ethhdr *ethhdr; unsigned short vid; @@ -457,7 +444,7 @@ void batadv_interface_rx(struct net_device *soft_iface, } /* skb->dev & skb->pkt_type are set here */ - skb->protocol = eth_type_trans(skb, soft_iface); + skb->protocol = eth_type_trans(skb, mesh_iface); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); batadv_inc_counter(bat_priv, BATADV_CNT_RX); @@ -502,38 +489,38 @@ out: } /** - * batadv_softif_vlan_release() - release vlan from lists and queue for free + * batadv_meshif_vlan_release() - release vlan from lists and queue for free * after rcu grace period * @ref: kref pointer of the vlan object */ -void batadv_softif_vlan_release(struct kref *ref) +void batadv_meshif_vlan_release(struct kref *ref) { - struct batadv_softif_vlan *vlan; + struct batadv_meshif_vlan *vlan; - vlan = container_of(ref, struct batadv_softif_vlan, refcount); + vlan = container_of(ref, struct batadv_meshif_vlan, refcount); - spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock); + spin_lock_bh(&vlan->bat_priv->meshif_vlan_list_lock); hlist_del_rcu(&vlan->list); - spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock); + spin_unlock_bh(&vlan->bat_priv->meshif_vlan_list_lock); kfree_rcu(vlan, rcu); } /** - * batadv_softif_vlan_get() - get the vlan object for a specific vid - * @bat_priv: the bat priv with all the soft interface information + * batadv_meshif_vlan_get() - get the vlan object for a specific vid + * @bat_priv: the bat priv with all the mesh interface information * @vid: the identifier of the vlan object to retrieve * * Return: the private data of the vlan matching the vid passed as argument or * NULL otherwise. The refcounter of the returned object is incremented by 1. */ -struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, +struct batadv_meshif_vlan *batadv_meshif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid) { - struct batadv_softif_vlan *vlan_tmp, *vlan = NULL; + struct batadv_meshif_vlan *vlan_tmp, *vlan = NULL; rcu_read_lock(); - hlist_for_each_entry_rcu(vlan_tmp, &bat_priv->softif_vlan_list, list) { + hlist_for_each_entry_rcu(vlan_tmp, &bat_priv->meshif_vlan_list, list) { if (vlan_tmp->vid != vid) continue; @@ -549,28 +536,28 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, } /** - * batadv_softif_create_vlan() - allocate the needed resources for a new vlan - * @bat_priv: the bat priv with all the soft interface information + * batadv_meshif_create_vlan() - allocate the needed resources for a new vlan + * @bat_priv: the bat priv with all the mesh interface information * @vid: the VLAN identifier * * Return: 0 on success, a negative error otherwise. 
*/ -int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) +int batadv_meshif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) { - struct batadv_softif_vlan *vlan; + struct batadv_meshif_vlan *vlan; - spin_lock_bh(&bat_priv->softif_vlan_list_lock); + spin_lock_bh(&bat_priv->meshif_vlan_list_lock); - vlan = batadv_softif_vlan_get(bat_priv, vid); + vlan = batadv_meshif_vlan_get(bat_priv, vid); if (vlan) { - batadv_softif_vlan_put(vlan); - spin_unlock_bh(&bat_priv->softif_vlan_list_lock); + batadv_meshif_vlan_put(vlan); + spin_unlock_bh(&bat_priv->meshif_vlan_list_lock); return -EEXIST; } vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC); if (!vlan) { - spin_unlock_bh(&bat_priv->softif_vlan_list_lock); + spin_unlock_bh(&bat_priv->meshif_vlan_list_lock); return -ENOMEM; } @@ -581,37 +568,37 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid) atomic_set(&vlan->ap_isolation, 0); kref_get(&vlan->refcount); - hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list); - spin_unlock_bh(&bat_priv->softif_vlan_list_lock); + hlist_add_head_rcu(&vlan->list, &bat_priv->meshif_vlan_list); + spin_unlock_bh(&bat_priv->meshif_vlan_list_lock); /* add a new TT local entry. This one will be marked with the NOPURGE * flag */ - batadv_tt_local_add(bat_priv->soft_iface, - bat_priv->soft_iface->dev_addr, vid, + batadv_tt_local_add(bat_priv->mesh_iface, + bat_priv->mesh_iface->dev_addr, vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); - /* don't return reference to new softif_vlan */ - batadv_softif_vlan_put(vlan); + /* don't return reference to new meshif_vlan */ + batadv_meshif_vlan_put(vlan); return 0; } /** - * batadv_softif_destroy_vlan() - remove and destroy a softif_vlan object - * @bat_priv: the bat priv with all the soft interface information + * batadv_meshif_destroy_vlan() - remove and destroy a meshif_vlan object + * @bat_priv: the bat priv with all the mesh interface information * @vlan: the object to remove */ -static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv, - struct batadv_softif_vlan *vlan) +static void batadv_meshif_destroy_vlan(struct batadv_priv *bat_priv, + struct batadv_meshif_vlan *vlan) { /* explicitly remove the associated TT local entry because it is marked * with the NOPURGE flag */ - batadv_tt_local_remove(bat_priv, bat_priv->soft_iface->dev_addr, + batadv_tt_local_remove(bat_priv, bat_priv->mesh_iface->dev_addr, vlan->vid, "vlan interface destroyed", false); - batadv_softif_vlan_put(vlan); + batadv_meshif_vlan_put(vlan); } /** @@ -629,7 +616,7 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, unsigned short vid) { struct batadv_priv *bat_priv = netdev_priv(dev); - struct batadv_softif_vlan *vlan; + struct batadv_meshif_vlan *vlan; /* only 802.1Q vlans are supported. * batman-adv does not know how to handle other types @@ -637,24 +624,32 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto, if (proto != htons(ETH_P_8021Q)) return -EINVAL; + /* VID 0 is only used to indicate "priority tag" frames which only + * contain priority information and no VID. No management structures + * should be created for this VID and it should be handled like an + * untagged frame. + */ + if (vid == 0) + return 0; + vid |= BATADV_VLAN_HAS_TAG; /* if a new vlan is getting created and it already exists, it means that - * it was not deleted yet. batadv_softif_vlan_get() increases the + * it was not deleted yet. 
batadv_meshif_vlan_get() increases the * refcount in order to revive the object. * * if it does not exist then create it. */ - vlan = batadv_softif_vlan_get(bat_priv, vid); + vlan = batadv_meshif_vlan_get(bat_priv, vid); if (!vlan) - return batadv_softif_create_vlan(bat_priv, vid); + return batadv_meshif_create_vlan(bat_priv, vid); /* add a new TT local entry. This one will be marked with the NOPURGE * flag. This must be added again, even if the vlan object already * exists, because the entry was deleted by kill_vid() */ - batadv_tt_local_add(bat_priv->soft_iface, - bat_priv->soft_iface->dev_addr, vid, + batadv_tt_local_add(bat_priv->mesh_iface, + bat_priv->mesh_iface->dev_addr, vid, BATADV_NULL_IFINDEX, BATADV_NO_MARK); return 0; @@ -676,7 +671,7 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, unsigned short vid) { struct batadv_priv *bat_priv = netdev_priv(dev); - struct batadv_softif_vlan *vlan; + struct batadv_meshif_vlan *vlan; /* only 802.1Q vlans are supported. batman-adv does not know how to * handle other types @@ -684,14 +679,20 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, if (proto != htons(ETH_P_8021Q)) return -EINVAL; - vlan = batadv_softif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG); + /* "priority tag" frames are handled like "untagged" frames + * and no meshif_vlan needs to be destroyed + */ + if (vid == 0) + return 0; + + vlan = batadv_meshif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG); if (!vlan) return -ENOENT; - batadv_softif_destroy_vlan(bat_priv, vlan); + batadv_meshif_destroy_vlan(bat_priv, vlan); /* finally free the vlan object */ - batadv_softif_vlan_put(vlan); + batadv_meshif_vlan_put(vlan); return 0; } @@ -727,12 +728,12 @@ static void batadv_set_lockdep_class(struct net_device *dev) } /** - * batadv_softif_init_late() - late stage initialization of soft interface + * batadv_meshif_init_late() - late stage initialization of mesh interface * @dev: registered network device to modify * * Return: error code on failures */ -static int batadv_softif_init_late(struct net_device *dev) +static int batadv_meshif_init_late(struct net_device *dev) { struct batadv_priv *bat_priv; u32 random_seqno; @@ -742,7 +743,7 @@ static int batadv_softif_init_late(struct net_device *dev) batadv_set_lockdep_class(dev); bat_priv = netdev_priv(dev); - bat_priv->soft_iface = dev; + bat_priv->mesh_iface = dev; /* batadv_interface_stats() needs to be available as soon as * register_netdevice() has been called @@ -776,20 +777,20 @@ static int batadv_softif_init_late(struct net_device *dev) atomic_set(&bat_priv->log_level, 0); #endif atomic_set(&bat_priv->fragmentation, 1); - atomic_set(&bat_priv->packet_size_max, ETH_DATA_LEN); + atomic_set(&bat_priv->packet_size_max, BATADV_MAX_MTU); atomic_set(&bat_priv->bcast_queue_left, BATADV_BCAST_QUEUE_LEN); atomic_set(&bat_priv->batman_queue_left, BATADV_BATMAN_QUEUE_LEN); atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); atomic_set(&bat_priv->bcast_seqno, 1); atomic_set(&bat_priv->tt.vn, 0); - atomic_set(&bat_priv->tt.local_changes, 0); atomic_set(&bat_priv->tt.ogm_append_cnt, 0); #ifdef CONFIG_BATMAN_ADV_BLA atomic_set(&bat_priv->bla.num_requests, 0); #endif atomic_set(&bat_priv->tp_num, 0); + WRITE_ONCE(bat_priv->tt.local_changes, 0); bat_priv->tt.last_changeset = NULL; bat_priv->tt.last_changeset_len = 0; bat_priv->isolation_mark = 0; @@ -823,14 +824,14 @@ free_bat_counters: } /** - * batadv_softif_slave_add() - Add a slave interface to a batadv_soft_interface - * @dev: 
batadv_soft_interface used as master interface + * batadv_meshif_slave_add() - Add a slave interface to a batadv_mesh_interface + * @dev: batadv_mesh_interface used as master interface * @slave_dev: net_device which should become the slave interface * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ -static int batadv_softif_slave_add(struct net_device *dev, +static int batadv_meshif_slave_add(struct net_device *dev, struct net_device *slave_dev, struct netlink_ext_ack *extack) { @@ -838,7 +839,7 @@ static int batadv_softif_slave_add(struct net_device *dev, int ret = -EINVAL; hard_iface = batadv_hardif_get_by_netdev(slave_dev); - if (!hard_iface || hard_iface->soft_iface) + if (!hard_iface || hard_iface->mesh_iface) goto out; ret = batadv_hardif_enable_interface(hard_iface, dev); @@ -849,13 +850,13 @@ out: } /** - * batadv_softif_slave_del() - Delete a slave iface from a batadv_soft_interface - * @dev: batadv_soft_interface used as master interface + * batadv_meshif_slave_del() - Delete a slave iface from a batadv_mesh_interface + * @dev: batadv_mesh_interface used as master interface * @slave_dev: net_device which should be removed from the master interface * * Return: 0 if successful or error otherwise. */ -static int batadv_softif_slave_del(struct net_device *dev, +static int batadv_meshif_slave_del(struct net_device *dev, struct net_device *slave_dev) { struct batadv_hard_iface *hard_iface; @@ -863,7 +864,7 @@ static int batadv_softif_slave_del(struct net_device *dev, hard_iface = batadv_hardif_get_by_netdev(slave_dev); - if (!hard_iface || hard_iface->soft_iface != dev) + if (!hard_iface || hard_iface->mesh_iface != dev) goto out; batadv_hardif_disable_interface(hard_iface); @@ -875,9 +876,7 @@ out: } static const struct net_device_ops batadv_netdev_ops = { - .ndo_init = batadv_softif_init_late, - .ndo_open = batadv_interface_open, - .ndo_stop = batadv_interface_release, + .ndo_init = batadv_meshif_init_late, .ndo_get_stats = batadv_interface_stats, .ndo_vlan_rx_add_vid = batadv_interface_add_vid, .ndo_vlan_rx_kill_vid = batadv_interface_kill_vid, @@ -886,8 +885,8 @@ static const struct net_device_ops batadv_netdev_ops = { .ndo_set_rx_mode = batadv_interface_set_rx_mode, .ndo_start_xmit = batadv_interface_tx, .ndo_validate_addr = eth_validate_addr, - .ndo_add_slave = batadv_softif_slave_add, - .ndo_del_slave = batadv_softif_slave_del, + .ndo_add_slave = batadv_meshif_slave_add, + .ndo_del_slave = batadv_meshif_slave_del, }; static void batadv_get_drvinfo(struct net_device *dev, @@ -995,10 +994,10 @@ static const struct ethtool_ops batadv_ethtool_ops = { }; /** - * batadv_softif_free() - Deconstructor of batadv_soft_interface + * batadv_meshif_free() - Deconstructor of batadv_mesh_interface * @dev: Device to cleanup and remove */ -static void batadv_softif_free(struct net_device *dev) +static void batadv_meshif_free(struct net_device *dev) { batadv_mesh_free(dev); @@ -1010,25 +1009,26 @@ static void batadv_softif_free(struct net_device *dev) } /** - * batadv_softif_init_early() - early stage initialization of soft interface + * batadv_meshif_init_early() - early stage initialization of mesh interface * @dev: registered network device to modify */ -static void batadv_softif_init_early(struct net_device *dev) +static void batadv_meshif_init_early(struct net_device *dev) { ether_setup(dev); dev->netdev_ops = &batadv_netdev_ops; dev->needs_free_netdev = true; - dev->priv_destructor = batadv_softif_free; + dev->priv_destructor = batadv_meshif_free; 
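
The VID 0 changes in batadv_get_vid() and the add_vid/kill_vid callbacks earlier in this patch implement one rule: a priority tag (an 802.1Q TCI with VID 0) carries PCP bits but no VLAN membership, so such frames are folded into the untagged case and never get a meshif_vlan of their own. A minimal sketch of the extraction rule, with an illustrative helper name:

    #include <linux/if_vlan.h>

    /* Extract the batman-adv VID from an 802.1Q header: VID 0 is a
     * "priority tag" and is treated like an untagged frame
     * (BATADV_NO_FLAGS); any real VID is marked with
     * BATADV_VLAN_HAS_TAG, as in batadv_get_vid().
     */
    static unsigned short sketch_frame_vid(const struct vlan_ethhdr *vhdr)
    {
            unsigned short vid = ntohs(vhdr->h_vlan_TCI) & VLAN_VID_MASK;

            if (vid == 0)
                    return BATADV_NO_FLAGS;

            return vid | BATADV_VLAN_HAS_TAG;
    }
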
dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; - dev->netns_local = true; + dev->netns_immutable = true; /* can't call min_mtu, because the needed variables * have not been initialized yet */ dev->mtu = ETH_DATA_LEN; + dev->max_mtu = BATADV_MAX_MTU; /* generate random address */ eth_hw_addr_random(dev); @@ -1037,14 +1037,14 @@ static void batadv_softif_init_early(struct net_device *dev) } /** - * batadv_softif_validate() - validate configuration of new batadv link + * batadv_meshif_validate() - validate configuration of new batadv link * @tb: IFLA_INFO_DATA netlink attributes * @data: enum batadv_ifla_attrs attributes * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ -static int batadv_softif_validate(struct nlattr *tb[], struct nlattr *data[], +static int batadv_meshif_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct batadv_algo_ops *algo_ops; @@ -1062,20 +1062,19 @@ static int batadv_softif_validate(struct nlattr *tb[], struct nlattr *data[], } /** - * batadv_softif_newlink() - pre-initialize and register new batadv link - * @src_net: the applicable net namespace + * batadv_meshif_newlink() - pre-initialize and register new batadv link * @dev: network device to register - * @tb: IFLA_INFO_DATA netlink attributes - * @data: enum batadv_ifla_attrs attributes + * @params: rtnl newlink parameters * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ -static int batadv_softif_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int batadv_meshif_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct batadv_priv *bat_priv = netdev_priv(dev); + struct nlattr **data = params->data; const char *algo_name; int err; @@ -1090,40 +1089,40 @@ static int batadv_softif_newlink(struct net *src_net, struct net_device *dev, } /** - * batadv_softif_destroy_netlink() - deletion of batadv_soft_interface via + * batadv_meshif_destroy_netlink() - deletion of batadv_mesh_interface via * netlink - * @soft_iface: the to-be-removed batman-adv interface + * @mesh_iface: the to-be-removed batman-adv interface * @head: list pointer */ -static void batadv_softif_destroy_netlink(struct net_device *soft_iface, +static void batadv_meshif_destroy_netlink(struct net_device *mesh_iface, struct list_head *head) { - struct batadv_priv *bat_priv = netdev_priv(soft_iface); + struct batadv_priv *bat_priv = netdev_priv(mesh_iface); struct batadv_hard_iface *hard_iface; - struct batadv_softif_vlan *vlan; + struct batadv_meshif_vlan *vlan; list_for_each_entry(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->soft_iface == soft_iface) + if (hard_iface->mesh_iface == mesh_iface) batadv_hardif_disable_interface(hard_iface); } /* destroy the "untagged" VLAN */ - vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); + vlan = batadv_meshif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (vlan) { - batadv_softif_destroy_vlan(bat_priv, vlan); - batadv_softif_vlan_put(vlan); + batadv_meshif_destroy_vlan(bat_priv, vlan); + batadv_meshif_vlan_put(vlan); } - unregister_netdevice_queue(soft_iface, head); + unregister_netdevice_queue(mesh_iface, head); } /** - * batadv_softif_is_valid() - Check whether device is a batadv soft interface + * batadv_meshif_is_valid() - Check whether device is a batadv mesh interface * @net_dev: device which should be checked * * Return: 
true when net_dev is a batman-adv interface, false otherwise */ -bool batadv_softif_is_valid(const struct net_device *net_dev) +bool batadv_meshif_is_valid(const struct net_device *net_dev) { if (net_dev->netdev_ops->ndo_start_xmit == batadv_interface_tx) return true; @@ -1138,10 +1137,10 @@ static const struct nla_policy batadv_ifla_policy[IFLA_BATADV_MAX + 1] = { struct rtnl_link_ops batadv_link_ops __read_mostly = { .kind = "batadv", .priv_size = sizeof(struct batadv_priv), - .setup = batadv_softif_init_early, + .setup = batadv_meshif_init_early, .maxtype = IFLA_BATADV_MAX, .policy = batadv_ifla_policy, - .validate = batadv_softif_validate, - .newlink = batadv_softif_newlink, - .dellink = batadv_softif_destroy_netlink, + .validate = batadv_meshif_validate, + .newlink = batadv_meshif_newlink, + .dellink = batadv_meshif_destroy_netlink, }; diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/mesh-interface.h index 9f2003f1a497..7ba055b2bc26 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/mesh-interface.h @@ -4,8 +4,8 @@ * Marek Lindner */ -#ifndef _NET_BATMAN_ADV_SOFT_INTERFACE_H_ -#define _NET_BATMAN_ADV_SOFT_INTERFACE_H_ +#ifndef _NET_BATMAN_ADV_MESH_INTERFACE_H_ +#define _NET_BATMAN_ADV_MESH_INTERFACE_H_ #include "main.h" @@ -16,27 +16,27 @@ #include <net/rtnetlink.h> int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); -void batadv_interface_rx(struct net_device *soft_iface, +void batadv_interface_rx(struct net_device *mesh_iface, struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node); -bool batadv_softif_is_valid(const struct net_device *net_dev); +bool batadv_meshif_is_valid(const struct net_device *net_dev); extern struct rtnl_link_ops batadv_link_ops; -int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); -void batadv_softif_vlan_release(struct kref *ref); -struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, +int batadv_meshif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); +void batadv_meshif_vlan_release(struct kref *ref); +struct batadv_meshif_vlan *batadv_meshif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid); /** - * batadv_softif_vlan_put() - decrease the vlan object refcounter and + * batadv_meshif_vlan_put() - decrease the vlan object refcounter and * possibly release it * @vlan: the vlan object to release */ -static inline void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan) +static inline void batadv_meshif_vlan_put(struct batadv_meshif_vlan *vlan) { if (!vlan) return; - kref_put(&vlan->refcount, batadv_softif_vlan_release); + kref_put(&vlan->refcount, batadv_meshif_vlan_release); } -#endif /* _NET_BATMAN_ADV_SOFT_INTERFACE_H_ */ +#endif /* _NET_BATMAN_ADV_MESH_INTERFACE_H_ */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 14088c4ff2f6..5786680aff30 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -12,6 +12,7 @@ #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> +#include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> @@ -46,7 +47,6 @@ #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> -#include <net/sock.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> @@ -56,7 +56,6 @@ #include "log.h" #include "netlink.h" #include "send.h" -#include "soft-interface.h" #include "translation-table.h" #include "tvlv.h" @@ -64,7 +63,7 @@ static void 
batadv_mcast_mla_update(struct work_struct *work); /** * batadv_mcast_start_timer() - schedule the multicast periodic worker - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_mcast_start_timer(struct batadv_priv *bat_priv) { @@ -73,18 +72,18 @@ static void batadv_mcast_start_timer(struct batadv_priv *bat_priv) } /** - * batadv_mcast_get_bridge() - get the bridge on top of the softif if it exists - * @soft_iface: netdev struct of the mesh interface + * batadv_mcast_get_bridge() - get the bridge on top of the meshif if it exists + * @mesh_iface: netdev struct of the mesh interface * - * If the given soft interface has a bridge on top then the refcount + * If the given mesh interface has a bridge on top then the refcount * of the according net device is increased. * * Return: NULL if no such bridge exists. Otherwise the net device of the * bridge. */ -static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) +static struct net_device *batadv_mcast_get_bridge(struct net_device *mesh_iface) { - struct net_device *upper = soft_iface; + struct net_device *upper = mesh_iface; rcu_read_lock(); do { @@ -98,7 +97,7 @@ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) } /** - * batadv_mcast_mla_rtr_flags_softif_get_ipv4() - get mcast router flags from + * batadv_mcast_mla_rtr_flags_meshif_get_ipv4() - get mcast router flags from * node for IPv4 * @dev: the interface to check * @@ -108,7 +107,7 @@ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) * * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR4 otherwise. */ -static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv4(struct net_device *dev) +static u8 batadv_mcast_mla_rtr_flags_meshif_get_ipv4(struct net_device *dev) { struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -119,7 +118,7 @@ static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv4(struct net_device *dev) } /** - * batadv_mcast_mla_rtr_flags_softif_get_ipv6() - get mcast router flags from + * batadv_mcast_mla_rtr_flags_meshif_get_ipv6() - get mcast router flags from * node for IPv6 * @dev: the interface to check * @@ -130,7 +129,7 @@ static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv4(struct net_device *dev) * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR6 otherwise. 
*/ #if IS_ENABLED(CONFIG_IPV6_MROUTE) -static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev) +static u8 batadv_mcast_mla_rtr_flags_meshif_get_ipv6(struct net_device *dev) { struct inet6_dev *in6_dev = __in6_dev_get(dev); @@ -141,16 +140,16 @@ static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev) } #else static inline u8 -batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev) +batadv_mcast_mla_rtr_flags_meshif_get_ipv6(struct net_device *dev) { return BATADV_MCAST_WANT_NO_RTR6; } #endif /** - * batadv_mcast_mla_rtr_flags_softif_get() - get mcast router flags from node - * @bat_priv: the bat priv with all the soft interface information - * @bridge: bridge interface on top of the soft_iface if present, + * batadv_mcast_mla_rtr_flags_meshif_get() - get mcast router flags from node + * @bat_priv: the bat priv with all the mesh interface information + * @bridge: bridge interface on top of the mesh_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers on this @@ -162,16 +161,16 @@ batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev) * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ -static u8 batadv_mcast_mla_rtr_flags_softif_get(struct batadv_priv *bat_priv, +static u8 batadv_mcast_mla_rtr_flags_meshif_get(struct batadv_priv *bat_priv, struct net_device *bridge) { - struct net_device *dev = bridge ? bridge : bat_priv->soft_iface; + struct net_device *dev = bridge ? bridge : bat_priv->mesh_iface; u8 flags = BATADV_NO_FLAGS; rcu_read_lock(); - flags |= batadv_mcast_mla_rtr_flags_softif_get_ipv4(dev); - flags |= batadv_mcast_mla_rtr_flags_softif_get_ipv6(dev); + flags |= batadv_mcast_mla_rtr_flags_meshif_get_ipv4(dev); + flags |= batadv_mcast_mla_rtr_flags_meshif_get_ipv6(dev); rcu_read_unlock(); @@ -180,8 +179,8 @@ static u8 batadv_mcast_mla_rtr_flags_softif_get(struct batadv_priv *bat_priv, /** * batadv_mcast_mla_rtr_flags_bridge_get() - get mcast router flags from bridge - * @bat_priv: the bat priv with all the soft interface information - * @bridge: bridge interface on top of the soft_iface if present, + * @bat_priv: the bat priv with all the mesh interface information + * @bridge: bridge interface on top of the mesh_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers behind a bridge. 
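The kernel-doc in the hunks around this point also documents how the router flags combine: each per-interface helper ORs its IPv4 and IPv6 results together, while batadv_mcast_mla_rtr_flags_get() starts with both WANT_NO_RTR bits set and ANDs in the mesh-interface and bridge results, so a bit survives only if every source agrees that no such router exists. A minimal userspace sketch of that veto logic, using illustrative flag values rather than the real uapi constants:

    /* Sketch only: mirrors the AND/OR combination described in the
     * surrounding kernel-doc; flag values are stand-ins.
     */
    #include <stdio.h>

    #define WANT_NO_RTR4 (1 << 0)  /* no IPv4 multicast router present */
    #define WANT_NO_RTR6 (1 << 1)  /* no IPv6 multicast router present */

    static unsigned char rtr_flags_get(unsigned char meshif_flags,
                                       unsigned char bridge_flags)
    {
        /* start optimistic: assume no router anywhere ... */
        unsigned char flags = WANT_NO_RTR4 | WANT_NO_RTR6;

        /* ... and let each source veto what it cannot guarantee */
        flags &= meshif_flags;
        flags &= bridge_flags;

        return flags;
    }

    int main(void)
    {
        /* meshif sees no routers; the bridge reports an IPv4 router */
        printf("%#x\n", rtr_flags_get(WANT_NO_RTR4 | WANT_NO_RTR6,
                                      WANT_NO_RTR6)); /* -> WANT_NO_RTR6 */
        return 0;
    }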
@@ -195,7 +194,7 @@ static u8 batadv_mcast_mla_rtr_flags_softif_get(struct batadv_priv *bat_priv, static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, struct net_device *bridge) { - struct net_device *dev = bat_priv->soft_iface; + struct net_device *dev = bat_priv->mesh_iface; u8 flags = BATADV_NO_FLAGS; if (!bridge) @@ -211,8 +210,8 @@ static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, /** * batadv_mcast_mla_rtr_flags_get() - get multicast router flags - * @bat_priv: the bat priv with all the soft interface information - * @bridge: bridge interface on top of the soft_iface if present, + * @bat_priv: the bat priv with all the mesh interface information + * @bridge: bridge interface on top of the mesh_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers on this @@ -229,7 +228,7 @@ static u8 batadv_mcast_mla_rtr_flags_get(struct batadv_priv *bat_priv, { u8 flags = BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; - flags &= batadv_mcast_mla_rtr_flags_softif_get(bat_priv, bridge); + flags &= batadv_mcast_mla_rtr_flags_meshif_get(bat_priv, bridge); flags &= batadv_mcast_mla_rtr_flags_bridge_get(bat_priv, bridge); return flags; @@ -237,7 +236,7 @@ static u8 batadv_mcast_mla_rtr_flags_get(struct batadv_priv *bat_priv, /** * batadv_mcast_mla_forw_flags_get() - get multicast forwarding flags - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Checks if all active hard interfaces have an MTU larger or equal to 1280 * bytes (IPv6 minimum MTU). @@ -253,7 +252,7 @@ static u8 batadv_mcast_mla_forw_flags_get(struct batadv_priv *bat_priv) if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; - if (hard_iface->soft_iface != bat_priv->soft_iface) + if (hard_iface->mesh_iface != bat_priv->mesh_iface) continue; if (hard_iface->net_dev->mtu < IPV6_MIN_MTU) { @@ -268,7 +267,7 @@ static u8 batadv_mcast_mla_forw_flags_get(struct batadv_priv *bat_priv) /** * batadv_mcast_mla_flags_get() - get the new multicast flags - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Return: A set of flags for the current/next TVLV, querier and * bridge state. @@ -276,7 +275,7 @@ static u8 batadv_mcast_mla_forw_flags_get(struct batadv_priv *bat_priv) static struct batadv_mcast_mla_flags batadv_mcast_mla_flags_get(struct batadv_priv *bat_priv) { - struct net_device *dev = bat_priv->soft_iface; + struct net_device *dev = bat_priv->mesh_iface; struct batadv_mcast_querier_state *qr4, *qr6; struct batadv_mcast_mla_flags mla_flags; struct net_device *bridge; @@ -352,13 +351,13 @@ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, } /** - * batadv_mcast_mla_softif_get_ipv4() - get softif IPv4 multicast listeners + * batadv_mcast_mla_meshif_get_ipv4() - get meshif IPv4 multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of IPv4 multicast listeners residing - * on this kernel on the given soft interface, dev, in + * on this kernel on the given mesh interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. 
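The batadv_mcast_mla_meshif_get_ipv4/6() helpers renamed in the following hunks fill mcast_list while batadv_mcast_mla_is_duplicate() keeps already-queued addresses out. A self-contained sketch of that collect-with-dedup pattern, with stand-in types instead of the kernel's hlist machinery:

    /* Sketch only: append each source address to mcast_list unless an
     * equal entry is already queued; types and names are stand-ins.
     */
    #include <stdbool.h>
    #include <stdio.h>
    #include <string.h>

    #define ETH_ALEN 6

    struct mcast_entry {
        unsigned char addr[ETH_ALEN];
        struct mcast_entry *next;
    };

    static bool is_duplicate(const unsigned char *addr,
                             const struct mcast_entry *list)
    {
        for (; list; list = list->next)
            if (!memcmp(list->addr, addr, ETH_ALEN))
                return true;
        return false;
    }

    static int collect(const unsigned char (*src)[ETH_ALEN], int n,
                       struct mcast_entry *pool, struct mcast_entry **list)
    {
        int added = 0;

        for (int i = 0; i < n; i++) {
            if (is_duplicate(src[i], *list))
                continue;                  /* already queued, skip */
            memcpy(pool[added].addr, src[i], ETH_ALEN);
            pool[added].next = *list;      /* push onto mcast_list */
            *list = &pool[added++];
        }
        return added;                      /* number of new entries */
    }

    int main(void)
    {
        const unsigned char src[3][ETH_ALEN] = {
            { 0x33, 0x33, 0x00, 0x00, 0x00, 0x01 },
            { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 },
            { 0x33, 0x33, 0x00, 0x00, 0x00, 0x01 },  /* duplicate */
        };
        struct mcast_entry pool[3];
        struct mcast_entry *list = NULL;

        printf("%d\n", collect(src, 3, pool, &list));  /* prints 2 */
        return 0;
    }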
* @@ -366,7 +365,7 @@ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, * items added to the mcast_list otherwise. */ static int -batadv_mcast_mla_softif_get_ipv4(struct net_device *dev, +batadv_mcast_mla_meshif_get_ipv4(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { @@ -418,13 +417,13 @@ batadv_mcast_mla_softif_get_ipv4(struct net_device *dev, } /** - * batadv_mcast_mla_softif_get_ipv6() - get softif IPv6 multicast listeners + * batadv_mcast_mla_meshif_get_ipv6() - get meshif IPv6 multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of IPv6 multicast listeners residing - * on this kernel on the given soft interface, dev, in + * on this kernel on the given mesh interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * @@ -433,7 +432,7 @@ batadv_mcast_mla_softif_get_ipv4(struct net_device *dev, */ #if IS_ENABLED(CONFIG_IPV6) static int -batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, +batadv_mcast_mla_meshif_get_ipv6(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { @@ -491,7 +490,7 @@ batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, } #else static inline int -batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, +batadv_mcast_mla_meshif_get_ipv6(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { @@ -500,13 +499,13 @@ batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, #endif /** - * batadv_mcast_mla_softif_get() - get softif multicast listeners + * batadv_mcast_mla_meshif_get() - get meshif multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of multicast listeners residing - * on this kernel on the given soft interface, dev, in + * on this kernel on the given mesh interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * @@ -519,7 +518,7 @@ batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, * items added to the mcast_list otherwise. */ static int -batadv_mcast_mla_softif_get(struct net_device *dev, +batadv_mcast_mla_meshif_get(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { @@ -529,11 +528,11 @@ batadv_mcast_mla_softif_get(struct net_device *dev, if (bridge) dev = bridge; - ret4 = batadv_mcast_mla_softif_get_ipv4(dev, mcast_list, flags); + ret4 = batadv_mcast_mla_meshif_get_ipv4(dev, mcast_list, flags); if (ret4 < 0) goto out; - ret6 = batadv_mcast_mla_softif_get_ipv6(dev, mcast_list, flags); + ret6 = batadv_mcast_mla_meshif_get_ipv6(dev, mcast_list, flags); if (ret6 < 0) { ret4 = 0; goto out; @@ -577,7 +576,7 @@ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) * * Collects multicast addresses of multicast listeners residing * on foreign, non-mesh devices which we gave access to our mesh via - * a bridge on top of the given soft interface, dev, in the given + * a bridge on top of the given mesh interface, dev, in the given * mcast_list. 
* * Return: -ENOMEM on memory allocation error or the number of @@ -673,7 +672,7 @@ static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list) /** * batadv_mcast_mla_tt_retract() - clean up multicast listener announcements - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @mcast_list: a list of addresses which should _not_ be removed * * Retracts the announcement of any multicast listener from the @@ -705,7 +704,7 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv, /** * batadv_mcast_mla_tt_add() - add multicast listener announcements - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @mcast_list: a list of addresses which are going to get added * * Adds multicast listener announcements from the given mcast_list to the @@ -725,7 +724,7 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, &bat_priv->mcast.mla_list)) continue; - if (!batadv_tt_local_add(bat_priv->soft_iface, + if (!batadv_tt_local_add(bat_priv->mesh_iface, mcast_entry->addr, BATADV_NO_FLAGS, BATADV_NULL_IFINDEX, BATADV_NO_MARK)) continue; @@ -738,7 +737,7 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, /** * batadv_mcast_querier_log() - debug output regarding the querier status on * link - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @str_proto: a string for the querier protocol (e.g. "IGMP" or "MLD") * @old_state: the previous querier state on our link * @new_state: the new querier state on our link @@ -755,7 +754,7 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, * potentially shadowing listeners from us then. * * This is only interesting for nodes with a bridge on top of their - * soft interface. + * mesh interface. */ static void batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, @@ -763,14 +762,14 @@ batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, struct batadv_mcast_querier_state *new_state) { if (!old_state->exists && new_state->exists) - batadv_info(bat_priv->soft_iface, "%s Querier appeared\n", + batadv_info(bat_priv->mesh_iface, "%s Querier appeared\n", str_proto); else if (old_state->exists && !new_state->exists) - batadv_info(bat_priv->soft_iface, + batadv_info(bat_priv->mesh_iface, "%s Querier disappeared - multicast optimizations disabled\n", str_proto); else if (!bat_priv->mcast.mla_flags.bridged && !new_state->exists) - batadv_info(bat_priv->soft_iface, + batadv_info(bat_priv->mesh_iface, "No %s Querier present - multicast optimizations disabled\n", str_proto); @@ -790,7 +789,7 @@ batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, /** * batadv_mcast_bridge_log() - debug output for topology changes in bridged * setups - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @new_flags: flags indicating the new multicast state * * If no bridges are ever used on this node, then this function does nothing. @@ -799,7 +798,7 @@ batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, * which might be relevant to our multicast optimizations. * * More precisely, it outputs information when a bridge interface is added or - * removed from a soft interface. 
And when a bridge is present, it further + * removed from a mesh interface. And when a bridge is present, it further * outputs information about the querier state which is relevant for the * multicast flags this node is going to set. */ @@ -828,7 +827,7 @@ batadv_mcast_bridge_log(struct batadv_priv *bat_priv, /** * batadv_mcast_flags_log() - output debug information about mcast flag changes - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @flags: TVLV flags indicating the new multicast state * * Whenever the multicast TVLV flags this node announces change, this function @@ -861,7 +860,7 @@ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) /** * batadv_mcast_mla_flags_update() - update multicast flags - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @flags: flags indicating the new multicast state * * Updates the own multicast tvlv with our current multicast related settings, @@ -890,7 +889,7 @@ batadv_mcast_mla_flags_update(struct batadv_priv *bat_priv, /** * __batadv_mcast_mla_update() - update the own MLAs - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Updates the own multicast listener announcements in the translation * table as well as the own, announced multicast tvlv container. @@ -902,18 +901,18 @@ batadv_mcast_mla_flags_update(struct batadv_priv *bat_priv, */ static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv) { - struct net_device *soft_iface = bat_priv->soft_iface; + struct net_device *mesh_iface = bat_priv->mesh_iface; struct hlist_head mcast_list = HLIST_HEAD_INIT; struct batadv_mcast_mla_flags flags; int ret; flags = batadv_mcast_mla_flags_get(bat_priv); - ret = batadv_mcast_mla_softif_get(soft_iface, &mcast_list, &flags); + ret = batadv_mcast_mla_meshif_get(mesh_iface, &mcast_list, &flags); if (ret < 0) goto out; - ret = batadv_mcast_mla_bridge_get(soft_iface, &mcast_list, &flags); + ret = batadv_mcast_mla_bridge_get(mesh_iface, &mcast_list, &flags); if (ret < 0) goto out; @@ -978,7 +977,7 @@ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb) /** * batadv_mcast_forw_mode_check_ipv4() - check for optimized forwarding * potential - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the IPv4 packet to check * @is_unsnoopable: stores whether the destination is snoopable * @is_routable: stores whether the destination is routable @@ -1043,7 +1042,7 @@ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb) /** * batadv_mcast_forw_mode_check_ipv6() - check for optimized forwarding * potential - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the IPv6 packet to check * @is_unsnoopable: stores whether the destination is snoopable * @is_routable: stores whether the destination is routable @@ -1085,7 +1084,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, /** * batadv_mcast_forw_mode_check() - check for optimized forwarding potential - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast frame to check * @is_unsnoopable: stores whether the destination is 
snoopable * @is_routable: stores whether the destination is routable @@ -1125,7 +1124,7 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, /** * batadv_mcast_forw_want_all_ip_count() - count nodes with unspecific mcast * interest - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @ethhdr: ethernet header of a packet * * Return: the number of nodes which want all IPv4 multicast traffic if the @@ -1148,7 +1147,7 @@ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv, /** * batadv_mcast_forw_rtr_count() - count nodes with a multicast router - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @protocol: the ethernet protocol type to count multicast routers for * * Return: the number of nodes which want all routable IPv4 multicast traffic @@ -1171,7 +1170,7 @@ static int batadv_mcast_forw_rtr_count(struct batadv_priv *bat_priv, /** * batadv_mcast_forw_mode_by_count() - get forwarding mode by count - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to check * @vid: the vlan identifier * @is_routable: stores whether the destination is routable @@ -1215,7 +1214,7 @@ batadv_mcast_forw_mode_by_count(struct batadv_priv *bat_priv, /** * batadv_mcast_forw_mode() - check on how to forward a multicast packet - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to check * @vid: the vlan identifier * @is_routable: stores whether the destination is routable @@ -1260,7 +1259,7 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, /** * batadv_mcast_forw_send_orig() - send a multicast packet to an originator - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to send * @vid: the vlan identifier * @orig_node: the originator to send the packet to @@ -1289,7 +1288,7 @@ static int batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, /** * batadv_mcast_forw_tt() - forwards a packet to multicast listeners - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * @@ -1337,7 +1336,7 @@ out: /** * batadv_mcast_forw_want_all_ipv4() - forward to nodes with want-all-ipv4 - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * @@ -1374,7 +1373,7 @@ batadv_mcast_forw_want_all_ipv4(struct batadv_priv *bat_priv, /** * batadv_mcast_forw_want_all_ipv6() - forward to nodes with want-all-ipv6 - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: The multicast packet to transmit * @vid: the vlan identifier * @@ -1411,7 +1410,7 @@ batadv_mcast_forw_want_all_ipv6(struct batadv_priv *bat_priv, /** * batadv_mcast_forw_want_all() - forward packet to nodes in a want-all list - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all 
the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * @@ -1440,7 +1439,7 @@ batadv_mcast_forw_want_all(struct batadv_priv *bat_priv, /** * batadv_mcast_forw_want_all_rtr4() - forward to nodes with want-all-rtr4 - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * @@ -1477,7 +1476,7 @@ batadv_mcast_forw_want_all_rtr4(struct batadv_priv *bat_priv, /** * batadv_mcast_forw_want_all_rtr6() - forward to nodes with want-all-rtr6 - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: The multicast packet to transmit * @vid: the vlan identifier * @@ -1514,7 +1513,7 @@ batadv_mcast_forw_want_all_rtr6(struct batadv_priv *bat_priv, /** * batadv_mcast_forw_want_rtr() - forward packet to nodes in a want-all-rtr list - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * @@ -1543,7 +1542,7 @@ batadv_mcast_forw_want_rtr(struct batadv_priv *bat_priv, /** * batadv_mcast_forw_send() - send packet to any detected multicast recipient - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * @is_routable: stores whether the destination is routable @@ -1591,7 +1590,7 @@ skip_mc_router: /** * batadv_mcast_want_unsnoop_update() - update unsnoop counter and list - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * @@ -1637,7 +1636,7 @@ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, /** * batadv_mcast_want_ipv4_update() - update want-all-ipv4 counter and list - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * @@ -1682,7 +1681,7 @@ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, /** * batadv_mcast_want_ipv6_update() - update want-all-ipv6 counter and list - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * @@ -1727,7 +1726,7 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, /** * batadv_mcast_want_rtr4_update() - update want-all-rtr4 counter and list - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * @@ -1772,7 +1771,7 @@ static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv, /** * batadv_mcast_want_rtr6_update() - update want-all-rtr6 counter and list - * @bat_priv: the bat priv with all the soft interface information 
+ * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * @@ -1817,7 +1816,7 @@ static void batadv_mcast_want_rtr6_update(struct batadv_priv *bat_priv, /** * batadv_mcast_have_mc_ptype_update() - update multicast packet type counter - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * @@ -1873,7 +1872,7 @@ batadv_mcast_tvlv_flags_get(bool enabled, void *tvlv_value, u16 tvlv_value_len) /** * batadv_mcast_tvlv_ogm_handler() - process incoming multicast tvlv container - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the multicast data @@ -1916,7 +1915,7 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, /** * batadv_mcast_init() - initialize the multicast optimizations structures - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ void batadv_mcast_init(struct batadv_priv *bat_priv) { @@ -1935,7 +1934,7 @@ void batadv_mcast_init(struct batadv_priv *bat_priv) /** * batadv_mcast_mesh_info_put() - put multicast info into a netlink message * @msg: buffer for the message - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 or error code. 
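The dump-side refactor further down replaces the open-coded ifindex/dev_get_by_index sequences with batadv_netlink_get_meshif(), which reports failure through the kernel's ERR_PTR encoding: one pointer return carries either the device or a negative errno. A compilable userspace approximation of that idiom (the real helpers live in include/linux/err.h):

    /* Sketch only: userspace approximation of ERR_PTR/IS_ERR/PTR_ERR
     * and of the lookup contract used by the new dump helpers.
     */
    #include <errno.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    struct mesh_iface { int ifindex; };

    static struct mesh_iface *get_meshif(int ifindex)
    {
        static struct mesh_iface dev = { .ifindex = 42 };

        if (!ifindex)
            return ERR_PTR(-ENONET);   /* attribute missing */
        if (ifindex != dev.ifindex)
            return ERR_PTR(-ENODEV);   /* no such device */
        return &dev;
    }

    int main(void)
    {
        struct mesh_iface *m = get_meshif(0);

        if (IS_ERR(m))
            printf("error: %ld\n", PTR_ERR(m));  /* -ENONET */
        return 0;
    }

This is why the dump callbacks in the hunks below can collapse their separate -EINVAL and -ENODEV branches into a single IS_ERR() check followed by PTR_ERR().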
*/ @@ -2061,7 +2060,7 @@ skip: * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @bucket: current bucket to dump * @idx: index in current bucket to the next entry to dump * @@ -2104,23 +2103,15 @@ batadv_mcast_netlink_get_primary(struct netlink_callback *cb, struct batadv_hard_iface **primary_if) { struct batadv_hard_iface *hard_iface = NULL; - struct net *net = sock_net(cb->skb->sk); - struct net_device *soft_iface; + struct net_device *mesh_iface; struct batadv_priv *bat_priv; - int ifindex; int ret = 0; - ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); - if (!ifindex) - return -EINVAL; - - soft_iface = dev_get_by_index(net, ifindex); - if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { - ret = -ENODEV; - goto out; - } + mesh_iface = batadv_netlink_get_meshif(cb); + if (IS_ERR(mesh_iface)) + return PTR_ERR(mesh_iface); - bat_priv = netdev_priv(soft_iface); + bat_priv = netdev_priv(mesh_iface); hard_iface = batadv_primary_if_get_selected(bat_priv); if (!hard_iface || hard_iface->if_status != BATADV_IF_ACTIVE) { @@ -2129,7 +2120,7 @@ batadv_mcast_netlink_get_primary(struct netlink_callback *cb, } out: - dev_put(soft_iface); + dev_put(mesh_iface); if (!ret && primary_if) *primary_if = hard_iface; @@ -2159,7 +2150,7 @@ int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb) if (ret) return ret; - bat_priv = netdev_priv(primary_if->soft_iface); + bat_priv = netdev_priv(primary_if->mesh_iface); ret = __batadv_mcast_flags_dump(msg, portid, cb, bat_priv, bucket, idx); batadv_hardif_put(primary_if); @@ -2168,7 +2159,7 @@ int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb) /** * batadv_mcast_free() - free the multicast optimizations structures - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ void batadv_mcast_free(struct batadv_priv *bat_priv) { diff --git a/net/batman-adv/multicast_forw.c b/net/batman-adv/multicast_forw.c index fafd6ba8c056..b8668a80b94a 100644 --- a/net/batman-adv/multicast_forw.c +++ b/net/batman-adv/multicast_forw.c @@ -131,7 +131,7 @@ batadv_mcast_forw_orig_entry(struct hlist_node *node, /** * batadv_mcast_forw_push_dest() - push an originator MAC address onto an skb - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the skb to push the destination address onto * @vid: the vlan identifier * @orig_node: the originator node to get the MAC address from @@ -174,7 +174,7 @@ static bool batadv_mcast_forw_push_dest(struct batadv_priv *bat_priv, /** * batadv_mcast_forw_push_dests_list() - push originators from list onto an skb - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the skb to push the destination addresses onto * @vid: the vlan identifier * @head: the list to gather originators from @@ -215,7 +215,7 @@ static int batadv_mcast_forw_push_dests_list(struct batadv_priv *bat_priv, /** * batadv_mcast_forw_push_tt() - push originators with interest through TT - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the skb to push the destination 
addresses onto * @vid: the vlan identifier * @num_dests: a pointer to store the number of pushed addresses in @@ -262,7 +262,7 @@ out: /** * batadv_mcast_forw_push_want_all() - push originators with want-all flag - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the skb to push the destination addresses onto * @vid: the vlan identifier * @num_dests: a pointer to store the number of pushed addresses in @@ -308,7 +308,7 @@ static bool batadv_mcast_forw_push_want_all(struct batadv_priv *bat_priv, /** * batadv_mcast_forw_push_want_rtr() - push originators with want-router flag - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the skb to push the destination addresses onto * @vid: the vlan identifier * @num_dests: a pointer to store the number of pushed addresses in @@ -475,7 +475,7 @@ out: /** * batadv_mcast_forw_push_dests() - push originator addresses onto an skb - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the skb to push the destination addresses onto * @vid: the vlan identifier * @is_routable: indicates whether the destination is routable @@ -567,7 +567,7 @@ static int batadv_mcast_forw_push_tracker(struct sk_buff *skb, int num_dests, /** * batadv_mcast_forw_push_tvlvs() - push a multicast tracker TVLV onto an skb - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the skb to push the tracker TVLV onto * @vid: the vlan identifier * @is_routable: indicates whether the destination is routable @@ -634,7 +634,7 @@ batadv_mcast_forw_push_hdr(struct sk_buff *skb, unsigned short tvlv_len) /** * batadv_mcast_forw_scrub_dests() - scrub destinations in a tracker TVLV - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @comp_neigh: next hop neighbor to scrub+collect destinations for * @dest: start MAC entry in original skb's tracker TVLV * @next_dest: start MAC entry in to be sent skb's tracker TVLV @@ -905,7 +905,7 @@ static void batadv_mcast_forw_shrink_tracker(struct sk_buff *skb) /** * batadv_mcast_forw_packet() - forward a batman-adv multicast packet - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the received or locally generated batman-adv multicast packet * @local_xmit: indicates that the packet was locally generated and not received * @@ -920,7 +920,7 @@ static void batadv_mcast_forw_shrink_tracker(struct sk_buff *skb) * * Return: NET_RX_SUCCESS or NET_RX_DROP on success or a negative error * code on failure. NET_RX_SUCCESS if the received packet is supposed to be - * decapsulated and forwarded to the own soft interface, NET_RX_DROP otherwise. + * decapsulated and forwarded to the own mesh interface, NET_RX_DROP otherwise. 
*/ static int batadv_mcast_forw_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, bool local_xmit) @@ -1028,7 +1028,7 @@ static int batadv_mcast_forw_packet(struct batadv_priv *bat_priv, /** * batadv_mcast_forw_tracker_tvlv_handler() - handle an mcast tracker tvlv - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the received batman-adv multicast packet * * Parses the tracker TVLV of an incoming batman-adv multicast packet and @@ -1042,7 +1042,7 @@ static int batadv_mcast_forw_packet(struct batadv_priv *bat_priv, * * Return: NET_RX_SUCCESS or NET_RX_DROP on success or a negative error * code on failure. NET_RX_SUCCESS if the received packet is supposed to be - * decapsulated and forwarded to the own soft interface, NET_RX_DROP otherwise. + * decapsulated and forwarded to the own mesh interface, NET_RX_DROP otherwise. */ int batadv_mcast_forw_tracker_tvlv_handler(struct batadv_priv *bat_priv, struct sk_buff *skb) @@ -1075,7 +1075,7 @@ unsigned int batadv_mcast_forw_packet_hdrlen(unsigned int num_dests) /** * batadv_mcast_forw_expand_head() - expand headroom for an mcast packet - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to send * * Tries to expand an skb's headroom so that its head to tail is 1298 @@ -1110,7 +1110,7 @@ static int batadv_mcast_forw_expand_head(struct batadv_priv *bat_priv, /** * batadv_mcast_forw_push() - encapsulate skb in a batman-adv multicast packet - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to encapsulate and send * @vid: the vlan identifier * @is_routable: indicates whether the destination is routable @@ -1154,7 +1154,7 @@ err: /** * batadv_mcast_forw_mcsend() - send a self prepared batman-adv multicast packet - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the multicast packet to encapsulate and send * * Transmits a batman-adv multicast packet that was locally prepared and diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 9362cd9d6f3d..e7c8f9f2bb1f 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -43,10 +43,10 @@ #include "gateway_common.h" #include "hard-interface.h" #include "log.h" +#include "mesh-interface.h" #include "multicast.h" #include "network-coding.h" #include "originator.h" -#include "soft-interface.h" #include "tp_meter.h" #include "translation-table.h" @@ -63,7 +63,7 @@ enum batadv_netlink_multicast_groups { */ enum batadv_genl_ops_flags { /** - * @BATADV_FLAG_NEED_MESH: request requires valid soft interface in + * @BATADV_FLAG_NEED_MESH: request requires valid mesh interface in * attribute BATADV_ATTR_MESH_IFINDEX and expects a pointer to it to be * saved in info->user_ptr[0] */ @@ -158,8 +158,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { * * Return: interface index, or 0. 
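The netlink.c hunk below makes batadv_netlink_get_ifindex() static but keeps its "interface index, or 0" contract: an absent attribute maps to 0, so callers can treat 0 as "not given" without a separate error path. A tiny sketch of that contract with mocked attribute types (the real code uses nlmsg_find_attr() and nla_get_u32()):

    /* Sketch only: mocked attribute lookup illustrating the
     * "interface index, or 0" return convention.
     */
    #include <stdio.h>

    struct nlattr { unsigned int value; };

    static const struct nlattr *find_attr(const struct nlattr *attrs[],
                                          int attrtype)
    {
        return attrs[attrtype];        /* NULL when not present */
    }

    static unsigned int get_ifindex(const struct nlattr *attrs[],
                                    int attrtype)
    {
        const struct nlattr *attr = find_attr(attrs, attrtype);

        return attr ? attr->value : 0; /* 0 == attribute missing */
    }

    int main(void)
    {
        const struct nlattr mesh = { .value = 7 };
        const struct nlattr *attrs[2] = { NULL, &mesh };

        printf("%u %u\n", get_ifindex(attrs, 0),
                          get_ifindex(attrs, 1));  /* prints "0 7" */
        return 0;
    }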
*/ -int -batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype) +static int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype) { struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype); @@ -167,24 +166,24 @@ batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype) } /** - * batadv_netlink_mesh_fill_ap_isolation() - Add ap_isolation softif attribute + * batadv_netlink_mesh_fill_ap_isolation() - Add ap_isolation meshif attribute * @msg: Netlink message to dump into - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_mesh_fill_ap_isolation(struct sk_buff *msg, struct batadv_priv *bat_priv) { - struct batadv_softif_vlan *vlan; + struct batadv_meshif_vlan *vlan; u8 ap_isolation; - vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); + vlan = batadv_meshif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (!vlan) return 0; ap_isolation = atomic_read(&vlan->ap_isolation); - batadv_softif_vlan_put(vlan); + batadv_meshif_vlan_put(vlan); return nla_put_u8(msg, BATADV_ATTR_AP_ISOLATION_ENABLED, !!ap_isolation); @@ -193,21 +192,21 @@ static int batadv_netlink_mesh_fill_ap_isolation(struct sk_buff *msg, /** * batadv_netlink_set_mesh_ap_isolation() - Set ap_isolation from genl msg * @attr: parsed BATADV_ATTR_AP_ISOLATION_ENABLED attribute - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success or negative error number in case of failure */ static int batadv_netlink_set_mesh_ap_isolation(struct nlattr *attr, struct batadv_priv *bat_priv) { - struct batadv_softif_vlan *vlan; + struct batadv_meshif_vlan *vlan; - vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); + vlan = batadv_meshif_vlan_get(bat_priv, BATADV_NO_FLAGS); if (!vlan) return -ENOENT; atomic_set(&vlan->ap_isolation, !!nla_get_u8(attr)); - batadv_softif_vlan_put(vlan); + batadv_meshif_vlan_put(vlan); return 0; } @@ -215,7 +214,7 @@ static int batadv_netlink_set_mesh_ap_isolation(struct nlattr *attr, /** * batadv_netlink_mesh_fill() - Fill message with mesh attributes * @msg: Netlink message to dump into - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @cmd: type of message to generate * @portid: Port making netlink request * @seq: sequence number for message @@ -228,7 +227,7 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, enum batadv_nl_commands cmd, u32 portid, u32 seq, int flags) { - struct net_device *soft_iface = bat_priv->soft_iface; + struct net_device *mesh_iface = bat_priv->mesh_iface; struct batadv_hard_iface *primary_if = NULL; struct net_device *hard_iface; void *hdr; @@ -240,10 +239,10 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, if (nla_put_string(msg, BATADV_ATTR_VERSION, BATADV_SOURCE_VERSION) || nla_put_string(msg, BATADV_ATTR_ALGO_NAME, bat_priv->algo_ops->name) || - nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, soft_iface->ifindex) || - nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, soft_iface->name) || + nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, mesh_iface->ifindex) || + nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, mesh_iface->name) || nla_put(msg, BATADV_ATTR_MESH_ADDRESS, ETH_ALEN, - soft_iface->dev_addr) || + mesh_iface->dev_addr) || nla_put_u8(msg, BATADV_ATTR_TT_TTVN, 
(u8)atomic_read(&bat_priv->tt.vn))) goto nla_put_failure; @@ -370,8 +369,8 @@ nla_put_failure: } /** - * batadv_netlink_notify_mesh() - send softif attributes to listener - * @bat_priv: the bat priv with all the soft interface information + * batadv_netlink_notify_mesh() - send meshif attributes to listener + * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success, < 0 on error */ @@ -392,14 +391,14 @@ static int batadv_netlink_notify_mesh(struct batadv_priv *bat_priv) } genlmsg_multicast_netns(&batadv_netlink_family, - dev_net(bat_priv->soft_iface), msg, 0, + dev_net(bat_priv->mesh_iface), msg, 0, BATADV_NL_MCGRP_CONFIG, GFP_KERNEL); return 0; } /** - * batadv_netlink_get_mesh() - Get softif attributes + * batadv_netlink_get_mesh() - Get meshif attributes * @skb: Netlink message with request data * @info: receiver information * @@ -428,7 +427,7 @@ static int batadv_netlink_get_mesh(struct sk_buff *skb, struct genl_info *info) } /** - * batadv_netlink_set_mesh() - Set softif attributes + * batadv_netlink_set_mesh() - Set meshif attributes * @skb: Netlink message with request data * @info: receiver information * @@ -475,7 +474,7 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) atomic_set(&bat_priv->bridge_loop_avoidance, !!nla_get_u8(attr)); - batadv_bla_status_update(bat_priv->soft_iface); + batadv_bla_status_update(bat_priv->mesh_iface); } #endif /* CONFIG_BATMAN_ADV_BLA */ @@ -485,7 +484,7 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) atomic_set(&bat_priv->distributed_arp_table, !!nla_get_u8(attr)); - batadv_dat_status_update(bat_priv->soft_iface); + batadv_dat_status_update(bat_priv->mesh_iface); } #endif /* CONFIG_BATMAN_ADV_DAT */ @@ -495,7 +494,7 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) atomic_set(&bat_priv->fragmentation, !!nla_get_u8(attr)); rtnl_lock(); - batadv_update_min_mtu(bat_priv->soft_iface); + batadv_update_min_mtu(bat_priv->mesh_iface); rtnl_unlock(); } @@ -595,7 +594,7 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) attr = info->attrs[BATADV_ATTR_NETWORK_CODING_ENABLED]; atomic_set(&bat_priv->network_coding, !!nla_get_u8(attr)); - batadv_nc_status_update(bat_priv->soft_iface); + batadv_nc_status_update(bat_priv->mesh_iface); } #endif /* CONFIG_BATMAN_ADV_NC */ @@ -634,7 +633,7 @@ batadv_netlink_tp_meter_put(struct sk_buff *msg, u32 cookie) /** * batadv_netlink_tpmeter_notify() - send tp_meter result via netlink to client - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @dst: destination of tp_meter session * @result: reason for tp meter session stop * @test_time: total time of the tp_meter session @@ -681,7 +680,7 @@ int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst, genlmsg_end(msg, hdr); genlmsg_multicast_netns(&batadv_netlink_family, - dev_net(bat_priv->soft_iface), msg, 0, + dev_net(bat_priv->mesh_iface), msg, 0, BATADV_NL_MCGRP_TPMETER, GFP_KERNEL); return 0; @@ -779,7 +778,7 @@ batadv_netlink_tp_meter_cancel(struct sk_buff *skb, struct genl_info *info) /** * batadv_netlink_hardif_fill() - Fill message with hardif attributes * @msg: Netlink message to dump into - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @hard_iface: hard interface which was modified * @cmd: type of message to 
generate * @portid: Port making netlink request @@ -807,11 +806,11 @@ static int batadv_netlink_hardif_fill(struct sk_buff *msg, genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, - bat_priv->soft_iface->ifindex)) + bat_priv->mesh_iface->ifindex)) goto nla_put_failure; if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, - bat_priv->soft_iface->name)) + bat_priv->mesh_iface->name)) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, @@ -851,7 +850,7 @@ nla_put_failure: /** * batadv_netlink_notify_hardif() - send hardif attributes to listener - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @hard_iface: hard interface which was modified * * Return: 0 on success, < 0 on error @@ -874,21 +873,21 @@ static int batadv_netlink_notify_hardif(struct batadv_priv *bat_priv, } genlmsg_multicast_netns(&batadv_netlink_family, - dev_net(bat_priv->soft_iface), msg, 0, + dev_net(bat_priv->mesh_iface), msg, 0, BATADV_NL_MCGRP_CONFIG, GFP_KERNEL); return 0; } /** - * batadv_netlink_get_hardif() - Get hardif attributes + * batadv_netlink_cmd_get_hardif() - Get hardif attributes * @skb: Netlink message with request data * @info: receiver information * * Return: 0 on success or negative error number in case of failure */ -static int batadv_netlink_get_hardif(struct sk_buff *skb, - struct genl_info *info) +static int batadv_netlink_cmd_get_hardif(struct sk_buff *skb, + struct genl_info *info) { struct batadv_hard_iface *hard_iface = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; @@ -964,36 +963,24 @@ static int batadv_netlink_set_hardif(struct sk_buff *skb, static int batadv_netlink_dump_hardif(struct sk_buff *msg, struct netlink_callback *cb) { - struct net *net = sock_net(cb->skb->sk); - struct net_device *soft_iface; + struct net_device *mesh_iface; struct batadv_hard_iface *hard_iface; struct batadv_priv *bat_priv; - int ifindex; int portid = NETLINK_CB(cb->skb).portid; int skip = cb->args[0]; int i = 0; - ifindex = batadv_netlink_get_ifindex(cb->nlh, - BATADV_ATTR_MESH_IFINDEX); - if (!ifindex) - return -EINVAL; + mesh_iface = batadv_netlink_get_meshif(cb); + if (IS_ERR(mesh_iface)) + return PTR_ERR(mesh_iface); - soft_iface = dev_get_by_index(net, ifindex); - if (!soft_iface) - return -ENODEV; - - if (!batadv_softif_is_valid(soft_iface)) { - dev_put(soft_iface); - return -ENODEV; - } - - bat_priv = netdev_priv(soft_iface); + bat_priv = netdev_priv(mesh_iface); rtnl_lock(); cb->seq = batadv_hardif_generation << 1 | 1; list_for_each_entry(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->soft_iface != soft_iface) + if (hard_iface->mesh_iface != mesh_iface) continue; if (i++ < skip) @@ -1010,7 +997,7 @@ batadv_netlink_dump_hardif(struct sk_buff *msg, struct netlink_callback *cb) rtnl_unlock(); - dev_put(soft_iface); + dev_put(mesh_iface); cb->args[0] = i; @@ -1020,7 +1007,7 @@ batadv_netlink_dump_hardif(struct sk_buff *msg, struct netlink_callback *cb) /** * batadv_netlink_vlan_fill() - Fill message with vlan attributes * @msg: Netlink message to dump into - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @vlan: vlan which was modified * @cmd: type of message to generate * @portid: Port making netlink request @@ -1031,7 +1018,7 @@ batadv_netlink_dump_hardif(struct sk_buff *msg, struct netlink_callback *cb) */ static int batadv_netlink_vlan_fill(struct sk_buff 
*msg, struct batadv_priv *bat_priv, - struct batadv_softif_vlan *vlan, + struct batadv_meshif_vlan *vlan, enum batadv_nl_commands cmd, u32 portid, u32 seq, int flags) { @@ -1042,11 +1029,11 @@ static int batadv_netlink_vlan_fill(struct sk_buff *msg, return -ENOBUFS; if (nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, - bat_priv->soft_iface->ifindex)) + bat_priv->mesh_iface->ifindex)) goto nla_put_failure; if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, - bat_priv->soft_iface->name)) + bat_priv->mesh_iface->name)) goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_VLANID, vlan->vid & VLAN_VID_MASK)) @@ -1066,13 +1053,13 @@ nla_put_failure: /** * batadv_netlink_notify_vlan() - send vlan attributes to listener - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @vlan: vlan which was modified * * Return: 0 on success, < 0 on error */ static int batadv_netlink_notify_vlan(struct batadv_priv *bat_priv, - struct batadv_softif_vlan *vlan) + struct batadv_meshif_vlan *vlan) { struct sk_buff *msg; int ret; @@ -1089,7 +1076,7 @@ static int batadv_netlink_notify_vlan(struct batadv_priv *bat_priv, } genlmsg_multicast_netns(&batadv_netlink_family, - dev_net(bat_priv->soft_iface), msg, 0, + dev_net(bat_priv->mesh_iface), msg, 0, BATADV_NL_MCGRP_CONFIG, GFP_KERNEL); return 0; @@ -1104,7 +1091,7 @@ static int batadv_netlink_notify_vlan(struct batadv_priv *bat_priv, */ static int batadv_netlink_get_vlan(struct sk_buff *skb, struct genl_info *info) { - struct batadv_softif_vlan *vlan = info->user_ptr[1]; + struct batadv_meshif_vlan *vlan = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; struct sk_buff *msg; int ret; @@ -1134,7 +1121,7 @@ static int batadv_netlink_get_vlan(struct sk_buff *skb, struct genl_info *info) */ static int batadv_netlink_set_vlan(struct sk_buff *skb, struct genl_info *info) { - struct batadv_softif_vlan *vlan = info->user_ptr[1]; + struct batadv_meshif_vlan *vlan = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; struct nlattr *attr; @@ -1150,17 +1137,44 @@ static int batadv_netlink_set_vlan(struct sk_buff *skb, struct genl_info *info) } /** - * batadv_get_softif_from_info() - Retrieve soft interface from genl attributes + * batadv_netlink_get_meshif_from_ifindex() - Get mesh-iface from ifindex + * @net: the applicable net namespace + * @ifindex: index of the mesh interface + * + * Return: Pointer to mesh interface (with increased refcnt) on success, error + * pointer on error + */ +static struct net_device * +batadv_netlink_get_meshif_from_ifindex(struct net *net, int ifindex) +{ + struct net_device *mesh_iface; + + mesh_iface = dev_get_by_index(net, ifindex); + if (!mesh_iface) + return ERR_PTR(-ENODEV); + + if (!batadv_meshif_is_valid(mesh_iface)) + goto err_put_meshif; + + return mesh_iface; + +err_put_meshif: + dev_put(mesh_iface); + + return ERR_PTR(-EINVAL); +} + +/** + * batadv_netlink_get_meshif_from_info() - Get mesh-iface from genl attributes * @net: the applicable net namespace * @info: receiver information * - * Return: Pointer to soft interface (with increased refcnt) on success, error + * Return: Pointer to mesh interface (with increased refcnt) on success, error * pointer on error */ static struct net_device * -batadv_get_softif_from_info(struct net *net, struct genl_info *info) +batadv_netlink_get_meshif_from_info(struct net *net, struct genl_info *info) { - struct net_device *soft_iface; int ifindex; if 
(!info->attrs[BATADV_ATTR_MESH_IFINDEX]) @@ -1168,44 +1182,44 @@ batadv_get_softif_from_info(struct net *net, struct genl_info *info) ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]); - soft_iface = dev_get_by_index(net, ifindex); - if (!soft_iface) - return ERR_PTR(-ENODEV); - - if (!batadv_softif_is_valid(soft_iface)) - goto err_put_softif; - - return soft_iface; + return batadv_netlink_get_meshif_from_ifindex(net, ifindex); +} -err_put_softif: - dev_put(soft_iface); +/** + * batadv_netlink_get_meshif() - Retrieve mesh interface from netlink callback + * @cb: callback structure containing arguments + * + * Return: Pointer to mesh interface (with increased refcnt) on success, error + * pointer on error + */ +struct net_device *batadv_netlink_get_meshif(struct netlink_callback *cb) +{ + int ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_MESH_IFINDEX); + if (!ifindex) + return ERR_PTR(-ENONET); - return ERR_PTR(-EINVAL); + return batadv_netlink_get_meshif_from_ifindex(sock_net(cb->skb->sk), + ifindex); } /** - * batadv_get_hardif_from_info() - Retrieve hardif from genl attributes - * @bat_priv: the bat priv with all the soft interface information + * batadv_netlink_get_hardif_from_ifindex() - Get hard-iface from ifindex + * @bat_priv: the bat priv with all the mesh interface information * @net: the applicable net namespace - * @info: receiver information + * @ifindex: index of the hard interface * * Return: Pointer to hard interface (with increased refcnt) on success, error * pointer on error */ static struct batadv_hard_iface * -batadv_get_hardif_from_info(struct batadv_priv *bat_priv, struct net *net, - struct genl_info *info) +batadv_netlink_get_hardif_from_ifindex(struct batadv_priv *bat_priv, + struct net *net, int ifindex) { struct batadv_hard_iface *hard_iface; struct net_device *hard_dev; - unsigned int hardif_index; - - if (!info->attrs[BATADV_ATTR_HARD_IFINDEX]) - return ERR_PTR(-EINVAL); - hardif_index = nla_get_u32(info->attrs[BATADV_ATTR_HARD_IFINDEX]); - - hard_dev = dev_get_by_index(net, hardif_index); + hard_dev = dev_get_by_index(net, ifindex); if (!hard_dev) return ERR_PTR(-ENODEV); @@ -1213,7 +1227,7 @@ batadv_get_hardif_from_info(struct batadv_priv *bat_priv, struct net *net, if (!hard_iface) goto err_put_harddev; - if (hard_iface->soft_iface != bat_priv->soft_iface) + if (hard_iface->mesh_iface != bat_priv->mesh_iface) goto err_put_hardif; /* hard_dev is referenced by hard_iface and not needed here */ @@ -1230,19 +1244,64 @@ err_put_harddev: } /** + * batadv_netlink_get_hardif_from_info() - Get hard-iface from genl attributes + * @bat_priv: the bat priv with all the mesh interface information + * @net: the applicable net namespace + * @info: receiver information + * + * Return: Pointer to hard interface (with increased refcnt) on success, error + * pointer on error + */ +static struct batadv_hard_iface * +batadv_netlink_get_hardif_from_info(struct batadv_priv *bat_priv, + struct net *net, struct genl_info *info) +{ + int ifindex; + + if (!info->attrs[BATADV_ATTR_HARD_IFINDEX]) + return ERR_PTR(-EINVAL); + + ifindex = nla_get_u32(info->attrs[BATADV_ATTR_HARD_IFINDEX]); + + return batadv_netlink_get_hardif_from_ifindex(bat_priv, net, ifindex); +} + +/** + * batadv_netlink_get_hardif() - Retrieve hard interface from netlink callback + * @bat_priv: the bat priv with all the mesh interface information + * @cb: callback structure containing arguments + * + * Return: Pointer to hard interface (with increased refcnt) on success, error + * pointer on 
error + */ +struct batadv_hard_iface * +batadv_netlink_get_hardif(struct batadv_priv *bat_priv, + struct netlink_callback *cb) +{ + int ifindex = batadv_netlink_get_ifindex(cb->nlh, + BATADV_ATTR_HARD_IFINDEX); + if (!ifindex) + return ERR_PTR(-ENONET); + + return batadv_netlink_get_hardif_from_ifindex(bat_priv, + sock_net(cb->skb->sk), + ifindex); +} + +/** * batadv_get_vlan_from_info() - Retrieve vlan from genl attributes - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @net: the applicable net namespace * @info: receiver information * * Return: Pointer to vlan on success (with increased refcnt), error pointer * on error */ -static struct batadv_softif_vlan * +static struct batadv_meshif_vlan * batadv_get_vlan_from_info(struct batadv_priv *bat_priv, struct net *net, struct genl_info *info) { - struct batadv_softif_vlan *vlan; + struct batadv_meshif_vlan *vlan; u16 vid; if (!info->attrs[BATADV_ATTR_VLANID]) @@ -1250,7 +1309,7 @@ batadv_get_vlan_from_info(struct batadv_priv *bat_priv, struct net *net, vid = nla_get_u16(info->attrs[BATADV_ATTR_VLANID]); - vlan = batadv_softif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG); + vlan = batadv_meshif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG); if (!vlan) return ERR_PTR(-ENOENT); @@ -1272,8 +1331,8 @@ static int batadv_pre_doit(const struct genl_split_ops *ops, struct net *net = genl_info_net(info); struct batadv_hard_iface *hard_iface; struct batadv_priv *bat_priv = NULL; - struct batadv_softif_vlan *vlan; - struct net_device *soft_iface; + struct batadv_meshif_vlan *vlan; + struct net_device *mesh_iface; u8 user_ptr1_flags; u8 mesh_dep_flags; int ret; @@ -1288,19 +1347,20 @@ static int batadv_pre_doit(const struct genl_split_ops *ops, return -EINVAL; if (ops->internal_flags & BATADV_FLAG_NEED_MESH) { - soft_iface = batadv_get_softif_from_info(net, info); - if (IS_ERR(soft_iface)) - return PTR_ERR(soft_iface); + mesh_iface = batadv_netlink_get_meshif_from_info(net, info); + if (IS_ERR(mesh_iface)) + return PTR_ERR(mesh_iface); - bat_priv = netdev_priv(soft_iface); + bat_priv = netdev_priv(mesh_iface); info->user_ptr[0] = bat_priv; } if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF) { - hard_iface = batadv_get_hardif_from_info(bat_priv, net, info); + hard_iface = batadv_netlink_get_hardif_from_info(bat_priv, net, + info); if (IS_ERR(hard_iface)) { ret = PTR_ERR(hard_iface); - goto err_put_softif; + goto err_put_meshif; } info->user_ptr[1] = hard_iface; @@ -1310,7 +1370,7 @@ static int batadv_pre_doit(const struct genl_split_ops *ops, vlan = batadv_get_vlan_from_info(bat_priv, net, info); if (IS_ERR(vlan)) { ret = PTR_ERR(vlan); - goto err_put_softif; + goto err_put_meshif; } info->user_ptr[1] = vlan; @@ -1318,9 +1378,9 @@ static int batadv_pre_doit(const struct genl_split_ops *ops, return 0; -err_put_softif: +err_put_meshif: if (bat_priv) - dev_put(bat_priv->soft_iface); + dev_put(bat_priv->mesh_iface); return ret; } @@ -1336,7 +1396,7 @@ static void batadv_post_doit(const struct genl_split_ops *ops, struct genl_info *info) { struct batadv_hard_iface *hard_iface; - struct batadv_softif_vlan *vlan; + struct batadv_meshif_vlan *vlan; struct batadv_priv *bat_priv; if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF && @@ -1348,12 +1408,12 @@ static void batadv_post_doit(const struct genl_split_ops *ops, if (ops->internal_flags & BATADV_FLAG_NEED_VLAN && info->user_ptr[1]) { vlan = info->user_ptr[1]; - batadv_softif_vlan_put(vlan); + 
batadv_meshif_vlan_put(vlan); } if (ops->internal_flags & BATADV_FLAG_NEED_MESH && info->user_ptr[0]) { bat_priv = info->user_ptr[0]; - dev_put(bat_priv->soft_iface); + dev_put(bat_priv->mesh_iface); } } @@ -1390,7 +1450,7 @@ static const struct genl_small_ops batadv_netlink_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* can be retrieved by unprivileged users */ .dumpit = batadv_netlink_dump_hardif, - .doit = batadv_netlink_get_hardif, + .doit = batadv_netlink_cmd_get_hardif, .internal_flags = BATADV_FLAG_NEED_MESH | BATADV_FLAG_NEED_HARDIF, }, @@ -1507,7 +1567,7 @@ void __init batadv_netlink_register(void) ret = genl_register_family(&batadv_netlink_family); if (ret) - pr_warn("unable to register netlink family"); + pr_warn("unable to register netlink family\n"); } /** diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index 876d2806a67d..fe4548b974bb 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -15,7 +15,10 @@ void batadv_netlink_register(void); void batadv_netlink_unregister(void); -int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype); +struct net_device *batadv_netlink_get_meshif(struct netlink_callback *cb); +struct batadv_hard_iface * +batadv_netlink_get_hardif(struct batadv_priv *bat_priv, + struct netlink_callback *cb); int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst, u8 result, u32 test_time, u64 total_bytes, diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 71ebd0284f95..9f56308779cc 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -65,7 +65,7 @@ int __init batadv_nc_init(void) /** * batadv_nc_start_timer() - initialise the nc periodic worker - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_nc_start_timer(struct batadv_priv *bat_priv) { @@ -76,7 +76,7 @@ static void batadv_nc_start_timer(struct batadv_priv *bat_priv) /** * batadv_nc_tvlv_container_update() - update the network coding tvlv container * after network coding setting change - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_nc_tvlv_container_update(struct batadv_priv *bat_priv) { @@ -98,7 +98,7 @@ static void batadv_nc_tvlv_container_update(struct batadv_priv *bat_priv) /** * batadv_nc_status_update() - update the network coding tvlv container after * network coding setting change - * @net_dev: the soft interface net device + * @net_dev: the mesh interface net device */ void batadv_nc_status_update(struct net_device *net_dev) { @@ -109,7 +109,7 @@ void batadv_nc_status_update(struct net_device *net_dev) /** * batadv_nc_tvlv_ogm_handler_v1() - process incoming nc tvlv container - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the gateway data @@ -128,7 +128,7 @@ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, /** * batadv_nc_mesh_init() - initialise coding hash table and start housekeeping - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success or negative error 
number in case of failure */ @@ -171,7 +171,7 @@ err: /** * batadv_nc_init_bat_priv() - initialise the nc specific bat_priv variables - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv) { @@ -267,7 +267,7 @@ static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet, /** * batadv_nc_to_purge_nc_node() - checks whether an nc node has to be purged - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @nc_node: the nc node to check * * Return: true if the entry has to be purged now, false otherwise @@ -283,7 +283,7 @@ static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv, /** * batadv_nc_to_purge_nc_path_coding() - checks whether an nc path has timed out - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @nc_path: the nc path to check * * Return: true if the entry has to be purged now, false otherwise @@ -304,7 +304,7 @@ static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv, /** * batadv_nc_to_purge_nc_path_decoding() - checks whether an nc path has timed * out - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @nc_path: the nc path to check * * Return: true if the entry has to be purged now, false otherwise @@ -325,7 +325,7 @@ static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv, /** * batadv_nc_purge_orig_nc_nodes() - go through list of nc nodes and purge stale * entries - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @list: list of nc nodes * @lock: nc node list lock * @to_purge: function in charge to decide whether an entry has to be purged or @@ -363,7 +363,7 @@ batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv, /** * batadv_nc_purge_orig() - purges all nc node data attached of the given * originator - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig_node with the nc node entries to be purged * @to_purge: function in charge to decide whether an entry has to be purged or * not. This function takes the nc node as argument and has to return @@ -389,7 +389,7 @@ void batadv_nc_purge_orig(struct batadv_priv *bat_priv, /** * batadv_nc_purge_orig_hash() - traverse entire originator hash to check if * they have timed out nc nodes - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv) { @@ -416,7 +416,7 @@ static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv) /** * batadv_nc_purge_paths() - traverse all nc paths part of the hash and remove * unused ones - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @hash: hash table containing the nc paths to check * @to_purge: function in charge to decide whether an entry has to be purged or * not. 
This function takes the nc node as argument and has to return @@ -579,7 +579,7 @@ static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet) /** * batadv_nc_sniffed_purge() - Checks timestamp of given sniffed nc_packet. - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @nc_path: the nc path the packet belongs to * @nc_packet: the nc packet to be checked * @@ -618,7 +618,7 @@ out: /** * batadv_nc_fwd_flush() - Checks the timestamp of the given nc packet. - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @nc_path: the nc path the packet belongs to * @nc_packet: the nc packet to be checked * @@ -657,7 +657,7 @@ static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv, /** * batadv_nc_process_nc_paths() - traverse given nc packet pool and free timed * out nc packets - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @hash: to be processed hash table * @process_fn: Function called to process given nc packet. Should return true * to encourage this function to proceed with the next packet. @@ -744,7 +744,7 @@ static void batadv_nc_worker(struct work_struct *work) /** * batadv_can_nc_with_orig() - checks whether the given orig node is suitable * for coding or not - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: neighboring orig node which may be used as nc candidate * @ogm_packet: incoming ogm packet also used for the checks * @@ -825,7 +825,7 @@ batadv_nc_find_nc_node(struct batadv_orig_node *orig_node, /** * batadv_nc_get_nc_node() - retrieves an nc node or creates the entry if it was * not found - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node originating the ogm packet * @orig_neigh_node: neighboring orig node from which we received the ogm packet * (can be equal to orig_node) @@ -888,7 +888,7 @@ unlock: /** * batadv_nc_update_nc_node() - updates stored incoming and outgoing nc node * structs (best called on incoming OGMs) - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node originating the ogm packet * @orig_neigh_node: neighboring orig node from which we received the ogm packet * (can be equal to orig_node) @@ -940,7 +940,7 @@ out: /** * batadv_nc_get_path() - get existing nc_path or allocate a new one - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @hash: hash table containing the nc path * @src: ethernet source address - first half of the nc path search key * @dst: ethernet destination address - second half of the nc path search key @@ -1032,7 +1032,7 @@ static void batadv_nc_memxor(char *dst, const char *src, unsigned int len) /** * batadv_nc_code_packets() - code a received unicast_packet with an nc packet * into a coded_packet and send it - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: data skb to forward * @ethhdr: pointer to the ethernet header inside the skb * @nc_packet: structure containing the packet to the skb 
can be coded with @@ -1245,7 +1245,7 @@ static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src) /** * batadv_nc_path_search() - Find the coding path matching in_nc_node and * out_nc_node to retrieve a buffered packet that can be used for coding. - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @in_nc_node: pointer to skb next hop's neighbor nc node * @out_nc_node: pointer to skb source's neighbor nc node * @skb: data skb to forward @@ -1313,7 +1313,7 @@ batadv_nc_path_search(struct batadv_priv *bat_priv, /** * batadv_nc_skb_src_search() - Loops through the list of neighboring nodes of * the skb's sender (may be equal to the originator). - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: data skb to forward * @eth_dst: next hop mac address of skb * @eth_src: source mac address of skb @@ -1359,7 +1359,7 @@ batadv_nc_skb_src_search(struct batadv_priv *bat_priv, /** * batadv_nc_skb_store_before_coding() - set the ethernet src and dst of the * unicast skb before it is stored for use in later decoding - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: data skb to store * @eth_dst_new: new destination mac address of skb */ @@ -1408,7 +1408,7 @@ static bool batadv_nc_skb_dst_search(struct sk_buff *skb, struct batadv_neigh_node *neigh_node, struct ethhdr *ethhdr) { - struct net_device *netdev = neigh_node->if_incoming->soft_iface; + struct net_device *netdev = neigh_node->if_incoming->mesh_iface; struct batadv_priv *bat_priv = netdev_priv(netdev); struct batadv_orig_node *orig_node = neigh_node->orig_node; struct batadv_nc_node *nc_node; @@ -1495,7 +1495,7 @@ static bool batadv_nc_skb_add_to_path(struct sk_buff *skb, bool batadv_nc_skb_forward(struct sk_buff *skb, struct batadv_neigh_node *neigh_node) { - const struct net_device *netdev = neigh_node->if_incoming->soft_iface; + const struct net_device *netdev = neigh_node->if_incoming->mesh_iface; struct batadv_priv *bat_priv = netdev_priv(netdev); struct batadv_unicast_packet *packet; struct batadv_nc_path *nc_path; @@ -1544,7 +1544,7 @@ out: /** * batadv_nc_skb_store_for_decoding() - save a clone of the skb which can be * used when decoding coded packets - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: data skb to store */ void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, @@ -1605,7 +1605,7 @@ out: /** * batadv_nc_skb_store_sniffed_unicast() - check if a received unicast packet * should be saved in the decoding buffer and, if so, store it there - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: unicast skb to store */ void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, @@ -1625,7 +1625,7 @@ void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, /** * batadv_nc_skb_decode_packet() - decode given skb using the decode data stored * in nc_packet - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: unicast skb to decode * @nc_packet: decode data needed to decode the skb * @@ -1719,7 +1719,7 @@ batadv_nc_skb_decode_packet(struct 
batadv_priv *bat_priv, struct sk_buff *skb, /** * batadv_nc_find_decoding_packet() - search through buffered decoding data to * find the data needed to decode the coded packet - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @ethhdr: pointer to the ethernet header inside the coded packet * @coded: coded packet we try to find decode data for * @@ -1793,7 +1793,7 @@ batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv, static int batadv_nc_recv_coded_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { - struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface); struct batadv_unicast_packet *unicast_packet; struct batadv_coded_packet *coded_packet; struct batadv_nc_packet *nc_packet; @@ -1858,7 +1858,7 @@ free_skb: /** * batadv_nc_mesh_free() - clean up network coding memory - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ void batadv_nc_mesh_free(struct batadv_priv *bat_priv) { diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 8f6dd2c6ee41..d9cfc5c6b208 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -9,6 +9,7 @@ #include <linux/atomic.h> #include <linux/container_of.h> +#include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> @@ -26,9 +27,7 @@ #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/workqueue.h> -#include <net/sock.h> #include <uapi/linux/batadv_packet.h> -#include <uapi/linux/batman_adv.h> #include "bat_algo.h" #include "distributed-arp-table.h" @@ -41,7 +40,6 @@ #include "netlink.h" #include "network-coding.h" #include "routing.h" -#include "soft-interface.h" #include "translation-table.h" /* hash class keys */ @@ -49,7 +47,7 @@ static struct lock_class_key batadv_orig_hash_lock_class_key; /** * batadv_orig_hash_find() - Find and return originator from orig_hash - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @data: mac address of the originator * * Return: orig_node (with increased refcnt), NULL on errors @@ -215,7 +213,7 @@ void batadv_orig_node_vlan_release(struct kref *ref) /** * batadv_originator_init() - Initialize all originator structures - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success or negative error number in case of failure */ @@ -340,7 +338,7 @@ batadv_orig_router_get(struct batadv_orig_node *orig_node, /** * batadv_orig_to_router() - get next hop neighbor to an orig address - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_addr: the originator MAC address to search the best next hop router for * @if_outgoing: the interface where the payload packet has been received or * the OGM should be sent to @@ -569,7 +567,7 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface, const u8 *neigh_addr, struct batadv_orig_node *orig_node) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface); struct batadv_hardif_neigh_node *hardif_neigh; spin_lock_bh(&hard_iface->neigh_list_lock); @@ -755,65 +753,49 @@ 
batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node, */ int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb) { - struct net *net = sock_net(cb->skb->sk); - struct net_device *soft_iface; - struct net_device *hard_iface = NULL; - struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT; + struct batadv_hard_iface *primary_if, *hard_iface; + struct net_device *mesh_iface; struct batadv_priv *bat_priv; - struct batadv_hard_iface *primary_if = NULL; int ret; - int ifindex, hard_ifindex; - ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); - if (!ifindex) - return -EINVAL; + mesh_iface = batadv_netlink_get_meshif(cb); + if (IS_ERR(mesh_iface)) + return PTR_ERR(mesh_iface); - soft_iface = dev_get_by_index(net, ifindex); - if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { - ret = -ENODEV; - goto out; - } - - bat_priv = netdev_priv(soft_iface); + bat_priv = netdev_priv(mesh_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; - goto out; + goto out_put_mesh_iface; } - hard_ifindex = batadv_netlink_get_ifindex(cb->nlh, - BATADV_ATTR_HARD_IFINDEX); - if (hard_ifindex) { - hard_iface = dev_get_by_index(net, hard_ifindex); - if (hard_iface) - hardif = batadv_hardif_get_by_netdev(hard_iface); - - if (!hardif) { - ret = -ENODEV; - goto out; - } - - if (hardif->soft_iface != soft_iface) { - ret = -ENOENT; - goto out; - } + hard_iface = batadv_netlink_get_hardif(bat_priv, cb); + if (IS_ERR(hard_iface) && PTR_ERR(hard_iface) != -ENONET) { + ret = PTR_ERR(hard_iface); + goto out_put_primary_if; + } else if (IS_ERR(hard_iface)) { + /* => PTR_ERR(hard_iface) == -ENONET + * => no hard-iface given, ok + */ + hard_iface = BATADV_IF_DEFAULT; } if (!bat_priv->algo_ops->neigh.dump) { ret = -EOPNOTSUPP; - goto out; + goto out_put_hard_iface; } - bat_priv->algo_ops->neigh.dump(msg, cb, bat_priv, hardif); + bat_priv->algo_ops->neigh.dump(msg, cb, bat_priv, hard_iface); ret = msg->len; - out: - batadv_hardif_put(hardif); - dev_put(hard_iface); +out_put_hard_iface: + batadv_hardif_put(hard_iface); +out_put_primary_if: batadv_hardif_put(primary_if); - dev_put(soft_iface); +out_put_mesh_iface: + dev_put(mesh_iface); return ret; } @@ -910,7 +892,7 @@ void batadv_orig_node_release(struct kref *ref) /** * batadv_originator_free() - Free all originator structures - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ void batadv_originator_free(struct batadv_priv *bat_priv) { @@ -946,7 +928,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv) /** * batadv_orig_node_new() - creates a new orig_node - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @addr: the mac address of the originator * * Creates a new originator object and initialises all the generic fields. 
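
The dump-callback rewrite above leans on the kernel's error-pointer idiom: batadv_netlink_get_hardif() encodes an errno into the returned pointer, and -ENONET is reserved as a sentinel meaning "no BATADV_ATTR_HARD_IFINDEX attribute was supplied", which callers translate into BATADV_IF_DEFAULT instead of failing the dump. A minimal standalone sketch of that pattern, with ERR_PTR/PTR_ERR/IS_ERR re-created for illustration and get_hardif() as a hypothetical stand-in (not the batman-adv implementation):

	/* Userspace sketch of the kernel's error-pointer convention.
	 * ERR_PTR()/PTR_ERR()/IS_ERR() are reimplemented here so the
	 * example is self-contained; assumes a Linux <errno.h> (ENONET).
	 */
	#include <errno.h>
	#include <stdio.h>

	#define MAX_ERRNO	4095

	static inline void *ERR_PTR(long error) { return (void *)error; }
	static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
	static inline int IS_ERR(const void *ptr)
	{
		/* errnos live in the top MAX_ERRNO values of the address space */
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}

	struct hard_iface { const char *name; };

	/* Hypothetical lookup: ifindex 0 models "attribute not present". */
	static struct hard_iface *get_hardif(int ifindex)
	{
		static struct hard_iface eth0 = { "eth0" };

		if (!ifindex)
			return ERR_PTR(-ENONET);	/* optional attr missing */
		if (ifindex != 1)
			return ERR_PTR(-ENODEV);	/* genuine lookup failure */
		return &eth0;
	}

	int main(void)
	{
		struct hard_iface *hif = get_hardif(0);

		if (IS_ERR(hif) && PTR_ERR(hif) != -ENONET)
			return -PTR_ERR(hif);	/* real error: abort the dump */
		if (IS_ERR(hif))
			hif = NULL;		/* no hardif given: use default */

		printf("hardif: %s\n", hif ? hif->name : "(default)");
		return 0;
	}

The point of the sentinel is that one return value carries both the optional-attribute case and genuine failures such as -ENODEV, which is what lets batadv_hardif_neigh_dump() and batadv_orig_dump() above replace the old two-step ifindex/dev_get_by_index dance with a single call and a linear chain of unwind labels.
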
@@ -1027,7 +1009,7 @@ free_orig_node: /** * batadv_purge_neigh_ifinfo() - purge obsolete ifinfo entries from neighbor - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @neigh: orig node which is to be checked */ static void @@ -1068,7 +1050,7 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv, /** * batadv_purge_orig_ifinfo() - purge obsolete ifinfo entries from originator - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which is to be checked * * Return: true if any ifinfo entry was purged, false otherwise. @@ -1120,7 +1102,7 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv, /** * batadv_purge_orig_neighbors() - purges neighbors from originator - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which is to be checked * * Return: true if any neighbor was purged, false otherwise @@ -1178,7 +1160,7 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, /** * batadv_find_best_neighbor() - finds the best neighbor after purging - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which is to be checked * @if_outgoing: the interface for which the metric should be compared * @@ -1212,7 +1194,7 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv, /** * batadv_purge_orig_node() - purges obsolete information from an orig_node - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which is to be checked * * This function checks if the orig_node or substructures of it have become @@ -1254,7 +1236,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, if (hard_iface->if_status != BATADV_IF_ACTIVE) continue; - if (hard_iface->soft_iface != bat_priv->soft_iface) + if (hard_iface->mesh_iface != bat_priv->mesh_iface) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) @@ -1276,7 +1258,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, /** * batadv_purge_orig_ref() - Purge all outdated originators - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ void batadv_purge_orig_ref(struct batadv_priv *bat_priv) { @@ -1342,65 +1324,49 @@ static void batadv_purge_orig(struct work_struct *work) */ int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb) { - struct net *net = sock_net(cb->skb->sk); - struct net_device *soft_iface; - struct net_device *hard_iface = NULL; - struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT; + struct batadv_hard_iface *primary_if, *hard_iface; + struct net_device *mesh_iface; struct batadv_priv *bat_priv; - struct batadv_hard_iface *primary_if = NULL; int ret; - int ifindex, hard_ifindex; - ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); - if (!ifindex) - return -EINVAL; + mesh_iface = batadv_netlink_get_meshif(cb); + if (IS_ERR(mesh_iface)) + return PTR_ERR(mesh_iface); - soft_iface = dev_get_by_index(net, ifindex); - if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { - ret = -ENODEV; - goto out; - } - - bat_priv = netdev_priv(soft_iface); + bat_priv = 
netdev_priv(mesh_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; - goto out; + goto out_put_mesh_iface; } - hard_ifindex = batadv_netlink_get_ifindex(cb->nlh, - BATADV_ATTR_HARD_IFINDEX); - if (hard_ifindex) { - hard_iface = dev_get_by_index(net, hard_ifindex); - if (hard_iface) - hardif = batadv_hardif_get_by_netdev(hard_iface); - - if (!hardif) { - ret = -ENODEV; - goto out; - } - - if (hardif->soft_iface != soft_iface) { - ret = -ENOENT; - goto out; - } + hard_iface = batadv_netlink_get_hardif(bat_priv, cb); + if (IS_ERR(hard_iface) && PTR_ERR(hard_iface) != -ENONET) { + ret = PTR_ERR(hard_iface); + goto out_put_primary_if; + } else if (IS_ERR(hard_iface)) { + /* => PTR_ERR(hard_iface) == -ENONET + * => no hard-iface given, ok + */ + hard_iface = BATADV_IF_DEFAULT; } if (!bat_priv->algo_ops->orig.dump) { ret = -EOPNOTSUPP; - goto out; + goto out_put_hard_iface; } - bat_priv->algo_ops->orig.dump(msg, cb, bat_priv, hardif); + bat_priv->algo_ops->orig.dump(msg, cb, bat_priv, hard_iface); ret = msg->len; - out: - batadv_hardif_put(hardif); - dev_put(hard_iface); +out_put_hard_iface: + batadv_hardif_put(hard_iface); +out_put_primary_if: batadv_hardif_put(primary_if); - dev_put(soft_iface); +out_put_mesh_iface: + dev_put(mesh_iface); return ret; } diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index f1061985149f..35d8c5783999 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -30,10 +30,10 @@ #include "fragmentation.h" #include "hard-interface.h" #include "log.h" +#include "mesh-interface.h" #include "network-coding.h" #include "originator.h" #include "send.h" -#include "soft-interface.h" #include "tp_meter.h" #include "translation-table.h" #include "tvlv.h" @@ -43,7 +43,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb, /** * _batadv_update_route() - set the router for this originator - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which is to be configured * @recv_if: the receive interface for which this route is set * @neigh_node: neighbor which should be the next router @@ -106,7 +106,7 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, /** * batadv_update_route() - set the router for this originator - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which is to be configured * @recv_if: the receive interface for which this route is set * @neigh_node: neighbor which should be the next router @@ -133,7 +133,7 @@ out: /** * batadv_window_protected() - checks whether the host restarted and is in the * protection time. 
- * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @seq_num_diff: difference between the current/received sequence number and * the last sequence number * @seq_old_max_diff: maximum age of sequence number not considered as restart @@ -207,7 +207,7 @@ bool batadv_check_management_packet(struct sk_buff *skb, /** * batadv_recv_my_icmp_packet() - receive an icmp packet locally - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: icmp packet to process * * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP @@ -338,7 +338,7 @@ out: int batadv_recv_icmp_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { - struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface); struct batadv_icmp_header *icmph; struct batadv_icmp_packet_rr *icmp_packet_rr; struct ethhdr *ethhdr; @@ -428,7 +428,7 @@ free_skb: /** * batadv_check_unicast_packet() - Check for malformed unicast packets - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: packet to check * @hdr_size: size of header to pull * @@ -511,7 +511,7 @@ batadv_last_bonding_replace(struct batadv_orig_node *orig_node, /** * batadv_find_router() - find a suitable router for this originator - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: the destination node * @recv_if: pointer to interface this packet was received on * @@ -656,7 +656,7 @@ next: static int batadv_route_unicast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { - struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface); struct batadv_orig_node *orig_node = NULL; struct batadv_unicast_packet *unicast_packet; struct ethhdr *ethhdr = eth_hdr(skb); @@ -727,7 +727,7 @@ free_skb: /** * batadv_reroute_unicast_packet() - update the unicast header for re-routing - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: unicast packet to process * @unicast_packet: the unicast header to be updated * @dst_addr: the payload destination @@ -879,7 +879,7 @@ static bool batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, return false; /* update the header in order to let the packet be delivered to this - * node's soft interface + * node's mesh interface */ primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) @@ -909,7 +909,7 @@ int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { struct batadv_unicast_packet *unicast_packet; - struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface); int check, hdr_size = sizeof(*unicast_packet); check = batadv_check_unicast_packet(bat_priv, skb, hdr_size); @@ -938,7 +938,7 @@ free_skb: int batadv_recv_unicast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { - struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface); struct batadv_unicast_packet *unicast_packet; struct batadv_unicast_4addr_packet 
*unicast_4addr_packet; u8 *orig_addr, *orig_addr_gw; @@ -1017,7 +1017,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, batadv_dat_snoop_incoming_dhcp_ack(bat_priv, skb, hdr_size); - batadv_interface_rx(recv_if->soft_iface, skb, hdr_size, + batadv_interface_rx(recv_if->mesh_iface, skb, hdr_size, orig_node); rx_success: @@ -1047,7 +1047,7 @@ free_skb: int batadv_recv_unicast_tvlv(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { - struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface); struct batadv_unicast_tvlv_packet *unicast_tvlv_packet; unsigned char *tvlv_buff; u16 tvlv_buff_len; @@ -1103,7 +1103,7 @@ free_skb: int batadv_recv_frag_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { - struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface); struct batadv_orig_node *orig_node_src = NULL; struct batadv_frag_packet *frag_packet; int ret = NET_RX_DROP; @@ -1165,7 +1165,7 @@ free_skb: int batadv_recv_bcast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { - struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface); struct batadv_orig_node *orig_node = NULL; struct batadv_bcast_packet *bcast_packet; struct ethhdr *ethhdr; @@ -1255,7 +1255,7 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, batadv_dat_snoop_incoming_dhcp_ack(bat_priv, skb, hdr_size); /* broadcast for me */ - batadv_interface_rx(recv_if->soft_iface, skb, hdr_size, orig_node); + batadv_interface_rx(recv_if->mesh_iface, skb, hdr_size, orig_node); rx_success: ret = NET_RX_SUCCESS; @@ -1279,14 +1279,14 @@ out: * * Parses the given, received batman-adv multicast packet. Depending on the * contents of its TVLV forwards it and/or decapsulates it to hand it to the - * soft interface. + * mesh interface. * * Return: NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise. 
*/ int batadv_recv_mcast_packet(struct sk_buff *skb, struct batadv_hard_iface *recv_if) { - struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); + struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface); struct batadv_mcast_packet *mcast_packet; int hdr_size = sizeof(*mcast_packet); unsigned char *tvlv_buff; @@ -1329,7 +1329,7 @@ int batadv_recv_mcast_packet(struct sk_buff *skb, batadv_add_counter(bat_priv, BATADV_CNT_MCAST_RX_LOCAL_BYTES, skb->len - hdr_size); - batadv_interface_rx(bat_priv->soft_iface, skb, hdr_size, NULL); + batadv_interface_rx(bat_priv->mesh_iface, skb, hdr_size, NULL); /* skb was consumed */ skb = NULL; } diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 0379b126865d..9d72f4f15b3d 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -34,10 +34,10 @@ #include "gateway_client.h" #include "hard-interface.h" #include "log.h" +#include "mesh-interface.h" #include "network-coding.h" #include "originator.h" #include "routing.h" -#include "soft-interface.h" #include "translation-table.h" static void batadv_send_outstanding_bcast_packet(struct work_struct *work); @@ -68,7 +68,7 @@ int batadv_send_skb_packet(struct sk_buff *skb, struct ethhdr *ethhdr; int ret; - bat_priv = netdev_priv(hard_iface->soft_iface); + bat_priv = netdev_priv(hard_iface->mesh_iface); if (hard_iface->if_status != BATADV_IF_ACTIVE) goto send_skb_err; @@ -124,7 +124,9 @@ send_skb_err: int batadv_send_broadcast_skb(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { - return batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr); + static const u8 broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; + + return batadv_send_skb_packet(skb, hard_iface, broadcast_addr); } /** @@ -272,7 +274,7 @@ static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb, /** * batadv_send_skb_prepare_unicast_4addr() - encapsulate an skb with a * unicast 4addr header - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the skb containing the payload to encapsulate * @orig: the destination node * @packet_subtype: the unicast 4addr packet subtype to use @@ -314,7 +316,7 @@ out: /** * batadv_send_skb_unicast() - encapsulate and send an skb via unicast - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: payload to send * @packet_type: the batman unicast packet type to use * @packet_subtype: the unicast 4addr packet subtype (only relevant for unicast @@ -384,7 +386,7 @@ out: /** * batadv_send_skb_via_tt_generic() - send an skb via TT lookup - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: payload to send * @packet_type: the batman unicast packet type to use * @packet_subtype: the unicast 4addr packet subtype (only relevant for unicast @@ -430,7 +432,7 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, /** * batadv_send_skb_via_gw() - send an skb via gateway lookup - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: payload to send * @vid: the vid to be used to search the translation table * @@ -532,7 +534,7 @@ batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming, forw_packet->queue_left = queue_left; forw_packet->if_incoming = if_incoming; forw_packet->if_outgoing = 
if_outgoing; - forw_packet->num_packets = 0; + forw_packet->num_packets = 1; return forw_packet; @@ -695,7 +697,7 @@ static void batadv_forw_packet_queue(struct batadv_forw_packet *forw_packet, /** * batadv_forw_packet_bcast_queue() - try to queue a broadcast packet - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @forw_packet: the forwarding packet to queue * @send_time: timestamp (jiffies) when the packet is to be sent * @@ -714,7 +716,7 @@ batadv_forw_packet_bcast_queue(struct batadv_priv *bat_priv, /** * batadv_forw_packet_ogmv1_queue() - try to queue an OGMv1 packet - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @forw_packet: the forwarding packet to queue * @send_time: timestamp (jiffies) when the packet is to be sent * @@ -732,7 +734,7 @@ void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv, /** * batadv_forw_bcast_packet_to_list() - queue broadcast packet for transmissions - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet @@ -787,7 +789,7 @@ err: /** * batadv_forw_bcast_packet_if() - forward and queue a broadcast packet - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet @@ -838,7 +840,7 @@ static int batadv_forw_bcast_packet_if(struct batadv_priv *bat_priv, /** * batadv_send_no_broadcast() - check whether (re)broadcast is necessary - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: broadcast packet to check * @own_packet: true if it is a self-generated broadcast packet * @if_out: the outgoing interface checked and considered for (re)broadcast @@ -900,7 +902,7 @@ static bool batadv_send_no_broadcast(struct batadv_priv *bat_priv, /** * __batadv_forw_bcast_packet() - forward and queue a broadcast packet - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet @@ -930,7 +932,7 @@ static int __batadv_forw_bcast_packet(struct batadv_priv *bat_priv, rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if (hard_iface->soft_iface != bat_priv->soft_iface) + if (hard_iface->mesh_iface != bat_priv->mesh_iface) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) @@ -958,7 +960,7 @@ static int __batadv_forw_bcast_packet(struct batadv_priv *bat_priv, /** * batadv_forw_bcast_packet() - forward and queue a broadcast packet - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet @@ -979,7 +981,7 @@ int batadv_forw_bcast_packet(struct batadv_priv *bat_priv, /** * 
batadv_send_bcast_packet() - send and queue a broadcast packet - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet @@ -1060,7 +1062,7 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) delayed_work = to_delayed_work(work); forw_packet = container_of(delayed_work, struct batadv_forw_packet, delayed_work); - bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); + bat_priv = netdev_priv(forw_packet->if_incoming->mesh_iface); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) { dropped = true; @@ -1095,7 +1097,7 @@ out: /** * batadv_purge_outstanding_packets() - stop/purge scheduled bcast/OGMv1 packets - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @hard_iface: the hard interface to cancel and purge bcast/ogm packets on * * This method cancels and purges any broadcast and OGMv1 packet on the given diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 08af251b765c..3415afec4a0c 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -68,7 +68,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, /** * batadv_send_skb_via_tt() - send an skb via TT lookup - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the payload to send * @dst_hint: can be used to override the destination contained in the skb * @vid: the vid to be used to search the translation table @@ -89,7 +89,7 @@ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv, /** * batadv_send_skb_via_tt_4addr() - send an skb via TT lookup - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the payload to send * @packet_subtype: the unicast 4addr packet subtype to use * @dst_hint: can be used to override the destination contained in the skb diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 7f3dd3c393e0..adbadb436033 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -206,7 +206,7 @@ static void batadv_tp_update_rto(struct batadv_tp_vars *tp_vars, * batadv_tp_batctl_notify() - send client status result to client * @reason: reason for tp meter session stop * @dst: destination of tp_meter session - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @start_time: start of transmission in jiffies * @total_sent: bytes acked to the receiver * @cookie: cookie of tp_meter session @@ -238,7 +238,7 @@ static void batadv_tp_batctl_notify(enum batadv_tp_meter_reason reason, * batadv_tp_batctl_error_notify() - send client error result to client * @reason: reason for tp meter session stop * @dst: destination of tp_meter session - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @cookie: cookie of tp_meter session */ static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason, @@ -251,7 +251,7 @@ static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason, /** * batadv_tp_list_find() - find a tp_vars 
object in the global list - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @dst: the other endpoint MAC address to look for * * Look for a tp_vars object matching dst as end_point and return it after @@ -287,7 +287,7 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv, /** * batadv_tp_list_find_session() - find tp_vars session object in the global * list - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @dst: the other endpoint MAC address to look for * @session: session identifier * @@ -366,7 +366,7 @@ static void batadv_tp_vars_put(struct batadv_tp_vars *tp_vars) /** * batadv_tp_sender_cleanup() - cleanup sender data and drop and timer - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @tp_vars: the private data of the current TP meter session to cleanup */ static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv, @@ -384,19 +384,19 @@ static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv, atomic_dec(&tp_vars->bat_priv->tp_num); /* kill the timer and remove its reference */ - del_timer_sync(&tp_vars->timer); + timer_delete_sync(&tp_vars->timer); /* the worker might have rearmed itself therefore we kill it again. Note * that if the worker should run again before invoking the following - * del_timer(), it would not re-arm itself once again because the status + * timer_delete(), it would not re-arm itself once again because the status * is OFF now */ - del_timer(&tp_vars->timer); + timer_delete(&tp_vars->timer); batadv_tp_vars_put(tp_vars); } /** * batadv_tp_sender_end() - print info about ended session and inform client - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @tp_vars: the private data of the current TP meter session */ static void batadv_tp_sender_end(struct batadv_priv *bat_priv, @@ -619,7 +619,7 @@ static int batadv_tp_send_msg(struct batadv_tp_vars *tp_vars, const u8 *src, /** * batadv_tp_recv_ack() - ACK receiving function - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the buffer containing the received packet * * Process a received TP ACK packet @@ -832,7 +832,7 @@ static int batadv_tp_send(void *arg) } /* assume that all the hard_interfaces have a correctly - * configured MTU, so use the soft_iface MTU as MSS. + * configured MTU, so use the mesh_iface MTU as MSS. * This might not be true and in that case the fragmentation * should be used. * Now, try to send the packet as it is @@ -927,7 +927,7 @@ static void batadv_tp_start_kthread(struct batadv_tp_vars *tp_vars) /** * batadv_tp_start() - start a new tp meter session - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @dst: the receiver MAC address * @test_length: test length in milliseconds * @cookie: session cookie @@ -993,7 +993,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, /* initialise the CWND to 3*MSS (Section 3.1 in RFC5681). 
* For batman-adv the MSS is the size of the payload received by the - * soft_interface, hence its MTU + * mesh_interface, hence its MTU */ tp_vars->cwnd = BATADV_TP_PLEN * 3; /* at the beginning initialise the SS threshold to the biggest possible @@ -1052,7 +1052,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, /** * batadv_tp_stop() - stop currently running tp meter session - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @dst: the receiver MAC address * @return_value: reason for tp meter session stop */ @@ -1141,7 +1141,7 @@ static void batadv_tp_receiver_shutdown(struct timer_list *t) /** * batadv_tp_send_ack() - send an ACK packet - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @dst: the mac address of the destination originator * @seq: the sequence number to ACK * @timestamp: the timestamp to echo back in the ACK @@ -1320,7 +1320,7 @@ static void batadv_tp_ack_unordered(struct batadv_tp_vars *tp_vars) /** * batadv_tp_init_recv() - return matching or create new receiver tp_vars - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @icmp: received icmp tp msg * * Return: corresponding tp_vars or NULL on errors @@ -1373,7 +1373,7 @@ out_unlock: /** * batadv_tp_recv_msg() - process a single data message - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the buffer containing the received packet * * Process a received TP MSG packet @@ -1457,7 +1457,7 @@ out: /** * batadv_tp_meter_recv() - main TP Meter receiving function - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @skb: the buffer containing the received packet */ void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb) diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index 6b816cf1a953..7da692ec38e9 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -34,7 +34,7 @@ TRACE_EVENT(batadv_dbg, TP_ARGS(bat_priv, vaf), TP_STRUCT__entry( - __string(device, bat_priv->soft_iface->name) + __string(device, bat_priv->mesh_iface->name) __string(driver, KBUILD_MODNAME) __vstring(msg, vaf->fmt, vaf->va) ), diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 760d51fdbdf6..8d0e04e770cb 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -14,7 +14,8 @@ #include <linux/cache.h> #include <linux/compiler.h> #include <linux/container_of.h> -#include <linux/crc32c.h> +#include <linux/crc32.h> +#include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> @@ -39,7 +40,6 @@ #include <linux/workqueue.h> #include <net/genetlink.h> #include <net/netlink.h> -#include <net/sock.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> @@ -47,9 +47,9 @@ #include "hard-interface.h" #include "hash.h" #include "log.h" +#include "mesh-interface.h" #include "netlink.h" #include "originator.h" -#include "soft-interface.h" #include "tvlv.h" static struct kmem_cache *batadv_tl_cache __read_mostly; @@ -161,7 +161,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr, /** * batadv_tt_local_hash_find() - search the 
local table for a given client - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @addr: the mac address of the client to look for * @vid: VLAN identifier * @@ -186,7 +186,7 @@ batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr, /** * batadv_tt_global_hash_find() - search the global table for a given client - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @addr: the mac address of the client to look for * @vid: VLAN identifier * @@ -221,7 +221,7 @@ static void batadv_tt_local_entry_release(struct kref *ref) tt_local_entry = container_of(ref, struct batadv_tt_local_entry, common.refcount); - batadv_softif_vlan_put(tt_local_entry->vlan); + batadv_meshif_vlan_put(tt_local_entry->vlan); kfree_rcu(tt_local_entry, common.rcu); } @@ -260,7 +260,7 @@ void batadv_tt_global_entry_release(struct kref *ref) /** * batadv_tt_global_hash_count() - count the number of orig entries - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @addr: the mac address of the client to count entries for * @vid: VLAN identifier * @@ -286,28 +286,28 @@ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, /** * batadv_tt_local_size_mod() - change the size by v of the local table * identified by vid - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @vid: the VLAN identifier of the sub-table to change * @v: the amount to sum to the local table size */ static void batadv_tt_local_size_mod(struct batadv_priv *bat_priv, unsigned short vid, int v) { - struct batadv_softif_vlan *vlan; + struct batadv_meshif_vlan *vlan; - vlan = batadv_softif_vlan_get(bat_priv, vid); + vlan = batadv_meshif_vlan_get(bat_priv, vid); if (!vlan) return; atomic_add(v, &vlan->tt.num_entries); - batadv_softif_vlan_put(vlan); + batadv_meshif_vlan_put(vlan); } /** * batadv_tt_local_size_inc() - increase by one the local table size for the * given vid - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @vid: the VLAN identifier */ static void batadv_tt_local_size_inc(struct batadv_priv *bat_priv, @@ -319,7 +319,7 @@ static void batadv_tt_local_size_inc(struct batadv_priv *bat_priv, /** * batadv_tt_local_size_dec() - decrease by one the local table size for the * given vid - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @vid: the VLAN identifier */ static void batadv_tt_local_size_dec(struct batadv_priv *bat_priv, @@ -412,7 +412,7 @@ batadv_tt_orig_list_entry_put(struct batadv_tt_orig_list_entry *orig_entry) /** * batadv_tt_local_event() - store a local TT event (ADD/DEL) - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @tt_local_entry: the TT entry involved in the event * @event_flags: flags to store in the event structure */ @@ -423,8 +423,8 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv, struct batadv_tt_change_node *tt_change_node, *entry, *safe; struct batadv_tt_common_entry *common = &tt_local_entry->common; u8 flags = common->flags | event_flags; - bool event_removed = false; bool del_op_requested, 
del_op_entry; + size_t changes; tt_change_node = kmem_cache_alloc(batadv_tt_change_cache, GFP_ATOMIC); if (!tt_change_node) @@ -438,51 +438,45 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv, del_op_requested = flags & BATADV_TT_CLIENT_DEL; - /* check for ADD+DEL or DEL+ADD events */ + /* check for ADD+DEL, DEL+ADD, ADD+ADD or DEL+DEL events */ spin_lock_bh(&bat_priv->tt.changes_list_lock); + changes = READ_ONCE(bat_priv->tt.local_changes); list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list, list) { if (!batadv_compare_eth(entry->change.addr, common->addr)) continue; - /* DEL+ADD in the same orig interval have no effect and can be - * removed to avoid silly behaviour on the receiver side. The - * other way around (ADD+DEL) can happen in case of roaming of - * a client still in the NEW state. Roaming of NEW clients is - * now possible due to automatically recognition of "temporary" - * clients - */ del_op_entry = entry->change.flags & BATADV_TT_CLIENT_DEL; - if (!del_op_requested && del_op_entry) - goto del; - if (del_op_requested && !del_op_entry) - goto del; - - /* this is a second add in the same originator interval. It - * means that flags have been changed: update them! - */ - if (!del_op_requested && !del_op_entry) + if (del_op_requested != del_op_entry) { + /* DEL+ADD in the same orig interval have no effect and + * can be removed to avoid silly behaviour on the + * receiver side. The other way around (ADD+DEL) can + * happen in case of roaming of a client still in the + * NEW state. Roaming of NEW clients is now possible due + * to automatically recognition of "temporary" clients + */ + list_del(&entry->list); + kmem_cache_free(batadv_tt_change_cache, entry); + changes--; + } else { + /* this is a second add or del in the same originator + * interval. It could mean that flags have been changed + * (e.g. double add): update them + */ entry->change.flags = flags; + } - continue; -del: - list_del(&entry->list); - kmem_cache_free(batadv_tt_change_cache, entry); kmem_cache_free(batadv_tt_change_cache, tt_change_node); - event_removed = true; - goto unlock; + goto update_changes; } /* track the change in the OGMinterval list */ list_add_tail(&tt_change_node->list, &bat_priv->tt.changes_list); + changes++; -unlock: +update_changes: + WRITE_ONCE(bat_priv->tt.local_changes, changes); spin_unlock_bh(&bat_priv->tt.changes_list_lock); - - if (event_removed) - atomic_dec(&bat_priv->tt.local_changes); - else - atomic_inc(&bat_priv->tt.local_changes); } /** @@ -510,7 +504,7 @@ static u16 batadv_tt_entries(u16 tt_len) /** * batadv_tt_local_table_transmit_size() - calculates the local translation * table size when transmitted over the air - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Return: local translation table size in bytes. 
*/ @@ -518,11 +512,11 @@ static int batadv_tt_local_table_transmit_size(struct batadv_priv *bat_priv) { u16 num_vlan = 0; u16 tt_local_entries = 0; - struct batadv_softif_vlan *vlan; + struct batadv_meshif_vlan *vlan; int hdr_size; rcu_read_lock(); - hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { + hlist_for_each_entry_rcu(vlan, &bat_priv->meshif_vlan_list, list) { num_vlan++; tt_local_entries += atomic_read(&vlan->tt.num_entries); } @@ -582,7 +576,7 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv, /** * batadv_tt_local_add() - add a new client to the local table or update an * existing client - * @soft_iface: netdev struct of the mesh interface + * @mesh_iface: netdev struct of the mesh interface * @addr: the mac address of the client to add * @vid: VLAN identifier * @ifindex: index of the interface where the client is connected to (useful to @@ -592,14 +586,14 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv, * * Return: true if the client was successfully added, false otherwise. */ -bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, +bool batadv_tt_local_add(struct net_device *mesh_iface, const u8 *addr, unsigned short vid, int ifindex, u32 mark) { - struct batadv_priv *bat_priv = netdev_priv(soft_iface); + struct batadv_priv *bat_priv = netdev_priv(mesh_iface); struct batadv_tt_local_entry *tt_local; struct batadv_tt_global_entry *tt_global = NULL; - struct net *net = dev_net(soft_iface); - struct batadv_softif_vlan *vlan; + struct net *net = dev_net(mesh_iface); + struct batadv_meshif_vlan *vlan; struct net_device *in_dev = NULL; struct batadv_hard_iface *in_hardif = NULL; struct hlist_head *head; @@ -656,7 +650,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, table_size += batadv_tt_len(1); packet_size_max = atomic_read(&bat_priv->packet_size_max); if (table_size > packet_size_max) { - net_ratelimited_function(batadv_info, soft_iface, + net_ratelimited_function(batadv_info, mesh_iface, "Local translation table size (%i) exceeds maximum packet size (%i); Ignoring new local tt entry: %pM\n", table_size, packet_size_max, addr); goto out; @@ -667,9 +661,9 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, goto out; /* increase the refcounter of the related vlan */ - vlan = batadv_softif_vlan_get(bat_priv, vid); + vlan = batadv_meshif_vlan_get(bat_priv, vid); if (!vlan) { - net_ratelimited_function(batadv_info, soft_iface, + net_ratelimited_function(batadv_info, mesh_iface, "adding TT local entry %pM to non-existent VLAN %d\n", addr, batadv_print_vid(vid)); kmem_cache_free(batadv_tl_cache, tt_local); @@ -699,7 +693,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, /* the batman interface mac and multicast addresses should never be * purged */ - if (batadv_compare_eth(addr, soft_iface->dev_addr) || + if (batadv_compare_eth(addr, mesh_iface->dev_addr) || is_multicast_ether_addr(addr)) tt_local->common.flags |= BATADV_TT_CLIENT_NOPURGE; @@ -855,7 +849,7 @@ out: /** * batadv_tt_prepare_tvlv_local_data() - allocate and prepare the TT TVLV for * this node - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @tt_data: uninitialised pointer to the address of the TVLV buffer * @tt_change: uninitialised pointer to the address of the area where the TT * changes can be stored @@ -877,7 +871,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, s32 
*tt_len) { struct batadv_tvlv_tt_vlan_data *tt_vlan; - struct batadv_softif_vlan *vlan; + struct batadv_meshif_vlan *vlan; u16 num_vlan = 0; u16 vlan_entries = 0; u16 total_entries = 0; @@ -885,8 +879,8 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, u8 *tt_change_ptr; int change_offset; - spin_lock_bh(&bat_priv->softif_vlan_list_lock); - hlist_for_each_entry(vlan, &bat_priv->softif_vlan_list, list) { + spin_lock_bh(&bat_priv->meshif_vlan_list_lock); + hlist_for_each_entry(vlan, &bat_priv->meshif_vlan_list, list) { vlan_entries = atomic_read(&vlan->tt.num_entries); if (vlan_entries < 1) continue; @@ -915,7 +909,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, (*tt_data)->num_vlan = htons(num_vlan); tt_vlan = (*tt_data)->vlan_data; - hlist_for_each_entry(vlan, &bat_priv->softif_vlan_list, list) { + hlist_for_each_entry(vlan, &bat_priv->meshif_vlan_list, list) { vlan_entries = atomic_read(&vlan->tt.num_entries); if (vlan_entries < 1) continue; @@ -931,14 +925,14 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr; out: - spin_unlock_bh(&bat_priv->softif_vlan_list_lock); + spin_unlock_bh(&bat_priv->meshif_vlan_list_lock); return tvlv_len; } /** * batadv_tt_tvlv_container_update() - update the translation table tvlv * container after local tt changes have been committed - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) { @@ -952,7 +946,7 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) size_t tt_extra_len = 0; u16 tvlv_len; - tt_diff_entries_num = atomic_read(&bat_priv->tt.local_changes); + tt_diff_entries_num = READ_ONCE(bat_priv->tt.local_changes); tt_diff_len = batadv_tt_len(tt_diff_entries_num); /* if we have too many changes for one packet don't send any @@ -962,7 +956,7 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) * The local change history should still be cleaned up so the next * TT round can start again with a clean state. 
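batadv_tt_prepare_tvlv_local_data() above lays the container out as a fixed TT header, one descriptor per VLAN that actually carries entries, and the change records appended behind them. A rough size model of that layout; the struct definitions below are simplified stand-ins rather than the real on-wire batadv structures:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* simplified stand-ins for the on-wire TT TVLV pieces */
struct tt_data_hdr   { uint8_t flags; uint8_t ttvn; uint16_t num_vlan; };
struct tt_vlan_desc  { uint32_t crc; uint16_t vid; uint16_t reserved; };
struct tt_change_rec { uint8_t flags; uint8_t reserved[3];
		       uint8_t addr[6]; uint16_t vid; };

/* header + per-VLAN descriptors + appended change records */
static size_t tt_tvlv_len(size_t num_vlan, size_t num_changes)
{
	return sizeof(struct tt_data_hdr) +
	       num_vlan * sizeof(struct tt_vlan_desc) +
	       num_changes * sizeof(struct tt_change_rec);
}

int main(void)
{
	/* e.g. two VLANs announcing five changed clients */
	printf("%zu bytes\n", tt_tvlv_len(2, 5));
	return 0;
}

The real function also skips VLANs without entries while counting, as the hunks above show.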
*/ - if (tt_diff_len > bat_priv->soft_iface->mtu) { + if (tt_diff_len > bat_priv->mesh_iface->mtu) { tt_diff_len = 0; tt_diff_entries_num = 0; drop_changes = true; @@ -979,7 +973,7 @@ goto container_register; spin_lock_bh(&bat_priv->tt.changes_list_lock); - atomic_set(&bat_priv->tt.local_changes, 0); + WRITE_ONCE(bat_priv->tt.local_changes, 0); list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list, list) { @@ -1031,7 +1025,7 @@ container_register: * @msg: Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @common: tt local & tt global common data * * Return: Error code, or 0 on success @@ -1043,7 +1037,7 @@ batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, struct batadv_tt_common_entry *common) { void *hdr; - struct batadv_softif_vlan *vlan; + struct batadv_meshif_vlan *vlan; struct batadv_tt_local_entry *local; unsigned int last_seen_msecs; u32 crc; @@ -1051,13 +1045,13 @@ batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, local = container_of(common, struct batadv_tt_local_entry, common); last_seen_msecs = jiffies_to_msecs(jiffies - local->last_seen); - vlan = batadv_softif_vlan_get(bat_priv, common->vid); + vlan = batadv_meshif_vlan_get(bat_priv, common->vid); if (!vlan) return 0; crc = vlan->tt.crc; - batadv_softif_vlan_put(vlan); + batadv_meshif_vlan_put(vlan); hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, @@ -1090,7 +1084,7 @@ batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, * @msg: Netlink message to dump into * @portid: Port making netlink request * @cb: Control block containing additional options - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @hash: hash to dump * @bucket: bucket index to dump * @idx_s: Number of entries to skip @@ -1136,28 +1130,20 @@ */ int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb) { - struct net *net = sock_net(cb->skb->sk); - struct net_device *soft_iface; + struct net_device *mesh_iface; struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; struct batadv_hashtable *hash; int ret; - int ifindex; int bucket = cb->args[0]; int idx = cb->args[1]; int portid = NETLINK_CB(cb->skb).portid; - ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); - if (!ifindex) - return -EINVAL; - - soft_iface = dev_get_by_index(net, ifindex); - if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { - ret = -ENODEV; - goto out; - } + mesh_iface = batadv_netlink_get_meshif(cb); + if (IS_ERR(mesh_iface)) + return PTR_ERR(mesh_iface); - bat_priv = netdev_priv(soft_iface); + bat_priv = netdev_priv(mesh_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { @@ -1179,7 +1165,7 @@ int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb) out: batadv_hardif_put(primary_if); - dev_put(soft_iface); + dev_put(mesh_iface); cb->args[0] = bucket; cb->args[1] = idx; @@ -1208,7 +1194,7 @@ batadv_tt_local_set_pending(struct batadv_priv *bat_priv, /** * batadv_tt_local_remove() - logically remove an entry from the local table -
@bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @addr: the MAC address of the client to remove * @vid: VLAN identifier * @message: message to append to the log on deletion @@ -1273,7 +1259,7 @@ out: /** * batadv_tt_local_purge_list() - purge inactive tt local entries - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @head: pointer to the list containing the local tt entries * @timeout: parameter deciding whether a given tt local entry is considered * inactive or not @@ -1308,7 +1294,7 @@ static void batadv_tt_local_purge_list(struct batadv_priv *bat_priv, /** * batadv_tt_local_purge() - purge inactive tt local entries - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @timeout: parameter deciding whether a given tt local entry is considered * inactive or not */ @@ -1395,7 +1381,7 @@ static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv) kmem_cache_free(batadv_tt_change_cache, entry); } - atomic_set(&bat_priv->tt.local_changes, 0); + WRITE_ONCE(bat_priv->tt.local_changes, 0); spin_unlock_bh(&bat_priv->tt.changes_list_lock); } @@ -1543,7 +1529,7 @@ out: /** * batadv_tt_global_add() - add a new TT global entry or update an existing one - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: the originator announcing the client * @tt_addr: the mac address of the non-mesh client * @vid: VLAN identifier @@ -1716,7 +1702,7 @@ out: /** * batadv_transtable_best_orig() - Get best originator list entry from tt entry - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @tt_global_entry: global translation table entry to be analyzed * * This function assumes the caller holds rcu_read_lock(). 
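The purge paths above declare an entry stale once its last_seen timestamp is older than the given timeout on the jiffies clock. A standalone model of that staleness test, built on the same wrap-safe signed subtraction as the kernel's time_after() macro (the type and function names here are stand-ins):

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long jiffies_t;

/* same trick as the kernel's time_after(): wrap-safe via signed delta */
static bool time_is_after(jiffies_t a, jiffies_t b)
{
	return (long)(b - a) < 0;	/* true if a is later than b */
}

static bool has_timed_out(jiffies_t now, jiffies_t last_seen,
			  jiffies_t timeout)
{
	return time_is_after(now, last_seen + timeout);
}

int main(void)
{
	printf("%d\n", has_timed_out(1000, 100, 500));	/* 1: stale, purge */
	printf("%d\n", has_timed_out(400, 100, 500));	/* 0: still fresh */
	return 0;
}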
@@ -1823,7 +1809,7 @@ batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @common: tt local & tt global common data * @sub_s: Number of entries to skip * @@ -1868,7 +1854,7 @@ batadv_tt_global_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, * @msg: Netlink message to dump into * @portid: Port making netlink request * @seq: Sequence number of netlink message - * @bat_priv: The bat priv with all the soft interface information + * @bat_priv: The bat priv with all the mesh interface information * @head: Pointer to the list containing the global tt entries * @idx_s: Number of entries to skip * @sub: Number of entries to skip @@ -1911,30 +1897,22 @@ batadv_tt_global_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq, */ int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb) { - struct net *net = sock_net(cb->skb->sk); - struct net_device *soft_iface; + struct net_device *mesh_iface; struct batadv_priv *bat_priv; struct batadv_hard_iface *primary_if = NULL; struct batadv_hashtable *hash; struct hlist_head *head; int ret; - int ifindex; int bucket = cb->args[0]; int idx = cb->args[1]; int sub = cb->args[2]; int portid = NETLINK_CB(cb->skb).portid; - ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); - if (!ifindex) - return -EINVAL; + mesh_iface = batadv_netlink_get_meshif(cb); + if (IS_ERR(mesh_iface)) + return PTR_ERR(mesh_iface); - soft_iface = dev_get_by_index(net, ifindex); - if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { - ret = -ENODEV; - goto out; - } - - bat_priv = netdev_priv(soft_iface); + bat_priv = netdev_priv(mesh_iface); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { @@ -1959,7 +1937,7 @@ int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb) out: batadv_hardif_put(primary_if); - dev_put(soft_iface); + dev_put(mesh_iface); cb->args[0] = bucket; cb->args[1] = idx; @@ -2012,7 +1990,7 @@ batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry) /** * batadv_tt_global_del_orig_node() - remove orig_node from a global tt entry - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @tt_global_entry: the global entry to remove the orig_node from * @orig_node: the originator announcing the client * @message: message to append to the log on deletion @@ -2091,7 +2069,7 @@ batadv_tt_global_del_roaming(struct batadv_priv *bat_priv, /** * batadv_tt_global_del() - remove a client from the global table - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: an originator serving this client * @addr: the mac address of the client * @vid: VLAN identifier @@ -2156,7 +2134,7 @@ out: /** * batadv_tt_global_del_orig() - remove all the TT global entries belonging to * the given originator matching the provided vid - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: the originator owning the entries to remove * @match_vid: the VLAN identifier to match. 
If negative all the entries will be * removed @@ -2327,7 +2305,7 @@ _batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry, /** * batadv_transtable_search() - get the mesh destination for a given client - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @src: mac address of the source client * @addr: mac address of the destination client * @vid: VLAN identifier @@ -2386,7 +2364,7 @@ out: /** * batadv_tt_global_crc() - calculates the checksum of the local table belonging * to the given orig_node - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: originator for which the CRC should be computed * @vid: VLAN identifier for which the CRC32 has to be computed * @@ -2480,7 +2458,7 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, /** * batadv_tt_local_crc() - calculates the checksum of the local table - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @vid: VLAN identifier for which the CRC32 has to be computed * * For details about the computation, please refer to the documentation for @@ -2615,7 +2593,7 @@ static void batadv_tt_req_purge(struct batadv_priv *bat_priv) /** * batadv_tt_req_node_new() - search and possibly create a tt_req_node object - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node this request is being issued for * * Return: the pointer to the new tt_req_node struct if no request @@ -2711,7 +2689,7 @@ static bool batadv_tt_global_valid(const void *entry_ptr, /** * batadv_tt_tvlv_generate() - fill the tvlv buff with the tt entries from the * specified tt hash - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @hash: hash table containing the tt entries * @tt_len: expected tvlv tt data buffer length in number of bytes * @tvlv_buff: pointer to the buffer to fill with the TT data @@ -2832,15 +2810,15 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, /** * batadv_tt_local_update_crc() - update all the local CRCs - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ static void batadv_tt_local_update_crc(struct batadv_priv *bat_priv) { - struct batadv_softif_vlan *vlan; + struct batadv_meshif_vlan *vlan; /* recompute the global CRC for each VLAN */ rcu_read_lock(); - hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) { + hlist_for_each_entry_rcu(vlan, &bat_priv->meshif_vlan_list, list) { vlan->tt.crc = batadv_tt_local_crc(bat_priv, vlan->vid); } rcu_read_unlock(); @@ -2848,7 +2826,7 @@ static void batadv_tt_local_update_crc(struct batadv_priv *bat_priv) /** * batadv_tt_global_update_crc() - update all the global CRCs for this orig_node - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: the orig_node for which the CRCs have to be updated */ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv, @@ -2875,7 +2853,7 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv, /** * batadv_send_tt_request() - send a TT Request message to a given node - 
* @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @dst_orig_node: the destination of the message * @ttvn: the version number that the source of the message is looking for * @tt_vlan: pointer to the first tvlv VLAN object to request @@ -2960,7 +2938,7 @@ out: /** * batadv_send_other_tt_response() - send reply to tt request concerning another * node's translation table - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @tt_data: tt data containing the tt request information * @req_src: mac address of tt request sender * @req_dst: mac address of tt request recipient @@ -3051,7 +3029,7 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, /* Don't send the response, if larger than fragmented packet. */ tt_len = sizeof(struct batadv_unicast_tvlv_packet) + tvlv_len; if (tt_len > atomic_read(&bat_priv->packet_size_max)) { - net_ratelimited_function(batadv_info, bat_priv->soft_iface, + net_ratelimited_function(batadv_info, bat_priv->mesh_iface, "Ignoring TT_REQUEST from %pM; Response size exceeds max packet size.\n", res_dst_orig_node->orig); goto out; @@ -3090,7 +3068,7 @@ out: /** * batadv_send_my_tt_response() - send reply to tt request concerning this * node's translation table - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @tt_data: tt data containing the tt request information * @req_src: mac address of tt request sender * @@ -3207,7 +3185,7 @@ out: /** * batadv_send_tt_response() - send reply to tt request - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @tt_data: tt data containing the tt request information * @req_src: mac address of tt request sender * @req_dst: mac address of tt request recipient @@ -3302,7 +3280,7 @@ static void batadv_tt_update_changes(struct batadv_priv *bat_priv, /** * batadv_is_my_client() - check if a client is served by the local node - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @addr: the mac address of the client to check * @vid: VLAN identifier * @@ -3331,7 +3309,7 @@ out: /** * batadv_handle_tt_response() - process incoming tt reply - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @tt_data: tt data containing the tt request information * @resp_src: mac address of tt reply sender * @num_entries: number of tt change entries appended to the tt data @@ -3419,7 +3397,7 @@ static void batadv_tt_roam_purge(struct batadv_priv *bat_priv) /** * batadv_tt_check_roam_count() - check if a client has roamed too frequently - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @client: mac address of the roaming client * * This function checks whether the client already reached the @@ -3474,7 +3452,7 @@ unlock: /** * batadv_send_roam_adv() - send a roaming advertisement message - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @client: mac address of the roaming client * @vid: VLAN identifier * @orig_node: message destination @@ -3538,8 +3516,8 @@ static void 
batadv_tt_purge(struct work_struct *work) } /** - * batadv_tt_free() - Free translation table of soft interface - * @bat_priv: the bat priv with all the soft interface information + * batadv_tt_free() - Free translation table of mesh interface + * @bat_priv: the bat priv with all the mesh interface information */ void batadv_tt_free(struct batadv_priv *bat_priv) { @@ -3562,7 +3540,7 @@ void batadv_tt_free(struct batadv_priv *bat_priv) /** * batadv_tt_local_set_flags() - set or unset the specified flags on the local * table and possibly count them in the TT size - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @flags: the flag to switch * @enable: whether to set or unset the flag * @count: whether to increase the TT size by the number of changed entries @@ -3648,7 +3626,7 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) /** * batadv_tt_local_commit_changes_nolock() - commit all pending local tt changes * which have been queued in the time since the last commit - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Caller must hold tt->commit_lock. */ @@ -3656,7 +3634,7 @@ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv) { lockdep_assert_held(&bat_priv->tt.commit_lock); - if (atomic_read(&bat_priv->tt.local_changes) < 1) { + if (READ_ONCE(bat_priv->tt.local_changes) == 0) { if (!batadv_atomic_dec_not_zero(&bat_priv->tt.ogm_append_cnt)) batadv_tt_tvlv_container_update(bat_priv); return; @@ -3681,7 +3659,7 @@ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv) /** * batadv_tt_local_commit_changes() - commit all pending local tt changes which * have been queued in the time since the last commit - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information */ void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv) { @@ -3692,7 +3670,7 @@ void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv) /** * batadv_is_ap_isolated() - Check if packet from upper layer should be dropped - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @src: source mac address of packet * @dst: destination mac address of packet * @vid: vlan id of packet @@ -3704,10 +3682,10 @@ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, { struct batadv_tt_local_entry *tt_local_entry; struct batadv_tt_global_entry *tt_global_entry; - struct batadv_softif_vlan *vlan; + struct batadv_meshif_vlan *vlan; bool ret = false; - vlan = batadv_softif_vlan_get(bat_priv, vid); + vlan = batadv_meshif_vlan_get(bat_priv, vid); if (!vlan) return false; @@ -3729,14 +3707,14 @@ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, local_entry_put: batadv_tt_local_entry_put(tt_local_entry); vlan_put: - batadv_softif_vlan_put(vlan); + batadv_meshif_vlan_put(vlan); return ret; } /** * batadv_tt_update_orig() - update global translation table with new tt * information received via ogms - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: the orig_node of the ogm * @tt_buff: pointer to the first tvlv VLAN entry * @tt_num_vlan: number of tvlv VLAN entries @@ -3820,7 +3798,7 @@ 
request_table: /** * batadv_tt_global_client_is_roaming() - check if a client is marked as roaming - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @addr: the mac address of the client to check * @vid: VLAN identifier * @@ -3846,7 +3824,7 @@ out: /** * batadv_tt_local_client_is_roaming() - tells whether the client is roaming - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @addr: the mac address of the local client to query * @vid: VLAN identifier * @@ -3872,7 +3850,7 @@ out: /** * batadv_tt_add_temporary_global_entry() - Add temporary entry to global TT - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig_node: orig node which the temporary entry should be associated with * @addr: mac address of the client * @vid: VLAN id of the new temporary global translation table * @@ -3905,14 +3883,14 @@ bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, /** * batadv_tt_local_resize_to_mtu() - resize the local translation table to fit * the maximum packet size that can be transported through the mesh - * @soft_iface: netdev struct of the mesh interface + * @mesh_iface: netdev struct of the mesh interface * * Remove entries older than 'timeout', halving the timeout as long as more * entries need to be removed. */ -void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface) +void batadv_tt_local_resize_to_mtu(struct net_device *mesh_iface) { - struct batadv_priv *bat_priv = netdev_priv(soft_iface); + struct batadv_priv *bat_priv = netdev_priv(mesh_iface); int packet_size_max = atomic_read(&bat_priv->packet_size_max); int table_size, timeout = BATADV_TT_LOCAL_TIMEOUT / 2; bool reduced = false; @@ -3929,7 +3907,7 @@ void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface) timeout /= 2; reduced = true; - net_ratelimited_function(batadv_info, soft_iface, + net_ratelimited_function(batadv_info, mesh_iface, "Forced to purge local tt entries to fit new maximum fragment MTU (%i)\n", packet_size_max); } @@ -3945,7 +3923,7 @@ /** * batadv_tt_tvlv_ogm_handler_v1() - process incoming tt tvlv container - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the tt data @@ -3959,23 +3937,21 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_change *tt_change; struct batadv_tvlv_tt_data *tt_data; u16 num_entries, num_vlan; - size_t flex_size; + size_t tt_data_sz; if (tvlv_value_len < sizeof(*tt_data)) return; tt_data = tvlv_value; - tvlv_value_len -= sizeof(*tt_data); - num_vlan = ntohs(tt_data->num_vlan); - flex_size = flex_array_size(tt_data, vlan_data, num_vlan); - if (tvlv_value_len < flex_size) + tt_data_sz = struct_size(tt_data, vlan_data, num_vlan); + if (tvlv_value_len < tt_data_sz) return; tt_change = (struct batadv_tvlv_tt_change *)((void *)tt_data - + flex_size); - tvlv_value_len -= flex_size; + + tt_data_sz); + tvlv_value_len -= tt_data_sz; num_entries = batadv_tt_entries(tvlv_value_len); @@ -3986,7 +3962,7 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, /** *
batadv_tt_tvlv_unicast_handler_v1() - process incoming (unicast) tt tvlv * container - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @src: mac address of tt tvlv sender * @dst: mac address of tt tvlv recipient * @tvlv_value: tvlv buffer containing the tt data @@ -4068,7 +4044,7 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, /** * batadv_roam_tvlv_unicast_handler_v1() - process incoming tt roam tvlv * container - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @src: mac address of tt tvlv sender * @dst: mac address of tt tvlv recipient * @tvlv_value: tvlv buffer containing the tt data @@ -4117,7 +4093,7 @@ out: /** * batadv_tt_init() - initialise the translation table internals - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success or negative error number in case of failure. */ @@ -4155,7 +4131,7 @@ int batadv_tt_init(struct batadv_priv *bat_priv) /** * batadv_tt_global_is_isolated() - check if a client is marked as isolated - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @addr: the mac address of the client * @vid: the identifier of the VLAN where this client is connected * diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index d18740d9a22b..618d9dbca5ea 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -16,7 +16,7 @@ #include <linux/types.h> int batadv_tt_init(struct batadv_priv *bat_priv); -bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, +bool batadv_tt_local_add(struct net_device *mesh_iface, const u8 *addr, unsigned short vid, int ifindex, u32 mark); u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid, @@ -45,7 +45,7 @@ bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid); bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid); -void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface); +void batadv_tt_local_resize_to_mtu(struct net_device *mesh_iface); bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 2a583215d439..76dff1f9c559 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -59,7 +59,7 @@ static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler) /** * batadv_tvlv_handler_get() - retrieve tvlv handler from the tvlv handler list * based on the provided type and version (both need to match) - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @type: tvlv handler type to look for * @version: tvlv handler version to look for * @@ -118,7 +118,7 @@ static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv) /** * batadv_tvlv_container_get() - retrieve tvlv container from the tvlv container * list based on the provided type and version (both need to match) - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh 
interface information * @type: tvlv container type to look for * @version: tvlv container version to look for * @@ -152,7 +152,7 @@ batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) /** * batadv_tvlv_container_list_size() - calculate the size of the tvlv container * list entries - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). @@ -177,7 +177,7 @@ static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) /** * batadv_tvlv_container_remove() - remove tvlv container from the tvlv * container list - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @tvlv: the tvlv container to be removed * * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). @@ -201,7 +201,7 @@ static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv, /** * batadv_tvlv_container_unregister() - unregister tvlv container based on the * provided type and version (both need to match) - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @type: tvlv container type to unregister * @version: tvlv container version to unregister */ @@ -219,7 +219,7 @@ void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, /** * batadv_tvlv_container_register() - register tvlv type, version and content * to be propagated with each (primary interface) OGM - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @type: tvlv container type * @version: tvlv container version * @tvlv_value: tvlv container content @@ -297,7 +297,7 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, /** * batadv_tvlv_container_ogm_append() - append tvlv container content to given * OGM packet buffer - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @packet_buff: ogm packet buffer * @packet_buff_len: ogm packet buffer size including ogm header and tvlv * content @@ -350,7 +350,7 @@ end: /** * batadv_tvlv_call_handler() - parse the given tvlv buffer to call the * appropriate handlers - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @tvlv_handler: tvlv callback function handling the tvlv content * @packet_type: indicates for which packet type the TVLV handler is called * @orig_node: orig node emitting the ogm packet @@ -421,7 +421,7 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, /** * batadv_tvlv_containers_process() - parse the given tvlv buffer to call the * appropriate handlers - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @packet_type: indicates for which packet type the TVLV handler is called * @orig_node: orig node emitting the ogm packet * @skb: the skb the TVLV handler is called for @@ -490,7 +490,7 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, /** * batadv_tvlv_ogm_receive() - process an incoming ogm and call the appropriate * handlers - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the
mesh interface information * @batadv_ogm_packet: ogm packet containing the tvlv containers * @orig_node: orig node emitting the ogm packet */ @@ -518,7 +518,7 @@ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, * batadv_tvlv_handler_register() - register tvlv handler based on the provided * type and version (both need to match) for ogm tvlv payload and/or unicast * payload - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @optr: ogm tvlv handler callback function. This function receives the orig * node, flags and the tvlv content as argument to process. * @uptr: unicast tvlv handler callback function. This function receives the @@ -583,7 +583,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, /** * batadv_tvlv_handler_unregister() - unregister tvlv handler based on the * provided type and version (both need to match) - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @type: tvlv handler type to be unregistered * @version: tvlv handler version to be unregistered */ @@ -606,7 +606,7 @@ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, /** * batadv_tvlv_unicast_send() - send a unicast packet with tvlv payload to the * specified host - * @bat_priv: the bat priv with all the soft interface information + * @bat_priv: the bat priv with all the mesh interface information * @src: source mac address of the unicast packet * @dst: destination mac address of the unicast packet * @type: tvlv type diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 04f6398b3a40..0ca0fc072fc9 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -186,6 +186,9 @@ struct batadv_hard_iface { /** @net_dev: pointer to the net_device */ struct net_device *net_dev; + /** @dev_tracker: device tracker for @net_dev */ + netdevice_tracker dev_tracker; + /** @refcount: number of contexts the object is used */ struct kref refcount; @@ -196,10 +199,13 @@ struct batadv_hard_iface { struct packet_type batman_adv_ptype; /** - * @soft_iface: the batman-adv interface which uses this network + * @mesh_iface: the batman-adv interface which uses this network * interface */ - struct net_device *soft_iface; + struct net_device *mesh_iface; + + /** @meshif_dev_tracker: device tracker for @mesh_iface */ + netdevice_tracker meshif_dev_tracker; /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; @@ -487,7 +493,7 @@ struct batadv_orig_node { /** @hash_entry: hlist node for &batadv_priv.orig_hash */ struct hlist_node hash_entry; - /** @bat_priv: pointer to soft_iface this orig node belongs to */ + /** @bat_priv: pointer to mesh_iface this orig node belongs to */ struct batadv_priv *bat_priv; /** @bcast_seqno_lock: lock protecting bcast_bits & last_bcast_seqno */ @@ -596,9 +602,6 @@ struct batadv_hardif_neigh_node_bat_v { * neighbor */ unsigned long last_unicast_tx; - - /** @metric_work: work queue callback item for metric update */ - struct work_struct metric_work; }; /** @@ -902,13 +905,13 @@ enum batadv_counters { /** * @BATADV_CNT_MCAST_RX_LOCAL: counter for received batman-adv multicast - * packets which were forwarded to the local soft interface + * packets which were forwarded to the local mesh interface */ BATADV_CNT_MCAST_RX_LOCAL, /** * @BATADV_CNT_MCAST_RX_LOCAL_BYTES: bytes counter for received - * batman-adv multicast packets which were forwarded to the local soft + * 
batman-adv multicast packets which were forwarded to the local mesh * interface */ BATADV_CNT_MCAST_RX_LOCAL_BYTES, @@ -1022,7 +1025,7 @@ struct batadv_priv_tt { atomic_t ogm_append_cnt; /** @local_changes: changes registered in an originator interval */ - atomic_t local_changes; + size_t local_changes; /** * @changes_list: tracks tt local changes within an originator interval @@ -1044,7 +1047,7 @@ struct batadv_priv_tt { */ struct list_head roam_list; - /** @changes_list_lock: lock protecting changes_list */ + /** @changes_list_lock: lock protecting changes_list & local_changes */ spinlock_t changes_list_lock; /** @req_list_lock: lock protecting req_list */ @@ -1140,29 +1143,6 @@ struct batadv_priv_bla { }; #endif -#ifdef CONFIG_BATMAN_ADV_DEBUG - -/** - * struct batadv_priv_debug_log - debug logging data - */ -struct batadv_priv_debug_log { - /** @log_buff: buffer holding the logs (ring buffer) */ - char log_buff[BATADV_LOG_BUF_LEN]; - - /** @log_start: index of next character to read */ - unsigned long log_start; - - /** @log_end: index of next character to write */ - unsigned long log_end; - - /** @lock: lock protecting log_buff, log_start & log_end */ - spinlock_t lock; - - /** @queue_wait: log reader's wait queue */ - wait_queue_head_t queue_wait; -}; -#endif - /** * struct batadv_priv_gw - per mesh interface gateway data */ @@ -1267,7 +1247,7 @@ struct batadv_mcast_mla_flags { /** @enabled: whether the multicast tvlv is currently enabled */ unsigned char enabled:1; - /** @bridged: whether the soft interface has a bridge on top */ + /** @bridged: whether the mesh interface has a bridge on top */ unsigned char bridged:1; /** @tvlv_flags: the flags we have last sent in our mcast tvlv */ @@ -1403,7 +1383,7 @@ struct batadv_priv_nc { /** * @decoding_hash: Hash table used to buffer skbs that might be needed * to decode a received coded skb. The buffer is used for 1) skbs - * arriving on the soft-interface; 2) skbs overheard on the + * arriving on the mesh-interface; 2) skbs overheard on the * hard-interface; and 3) skbs forwarded by batman-adv. */ struct batadv_hashtable *decoding_hash; @@ -1556,9 +1536,9 @@ struct batadv_tp_vars { }; /** - * struct batadv_softif_vlan - per VLAN attributes set + * struct batadv_meshif_vlan - per VLAN attributes set */ -struct batadv_softif_vlan { +struct batadv_meshif_vlan { /** @bat_priv: pointer to the mesh object */ struct batadv_priv *bat_priv; @@ -1571,7 +1551,7 @@ struct batadv_softif_vlan { /** @tt: TT private attributes (VLAN specific) */ struct batadv_vlan_tt tt; - /** @list: list node for &bat_priv.softif_vlan_list */ + /** @list: list node for &bat_priv.meshif_vlan_list */ struct hlist_node list; /** @@ -1584,7 +1564,7 @@ struct batadv_softif_vlan { }; /** - * struct batadv_priv_bat_v - B.A.T.M.A.N. V per soft-interface private data + * struct batadv_priv_bat_v - B.A.T.M.A.N. 
V per mesh-interface private data */ struct batadv_priv_bat_v { /** @ogm_buff: buffer holding the OGM packet */ @@ -1613,8 +1593,8 @@ struct batadv_priv { */ atomic_t mesh_state; - /** @soft_iface: net device which holds this struct as private data */ - struct net_device *soft_iface; + /** @mesh_iface: net device which holds this struct as private data */ + struct net_device *mesh_iface; /** * @mtu_set_by_user: MTU was set once by user @@ -1763,24 +1743,19 @@ struct batadv_priv { struct batadv_algo_ops *algo_ops; /** - * @softif_vlan_list: a list of softif_vlan structs, one per VLAN + * @meshif_vlan_list: a list of meshif_vlan structs, one per VLAN * created on top of the mesh interface represented by this object */ - struct hlist_head softif_vlan_list; + struct hlist_head meshif_vlan_list; - /** @softif_vlan_list_lock: lock protecting softif_vlan_list */ - spinlock_t softif_vlan_list_lock; + /** @meshif_vlan_list_lock: lock protecting meshif_vlan_list */ + spinlock_t meshif_vlan_list_lock; #ifdef CONFIG_BATMAN_ADV_BLA /** @bla: bridge loop avoidance data */ struct batadv_priv_bla bla; #endif -#ifdef CONFIG_BATMAN_ADV_DEBUG - /** @debug_log: holding debug logging relevant data */ - struct batadv_priv_debug_log *debug_log; -#endif - /** @gw: gateway data */ struct batadv_priv_gw gw; @@ -1811,7 +1786,7 @@ struct batadv_priv { #endif /* CONFIG_BATMAN_ADV_NC */ #ifdef CONFIG_BATMAN_ADV_BATMAN_V - /** @bat_v: B.A.T.M.A.N. V per soft-interface private data */ + /** @bat_v: B.A.T.M.A.N. V per mesh-interface private data */ struct batadv_priv_bat_v bat_v; #endif }; @@ -1834,7 +1809,7 @@ struct batadv_bla_backbone_gw { /** @hash_entry: hlist node for &batadv_priv_bla.backbone_hash */ struct hlist_node hash_entry; - /** @bat_priv: pointer to soft_iface this backbone gateway belongs to */ + /** @bat_priv: pointer to mesh_iface this backbone gateway belongs to */ struct batadv_priv *bat_priv; /** @lasttime: last time we heard of this backbone gw */ @@ -1939,8 +1914,8 @@ struct batadv_tt_local_entry { /** @last_seen: timestamp used for purging stale tt local entries */ unsigned long last_seen; - /** @vlan: soft-interface vlan of the entry */ - struct batadv_softif_vlan *vlan; + /** @vlan: mesh-interface vlan of the entry */ + struct batadv_meshif_vlan *vlan; }; /** @@ -2164,7 +2139,7 @@ struct batadv_forw_packet { u16 packet_len; /** @direct_link_flags: direct link flags for aggregated OGM packets */ - u32 direct_link_flags; + DECLARE_BITMAP(direct_link_flags, BATADV_MAX_AGGREGATION_PACKETS); /** @num_packets: counter for aggregated OGMv1 packets */ u8 num_packets; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 50cfec8ccac4..f0c862091bff 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -13,6 +13,7 @@ #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> +#include <net/netdev_lock.h> #include <net/pkt_sched.h> #include <net/bluetooth/bluetooth.h> @@ -443,7 +444,7 @@ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, memset(&msg, 0, sizeof(msg)); iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iv, 1, skb->len); - err = l2cap_chan_send(chan, &msg, skb->len); + err = l2cap_chan_send(chan, &msg, skb->len, NULL); if (err > 0) { netdev->stats.tx_bytes += err; netdev->stats.tx_packets++; @@ -825,11 +826,16 @@ static struct sk_buff *chan_alloc_skb_cb(struct l2cap_chan *chan, unsigned long hdr_len, unsigned long len, int nb) { + struct sk_buff *skb; + /* Note that we must allocate using GFP_ATOMIC here as * this function is called 
originally from netdev hard xmit * function in atomic context. */ - return bt_skb_alloc(hdr_len + len, GFP_ATOMIC); + skb = bt_skb_alloc(hdr_len + len, GFP_ATOMIC); + if (!skb) + return ERR_PTR(-ENOMEM); + return skb; } static void chan_suspend_cb(struct l2cap_chan *chan) diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index 5a3835b7dfcd..a7eede7616d8 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -14,7 +14,8 @@ bluetooth_6lowpan-y := 6lowpan.o bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o lib.o \ - ecdh_helper.o mgmt_util.o mgmt_config.o hci_codec.o eir.o hci_sync.o + ecdh_helper.o mgmt_util.o mgmt_config.o hci_codec.o eir.o hci_sync.o \ + hci_drv.o bluetooth-$(CONFIG_DEV_COREDUMP) += coredump.o diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 0b4d0a8bd361..6ad2f72f53f4 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -34,6 +34,9 @@ #include <net/bluetooth/bluetooth.h> #include <linux/proc_fs.h> +#include <linux/ethtool.h> +#include <linux/sockios.h> + #include "leds.h" #include "selftest.h" @@ -563,6 +566,86 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock, } EXPORT_SYMBOL(bt_sock_poll); +static int bt_ethtool_get_ts_info(struct sock *sk, unsigned int index, + void __user *useraddr) +{ + struct ethtool_ts_info info; + struct kernel_ethtool_ts_info ts_info = {}; + int ret; + + ret = hci_ethtool_ts_info(index, sk->sk_protocol, &ts_info); + if (ret == -ENODEV) + return ret; + else if (ret < 0) + return -EIO; + + memset(&info, 0, sizeof(info)); + + info.cmd = ETHTOOL_GET_TS_INFO; + info.so_timestamping = ts_info.so_timestamping; + info.phc_index = ts_info.phc_index; + info.tx_types = ts_info.tx_types; + info.rx_filters = ts_info.rx_filters; + + if (copy_to_user(useraddr, &info, sizeof(info))) + return -EFAULT; + + return 0; +} + +static int bt_ethtool(struct sock *sk, const struct ifreq *ifr, + void __user *useraddr) +{ + unsigned int index; + u32 ethcmd; + int n; + + if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd))) + return -EFAULT; + + if (sscanf(ifr->ifr_name, "hci%u%n", &index, &n) != 1 || + n != strlen(ifr->ifr_name)) + return -ENODEV; + + switch (ethcmd) { + case ETHTOOL_GET_TS_INFO: + return bt_ethtool_get_ts_info(sk, index, useraddr); + } + + return -EOPNOTSUPP; +} + +static int bt_dev_ioctl(struct socket *sock, unsigned int cmd, void __user *arg) +{ + struct sock *sk = sock->sk; + struct ifreq ifr = {}; + void __user *data; + char *colon; + int ret = -ENOIOCTLCMD; + + if (get_user_ifreq(&ifr, &data, arg)) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ - 1] = 0; + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + + switch (cmd) { + case SIOCETHTOOL: + ret = bt_ethtool(sk, &ifr, data); + break; + } + + if (colon) + *colon = ':'; + + if (put_user_ifreq(&ifr, arg)) + return -EFAULT; + + return ret; +} + int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; @@ -595,6 +678,10 @@ int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) err = put_user(amount, (int __user *)arg); break; + case SIOCETHTOOL: + err = bt_dev_ioctl(sock, cmd, (void __user *)arg); + break; + default: err = -ENOIOCTLCMD; break; diff --git a/net/bluetooth/coredump.c b/net/bluetooth/coredump.c index c18df3a08607..819eacb38762 100644 --- a/net/bluetooth/coredump.c +++ b/net/bluetooth/coredump.c @@ -240,6 +240,26 @@ static void
hci_devcd_handle_pkt_pattern(struct hci_dev *hdev, bt_dev_dbg(hdev, "Failed to set pattern"); } +static void hci_devcd_dump(struct hci_dev *hdev) +{ + struct sk_buff *skb; + u32 size; + + bt_dev_dbg(hdev, "state %d", hdev->dump.state); + + size = hdev->dump.tail - hdev->dump.head; + + /* Emit a devcoredump with the available data */ + dev_coredumpv(&hdev->dev, hdev->dump.head, size, GFP_KERNEL); + + /* Send a copy to monitor as a diagnostic packet */ + skb = bt_skb_alloc(size, GFP_ATOMIC); + if (skb) { + skb_put_data(skb, hdev->dump.head, size); + hci_recv_diag(hdev, skb); + } +} + static void hci_devcd_handle_pkt_complete(struct hci_dev *hdev, struct sk_buff *skb) { @@ -256,7 +276,7 @@ static void hci_devcd_handle_pkt_complete(struct hci_dev *hdev, bt_dev_dbg(hdev, "complete with size %u (expect %zu)", dump_size, hdev->dump.alloc_size); - dev_coredumpv(&hdev->dev, hdev->dump.head, dump_size, GFP_KERNEL); + hci_devcd_dump(hdev); } static void hci_devcd_handle_pkt_abort(struct hci_dev *hdev, @@ -275,8 +295,7 @@ static void hci_devcd_handle_pkt_abort(struct hci_dev *hdev, bt_dev_dbg(hdev, "aborted with size %u (expect %zu)", dump_size, hdev->dump.alloc_size); - /* Emit a devcoredump with the available data */ - dev_coredumpv(&hdev->dev, hdev->dump.head, dump_size, GFP_KERNEL); + hci_devcd_dump(hdev); } /* Bluetooth devcoredump state machine. @@ -391,8 +410,7 @@ void hci_devcd_timeout(struct work_struct *work) bt_dev_dbg(hdev, "timeout with size %u (expect %zu)", dump_size, hdev->dump.alloc_size); - /* Emit a devcoredump with the available data */ - dev_coredumpv(&hdev->dev, hdev->dump.head, dump_size, GFP_KERNEL); + hci_devcd_dump(hdev); hci_devcd_reset(hdev); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index d097e308a755..99efeed6a766 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -27,6 +27,7 @@ #include <linux/export.h> #include <linux/debugfs.h> +#include <linux/errqueue.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> @@ -784,7 +785,7 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c d->sync_handle = conn->sync_handle; if (test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) { - hci_conn_hash_list_flag(hdev, find_bis, ISO_LINK, + hci_conn_hash_list_flag(hdev, find_bis, BIS_LINK, HCI_CONN_PA_SYNC, d); if (!d->count) @@ -794,7 +795,7 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c } if (test_and_clear_bit(HCI_CONN_BIG_SYNC, &conn->flags)) { - hci_conn_hash_list_flag(hdev, find_bis, ISO_LINK, + hci_conn_hash_list_flag(hdev, find_bis, BIS_LINK, HCI_CONN_BIG_SYNC, d); if (!d->count) @@ -884,9 +885,11 @@ static void cis_cleanup(struct hci_conn *conn) /* Check if ISO connection is a CIS and remove CIG if there are * no other connections using it. 
*/ - hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_BOUND, &d); - hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_CONNECT, &d); - hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_CONNECTED, &d); + hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_BOUND, &d); + hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECT, + &d); + hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECTED, + &d); if (d.count) return; @@ -909,7 +912,8 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t if (!hdev->acl_mtu) return ERR_PTR(-ECONNREFUSED); break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: if (hdev->iso_mtu) /* Dedicated ISO Buffer exists */ break; @@ -973,7 +977,8 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t hci_copy_identity_address(hdev, &conn->src, &conn->src_type); conn->mtu = hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu; break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: /* conn->src should reflect the local identity address */ hci_copy_identity_address(hdev, &conn->src, &conn->src_type); @@ -1002,6 +1007,7 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t } skb_queue_head_init(&conn->data_q); + skb_queue_head_init(&conn->tx_q.queue); INIT_LIST_HEAD(&conn->chan_list); INIT_LIST_HEAD(&conn->link_list); @@ -1069,7 +1075,8 @@ static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason) if (HCI_CONN_HANDLE_UNSET(conn->handle)) hci_conn_failed(conn, reason); break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: if ((conn->state != BT_CONNECTED && !test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) || test_bit(HCI_CONN_BIG_CREATED, &conn->flags)) @@ -1144,7 +1151,8 @@ void hci_conn_del(struct hci_conn *conn) hdev->acl_cnt += conn->sent; } else { /* Unacked ISO frames */ - if (conn->type == ISO_LINK) { + if (conn->type == CIS_LINK || + conn->type == BIS_LINK) { if (hdev->iso_pkts) hdev->iso_cnt += conn->sent; else if (hdev->le_pkts) @@ -1155,6 +1163,7 @@ void hci_conn_del(struct hci_conn *conn) } skb_queue_purge(&conn->data_q); + skb_queue_purge(&conn->tx_q.queue); /* Remove the connection from the list and cleanup its remaining * state. 
This is a separate function since for some cases like @@ -1529,7 +1538,7 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, memcmp(conn->le_per_adv_data, base, base_len))) return ERR_PTR(-EADDRINUSE); - conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_MASTER); + conn = hci_conn_add_unset(hdev, BIS_LINK, dst, HCI_ROLE_MASTER); if (IS_ERR(conn)) return conn; @@ -1737,7 +1746,7 @@ static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos) data.count = 0; /* Create a BIS for each bound connection */ - hci_conn_hash_list_state(hdev, bis_list, ISO_LINK, + hci_conn_hash_list_state(hdev, bis_list, BIS_LINK, BT_BOUND, &data); cp.handle = qos->bcast.big; @@ -1826,12 +1835,12 @@ static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos) for (data.cig = 0x00; data.cig < 0xf0; data.cig++) { data.count = 0; - hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, + hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECT, &data); if (data.count) continue; - hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, + hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECTED, &data); if (!data.count) break; @@ -1881,7 +1890,8 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, cis = hci_conn_hash_lookup_cis(hdev, dst, dst_type, qos->ucast.cig, qos->ucast.cis); if (!cis) { - cis = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_MASTER); + cis = hci_conn_add_unset(hdev, CIS_LINK, dst, + HCI_ROLE_MASTER); if (IS_ERR(cis)) return cis; cis->cleanup = cis_cleanup; @@ -1973,7 +1983,7 @@ bool hci_iso_setup_path(struct hci_conn *conn) int hci_conn_check_create_cis(struct hci_conn *conn) { - if (conn->type != ISO_LINK || !bacmp(&conn->dst, BDADDR_ANY)) + if (conn->type != CIS_LINK) return -EINVAL; if (!conn->parent || conn->parent->state != BT_CONNECTED || @@ -2061,102 +2071,15 @@ static int create_big_sync(struct hci_dev *hdev, void *data) return hci_le_create_big(conn, &conn->iso_qos); } -static void create_pa_complete(struct hci_dev *hdev, void *data, int err) -{ - bt_dev_dbg(hdev, ""); - - if (err) - bt_dev_err(hdev, "Unable to create PA: %d", err); -} - -static bool hci_conn_check_create_pa_sync(struct hci_conn *conn) -{ - if (conn->type != ISO_LINK || conn->sid == HCI_SID_INVALID) - return false; - - return true; -} - -static int create_pa_sync(struct hci_dev *hdev, void *data) -{ - struct hci_cp_le_pa_create_sync cp = {0}; - struct hci_conn *conn; - int err = 0; - - hci_dev_lock(hdev); - - rcu_read_lock(); - - /* The spec allows only one pending LE Periodic Advertising Create - * Sync command at a time. If the command is pending now, don't do - * anything. We check for pending connections after each PA Sync - * Established event. - * - * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E - * page 2493: - * - * If the Host issues this command when another HCI_LE_Periodic_ - * Advertising_Create_Sync command is pending, the Controller shall - * return the error code Command Disallowed (0x0C). 
- */ - list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { - if (test_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags)) - goto unlock; - } - - list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { - if (hci_conn_check_create_pa_sync(conn)) { - struct bt_iso_qos *qos = &conn->iso_qos; - - cp.options = qos->bcast.options; - cp.sid = conn->sid; - cp.addr_type = conn->dst_type; - bacpy(&cp.addr, &conn->dst); - cp.skip = cpu_to_le16(qos->bcast.skip); - cp.sync_timeout = cpu_to_le16(qos->bcast.sync_timeout); - cp.sync_cte_type = qos->bcast.sync_cte_type; - - break; - } - } - -unlock: - rcu_read_unlock(); - - hci_dev_unlock(hdev); - - if (bacmp(&cp.addr, BDADDR_ANY)) { - hci_dev_set_flag(hdev, HCI_PA_SYNC); - set_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); - - err = __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC, - sizeof(cp), &cp, HCI_CMD_TIMEOUT); - if (!err) - err = hci_update_passive_scan_sync(hdev); - - if (err) { - hci_dev_clear_flag(hdev, HCI_PA_SYNC); - clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); - } - } - - return err; -} - -int hci_pa_create_sync_pending(struct hci_dev *hdev) -{ - /* Queue start pa_create_sync and scan */ - return hci_cmd_sync_queue(hdev, create_pa_sync, - NULL, create_pa_complete); -} - struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos) { struct hci_conn *conn; - conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_SLAVE); + bt_dev_dbg(hdev, "dst %pMR type %d sid %d", dst, dst_type, sid); + + conn = hci_conn_add_unset(hdev, BIS_LINK, dst, HCI_ROLE_SLAVE); if (IS_ERR(conn)) return conn; @@ -2164,97 +2087,18 @@ struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, conn->dst_type = dst_type; conn->sid = sid; conn->state = BT_LISTEN; + conn->conn_timeout = msecs_to_jiffies(qos->bcast.sync_timeout * 10); hci_conn_hold(conn); - hci_pa_create_sync_pending(hdev); + hci_connect_pa_sync(hdev, conn); return conn; } -static bool hci_conn_check_create_big_sync(struct hci_conn *conn) -{ - if (!conn->num_bis) - return false; - - return true; -} - -static void big_create_sync_complete(struct hci_dev *hdev, void *data, int err) -{ - bt_dev_dbg(hdev, ""); - - if (err) - bt_dev_err(hdev, "Unable to create BIG sync: %d", err); -} - -static int big_create_sync(struct hci_dev *hdev, void *data) -{ - DEFINE_FLEX(struct hci_cp_le_big_create_sync, pdu, bis, num_bis, 0x11); - struct hci_conn *conn; - - rcu_read_lock(); - - pdu->num_bis = 0; - - /* The spec allows only one pending LE BIG Create Sync command at - * a time. If the command is pending now, don't do anything. We - * check for pending connections after each BIG Sync Established - * event. - * - * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E - * page 2586: - * - * If the Host sends this command when the Controller is in the - * process of synchronizing to any BIG, i.e. the HCI_LE_BIG_Sync_ - * Established event has not been generated, the Controller shall - * return the error code Command Disallowed (0x0C). 
- */ - list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { - if (test_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags)) - goto unlock; - } - - list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { - if (hci_conn_check_create_big_sync(conn)) { - struct bt_iso_qos *qos = &conn->iso_qos; - - set_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags); - - pdu->handle = qos->bcast.big; - pdu->sync_handle = cpu_to_le16(conn->sync_handle); - pdu->encryption = qos->bcast.encryption; - memcpy(pdu->bcode, qos->bcast.bcode, - sizeof(pdu->bcode)); - pdu->mse = qos->bcast.mse; - pdu->timeout = cpu_to_le16(qos->bcast.timeout); - pdu->num_bis = conn->num_bis; - memcpy(pdu->bis, conn->bis, conn->num_bis); - - break; - } - } - -unlock: - rcu_read_unlock(); - - if (!pdu->num_bis) - return 0; - - return hci_send_cmd(hdev, HCI_OP_LE_BIG_CREATE_SYNC, - struct_size(pdu, bis, pdu->num_bis), pdu); -} - -int hci_le_big_create_sync_pending(struct hci_dev *hdev) -{ - /* Queue big_create_sync */ - return hci_cmd_sync_queue_once(hdev, big_create_sync, - NULL, big_create_sync_complete); -} - -int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, - struct bt_iso_qos *qos, - __u16 sync_handle, __u8 num_bis, __u8 bis[]) +int hci_conn_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, + struct bt_iso_qos *qos, __u16 sync_handle, + __u8 num_bis, __u8 bis[]) { int err; @@ -2271,9 +2115,10 @@ int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, hcon->num_bis = num_bis; memcpy(hcon->bis, bis, num_bis); + hcon->conn_timeout = msecs_to_jiffies(qos->bcast.timeout * 10); } - return hci_le_big_create_sync_pending(hdev); + return hci_connect_big_sync(hdev, hcon); } static void create_big_complete(struct hci_dev *hdev, void *data, int err) @@ -2383,7 +2228,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, * the start periodic advertising and create BIG commands have * been queued */ - hci_conn_hash_list_state(hdev, bis_mark_per_adv, ISO_LINK, + hci_conn_hash_list_state(hdev, bis_mark_per_adv, BIS_LINK, BT_BOUND, &data); /* Queue start periodic advertising and create BIG */ @@ -3064,3 +2909,184 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason) */ return hci_cmd_sync_run_once(hdev, abort_conn_sync, conn, NULL); } + +void hci_setup_tx_timestamp(struct sk_buff *skb, size_t key_offset, + const struct sockcm_cookie *sockc) +{ + struct sock *sk = skb ? skb->sk : NULL; + int key; + + /* This shall be called on a single skb of those generated by user + * sendmsg(), and only when the sendmsg() does not return error to + * user. This is required for keeping the tskey that increments here in + * sync with possible sendmsg() counting by user. + * + * Stream sockets shall set key_offset to sendmsg() length in bytes + * and call with the last fragment, others to 1 and first fragment. 
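+ *
+ * A minimal user-space sketch of the resulting key accounting (assumed
+ * usage, not part of this patch; reading the records via MSG_ERRQUEUE /
+ * BT_SCM_ERROR works as for other SO_TIMESTAMPING users):
+ *
+ *	unsigned int val = SOF_TIMESTAMPING_SOFTWARE |
+ *			   SOF_TIMESTAMPING_TX_COMPLETION |
+ *			   SOF_TIMESTAMPING_OPT_ID;
+ *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &val, sizeof(val));
+ *	send(fd, buf, 100, 0);
+ *
+ * On a stream socket this sendmsg() advances sk_tskey by 100 and the
+ * completion record carries tskey 99 (key - 1); on other socket types
+ * the key advances by one per sendmsg().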
+ */ + + if (!skb || !sockc || !sk || !key_offset) + return; + + sock_tx_timestamp(sk, sockc, &skb_shinfo(skb)->tx_flags); + + if (sk->sk_type == SOCK_STREAM) + key = atomic_add_return(key_offset, &sk->sk_tskey); + + if (sockc->tsflags & SOF_TIMESTAMPING_OPT_ID && + sockc->tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) { + if (sockc->tsflags & SOCKCM_FLAG_TS_OPT_ID) { + skb_shinfo(skb)->tskey = sockc->ts_opt_id; + } else { + if (sk->sk_type != SOCK_STREAM) + key = atomic_inc_return(&sk->sk_tskey); + skb_shinfo(skb)->tskey = key - 1; + } + } +} + +void hci_conn_tx_queue(struct hci_conn *conn, struct sk_buff *skb) +{ + struct tx_queue *comp = &conn->tx_q; + bool track = false; + + /* Emit SND now, ie. just before sending to driver */ + if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP) + __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SND); + + /* COMPLETION tstamp is emitted for tracked skb later in Number of + * Completed Packets event. Available only for flow controlled cases. + * + * TODO: SCO support without flowctl (needs to be done in drivers) + */ + switch (conn->type) { + case CIS_LINK: + case BIS_LINK: + case ACL_LINK: + case LE_LINK: + break; + case SCO_LINK: + case ESCO_LINK: + if (!hci_dev_test_flag(conn->hdev, HCI_SCO_FLOWCTL)) + return; + break; + default: + return; + } + + if (skb->sk && (skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP)) + track = true; + + /* If nothing is tracked, just count extra skbs at the queue head */ + if (!track && !comp->tracked) { + comp->extra++; + return; + } + + if (track) { + skb = skb_clone_sk(skb); + if (!skb) + goto count_only; + + comp->tracked++; + } else { + skb = skb_clone(skb, GFP_KERNEL); + if (!skb) + goto count_only; + } + + skb_queue_tail(&comp->queue, skb); + return; + +count_only: + /* Stop tracking skbs, and only count. This will not emit timestamps for + * the packets, but if we get here something is more seriously wrong. + */ + comp->tracked = 0; + comp->extra += skb_queue_len(&comp->queue) + 1; + skb_queue_purge(&comp->queue); +} + +void hci_conn_tx_dequeue(struct hci_conn *conn) +{ + struct tx_queue *comp = &conn->tx_q; + struct sk_buff *skb; + + /* If there are tracked skbs, the counted extra go before dequeuing real + * skbs, to keep ordering. When nothing is tracked, the ordering doesn't + * matter so dequeue real skbs first to get rid of them ASAP. 
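+ *
+ * Worked example (illustrative): with comp->extra == 2 and one tracked
+ * skb queued, three completed-packet credits dequeue in order as
+ *
+ *	credit 1: extra 2 -> 1 (nothing emitted)
+ *	credit 2: extra 1 -> 0 (nothing emitted)
+ *	credit 3: tracked skb dequeued, SCM_TSTAMP_COMPLETION emitted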
+ */ + if (comp->extra && (comp->tracked || skb_queue_empty(&comp->queue))) { + comp->extra--; + return; + } + + skb = skb_dequeue(&comp->queue); + if (!skb) + return; + + if (skb->sk) { + comp->tracked--; + __skb_tstamp_tx(skb, NULL, NULL, skb->sk, + SCM_TSTAMP_COMPLETION); + } + + kfree_skb(skb); +} + +u8 *hci_conn_key_enc_size(struct hci_conn *conn) +{ + if (conn->type == ACL_LINK) { + struct link_key *key; + + key = hci_find_link_key(conn->hdev, &conn->dst); + if (!key) + return NULL; + + return &key->pin_len; + } else if (conn->type == LE_LINK) { + struct smp_ltk *ltk; + + ltk = hci_find_ltk(conn->hdev, &conn->dst, conn->dst_type, + conn->role); + if (!ltk) + return NULL; + + return &ltk->enc_size; + } + + return NULL; +} + +int hci_ethtool_ts_info(unsigned int index, int sk_proto, + struct kernel_ethtool_ts_info *info) +{ + struct hci_dev *hdev; + + hdev = hci_dev_get(index); + if (!hdev) + return -ENODEV; + + info->so_timestamping = + SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; + info->phc_index = -1; + info->tx_types = BIT(HWTSTAMP_TX_OFF); + info->rx_filters = BIT(HWTSTAMP_FILTER_NONE); + + switch (sk_proto) { + case BTPROTO_ISO: + case BTPROTO_L2CAP: + info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE; + info->so_timestamping |= SOF_TIMESTAMPING_TX_COMPLETION; + break; + case BTPROTO_SCO: + info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE; + if (hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL)) + info->so_timestamping |= SOF_TIMESTAMPING_TX_COMPLETION; + break; + } + + hci_dev_put(hdev); + return 0; +} diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 18ab5628f85a..3b49828160b7 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -57,6 +57,7 @@ DEFINE_RWLOCK(hci_dev_list_lock); /* HCI callback list */ LIST_HEAD(hci_cb_list); +DEFINE_MUTEX(hci_cb_list_lock); /* HCI ID Numbering */ static DEFINE_IDA(hci_index_ida); @@ -1456,8 +1457,8 @@ static void hci_cmd_timeout(struct work_struct *work) bt_dev_err(hdev, "command tx timeout"); } - if (hdev->cmd_timeout) - hdev->cmd_timeout(hdev); + if (hdev->reset) + hdev->reset(hdev); atomic_set(&hdev->cmd_cnt, 1); queue_work(hdev->workqueue, &hdev->cmd_work); @@ -2181,26 +2182,6 @@ int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr, return 0; } -int hci_bdaddr_list_del_with_flags(struct list_head *list, bdaddr_t *bdaddr, - u8 type) -{ - struct bdaddr_list_with_flags *entry; - - if (!bacmp(bdaddr, BDADDR_ANY)) { - hci_bdaddr_list_clear(list); - return 0; - } - - entry = hci_bdaddr_list_lookup_with_flags(list, bdaddr, type); - if (!entry) - return -ENOENT; - - list_del(&entry->list); - kfree(entry); - - return 0; -} - /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) @@ -2917,12 +2898,13 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) break; case HCI_ACLDATA_PKT: /* Detect if ISO packet has been sent as ACL */ - if (hci_conn_num(hdev, ISO_LINK)) { + if (hci_conn_num(hdev, CIS_LINK) || + hci_conn_num(hdev, BIS_LINK)) { __u16 handle = __le16_to_cpu(hci_acl_hdr(skb)->handle); __u8 type; type = hci_conn_lookup_type(hdev, hci_handle(handle)); - if (type == ISO_LINK) + if (type == CIS_LINK || type == BIS_LINK) hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; } break; @@ -2930,6 +2912,8 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) break; case HCI_ISODATA_PKT: break; + case HCI_DRV_PKT: + break; default: kfree_skb(skb); return -EINVAL; } @@
-2992,7 +2976,9 @@ int hci_register_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); - list_add_tail_rcu(&cb->list, &hci_cb_list); + mutex_lock(&hci_cb_list_lock); + list_add_tail(&cb->list, &hci_cb_list); + mutex_unlock(&hci_cb_list_lock); return 0; } @@ -3002,8 +2988,9 @@ int hci_unregister_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); - list_del_rcu(&cb->list); - synchronize_rcu(); + mutex_lock(&hci_cb_list_lock); + list_del(&cb->list); + mutex_unlock(&hci_cb_list_lock); return 0; } @@ -3035,6 +3022,15 @@ static int hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) return -EINVAL; } + if (hci_skb_pkt_type(skb) == HCI_DRV_PKT) { + /* Intercept HCI Drv packet here and don't go with hdev->send + * callback. + */ + err = hci_drv_process_cmd(hdev, skb); + kfree_skb(skb); + return err; + } + err = hdev->send(hdev, skb); if (err < 0) { bt_dev_err(hdev, "sending frame failed (%d)", err); @@ -3045,6 +3041,13 @@ static int hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) return 0; } +static int hci_send_conn_frame(struct hci_dev *hdev, struct hci_conn *conn, + struct sk_buff *skb) +{ + hci_conn_tx_queue(conn, skb); + return hci_send_frame(hdev, skb); +} + /* Send HCI command */ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, const void *param) @@ -3354,7 +3357,8 @@ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote) case LE_LINK: cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: cnt = hdev->iso_mtu ? hdev->iso_cnt : hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; break; @@ -3368,7 +3372,7 @@ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote) } static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, - int *quote) + __u8 type2, int *quote) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *conn = NULL, *c; @@ -3380,7 +3384,8 @@ static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != type || skb_queue_empty(&c->data_q)) + if ((c->type != type && c->type != type2) || + skb_queue_empty(&c->data_q)) continue; if (c->state != BT_CONNECTED && c->state != BT_CONFIG) @@ -3568,51 +3573,45 @@ static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type) } /* Schedule SCO */ -static void hci_sched_sco(struct hci_dev *hdev) +static void hci_sched_sco(struct hci_dev *hdev, __u8 type) { struct hci_conn *conn; struct sk_buff *skb; - int quote; + int quote, *cnt; + unsigned int pkts = hdev->sco_pkts; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "type %u", type); - if (!hci_conn_num(hdev, SCO_LINK)) + if (!hci_conn_num(hdev, type) || !pkts) return; - while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, &quote))) { - while (quote-- && (skb = skb_dequeue(&conn->data_q))) { - BT_DBG("skb %p len %d", skb, skb->len); - hci_send_frame(hdev, skb); - - conn->sent++; - if (conn->sent == ~0) - conn->sent = 0; - } - } -} - -static void hci_sched_esco(struct hci_dev *hdev) -{ - struct hci_conn *conn; - struct sk_buff *skb; - int quote; - - BT_DBG("%s", hdev->name); - - if (!hci_conn_num(hdev, ESCO_LINK)) - return; + /* Use sco_pkts if flow control has not been enabled, which will limit + * the number of buffers sent in a row.
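+ *
+ * Concretely (illustrative numbers): with flow control off and
+ * sco_pkts == 8, at most 8 frames are handed to the driver per
+ * scheduling pass and, once all 8 local credits are consumed, tx_work
+ * is requeued below since no HCI_EV_NUM_COMP_PKTS event will return
+ * credits; with flow control on, sco_cnt is consumed per frame and
+ * replenished by that event, as for ACL.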
+ */ + if (!hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL)) + cnt = &pkts; + else + cnt = &hdev->sco_cnt; - while (hdev->sco_cnt && (conn = hci_low_sent(hdev, ESCO_LINK, - &quote))) { + while (*cnt && (conn = hci_low_sent(hdev, type, type, &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); - hci_send_frame(hdev, skb); + hci_send_conn_frame(hdev, conn, skb); conn->sent++; if (conn->sent == ~0) conn->sent = 0; + (*cnt)--; } } + + /* Reschedule if all packets were sent and flow control is not enabled, + * as there could be more packets queued that could not be sent, and + * since no HCI_EV_NUM_COMP_PKTS event will be generated the reschedule + * needs to be forced. + */ + if (!pkts && !hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL)) + queue_work(hdev->workqueue, &hdev->tx_work); } static void hci_sched_acl_pkt(struct hci_dev *hdev) @@ -3640,7 +3639,7 @@ static void hci_sched_acl_pkt(struct hci_dev *hdev) hci_conn_enter_active_mode(chan->conn, bt_cb(skb)->force_active); - hci_send_frame(hdev, skb); + hci_send_conn_frame(hdev, chan->conn, skb); hdev->acl_last_tx = jiffies; hdev->acl_cnt--; @@ -3648,8 +3647,8 @@ chan->conn->sent++; /* Send pending SCO packets right away */ - hci_sched_sco(hdev); - hci_sched_esco(hdev); + hci_sched_sco(hdev, SCO_LINK); + hci_sched_sco(hdev, ESCO_LINK); } } @@ -3696,7 +3695,7 @@ static void hci_sched_le(struct hci_dev *hdev) skb = skb_dequeue(&chan->data_q); - hci_send_frame(hdev, skb); + hci_send_conn_frame(hdev, chan->conn, skb); hdev->le_last_tx = jiffies; (*cnt)--; @@ -3704,8 +3703,8 @@ chan->conn->sent++; /* Send pending SCO packets right away */ - hci_sched_sco(hdev); - hci_sched_esco(hdev); + hci_sched_sco(hdev, SCO_LINK); + hci_sched_sco(hdev, ESCO_LINK); } } @@ -3722,15 +3721,17 @@ static void hci_sched_iso(struct hci_dev *hdev) BT_DBG("%s", hdev->name); - if (!hci_conn_num(hdev, ISO_LINK)) + if (!hci_conn_num(hdev, CIS_LINK) && + !hci_conn_num(hdev, BIS_LINK)) return; cnt = hdev->iso_pkts ? &hdev->iso_cnt : hdev->le_pkts ?
&hdev->le_cnt : &hdev->acl_cnt; - while (*cnt && (conn = hci_low_sent(hdev, ISO_LINK, &quote))) { + while (*cnt && (conn = hci_low_sent(hdev, CIS_LINK, BIS_LINK, + &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); - hci_send_frame(hdev, skb); + hci_send_conn_frame(hdev, conn, skb); conn->sent++; if (conn->sent == ~0) @@ -3750,8 +3751,8 @@ static void hci_tx_work(struct work_struct *work) if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { /* Schedule queues and send stuff to HCI driver */ - hci_sched_sco(hdev); - hci_sched_esco(hdev); + hci_sched_sco(hdev, SCO_LINK); + hci_sched_sco(hdev, ESCO_LINK); hci_sched_iso(hdev); hci_sched_acl(hdev); hci_sched_le(hdev); @@ -4072,10 +4073,13 @@ static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) return; } - err = hci_send_frame(hdev, skb); - if (err < 0) { - hci_cmd_sync_cancel_sync(hdev, -err); - return; + if (hci_skb_opcode(skb) != HCI_OP_NOP) { + err = hci_send_frame(hdev, skb); + if (err < 0) { + hci_cmd_sync_cancel_sync(hdev, -err); + return; + } + atomic_dec(&hdev->cmd_cnt); } if (hdev->req_status == HCI_REQ_PEND && @@ -4083,8 +4087,6 @@ static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) kfree_skb(hdev->req_skb); hdev->req_skb = skb_clone(hdev->sent_cmd, GFP_KERNEL); } - - atomic_dec(&hdev->cmd_cnt); } static void hci_cmd_work(struct work_struct *work) diff --git a/net/bluetooth/hci_drv.c b/net/bluetooth/hci_drv.c new file mode 100644 index 000000000000..3dd2d8a006b9 --- /dev/null +++ b/net/bluetooth/hci_drv.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025 Google Corporation + */ + +#include <linux/skbuff.h> +#include <linux/types.h> + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci.h> +#include <net/bluetooth/hci_core.h> +#include <net/bluetooth/hci_drv.h> + +int hci_drv_cmd_status(struct hci_dev *hdev, u16 cmd, u8 status) +{ + struct hci_drv_ev_hdr *hdr; + struct hci_drv_ev_cmd_status *ev; + struct sk_buff *skb; + + skb = bt_skb_alloc(sizeof(*hdr) + sizeof(*ev), GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = skb_put(skb, sizeof(*hdr)); + hdr->opcode = __cpu_to_le16(HCI_DRV_EV_CMD_STATUS); + hdr->len = __cpu_to_le16(sizeof(*ev)); + + ev = skb_put(skb, sizeof(*ev)); + ev->opcode = __cpu_to_le16(cmd); + ev->status = status; + + hci_skb_pkt_type(skb) = HCI_DRV_PKT; + + return hci_recv_frame(hdev, skb); +} +EXPORT_SYMBOL(hci_drv_cmd_status); + +int hci_drv_cmd_complete(struct hci_dev *hdev, u16 cmd, u8 status, void *rp, + size_t rp_len) +{ + struct hci_drv_ev_hdr *hdr; + struct hci_drv_ev_cmd_complete *ev; + struct sk_buff *skb; + + skb = bt_skb_alloc(sizeof(*hdr) + sizeof(*ev) + rp_len, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = skb_put(skb, sizeof(*hdr)); + hdr->opcode = __cpu_to_le16(HCI_DRV_EV_CMD_COMPLETE); + hdr->len = __cpu_to_le16(sizeof(*ev) + rp_len); + + ev = skb_put(skb, sizeof(*ev)); + ev->opcode = __cpu_to_le16(cmd); + ev->status = status; + + skb_put_data(skb, rp, rp_len); + + hci_skb_pkt_type(skb) = HCI_DRV_PKT; + + return hci_recv_frame(hdev, skb); +} +EXPORT_SYMBOL(hci_drv_cmd_complete); + +int hci_drv_process_cmd(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_drv_cmd_hdr *hdr; + const struct hci_drv_handler *handler = NULL; + u16 opcode, len, ogf, ocf; + + hdr = skb_pull_data(skb, sizeof(*hdr)); + if (!hdr) + return -EILSEQ; + + opcode = __le16_to_cpu(hdr->opcode); + len = __le16_to_cpu(hdr->len); + if (len != skb->len) + return -EILSEQ; + + ogf =
hci_opcode_ogf(opcode); + ocf = hci_opcode_ocf(opcode); + + if (!hdev->hci_drv) + return hci_drv_cmd_status(hdev, opcode, + HCI_DRV_STATUS_UNKNOWN_COMMAND); + + if (ogf != HCI_DRV_OGF_DRIVER_SPECIFIC) { + if (opcode < hdev->hci_drv->common_handler_count) + handler = &hdev->hci_drv->common_handlers[opcode]; + } else { + if (ocf < hdev->hci_drv->specific_handler_count) + handler = &hdev->hci_drv->specific_handlers[ocf]; + } + + if (!handler || !handler->func) + return hci_drv_cmd_status(hdev, opcode, + HCI_DRV_STATUS_UNKNOWN_COMMAND); + + if (len != handler->data_len) + return hci_drv_cmd_status(hdev, opcode, + HCI_DRV_STATUS_INVALID_PARAMETERS); + + return handler->func(hdev, skb->data, len); +} +EXPORT_SYMBOL(hci_drv_process_cmd); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 2cc7a9306350..66052d6aaa1d 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -151,7 +151,7 @@ static u8 hci_cc_exit_periodic_inq(struct hci_dev *hdev, void *data, static u8 hci_cc_remote_name_req_cancel(struct hci_dev *hdev, void *data, struct sk_buff *skb) { - struct hci_ev_status *rp = data; + struct hci_rp_remote_name_req_cancel *rp = data; bt_dev_dbg(hdev, "status 0x%2.2x", rp->status); @@ -739,10 +739,17 @@ static u8 hci_cc_read_enc_key_size(struct hci_dev *hdev, void *data, handle); conn->enc_key_size = 0; } else { + u8 *key_enc_size = hci_conn_key_enc_size(conn); + conn->enc_key_size = rp->key_size; status = 0; - if (conn->enc_key_size < hdev->min_enc_key_size) { + /* Attempt to check if the key size is too small or if it has + * been downgraded from the last time it was stored as part of + * the link_key. + */ + if (conn->enc_key_size < hdev->min_enc_key_size || + (key_enc_size && conn->enc_key_size < *key_enc_size)) { /* As slave role, the conn->state has been set to * BT_CONNECTED and l2cap conn req might not be received * yet, at this moment the l2cap layer almost does @@ -755,6 +762,10 @@ static u8 hci_cc_read_enc_key_size(struct hci_dev *hdev, void *data, clear_bit(HCI_CONN_ENCRYPT, &conn->flags); clear_bit(HCI_CONN_AES_CCM, &conn->flags); } + + /* Update the key encryption size with the connection one */ + if (key_enc_size && *key_enc_size != conn->enc_key_size) + *key_enc_size = conn->enc_key_size; } hci_encrypt_cfm(conn, status); @@ -930,6 +941,9 @@ static u8 hci_cc_read_buffer_size(struct hci_dev *hdev, void *data, hdev->sco_pkts = 8; } + if (!read_voice_setting_capable(hdev)) + hdev->sco_pkts = 0; + hdev->acl_cnt = hdev->acl_pkts; hdev->sco_cnt = hdev->sco_pkts; @@ -3062,6 +3076,34 @@ static void hci_inquiry_result_evt(struct hci_dev *hdev, void *edata, hci_dev_unlock(hdev); } +static int hci_read_enc_key_size(struct hci_dev *hdev, struct hci_conn *conn) +{ + struct hci_cp_read_enc_key_size cp; + u8 *key_enc_size = hci_conn_key_enc_size(conn); + + if (!read_key_size_capable(hdev)) { + conn->enc_key_size = HCI_LINK_KEY_SIZE; + return -EOPNOTSUPP; + } + + bt_dev_dbg(hdev, "hcon %p", conn); + + memset(&cp, 0, sizeof(cp)); + cp.handle = cpu_to_le16(conn->handle); + + /* If the key enc_size is already known, use it as conn->enc_key_size, + * otherwise use hdev->min_enc_key_size so the likes of + * l2cap_check_enc_key_size don't fail while waiting for + * HCI_OP_READ_ENC_KEY_SIZE response. 
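+ *
+ * Illustrative timeline (assumed values): with min_enc_key_size 7 and a
+ * stored key whose recorded size is 16, enc_key_size is provisionally
+ * 16 here; once the Read Encryption Key Size response arrives,
+ * hci_cc_read_enc_key_size() replaces it with the controller-reported
+ * value and treats anything below the stored 16 as a downgrade.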
+ */ + if (key_enc_size && *key_enc_size) + conn->enc_key_size = *key_enc_size; + else + conn->enc_key_size = hdev->min_enc_key_size; + + return hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE, sizeof(cp), &cp); +} + static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -3154,23 +3196,11 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, if (ev->encr_mode == 1 && !test_bit(HCI_CONN_ENCRYPT, &conn->flags) && ev->link_type == ACL_LINK) { struct link_key *key; - struct hci_cp_read_enc_key_size cp; key = hci_find_link_key(hdev, &ev->bdaddr); if (key) { set_bit(HCI_CONN_ENCRYPT, &conn->flags); - - if (!read_key_size_capable(hdev)) { - conn->enc_key_size = HCI_LINK_KEY_SIZE; - } else { - cp.handle = cpu_to_le16(conn->handle); - if (hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE, - sizeof(cp), &cp)) { - bt_dev_err(hdev, "sending read key size failed"); - conn->enc_key_size = HCI_LINK_KEY_SIZE; - } - } - + hci_read_enc_key_size(hdev, conn); hci_encrypt_cfm(conn, ev->status); } } @@ -3391,23 +3421,30 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, void *data, hci_update_scan(hdev); } - params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); - if (params) { - switch (params->auto_connect) { - case HCI_AUTO_CONN_LINK_LOSS: - if (ev->reason != HCI_ERROR_CONNECTION_TIMEOUT) + /* Re-enable passive scanning if disconnected device is marked + * as auto-connectable. + */ + if (conn->type == LE_LINK) { + params = hci_conn_params_lookup(hdev, &conn->dst, + conn->dst_type); + if (params) { + switch (params->auto_connect) { + case HCI_AUTO_CONN_LINK_LOSS: + if (ev->reason != HCI_ERROR_CONNECTION_TIMEOUT) + break; + fallthrough; + + case HCI_AUTO_CONN_DIRECT: + case HCI_AUTO_CONN_ALWAYS: + hci_pend_le_list_del_init(params); + hci_pend_le_list_add(params, + &hdev->pend_le_conns); + hci_update_passive_scan(hdev); break; - fallthrough; - case HCI_AUTO_CONN_DIRECT: - case HCI_AUTO_CONN_ALWAYS: - hci_pend_le_list_del_init(params); - hci_pend_le_list_add(params, &hdev->pend_le_conns); - hci_update_passive_scan(hdev); - break; - - default: - break; + default: + break; + } } } @@ -3602,24 +3639,8 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data, /* Try reading the encryption key size for encrypted ACL links */ if (!ev->status && ev->encrypt && conn->type == ACL_LINK) { - struct hci_cp_read_enc_key_size cp; - - /* Only send HCI_Read_Encryption_Key_Size if the - * controller really supports it. If it doesn't, assume - * the default size (16). 
- */ - if (!read_key_size_capable(hdev)) { - conn->enc_key_size = HCI_LINK_KEY_SIZE; + if (hci_read_enc_key_size(hdev, conn)) goto notify; - } - - cp.handle = cpu_to_le16(conn->handle); - if (hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE, - sizeof(cp), &cp)) { - bt_dev_err(hdev, "sending read key size failed"); - conn->enc_key_size = HCI_LINK_KEY_SIZE; - goto notify; - } goto unlock; } @@ -3783,7 +3804,7 @@ static void hci_unbound_cis_failed(struct hci_dev *hdev, u8 cig, u8 status) lockdep_assert_held(&hdev->lock); list_for_each_entry_safe(conn, tmp, &hdev->conn_hash.list, list) { - if (conn->type != ISO_LINK || !bacmp(&conn->dst, BDADDR_ANY) || + if (conn->type != CIS_LINK || conn->state == BT_OPEN || conn->iso_qos.ucast.cig != cig) continue; @@ -4005,8 +4026,8 @@ static const struct hci_cc { HCI_CC_STATUS(HCI_OP_INQUIRY_CANCEL, hci_cc_inquiry_cancel), HCI_CC_STATUS(HCI_OP_PERIODIC_INQ, hci_cc_periodic_inq), HCI_CC_STATUS(HCI_OP_EXIT_PERIODIC_INQ, hci_cc_exit_periodic_inq), - HCI_CC_STATUS(HCI_OP_REMOTE_NAME_REQ_CANCEL, - hci_cc_remote_name_req_cancel), + HCI_CC(HCI_OP_REMOTE_NAME_REQ_CANCEL, hci_cc_remote_name_req_cancel, + sizeof(struct hci_rp_remote_name_req_cancel)), HCI_CC(HCI_OP_ROLE_DISCOVERY, hci_cc_role_discovery, sizeof(struct hci_rp_role_discovery)), HCI_CC(HCI_OP_READ_LINK_POLICY, hci_cc_read_link_policy, @@ -4405,6 +4426,7 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, struct hci_comp_pkts_info *info = &ev->handles[i]; struct hci_conn *conn; __u16 handle, count; + unsigned int i; handle = __le16_to_cpu(info->handle); count = __le16_to_cpu(info->count); @@ -4415,6 +4437,9 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, conn->sent -= count; + for (i = 0; i < count; ++i) + hci_conn_tx_dequeue(conn); + switch (conn->type) { case ACL_LINK: hdev->acl_cnt += count; @@ -4435,12 +4460,15 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, break; case SCO_LINK: + case ESCO_LINK: hdev->sco_cnt += count; if (hdev->sco_cnt > hdev->sco_pkts) hdev->sco_cnt = hdev->sco_pkts; + break; - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: if (hdev->iso_pkts) { hdev->iso_cnt += count; if (hdev->iso_cnt > hdev->iso_pkts) @@ -6044,8 +6072,17 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, * a LE Direct Advertising Report event. In that case it is * important to see if the address is matching the local * controller address. + * + * If local privacy is not enabled the controller shall not be + * generating such event since according to its documentation it is only + * valid for filter_policy 0x02 and 0x03, but the fact that it did + * generate LE Direct Advertising Report means it is probably broken and + * won't generate any other event which can potentially break + * auto-connect logic, so in case local privacy is not enabled this + * ignores the direct_addr so it works as a regular report. */ - if (!hci_dev_test_flag(hdev, HCI_MESH) && direct_addr) { + if (!hci_dev_test_flag(hdev, HCI_MESH) && direct_addr && + hci_dev_test_flag(hdev, HCI_PRIVACY)) { direct_addr_type = ev_bdaddr_type(hdev, direct_addr_type, &bdaddr_resolved); @@ -6055,12 +6092,6 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, if (!hci_bdaddr_is_rpa(direct_addr, direct_addr_type)) return; - /* If the controller is not using resolvable random - * addresses, then this report can be ignored.
- */ - if (!hci_dev_test_flag(hdev, HCI_PRIVACY)) - return; - /* If the local IRK of the controller does not match * with the resolvable random address provided, then * this report can be ignored. @@ -6141,11 +6172,12 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, * event or send an immediate device found event if the data * should not be stored for later. */ - if (!ext_adv && !has_pending_adv_report(hdev)) { + if (!has_pending_adv_report(hdev)) { /* If the report will trigger a SCAN_REQ store it for * later merging. */ - if (type == LE_ADV_IND || type == LE_ADV_SCAN_IND) { + if (!ext_adv && (type == LE_ADV_IND || + type == LE_ADV_SCAN_IND)) { store_pending_adv_report(hdev, bdaddr, bdaddr_type, rssi, flags, data, len); return; @@ -6320,6 +6352,17 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data, info->secondary_phy &= 0x1f; } + /* Check if PA Sync is pending and if the hci_conn SID has not + * been set update it. + */ + if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) { + struct hci_conn *conn; + + conn = hci_conn_hash_lookup_create_pa_sync(hdev); + if (conn && conn->sid == HCI_SID_INVALID) + conn->sid = info->sid; + } + if (legacy_evt_type != LE_ADV_INVALID) { process_adv_report(hdev, legacy_evt_type, &info->bdaddr, info->bdaddr_type, NULL, 0, @@ -6358,8 +6401,7 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, hci_dev_clear_flag(hdev, HCI_PA_SYNC); - conn = hci_conn_hash_lookup_sid(hdev, ev->sid, &ev->bdaddr, - ev->bdaddr_type); + conn = hci_conn_hash_lookup_create_pa_sync(hdev); if (!conn) { bt_dev_err(hdev, "Unable to find connection for dst %pMR sid 0x%2.2x", @@ -6372,7 +6414,8 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, conn->sync_handle = le16_to_cpu(ev->handle); conn->sid = HCI_SID_INVALID; - mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ISO_LINK, &flags); + mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, BIS_LINK, + &flags); if (!(mask & HCI_LM_ACCEPT)) { hci_le_pa_term_sync(hdev, ev->handle); goto unlock; @@ -6382,7 +6425,7 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, goto unlock; /* Add connection to indicate PA sync event */ - pa_sync = hci_conn_add_unset(hdev, ISO_LINK, BDADDR_ANY, + pa_sync = hci_conn_add_unset(hdev, BIS_LINK, BDADDR_ANY, HCI_ROLE_SLAVE); if (IS_ERR(pa_sync)) @@ -6398,9 +6441,6 @@ static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, } unlock: - /* Handle any other pending PA sync command */ - hci_pa_create_sync_pending(hdev); - hci_dev_unlock(hdev); } @@ -6416,7 +6456,7 @@ static void hci_le_per_adv_report_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); - mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, ISO_LINK, &flags); + mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, BIS_LINK, &flags); if (!(mask & HCI_LM_ACCEPT)) goto unlock; @@ -6700,7 +6740,7 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, goto unlock; } - if (conn->type != ISO_LINK) { + if (conn->type != CIS_LINK) { bt_dev_err(hdev, "Invalid connection link type handle 0x%4.4x", handle); @@ -6818,7 +6858,7 @@ static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data, if (!acl) goto unlock; - mask = hci_proto_connect_ind(hdev, &acl->dst, ISO_LINK, &flags); + mask = hci_proto_connect_ind(hdev, &acl->dst, CIS_LINK, &flags); if (!(mask & HCI_LM_ACCEPT)) { hci_le_reject_cis(hdev, ev->cis_handle); goto unlock; @@ -6826,8 +6866,8 @@ static void hci_le_cis_req_evt(struct hci_dev 
*hdev, void *data, cis = hci_conn_hash_lookup_handle(hdev, cis_handle); if (!cis) { - cis = hci_conn_add(hdev, ISO_LINK, &acl->dst, HCI_ROLE_SLAVE, - cis_handle); + cis = hci_conn_add(hdev, CIS_LINK, &acl->dst, + HCI_ROLE_SLAVE, cis_handle); if (IS_ERR(cis)) { hci_le_reject_cis(hdev, ev->cis_handle); goto unlock; @@ -6912,7 +6952,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); - if (!hci_le_ev_skb_pull(hdev, skb, HCI_EVT_LE_BIG_SYNC_ESTABILISHED, + if (!hci_le_ev_skb_pull(hdev, skb, HCI_EVT_LE_BIG_SYNC_ESTABLISHED, flex_array_size(ev, bis, ev->num_bis))) return; @@ -6942,7 +6982,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "ignore too large handle %u", handle); continue; } - bis = hci_conn_add(hdev, ISO_LINK, BDADDR_ANY, + bis = hci_conn_add(hdev, BIS_LINK, BDADDR_ANY, HCI_ROLE_SLAVE, handle); if (IS_ERR(bis)) continue; @@ -6983,9 +7023,6 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, } unlock: - /* Handle any other pending BIG sync command */ - hci_le_big_create_sync_pending(hdev); - hci_dev_unlock(hdev); } @@ -7001,7 +7038,7 @@ static void hci_le_big_info_adv_report_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); - mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, ISO_LINK, &flags); + mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, BIS_LINK, &flags); if (!(mask & HCI_LM_ACCEPT)) goto unlock; @@ -7107,8 +7144,8 @@ static const struct hci_le_ev { hci_le_create_big_complete_evt, sizeof(struct hci_evt_le_create_big_complete), HCI_MAX_EVENT_SIZE), - /* [0x1d = HCI_EV_LE_BIG_SYNC_ESTABILISHED] */ - HCI_LE_EV_VL(HCI_EVT_LE_BIG_SYNC_ESTABILISHED, + /* [0x1d = HCI_EV_LE_BIG_SYNC_ESTABLISHED] */ + HCI_LE_EV_VL(HCI_EVT_LE_BIG_SYNC_ESTABLISHED, hci_le_big_sync_established_evt, sizeof(struct hci_evt_le_big_sync_estabilished), HCI_MAX_EVENT_SIZE), @@ -7131,7 +7168,8 @@ static void hci_le_meta_evt(struct hci_dev *hdev, void *data, /* Only match event if command OGF is for LE */ if (hdev->req_skb && - hci_opcode_ogf(hci_skb_opcode(hdev->req_skb)) == 0x08 && + (hci_opcode_ogf(hci_skb_opcode(hdev->req_skb)) == 0x08 || + hci_skb_opcode(hdev->req_skb) == HCI_OP_NOP) && hci_skb_event(hdev->req_skb) == ev->subevent) { *opcode = hci_skb_opcode(hdev->req_skb); hci_req_cmd_complete(hdev, *opcode, 0x00, req_complete, @@ -7487,8 +7525,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) goto done; } + hci_dev_lock(hdev); kfree_skb(hdev->recv_event); hdev->recv_event = skb_clone(skb, GFP_KERNEL); + hci_dev_unlock(hdev); event = hdr->evt; if (!event) { diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 022b86797acd..428ee5c7de7e 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -234,7 +234,8 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && - hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) + hci_skb_pkt_type(skb) != HCI_ISODATA_PKT && + hci_skb_pkt_type(skb) != HCI_DRV_PKT) continue; } else { /* Don't send frame to other channel types */ @@ -391,6 +392,12 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb) else opcode = cpu_to_le16(HCI_MON_ISO_TX_PKT); break; + case HCI_DRV_PKT: + if (bt_cb(skb)->incoming) + opcode = cpu_to_le16(HCI_MON_DRV_RX_PKT); + else + opcode = cpu_to_le16(HCI_MON_DRV_TX_PKT); + break; case 
HCI_DIAG_PKT: opcode = cpu_to_le16(HCI_MON_VENDOR_DIAG); break; @@ -1860,7 +1867,8 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT && hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT && hci_skb_pkt_type(skb) != HCI_SCODATA_PKT && - hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) { + hci_skb_pkt_type(skb) != HCI_ISODATA_PKT && + hci_skb_pkt_type(skb) != HCI_DRV_PKT) { err = -EINVAL; goto drop; } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 7b2b04d6b856..62d1ff951ebe 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1066,7 +1066,7 @@ int hci_update_random_address_sync(struct hci_dev *hdev, bool require_privacy, /* If Controller supports LL Privacy use own address type is * 0x03 */ - if (use_ll_privacy(hdev)) + if (ll_privacy_capable(hdev)) *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED; else *own_addr_type = ADDR_LE_DEV_RANDOM; @@ -1786,30 +1786,6 @@ int hci_remove_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance, HCI_CMD_TIMEOUT, sk); } -static int remove_ext_adv_sync(struct hci_dev *hdev, void *data) -{ - struct adv_info *adv = data; - u8 instance = 0; - - if (adv) - instance = adv->instance; - - return hci_remove_ext_adv_instance_sync(hdev, instance, NULL); -} - -int hci_remove_ext_adv_instance(struct hci_dev *hdev, u8 instance) -{ - struct adv_info *adv = NULL; - - if (instance) { - adv = hci_find_adv_instance(hdev, instance); - if (!adv) - return -EINVAL; - } - - return hci_cmd_sync_queue(hdev, remove_ext_adv_sync, adv, NULL); -} - int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason) { struct hci_cp_le_term_big cp; @@ -1934,7 +1910,7 @@ int hci_schedule_adv_instance_sync(struct hci_dev *hdev, u8 instance, hdev->adv_instance_timeout = timeout; queue_delayed_work(hdev->req_workqueue, &hdev->adv_instance_expire, - msecs_to_jiffies(timeout * 1000)); + secs_to_jiffies(timeout)); } /* If we're just re-scheduling the same instance again then do not @@ -2162,7 +2138,7 @@ static int hci_le_set_scan_enable_sync(struct hci_dev *hdev, u8 val, static int hci_le_set_addr_resolution_enable_sync(struct hci_dev *hdev, u8 val) { - if (!use_ll_privacy(hdev)) + if (!ll_privacy_capable(hdev)) return 0; /* If controller is not/already resolving we are done. 
*/ @@ -2254,7 +2230,7 @@ static int hci_le_del_resolve_list_sync(struct hci_dev *hdev, struct hci_cp_le_del_from_resolv_list cp; struct bdaddr_list_with_irk *entry; - if (!use_ll_privacy(hdev)) + if (!ll_privacy_capable(hdev)) return 0; /* Check if the IRK has been programmed */ @@ -2319,7 +2295,7 @@ static int hci_le_add_resolve_list_sync(struct hci_dev *hdev, struct bdaddr_list_with_irk *entry; struct hci_conn_params *p; - if (!use_ll_privacy(hdev)) + if (!ll_privacy_capable(hdev)) return 0; /* Attempt to program local identity address, type and irk if params is @@ -2332,7 +2308,8 @@ static int hci_le_add_resolve_list_sync(struct hci_dev *hdev, hci_copy_identity_address(hdev, &cp.bdaddr, &cp.bdaddr_type); memcpy(cp.peer_irk, hdev->irk, 16); goto done; - } + } else if (!(params->flags & HCI_CONN_FLAG_ADDRESS_RESOLUTION)) + return 0; irk = hci_find_irk_by_addr(hdev, &params->addr, params->addr_type); if (!irk) @@ -2379,6 +2356,10 @@ static int hci_le_set_privacy_mode_sync(struct hci_dev *hdev, struct hci_cp_le_set_privacy_mode cp; struct smp_irk *irk; + if (!ll_privacy_capable(hdev) || + !(params->flags & HCI_CONN_FLAG_ADDRESS_RESOLUTION)) + return 0; + /* If device privacy mode has already been set there is nothing to do */ if (params->privacy_mode == HCI_DEVICE_PRIVACY) return 0; @@ -2428,11 +2409,6 @@ static int hci_le_add_accept_list_sync(struct hci_dev *hdev, if (*num_entries >= hdev->le_accept_list_size) return -ENOSPC; - /* Accept list can not be used with RPAs */ - if (!use_ll_privacy(hdev) && - hci_find_irk_by_addr(hdev, &params->addr, params->addr_type)) - return -EINVAL; - /* Attempt to program the device in the resolving list first to avoid * having to rollback in case it fails since the resolving list is * dynamic it can probably be smaller than the accept list. @@ -2567,7 +2543,7 @@ static int hci_pause_addr_resolution(struct hci_dev *hdev) { int err; - if (!use_ll_privacy(hdev)) + if (!ll_privacy_capable(hdev)) return 0; if (!hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) @@ -2671,12 +2647,12 @@ static int hci_le_clear_accept_list_sync(struct hci_dev *hdev) * * Update is done using the following sequence: * - * use_ll_privacy((Disable Advertising) -> Disable Resolving List) -> + * ll_privacy_capable((Disable Advertising) -> Disable Resolving List) -> * Remove Devices From Accept List -> - * (has IRK && use_ll_privacy(Remove Devices From Resolving List))-> + * (has IRK && ll_privacy_capable(Remove Devices From Resolving List))-> * Add Devices to Accept List -> - * (has IRK && use_ll_privacy(Remove Devices From Resolving List)) -> - * use_ll_privacy(Enable Resolving List -> (Enable Advertising)) -> + * (has IRK && ll_privacy_capable(Remove Devices From Resolving List)) -> + * ll_privacy_capable(Enable Resolving List -> (Enable Advertising)) -> * Enable Scanning * * In case of failure advertising shall be restored to its original state and @@ -2697,7 +2673,7 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) /* Pause advertising if resolving list can be used as controllers * cannot accept resolving list modifications while advertising.
*/ - if (use_ll_privacy(hdev)) { + if (ll_privacy_capable(hdev)) { err = hci_pause_advertising_sync(hdev); if (err) { bt_dev_err(hdev, "pause advertising failed: %d", err); @@ -2717,16 +2693,16 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) /* Force address filtering if PA Sync is in progress */ if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) { - struct hci_cp_le_pa_create_sync *sent; + struct hci_conn *conn; - sent = hci_sent_cmd_data(hdev, HCI_OP_LE_PA_CREATE_SYNC); - if (sent) { + conn = hci_conn_hash_lookup_create_pa_sync(hdev); + if (conn) { struct conn_params pa; memset(&pa, 0, sizeof(pa)); - bacpy(&pa.addr, &sent->addr); - pa.addr_type = sent->addr_type; + bacpy(&pa.addr, &conn->dst); + pa.addr_type = conn->dst_type; /* Clear first since there could be addresses left * behind. @@ -2842,7 +2818,7 @@ done: bt_dev_err(hdev, "Unable to enable LL privacy: %d", err); /* Resume advertising if it was paused */ - if (use_ll_privacy(hdev)) + if (ll_privacy_capable(hdev)) hci_resume_advertising_sync(hdev); /* Select filter policy to use accept list */ @@ -2884,7 +2860,7 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, if (sent) { struct hci_conn *conn; - conn = hci_conn_hash_lookup_ba(hdev, ISO_LINK, + conn = hci_conn_hash_lookup_ba(hdev, BIS_LINK, &sent->bdaddr); if (conn) { struct bt_iso_qos *qos = &conn->iso_qos; @@ -3100,7 +3076,7 @@ static int hci_passive_scan_sync(struct hci_dev *hdev) * If there are devices to scan: * * Disable Scanning -> Update Accept List -> - * use_ll_privacy((Disable Advertising) -> Disable Resolving List -> + * ll_privacy_capable((Disable Advertising) -> Disable Resolving List -> * Update Resolving List -> Enable Resolving List -> (Enable Advertising)) -> * Enable Scanning * @@ -3454,7 +3430,7 @@ int hci_update_name_sync(struct hci_dev *hdev) * * HCI_SSP_ENABLED(Enable SSP) * HCI_LE_ENABLED(Enable LE) - * HCI_LE_ENABLED(use_ll_privacy(Add local IRK to Resolving List) -> + * HCI_LE_ENABLED(ll_privacy_capable(Add local IRK to Resolving List) -> * Update adv data) * Enable Authentication * lmp_bredr_capable(Set Fast Connectable -> Set Scan Type -> Set Class -> @@ -3720,6 +3696,9 @@ static int hci_read_local_name_sync(struct hci_dev *hdev) /* Read Voice Setting */ static int hci_read_voice_setting_sync(struct hci_dev *hdev) { + if (!read_voice_setting_capable(hdev)) + return 0; + return __hci_cmd_sync_status(hdev, HCI_OP_READ_VOICE_SETTING, 0, NULL, HCI_CMD_TIMEOUT); } @@ -3790,6 +3769,28 @@ static int hci_write_ca_timeout_sync(struct hci_dev *hdev) sizeof(param), &param, HCI_CMD_TIMEOUT); } +/* Enable SCO flow control if supported */ +static int hci_write_sync_flowctl_sync(struct hci_dev *hdev) +{ + struct hci_cp_write_sync_flowctl cp; + int err; + + /* Check if the controller supports SCO and HCI_OP_WRITE_SYNC_FLOWCTL */ + if (!lmp_sco_capable(hdev) || !(hdev->commands[10] & BIT(4)) || + !test_bit(HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED, &hdev->quirks)) + return 0; + + memset(&cp, 0, sizeof(cp)); + cp.enable = 0x01; + + err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SYNC_FLOWCTL, + sizeof(cp), &cp, HCI_CMD_TIMEOUT); + if (!err) + hci_dev_set_flag(hdev, HCI_SCO_FLOWCTL); + + return err; +} + /* BR Controller init stage 2 command sequence */ static const struct hci_init_stage br_init2[] = { /* HCI_OP_READ_BUFFER_SIZE */ @@ -3808,6 +3809,8 @@ static const struct hci_init_stage br_init2[] = { HCI_INIT(hci_clear_event_filter_sync), /* HCI_OP_WRITE_CA_TIMEOUT */ HCI_INIT(hci_write_ca_timeout_sync), + /* HCI_OP_WRITE_SYNC_FLOWCTL */
HCI_INIT(hci_write_sync_flowctl_sync), {} }; @@ -4153,7 +4156,8 @@ static int hci_read_page_scan_type_sync(struct hci_dev *hdev) * support the Read Page Scan Type command. Check support for * this command in the bit mask of supported commands. */ - if (!(hdev->commands[13] & 0x01)) + if (!(hdev->commands[13] & 0x01) || + test_bit(HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE, &hdev->quirks)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_PAGE_SCAN_TYPE, @@ -4229,6 +4233,14 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev) if (use_enhanced_conn_complete(hdev)) events[1] |= 0x02; /* LE Enhanced Connection Complete */ + /* Mark Device Privacy if Privacy Mode is supported */ + if (privacy_mode_capable(hdev)) + hdev->conn_flags |= HCI_CONN_FLAG_DEVICE_PRIVACY; + + /* Mark Address Resolution if LL Privacy is supported */ + if (ll_privacy_capable(hdev)) + hdev->conn_flags |= HCI_CONN_FLAG_ADDRESS_RESOLUTION; + /* If the controller supports Extended Scanner Filter * Policies, enable the corresponding event. */ @@ -5385,7 +5397,7 @@ int hci_stop_discovery_sync(struct hci_dev *hdev) } /* Resume advertising if it was paused */ - if (use_ll_privacy(hdev)) + if (ll_privacy_capable(hdev)) hci_resume_advertising_sync(hdev); /* No further actions needed for LE-only discovery */ @@ -5465,7 +5477,7 @@ static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, if (conn->type == LE_LINK) return hci_le_connect_cancel_sync(hdev, conn, reason); - if (conn->type == ISO_LINK) { + if (conn->type == CIS_LINK) { /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E * page 1857: * @@ -5478,9 +5490,10 @@ static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, return hci_disconnect_sync(hdev, conn, reason); /* CIS with no Create CIS sent have nothing to cancel */ - if (bacmp(&conn->dst, BDADDR_ANY)) - return HCI_ERROR_LOCAL_HOST_TERM; + return HCI_ERROR_LOCAL_HOST_TERM; + } + if (conn->type == BIS_LINK) { /* There is no way to cancel a BIS without terminating the BIG * which is done later on connection cleanup. 
*/ @@ -5542,9 +5555,12 @@ static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, { struct hci_cp_reject_conn_req cp; - if (conn->type == ISO_LINK) + if (conn->type == CIS_LINK) return hci_le_reject_cis_sync(hdev, conn, reason); + if (conn->type == BIS_LINK) + return -EINVAL; + if (conn->type == SCO_LINK || conn->type == ESCO_LINK) return hci_reject_sco_sync(hdev, conn, reason); @@ -5897,7 +5913,7 @@ static int hci_active_scan_sync(struct hci_dev *hdev, uint16_t interval) failed: /* Resume advertising if it was paused */ - if (use_ll_privacy(hdev)) + if (ll_privacy_capable(hdev)) hci_resume_advertising_sync(hdev); /* Resume passive scanning */ @@ -6673,7 +6689,7 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, /* If Controller supports LL Privacy use own address type is * 0x03 */ - if (use_ll_privacy(hdev)) + if (ll_privacy_capable(hdev)) *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED; else *own_addr_type = ADDR_LE_DEV_RANDOM; @@ -6883,3 +6899,182 @@ int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn, return __hci_cmd_sync_status(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } + +static void create_pa_complete(struct hci_dev *hdev, void *data, int err) +{ + struct hci_conn *conn = data; + struct hci_conn *pa_sync; + + bt_dev_dbg(hdev, "err %d", err); + + if (err == -ECANCELED) + return; + + hci_dev_lock(hdev); + + hci_dev_clear_flag(hdev, HCI_PA_SYNC); + + if (!hci_conn_valid(hdev, conn)) + clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); + + if (!err) + goto unlock; + + /* Add connection to indicate PA sync error */ + pa_sync = hci_conn_add_unset(hdev, BIS_LINK, BDADDR_ANY, + HCI_ROLE_SLAVE); + + if (IS_ERR(pa_sync)) + goto unlock; + + set_bit(HCI_CONN_PA_SYNC_FAILED, &pa_sync->flags); + + /* Notify iso layer */ + hci_connect_cfm(pa_sync, bt_status(err)); + +unlock: + hci_dev_unlock(hdev); +} + +static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) +{ + struct hci_cp_le_pa_create_sync cp; + struct hci_conn *conn = data; + struct bt_iso_qos *qos = &conn->iso_qos; + int err; + + if (!hci_conn_valid(hdev, conn)) + return -ECANCELED; + + if (conn->sync_handle != HCI_SYNC_HANDLE_INVALID) + return -EINVAL; + + if (hci_dev_test_and_set_flag(hdev, HCI_PA_SYNC)) + return -EBUSY; + + /* Stop scanning if SID has not been set and active scanning is enabled + * so we use passive scanning which will be scanning using the allow + * list programmed to contain only the connection address. + */ + if (conn->sid == HCI_SID_INVALID && + hci_dev_test_flag(hdev, HCI_LE_SCAN)) { + hci_scan_disable_sync(hdev); + hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED); + hci_discovery_set_state(hdev, DISCOVERY_STOPPED); + } + + /* Mark HCI_CONN_CREATE_PA_SYNC so hci_update_passive_scan_sync can + * program the address in the allow list so PA advertisements can be + * received. + */ + set_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); + + hci_update_passive_scan_sync(hdev); + + /* If the SID has not been set, listen for HCI_EV_LE_EXT_ADV_REPORT to + * update it.
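+ *
+ * The wait is implemented just below via HCI_OP_NOP: nothing is put on
+ * the wire (hci_send_cmd_sync() skips sending for HCI_OP_NOP) and the
+ * call simply blocks until HCI_EV_LE_EXT_ADV_REPORT or conn_timeout,
+ * by which point hci_le_ext_adv_report_evt() has filled in conn->sid.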
+ */ + if (conn->sid == HCI_SID_INVALID) + __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL, + HCI_EV_LE_EXT_ADV_REPORT, + conn->conn_timeout, NULL); + + memset(&cp, 0, sizeof(cp)); + cp.options = qos->bcast.options; + cp.sid = conn->sid; + cp.addr_type = conn->dst_type; + bacpy(&cp.addr, &conn->dst); + cp.skip = cpu_to_le16(qos->bcast.skip); + cp.sync_timeout = cpu_to_le16(qos->bcast.sync_timeout); + cp.sync_cte_type = qos->bcast.sync_cte_type; + + /* The spec allows only one pending LE Periodic Advertising Create + * Sync command at a time so we forcefully wait for PA Sync Established + * event since cmd_work can only schedule one command at a time. + * + * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E + * page 2493: + * + * If the Host issues this command when another HCI_LE_Periodic_ + * Advertising_Create_Sync command is pending, the Controller shall + * return the error code Command Disallowed (0x0C). + */ + err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_PA_CREATE_SYNC, + sizeof(cp), &cp, + HCI_EV_LE_PA_SYNC_ESTABLISHED, + conn->conn_timeout, NULL); + if (err == -ETIMEDOUT) + __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC_CANCEL, + 0, NULL, HCI_CMD_TIMEOUT); + + return err; +} + +int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn) +{ + return hci_cmd_sync_queue_once(hdev, hci_le_pa_create_sync, conn, + create_pa_complete); +} + +static void create_big_complete(struct hci_dev *hdev, void *data, int err) +{ + struct hci_conn *conn = data; + + bt_dev_dbg(hdev, "err %d", err); + + if (err == -ECANCELED) + return; + + if (hci_conn_valid(hdev, conn)) + clear_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags); +} + +static int hci_le_big_create_sync(struct hci_dev *hdev, void *data) +{ + DEFINE_FLEX(struct hci_cp_le_big_create_sync, cp, bis, num_bis, 0x11); + struct hci_conn *conn = data; + struct bt_iso_qos *qos = &conn->iso_qos; + int err; + + if (!hci_conn_valid(hdev, conn)) + return -ECANCELED; + + set_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags); + + memset(cp, 0, sizeof(*cp)); + cp->handle = qos->bcast.big; + cp->sync_handle = cpu_to_le16(conn->sync_handle); + cp->encryption = qos->bcast.encryption; + memcpy(cp->bcode, qos->bcast.bcode, sizeof(cp->bcode)); + cp->mse = qos->bcast.mse; + cp->timeout = cpu_to_le16(qos->bcast.timeout); + cp->num_bis = conn->num_bis; + memcpy(cp->bis, conn->bis, conn->num_bis); + + /* The spec allows only one pending LE BIG Create Sync command at + * a time, so we forcefully wait for BIG Sync Established event since + * cmd_work can only schedule one command at a time. + * + * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E + * page 2586: + * + * If the Host sends this command when the Controller is in the + * process of synchronizing to any BIG, i.e. the HCI_LE_BIG_Sync_ + * Established event has not been generated, the Controller shall + * return the error code Command Disallowed (0x0C). 
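+ *
+ * Queueing through hci_cmd_sync_queue_once() below is what provides
+ * that serialization here (illustrative): two sockets racing to sync
+ * to different BIGs queue hci_le_big_create_sync() twice, and the
+ * second instance only runs after the first BIG Sync Established (or
+ * after the -ETIMEDOUT path has issued LE BIG Terminate Sync).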
+ */ + err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_BIG_CREATE_SYNC, + struct_size(cp, bis, cp->num_bis), cp, + HCI_EVT_LE_BIG_SYNC_ESTABLISHED, + conn->conn_timeout, NULL); + if (err == -ETIMEDOUT) + hci_le_big_terminate_sync(hdev, cp->handle); + + return err; +} + +int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn) +{ + return hci_cmd_sync_queue_once(hdev, hci_le_big_create_sync, conn, + create_big_complete); +} diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 4b54dbbf0729..041ce9adc378 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -90,9 +90,28 @@ static void bt_host_release(struct device *dev) module_put(THIS_MODULE); } +static ssize_t reset_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct hci_dev *hdev = to_hci_dev(dev); + + if (hdev->reset) + hdev->reset(hdev); + + return count; +} +static DEVICE_ATTR_WO(reset); + +static struct attribute *bt_host_attrs[] = { + &dev_attr_reset.attr, + NULL, +}; +ATTRIBUTE_GROUPS(bt_host); + static const struct device_type bt_host = { .name = "host", .release = bt_host_release, + .groups = bt_host_groups, }; void hci_init_sysfs(struct hci_dev *hdev) diff --git a/net/bluetooth/hidp/Kconfig b/net/bluetooth/hidp/Kconfig index 6746be07e222..e08aae35351a 100644 --- a/net/bluetooth/hidp/Kconfig +++ b/net/bluetooth/hidp/Kconfig @@ -1,8 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config BT_HIDP tristate "HIDP protocol support" - depends on BT_BREDR && INPUT && HID_SUPPORT - select HID + depends on BT_BREDR && HID help HIDP (Human Interface Device Protocol) is a transport layer for HID reports. HIDP is required for the Bluetooth Human diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 707f229f896a..fc5af8639b1e 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -433,7 +433,7 @@ static void hidp_set_timer(struct hidp_session *session) static void hidp_del_timer(struct hidp_session *session) { if (session->idle_to > 0) - del_timer_sync(&session->timer); + timer_delete_sync(&session->timer); } static void hidp_process_report(struct hidp_session *session, int type, diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 43d0ebe11100..6e2c752aaa8f 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -518,7 +518,8 @@ static struct bt_iso_qos *iso_sock_get_qos(struct sock *sk) return &iso_pi(sk)->qos; } -static int iso_send_frame(struct sock *sk, struct sk_buff *skb) +static int iso_send_frame(struct sock *sk, struct sk_buff *skb, + const struct sockcm_cookie *sockc) { struct iso_conn *conn = iso_pi(sk)->conn; struct bt_iso_qos *qos = iso_sock_get_qos(sk); @@ -538,10 +539,12 @@ static int iso_send_frame(struct sock *sk, struct sk_buff *skb) hdr->slen = cpu_to_le16(hci_iso_data_len_pack(len, HCI_ISO_STATUS_VALID)); - if (sk->sk_state == BT_CONNECTED) + if (sk->sk_state == BT_CONNECTED) { + hci_setup_tx_timestamp(skb, 1, sockc); hci_send_iso(conn->hcon, skb); - else + } else { len = -ENOTCONN; + } return len; } @@ -938,7 +941,7 @@ static int iso_sock_bind_bc(struct socket *sock, struct sockaddr *addr, iso_pi(sk)->dst_type = sa->iso_bc->bc_bdaddr_type; - if (sa->iso_bc->bc_sid > 0x0f) + if (sa->iso_bc->bc_sid > 0x0f && sa->iso_bc->bc_sid != HCI_SID_INVALID) return -EINVAL; iso_pi(sk)->bc_sid = sa->iso_bc->bc_sid; @@ -1281,6 +1284,42 @@ static int iso_sock_accept(struct socket *sock, struct socket *newsock, BT_DBG("new socket %p", ch); + /* A Broadcast Sink might require BIG sync to be 
terminated + * and re-established multiple times, while keeping the same + * PA sync handle active. To allow this, once all BIS + * connections have been accepted on a PA sync parent socket, + * "reset" socket state, to allow future BIG re-sync procedures. + */ + if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) { + /* Iterate through the list of bound BIS indices + * and clear each BIS as they are accepted by the + * user space, one by one. + */ + for (int i = 0; i < iso_pi(sk)->bc_num_bis; i++) { + if (iso_pi(sk)->bc_bis[i] > 0) { + iso_pi(sk)->bc_bis[i] = 0; + iso_pi(sk)->bc_num_bis--; + break; + } + } + + if (iso_pi(sk)->bc_num_bis == 0) { + /* Once the last BIS was accepted, reset parent + * socket parameters to mark that the listening + * process for BIS connections has been completed: + * + * 1. Reset the DEFER setup flag on the parent sk. + * 2. Clear the flag marking that the BIG create + * sync command is pending. + * 3. Transition socket state from BT_LISTEN to + * BT_CONNECTED. + */ + set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); + clear_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags); + sk->sk_state = BT_CONNECTED; + } + } + done: release_sock(sk); return err; @@ -1291,6 +1330,7 @@ static int iso_sock_getname(struct socket *sock, struct sockaddr *addr, { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; + int len = sizeof(struct sockaddr_iso); BT_DBG("sock %p, sk %p", sock, sk); @@ -1299,12 +1339,20 @@ static int iso_sock_getname(struct socket *sock, struct sockaddr *addr, if (peer) { bacpy(&sa->iso_bdaddr, &iso_pi(sk)->dst); sa->iso_bdaddr_type = iso_pi(sk)->dst_type; + + if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) { + sa->iso_bc->bc_sid = iso_pi(sk)->bc_sid; + sa->iso_bc->bc_num_bis = iso_pi(sk)->bc_num_bis; + memcpy(sa->iso_bc->bc_bis, iso_pi(sk)->bc_bis, + ISO_MAX_NUM_BIS); + len += sizeof(struct sockaddr_iso_bc); + } } else { bacpy(&sa->iso_bdaddr, &iso_pi(sk)->src); sa->iso_bdaddr_type = iso_pi(sk)->src_type; } - return sizeof(struct sockaddr_iso); + return len; } static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg, @@ -1312,6 +1360,7 @@ static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct sk_buff *skb, **frag; + struct sockcm_cookie sockc; size_t mtu; int err; @@ -1324,6 +1373,14 @@ static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; + hci_sockcm_init(&sockc, sk); + + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (err) + return err; + } + lock_sock(sk); if (sk->sk_state != BT_CONNECTED) { @@ -1369,7 +1426,7 @@ static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg, lock_sock(sk); if (sk->sk_state == BT_CONNECTED) - err = iso_send_frame(sk, skb); + err = iso_send_frame(sk, skb, &sockc); else err = -ENOTCONN; @@ -1414,14 +1471,13 @@ static void iso_conn_big_sync(struct sock *sk) lock_sock(sk); if (!test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { - err = hci_le_big_create_sync(hdev, iso_pi(sk)->conn->hcon, - &iso_pi(sk)->qos, - iso_pi(sk)->sync_handle, - iso_pi(sk)->bc_num_bis, - iso_pi(sk)->bc_bis); + err = hci_conn_big_create_sync(hdev, iso_pi(sk)->conn->hcon, + &iso_pi(sk)->qos, + iso_pi(sk)->sync_handle, + iso_pi(sk)->bc_num_bis, + iso_pi(sk)->bc_bis); if (err) - bt_dev_err(hdev, "hci_le_big_create_sync: %d", - err); + bt_dev_err(hdev, "hci_big_create_sync: %d", err); } release_sock(sk); @@ -1438,6 +1494,10 @@ static int iso_sock_recvmsg(struct socket *sock, 
struct msghdr *msg, BT_DBG("sk %p", sk); + if (unlikely(flags & MSG_ERRQUEUE)) + return sock_recv_errqueue(sk, msg, len, SOL_BLUETOOTH, + BT_SCM_ERROR); + if (test_and_clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { sock_hold(sk); lock_sock(sk); @@ -1870,7 +1930,7 @@ static void iso_conn_ready(struct iso_conn *conn) hcon); } else if (test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags)) { ev = hci_recv_event_data(hcon->hdev, - HCI_EVT_LE_BIG_SYNC_ESTABILISHED); + HCI_EVT_LE_BIG_SYNC_ESTABLISHED); /* Get reference to PA sync parent socket, if it exists */ parent = iso_get_sock(&hcon->src, &hcon->dst, @@ -1937,11 +1997,13 @@ static void iso_conn_ready(struct iso_conn *conn) hcon->dst_type = iso_pi(parent)->dst_type; } - if (ev3) { + if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) { iso_pi(sk)->qos = iso_pi(parent)->qos; hcon->iso_qos = iso_pi(sk)->qos; + iso_pi(sk)->bc_sid = iso_pi(parent)->bc_sid; iso_pi(sk)->bc_num_bis = iso_pi(parent)->bc_num_bis; - memcpy(iso_pi(sk)->bc_bis, iso_pi(parent)->bc_bis, ISO_MAX_NUM_BIS); + memcpy(iso_pi(sk)->bc_bis, iso_pi(parent)->bc_bis, + ISO_MAX_NUM_BIS); set_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags); } @@ -1978,6 +2040,9 @@ static bool iso_match_sid(struct sock *sk, void *data) { struct hci_ev_le_pa_sync_established *ev = data; + if (iso_pi(sk)->bc_sid == HCI_SID_INVALID) + return true; + return ev->sid == iso_pi(sk)->bc_sid; } @@ -2024,8 +2089,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) if (ev1) { sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sid, ev1); - if (sk && !ev1->status) + if (sk && !ev1->status) { iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle); + iso_pi(sk)->bc_sid = ev1->sid; + } goto done; } @@ -2061,12 +2128,11 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) && !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { - err = hci_le_big_create_sync(hdev, - hcon, - &iso_pi(sk)->qos, - iso_pi(sk)->sync_handle, - iso_pi(sk)->bc_num_bis, - iso_pi(sk)->bc_bis); + err = hci_conn_big_create_sync(hdev, hcon, + &iso_pi(sk)->qos, + iso_pi(sk)->sync_handle, + iso_pi(sk)->bc_num_bis, + iso_pi(sk)->bc_bis); if (err) { bt_dev_err(hdev, "hci_le_big_create_sync: %d", err); @@ -2151,14 +2217,9 @@ done: return HCI_LM_ACCEPT; } -static bool iso_match(struct hci_conn *hcon) -{ - return hcon->type == ISO_LINK || hcon->type == LE_LINK; -} - static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) { - if (hcon->type != ISO_LINK) { + if (hcon->type != CIS_LINK && hcon->type != BIS_LINK) { if (hcon->type != LE_LINK) return; @@ -2199,7 +2260,7 @@ static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason) { - if (hcon->type != ISO_LINK) + if (hcon->type != CIS_LINK && hcon->type != BIS_LINK) return; BT_DBG("hcon %p reason %d", hcon, reason); @@ -2337,7 +2398,6 @@ drop: static struct hci_cb iso_cb = { .name = "ISO", - .match = iso_match, .connect_cfm = iso_connect_cfm, .disconn_cfm = iso_disconn_cfm, }; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 27b4c4a2ba1f..a5bde5db58ef 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -119,7 +119,6 @@ static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn, { struct l2cap_chan *c; - mutex_lock(&conn->chan_lock); c = __l2cap_get_chan_by_scid(conn, cid); if (c) { /* Only lock if chan reference is not 0 */ @@ -127,7 +126,6 @@ static struct l2cap_chan 
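The recvmsg hunk above wires MSG_ERRQUEUE reads to sock_recv_errqueue() with cmsg level SOL_BLUETOOTH and type BT_SCM_ERROR, the usual transport for TX completion records. A hedged userspace sketch of draining that queue; whether BT_SCM_ERROR is visible to a given userspace build depends on its UAPI headers, so treat that constant as an assumption here:

#include <stdio.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>

static void drain_errqueue(int fd)
{
        char data[256], control[512];
        struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
        struct msghdr msg = {
                .msg_iov = &iov, .msg_iovlen = 1,
                .msg_control = control, .msg_controllen = sizeof(control),
        };
        struct cmsghdr *cm;

        while (recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT) >= 0) {
                for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
                        if (cm->cmsg_level == SOL_SOCKET &&
                            cm->cmsg_type == SCM_TIMESTAMPING) {
                                struct scm_timestamping *ts =
                                        (struct scm_timestamping *)CMSG_DATA(cm);

                                printf("tx ts %lld.%09ld\n",
                                       (long long)ts->ts[0].tv_sec,
                                       ts->ts[0].tv_nsec);
                        }
                        /* cmsg_level SOL_BLUETOOTH / cmsg_type BT_SCM_ERROR
                         * carries the matching sock_extended_err record.
                         */
                }
                msg.msg_controllen = sizeof(control);
        }
}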
*l2cap_get_chan_by_scid(struct l2cap_conn *conn, if (c) l2cap_chan_lock(c); } - mutex_unlock(&conn->chan_lock); return c; } @@ -140,7 +138,6 @@ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn, { struct l2cap_chan *c; - mutex_lock(&conn->chan_lock); c = __l2cap_get_chan_by_dcid(conn, cid); if (c) { /* Only lock if chan reference is not 0 */ @@ -148,7 +145,6 @@ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn, if (c) l2cap_chan_lock(c); } - mutex_unlock(&conn->chan_lock); return c; } @@ -286,7 +282,7 @@ static void __set_retrans_timer(struct l2cap_chan *chan) if (!delayed_work_pending(&chan->monitor_timer) && chan->retrans_timeout) { l2cap_set_timer(chan, &chan->retrans_timer, - msecs_to_jiffies(chan->retrans_timeout)); + secs_to_jiffies(chan->retrans_timeout)); } } @@ -295,7 +291,7 @@ static void __set_monitor_timer(struct l2cap_chan *chan) __clear_retrans_timer(chan); if (chan->monitor_timeout) { l2cap_set_timer(chan, &chan->monitor_timer, - msecs_to_jiffies(chan->monitor_timeout)); + secs_to_jiffies(chan->monitor_timeout)); } } @@ -418,7 +414,7 @@ static void l2cap_chan_timeout(struct work_struct *work) if (!conn) return; - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); /* __set_chan_timer() calls l2cap_chan_hold(chan) while scheduling * this work. No need to call l2cap_chan_hold(chan) here again. */ @@ -439,7 +435,7 @@ static void l2cap_chan_timeout(struct work_struct *work) l2cap_chan_unlock(chan); l2cap_chan_put(chan); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } struct l2cap_chan *l2cap_chan_create(void) @@ -636,14 +632,15 @@ void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) test_bit(FLAG_HOLD_HCI_CONN, &chan->flags)) hci_conn_hold(conn->hcon); - list_add(&chan->list, &conn->chan_l); + /* Append to the list since the order matters for ECRED */ + list_add_tail(&chan->list, &conn->chan_l); } void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) { - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); __l2cap_chan_add(conn, chan); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } void l2cap_chan_del(struct l2cap_chan *chan, int err) @@ -731,9 +728,9 @@ void l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func, if (!conn) return; - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); __l2cap_chan_list(conn, func, data); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } EXPORT_SYMBOL_GPL(l2cap_chan_list); @@ -745,7 +742,7 @@ static void l2cap_conn_update_id_addr(struct work_struct *work) struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan; - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); list_for_each_entry(chan, &conn->chan_l, list) { l2cap_chan_lock(chan); @@ -754,7 +751,7 @@ static void l2cap_conn_update_id_addr(struct work_struct *work) l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan) @@ -948,6 +945,16 @@ static u8 l2cap_get_ident(struct l2cap_conn *conn) return id; } +static void l2cap_send_acl(struct l2cap_conn *conn, struct sk_buff *skb, + u8 flags) +{ + /* Check if the hcon still valid before attempting to send */ + if (hci_conn_valid(conn->hcon->hdev, conn->hcon)) + hci_send_acl(conn->hchan, skb, flags); + else + kfree_skb(skb); +} + static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, void *data) { @@ -970,7 +977,7 @@ static void l2cap_send_cmd(struct l2cap_conn 
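The two timer hunks above, and several more later in this diff (CACHE_TIMEOUT and the discoverable timeouts in mgmt.c, SMP_TIMEOUT in smp.c), are part of a tree-wide move to secs_to_jiffies(). Where the stored value is a second count the two spellings yield the same jiffies value; a one-line sketch assuming the helpers from <linux/jiffies.h>:

#include <linux/jiffies.h>

/* For a duration N expressed in seconds:
 *
 *     secs_to_jiffies(N) == msecs_to_jiffies(N * 1000)
 *
 * so e.g. the SMP_TIMEOUT hunk later in this diff is value-preserving:
 */
#define DEMO_TIMEOUT_OLD        msecs_to_jiffies(30000)
#define DEMO_TIMEOUT_NEW        secs_to_jiffies(30)     /* same jiffies count */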
*conn, u8 ident, u8 code, u16 len, bt_cb(skb)->force_active = BT_POWER_FORCE_ACTIVE_ON; skb->priority = HCI_PRIO_MAX; - hci_send_acl(conn->hchan, skb, flags); + l2cap_send_acl(conn, skb, flags); } static void l2cap_do_send(struct l2cap_chan *chan, struct sk_buff *skb) @@ -1404,7 +1411,8 @@ static void l2cap_request_info(struct l2cap_conn *conn) sizeof(req), &req); } -static bool l2cap_check_enc_key_size(struct hci_conn *hcon) +static bool l2cap_check_enc_key_size(struct hci_conn *hcon, + struct l2cap_chan *chan) { /* The minimum encryption key size needs to be enforced by the * host stack before establishing any L2CAP connections. The @@ -1418,7 +1426,7 @@ static bool l2cap_check_enc_key_size(struct hci_conn *hcon) int min_key_size = hcon->hdev->min_enc_key_size; /* On FIPS security level, key size must be 16 bytes */ - if (hcon->sec_level == BT_SECURITY_FIPS) + if (chan->sec_level == BT_SECURITY_FIPS) min_key_size = 16; return (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags) || @@ -1446,7 +1454,7 @@ static void l2cap_do_start(struct l2cap_chan *chan) !__l2cap_no_conn_pending(chan)) return; - if (l2cap_check_enc_key_size(conn->hcon)) + if (l2cap_check_enc_key_size(conn->hcon, chan)) l2cap_start_connection(chan); else __set_chan_timer(chan, L2CAP_DISC_TIMEOUT); @@ -1497,8 +1505,6 @@ static void l2cap_conn_start(struct l2cap_conn *conn) BT_DBG("conn %p", conn); - mutex_lock(&conn->chan_lock); - list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { l2cap_chan_lock(chan); @@ -1523,7 +1529,7 @@ static void l2cap_conn_start(struct l2cap_conn *conn) continue; } - if (l2cap_check_enc_key_size(conn->hcon)) + if (l2cap_check_enc_key_size(conn->hcon, chan)) l2cap_start_connection(chan); else l2cap_chan_close(chan, ECONNREFUSED); @@ -1567,8 +1573,6 @@ static void l2cap_conn_start(struct l2cap_conn *conn) l2cap_chan_unlock(chan); } - - mutex_unlock(&conn->chan_lock); } static void l2cap_le_conn_ready(struct l2cap_conn *conn) @@ -1614,7 +1618,7 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) if (hcon->type == ACL_LINK) l2cap_request_info(conn); - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); list_for_each_entry(chan, &conn->chan_l, list) { @@ -1632,7 +1636,7 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); if (hcon->type == LE_LINK) l2cap_le_conn_ready(conn); @@ -1647,14 +1651,10 @@ static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err) BT_DBG("conn %p", conn); - mutex_lock(&conn->chan_lock); - list_for_each_entry(chan, &conn->chan_l, list) { if (test_bit(FLAG_FORCE_RELIABLE, &chan->flags)) l2cap_chan_set_err(chan, err); } - - mutex_unlock(&conn->chan_lock); } static void l2cap_info_timeout(struct work_struct *work) @@ -1665,7 +1665,9 @@ static void l2cap_info_timeout(struct work_struct *work) conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; conn->info_ident = 0; + mutex_lock(&conn->lock); l2cap_conn_start(conn); + mutex_unlock(&conn->lock); } /* @@ -1757,6 +1759,8 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); + mutex_lock(&conn->lock); + kfree_skb(conn->rx_skb); skb_queue_purge(&conn->pending_rx); @@ -1775,8 +1779,6 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) /* Force the connection to be immediately dropped */ hcon->disc_timeout = 0; - mutex_lock(&conn->chan_lock); - /* Kill channels */ list_for_each_entry_safe(chan, l, &conn->chan_l, list) { l2cap_chan_hold(chan); @@ -1790,15 +1792,14 
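l2cap_chan_send() now takes a struct sockcm_cookie, filled by sock_cmsg_send() in l2cap_sock_sendmsg() further down, and the new l2cap_tx_timestamp() helpers stamp either the tail of the run (SOCK_STREAM) or each SDU. Since sock_cmsg_send() is the generic SOL_SOCKET control-message parser, a sender can request TX record flags per message. A hedged userspace sketch; note that only TX record flags are accepted in the cmsg, and the reporting flags are assumed already enabled once via setsockopt(SO_TIMESTAMPING):

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/net_tstamp.h>

static ssize_t send_stamped(int fd, const void *buf, size_t len)
{
        uint32_t flags = SOF_TIMESTAMPING_TX_SOFTWARE;  /* record flag only */
        char control[CMSG_SPACE(sizeof(flags))];
        struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
        struct msghdr msg = {
                .msg_iov = &iov, .msg_iovlen = 1,
                .msg_control = control, .msg_controllen = sizeof(control),
        };
        struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

        cm->cmsg_level = SOL_SOCKET;
        cm->cmsg_type = SO_TIMESTAMPING;
        cm->cmsg_len = CMSG_LEN(sizeof(flags));
        memcpy(CMSG_DATA(cm), &flags, sizeof(flags));

        return sendmsg(fd, &msg, 0);
}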
@@ static void l2cap_conn_del(struct hci_conn *hcon, int err) l2cap_chan_put(chan); } - mutex_unlock(&conn->chan_lock); - - hci_chan_del(conn->hchan); - if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) cancel_delayed_work_sync(&conn->info_timer); - hcon->l2cap_data = NULL; + hci_chan_del(conn->hchan); conn->hchan = NULL; + + hcon->l2cap_data = NULL; + mutex_unlock(&conn->lock); l2cap_conn_put(conn); } @@ -2515,7 +2516,33 @@ static void l2cap_le_flowctl_send(struct l2cap_chan *chan) skb_queue_len(&chan->tx_q)); } -int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) +static void l2cap_tx_timestamp(struct sk_buff *skb, + const struct sockcm_cookie *sockc, + size_t len) +{ + struct sock *sk = skb ? skb->sk : NULL; + + if (sk && sk->sk_type == SOCK_STREAM) + hci_setup_tx_timestamp(skb, len, sockc); + else + hci_setup_tx_timestamp(skb, 1, sockc); +} + +static void l2cap_tx_timestamp_seg(struct sk_buff_head *queue, + const struct sockcm_cookie *sockc, + size_t len) +{ + struct sk_buff *skb = skb_peek(queue); + struct sock *sk = skb ? skb->sk : NULL; + + if (sk && sk->sk_type == SOCK_STREAM) + l2cap_tx_timestamp(skb_peek_tail(queue), sockc, len); + else + l2cap_tx_timestamp(skb, sockc, len); +} + +int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len, + const struct sockcm_cookie *sockc) { struct sk_buff *skb; int err; @@ -2530,6 +2557,8 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) if (IS_ERR(skb)) return PTR_ERR(skb); + l2cap_tx_timestamp(skb, sockc, len); + l2cap_do_send(chan, skb); return len; } @@ -2553,6 +2582,8 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) if (err) return err; + l2cap_tx_timestamp_seg(&seg_queue, sockc, len); + skb_queue_splice_tail_init(&seg_queue, &chan->tx_q); l2cap_le_flowctl_send(chan); @@ -2574,6 +2605,8 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) if (IS_ERR(skb)) return PTR_ERR(skb); + l2cap_tx_timestamp(skb, sockc, len); + l2cap_do_send(chan, skb); err = len; break; @@ -2597,10 +2630,13 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) if (err) break; - if (chan->mode == L2CAP_MODE_ERTM) + if (chan->mode == L2CAP_MODE_ERTM) { + /* TODO: ERTM mode timestamping */ l2cap_tx(chan, NULL, &seg_queue, L2CAP_EV_DATA_REQUEST); - else + } else { + l2cap_tx_timestamp_seg(&seg_queue, sockc, len); l2cap_streaming_send(chan, &seg_queue); + } err = len; @@ -2916,8 +2952,6 @@ static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb) BT_DBG("conn %p", conn); - mutex_lock(&conn->chan_lock); - list_for_each_entry(chan, &conn->chan_l, list) { if (chan->chan_type != L2CAP_CHAN_RAW) continue; @@ -2932,8 +2966,6 @@ static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb) if (chan->ops->recv(chan, nskb)) kfree_skb(nskb); } - - mutex_unlock(&conn->chan_lock); } /* ---- L2CAP signalling commands ---- */ @@ -3776,7 +3808,11 @@ static void l2cap_ecred_rsp_defer(struct l2cap_chan *chan, void *data) struct l2cap_ecred_conn_rsp *rsp_flex = container_of(&rsp->pdu.rsp, struct l2cap_ecred_conn_rsp, hdr); - if (test_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags)) + /* Check if channel for outgoing connection or if it wasn't deferred + * since in those cases it must be skipped. 
+ */ + if (test_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags) || + !test_and_clear_bit(FLAG_DEFER_SETUP, &chan->flags)) return; /* Reset ident so only one response is sent */ @@ -3952,12 +3988,12 @@ static void l2cap_connect(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, goto response; } - mutex_lock(&conn->chan_lock); l2cap_chan_lock(pchan); /* Check if the ACL is secure enough (if not SDP) */ if (psm != cpu_to_le16(L2CAP_PSM_SDP) && - !hci_conn_check_link_mode(conn->hcon)) { + (!hci_conn_check_link_mode(conn->hcon) || + !l2cap_check_enc_key_size(conn->hcon, pchan))) { conn->disc_reason = HCI_ERROR_AUTH_FAILURE; result = L2CAP_CR_SEC_BLOCK; goto response; @@ -4059,7 +4095,6 @@ response: } l2cap_chan_unlock(pchan); - mutex_unlock(&conn->chan_lock); l2cap_chan_put(pchan); } @@ -4098,27 +4133,19 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, BT_DBG("dcid 0x%4.4x scid 0x%4.4x result 0x%2.2x status 0x%2.2x", dcid, scid, result, status); - mutex_lock(&conn->chan_lock); - if (scid) { chan = __l2cap_get_chan_by_scid(conn, scid); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; } else { chan = __l2cap_get_chan_by_ident(conn, cmd->ident); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; } chan = l2cap_chan_hold_unless_zero(chan); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; err = 0; @@ -4156,9 +4183,6 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, l2cap_chan_unlock(chan); l2cap_chan_put(chan); -unlock: - mutex_unlock(&conn->chan_lock); - return err; } @@ -4446,11 +4470,7 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn, chan->ops->set_shutdown(chan); - l2cap_chan_unlock(chan); - mutex_lock(&conn->chan_lock); - l2cap_chan_lock(chan); l2cap_chan_del(chan, ECONNRESET); - mutex_unlock(&conn->chan_lock); chan->ops->close(chan); @@ -4487,11 +4507,7 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, return 0; } - l2cap_chan_unlock(chan); - mutex_lock(&conn->chan_lock); - l2cap_chan_lock(chan); l2cap_chan_del(chan, 0); - mutex_unlock(&conn->chan_lock); chan->ops->close(chan); @@ -4689,13 +4705,9 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, BT_DBG("dcid 0x%4.4x mtu %u mps %u credits %u result 0x%2.2x", dcid, mtu, mps, credits, result); - mutex_lock(&conn->chan_lock); - chan = __l2cap_get_chan_by_ident(conn, cmd->ident); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; err = 0; @@ -4743,9 +4755,6 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, l2cap_chan_unlock(chan); -unlock: - mutex_unlock(&conn->chan_lock); - return err; } @@ -4857,12 +4866,12 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, goto response; } - mutex_lock(&conn->chan_lock); l2cap_chan_lock(pchan); if (!smp_sufficient_security(conn->hcon, pchan->sec_level, SMP_ALLOW_STK)) { - result = L2CAP_CR_LE_AUTHENTICATION; + result = pchan->sec_level == BT_SECURITY_MEDIUM ? 
+ L2CAP_CR_LE_ENCRYPTION : L2CAP_CR_LE_AUTHENTICATION; chan = NULL; goto response_unlock; } @@ -4923,7 +4932,6 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, response_unlock: l2cap_chan_unlock(pchan); - mutex_unlock(&conn->chan_lock); l2cap_chan_put(pchan); if (result == L2CAP_CR_PEND) @@ -5057,7 +5065,6 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, goto response; } - mutex_lock(&conn->chan_lock); l2cap_chan_lock(pchan); if (!smp_sufficient_security(conn->hcon, pchan->sec_level, @@ -5132,7 +5139,6 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, unlock: l2cap_chan_unlock(pchan); - mutex_unlock(&conn->chan_lock); l2cap_chan_put(pchan); response: @@ -5169,8 +5175,6 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, BT_DBG("mtu %u mps %u credits %u result 0x%4.4x", mtu, mps, credits, result); - mutex_lock(&conn->chan_lock); - cmd_len -= sizeof(*rsp); list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { @@ -5256,8 +5260,6 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); - return err; } @@ -5370,8 +5372,6 @@ static inline int l2cap_le_command_rej(struct l2cap_conn *conn, if (cmd_len < sizeof(*rej)) return -EPROTO; - mutex_lock(&conn->chan_lock); - chan = __l2cap_get_chan_by_ident(conn, cmd->ident); if (!chan) goto done; @@ -5386,7 +5386,6 @@ static inline int l2cap_le_command_rej(struct l2cap_conn *conn, l2cap_chan_put(chan); done: - mutex_unlock(&conn->chan_lock); return 0; } @@ -6841,8 +6840,12 @@ static void process_pending_rx(struct work_struct *work) BT_DBG(""); + mutex_lock(&conn->lock); + while ((skb = skb_dequeue(&conn->pending_rx))) l2cap_recv_frame(conn, skb); + + mutex_unlock(&conn->lock); } static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) @@ -6881,7 +6884,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) conn->local_fixed_chan |= L2CAP_FC_SMP_BREDR; mutex_init(&conn->ident_lock); - mutex_init(&conn->chan_lock); + mutex_init(&conn->lock); INIT_LIST_HEAD(&conn->chan_l); INIT_LIST_HEAD(&conn->users); @@ -7072,7 +7075,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, } } - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); l2cap_chan_lock(chan); if (cid && __l2cap_get_chan_by_dcid(conn, cid)) { @@ -7113,7 +7116,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, chan_unlock: l2cap_chan_unlock(chan); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); done: hci_dev_unlock(hdev); hci_dev_put(hdev); @@ -7217,11 +7220,6 @@ static struct l2cap_chan *l2cap_global_fixed_chan(struct l2cap_chan *c, return NULL; } -static bool l2cap_match(struct hci_conn *hcon) -{ - return hcon->type == ACL_LINK || hcon->type == LE_LINK; -} - static void l2cap_connect_cfm(struct hci_conn *hcon, u8 status) { struct hci_dev *hdev = hcon->hdev; @@ -7229,6 +7227,9 @@ static void l2cap_connect_cfm(struct hci_conn *hcon, u8 status) struct l2cap_chan *pchan; u8 dst_type; + if (hcon->type != ACL_LINK && hcon->type != LE_LINK) + return; + BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); if (status) { @@ -7293,6 +7294,9 @@ int l2cap_disconn_ind(struct hci_conn *hcon) static void l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason) { + if (hcon->type != ACL_LINK && hcon->type != LE_LINK) + return; + BT_DBG("hcon %p reason %d", hcon, reason); l2cap_conn_del(hcon, bt_to_errno(reason)); @@ -7325,7 +7329,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 
status, u8 encrypt) BT_DBG("conn %p status 0x%2.2x encrypt %u", conn, status, encrypt); - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); list_for_each_entry(chan, &conn->chan_l, list) { l2cap_chan_lock(chan); @@ -7350,7 +7354,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) } if (chan->state == BT_CONNECT) { - if (!status && l2cap_check_enc_key_size(hcon)) + if (!status && l2cap_check_enc_key_size(hcon, chan)) l2cap_start_connection(chan); else __set_chan_timer(chan, L2CAP_DISC_TIMEOUT); @@ -7360,7 +7364,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) struct l2cap_conn_rsp rsp; __u16 res, stat; - if (!status && l2cap_check_enc_key_size(hcon)) { + if (!status && l2cap_check_enc_key_size(hcon, chan)) { if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) { res = L2CAP_CR_PEND; stat = L2CAP_CS_AUTHOR_PEND; @@ -7399,7 +7403,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } /* Append fragment into frame respecting the maximum len of rx_skb */ @@ -7413,6 +7417,9 @@ static int l2cap_recv_frag(struct l2cap_conn *conn, struct sk_buff *skb, return -ENOMEM; /* Init rx_len */ conn->rx_len = len; + + skb_set_delivery_time(conn->rx_skb, skb->tstamp, + skb->tstamp_type); } /* Copy as much as the rx_skb can hold */ @@ -7466,19 +7473,45 @@ static void l2cap_recv_reset(struct l2cap_conn *conn) conn->rx_len = 0; } +struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *c) +{ + if (!c) + return NULL; + + BT_DBG("conn %p orig refcnt %u", c, kref_read(&c->ref)); + + if (!kref_get_unless_zero(&c->ref)) + return NULL; + + return c; +} + void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) { - struct l2cap_conn *conn = hcon->l2cap_data; + struct l2cap_conn *conn; int len; + /* Lock hdev to access l2cap_data to avoid race with l2cap_conn_del */ + hci_dev_lock(hcon->hdev); + + conn = hcon->l2cap_data; + if (!conn) conn = l2cap_conn_add(hcon); - if (!conn) - goto drop; + conn = l2cap_conn_hold_unless_zero(conn); + + hci_dev_unlock(hcon->hdev); + + if (!conn) { + kfree_skb(skb); + return; + } BT_DBG("conn %p len %u flags 0x%x", conn, skb->len, flags); + mutex_lock(&conn->lock); + switch (flags) { case ACL_START: case ACL_START_NO_FLUSH: @@ -7503,7 +7536,7 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) if (len == skb->len) { /* Complete frame received */ l2cap_recv_frame(conn, skb); - return; + goto unlock; } BT_DBG("Start: total len %d, frag len %u", len, skb->len); @@ -7511,8 +7544,24 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) if (skb->len > len) { BT_ERR("Frame is too long (len %u, expected len %d)", skb->len, len); + /* PTS test cases L2CAP/COS/CED/BI-14-C and BI-15-C + * (Multiple Signaling Command in one PDU, Data + * Truncated, BR/EDR) send a C-frame to the IUT with + * PDU Length set to 8 and Channel ID set to the + * correct signaling channel for the logical link. + * The Information payload contains one L2CAP_ECHO_REQ + * packet with Data Length set to 0 with 0 octets of + * echo data and one invalid command packet due to + * data truncated in PDU but present in HCI packet. + * + * Shorter the socket buffer to the PDU length to + * allow to process valid commands from the PDU before + * setting the socket unreliable. 
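l2cap_recv_acldata() above now pins the connection with the new l2cap_conn_hold_unless_zero() while holding the hdev lock, the same lock l2cap_conn_del() runs under; that closes the race where the RX path could pick up a conn whose last reference was already being dropped. The general shape of the pattern, as a self-contained sketch (all names hypothetical):

#include <linux/kref.h>
#include <linux/mutex.h>

struct demo_obj {
        struct kref ref;
};

struct demo_parent {
        struct mutex lock;      /* also taken by the destructor */
        struct demo_obj *obj;
};

/* Return a pinned object or NULL. The destructor clears parent->obj
 * under the same mutex, so a reader either sees NULL or wins the
 * kref_get_unless_zero() and may safely use the object.
 */
static struct demo_obj *demo_lookup_hold(struct demo_parent *p)
{
        struct demo_obj *o;

        mutex_lock(&p->lock);
        o = p->obj;
        if (o && !kref_get_unless_zero(&o->ref))
                o = NULL;       /* dying: treat as already gone */
        mutex_unlock(&p->lock);

        return o;       /* caller kref_put()s when finished */
}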
+ */ + skb->len = len; + l2cap_recv_frame(conn, skb); l2cap_conn_unreliable(conn, ECOMM); - goto drop; + goto unlock; } /* Append fragment into frame (with header) */ @@ -7567,11 +7616,13 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) drop: kfree_skb(skb); +unlock: + mutex_unlock(&conn->lock); + l2cap_conn_put(conn); } static struct hci_cb l2cap_cb = { .name = "L2CAP", - .match = l2cap_match, .connect_cfm = l2cap_connect_cfm, .disconn_cfm = l2cap_disconn_cfm, .security_cfm = l2cap_security_cfm, diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 3d2553dcdb1b..5aa55fa69594 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -710,12 +710,12 @@ static bool l2cap_valid_mtu(struct l2cap_chan *chan, u16 mtu) { switch (chan->scid) { case L2CAP_CID_ATT: - if (mtu < L2CAP_LE_MIN_MTU) + if (mtu && mtu < L2CAP_LE_MIN_MTU) return false; break; default: - if (mtu < L2CAP_DEFAULT_MIN_MTU) + if (mtu && mtu < L2CAP_DEFAULT_MIN_MTU) return false; } @@ -1106,6 +1106,7 @@ static int l2cap_sock_sendmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; + struct sockcm_cookie sockc; int err; BT_DBG("sock %p, sk %p", sock, sk); @@ -1120,6 +1121,14 @@ static int l2cap_sock_sendmsg(struct socket *sock, struct msghdr *msg, if (sk->sk_state != BT_CONNECTED) return -ENOTCONN; + hci_sockcm_init(&sockc, sk); + + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (err) + return err; + } + lock_sock(sk); err = bt_sock_wait_ready(sk, msg->msg_flags); release_sock(sk); @@ -1127,7 +1136,7 @@ static int l2cap_sock_sendmsg(struct socket *sock, struct msghdr *msg, return err; l2cap_chan_lock(chan); - err = l2cap_chan_send(chan, msg, len); + err = l2cap_chan_send(chan, msg, len, &sockc); l2cap_chan_unlock(chan); return err; @@ -1168,6 +1177,10 @@ static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg, struct l2cap_pinfo *pi = l2cap_pi(sk); int err; + if (unlikely(flags & MSG_ERRQUEUE)) + return sock_recv_errqueue(sk, msg, len, SOL_BLUETOOTH, + BT_SCM_ERROR); + lock_sock(sk); if (sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP, @@ -1326,9 +1339,10 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) /* prevent sk structure from being freed whilst unlocked */ sock_hold(sk); - chan = l2cap_pi(sk)->chan; /* prevent chan structure from being freed whilst unlocked */ - l2cap_chan_hold(chan); + chan = l2cap_chan_hold_unless_zero(l2cap_pi(sk)->chan); + if (!chan) + goto shutdown_already; BT_DBG("chan %p state %s", chan, state_to_string(chan->state)); @@ -1358,22 +1372,20 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) release_sock(sk); l2cap_chan_lock(chan); - conn = chan->conn; - if (conn) - /* prevent conn structure from being freed */ - l2cap_conn_get(conn); + /* prevent conn structure from being freed */ + conn = l2cap_conn_hold_unless_zero(chan->conn); l2cap_chan_unlock(chan); if (conn) /* mutex lock must be taken before l2cap_chan_lock() */ - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); l2cap_chan_lock(chan); l2cap_chan_close(chan, 0); l2cap_chan_unlock(chan); if (conn) { - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); l2cap_conn_put(conn); } @@ -1888,7 +1900,8 @@ static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, chan = l2cap_chan_create(); if (!chan) { sk_free(sk); - sock->sk = NULL; + if (sock) + sock->sk = NULL; return NULL; } diff --git 
a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index de47ad999d7b..14a9462fced5 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -210,7 +210,7 @@ static const u16 mgmt_untrusted_events[] = { MGMT_EV_EXP_FEATURE_CHANGED, }; -#define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) +#define CACHE_TIMEOUT secs_to_jiffies(2) #define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00" @@ -851,6 +851,9 @@ static u32 get_supported_settings(struct hci_dev *hdev) if (cis_peripheral_capable(hdev)) settings |= MGMT_SETTING_CIS_PERIPHERAL; + if (ll_privacy_capable(hdev)) + settings |= MGMT_SETTING_LL_PRIVACY; + settings |= MGMT_SETTING_PHY_CONFIGURATION; return settings; @@ -933,6 +936,9 @@ static u32 get_current_settings(struct hci_dev *hdev) if (sync_recv_capable(hdev)) settings |= MGMT_SETTING_ISO_SYNC_RECEIVER; + if (ll_privacy_capable(hdev)) + settings |= MGMT_SETTING_LL_PRIVACY; + return settings; } @@ -1533,7 +1539,7 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE) && hdev->discov_timeout > 0) { - int to = msecs_to_jiffies(hdev->discov_timeout * 1000); + int to = secs_to_jiffies(hdev->discov_timeout); queue_delayed_work(hdev->req_workqueue, &hdev->discov_off, to); } @@ -1641,7 +1647,7 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, hdev->discov_timeout = timeout; if (cp->val && hdev->discov_timeout > 0) { - int to = msecs_to_jiffies(hdev->discov_timeout * 1000); + int to = secs_to_jiffies(hdev->discov_timeout); queue_delayed_work(hdev->req_workqueue, &hdev->discov_off, to); } @@ -2534,7 +2540,7 @@ static int send_hci_cmd_sync(struct hci_dev *hdev, void *data) skb = __hci_cmd_sync_ev(hdev, le16_to_cpu(cp->opcode), le16_to_cpu(cp->params_len), cp->params, cp->event, cp->timeout ? 
- msecs_to_jiffies(cp->timeout * 1000) : + secs_to_jiffies(cp->timeout) : HCI_CMD_TIMEOUT); if (IS_ERR(skb)) { mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, @@ -2560,7 +2566,8 @@ static int mgmt_hci_cmd_sync(struct sock *sk, struct hci_dev *hdev, struct mgmt_pending_cmd *cmd; int err; - if (len < sizeof(*cp)) + if (len != (offsetof(struct mgmt_cp_hci_cmd_sync, params) + + le16_to_cpu(cp->params_len))) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, MGMT_STATUS_INVALID_PARAMS); @@ -3215,7 +3222,8 @@ failed: static u8 link_to_bdaddr(u8 link_type, u8 addr_type) { switch (link_type) { - case ISO_LINK: + case CIS_LINK: + case BIS_LINK: case LE_LINK: switch (addr_type) { case ADDR_LE_DEV_PUBLIC: @@ -4417,12 +4425,6 @@ static const u8 le_simultaneous_roles_uuid[16] = { 0x96, 0x46, 0xc0, 0x42, 0xb5, 0x10, 0x1b, 0x67, }; -/* 15c0a148-c273-11ea-b3de-0242ac130004 */ -static const u8 rpa_resolution_uuid[16] = { - 0x04, 0x00, 0x13, 0xac, 0x42, 0x02, 0xde, 0xb3, - 0xea, 0x11, 0x73, 0xc2, 0x48, 0xa1, 0xc0, 0x15, -}; - /* 6fbaf188-05e0-496a-9885-d6ddfdb4e03e */ static const u8 iso_socket_uuid[16] = { 0x3e, 0xe0, 0xb4, 0xfd, 0xdd, 0xd6, 0x85, 0x98, @@ -4473,17 +4475,6 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, idx++; } - if (hdev && ll_privacy_capable(hdev)) { - if (hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) - flags = BIT(0) | BIT(1); - else - flags = BIT(1); - - memcpy(rp->features[idx].uuid, rpa_resolution_uuid, 16); - rp->features[idx].flags = cpu_to_le32(flags); - idx++; - } - if (hdev && (aosp_has_quality_report(hdev) || hdev->set_quality_report)) { if (hci_dev_test_flag(hdev, HCI_QUALITY_REPORT)) @@ -4540,27 +4531,6 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, return status; } -static int exp_ll_privacy_feature_changed(bool enabled, struct hci_dev *hdev, - struct sock *skip) -{ - struct mgmt_ev_exp_feature_changed ev; - - memset(&ev, 0, sizeof(ev)); - memcpy(ev.uuid, rpa_resolution_uuid, 16); - ev.flags = cpu_to_le32((enabled ? BIT(0) : 0) | BIT(1)); - - // Do we need to be atomic with the conn_flags? - if (enabled && privacy_mode_capable(hdev)) - hdev->conn_flags |= HCI_CONN_FLAG_DEVICE_PRIVACY; - else - hdev->conn_flags &= ~HCI_CONN_FLAG_DEVICE_PRIVACY; - - return mgmt_limited_event(MGMT_EV_EXP_FEATURE_CHANGED, hdev, - &ev, sizeof(ev), - HCI_MGMT_EXP_FEATURE_EVENTS, skip); - -} - static int exp_feature_changed(struct hci_dev *hdev, const u8 *uuid, bool enabled, struct sock *skip) { @@ -4601,16 +4571,6 @@ static int set_zero_key_func(struct sock *sk, struct hci_dev *hdev, } #endif - if (hdev && use_ll_privacy(hdev) && !hdev_is_powered(hdev)) { - bool changed; - - changed = hci_dev_test_and_clear_flag(hdev, - HCI_ENABLE_LL_PRIVACY); - if (changed) - exp_feature_changed(hdev, rpa_resolution_uuid, false, - sk); - } - hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); return mgmt_cmd_complete(sk, hdev ? 
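The mgmt_hci_cmd_sync() hunk above tightens validation from "at least a header" to "exactly header plus declared parameters", rejecting both truncated parameter blocks and trailing garbage with one comparison. The same shape for a hypothetical variable-length command (struct and helper invented for illustration):

#include <linux/kernel.h>
#include <linux/types.h>

struct demo_cmd {
        __le16 opcode;
        __le16 params_len;
        __u8   params[];        /* params_len bytes follow */
};

static bool demo_cmd_len_ok(const struct demo_cmd *cp, size_t len)
{
        /* must be able to read params_len before trusting it */
        if (len < offsetof(struct demo_cmd, params))
                return false;

        return len == offsetof(struct demo_cmd, params) +
                      le16_to_cpu(cp->params_len);
}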
hdev->id : MGMT_INDEX_NONE, @@ -4716,71 +4676,6 @@ static int set_mgmt_mesh_func(struct sock *sk, struct hci_dev *hdev, return err; } -static int set_rpa_resolution_func(struct sock *sk, struct hci_dev *hdev, - struct mgmt_cp_set_exp_feature *cp, - u16 data_len) -{ - struct mgmt_rp_set_exp_feature rp; - bool val, changed; - int err; - u32 flags; - - /* Command requires to use the controller index */ - if (!hdev) - return mgmt_cmd_status(sk, MGMT_INDEX_NONE, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_INVALID_INDEX); - - /* Changes can only be made when controller is powered down */ - if (hdev_is_powered(hdev)) - return mgmt_cmd_status(sk, hdev->id, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_REJECTED); - - /* Parameters are limited to a single octet */ - if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) - return mgmt_cmd_status(sk, hdev->id, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_INVALID_PARAMS); - - /* Only boolean on/off is supported */ - if (cp->param[0] != 0x00 && cp->param[0] != 0x01) - return mgmt_cmd_status(sk, hdev->id, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_INVALID_PARAMS); - - val = !!cp->param[0]; - - if (val) { - changed = !hci_dev_test_and_set_flag(hdev, - HCI_ENABLE_LL_PRIVACY); - hci_dev_clear_flag(hdev, HCI_ADVERTISING); - - /* Enable LL privacy + supported settings changed */ - flags = BIT(0) | BIT(1); - } else { - changed = hci_dev_test_and_clear_flag(hdev, - HCI_ENABLE_LL_PRIVACY); - - /* Disable LL privacy + supported settings changed */ - flags = BIT(1); - } - - memcpy(rp.uuid, rpa_resolution_uuid, 16); - rp.flags = cpu_to_le32(flags); - - hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); - - err = mgmt_cmd_complete(sk, hdev->id, - MGMT_OP_SET_EXP_FEATURE, 0, - &rp, sizeof(rp)); - - if (changed) - exp_ll_privacy_feature_changed(val, hdev, sk); - - return err; -} - static int set_quality_report_func(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_set_exp_feature *cp, u16 data_len) @@ -5032,7 +4927,6 @@ static const struct mgmt_exp_feature { EXP_FEAT(debug_uuid, set_debug_func), #endif EXP_FEAT(mgmt_mesh_uuid, set_mgmt_mesh_func), - EXP_FEAT(rpa_resolution_uuid, set_rpa_resolution_func), EXP_FEAT(quality_report_uuid, set_quality_report_func), EXP_FEAT(offload_codecs_uuid, set_offload_codec_func), EXP_FEAT(le_simultaneous_roles_uuid, set_le_simultaneous_roles_func), @@ -5062,22 +4956,6 @@ static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, MGMT_STATUS_NOT_SUPPORTED); } -static u32 get_params_flags(struct hci_dev *hdev, - struct hci_conn_params *params) -{ - u32 flags = hdev->conn_flags; - - /* Devices using RPAs can only be programmed in the acceptlist if - * LL Privacy has been enable otherwise they cannot mark - * HCI_CONN_FLAG_REMOTE_WAKEUP. 
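With the rpa_resolution experimental UUID removed here, LL privacy is reported through the standard settings masks added in the get_supported_settings()/get_current_settings() hunks earlier in this file. A hedged userspace check, assuming MGMT_SETTING_LL_PRIVACY is exported by the mgmt UAPI header and that the two settings words were already read (and LE-converted) from MGMT Read Controller Info:

#include <stdint.h>
#include <stdio.h>

static void report_ll_privacy(uint32_t supported, uint32_t current)
{
        if (!(supported & MGMT_SETTING_LL_PRIVACY)) {
                puts("LL privacy: not supported");
                return;
        }

        puts((current & MGMT_SETTING_LL_PRIVACY) ?
             "LL privacy: enabled" : "LL privacy: supported, disabled");
}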
- */ - if ((flags & HCI_CONN_FLAG_REMOTE_WAKEUP) && !use_ll_privacy(hdev) && - hci_find_irk_by_addr(hdev, ¶ms->addr, params->addr_type)) - flags &= ~HCI_CONN_FLAG_REMOTE_WAKEUP; - - return flags; -} - static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { @@ -5112,7 +4990,6 @@ static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, if (!params) goto done; - supported_flags = get_params_flags(hdev, params); current_flags = params->flags; } @@ -5192,7 +5069,7 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, goto unlock; } - supported_flags = get_params_flags(hdev, params); + supported_flags = hdev->conn_flags; if ((supported_flags | current_flags) != supported_flags) { bt_dev_warn(hdev, "Bad flag given (0x%x) vs supported (0x%0x)", @@ -5519,10 +5396,16 @@ static void mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, { struct mgmt_rp_remove_adv_monitor rp; struct mgmt_pending_cmd *cmd = data; - struct mgmt_cp_remove_adv_monitor *cp = cmd->param; + struct mgmt_cp_remove_adv_monitor *cp; + + if (status == -ECANCELED || + cmd != pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev)) + return; hci_dev_lock(hdev); + cp = cmd->param; + rp.monitor_handle = cp->monitor_handle; if (!status) @@ -5540,6 +5423,10 @@ static void mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, static int mgmt_remove_adv_monitor_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; + + if (cmd != pending_find(MGMT_OP_REMOVE_ADV_MONITOR, hdev)) + return -ECANCELED; + struct mgmt_cp_remove_adv_monitor *cp = cmd->param; u16 handle = __le16_to_cpu(cp->monitor_handle); @@ -5864,29 +5751,6 @@ done: return err; } -void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status) -{ - struct mgmt_pending_cmd *cmd; - - bt_dev_dbg(hdev, "status %u", status); - - hci_dev_lock(hdev); - - cmd = pending_find(MGMT_OP_START_DISCOVERY, hdev); - if (!cmd) - cmd = pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev); - - if (!cmd) - cmd = pending_find(MGMT_OP_START_LIMITED_DISCOVERY, hdev); - - if (cmd) { - cmd->cmd_complete(cmd, mgmt_status(status)); - mgmt_pending_remove(cmd); - } - - hci_dev_unlock(hdev); -} - static bool discovery_type_is_valid(struct hci_dev *hdev, uint8_t type, uint8_t *mgmt_status) { @@ -6139,23 +6003,6 @@ failed: return err; } -void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status) -{ - struct mgmt_pending_cmd *cmd; - - bt_dev_dbg(hdev, "status %u", status); - - hci_dev_lock(hdev); - - cmd = pending_find(MGMT_OP_STOP_DISCOVERY, hdev); - if (cmd) { - cmd->cmd_complete(cmd, mgmt_status(status)); - mgmt_pending_remove(cmd); - } - - hci_dev_unlock(hdev); -} - static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; @@ -7661,11 +7508,16 @@ static void add_device_complete(struct hci_dev *hdev, void *data, int err) struct mgmt_cp_add_device *cp = cmd->param; if (!err) { + struct hci_conn_params *params; + + params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, + le_addr_type(cp->addr.type)); + device_added(cmd->sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action); device_flags_changed(NULL, hdev, &cp->addr.bdaddr, cp->addr.type, hdev->conn_flags, - PTR_UINT(cmd->user_data)); + params ? 
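Both mgmt_remove_adv_monitor_complete() and mgmt_remove_adv_monitor_sync() above now re-look-up the pending command and bail out when it no longer matches (or the request was canceled), instead of dereferencing a possibly stale pointer. The defensive shape, sketched with the surrounding file's types and a hypothetical lookup helper standing in for pending_find():

/* Sketch: 'data' was captured when the command was queued and may be
 * stale by completion time; only the entry still registered for this
 * opcode is trusted.
 */
static void demo_complete(struct hci_dev *hdev, void *data, int status)
{
        struct mgmt_pending_cmd *cmd = data;

        if (status == -ECANCELED || cmd != demo_pending_find(hdev))
                return;         /* canceled or superseded: do nothing */

        /* ... safe to touch cmd->param from here on ... */
}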
params->flags : 0); } mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_ADD_DEVICE, @@ -7768,8 +7620,6 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, goto unlock; } - cmd->user_data = UINT_PTR(current_flags); - err = hci_cmd_sync_queue(hdev, add_device_sync, cmd, add_device_complete); if (err < 0) { @@ -9781,6 +9631,9 @@ void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn, sizeof(*ev) + (name ? eir_precalc_len(name_len) : 0) + eir_precalc_len(sizeof(conn->dev_class))); + if (!skb) + return; + ev = skb_put(skb, sizeof(*ev)); bacpy(&ev->addr.bdaddr, &conn->dst); ev->addr.type = link_to_bdaddr(conn->type, conn->dst_type); @@ -10534,6 +10387,8 @@ void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_FOUND, sizeof(*ev) + (name ? eir_precalc_len(name_len) : 0)); + if (!skb) + return; ev = skb_put(skb, sizeof(*ev)); bacpy(&ev->addr.bdaddr, bdaddr); diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index 17ab909a7c07..3713ff490c65 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -229,23 +229,6 @@ struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode, return NULL; } -struct mgmt_pending_cmd *mgmt_pending_find_data(unsigned short channel, - u16 opcode, - struct hci_dev *hdev, - const void *data) -{ - struct mgmt_pending_cmd *cmd; - - list_for_each_entry(cmd, &hdev->mgmt_pending, list) { - if (cmd->user_data != data) - continue; - if (cmd->opcode == opcode) - return cmd; - } - - return NULL; -} - void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, void (*cb)(struct mgmt_pending_cmd *cmd, void *data), void *data) @@ -321,7 +304,7 @@ void mgmt_mesh_foreach(struct hci_dev *hdev, { struct mgmt_mesh_tx *mesh_tx, *tmp; - list_for_each_entry_safe(mesh_tx, tmp, &hdev->mgmt_pending, list) { + list_for_each_entry_safe(mesh_tx, tmp, &hdev->mesh_pending, list) { if (!sk || mesh_tx->sk == sk) cb(mesh_tx, data); } diff --git a/net/bluetooth/mgmt_util.h b/net/bluetooth/mgmt_util.h index bdf978605d5a..f2ba994ab1d8 100644 --- a/net/bluetooth/mgmt_util.h +++ b/net/bluetooth/mgmt_util.h @@ -54,10 +54,6 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode, struct hci_dev *hdev); -struct mgmt_pending_cmd *mgmt_pending_find_data(unsigned short channel, - u16 opcode, - struct hci_dev *hdev, - const void *data); void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, void (*cb)(struct mgmt_pending_cmd *cmd, void *data), void *data); diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 4c56ca5a216c..20ea7dba0a9a 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -254,7 +254,7 @@ static void rfcomm_session_clear_timer(struct rfcomm_session *s) { BT_DBG("session %p state %ld", s, s->state); - del_timer_sync(&s->timer); + timer_delete_sync(&s->timer); } /* ---- RFCOMM DLCs ---- */ @@ -281,7 +281,7 @@ static void rfcomm_dlc_clear_timer(struct rfcomm_dlc *d) { BT_DBG("dlc %p state %ld", d, d->state); - if (del_timer(&d->timer)) + if (timer_delete(&d->timer)) rfcomm_dlc_put(d); } @@ -2134,11 +2134,6 @@ static int rfcomm_run(void *unused) return 0; } -static bool rfcomm_match(struct hci_conn *hcon) -{ - return hcon->type == ACL_LINK; -} - static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt) { struct rfcomm_session *s; @@ -2185,7 +2180,6 @@ static void rfcomm_security_cfm(struct hci_conn 
*conn, u8 status, u8 encrypt) static struct hci_cb rfcomm_cb = { .name = "RFCOMM", - .match = rfcomm_match, .security_cfm = rfcomm_security_cfm }; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index aa7bfe26cb40..2945d27e75dc 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -107,6 +107,14 @@ static void sco_conn_put(struct sco_conn *conn) kref_put(&conn->ref, sco_conn_free); } +static struct sco_conn *sco_conn_hold(struct sco_conn *conn) +{ + BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref)); + + kref_get(&conn->ref); + return conn; +} + static struct sco_conn *sco_conn_hold_unless_zero(struct sco_conn *conn) { if (!conn) @@ -370,7 +378,8 @@ unlock: return err; } -static int sco_send_frame(struct sock *sk, struct sk_buff *skb) +static int sco_send_frame(struct sock *sk, struct sk_buff *skb, + const struct sockcm_cookie *sockc) { struct sco_conn *conn = sco_pi(sk)->conn; int len = skb->len; @@ -381,6 +390,7 @@ static int sco_send_frame(struct sock *sk, struct sk_buff *skb) BT_DBG("sk %p len %d", sk, len); + hci_setup_tx_timestamp(skb, 1, sockc); hci_send_sco(conn->hcon, skb); return len; @@ -776,6 +786,7 @@ static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg, { struct sock *sk = sock->sk; struct sk_buff *skb; + struct sockcm_cookie sockc; int err; BT_DBG("sock %p, sk %p", sock, sk); @@ -787,6 +798,14 @@ static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; + hci_sockcm_init(&sockc, sk); + + if (msg->msg_controllen) { + err = sock_cmsg_send(sk, msg, &sockc); + if (err) + return err; + } + skb = bt_skb_sendmsg(sk, msg, len, len, 0, 0); if (IS_ERR(skb)) return PTR_ERR(skb); @@ -794,7 +813,7 @@ static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg, lock_sock(sk); if (sk->sk_state == BT_CONNECTED) - err = sco_send_frame(sk, skb); + err = sco_send_frame(sk, skb, &sockc); else err = -ENOTCONN; @@ -860,6 +879,10 @@ static int sco_sock_recvmsg(struct socket *sock, struct msghdr *msg, struct sock *sk = sock->sk; struct sco_pinfo *pi = sco_pi(sk); + if (unlikely(flags & MSG_ERRQUEUE)) + return sock_recv_errqueue(sk, msg, len, SOL_BLUETOOTH, + BT_SCM_ERROR); + lock_sock(sk); if (sk->sk_state == BT_CONNECT2 && @@ -1353,6 +1376,7 @@ static void sco_conn_ready(struct sco_conn *conn) bacpy(&sco_pi(sk)->src, &conn->hcon->src); bacpy(&sco_pi(sk)->dst, &conn->hcon->dst); + sco_conn_hold(conn); hci_conn_hold(conn->hcon); __sco_chan_add(conn, sk, parent); @@ -1398,27 +1422,30 @@ int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) return lm; } -static bool sco_match(struct hci_conn *hcon) -{ - return hcon->type == SCO_LINK || hcon->type == ESCO_LINK; -} - static void sco_connect_cfm(struct hci_conn *hcon, __u8 status) { + if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) + return; + BT_DBG("hcon %p bdaddr %pMR status %u", hcon, &hcon->dst, status); if (!status) { struct sco_conn *conn; conn = sco_conn_add(hcon); - if (conn) + if (conn) { sco_conn_ready(conn); + sco_conn_put(conn); + } } else sco_conn_del(hcon, bt_to_errno(status)); } static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason) { + if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK) + return; + BT_DBG("hcon %p reason %d", hcon, reason); sco_conn_del(hcon, bt_to_errno(reason)); @@ -1444,7 +1471,6 @@ drop: static struct hci_cb sco_cb = { .name = "SCO", - .match = sco_match, .connect_cfm = sco_connect_cfm, .disconn_cfm = sco_disconn_cfm, }; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c 
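This diff drops the ->match() hook from every hci_cb in the subsystem (iso_match, l2cap_match, rfcomm_match and sco_match in the hunks above) and folds the link-type filter into each callback, alongside splitting ISO_LINK into CIS_LINK and BIS_LINK. The resulting registration shape, sketched:

/* Sketch: an hci_cb after this series; the handler filters on
 * hcon->type itself where ->match() used to gate delivery.
 */
static void demo_connect_cfm(struct hci_conn *hcon, __u8 status)
{
        if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK)
                return;         /* previously guaranteed by .match() */

        /* ... protocol-specific handling ... */
}

static struct hci_cb demo_cb = {
        .name           = "DEMO",
        .connect_cfm    = demo_connect_cfm,
        /* no .match member any more */
};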
index 8b9724fd752a..47f359f24d1f 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -55,7 +55,7 @@ /* Keys which are not distributed with Secure Connections */ #define SMP_SC_NO_DIST (SMP_DIST_ENC_KEY | SMP_DIST_LINK_KEY) -#define SMP_TIMEOUT msecs_to_jiffies(30000) +#define SMP_TIMEOUT secs_to_jiffies(30) #define ID_ADDR_TIMEOUT msecs_to_jiffies(200) @@ -608,7 +608,7 @@ static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data) iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iv, 2, 1 + len); - l2cap_chan_send(chan, &msg, 1 + len); + l2cap_chan_send(chan, &msg, 1 + len, NULL); if (!chan->data) return; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 501ec4249fed..aaf13a7d58ed 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -153,7 +153,7 @@ static void xdp_test_run_init_page(netmem_ref netmem, void *arg) new_ctx->data = new_ctx->data_meta + meta_len; xdp_update_frame_from_buff(new_ctx, frm); - frm->mem = new_ctx->rxq->mem; + frm->mem_type = new_ctx->rxq->mem.type; memcpy(&head->orig_ctx, new_ctx, sizeof(head->orig_ctx)); } @@ -246,7 +246,7 @@ static void reset_ctx(struct xdp_page_head *head) head->ctx.data_meta = head->orig_ctx.data_meta; head->ctx.data_end = head->orig_ctx.data_end; xdp_update_frame_from_buff(&head->ctx, head->frame); - head->frame->mem = head->orig_ctx.rxq->mem; + head->frame->mem_type = head->orig_ctx.rxq->mem.type; } static int xdp_recv_frames(struct xdp_frame **frames, int nframes, @@ -569,6 +569,11 @@ __bpf_kfunc u32 bpf_fentry_test9(u32 *a) return *a; } +int noinline bpf_fentry_test10(const void *a) +{ + return (long)a; +} + void noinline bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) { } @@ -660,12 +665,9 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, void __user *data_in = u64_to_user_ptr(kattr->test.data_in); void *data; - if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom) + if (user_size < ETH_HLEN || user_size > PAGE_SIZE - headroom - tailroom) return ERR_PTR(-EINVAL); - if (user_size > size) - return ERR_PTR(-EMSGSIZE); - size = SKB_DATA_ALIGN(size); data = kzalloc(size + headroom + tailroom, GFP_USER); if (!data) @@ -702,7 +704,8 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111 || bpf_fentry_test7((struct bpf_fentry_test_t *)0) != 0 || bpf_fentry_test8(&arg) != 0 || - bpf_fentry_test9(&retval) != 0) + bpf_fentry_test9(&retval) != 0 || + bpf_fentry_test10((void *)0) != 0) goto out; break; case BPF_MODIFY_RETURN: @@ -1018,6 +1021,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_CGROUP_SKB: is_direct_pkt_access = true; break; default: diff --git a/net/bridge/br.c b/net/bridge/br.c index 2cab878e0a39..0adeafe11a36 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -51,6 +51,13 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v } } + if (is_vlan_dev(dev)) { + struct net_device *real_dev = vlan_dev_real_dev(dev); + + if (netif_is_bridge_master(real_dev)) + br_vlan_vlan_upper_event(real_dev, dev, event); + } + /* not a port of a bridge */ p = br_port_get_rtnl(dev); if (!p) @@ -277,6 +284,9 @@ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, case BR_BOOLOPT_MST_ENABLE: err = br_mst_set_enabled(br, on, extack); break; + case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: + br_opt_toggle(br, 
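The bpf_test_init() hunk above validates the caller-supplied user_size directly, ETH_HLEN at minimum and PAGE_SIZE minus head/tailroom at most, replacing the separate -EMSGSIZE branch. From userspace the contract is unchanged in spirit: test inputs must be at least an Ethernet header. A hedged libbpf sketch, where prog_fd is assumed to refer to a loaded packet-level program:

#include <linux/if_ether.h>
#include <bpf/bpf.h>

static int run_once(int prog_fd)
{
        /* >= ETH_HLEN and comfortably below PAGE_SIZE - overheads */
        char frame[ETH_HLEN + 46] = { 0 };
        LIBBPF_OPTS(bpf_test_run_opts, opts,
                .data_in = frame,
                .data_size_in = sizeof(frame),
                .repeat = 1,
        );

        return bpf_prog_test_run_opts(prog_fd, &opts);
}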
BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, on); + break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -295,6 +305,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); case BR_BOOLOPT_MST_ENABLE: return br_opt_get(br, BROPT_MST_ENABLED); + case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: + return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION); default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -356,21 +368,20 @@ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) clear_bit(opt, &br->options); } -static void __net_exit br_net_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) +static void __net_exit br_net_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { struct net_device *dev; - struct net *net; - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - for_each_netdev(net, dev) - if (netif_is_bridge_master(dev)) - br_dev_delete(dev, dev_to_kill); + ASSERT_RTNL_NET(net); + + for_each_netdev(net, dev) + if (netif_is_bridge_master(dev)) + br_dev_delete(dev, dev_to_kill); } static struct pernet_operations br_net_ops = { - .exit_batch_rtnl = br_net_exit_batch_rtnl, + .exit_rtnl = br_net_exit_rtnl, }; static const struct stp_proto br_stp_proto = { diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index c7869a286df4..1e2b51769eec 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -160,6 +160,9 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) { if (br_is_neigh_suppress_enabled(p, vid)) return; + if (is_unicast_ether_addr(eth_hdr(skb)->h_dest) && + parp->ar_op == htons(ARPOP_REQUEST)) + return; if (parp->ar_op != htons(ARPOP_RREQUEST) && parp->ar_op != htons(ARPOP_RREPLY) && (ipv4_is_zeronet(sip) || sip == tip)) { @@ -229,7 +232,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, #endif #if IS_ENABLED(CONFIG_IPV6) -struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *msg) +struct nd_msg *br_is_nd_neigh_msg(const struct sk_buff *skb, struct nd_msg *msg) { struct nd_msg *m; @@ -410,6 +413,10 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, if (br_is_neigh_suppress_enabled(p, vid)) return; + if (is_unicast_ether_addr(eth_hdr(skb)->h_dest) && + msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) + return; + if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT && !msg->icmph.icmp6_solicited) { /* prevent flooding to neigh suppress ports */ diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 0ab4613aa07a..a818fdc22da9 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -16,6 +16,8 @@ #include <linux/netfilter_bridge.h> #include <linux/uaccess.h> +#include <net/netdev_lock.h> + #include "br_private.h" #define COMMON_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | \ @@ -488,7 +490,7 @@ void br_dev_setup(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &br_type); dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE; dev->lltx = true; - dev->netns_local = true; + dev->netns_immutable = true; dev->features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 82bac2426631..902694c0ce64 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -955,6 +955,7 @@ int br_fdb_dump(struct sk_buff 
*skb, struct net_device *filter_dev, int *idx) { + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct net_bridge *br = netdev_priv(dev); struct net_bridge_fdb_entry *f; int err = 0; @@ -970,7 +971,7 @@ int br_fdb_dump(struct sk_buff *skb, rcu_read_lock(); hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) { - if (*idx < cb->args[2]) + if (*idx < ctx->fdb_idx) goto skip; if (filter_dev && (!f->dst || f->dst->dev != filter_dev)) { if (filter_dev != dev) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index e19b583ff2c6..29097e984b4f 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -201,6 +201,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, enum br_pkt_type pkt_type, bool local_rcv, bool local_orig, u16 vid) { + enum skb_drop_reason reason = SKB_DROP_REASON_NO_TX_TARGET; struct net_bridge_port *prev = NULL; struct net_bridge_port *p; @@ -234,8 +235,11 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, continue; prev = maybe_deliver(prev, p, skb, local_orig); - if (IS_ERR(prev)) + if (IS_ERR(prev)) { + reason = PTR_ERR(prev) == -ENOMEM ? SKB_DROP_REASON_NOMEM : + SKB_DROP_REASON_NOT_SPECIFIED; goto out; + } } if (!prev) @@ -249,7 +253,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, out: if (!local_rcv) - kfree_skb(skb); + kfree_skb_reason(skb, reason); } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING @@ -289,6 +293,7 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct net_bridge_mcast *brmctx, bool local_rcv, bool local_orig) { + enum skb_drop_reason reason = SKB_DROP_REASON_NO_TX_TARGET; struct net_bridge_port *prev = NULL; struct net_bridge_port_group *p; bool allow_mode_include = true; @@ -329,8 +334,11 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, } prev = maybe_deliver(prev, port, skb, local_orig); - if (IS_ERR(prev)) + if (IS_ERR(prev)) { + reason = PTR_ERR(prev) == -ENOMEM ? 
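The br_flood() and br_multicast_flood() hunks around this point, and the br_input.c ones just below, convert bare kfree_skb() calls to kfree_skb_reason() with a reason threaded through the error paths (SKB_DROP_REASON_NO_TX_TARGET by default, SKB_DROP_REASON_NOMEM when maybe_deliver() failed allocation). The conversion pattern, sketched:

#include <linux/skbuff.h>
#include <net/dropreason.h>

/* Sketch: carry a drop reason alongside the failure path so tracing
 * tools (dropwatch, perf trace skb:kfree_skb) can attribute the
 * drop, instead of an anonymous kfree_skb().
 */
static void demo_flood(struct sk_buff *skb, bool have_port)
{
        enum skb_drop_reason reason = SKB_DROP_REASON_NO_TX_TARGET;

        if (!have_port)
                goto drop;

        /* ... delivery; overwrite 'reason' on specific failures ... */
        return;

drop:
        kfree_skb_reason(skb, reason);
}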
SKB_DROP_REASON_NOMEM : + SKB_DROP_REASON_NOT_SPECIFIED; goto out; + } delivered: if ((unsigned long)lport >= (unsigned long)port) p = rcu_dereference(p->next); @@ -349,6 +357,6 @@ delivered: out: if (!local_rcv) - kfree_skb(skb); + kfree_skb_reason(skb, reason); } #endif diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index ceaa5a89b947..5f6ac9bf1527 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -75,6 +75,7 @@ static int br_pass_frame_up(struct sk_buff *skb, bool promisc) /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct net_bridge_port *p = br_port_get_rcu(skb->dev); enum br_pkt_type pkt_type = BR_PKT_UNICAST; struct net_bridge_fdb_entry *dst = NULL; @@ -96,8 +97,10 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (br_mst_is_enabled(br)) { state = BR_STATE_FORWARDING; } else { - if (p->state == BR_STATE_DISABLED) + if (p->state == BR_STATE_DISABLED) { + reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE; goto drop; + } state = p->state; } @@ -155,8 +158,10 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb } } - if (state == BR_STATE_LEARNING) + if (state == BR_STATE_LEARNING) { + reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE; goto drop; + } BR_INPUT_SKB_CB(skb)->brdev = br->dev; BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED); @@ -184,7 +189,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) { if ((mdst && mdst->host_joined) || - br_multicast_is_router(brmctx, skb)) { + br_multicast_is_router(brmctx, skb) || + br->dev->flags & IFF_ALLMULTI) { local_rcv = true; DEV_STATS_INC(br->dev, multicast); } @@ -223,7 +229,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb out: return 0; drop: - kfree_skb(skb); + kfree_skb_reason(skb, reason); goto out; } EXPORT_SYMBOL_GPL(br_handle_frame_finish); @@ -324,6 +330,7 @@ static int br_process_frame_type(struct net_bridge_port *p, */ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct net_bridge_port *p; struct sk_buff *skb = *pskb; const unsigned char *dest = eth_hdr(skb)->h_dest; @@ -331,8 +338,10 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; - if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) + if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) { + reason = SKB_DROP_REASON_MAC_INVALID_SOURCE; goto drop; + } skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) @@ -374,6 +383,7 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) return RX_HANDLER_PASS; case 0x01: /* IEEE MAC (Pause) */ + reason = SKB_DROP_REASON_MAC_IEEE_MAC_CONTROL; goto drop; case 0x0E: /* 802.1AB LLDP */ @@ -423,8 +433,9 @@ defer_stp_filtering: return nf_hook_bridge_pre(skb, pskb); default: + reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE; drop: - kfree_skb(skb); + kfree_skb_reason(skb, reason); } return RX_HANDLER_CONSUMED; } diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index f213ed108361..6bc0a11f2ed3 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -394,10 +394,26 @@ static int old_deviceless(struct net *net, 
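The br_ioctl.c rework that follows moves the ifreq copy and the bridge-device lookup for SIOCBRADDIF/SIOCBRDELIF into br_ioctl_stub() itself, so the stub no longer needs a pre-resolved struct net_bridge argument; the userspace side of the call is untouched. For reference, a sketch of that call as bridge utilities issue it (any socket fd will do; error handling trimmed):

#include <net/if.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>

static int bridge_add_if(int sock, const char *brname, const char *ifname)
{
        struct ifreq ifr;

        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, brname, IFNAMSIZ - 1);
        ifr.ifr_ifindex = if_nametoindex(ifname);
        if (!ifr.ifr_ifindex)
                return -1;

        return ioctl(sock, SIOCBRADDIF, &ifr);
}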
void __user *data) return -EOPNOTSUPP; } -int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, - struct ifreq *ifr, void __user *uarg) +int br_ioctl_stub(struct net *net, unsigned int cmd, void __user *uarg) { int ret = -EOPNOTSUPP; + struct ifreq ifr; + + if (cmd == SIOCBRADDIF || cmd == SIOCBRDELIF) { + void __user *data; + char *colon; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (get_user_ifreq(&ifr, &data, uarg)) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ - 1] = 0; + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + } rtnl_lock(); @@ -430,7 +446,21 @@ int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, break; case SIOCBRADDIF: case SIOCBRDELIF: - ret = add_del_if(br, ifr->ifr_ifindex, cmd == SIOCBRADDIF); + { + struct net_device *dev; + + dev = __dev_get_by_name(net, ifr.ifr_name); + if (!dev || !netif_device_present(dev)) { + ret = -ENODEV; + break; + } + if (!netif_is_bridge_master(dev)) { + ret = -EOPNOTSUPP; + break; + } + + ret = add_del_if(netdev_priv(dev), ifr.ifr_ifindex, cmd == SIOCBRADDIF); + } break; } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 1a52a0bca086..400eb872b403 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -144,6 +144,8 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags) e->flags |= MDB_FLAGS_STAR_EXCL; if (flags & MDB_PG_FLAGS_BLOCKED) e->flags |= MDB_FLAGS_BLOCKED; + if (flags & MDB_PG_FLAGS_OFFLOAD_FAILED) + e->flags |= MDB_FLAGS_OFFLOAD_FAILED; } static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip, @@ -517,16 +519,17 @@ static size_t rtnl_mdb_nlmsg_size(const struct net_bridge_port_group *pg) rtnl_mdb_nlmsg_pg_size(pg); } -void br_mdb_notify(struct net_device *dev, - struct net_bridge_mdb_entry *mp, - struct net_bridge_port_group *pg, - int type) +static void __br_mdb_notify(struct net_device *dev, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + int type, bool notify_switchdev) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; - br_switchdev_mdb_notify(dev, mp, pg, type); + if (notify_switchdev) + br_switchdev_mdb_notify(dev, mp, pg, type); skb = nlmsg_new(rtnl_mdb_nlmsg_size(pg), GFP_ATOMIC); if (!skb) @@ -544,6 +547,21 @@ errout: rtnl_set_sk_err(net, RTNLGRP_MDB, err); } +void br_mdb_notify(struct net_device *dev, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + int type) +{ + __br_mdb_notify(dev, mp, pg, type, true); +} + +void br_mdb_flag_change_notify(struct net_device *dev, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg) +{ + __br_mdb_notify(dev, mp, pg, RTM_NEWMDB, false); +} + static int nlmsg_populate_rtr_fill(struct sk_buff *skb, struct net_device *dev, int ifindex, u16 vid, u32 pid, @@ -732,7 +750,7 @@ static int br_mdb_replace_group_sg(const struct br_mdb_config *cfg, mod_timer(&pg->timer, now + brmctx->multicast_membership_interval); else - del_timer(&pg->timer); + timer_delete(&pg->timer); br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB); @@ -853,7 +871,7 @@ static int br_mdb_add_group_src(const struct br_mdb_config *cfg, cfg->entry->state == MDB_TEMPORARY) mod_timer(&ent->timer, now + br_multicast_gmi(brmctx)); else - del_timer(&ent->timer); + timer_delete(&ent->timer); /* Install a (S, G) forwarding entry for the source. 
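With the br_ioctl_stub() rework above, callers no longer pre-resolve a bridge device: for SIOCBRADDIF/SIOCBRDELIF the stub itself copies the ifreq in, checks CAP_NET_ADMIN, and resolves the bridge by name under RTNL. A condensed sketch of the new flow (error unwinding trimmed):

	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
		return -EPERM;
	if (get_user_ifreq(&ifr, &data, uarg))
		return -EFAULT;
	ifr.ifr_name[IFNAMSIZ - 1] = 0;		/* terminate, strip ':' alias */

	rtnl_lock();
	dev = __dev_get_by_name(net, ifr.ifr_name);
	if (dev && netif_device_present(dev) && netif_is_bridge_master(dev))
		ret = add_del_if(netdev_priv(dev), ifr.ifr_ifindex,
				 cmd == SIOCBRADDIF);
	rtnl_unlock();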
*/ err = br_mdb_add_group_src_fwd(cfg, &src->addr, brmctx, extack); @@ -953,7 +971,7 @@ static int br_mdb_replace_group_star_g(const struct br_mdb_config *cfg, mod_timer(&pg->timer, now + brmctx->multicast_membership_interval); else - del_timer(&pg->timer); + timer_delete(&pg->timer); br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB); @@ -1040,7 +1058,7 @@ static int br_mdb_add_group(const struct br_mdb_config *cfg, /* host join */ if (!port) { - if (mp->host_joined) { + if (mp->host_joined && !(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host"); return -EEXIST; } diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c index 1820f09ff59c..3f24b4ee49c2 100644 --- a/net/bridge/br_mst.c +++ b/net/bridge/br_mst.c @@ -80,10 +80,10 @@ static void br_mst_vlan_set_state(struct net_bridge_vlan_group *vg, if (br_vlan_get_state(v) == state) return; - br_vlan_set_state(v, state); - if (v->vid == vg->pvid) br_vlan_set_pvid_state(vg, state); + + br_vlan_set_state(v, state); } int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index b2ae0d2434d2..fb6f7f2001c9 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -546,7 +546,7 @@ static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) return; /* the kernel is now responsible for removing this S,G */ - del_timer(&sg->timer); + timer_delete(&sg->timer); star_mp = br_mdb_ip_get(src->br, &src->pg->key.addr); if (!star_mp) return; @@ -2015,9 +2015,9 @@ void br_multicast_port_ctx_init(struct net_bridge_port *port, void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) - del_timer_sync(&pmctx->ip6_mc_router_timer); + timer_delete_sync(&pmctx->ip6_mc_router_timer); #endif - del_timer_sync(&pmctx->ip4_mc_router_timer); + timer_delete_sync(&pmctx->ip4_mc_router_timer); } int br_multicast_add_port(struct net_bridge_port *port) @@ -2061,8 +2061,8 @@ static void br_multicast_enable(struct bridge_mcast_own_query *query) { query->startup_sent = 0; - if (try_to_del_timer_sync(&query->timer) >= 0 || - del_timer(&query->timer)) + if (timer_delete_sync_try(&query->timer) >= 0 || + timer_delete(&query->timer)) mod_timer(&query->timer, jiffies); } @@ -2105,12 +2105,17 @@ static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) } } -void br_multicast_enable_port(struct net_bridge_port *port) +static void br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) { - struct net_bridge *br = port->br; + struct net_bridge *br = pmctx->port->br; spin_lock_bh(&br->multicast_lock); - __br_multicast_enable_port_ctx(&port->multicast_ctx); + if (br_multicast_port_ctx_is_vlan(pmctx) && + !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) { + spin_unlock_bh(&br->multicast_lock); + return; + } + __br_multicast_enable_port_ctx(pmctx); spin_unlock_bh(&br->multicast_lock); } @@ -2127,21 +2132,77 @@ static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) br_multicast_find_del_pg(pmctx->port->br, pg); del |= br_ip4_multicast_rport_del(pmctx); - del_timer(&pmctx->ip4_mc_router_timer); - del_timer(&pmctx->ip4_own_query.timer); + timer_delete(&pmctx->ip4_mc_router_timer); + timer_delete(&pmctx->ip4_own_query.timer); del |= br_ip6_multicast_rport_del(pmctx); #if IS_ENABLED(CONFIG_IPV6) - del_timer(&pmctx->ip6_mc_router_timer); - del_timer(&pmctx->ip6_own_query.timer); + timer_delete(&pmctx->ip6_mc_router_timer); + 
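The del_timer*() conversions above, and throughout the rest of this series, are mechanical renames to the current timer API; semantics are unchanged:

	timer_delete(&t);		/* was del_timer(): deactivate, don't wait */
	timer_delete_sync(&t);		/* was del_timer_sync(): wait for a
					 * running handler to finish */
	ret = timer_delete_sync_try(&t);/* was try_to_del_timer_sync():
					 * >= 0 on success, negative if the
					 * handler is currently running */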
timer_delete(&pmctx->ip6_own_query.timer); #endif br_multicast_rport_del_notify(pmctx, del); } +static void br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) +{ + struct net_bridge *br = pmctx->port->br; + + spin_lock_bh(&br->multicast_lock); + if (br_multicast_port_ctx_is_vlan(pmctx) && + !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) { + spin_unlock_bh(&br->multicast_lock); + return; + } + + __br_multicast_disable_port_ctx(pmctx); + spin_unlock_bh(&br->multicast_lock); +} + +static void br_multicast_toggle_port(struct net_bridge_port *port, bool on) +{ +#if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) + if (br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + + rcu_read_lock(); + vg = nbp_vlan_group_rcu(port); + if (!vg) { + rcu_read_unlock(); + return; + } + + /* iterate each vlan, toggle vlan multicast context */ + list_for_each_entry_rcu(vlan, &vg->vlan_list, vlist) { + struct net_bridge_mcast_port *pmctx = + &vlan->port_mcast_ctx; + u8 state = br_vlan_get_state(vlan); + /* enable vlan multicast context when state is + * LEARNING or FORWARDING + */ + if (on && br_vlan_state_allowed(state, true)) + br_multicast_enable_port_ctx(pmctx); + else + br_multicast_disable_port_ctx(pmctx); + } + rcu_read_unlock(); + return; + } +#endif + /* toggle port multicast context when vlan snooping is disabled */ + if (on) + br_multicast_enable_port_ctx(&port->multicast_ctx); + else + br_multicast_disable_port_ctx(&port->multicast_ctx); +} + +void br_multicast_enable_port(struct net_bridge_port *port) +{ + br_multicast_toggle_port(port, true); +} + void br_multicast_disable_port(struct net_bridge_port *port) { - spin_lock_bh(&port->br->multicast_lock); - __br_multicast_disable_port_ctx(&port->multicast_ctx); - spin_unlock_bh(&port->br->multicast_lock); + br_multicast_toggle_port(port, false); } static int __grp_src_delete_marked(struct net_bridge_port_group *pg) @@ -3480,7 +3541,7 @@ static void br_ip4_multicast_query(struct net_bridge_mcast *brmctx, if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : - try_to_del_timer_sync(&mp->timer) >= 0)) + timer_delete_sync_try(&mp->timer) >= 0)) mod_timer(&mp->timer, now + max_delay); for (pp = &mp->ports; @@ -3488,7 +3549,7 @@ static void br_ip4_multicast_query(struct net_bridge_mcast *brmctx, pp = &p->next) { if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : - try_to_del_timer_sync(&p->timer) >= 0 && + timer_delete_sync_try(&p->timer) >= 0 && (brmctx->multicast_igmp_version == 2 || p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); @@ -3569,7 +3630,7 @@ static int br_ip6_multicast_query(struct net_bridge_mcast *brmctx, if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : - try_to_del_timer_sync(&mp->timer) >= 0)) + timer_delete_sync_try(&mp->timer) >= 0)) mod_timer(&mp->timer, now + max_delay); for (pp = &mp->ports; @@ -3577,7 +3638,7 @@ static int br_ip6_multicast_query(struct net_bridge_mcast *brmctx, pp = &p->next) { if (timer_pending(&p->timer) ? 
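The query-handling hunks above keep the long-standing "only shorten a timer" idiom, just spelled with the renamed API. A sketch, assuming t and when are in scope:

	/* Rearm t to "when" only if that moves expiry earlier. For an
	 * idle timer, rearm only if it could be stopped atomically; a
	 * negative timer_delete_sync_try() means the handler is
	 * running right now, so leave it alone.
	 */
	if (timer_pending(&t) ?
	    time_after(t.expires, when) :
	    timer_delete_sync_try(&t) >= 0)
		mod_timer(&t, when);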
time_after(p->timer.expires, now + max_delay) : - try_to_del_timer_sync(&p->timer) >= 0 && + timer_delete_sync_try(&p->timer) >= 0 && (brmctx->multicast_mld_version == 1 || p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); @@ -3649,7 +3710,7 @@ br_multicast_leave_group(struct net_bridge_mcast *brmctx, if (!hlist_unhashed(&p->mglist) && (timer_pending(&p->timer) ? time_after(p->timer.expires, time) : - try_to_del_timer_sync(&p->timer) >= 0)) { + timer_delete_sync_try(&p->timer) >= 0)) { mod_timer(&p->timer, time); } @@ -3665,7 +3726,7 @@ br_multicast_leave_group(struct net_bridge_mcast *brmctx, if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, time) : - try_to_del_timer_sync(&mp->timer) >= 0)) { + timer_delete_sync_try(&mp->timer) >= 0)) { mod_timer(&mp->timer, time); } @@ -3681,7 +3742,7 @@ br_multicast_leave_group(struct net_bridge_mcast *brmctx, if (!hlist_unhashed(&p->mglist) && (timer_pending(&p->timer) ? time_after(p->timer.expires, time) : - try_to_del_timer_sync(&p->timer) >= 0)) { + timer_delete_sync_try(&p->timer) >= 0)) { mod_timer(&p->timer, time); } @@ -4199,15 +4260,41 @@ void br_multicast_open(struct net_bridge *br) static void __br_multicast_stop(struct net_bridge_mcast *brmctx) { - del_timer_sync(&brmctx->ip4_mc_router_timer); - del_timer_sync(&brmctx->ip4_other_query.timer); - del_timer_sync(&brmctx->ip4_other_query.delay_timer); - del_timer_sync(&brmctx->ip4_own_query.timer); + timer_delete_sync(&brmctx->ip4_mc_router_timer); + timer_delete_sync(&brmctx->ip4_other_query.timer); + timer_delete_sync(&brmctx->ip4_other_query.delay_timer); + timer_delete_sync(&brmctx->ip4_own_query.timer); #if IS_ENABLED(CONFIG_IPV6) - del_timer_sync(&brmctx->ip6_mc_router_timer); - del_timer_sync(&brmctx->ip6_other_query.timer); - del_timer_sync(&brmctx->ip6_other_query.delay_timer); - del_timer_sync(&brmctx->ip6_own_query.timer); + timer_delete_sync(&brmctx->ip6_mc_router_timer); + timer_delete_sync(&brmctx->ip6_other_query.timer); + timer_delete_sync(&brmctx->ip6_other_query.delay_timer); + timer_delete_sync(&brmctx->ip6_own_query.timer); +#endif +} + +void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state) +{ +#if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) + struct net_bridge *br; + + if (!br_vlan_should_use(v)) + return; + + if (br_vlan_is_master(v)) + return; + + br = v->port->br; + + if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) + return; + + if (br_vlan_state_allowed(state, true)) + br_multicast_enable_port_ctx(&v->port_mcast_ctx); + + /* Multicast is not disabled for the vlan when it goes in + * blocking state because the timers will expire and stop by + * themselves without sending more queries. 
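br_multicast_update_vlan_mcast_ctx() above ties a vlan's multicast context to its STP state when per-vlan snooping is on: LEARNING/FORWARDING enables the port context, while BLOCKING is intentionally left to the timers so pending queries wind down by themselves. Both toggle helpers from the earlier hunks use the same guard, so vlan contexts without BR_VLFLAG_MCAST_ENABLED are skipped:

	spin_lock_bh(&br->multicast_lock);
	if (br_multicast_port_ctx_is_vlan(pmctx) &&
	    !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) {
		spin_unlock_bh(&br->multicast_lock);
		return;			/* snooping off for this vlan */
	}
	__br_multicast_enable_port_ctx(pmctx);	/* or the disable variant */
	spin_unlock_bh(&br->multicast_lock);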
+ */ #endif } @@ -4304,9 +4391,9 @@ int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, __br_multicast_open(&br->multicast_ctx); list_for_each_entry(p, &br->port_list, list) { if (on) - br_multicast_disable_port(p); + br_multicast_disable_port_ctx(&p->multicast_ctx); else - br_multicast_enable_port(p); + br_multicast_enable_port_ctx(&p->multicast_ctx); } list_for_each_entry(vlan, &vg->vlan_list, vlist) @@ -4384,9 +4471,9 @@ int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val) case MDB_RTR_TYPE_DISABLED: case MDB_RTR_TYPE_PERM: br_mc_router_state_change(brmctx->br, val == MDB_RTR_TYPE_PERM); - del_timer(&brmctx->ip4_mc_router_timer); + timer_delete(&brmctx->ip4_mc_router_timer); #if IS_ENABLED(CONFIG_IPV6) - del_timer(&brmctx->ip6_mc_router_timer); + timer_delete(&brmctx->ip6_mc_router_timer); #endif brmctx->multicast_router = val; err = 0; @@ -4455,10 +4542,10 @@ int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx, case MDB_RTR_TYPE_DISABLED: pmctx->multicast_router = MDB_RTR_TYPE_DISABLED; del |= br_ip4_multicast_rport_del(pmctx); - del_timer(&pmctx->ip4_mc_router_timer); + timer_delete(&pmctx->ip4_mc_router_timer); del |= br_ip6_multicast_rport_del(pmctx); #if IS_ENABLED(CONFIG_IPV6) - del_timer(&pmctx->ip6_mc_router_timer); + timer_delete(&pmctx->ip6_mc_router_timer); #endif br_multicast_rport_del_notify(pmctx, del); break; @@ -4470,10 +4557,10 @@ int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx, break; case MDB_RTR_TYPE_PERM: pmctx->multicast_router = MDB_RTR_TYPE_PERM; - del_timer(&pmctx->ip4_mc_router_timer); + timer_delete(&pmctx->ip4_mc_router_timer); br_ip4_multicast_add_router(brmctx, pmctx); #if IS_ENABLED(CONFIG_IPV6) - del_timer(&pmctx->ip6_mc_router_timer); + timer_delete(&pmctx->ip6_mc_router_timer); #endif br_ip6_multicast_add_router(brmctx, pmctx); break; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 451e45b9a6a5..94cbe967d1c1 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -393,38 +393,10 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ reason = ip_route_input(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev); if (reason) { - struct in_device *in_dev = __in_dev_get_rcu(dev); - - /* If err equals -EHOSTUNREACH the error is due to a - * martian destination or due to the fact that - * forwarding is disabled. For most martian packets, - * ip_route_output_key() will fail. It won't fail for 2 types of - * martian destinations: loopback destinations and destination - * 0.0.0.0. In both cases the packet will be dropped because the - * destination is the loopback device and not the bridge. */ - if (reason != SKB_DROP_REASON_IP_INADDRERRORS || !in_dev || - IN_DEV_FORWARD(in_dev)) - goto free_skb; - - rt = ip_route_output(net, iph->daddr, 0, - ip4h_dscp(iph), 0, - RT_SCOPE_UNIVERSE); - if (!IS_ERR(rt)) { - /* - Bridged-and-DNAT'ed traffic doesn't - * require ip_forwarding. 
*/ - if (rt->dst.dev == dev) { - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - goto bridged_dnat; - } - ip_rt_put(rt); - } -free_skb: - kfree_skb(skb); + kfree_skb_reason(skb, reason); return 0; } else { if (skb_dst(skb)->dev == dev) { -bridged_dnat: skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 3e0f47203f2a..6e337937d0d7 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1553,11 +1553,13 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], return 0; } -static int br_dev_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int br_dev_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; int err; err = register_netdevice(dev); diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c index 98aea5485aae..a8c67035e23c 100644 --- a/net/bridge/br_nf_core.c +++ b/net/bridge/br_nf_core.c @@ -65,17 +65,14 @@ static struct dst_ops fake_dst_ops = { * ipt_REJECT needs it. Future netfilter modules might * require us to fill additional fields. */ -static const u32 br_dst_default_metrics[RTAX_MAX] = { - [RTAX_MTU - 1] = 1500, -}; - void br_netfilter_rtable_init(struct net_bridge *br) { struct rtable *rt = &br->fake_rtable; rcuref_init(&rt->dst.__rcuref, 1); rt->dst.dev = br->dev; - dst_init_metrics(&rt->dst, br_dst_default_metrics, true); + dst_init_metrics(&rt->dst, br->metrics, false); + dst_metric_set(&rt->dst, RTAX_MTU, br->dev->mtu); rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; rt->dst.ops = &fake_dst_ops; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 9853cfbb9d14..b159aae594c0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -306,11 +306,12 @@ struct net_bridge_fdb_flush_desc { u16 vlan_id; }; -#define MDB_PG_FLAGS_PERMANENT BIT(0) -#define MDB_PG_FLAGS_OFFLOAD BIT(1) -#define MDB_PG_FLAGS_FAST_LEAVE BIT(2) -#define MDB_PG_FLAGS_STAR_EXCL BIT(3) -#define MDB_PG_FLAGS_BLOCKED BIT(4) +#define MDB_PG_FLAGS_PERMANENT BIT(0) +#define MDB_PG_FLAGS_OFFLOAD BIT(1) +#define MDB_PG_FLAGS_FAST_LEAVE BIT(2) +#define MDB_PG_FLAGS_STAR_EXCL BIT(3) +#define MDB_PG_FLAGS_BLOCKED BIT(4) +#define MDB_PG_FLAGS_OFFLOAD_FAILED BIT(5) #define PG_SRC_ENT_LIMIT 32 @@ -483,6 +484,7 @@ enum net_bridge_opts { BROPT_VLAN_BRIDGE_BINDING, BROPT_MCAST_VLAN_SNOOPING_ENABLED, BROPT_MST_ENABLED, + BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, }; struct net_bridge { @@ -505,6 +507,7 @@ struct net_bridge { struct rtable fake_rtable; struct rt6_info fake_rt6_info; }; + u32 metrics[RTAX_MAX]; #endif u16 group_fwd_mask; u16 group_fwd_mask_required; @@ -949,8 +952,7 @@ br_port_get_check_rtnl(const struct net_device *dev) /* br_ioctl.c */ int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq, void __user *data, int cmd); -int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, - struct ifreq *ifr, void __user *uarg); +int br_ioctl_stub(struct net *net, unsigned int cmd, void __user *uarg); /* br_multicast.c */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING @@ -1003,6 +1005,8 @@ int br_mdb_hash_init(struct net_bridge *br); void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type); +void 
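The br_nf_core hunk above drops the shared, read-only default metrics (with a hard-coded 1500-byte MTU) in favor of a writable per-bridge u32 metrics[RTAX_MAX], added to struct net_bridge in the br_private.h hunk above, so the fake rtable's MTU can follow the bridge device:

	/* third argument false: the metrics array is writable */
	dst_init_metrics(&rt->dst, br->metrics, false);
	dst_metric_set(&rt->dst, RTAX_MTU, br->dev->mtu);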
br_mdb_flag_change_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg); void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type); void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, @@ -1052,6 +1056,7 @@ void br_multicast_port_ctx_init(struct net_bridge_port *port, struct net_bridge_vlan *vlan, struct net_bridge_mcast_port *pmctx); void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx); +void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state); void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on); int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, struct netlink_ext_ack *extack); @@ -1343,6 +1348,22 @@ br_multicast_ctx_matches_vlan_snooping(const struct net_bridge_mcast *brmctx) return !!(vlan_snooping_enabled == br_multicast_ctx_is_vlan(brmctx)); } + +static inline void +br_multicast_set_pg_offload_flags(struct net_bridge_port_group *p, + bool offloaded) +{ + p->flags &= ~(MDB_PG_FLAGS_OFFLOAD | MDB_PG_FLAGS_OFFLOAD_FAILED); + p->flags |= (offloaded ? MDB_PG_FLAGS_OFFLOAD : + MDB_PG_FLAGS_OFFLOAD_FAILED); +} + +static inline bool +br_mdb_should_notify(const struct net_bridge *br, u8 changed_flags) +{ + return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION) && + (changed_flags & MDB_PG_FLAGS_OFFLOAD_FAILED); +} #else static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx, struct net_bridge_mcast_port **pmctx, @@ -1502,6 +1523,11 @@ static inline void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pm { } +static inline void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, + u8 state) +{ +} + static inline void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) { @@ -1571,6 +1597,9 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, void br_vlan_port_event(struct net_bridge_port *p, unsigned long event); int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr); +void br_vlan_vlan_upper_event(struct net_device *br_dev, + struct net_device *vlan_dev, + unsigned long event); int br_vlan_rtnl_init(void); void br_vlan_rtnl_uninit(void); void br_vlan_notify(const struct net_bridge *br, @@ -1802,6 +1831,12 @@ static inline int br_vlan_bridge_event(struct net_device *dev, return 0; } +static inline void br_vlan_vlan_upper_event(struct net_device *br_dev, + struct net_device *vlan_dev, + unsigned long event) +{ +} + static inline int br_vlan_rtnl_init(void) { return 0; @@ -1853,7 +1888,9 @@ bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, const struct net_bridge_vlan *v_opts); -/* vlan state manipulation helpers using *_ONCE to annotate lock-free access */ +/* vlan state manipulation helpers using *_ONCE to annotate lock-free access, + * while br_vlan_set_state() may access data protected by multicast_lock. 
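The two inline helpers above are consumed by the switchdev completion path (see the br_switchdev.c hunk further down): a completed offload flips exactly one of OFFLOAD/OFFLOAD_FAILED, and a netlink notification is generated only when the FAILED bit actually changed and the bridge opted in via BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION:

	u8 old_flags = p->flags;

	br_multicast_set_pg_offload_flags(p, !err);	/* err from switchdev */
	if (br_mdb_should_notify(br, old_flags ^ p->flags))
		br_mdb_flag_change_notify(br->dev, mp, p);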
+ */ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) { return READ_ONCE(v->state); @@ -1862,6 +1899,7 @@ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) static inline void br_vlan_set_state(struct net_bridge_vlan *v, u8 state) { WRITE_ONCE(v->state, state); + br_multicast_update_vlan_mcast_ctx(v, state); } static inline u8 br_vlan_get_pvid_state(const struct net_bridge_vlan_group *vg) @@ -2290,6 +2328,6 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, u16 vid, struct net_bridge_port *p); void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, u16 vid, struct net_bridge_port *p, struct nd_msg *msg); -struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *m); +struct nd_msg *br_is_nd_neigh_msg(const struct sk_buff *skb, struct nd_msg *m); bool br_is_neigh_suppress_enabled(const struct net_bridge_port *p, u16 vid); #endif diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 7d27b2e6038f..024210f95468 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -198,7 +198,7 @@ void br_become_root_bridge(struct net_bridge *br) br->hello_time = br->bridge_hello_time; br->forward_delay = br->bridge_forward_delay; br_topology_change_detection(br); - del_timer(&br->tcn_timer); + timer_delete(&br->tcn_timer); if (br->dev->flags & IFF_UP) { br_config_bpdu_generation(br); @@ -363,7 +363,7 @@ static int br_supersedes_port_info(const struct net_bridge_port *p, static void br_topology_change_acknowledged(struct net_bridge *br) { br->topology_change_detected = 0; - del_timer(&br->tcn_timer); + timer_delete(&br->tcn_timer); } /* called under bridge lock */ @@ -439,7 +439,7 @@ static void br_make_blocking(struct net_bridge_port *p) br_set_state(p, BR_STATE_BLOCKING); br_ifinfo_notify(RTM_NEWLINK, NULL, p); - del_timer(&p->forward_delay_timer); + timer_delete(&p->forward_delay_timer); } } @@ -454,7 +454,7 @@ static void br_make_forwarding(struct net_bridge_port *p) if (br->stp_enabled == BR_NO_STP || br->forward_delay == 0) { br_set_state(p, BR_STATE_FORWARDING); br_topology_change_detection(br); - del_timer(&p->forward_delay_timer); + timer_delete(&p->forward_delay_timer); } else if (br->stp_enabled == BR_KERNEL_STP) br_set_state(p, BR_STATE_LISTENING); else @@ -483,7 +483,7 @@ void br_port_state_selection(struct net_bridge *br) p->topology_change_ack = 0; br_make_forwarding(p); } else if (br_is_designated_port(p)) { - del_timer(&p->message_age_timer); + timer_delete(&p->message_age_timer); br_make_forwarding(p); } else { p->config_pending = 0; @@ -533,9 +533,9 @@ void br_received_config_bpdu(struct net_bridge_port *p, br_port_state_selection(br); if (!br_is_root_bridge(br) && was_root) { - del_timer(&br->hello_timer); + timer_delete(&br->hello_timer); if (br->topology_change_detected) { - del_timer(&br->topology_change_timer); + timer_delete(&br->topology_change_timer); br_transmit_tcn(br); mod_timer(&br->tcn_timer, diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 75204d36d7f9..c20a41bf253b 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -81,9 +81,9 @@ void br_stp_disable_bridge(struct net_bridge *br) br->topology_change_detected = 0; spin_unlock_bh(&br->lock); - del_timer_sync(&br->hello_timer); - del_timer_sync(&br->topology_change_timer); - del_timer_sync(&br->tcn_timer); + timer_delete_sync(&br->hello_timer); + timer_delete_sync(&br->topology_change_timer); + timer_delete_sync(&br->tcn_timer); cancel_delayed_work_sync(&br->gc_work); } @@ -109,9 +109,9 @@ void 
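Note that br_vlan_set_state() above is no longer a bare WRITE_ONCE(): it now calls br_multicast_update_vlan_mcast_ctx(), which may take multicast_lock. That side effect is presumably also why the earlier br_mst.c hunk moved the pvid-state update ahead of br_vlan_set_state(), so the multicast hook observes a consistent pvid state:

	static inline void br_vlan_set_state(struct net_bridge_vlan *v, u8 state)
	{
		WRITE_ONCE(v->state, state);		/* lock-free readers */
		br_multicast_update_vlan_mcast_ctx(v, state);	/* may lock */
	}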
br_stp_disable_port(struct net_bridge_port *p) br_ifinfo_notify(RTM_NEWLINK, NULL, p); - del_timer(&p->message_age_timer); - del_timer(&p->forward_delay_timer); - del_timer(&p->hold_timer); + timer_delete(&p->message_age_timer); + timer_delete(&p->forward_delay_timer); + timer_delete(&p->hold_timer); if (!rcu_access_pointer(p->backup_port)) br_fdb_delete_by_port(br, p, 0, 0); diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 7b41ee8740cb..95d7355a0407 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -504,9 +504,10 @@ static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *pri struct net_bridge_mdb_entry *mp; struct net_bridge_port *port = data->port; struct net_bridge *br = port->br; + u8 old_flags; - if (err) - goto err; + if (err == -EOPNOTSUPP) + goto out_free; spin_lock_bh(&br->multicast_lock); mp = br_mdb_ip_get(br, &data->ip); @@ -516,11 +517,15 @@ static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *pri pp = &p->next) { if (p->key.port != port) continue; - p->flags |= MDB_PG_FLAGS_OFFLOAD; + + old_flags = p->flags; + br_multicast_set_pg_offload_flags(p, !err); + if (br_mdb_should_notify(br, old_flags ^ p->flags)) + br_mdb_flag_change_notify(br->dev, mp, p); } out: spin_unlock_bh(&br->multicast_lock); -err: +out_free: kfree(priv); } diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index ea733542244c..c1176a5e02c4 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -1002,7 +1002,7 @@ static const struct attribute_group bridge_group = { * Returns the number of bytes read. */ static ssize_t brforward_read(struct file *filp, struct kobject *kobj, - struct bin_attribute *bin_attr, + const struct bin_attribute *bin_attr, char *buf, loff_t off, size_t count) { struct device *dev = kobj_to_dev(kobj); @@ -1023,10 +1023,10 @@ static ssize_t brforward_read(struct file *filp, struct kobject *kobj, return n; } -static struct bin_attribute bridge_forward = { +static const struct bin_attribute bridge_forward = { .attr = { .name = SYSFS_BRIDGE_FDB, .mode = 0444, }, - .read = brforward_read, + .read_new = brforward_read, }; /* diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 89f51ea4cabe..939a3aa78d5c 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -715,8 +715,8 @@ static int br_vlan_add_existing(struct net_bridge *br, u16 flags, bool *changed, struct netlink_ext_ack *extack) { - bool would_change = __vlan_flags_would_change(vlan, flags); bool becomes_brentry = false; + bool would_change = false; int err; if (!br_vlan_is_brentry(vlan)) { @@ -725,6 +725,8 @@ static int br_vlan_add_existing(struct net_bridge *br, return -EINVAL; becomes_brentry = true; + } else { + would_change = __vlan_flags_would_change(vlan, flags); } /* Master VLANs that aren't brentries weren't notified before, @@ -1664,6 +1666,18 @@ static void br_vlan_set_all_vlan_dev_state(struct net_bridge_port *p) } } +static void br_vlan_toggle_bridge_binding(struct net_device *br_dev, + bool enable) +{ + struct net_bridge *br = netdev_priv(br_dev); + + if (enable) + br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true); + else + br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, + br_vlan_has_upper_bind_vlan_dev(br_dev)); +} + static void br_vlan_upper_change(struct net_device *dev, struct net_device *upper_dev, bool linking) @@ -1673,13 +1687,9 @@ static void br_vlan_upper_change(struct net_device *dev, if (!br_vlan_is_bind_vlan_dev(upper_dev)) return; - if (linking) { + 
br_vlan_toggle_bridge_binding(dev, linking); + if (linking) br_vlan_set_vlan_dev_state(br, upper_dev); - br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true); - } else { - br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, - br_vlan_has_upper_bind_vlan_dev(dev)); - } } struct br_vlan_link_state_walk_data { @@ -1764,6 +1774,30 @@ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr) return ret; } +void br_vlan_vlan_upper_event(struct net_device *br_dev, + struct net_device *vlan_dev, + unsigned long event) +{ + struct vlan_dev_priv *vlan = vlan_dev_priv(vlan_dev); + struct net_bridge *br = netdev_priv(br_dev); + bool bridge_binding; + + switch (event) { + case NETDEV_CHANGE: + case NETDEV_UP: + break; + default: + return; + } + + bridge_binding = vlan->flags & VLAN_FLAG_BRIDGE_BINDING; + br_vlan_toggle_bridge_binding(br_dev, bridge_binding); + if (bridge_binding) + br_vlan_set_vlan_dev_state(br, vlan_dev); + else if (!bridge_binding && netif_carrier_ok(br_dev)) + netif_carrier_on(vlan_dev); +} + /* Must be protected by RTNL. */ void br_vlan_port_event(struct net_bridge_port *p, unsigned long event) { diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 816bb0fde718..6482de4d8750 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -60,19 +60,19 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, struct ip_fraglist_iter iter; struct sk_buff *frag; - if (first_len - hlen > mtu || - skb_headroom(skb) < ll_rs) + if (first_len - hlen > mtu) goto blackhole; - if (skb_cloned(skb)) + if (skb_cloned(skb) || + skb_headroom(skb) < ll_rs) goto slow_path; skb_walk_frags(skb, frag) { - if (frag->len > mtu || - skb_headroom(frag) < hlen + ll_rs) + if (frag->len > mtu) goto blackhole; - if (skb_shared(frag)) + if (skb_shared(frag) || + skb_headroom(frag) < hlen + ll_rs) goto slow_path; } diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 94ad09e36df2..fa6a3c2634a8 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -438,10 +438,11 @@ static void caif_netlink_parms(struct nlattr *data[], } } -static int ipcaif_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipcaif_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; int ret; struct chnl_net *caifdev; ASSERT_RTNL(); diff --git a/net/can/af_can.c b/net/can/af_can.c index 01f3fbb3b67d..4aab7033c933 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -172,6 +172,8 @@ static int can_create(struct net *net, struct socket *sock, int protocol, sock_orphan(sk); sock_put(sk); sock->sk = NULL; + } else { + sock_prot_inuse_add(net, sk->sk_prot, 1); } errout: @@ -287,8 +289,8 @@ int can_send(struct sk_buff *skb, int loop) netif_rx(newskb); /* update statistics */ - pkg_stats->tx_frames++; - pkg_stats->tx_frames_delta++; + atomic_long_inc(&pkg_stats->tx_frames); + atomic_long_inc(&pkg_stats->tx_frames_delta); return 0; @@ -647,8 +649,8 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) int matches; /* update statistics */ - pkg_stats->rx_frames++; - pkg_stats->rx_frames_delta++; + atomic_long_inc(&pkg_stats->rx_frames); + atomic_long_inc(&pkg_stats->rx_frames_delta); /* create non-zero unique skb identifier together with *skb */ while (!(can_skb_prv(skb)->skbcnt)) @@ -669,8 +671,8 @@ static void can_receive(struct sk_buff *skb, struct 
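The nf_conntrack_bridge hunk above reclassifies headroom shortage when refragmenting: too little headroom used to blackhole the packet, while now it falls back to the slow path, which can reallocate; only a genuinely oversized fragment is still dropped:

	if (first_len - hlen > mtu)
		goto blackhole;		/* cannot fit the MTU: drop */
	if (skb_cloned(skb) || skb_headroom(skb) < ll_rs)
		goto slow_path;		/* recoverable: copy, then fragment */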
net_device *dev) consume_skb(skb); if (matches > 0) { - pkg_stats->matches++; - pkg_stats->matches_delta++; + atomic_long_inc(&pkg_stats->matches); + atomic_long_inc(&pkg_stats->matches_delta); } } @@ -823,7 +825,7 @@ static void can_pernet_exit(struct net *net) if (IS_ENABLED(CONFIG_PROC_FS)) { can_remove_proc(net); if (stats_timer) - del_timer_sync(&net->can.stattimer); + timer_delete_sync(&net->can.stattimer); } kfree(net->can.rx_alldev_list); diff --git a/net/can/af_can.h b/net/can/af_can.h index 7c2d9161e224..22f3352c77fe 100644 --- a/net/can/af_can.h +++ b/net/can/af_can.h @@ -66,9 +66,9 @@ struct receiver { struct can_pkg_stats { unsigned long jiffies_init; - unsigned long rx_frames; - unsigned long tx_frames; - unsigned long matches; + atomic_long_t rx_frames; + atomic_long_t tx_frames; + atomic_long_t matches; unsigned long total_rx_rate; unsigned long total_tx_rate; @@ -82,9 +82,9 @@ struct can_pkg_stats { unsigned long max_tx_rate; unsigned long max_rx_match_ratio; - unsigned long rx_frames_delta; - unsigned long tx_frames_delta; - unsigned long matches_delta; + atomic_long_t rx_frames_delta; + atomic_long_t tx_frames_delta; + atomic_long_t matches_delta; }; /* persistent statistics */ diff --git a/net/can/bcm.c b/net/can/bcm.c index 217049fa496e..6bc1cc4c94c5 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -58,6 +58,7 @@ #include <linux/can/skb.h> #include <linux/can/bcm.h> #include <linux/slab.h> +#include <linux/spinlock.h> #include <net/sock.h> #include <net/net_namespace.h> @@ -122,6 +123,7 @@ struct bcm_op { struct canfd_frame last_sframe; struct sock *sk; struct net_device *rx_reg_dev; + spinlock_t bcm_tx_lock; /* protect currframe/count in runtime updates */ }; struct bcm_sock { @@ -217,7 +219,9 @@ static int bcm_proc_show(struct seq_file *m, void *v) seq_printf(m, " / bound %s", bcm_proc_getifname(net, ifname, bo->ifindex)); seq_printf(m, " <<<\n"); - list_for_each_entry(op, &bo->rx_ops, list) { + rcu_read_lock(); + + list_for_each_entry_rcu(op, &bo->rx_ops, list) { unsigned long reduction; @@ -273,6 +277,9 @@ static int bcm_proc_show(struct seq_file *m, void *v) seq_printf(m, "# sent %ld\n", op->frames_abs); } seq_putc(m, '\n'); + + rcu_read_unlock(); + return 0; } #endif /* CONFIG_PROC_FS */ @@ -285,13 +292,18 @@ static void bcm_can_tx(struct bcm_op *op) { struct sk_buff *skb; struct net_device *dev; - struct canfd_frame *cf = op->frames + op->cfsiz * op->currframe; + struct canfd_frame *cf; int err; /* no target device? => exit */ if (!op->ifindex) return; + /* read currframe under lock protection */ + spin_lock_bh(&op->bcm_tx_lock); + cf = op->frames + op->cfsiz * op->currframe; + spin_unlock_bh(&op->bcm_tx_lock); + dev = dev_get_by_index(sock_net(op->sk), op->ifindex); if (!dev) { /* RFC: should this bcm_op remove itself here? */ @@ -312,6 +324,10 @@ static void bcm_can_tx(struct bcm_op *op) skb->dev = dev; can_skb_set_owner(skb, op->sk); err = can_send(skb, 1); + + /* update currframe and count under lock protection */ + spin_lock_bh(&op->bcm_tx_lock); + if (!err) op->frames_abs++; @@ -320,6 +336,11 @@ static void bcm_can_tx(struct bcm_op *op) /* reached last frame? 
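The af_can statistics conversion above makes the per-net counters atomic_long_t so the concurrent RX/TX softirq paths can bump them locklessly; the once-per-second timer then computes rates from consistent snapshots, and (in the can/proc.c hunk further down) its overflow guards switch from ULONG_MAX to LONG_MAX because atomic_long_read() returns a signed long:

	/* hot path: lockless increments */
	atomic_long_inc(&pkg_stats->rx_frames);
	atomic_long_inc(&pkg_stats->rx_frames_delta);

	/* stat timer: snapshot once, compute from the signed locals */
	long rx_frames = atomic_long_read(&pkg_stats->rx_frames);

	if (rx_frames > (LONG_MAX / HZ))	/* was: ULONG_MAX / HZ */
		can_init_stats(net);		/* restart counting */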
*/ if (op->currframe >= op->nframes) op->currframe = 0; + + if (op->count > 0) + op->count--; + + spin_unlock_bh(&op->bcm_tx_lock); out: dev_put(dev); } @@ -430,7 +451,7 @@ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) struct bcm_msg_head msg_head; if (op->kt_ival1 && (op->count > 0)) { - op->count--; + bcm_can_tx(op); if (!op->count && (op->flags & TX_COUNTEVT)) { /* create notification to user */ @@ -445,7 +466,6 @@ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) bcm_send_to_user(op, &msg_head, NULL, 0); } - bcm_can_tx(op); } else if (op->kt_ival2) { bcm_can_tx(op); @@ -843,7 +863,7 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, REGMASK(op->can_id), bcm_rx_handler, op); - list_del(&op->list); + list_del_rcu(&op->list); bcm_remove_op(op); return 1; /* done */ } @@ -863,7 +883,7 @@ static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh, list_for_each_entry_safe(op, n, ops, list) { if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { - list_del(&op->list); + list_del_rcu(&op->list); bcm_remove_op(op); return 1; /* done */ } @@ -956,6 +976,27 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, } op->flags = msg_head->flags; + /* only lock for unlikely count/nframes/currframe changes */ + if (op->nframes != msg_head->nframes || + op->flags & TX_RESET_MULTI_IDX || + op->flags & SETTIMER) { + + spin_lock_bh(&op->bcm_tx_lock); + + if (op->nframes != msg_head->nframes || + op->flags & TX_RESET_MULTI_IDX) { + /* potentially update changed nframes */ + op->nframes = msg_head->nframes; + /* restart multiple frame transmission */ + op->currframe = 0; + } + + if (op->flags & SETTIMER) + op->count = msg_head->count; + + spin_unlock_bh(&op->bcm_tx_lock); + } + } else { /* insert new BCM operation for the given can_id */ @@ -963,9 +1004,14 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (!op) return -ENOMEM; + spin_lock_init(&op->bcm_tx_lock); op->can_id = msg_head->can_id; op->cfsiz = CFSIZ(msg_head->flags); op->flags = msg_head->flags; + op->nframes = msg_head->nframes; + + if (op->flags & SETTIMER) + op->count = msg_head->count; /* create array for CAN frames and copy the data */ if (msg_head->nframes > 1) { @@ -1011,35 +1057,20 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ - hrtimer_init(&op->timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); - op->timer.function = bcm_tx_timeout_handler; + hrtimer_setup(&op->timer, bcm_tx_timeout_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); /* currently unused in tx_ops */ - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); + hrtimer_setup(&op->thrtimer, hrtimer_dummy_timeout, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); /* add this bcm_op to the list of the tx_ops */ list_add(&op->list, &bo->tx_ops); } /* if ((op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex))) */ - if (op->nframes != msg_head->nframes) { - op->nframes = msg_head->nframes; - /* start multiple frame transmission with index 0 */ - op->currframe = 0; - } - - /* check flags */ - - if (op->flags & TX_RESET_MULTI_IDX) { - /* start multiple frame transmission with index 0 */ - op->currframe = 0; - } - if (op->flags & SETTIMER) { /* set timer values */ - op->count = msg_head->count; op->ival1 = msg_head->ival1; op->ival2 = 
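Two patterns recur in the bcm.c hunks above. First, currframe/count are now read and updated under the new op->bcm_tx_lock, since a concurrent sendmsg() may change nframes/currframe while bcm_can_tx() runs. Second, the hrtimer conversions (here and in isotp/j1939 below) collapse the old two-step init into a single call:

	/* was:
	 *	hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
	 *	op->timer.function = bcm_tx_timeout_handler;
	 */
	hrtimer_setup(&op->timer, bcm_tx_timeout_handler, CLOCK_MONOTONIC,
		      HRTIMER_MODE_REL_SOFT);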
msg_head->ival2; op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1); @@ -1056,11 +1087,8 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->flags |= TX_ANNOUNCE; } - if (op->flags & TX_ANNOUNCE) { + if (op->flags & TX_ANNOUNCE) bcm_can_tx(op); - if (op->count) - op->count--; - } if (op->flags & STARTTIMER) bcm_tx_start_timer(op); @@ -1192,13 +1220,10 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->rx_ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ - hrtimer_init(&op->timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); - op->timer.function = bcm_rx_timeout_handler; - - hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); - op->thrtimer.function = bcm_rx_thr_handler; + hrtimer_setup(&op->timer, bcm_rx_timeout_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); + hrtimer_setup(&op->thrtimer, bcm_rx_thr_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); /* add this bcm_op to the list of the rx_ops */ list_add(&op->list, &bo->rx_ops); @@ -1276,7 +1301,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, bcm_rx_handler, op, "bcm", sk); if (err) { /* this bcm rx op is broken -> remove it */ - list_del(&op->list); + list_del_rcu(&op->list); bcm_remove_op(op); return err; } @@ -1625,6 +1650,7 @@ static int bcm_release(struct socket *sock) sock->sk = NULL; release_sock(sk); + sock_prot_inuse_add(net, sk->sk_prot, -1); sock_put(sk); return 0; diff --git a/net/can/gw.c b/net/can/gw.c index ef93293c1fae..55eccb1c7620 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -130,7 +130,7 @@ struct cgw_job { u32 handled_frames; u32 dropped_frames; u32 deleted_frames; - struct cf_mod mod; + struct cf_mod __rcu *cf_mod; union { /* CAN frame data source */ struct net_device *dev; @@ -459,6 +459,7 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) struct cgw_job *gwj = (struct cgw_job *)data; struct canfd_frame *cf; struct sk_buff *nskb; + struct cf_mod *mod; int modidx = 0; /* process strictly Classic CAN or CAN FD frames */ @@ -506,7 +507,8 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) * When there is at least one modification function activated, * we need to copy the skb as we want to modify skb->data. */ - if (gwj->mod.modfunc[0]) + mod = rcu_dereference(gwj->cf_mod); + if (mod->modfunc[0]) nskb = skb_copy(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); @@ -529,8 +531,8 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) cf = (struct canfd_frame *)nskb->data; /* perform preprocessed modification functions if there are any */ - while (modidx < MAX_MODFUNCTIONS && gwj->mod.modfunc[modidx]) - (*gwj->mod.modfunc[modidx++])(cf, &gwj->mod); + while (modidx < MAX_MODFUNCTIONS && mod->modfunc[modidx]) + (*mod->modfunc[modidx++])(cf, mod); /* Has the CAN frame been modified? 
*/ if (modidx) { @@ -546,11 +548,11 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) } /* check for checksum updates */ - if (gwj->mod.csumfunc.crc8) - (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8); + if (mod->csumfunc.crc8) + (*mod->csumfunc.crc8)(cf, &mod->csum.crc8); - if (gwj->mod.csumfunc.xor) - (*gwj->mod.csumfunc.xor)(cf, &gwj->mod.csum.xor); + if (mod->csumfunc.xor) + (*mod->csumfunc.xor)(cf, &mod->csum.xor); } /* clear the skb timestamp if not configured the other way */ @@ -581,9 +583,20 @@ static void cgw_job_free_rcu(struct rcu_head *rcu_head) { struct cgw_job *gwj = container_of(rcu_head, struct cgw_job, rcu); + /* cgw_job::cf_mod is always accessed from the same cgw_job object within + * the same RCU read section. Once cgw_job is scheduled for removal, + * cf_mod can also be removed without mandating an additional grace period. + */ + kfree(rcu_access_pointer(gwj->cf_mod)); kmem_cache_free(cgw_cache, gwj); } +/* Return cgw_job::cf_mod with RTNL protected section */ +static struct cf_mod *cgw_job_cf_mod(struct cgw_job *gwj) +{ + return rcu_dereference_protected(gwj->cf_mod, rtnl_is_locked()); +} + static int cgw_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { @@ -616,6 +629,7 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, { struct rtcanmsg *rtcan; struct nlmsghdr *nlh; + struct cf_mod *mod; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtcan), flags); if (!nlh) @@ -650,82 +664,83 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, goto cancel; } + mod = cgw_job_cf_mod(gwj); if (gwj->flags & CGW_FLAGS_CAN_FD) { struct cgw_fdframe_mod mb; - if (gwj->mod.modtype.and) { - memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.and; + if (mod->modtype.and) { + memcpy(&mb.cf, &mod->modframe.and, sizeof(mb.cf)); + mb.modtype = mod->modtype.and; if (nla_put(skb, CGW_FDMOD_AND, sizeof(mb), &mb) < 0) goto cancel; } - if (gwj->mod.modtype.or) { - memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.or; + if (mod->modtype.or) { + memcpy(&mb.cf, &mod->modframe.or, sizeof(mb.cf)); + mb.modtype = mod->modtype.or; if (nla_put(skb, CGW_FDMOD_OR, sizeof(mb), &mb) < 0) goto cancel; } - if (gwj->mod.modtype.xor) { - memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.xor; + if (mod->modtype.xor) { + memcpy(&mb.cf, &mod->modframe.xor, sizeof(mb.cf)); + mb.modtype = mod->modtype.xor; if (nla_put(skb, CGW_FDMOD_XOR, sizeof(mb), &mb) < 0) goto cancel; } - if (gwj->mod.modtype.set) { - memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.set; + if (mod->modtype.set) { + memcpy(&mb.cf, &mod->modframe.set, sizeof(mb.cf)); + mb.modtype = mod->modtype.set; if (nla_put(skb, CGW_FDMOD_SET, sizeof(mb), &mb) < 0) goto cancel; } } else { struct cgw_frame_mod mb; - if (gwj->mod.modtype.and) { - memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.and; + if (mod->modtype.and) { + memcpy(&mb.cf, &mod->modframe.and, sizeof(mb.cf)); + mb.modtype = mod->modtype.and; if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0) goto cancel; } - if (gwj->mod.modtype.or) { - memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.or; + if (mod->modtype.or) { + memcpy(&mb.cf, &mod->modframe.or, sizeof(mb.cf)); + mb.modtype = mod->modtype.or; if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0) goto cancel; } - if (gwj->mod.modtype.xor) { - 
memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.xor; + if (mod->modtype.xor) { + memcpy(&mb.cf, &mod->modframe.xor, sizeof(mb.cf)); + mb.modtype = mod->modtype.xor; if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0) goto cancel; } - if (gwj->mod.modtype.set) { - memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.set; + if (mod->modtype.set) { + memcpy(&mb.cf, &mod->modframe.set, sizeof(mb.cf)); + mb.modtype = mod->modtype.set; if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0) goto cancel; } } - if (gwj->mod.uid) { - if (nla_put_u32(skb, CGW_MOD_UID, gwj->mod.uid) < 0) + if (mod->uid) { + if (nla_put_u32(skb, CGW_MOD_UID, mod->uid) < 0) goto cancel; } - if (gwj->mod.csumfunc.crc8) { + if (mod->csumfunc.crc8) { if (nla_put(skb, CGW_CS_CRC8, CGW_CS_CRC8_LEN, - &gwj->mod.csum.crc8) < 0) + &mod->csum.crc8) < 0) goto cancel; } - if (gwj->mod.csumfunc.xor) { + if (mod->csumfunc.xor) { if (nla_put(skb, CGW_CS_XOR, CGW_CS_XOR_LEN, - &gwj->mod.csum.xor) < 0) + &mod->csum.xor) < 0) goto cancel; } @@ -1059,7 +1074,7 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct rtcanmsg *r; struct cgw_job *gwj; - struct cf_mod mod; + struct cf_mod *mod; struct can_can_gw ccgw; u8 limhops = 0; int err = 0; @@ -1078,37 +1093,48 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; - err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); + mod = kmalloc(sizeof(*mod), GFP_KERNEL); + if (!mod) + return -ENOMEM; + + err = cgw_parse_attr(nlh, mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); if (err < 0) - return err; + goto out_free_cf; - if (mod.uid) { + if (mod->uid) { ASSERT_RTNL(); /* check for updating an existing job with identical uid */ hlist_for_each_entry(gwj, &net->can.cgw_list, list) { - if (gwj->mod.uid != mod.uid) + struct cf_mod *old_cf; + + old_cf = cgw_job_cf_mod(gwj); + if (old_cf->uid != mod->uid) continue; /* interfaces & filters must be identical */ - if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) - return -EINVAL; + if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) { + err = -EINVAL; + goto out_free_cf; + } - /* update modifications with disabled softirq & quit */ - local_bh_disable(); - memcpy(&gwj->mod, &mod, sizeof(mod)); - local_bh_enable(); + rcu_assign_pointer(gwj->cf_mod, mod); + kfree_rcu_mightsleep(old_cf); return 0; } } /* ifindex == 0 is not allowed for job creation */ - if (!ccgw.src_idx || !ccgw.dst_idx) - return -ENODEV; + if (!ccgw.src_idx || !ccgw.dst_idx) { + err = -ENODEV; + goto out_free_cf; + } gwj = kmem_cache_alloc(cgw_cache, GFP_KERNEL); - if (!gwj) - return -ENOMEM; + if (!gwj) { + err = -ENOMEM; + goto out_free_cf; + } gwj->handled_frames = 0; gwj->dropped_frames = 0; @@ -1118,7 +1144,7 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, gwj->limit_hops = limhops; /* insert already parsed information */ - memcpy(&gwj->mod, &mod, sizeof(mod)); + RCU_INIT_POINTER(gwj->cf_mod, mod); memcpy(&gwj->ccgw, &ccgw, sizeof(ccgw)); err = -ENODEV; @@ -1152,9 +1178,11 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, if (!err) hlist_add_head_rcu(&gwj->list, &net->can.cgw_list); out: - if (err) + if (err) { kmem_cache_free(cgw_cache, gwj); - +out_free_cf: + kfree(mod); + } return err; } @@ -1214,19 +1242,22 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, /* remove only the first matching entry */ 
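The can/gw conversion above turns the embedded struct cf_mod into an RCU-managed pointer, so a gateway rule's modification set can be replaced while frames are in flight instead of being patched in place with softirqs disabled. The pattern, as applied here:

	/* reader (frame hot path, within an RCU read-side section): */
	mod = rcu_dereference(gwj->cf_mod);

	/* updater (RTNL held): publish the new set, free the old one
	 * only after a grace period
	 */
	rcu_assign_pointer(gwj->cf_mod, new_mod);
	kfree_rcu_mightsleep(old_mod);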
hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { + struct cf_mod *cf_mod; + if (gwj->flags != r->flags) continue; if (gwj->limit_hops != limhops) continue; + cf_mod = cgw_job_cf_mod(gwj); /* we have a match when uid is enabled and identical */ - if (gwj->mod.uid || mod.uid) { - if (gwj->mod.uid != mod.uid) + if (cf_mod->uid || mod.uid) { + if (cf_mod->uid != mod.uid) continue; } else { /* no uid => check for identical modifications */ - if (memcmp(&gwj->mod, &mod, sizeof(mod))) + if (memcmp(cf_mod, &mod, sizeof(mod))) continue; } diff --git a/net/can/isotp.c b/net/can/isotp.c index 16046931542a..1efa377f002e 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1239,6 +1239,7 @@ static int isotp_release(struct socket *sock) sock->sk = NULL; release_sock(sk); + sock_prot_inuse_add(net, sk->sk_prot, -1); sock_put(sk); return 0; @@ -1634,12 +1635,10 @@ static int isotp_init(struct sock *sk) so->rx.buflen = ARRAY_SIZE(so->rx.sbuf); so->tx.buflen = ARRAY_SIZE(so->tx.sbuf); - hrtimer_init(&so->rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - so->rxtimer.function = isotp_rx_timer_handler; - hrtimer_init(&so->txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - so->txtimer.function = isotp_tx_timer_handler; - hrtimer_init(&so->txfrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - so->txfrtimer.function = isotp_txfr_timer_handler; + hrtimer_setup(&so->rxtimer, isotp_rx_timer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + hrtimer_setup(&so->txtimer, isotp_tx_timer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + hrtimer_setup(&so->txfrtimer, isotp_txfr_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); init_waitqueue_head(&so->wait); spin_lock_init(&so->rx_lock); diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c index 486687901602..39844f14eed8 100644 --- a/net/can/j1939/bus.c +++ b/net/can/j1939/bus.c @@ -158,8 +158,8 @@ struct j1939_ecu *j1939_ecu_create_locked(struct j1939_priv *priv, name_t name) ecu->addr = J1939_IDLE_ADDR; ecu->name = name; - hrtimer_init(&ecu->ac_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); - ecu->ac_timer.function = j1939_ecu_timer_handler; + hrtimer_setup(&ecu->ac_timer, j1939_ecu_timer_handler, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_SOFT); INIT_LIST_HEAD(&ecu->list); j1939_priv_get(priv); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 305dd72c844c..6fefe7a68761 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -655,6 +655,7 @@ static int j1939_sk_release(struct socket *sock) sock->sk = NULL; release_sock(sk); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); sock_put(sk); return 0; @@ -1132,7 +1133,7 @@ static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, todo_size = size; - while (todo_size) { + do { struct j1939_sk_buff_cb *skcb; segment_size = min_t(size_t, J1939_MAX_TP_PACKET_SIZE, @@ -1177,7 +1178,7 @@ static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, todo_size -= segment_size; session->total_queued_size += segment_size; - } + } while (todo_size); switch (ret) { case 0: /* OK */ diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 95f7a7e65a73..fbf5c8001c9d 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -382,8 +382,9 @@ sk_buff *j1939_session_skb_get_by_offset(struct j1939_session *session, skb_queue_walk(&session->skb_queue, do_skb) { do_skcb = j1939_skb_to_cb(do_skb); - if (offset_start >= do_skcb->offset && - offset_start < (do_skcb->offset + do_skb->len)) { + if ((offset_start >= do_skcb->offset && + 
offset_start < (do_skcb->offset + do_skb->len)) || + (offset_start == 0 && do_skcb->offset == 0 && do_skb->len == 0)) { skb = do_skb; } } @@ -1510,12 +1511,8 @@ static struct j1939_session *j1939_session_new(struct j1939_priv *priv, skcb = j1939_skb_to_cb(skb); memcpy(&session->skcb, skcb, sizeof(session->skcb)); - hrtimer_init(&session->txtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); - session->txtimer.function = j1939_tp_txtimer; - hrtimer_init(&session->rxtimer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_SOFT); - session->rxtimer.function = j1939_tp_rxtimer; + hrtimer_setup(&session->txtimer, j1939_tp_txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + hrtimer_setup(&session->rxtimer, j1939_tp_rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); netdev_dbg(priv->ndev, "%s: 0x%p: sa: %02x, da: %02x\n", __func__, session, skcb->addr.sa, skcb->addr.da); diff --git a/net/can/proc.c b/net/can/proc.c index bbce97825f13..25fdf060e30d 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -118,6 +118,13 @@ void can_stat_update(struct timer_list *t) struct can_pkg_stats *pkg_stats = net->can.pkg_stats; unsigned long j = jiffies; /* snapshot */ + long rx_frames = atomic_long_read(&pkg_stats->rx_frames); + long tx_frames = atomic_long_read(&pkg_stats->tx_frames); + long matches = atomic_long_read(&pkg_stats->matches); + long rx_frames_delta = atomic_long_read(&pkg_stats->rx_frames_delta); + long tx_frames_delta = atomic_long_read(&pkg_stats->tx_frames_delta); + long matches_delta = atomic_long_read(&pkg_stats->matches_delta); + /* restart counting in timer context on user request */ if (user_reset) can_init_stats(net); @@ -127,35 +134,33 @@ void can_stat_update(struct timer_list *t) can_init_stats(net); /* prevent overflow in calc_rate() */ - if (pkg_stats->rx_frames > (ULONG_MAX / HZ)) + if (rx_frames > (LONG_MAX / HZ)) can_init_stats(net); /* prevent overflow in calc_rate() */ - if (pkg_stats->tx_frames > (ULONG_MAX / HZ)) + if (tx_frames > (LONG_MAX / HZ)) can_init_stats(net); /* matches overflow - very improbable */ - if (pkg_stats->matches > (ULONG_MAX / 100)) + if (matches > (LONG_MAX / 100)) can_init_stats(net); /* calc total values */ - if (pkg_stats->rx_frames) - pkg_stats->total_rx_match_ratio = (pkg_stats->matches * 100) / - pkg_stats->rx_frames; + if (rx_frames) + pkg_stats->total_rx_match_ratio = (matches * 100) / rx_frames; pkg_stats->total_tx_rate = calc_rate(pkg_stats->jiffies_init, j, - pkg_stats->tx_frames); + tx_frames); pkg_stats->total_rx_rate = calc_rate(pkg_stats->jiffies_init, j, - pkg_stats->rx_frames); + rx_frames); /* calc current values */ - if (pkg_stats->rx_frames_delta) + if (rx_frames_delta) pkg_stats->current_rx_match_ratio = - (pkg_stats->matches_delta * 100) / - pkg_stats->rx_frames_delta; + (matches_delta * 100) / rx_frames_delta; - pkg_stats->current_tx_rate = calc_rate(0, HZ, pkg_stats->tx_frames_delta); - pkg_stats->current_rx_rate = calc_rate(0, HZ, pkg_stats->rx_frames_delta); + pkg_stats->current_tx_rate = calc_rate(0, HZ, tx_frames_delta); + pkg_stats->current_rx_rate = calc_rate(0, HZ, rx_frames_delta); /* check / update maximum values */ if (pkg_stats->max_tx_rate < pkg_stats->current_tx_rate) @@ -168,9 +173,9 @@ void can_stat_update(struct timer_list *t) pkg_stats->max_rx_match_ratio = pkg_stats->current_rx_match_ratio; /* clear values for 'current rate' calculation */ - pkg_stats->tx_frames_delta = 0; - pkg_stats->rx_frames_delta = 0; - pkg_stats->matches_delta = 0; + atomic_long_set(&pkg_stats->tx_frames_delta, 0); + 
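The two j1939 hunks above (the send loop's while turned into do/while, and the transport-side skb lookup) together appear to make zero-length datagrams work end to end: the send loop now queues at least one, possibly empty, segment, and the lookup accepts an empty skb at offset 0:

	/* also match a zero-length skb queued at offset 0 */
	if ((offset_start >= do_skcb->offset &&
	     offset_start < (do_skcb->offset + do_skb->len)) ||
	    (offset_start == 0 && do_skcb->offset == 0 && do_skb->len == 0))
		skb = do_skb;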
atomic_long_set(&pkg_stats->rx_frames_delta, 0); + atomic_long_set(&pkg_stats->matches_delta, 0); /* restart timer (one second) */ mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ)); @@ -214,9 +219,12 @@ static int can_stats_proc_show(struct seq_file *m, void *v) struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats; seq_putc(m, '\n'); - seq_printf(m, " %8ld transmitted frames (TXF)\n", pkg_stats->tx_frames); - seq_printf(m, " %8ld received frames (RXF)\n", pkg_stats->rx_frames); - seq_printf(m, " %8ld matched frames (RXMF)\n", pkg_stats->matches); + seq_printf(m, " %8ld transmitted frames (TXF)\n", + atomic_long_read(&pkg_stats->tx_frames)); + seq_printf(m, " %8ld received frames (RXF)\n", + atomic_long_read(&pkg_stats->rx_frames)); + seq_printf(m, " %8ld matched frames (RXMF)\n", + atomic_long_read(&pkg_stats->matches)); seq_putc(m, '\n'); diff --git a/net/can/raw.c b/net/can/raw.c index 255c0a8f39d6..020f21430b1d 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -397,11 +397,13 @@ static int raw_release(struct socket *sock) { struct sock *sk = sock->sk; struct raw_sock *ro; + struct net *net; if (!sk) return 0; ro = raw_sk(sk); + net = sock_net(sk); spin_lock(&raw_notifier_lock); while (raw_busy_notifier == ro) { @@ -421,7 +423,7 @@ static int raw_release(struct socket *sock) raw_disable_allfilters(dev_net(ro->dev), ro->dev, sk); netdev_put(ro->dev, &ro->dev_tracker); } else { - raw_disable_allfilters(sock_net(sk), NULL, sk); + raw_disable_allfilters(net, NULL, sk); } } @@ -440,6 +442,7 @@ static int raw_release(struct socket *sock) release_sock(sk); rtnl_unlock(); + sock_prot_inuse_add(net, sk->sk_prot, -1); sock_put(sk); return 0; @@ -962,8 +965,8 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) } skb->dev = dev; - skb->priority = READ_ONCE(sk->sk_priority); - skb->mark = READ_ONCE(sk->sk_mark); + skb->priority = sockc.priority; + skb->mark = sockc.mark; skb->tstamp = sockc.transmit_time; skb_setup_tx_timestamp(skb, &sockc); diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index c5c4eef3a9ff..0aa21fcbf6ec 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -2,7 +2,7 @@ config CEPH_LIB tristate "Ceph core library" depends on INET - select LIBCRC32C + select CRC32 select CRYPTO_AES select CRYPTO_CBC select CRYPTO_GCM diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b24afec24138..6664ea73ccf8 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -220,16 +220,6 @@ void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages); -void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req, - unsigned int which, struct ceph_pagelist *pagelist) -{ - struct ceph_osd_data *osd_data; - - osd_data = osd_req_op_data(osd_req, which, extent, osd_data); - ceph_osd_data_pagelist_init(osd_data, pagelist); -} -EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist); - #ifdef CONFIG_BLOCK void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, unsigned int which, @@ -297,19 +287,6 @@ static void osd_req_op_cls_request_info_pagelist( ceph_osd_data_pagelist_init(osd_data, pagelist); } -void osd_req_op_cls_request_data_pagelist( - struct ceph_osd_request *osd_req, - unsigned int which, struct ceph_pagelist *pagelist) -{ - struct ceph_osd_data *osd_data; - - osd_data = osd_req_op_data(osd_req, which, cls, request_data); - ceph_osd_data_pagelist_init(osd_data, pagelist); - osd_req->r_ops[which].cls.indata_len += pagelist->length; - 
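raw_sendmsg() above now takes priority and mark from the sockcm cookie rather than reading the socket fields directly, so per-message control messages (where permitted) can override the socket defaults; absent any cmsg, the cookie is expected to carry the socket's own values, leaving behavior unchanged:

	skb->priority = sockc.priority;	/* was: READ_ONCE(sk->sk_priority) */
	skb->mark = sockc.mark;		/* was: READ_ONCE(sk->sk_mark) */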
osd_req->r_ops[which].indata_len += pagelist->length; -} -EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); - void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req, unsigned int which, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages) diff --git a/net/core/Makefile b/net/core/Makefile index d9326600e289..b2a76ce33932 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -9,7 +9,7 @@ obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o -obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ +obj-y += dev.o dev_api.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ fib_notifier.o xdp.o flow_offload.o gro.o \ @@ -45,5 +45,5 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o obj-$(CONFIG_OF) += of_net.o obj-$(CONFIG_NET_TEST) += net_test.o obj-$(CONFIG_NET_DEVMEM) += devmem.o -obj-$(CONFIG_DEBUG_NET_SMALL_RTNL) += rtnl_net_debug.o +obj-$(CONFIG_DEBUG_NET) += lock_debug.o obj-$(CONFIG_FAIL_SKB_REALLOC) += skb_fault_injection.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 2f4ed83a75ae..2e538399757f 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -50,15 +50,16 @@ void bpf_sk_storage_free(struct sock *sk) { struct bpf_local_storage *sk_storage; + migrate_disable(); rcu_read_lock(); sk_storage = rcu_dereference(sk->sk_bpf_storage); - if (!sk_storage) { - rcu_read_unlock(); - return; - } + if (!sk_storage) + goto out; bpf_local_storage_destroy(sk_storage); +out: rcu_read_unlock(); + migrate_enable(); } static void bpf_sk_storage_map_free(struct bpf_map *map) @@ -160,6 +161,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); + migrate_disable(); rcu_read_lock(); sk_storage = rcu_dereference(sk->sk_bpf_storage); @@ -212,6 +214,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) out: rcu_read_unlock(); + migrate_enable(); /* In case of an error, don't free anything explicitly here, the * caller is responsible to call bpf_sk_storage_free. 
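The bpf_sk_storage_free() and bpf_sk_storage_clone() hunks wrap the existing RCU section in migrate_disable()/migrate_enable() and, as a consequence, turn the early return into a goto: once two paired sections nest, a single exit path is the simplest way to guarantee both are left in order. The shape as a self-contained sketch, with stand-in enter/exit pairs:

    #include <stdbool.h>

    static void enter_a(void) { }   /* migrate_disable() in the hunk */
    static void exit_a(void)  { }   /* migrate_enable() */
    static void enter_b(void) { }   /* rcu_read_lock() */
    static void exit_b(void)  { }   /* rcu_read_unlock() */

    static bool lookup(void) { return false; }  /* storage may be absent */

    static void storage_free(void)
    {
        enter_a();
        enter_b();
        if (!lookup())
            goto out;               /* was "rcu_read_unlock(); return;" */
        /* ... destroy the storage ... */
    out:
        exit_b();                   /* both sections always exit, in order */
        exit_a();
    }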
@@ -352,11 +355,6 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) { - const struct btf *btf_vmlinux; - const struct btf_type *t; - const char *tname; - u32 btf_id; - if (prog->aux->dst_prog) return false; @@ -371,13 +369,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) return true; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: - btf_vmlinux = bpf_get_btf_vmlinux(); - if (IS_ERR_OR_NULL(btf_vmlinux)) - return false; - btf_id = prog->aux->attach_btf_id; - t = btf_type_by_id(btf_vmlinux, btf_id); - tname = btf_name_by_offset(btf_vmlinux, t->name_off); - return !!strncmp(tname, "bpf_sk_storage", + return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage", strlen("bpf_sk_storage")); default: return false; diff --git a/net/core/datagram.c b/net/core/datagram.c index f0693707aece..94cc4705e91d 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -52,6 +52,7 @@ #include <linux/pagemap.h> #include <linux/iov_iter.h> #include <linux/indirect_call_wrapper.h> +#include <linux/crc32.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -61,7 +62,8 @@ #include <net/tcp_states.h> #include <trace/events/skb.h> #include <net/busy_poll.h> -#include <crypto/hash.h> + +#include "devmem.h" /* * Is a socket 'connection oriented' ? @@ -163,8 +165,7 @@ done: return skb; } -struct sk_buff *__skb_try_recv_from_queue(struct sock *sk, - struct sk_buff_head *queue, +struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue, unsigned int flags, int *off, int *err, struct sk_buff **last) @@ -261,7 +262,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, * However, this function was correct in any case. 8) */ spin_lock_irqsave(&queue->lock, cpu_flags); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error, + skb = __skb_try_recv_from_queue(queue, flags, off, &error, last); spin_unlock_irqrestore(&queue->lock, cpu_flags); if (error) @@ -482,41 +483,37 @@ short_copy: return 0; } -static size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, - struct iov_iter *i) +#ifdef CONFIG_NET_CRC32C +static size_t crc32c_and_copy_to_iter(const void *addr, size_t bytes, + void *_crcp, struct iov_iter *i) { -#ifdef CONFIG_CRYPTO_HASH - struct ahash_request *hash = hashp; - struct scatterlist sg; + u32 *crcp = _crcp; size_t copied; copied = copy_to_iter(addr, bytes, i); - sg_init_one(&sg, addr, copied); - ahash_request_set_crypt(hash, &sg, NULL, copied); - crypto_ahash_update(hash); + *crcp = crc32c(*crcp, addr, copied); return copied; -#else - return 0; -#endif } /** - * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator - * and update a hash. + * skb_copy_and_crc32c_datagram_iter - Copy datagram to an iovec iterator + * and update a CRC32C value. * @skb: buffer to copy * @offset: offset in the buffer to start copying from * @to: iovec iterator to copy to * @len: amount of data to copy from buffer to iovec - * @hash: hash request to update + * @crcp: pointer to CRC32C value to update + * + * Return: 0 on success, -EFAULT if there was a fault during copy. 
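crc32c_and_copy_to_iter() folds the checksum update directly into the copy callback, retiring the scatterlist/ahash round-trip (and the !CONFIG_CRYPTO_HASH fallback that silently copied nothing). A self-contained model using the standard bitwise CRC-32C update (reflected polynomial 0x82F63B78, no internal inversion, matching the kernel crc32c() convention where the caller manages the seed):

    #include <stdint.h>
    #include <string.h>
    #include <stddef.h>

    static uint32_t crc32c_update(uint32_t crc, const void *buf, size_t len)
    {
        const unsigned char *p = buf;

        while (len--) {
            crc ^= *p++;
            for (int k = 0; k < 8; k++)
                crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78u : 0);
        }
        return crc;
    }

    /* Model of the callback: copy first, then advance the running CRC
     * over exactly the bytes that were copied. */
    static size_t crc32c_and_copy(void *dst, const void *src, size_t bytes,
                                  uint32_t *crcp)
    {
        memcpy(dst, src, bytes);    /* copy_to_iter() in the kernel */
        *crcp = crc32c_update(*crcp, src, bytes);
        return bytes;
    }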
*/ -int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, - struct iov_iter *to, int len, - struct ahash_request *hash) +int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, u32 *crcp) { return __skb_datagram_iter(skb, offset, to, len, true, - hash_and_copy_to_iter, hash); + crc32c_and_copy_to_iter, crcp); } -EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter); +EXPORT_SYMBOL(skb_copy_and_crc32c_datagram_iter); +#endif /* CONFIG_NET_CRC32C */ static size_t simple_copy_to_iter(const void *addr, size_t bytes, void *data __always_unused, struct iov_iter *i) @@ -692,9 +689,50 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb, return 0; } +static int +zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from, + int length, + struct net_devmem_dmabuf_binding *binding) +{ + int i = skb_shinfo(skb)->nr_frags; + size_t virt_addr, size, off; + struct net_iov *niov; + + /* Devmem filling works by taking an IOVEC from the user where the + * iov_addrs are interpreted as an offset in bytes into the dma-buf to + * send from. We do not support other iter types. + */ + if (iov_iter_type(from) != ITER_IOVEC && + iov_iter_type(from) != ITER_UBUF) + return -EFAULT; + + while (length && iov_iter_count(from)) { + if (i == MAX_SKB_FRAGS) + return -EMSGSIZE; + + virt_addr = (size_t)iter_iov_addr(from); + niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size); + if (!niov) + return -EFAULT; + + size = min_t(size_t, size, length); + size = min_t(size_t, size, iter_iov_len(from)); + + get_netmem(net_iov_to_netmem(niov)); + skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off, + size, PAGE_SIZE); + iov_iter_advance(from, size); + length -= size; + i++; + } + + return 0; +} + int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, - size_t length) + size_t length, + struct net_devmem_dmabuf_binding *binding) { unsigned long orig_size = skb->truesize; unsigned long truesize; @@ -702,6 +740,8 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, if (msg && msg->msg_ubuf && msg->sg_from_iter) ret = msg->sg_from_iter(skb, from, length); + else if (binding) + ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding); else ret = zerocopy_fill_skb_from_iter(skb, from, length); @@ -735,7 +775,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) if (skb_copy_datagram_from_iter(skb, 0, from, copy)) return -EFAULT; - return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U); + return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL); } EXPORT_SYMBOL(zerocopy_sg_from_iter); diff --git a/net/core/dev.c b/net/core/dev.c index a9f62f5aeb84..be97c440ecd5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -92,6 +92,7 @@ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/ethtool.h> +#include <linux/ethtool_netlink.h> #include <linux/skbuff.h> #include <linux/kthread.h> #include <linux/bpf.h> @@ -105,6 +106,7 @@ #include <net/dst.h> #include <net/dst_metadata.h> #include <net/gro.h> +#include <net/netdev_queues.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/checksum.h> @@ -154,9 +156,11 @@ #include <linux/pm_runtime.h> #include <linux/prandom.h> #include <linux/once_lite.h> +#include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/types.h> #include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> #include <net/rps.h> #include 
<linux/phy_link_topology.h> @@ -180,8 +184,6 @@ static DEFINE_SPINLOCK(napi_hash_lock); static unsigned int napi_gen_id = NR_CPUS; static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); -static DECLARE_RWSEM(devnet_rename_sem); - static inline void dev_base_seq_inc(struct net *net) { unsigned int val = net->dev_base_seq + 1; @@ -460,7 +462,9 @@ EXPORT_PER_CPU_SYMBOL(softnet_data); * PP consumers must pay attention to run APIs in the appropriate context * (e.g. NAPI context). */ -static DEFINE_PER_CPU(struct page_pool *, system_page_pool); +DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; #ifdef CONFIG_LOCKDEP /* @@ -570,10 +574,18 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) static inline struct list_head *ptype_head(const struct packet_type *pt) { - if (pt->type == htons(ETH_P_ALL)) - return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all; - else - return pt->dev ? &pt->dev->ptype_specific : + if (pt->type == htons(ETH_P_ALL)) { + if (!pt->af_packet_net && !pt->dev) + return NULL; + + return pt->dev ? &pt->dev->ptype_all : + &pt->af_packet_net->ptype_all; + } + + if (pt->dev) + return &pt->dev->ptype_specific; + + return pt->af_packet_net ? &pt->af_packet_net->ptype_specific : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; } @@ -594,6 +606,9 @@ void dev_add_pack(struct packet_type *pt) { struct list_head *head = ptype_head(pt); + if (WARN_ON_ONCE(!head)) + return; + spin_lock(&ptype_lock); list_add_rcu(&pt->list, head); spin_unlock(&ptype_lock); @@ -618,6 +633,9 @@ void __dev_remove_pack(struct packet_type *pt) struct list_head *head = ptype_head(pt); struct packet_type *pt1; + if (!head) + return; + spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { @@ -767,7 +785,8 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) } /* must be called under rcu_read_lock(), as we dont take a reference */ -struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id) +static struct napi_struct * +netdev_napi_by_id(struct net *net, unsigned int napi_id) { struct napi_struct *napi; @@ -784,6 +803,49 @@ struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id) } /** + * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it + * @net: the applicable net namespace + * @napi_id: ID of a NAPI of a target device + * + * Find a NAPI instance with @napi_id. Lock its device. + * The device must be in %NETREG_REGISTERED state for lookup to succeed. + * netdev_unlock() must be called to release it. + * + * Return: pointer to NAPI, its device with lock held, NULL if not found. + */ +struct napi_struct * +netdev_napi_by_id_lock(struct net *net, unsigned int napi_id) +{ + struct napi_struct *napi; + struct net_device *dev; + + rcu_read_lock(); + napi = netdev_napi_by_id(net, napi_id); + if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) { + rcu_read_unlock(); + return NULL; + } + + dev = napi->dev; + dev_hold(dev); + rcu_read_unlock(); + + dev = __netdev_put_lock(dev, net); + if (!dev) + return NULL; + + rcu_read_lock(); + napi = netdev_napi_by_id(net, napi_id); + if (napi && napi->dev != dev) + napi = NULL; + rcu_read_unlock(); + + if (!napi) + netdev_unlock(dev); + return napi; +} + +/** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace * @name: name to find @@ -957,21 +1019,138 @@ EXPORT_SYMBOL(netdev_get_by_index); * its reference counter increased so the caller must be careful * about locking. 
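netdev_napi_by_id_lock() above is a lookup/lock/revalidate dance: find the NAPI under RCU, take a device reference, drop RCU, acquire the instance lock (which itself rechecks registration), then repeat the lookup to confirm the ID still maps to the device that was just locked. The same pattern against a trivial single-entry "table"; all names here are illustrative:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct dev {
        pthread_mutex_t lock;
        bool registered;
    };

    static struct dev the_dev = { PTHREAD_MUTEX_INITIALIZER, true };

    /* Lockless lookup; in the kernel this is the RCU hash walk. */
    static struct dev *lookup(unsigned int id)
    {
        return (id == 1 && the_dev.registered) ? &the_dev : NULL;
    }

    static struct dev *lookup_and_lock(unsigned int id)
    {
        struct dev *d = lookup(id);

        if (!d)
            return NULL;
        pthread_mutex_lock(&d->lock);
        /* Revalidate under the lock: the mapping may have been torn
         * down or reassigned while the lock was being acquired. */
        if (!d->registered || lookup(id) != d) {
            pthread_mutex_unlock(&d->lock);
            return NULL;
        }
        return d;
    }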
The caller must hold RCU lock. */ - struct net_device *dev_get_by_napi_id(unsigned int napi_id) { struct napi_struct *napi; WARN_ON_ONCE(!rcu_read_lock_held()); - if (napi_id < MIN_NAPI_ID) + if (!napi_id_valid(napi_id)) return NULL; napi = napi_by_id(napi_id); return napi ? napi->dev : NULL; } -EXPORT_SYMBOL(dev_get_by_napi_id); + +/* Release the held reference on the net_device, and if the net_device + * is still registered try to lock the instance lock. If device is being + * unregistered NULL will be returned (but the reference has been released, + * either way!) + * + * This helper is intended for locking net_device after it has been looked up + * using a lockless lookup helper. Lock prevents the instance from going away. + */ +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net) +{ + netdev_lock(dev); + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { + netdev_unlock(dev); + dev_put(dev); + return NULL; + } + dev_put(dev); + return dev; +} + +static struct net_device * +__netdev_put_lock_ops_compat(struct net_device *dev, struct net *net) +{ + netdev_lock_ops_compat(dev); + if (dev->reg_state > NETREG_REGISTERED || + dev->moving_ns || !net_eq(dev_net(dev), net)) { + netdev_unlock_ops_compat(dev); + dev_put(dev); + return NULL; + } + dev_put(dev); + return dev; +} + +/** + * netdev_get_by_index_lock() - find a device by its ifindex + * @net: the applicable net namespace + * @ifindex: index of device + * + * Search for an interface by index. If a valid device + * with @ifindex is found it will be returned with netdev->lock held. + * netdev_unlock() must be called to release it. + * + * Return: pointer to a device with lock held, NULL if not found. + */ +struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex) +{ + struct net_device *dev; + + dev = dev_get_by_index(net, ifindex); + if (!dev) + return NULL; + + return __netdev_put_lock(dev, net); +} + +struct net_device * +netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex) +{ + struct net_device *dev; + + dev = dev_get_by_index(net, ifindex); + if (!dev) + return NULL; + + return __netdev_put_lock_ops_compat(dev, net); +} + +struct net_device * +netdev_xa_find_lock(struct net *net, struct net_device *dev, + unsigned long *index) +{ + if (dev) + netdev_unlock(dev); + + do { + rcu_read_lock(); + dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); + if (!dev) { + rcu_read_unlock(); + return NULL; + } + dev_hold(dev); + rcu_read_unlock(); + + dev = __netdev_put_lock(dev, net); + if (dev) + return dev; + + (*index)++; + } while (true); +} + +struct net_device * +netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, + unsigned long *index) +{ + if (dev) + netdev_unlock_ops_compat(dev); + + do { + rcu_read_lock(); + dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT); + if (!dev) { + rcu_read_unlock(); + return NULL; + } + dev_hold(dev); + rcu_read_unlock(); + + dev = __netdev_put_lock_ops_compat(dev, net); + if (dev) + return dev; + + (*index)++; + } while (true); +} static DEFINE_SEQLOCK(netdev_rename_lock); @@ -1012,6 +1191,12 @@ out: return ret; } +static bool dev_addr_cmp(struct net_device *dev, unsigned short type, + const char *ha) +{ + return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len); +} + /** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace @@ -1020,7 +1205,7 @@ out: * * Search for an interface by MAC address. 
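netdev_xa_find_lock() applies the same idea to iteration: each candidate from the ifindex xarray is pinned under RCU, then __netdev_put_lock() trades the reference for the instance lock; if the device turns out to be unregistering (or mid-move between namespaces), the index is advanced and the walk continues. The retry skeleton, with stub stand-ins for the lookup and lock steps:

    #include <stddef.h>

    struct dev { int dummy; };

    /* Stand-in for xa_find(): next live entry at or after *index. */
    static struct dev *find_from(unsigned long *index)
    {
        (void)index;
        return NULL;                /* empty table, for the model */
    }

    /* Stand-in for __netdev_put_lock(): NULL if the entry is dying. */
    static struct dev *pin_and_lock(struct dev *d)
    {
        return d;
    }

    static struct dev *next_locked(unsigned long *index)
    {
        for (;;) {
            struct dev *d = find_from(index);

            if (!d)
                return NULL;        /* end of the table */
            d = pin_and_lock(d);
            if (d)
                return d;           /* caller unlocks before asking again */
            (*index)++;             /* dying entry: skip it and retry */
        }
    }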
Returns NULL if the device * is not found or a pointer to the device. - * The caller must hold RCU or RTNL. + * The caller must hold RCU. * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * @@ -1032,14 +1217,39 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, struct net_device *dev; for_each_netdev_rcu(net, dev) - if (dev->type == type && - !memcmp(dev->dev_addr, ha, dev->addr_len)) + if (dev_addr_cmp(dev, type, ha)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); +/** + * dev_getbyhwaddr() - find a device by its hardware address + * @net: the applicable net namespace + * @type: media type of device + * @ha: hardware address + * + * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold + * rtnl_lock. + * + * Context: rtnl_lock() must be held. + * Return: pointer to the net_device, or NULL if not found + */ +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, + const char *ha) +{ + struct net_device *dev; + + ASSERT_RTNL(); + for_each_netdev(net, dev) + if (dev_addr_cmp(dev, type, ha)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_getbyhwaddr); + struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; @@ -1230,33 +1440,18 @@ static int dev_get_valid_name(struct net *net, struct net_device *dev, return ret < 0 ? ret : 0; } -/** - * dev_change_name - change name of a device - * @dev: device - * @newname: name (or format string) must be at least IFNAMSIZ - * - * Change name of a device, can pass format strings "eth%d". - * for wildcarding. - */ -int dev_change_name(struct net_device *dev, const char *newname) +int netif_change_name(struct net_device *dev, const char *newname) { + struct net *net = dev_net(dev); unsigned char old_assign_type; char oldname[IFNAMSIZ]; int err = 0; int ret; - struct net *net; - ASSERT_RTNL(); - BUG_ON(!dev_net(dev)); - - net = dev_net(dev); + ASSERT_RTNL_NET(net); - down_write(&devnet_rename_sem); - - if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { - up_write(&devnet_rename_sem); + if (!strncmp(newname, dev->name, IFNAMSIZ)) return 0; - } memcpy(oldname, dev->name, IFNAMSIZ); @@ -1264,10 +1459,8 @@ int dev_change_name(struct net_device *dev, const char *newname) err = dev_get_valid_name(net, dev, newname); write_sequnlock_bh(&netdev_rename_lock); - if (err < 0) { - up_write(&devnet_rename_sem); + if (err < 0) return err; - } if (oldname[0] && !strchr(oldname, '%')) netdev_info(dev, "renamed from %s%s\n", oldname, @@ -1279,14 +1472,13 @@ int dev_change_name(struct net_device *dev, const char *newname) rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { + write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); + write_sequnlock_bh(&netdev_rename_lock); WRITE_ONCE(dev->name_assign_type, old_assign_type); - up_write(&devnet_rename_sem); return ret; } - up_write(&devnet_rename_sem); - netdev_adjacent_rename_links(dev, oldname); netdev_name_node_del(dev->name_node); @@ -1302,7 +1494,6 @@ rollback: /* err >= 0 after dev_alloc_name() or stores the first errno */ if (err >= 0) { err = ret; - down_write(&devnet_rename_sem); write_seqlock_bh(&netdev_rename_lock); memcpy(dev->name, oldname, IFNAMSIZ); write_sequnlock_bh(&netdev_rename_lock); @@ -1319,15 +1510,7 @@ rollback: return err; } -/** - * dev_set_alias - change ifalias of a device - * @dev: device - * @alias: name up to IFALIASZ - * @len: limit of bytes to copy from info - * - * 
Set ifalias for a device, - */ -int dev_set_alias(struct net_device *dev, const char *alias, size_t len) +int netif_set_alias(struct net_device *dev, const char *alias, size_t len) { struct dev_ifalias *new_alias = NULL; @@ -1353,7 +1536,6 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len) return len; } -EXPORT_SYMBOL(dev_set_alias); /** * dev_get_alias - get ifalias of a device @@ -1390,16 +1572,10 @@ void netdev_features_change(struct net_device *dev) } EXPORT_SYMBOL(netdev_features_change); -/** - * netdev_state_change - device changes state - * @dev: device to cause notification - * - * Called to indicate a device has changed state. This function calls - * the notifier chains for netdev_chain and sends a NEWLINK message - * to the routing socket. - */ -void netdev_state_change(struct net_device *dev) +void netif_state_change(struct net_device *dev) { + netdev_ops_assert_locked_or_invisible(dev); + if (dev->flags & IFF_UP) { struct netdev_notifier_change_info change_info = { .info.dev = dev, @@ -1410,7 +1586,6 @@ void netdev_state_change(struct net_device *dev) rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL); } } -EXPORT_SYMBOL(netdev_state_change); /** * __netdev_notify_peers - notify network peers about existence of @dev, @@ -1499,6 +1674,8 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) set_bit(__LINK_STATE_START, &dev->state); + netdev_ops_assert_locked(dev); + if (ops->ndo_validate_addr) ret = ops->ndo_validate_addr(dev); @@ -1510,7 +1687,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) if (ret) clear_bit(__LINK_STATE_START, &dev->state); else { - dev->flags |= IFF_UP; + netif_set_up(dev, true); dev_set_rx_mode(dev); dev_activate(dev); add_device_randomness(dev->dev_addr, dev->addr_len); @@ -1519,20 +1696,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) return ret; } -/** - * dev_open - prepare an interface for use. - * @dev: device to open - * @extack: netlink extended ack - * - * Takes a device from down to up state. The device's private open - * function is invoked and then the multicast lists are loaded. Finally - * the device is moved into the up state and a %NETDEV_UP message is - * sent to the netdev notifier chain. - * - * Calling this function on an active interface is a nop. On a failure - * a negative errno code is returned. - */ -int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) +int netif_open(struct net_device *dev, struct netlink_ext_ack *extack) { int ret; @@ -1548,7 +1712,6 @@ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) return ret; } -EXPORT_SYMBOL(dev_open); static void __dev_close_many(struct list_head *head) { @@ -1586,10 +1749,13 @@ static void __dev_close_many(struct list_head *head) * We allow it to be called even after a DETACH hot-plug * event. */ + + netdev_ops_assert_locked(dev); + if (ops->ndo_stop) ops->ndo_stop(dev); - dev->flags &= ~IFF_UP; + netif_set_up(dev, false); netpoll_poll_enable(dev); } } @@ -1623,16 +1789,7 @@ void dev_close_many(struct list_head *head, bool unlink) } EXPORT_SYMBOL(dev_close_many); -/** - * dev_close - shutdown an interface. - * @dev: device to shutdown - * - * This function moves an active device into down state. A - * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device - * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier - * chain. 
- */ -void dev_close(struct net_device *dev) +void netif_close(struct net_device *dev) { if (dev->flags & IFF_UP) { LIST_HEAD(single); @@ -1642,18 +1799,9 @@ void dev_close(struct net_device *dev) list_del(&single); } } -EXPORT_SYMBOL(dev_close); +EXPORT_SYMBOL(netif_close); - -/** - * dev_disable_lro - disable Large Receive Offload on a device - * @dev: device - * - * Disable Large Receive Offload (LRO) on a net device. Must be - * called under RTNL. This is needed if received packets may be - * forwarded to another interface. - */ -void dev_disable_lro(struct net_device *dev) +void netif_disable_lro(struct net_device *dev) { struct net_device *lower_dev; struct list_head *iter; @@ -1664,10 +1812,13 @@ void dev_disable_lro(struct net_device *dev) if (unlikely(dev->features & NETIF_F_LRO)) netdev_WARN(dev, "failed to disable LRO!\n"); - netdev_for_each_lower_dev(dev, lower_dev, iter) - dev_disable_lro(lower_dev); + netdev_for_each_lower_dev(dev, lower_dev, iter) { + netdev_lock_ops(lower_dev); + netif_disable_lro(lower_dev); + netdev_unlock_ops(lower_dev); + } } -EXPORT_SYMBOL(dev_disable_lro); +EXPORT_IPV6_MOD(netif_disable_lro); /** * dev_disable_gro_hw - disable HW Generic Receive Offload on a device @@ -1755,7 +1906,9 @@ static int call_netdevice_register_net_notifiers(struct notifier_block *nb, int err; for_each_netdev(net, dev) { + netdev_lock_ops(dev); err = call_netdevice_register_notifiers(nb, dev); + netdev_unlock_ops(dev); if (err) goto rollback; } @@ -1799,14 +1952,19 @@ int register_netdevice_notifier(struct notifier_block *nb) /* Close race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); + + /* When RTNL is removed, we need protection for netdev_chain. */ rtnl_lock(); + err = raw_notifier_chain_register(&netdev_chain, nb); if (err) goto unlock; if (dev_boot_phase) goto unlock; for_each_net(net) { + __rtnl_net_lock(net); err = call_netdevice_register_net_notifiers(nb, net); + __rtnl_net_unlock(net); if (err) goto rollback; } @@ -1817,8 +1975,11 @@ unlock: return err; rollback: - for_each_net_continue_reverse(net) + for_each_net_continue_reverse(net) { + __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); + __rtnl_net_unlock(net); + } raw_notifier_chain_unregister(&netdev_chain, nb); goto unlock; @@ -1851,8 +2012,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb) if (err) goto unlock; - for_each_net(net) + for_each_net(net) { + __rtnl_net_lock(net); call_netdevice_unregister_net_notifiers(nb, net); + __rtnl_net_unlock(net); + } unlock: rtnl_unlock(); @@ -1916,9 +2080,10 @@ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb) { int err; - rtnl_lock(); + rtnl_net_lock(net); err = __register_netdevice_notifier_net(net, nb, false); - rtnl_unlock(); + rtnl_net_unlock(net); + return err; } EXPORT_SYMBOL(register_netdevice_notifier_net); @@ -1944,9 +2109,10 @@ int unregister_netdevice_notifier_net(struct net *net, { int err; - rtnl_lock(); + rtnl_net_lock(net); err = __unregister_netdevice_notifier_net(net, nb); - rtnl_unlock(); + rtnl_net_unlock(net); + return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_net); @@ -1959,19 +2125,56 @@ static void __move_netdevice_notifier_net(struct net *src_net, __register_netdevice_notifier_net(dst_net, nb, true); } +static void rtnl_net_dev_lock(struct net_device *dev) +{ + bool again; + + do { + struct net *net; + + again = false; + + /* netns might be being dismantled. 
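netif_disable_lro() (the renamed dev_disable_lro()) now takes each lower device's ops lock around the recursive call, so stacked setups get the feature cleared with consistent per-device locking on the way down. The traversal shape on a toy device tree, one mutex per node, with the top node assumed to be locked by the caller as the kernel expects:

    #include <pthread.h>
    #include <stddef.h>

    struct node {
        pthread_mutex_t lock;
        struct node *lower[2];      /* at most two lower devs, for the model */
        unsigned int features;
    };

    #define F_LRO 0x1u

    /* Caller holds n->lock; children are locked around each descent. */
    static void disable_lro(struct node *n)
    {
        n->features &= ~F_LRO;

        for (int i = 0; i < 2; i++) {
            struct node *low = n->lower[i];

            if (!low)
                continue;
            pthread_mutex_lock(&low->lock);
            disable_lro(low);       /* recurse with the child locked */
            pthread_mutex_unlock(&low->lock);
        }
    }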
*/ + rcu_read_lock(); + net = dev_net_rcu(dev); + net_passive_inc(net); + rcu_read_unlock(); + + rtnl_net_lock(net); + +#ifdef CONFIG_NET_NS + /* dev might have been moved to another netns. */ + if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) { + rtnl_net_unlock(net); + net_passive_dec(net); + again = true; + } +#endif + } while (again); +} + +static void rtnl_net_dev_unlock(struct net_device *dev) +{ + struct net *net = dev_net(dev); + + rtnl_net_unlock(net); + net_passive_dec(net); +} + int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { int err; - rtnl_lock(); + rtnl_net_dev_lock(dev); err = __register_netdevice_notifier_net(dev_net(dev), nb, false); if (!err) { nn->nb = nb; list_add(&nn->list, &dev->net_notifier_list); } - rtnl_unlock(); + rtnl_net_dev_unlock(dev); + return err; } EXPORT_SYMBOL(register_netdevice_notifier_dev_net); @@ -1982,10 +2185,11 @@ int unregister_netdevice_notifier_dev_net(struct net_device *dev, { int err; - rtnl_lock(); + rtnl_net_dev_lock(dev); list_del(&nn->list); err = __unregister_netdevice_notifier_net(dev_net(dev), nb); - rtnl_unlock(); + rtnl_net_dev_unlock(dev); + return err; } EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net); @@ -2134,8 +2338,8 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif #ifdef CONFIG_NET_CLS_ACT -DEFINE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key); -EXPORT_SYMBOL(tcf_bypass_check_needed_key); +DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key); +EXPORT_SYMBOL(tcf_sw_enabled_key); #endif DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); @@ -2301,16 +2505,21 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) } /** - * dev_nit_active - return true if any network interface taps are in use + * dev_nit_active_rcu - return true if any network interface taps are in use + * + * The caller must hold the RCU lock * * @dev: network device to check for the presence of taps */ -bool dev_nit_active(struct net_device *dev) +bool dev_nit_active_rcu(const struct net_device *dev) { - return !list_empty(&net_hotdata.ptype_all) || + /* Callers may hold either RCU or RCU BH lock */ + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + + return !list_empty(&dev_net(dev)->ptype_all) || !list_empty(&dev->ptype_all); } -EXPORT_SYMBOL_GPL(dev_nit_active); +EXPORT_SYMBOL_GPL(dev_nit_active_rcu); /* * Support routine. 
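rtnl_net_dev_lock() faces the classic "lock an object's container" problem: the device can move to another netns while the per-netns lock is being acquired. The loop pins the current netns passively (net_passive_inc() keeps the struct from being freed), locks it, then rechecks dev->nd_net and retries if the device moved. A generic model using an atomic parent pointer:

    #include <stdatomic.h>
    #include <pthread.h>

    struct container {
        pthread_mutex_t lock;
    };

    struct object {
        _Atomic(struct container *) parent; /* may change while unlocked */
    };

    static struct container *lock_parent(struct object *obj)
    {
        for (;;) {
            struct container *c = atomic_load(&obj->parent);

            pthread_mutex_lock(&c->lock);
            if (atomic_load(&obj->parent) == c)
                return c;           /* membership confirmed under the lock */
            pthread_mutex_unlock(&c->lock); /* it moved: retry */
        }
    }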
Sends outgoing frames to any network @@ -2319,11 +2528,12 @@ EXPORT_SYMBOL_GPL(dev_nit_active); void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) { - struct list_head *ptype_list = &net_hotdata.ptype_all; struct packet_type *ptype, *pt_prev = NULL; + struct list_head *ptype_list; struct sk_buff *skb2 = NULL; rcu_read_lock(); + ptype_list = &dev_net_rcu(dev)->ptype_all; again: list_for_each_entry_rcu(ptype, ptype_list, list) { if (READ_ONCE(ptype->ignore_outgoing)) @@ -2367,7 +2577,7 @@ again: pt_prev = ptype; } - if (ptype_list == &net_hotdata.ptype_all) { + if (ptype_list != &dev->ptype_all) { ptype_list = &dev->ptype_all; goto again; } @@ -2970,6 +3180,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) { ASSERT_RTNL(); + netdev_ops_assert_locked(dev); rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, txq); @@ -3000,7 +3211,6 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) } EXPORT_SYMBOL(netif_set_real_num_tx_queues); -#ifdef CONFIG_SYSFS /** * netif_set_real_num_rx_queues - set actual number of RX queues used * @dev: Network device @@ -3020,6 +3230,7 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) if (dev->reg_state == NETREG_REGISTERED) { ASSERT_RTNL(); + netdev_ops_assert_locked(dev); rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, rxq); @@ -3031,7 +3242,6 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) return 0; } EXPORT_SYMBOL(netif_set_real_num_rx_queues); -#endif /** * netif_set_real_num_queues - set actual number of RX and TX queues used @@ -3263,7 +3473,7 @@ void netif_device_attach(struct net_device *dev) if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && netif_running(dev)) { netif_tx_wake_all_queues(dev); - __netdev_watchdog_up(dev); + netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_device_attach); @@ -3386,9 +3596,10 @@ out: } EXPORT_SYMBOL(skb_checksum_help); +#ifdef CONFIG_NET_CRC32C int skb_crc32c_csum_help(struct sk_buff *skb) { - __le32 crc32c_csum; + u32 crc; int ret = 0, offset, start; if (skb->ip_summed != CHECKSUM_PARTIAL) @@ -3416,15 +3627,14 @@ int skb_crc32c_csum_help(struct sk_buff *skb) if (ret) goto out; - crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start, - skb->len - start, ~(__u32)0, - crc32c_csum_stub)); - *(__le32 *)(skb->data + offset) = crc32c_csum; + crc = ~skb_crc32c(skb, start, skb->len - start, ~0); + *(__le32 *)(skb->data + offset) = cpu_to_le32(crc); skb_reset_csum_not_inet(skb); out: return ret; } EXPORT_SYMBOL(skb_crc32c_csum_help); +#endif /* CONFIG_NET_CRC32C */ __be16 skb_network_protocol(struct sk_buff *skb, int *depth) { @@ -3612,7 +3822,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, unsigned int len; int rc; - if (dev_nit_active(dev)) + if (dev_nit_active_rcu(dev)) dev_queue_xmit_nit(skb, dev); len = skb->len; @@ -3688,10 +3898,43 @@ sw_checksum: } EXPORT_SYMBOL(skb_csum_hwoffload_help); +static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb, + struct net_device *dev) +{ + struct skb_shared_info *shinfo; + struct net_iov *niov; + + if (likely(skb_frags_readable(skb))) + goto out; + + if (!dev->netmem_tx) + goto out_free; + + shinfo = skb_shinfo(skb); + + if (shinfo->nr_frags > 0) { + niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0])); + if (net_is_devmem_iov(niov) && + net_devmem_iov_binding(niov)->dev != dev) + goto out_free; + } 
+ +out: + return skb; + +out_free: + kfree_skb(skb); + return NULL; +} + static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) { netdev_features_t features; + skb = validate_xmit_unreadable_skb(skb, dev); + if (unlikely(!skb)) + goto out_null; + features = netif_skb_features(skb); skb = validate_xmit_vlan(skb, features); if (unlikely(!skb)) @@ -4030,10 +4273,13 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, if (!miniq) return ret; - if (static_branch_unlikely(&tcf_bypass_check_needed_key)) { - if (tcf_block_bypass_sw(miniq->block)) - return ret; - } + /* Global bypass */ + if (!static_branch_likely(&tcf_sw_enabled_key)) + return ret; + + /* Block-wise bypass */ + if (tcf_block_bypass_sw(miniq->block)) + return ret; tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; @@ -4382,7 +4628,8 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_reset_mac_header(skb); skb_assert_len(skb); - if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) + if (unlikely(skb_shinfo(skb)->tx_flags & + (SKBTX_SCHED_TSTAMP | SKBTX_BPF))) __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. Also @@ -4568,13 +4815,14 @@ static inline void ____napi_schedule(struct softnet_data *sd, } use_local_napi: + DEBUG_NET_WARN_ON_ONCE(!list_empty(&napi->poll_list)); list_add_tail(&napi->poll_list, &sd->poll_list); WRITE_ONCE(napi->list_owner, smp_processor_id()); /* If not called from net_rx_action() * we have to raise NET_RX_SOFTIRQ. */ if (!sd->in_net_rx_action) - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + raise_softirq_irqoff(NET_RX_SOFTIRQ); } #ifdef CONFIG_RPS @@ -4584,6 +4832,11 @@ EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; EXPORT_SYMBOL(rfs_needed); +static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) +{ + return hash_32(hash, flow_table->log); +} + static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow *rflow, u16 next_cpu) @@ -4610,7 +4863,7 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; - flow_id = skb_get_hash(skb) & flow_table->mask; + flow_id = rfs_slot(skb_get_hash(skb), flow_table); rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) @@ -4689,7 +4942,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ - rflow = &flow_table->flows[hash & flow_table->mask]; + rflow = &flow_table->flows[rfs_slot(hash, flow_table)]; tcpu = rflow->cpu; /* @@ -4756,13 +5009,13 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); - if (flow_table && flow_id <= flow_table->mask) { + if (flow_table && flow_id < (1UL << flow_table->log)) { rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - READ_ONCE(rflow->last_qtail)) < - (int)(10 * flow_table->mask))) + (int)(10 << flow_table->log))) expire = false; } rcu_read_unlock(); @@ -4778,7 +5031,8 @@ static void rps_trigger_softirq(void *data) struct softnet_data *sd = data; ____napi_schedule(sd, &sd->backlog); - sd->received_rps++; + /* Pairs with READ_ONCE() in 
softnet_seq_show() */ + WRITE_ONCE(sd->received_rps, sd->received_rps + 1); } #endif /* CONFIG_RPS */ @@ -4863,7 +5117,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); if (fl) { - new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); + new_flow = hash_32(skb_get_hash(skb), fl->log_buckets); old_flow = fl->history[fl->history_head]; fl->history[fl->history_head] = new_flow; @@ -4874,7 +5128,8 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) fl->buckets[old_flow]--; if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { - fl->count++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(fl->count, fl->count + 1); rcu_read_unlock(); return true; } @@ -4963,7 +5218,7 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; @@ -5065,12 +5320,15 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, } static int -netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) +netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog) { struct sk_buff *skb = *pskb; int err, hroom, troom; - if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog)) + local_lock_nested_bh(&system_page_pool.bh_lock); + err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog); + local_unlock_nested_bh(&system_page_pool.bh_lock); + if (!err) return 0; /* In case we have to go down the path and also linearize, @@ -5089,7 +5347,7 @@ netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) static u32 netif_receive_generic_xdp(struct sk_buff **pskb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { struct sk_buff *skb = *pskb; u32 mac_len, act = XDP_DROP; @@ -5142,7 +5400,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff **pskb, * and DDOS attacks will be more effective. In-driver-XDP use dedicated TX * queues, so they do not have this starvation issue. */ -void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog) { struct net_device *dev = skb->dev; struct netdev_queue *txq; @@ -5167,7 +5425,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb) +int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; @@ -5506,8 +5764,14 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, orig_dev = skb->dev; skb_reset_network_header(skb); +#if !defined(CONFIG_DEBUG_NET) + /* We plan to no longer reset the transport header here. + * Give some time to fuzzers and dev build to catch bugs + * in network stacks. 
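Both the RFS table (rfs_slot()) and the flow-limit table stop masking low bits (hash & (n - 1)) and instead store log2 of the table size and index with hash_32(): multiply by the 32-bit golden ratio and keep the top bits, which are the best mixed, so slots stay well spread even when the low bits of the flow hash are poorly distributed. Storing the log also turns the expiry bounds into shifts (flow_id < 1UL << log, 10 << log). hash_32() reproduced for illustration, as defined in include/linux/hash.h:

    #include <stdint.h>

    #define GOLDEN_RATIO_32 0x61C88647u

    static inline uint32_t hash_32(uint32_t val, unsigned int bits)
    {
        /* Multiplicative hash: take the result from the top, where the
         * product's bits are mixed best. Requires 0 < bits <= 32. */
        return (val * GOLDEN_RATIO_32) >> (32 - bits);
    }

    /* rfs_slot() equivalent: slot into a table of 1 << log entries. */
    static inline uint32_t rfs_slot(uint32_t hash, unsigned int log)
    {
        return hash_32(hash, log);
    }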
+ */ if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); +#endif skb_reset_mac_len(skb); pt_prev = NULL; @@ -5543,7 +5807,8 @@ another_round: if (pfmemalloc) goto skip_taps; - list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) { + list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all, + list) { if (pt_prev) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; @@ -5655,6 +5920,14 @@ check_vlan_id: deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, &ptype_base[ntohs(type) & PTYPE_HASH_MASK]); + + /* orig_dev and skb->dev could belong to different netns; + * Even in such case we need to traverse only the list + * coming from skb->dev, as the ptype owner (packet socket) + * will use dev_net(skb->dev) to do namespace filtering. + */ + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, + &dev_net_rcu(skb->dev)->ptype_specific); } deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, @@ -5865,7 +6138,7 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) static_branch_dec(&generic_xdp_needed_key); } else if (new && !old) { static_branch_inc(&generic_xdp_needed_key); - dev_disable_lro(dev); + netif_disable_lro(dev); dev_disable_gro_hw(dev); } break; @@ -5991,22 +6264,22 @@ void netif_receive_skb_list(struct list_head *head) } EXPORT_SYMBOL(netif_receive_skb_list); -static DEFINE_PER_CPU(struct work_struct, flush_works); - /* Network device is going away, flush any packets still pending */ static void flush_backlog(struct work_struct *work) { struct sk_buff *skb, *tmp; + struct sk_buff_head list; struct softnet_data *sd; + __skb_queue_head_init(&list); local_bh_disable(); sd = this_cpu_ptr(&softnet_data); backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { - if (skb->dev->reg_state == NETREG_UNREGISTERING) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); - dev_kfree_skb_irq(skb); + __skb_queue_tail(&list, skb); rps_input_queue_head_incr(sd); } } @@ -6014,14 +6287,16 @@ static void flush_backlog(struct work_struct *work) local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { - if (skb->dev->reg_state == NETREG_UNREGISTERING) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); - kfree_skb(skb); + __skb_queue_tail(&list, skb); rps_input_queue_head_incr(sd); } } local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); local_bh_enable(); + + __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY); } static bool flush_required(int cpu) @@ -6049,36 +6324,54 @@ static bool flush_required(int cpu) return true; } +struct flush_backlogs { + cpumask_t flush_cpus; + struct work_struct w[]; +}; + +static struct flush_backlogs *flush_backlogs_alloc(void) +{ + return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids), + GFP_KERNEL); +} + +static struct flush_backlogs *flush_backlogs_fallback; +static DEFINE_MUTEX(flush_backlogs_mutex); + static void flush_all_backlogs(void) { - static cpumask_t flush_cpus; + struct flush_backlogs *ptr = flush_backlogs_alloc(); unsigned int cpu; - /* since we are under rtnl lock protection we can use static data - * for the cpumask and avoid allocating on stack the possibly - * large mask - */ - ASSERT_RTNL(); + if (!ptr) { + mutex_lock(&flush_backlogs_mutex); + ptr = flush_backlogs_fallback; + } + cpumask_clear(&ptr->flush_cpus); cpus_read_lock(); - cpumask_clear(&flush_cpus); 
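flush_backlog() stops freeing skbs while the queue locks are held: doomed packets are unlinked onto a private list under the lock and purged afterwards, now with an explicit drop reason (SKB_DROP_REASON_DEV_READY). The move-then-free shape on a mutex-protected singly linked list:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdlib.h>
    #include <stddef.h>

    struct pkt {
        struct pkt *next;
        bool doomed;                /* e.g. its dev is unregistering */
    };

    static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct pkt *queue;

    static void flush_doomed(void)
    {
        struct pkt *local = NULL, **pp, *p;

        pthread_mutex_lock(&queue_lock);
        for (pp = &queue; (p = *pp) != NULL; ) {
            if (p->doomed) {
                *pp = p->next;      /* unlink under the lock */
                p->next = local;
                local = p;          /* collect on a private list */
            } else {
                pp = &p->next;
            }
        }
        pthread_mutex_unlock(&queue_lock);

        while (local) {             /* free outside the lock */
            p = local;
            local = local->next;
            free(p);
        }
    }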
for_each_online_cpu(cpu) { if (flush_required(cpu)) { - queue_work_on(cpu, system_highpri_wq, - per_cpu_ptr(&flush_works, cpu)); - cpumask_set_cpu(cpu, &flush_cpus); + INIT_WORK(&ptr->w[cpu], flush_backlog); + queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]); + __cpumask_set_cpu(cpu, &ptr->flush_cpus); } } /* we can have in flight packet[s] on the cpus we are not flushing, * synchronize_net() in unregister_netdevice_many() will take care of - * them + * them. */ - for_each_cpu(cpu, &flush_cpus) - flush_work(per_cpu_ptr(&flush_works, cpu)); + for_each_cpu(cpu, &ptr->flush_cpus) + flush_work(&ptr->w[cpu]); cpus_read_unlock(); + + if (ptr != flush_backlogs_fallback) + kfree(ptr); + else + mutex_unlock(&flush_backlogs_mutex); } static void net_rps_send_ipi(struct softnet_data *remsd) @@ -6267,7 +6560,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) return false; if (work_done) { - if (n->gro_bitmask) + if (n->gro.bitmask) timeout = napi_get_gro_flush_timeout(n); n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); } @@ -6277,15 +6570,14 @@ bool napi_complete_done(struct napi_struct *n, int work_done) if (timeout) ret = false; } - if (n->gro_bitmask) { - /* When the NAPI instance uses a timeout and keeps postponing - * it, we need to bound somehow the time packets are kept in - * the GRO layer - */ - napi_gro_flush(n, !!timeout); - } - gro_normal_list(n); + /* + * When the NAPI instance uses a timeout and keeps postponing + * it, we need to bound somehow the time packets are kept in + * the GRO layer. + */ + gro_flush(&n->gro, !!timeout); + gro_normal_list(&n->gro); if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ @@ -6349,19 +6641,15 @@ static void skb_defer_free_flush(struct softnet_data *sd) static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { if (!skip_schedule) { - gro_normal_list(napi); + gro_normal_list(&napi->gro); __napi_schedule(napi); return; } - if (napi->gro_bitmask) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - napi_gro_flush(napi, HZ >= 1000); - } + /* Flush too old packets. 
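flush_all_backlogs() used to lean on RTNL to serialize access to static per-CPU work structs; with that assumption going away, each call now allocates its own flush_backlogs array and falls back to one preallocated instance, serialized by a mutex, when the allocation fails. The alloc-or-fallback shape (the fallback here is assumed to be set up once at init, as the kernel presumably does for flush_backlogs_fallback outside this excerpt):

    #include <pthread.h>
    #include <stdlib.h>

    struct work_set {
        int dummy;                  /* per-CPU work items would live here */
    };

    static struct work_set fallback_storage;
    static struct work_set *fallback = &fallback_storage;
    static pthread_mutex_t fallback_lock = PTHREAD_MUTEX_INITIALIZER;

    static void do_flush(void)
    {
        struct work_set *ws = malloc(sizeof(*ws));

        if (!ws) {                  /* cannot fail the operation itself: */
            pthread_mutex_lock(&fallback_lock);
            ws = fallback;          /* one user of the fallback at a time */
        }

        /* ... queue per-CPU work through ws and wait for it ... */

        if (ws != fallback)
            free(ws);
        else
            pthread_mutex_unlock(&fallback_lock);
    }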
If HZ < 1000, flush all packets */ + gro_flush(&napi->gro, HZ >= 1000); + gro_normal_list(&napi->gro); - gro_normal_list(napi); clear_bit(NAPI_STATE_SCHED, &napi->state); } @@ -6468,7 +6756,7 @@ restart: } work = napi_poll(napi, budget); trace_napi_poll(napi, work, budget); - gro_normal_list(napi); + gro_normal_list(&napi->gro); count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), @@ -6568,7 +6856,9 @@ void napi_resume_irqs(unsigned int napi_id) static void __napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id) { - napi->napi_id = napi_id; + napi->gro.cached_napi_id = napi_id; + + WRITE_ONCE(napi->napi_id, napi_id); hlist_add_head_rcu(&napi->napi_hash_node, &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); } @@ -6595,7 +6885,7 @@ static void napi_hash_add(struct napi_struct *napi) /* 0..NR_CPUS range is reserved for sender_cpu use */ do { - if (unlikely(++napi_gen_id < MIN_NAPI_ID)) + if (unlikely(!napi_id_valid(++napi_gen_id))) napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); @@ -6636,22 +6926,13 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void init_gro_hash(struct napi_struct *napi) -{ - int i; - - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - INIT_LIST_HEAD(&napi->gro_hash[i].list); - napi->gro_hash[i].count = 0; - } - napi->gro_bitmask = 0; -} - int dev_set_threaded(struct net_device *dev, bool threaded) { struct napi_struct *napi; int err = 0; + netdev_assert_locked_or_invisible(dev); + if (dev->threaded == threaded) return 0; @@ -6706,8 +6987,7 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, if (WARN_ON_ONCE(napi && !napi->dev)) return; - if (dev->reg_state >= NETREG_REGISTERED) - ASSERT_RTNL(); + netdev_ops_assert_locked_or_invisible(dev); switch (type) { case NETDEV_QUEUE_TYPE_RX: @@ -6724,19 +7004,184 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, } EXPORT_SYMBOL(netif_queue_set_napi); +static void +netif_napi_irq_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) +{ + struct napi_struct *napi = + container_of(notify, struct napi_struct, notify); +#ifdef CONFIG_RFS_ACCEL + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; + int err; +#endif + + if (napi->config && napi->dev->irq_affinity_auto) + cpumask_copy(&napi->config->affinity_mask, mask); + +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); + if (err) + netdev_warn(napi->dev, "RMAP update failed (%d)\n", + err); + } +#endif +} + +#ifdef CONFIG_RFS_ACCEL +static void netif_napi_affinity_release(struct kref *ref) +{ + struct napi_struct *napi = + container_of(ref, struct napi_struct, notify.kref); + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; + + netdev_assert_locked(napi->dev); + WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, + &napi->state)); + + if (!napi->dev->rx_cpu_rmap_auto) + return; + rmap->obj[napi->napi_rmap_idx] = NULL; + napi->napi_rmap_idx = -1; + cpu_rmap_put(rmap); +} + +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) +{ + if (dev->rx_cpu_rmap_auto) + return 0; + + dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs); + if (!dev->rx_cpu_rmap) + return -ENOMEM; + + dev->rx_cpu_rmap_auto = true; + return 0; +} +EXPORT_SYMBOL(netif_enable_cpu_rmap); + +static void netif_del_cpu_rmap(struct net_device *dev) +{ + struct cpu_rmap *rmap = dev->rx_cpu_rmap; + + if (!dev->rx_cpu_rmap_auto) + return; + + /* Free the rmap */ + cpu_rmap_put(rmap); + 
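napi_hash_add() keeps generating IDs above the reserved low range (per the comment in the hunk, 0..NR_CPUS doubles as sender_cpu) and probes until it finds one not already hashed; the open-coded "< MIN_NAPI_ID" tests across the file become napi_id_valid(). The wrap-and-probe allocator in miniature:

    #include <stdbool.h>

    #define MIN_ID 256u             /* stands in for MIN_NAPI_ID */

    static unsigned int gen_id = MIN_ID;

    static bool id_valid(unsigned int id)
    {
        return id >= MIN_ID;        /* napi_id_valid() equivalent */
    }

    /* Stand-in for napi_by_id(); the model's table is always empty. */
    static bool id_in_use(unsigned int id)
    {
        (void)id;
        return false;
    }

    static unsigned int alloc_id(void)
    {
        do {
            /* ++gen_id can wrap past UINT_MAX back into the reserved
             * range; restart above it, exactly as the hunk does. */
            if (!id_valid(++gen_id))
                gen_id = MIN_ID;
        } while (id_in_use(gen_id));
        return gen_id;
    }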
dev->rx_cpu_rmap = NULL; + dev->rx_cpu_rmap_auto = false; +} + +#else +static void netif_napi_affinity_release(struct kref *ref) +{ +} + +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) +{ + return 0; +} +EXPORT_SYMBOL(netif_enable_cpu_rmap); + +static void netif_del_cpu_rmap(struct net_device *dev) +{ +} +#endif + +void netif_set_affinity_auto(struct net_device *dev) +{ + unsigned int i, maxqs, numa; + + maxqs = max(dev->num_tx_queues, dev->num_rx_queues); + numa = dev_to_node(&dev->dev); + + for (i = 0; i < maxqs; i++) + cpumask_set_cpu(cpumask_local_spread(i, numa), + &dev->napi_config[i].affinity_mask); + + dev->irq_affinity_auto = true; +} +EXPORT_SYMBOL(netif_set_affinity_auto); + +void netif_napi_set_irq_locked(struct napi_struct *napi, int irq) +{ + int rc; + + netdev_assert_locked_or_invisible(napi->dev); + + if (napi->irq == irq) + return; + + /* Remove existing resources */ + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) + irq_set_affinity_notifier(napi->irq, NULL); + + napi->irq = irq; + if (irq < 0 || + (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto)) + return; + + /* Abort for buggy drivers */ + if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config)) + return; + +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi); + if (rc < 0) + return; + + cpu_rmap_get(napi->dev->rx_cpu_rmap); + napi->napi_rmap_idx = rc; + } +#endif + + /* Use core IRQ notifier */ + napi->notify.notify = netif_napi_irq_notify; + napi->notify.release = netif_napi_affinity_release; + rc = irq_set_affinity_notifier(irq, &napi->notify); + if (rc) { + netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n", + rc); + goto put_rmap; + } + + set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state); + return; + +put_rmap: +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL; + cpu_rmap_put(napi->dev->rx_cpu_rmap); + napi->napi_rmap_idx = -1; + } +#endif + napi->notify.notify = NULL; + napi->notify.release = NULL; +} +EXPORT_SYMBOL(netif_napi_set_irq_locked); + static void napi_restore_config(struct napi_struct *n) { n->defer_hard_irqs = n->config->defer_hard_irqs; n->gro_flush_timeout = n->config->gro_flush_timeout; n->irq_suspend_timeout = n->config->irq_suspend_timeout; + + if (n->dev->irq_affinity_auto && + test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state)) + irq_set_affinity(n->irq, &n->config->affinity_mask); + /* a NAPI ID might be stored in the config, if so use it. if not, use - * napi_hash_add to generate one for us. It will be saved to the config - * in napi_disable. + * napi_hash_add to generate one for us. */ - if (n->config->napi_id) + if (n->config->napi_id) { napi_hash_add_with_id(n, n->config->napi_id); - else + } else { napi_hash_add(n); + n->config->napi_id = n->napi_id; + } } static void napi_save_config(struct napi_struct *n) @@ -6744,24 +7189,70 @@ static void napi_save_config(struct napi_struct *n) n->config->defer_hard_irqs = n->defer_hard_irqs; n->config->gro_flush_timeout = n->gro_flush_timeout; n->config->irq_suspend_timeout = n->irq_suspend_timeout; - n->config->napi_id = n->napi_id; napi_hash_del(n); } -void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, - int (*poll)(struct napi_struct *, int), int weight) +/* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will + * inherit an existing ID try to insert it at the right position. 
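netif_napi_set_irq_locked() is a textbook partial-setup unwind: the rmap slot is claimed first, and if registering the affinity notifier fails afterwards, the put_rmap label releases exactly what was acquired, in reverse order. The skeleton, with hypothetical stand-ins for the two resources:

    #include <stdbool.h>

    static bool claim_slot(void)        { return true; }
    static void release_slot(void)      { }
    static bool register_notifier(void) { return false; /* fails, for the model */ }

    static int setup(void)
    {
        if (!claim_slot())
            return -1;

        if (!register_notifier())
            goto put_slot;          /* undo only what already succeeded */

        return 0;

    put_slot:
        release_slot();
        return -1;
    }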
+ */ +static void +netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) +{ + unsigned int new_id, pos_id; + struct list_head *higher; + struct napi_struct *pos; + + new_id = UINT_MAX; + if (napi->config && napi->config->napi_id) + new_id = napi->config->napi_id; + + higher = &dev->napi_list; + list_for_each_entry(pos, &dev->napi_list, dev_list) { + if (napi_id_valid(pos->napi_id)) + pos_id = pos->napi_id; + else if (pos->config) + pos_id = pos->config->napi_id; + else + pos_id = UINT_MAX; + + if (pos_id <= new_id) + break; + higher = &pos->dev_list; + } + list_add_rcu(&napi->dev_list, higher); /* adds after higher */ +} + +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. + */ +static void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + +void netif_napi_add_weight_locked(struct net_device *dev, + struct napi_struct *napi, + int (*poll)(struct napi_struct *, int), + int weight) { + netdev_assert_locked(dev); if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state))) return; INIT_LIST_HEAD(&napi->poll_list); INIT_HLIST_NODE(&napi->napi_hash_node); - hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - napi->timer.function = napi_watchdog; - init_gro_hash(napi); + hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); + gro_init(&napi->gro); napi->skb = NULL; - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) netdev_err_once(dev, "%s() called with weight %d\n", __func__, @@ -6774,7 +7265,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, napi->list_owner = -1; set_bit(NAPI_STATE_SCHED, &napi->state); set_bit(NAPI_STATE_NPSVC, &napi->state); - list_add_rcu(&napi->dev_list, &dev->napi_list); + netif_napi_dev_list_add(dev, napi); /* default settings from sysfs are applied to all NAPIs. any per-NAPI * configuration will be loaded in napi_enable @@ -6789,15 +7280,17 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, */ if (dev->threaded && napi_kthread_create(napi)) dev->threaded = false; - netif_napi_set_irq(napi, -1); + netif_napi_set_irq_locked(napi, -1); } -EXPORT_SYMBOL(netif_napi_add_weight); +EXPORT_SYMBOL(netif_napi_add_weight_locked); -void napi_disable(struct napi_struct *n) +void napi_disable_locked(struct napi_struct *n) { unsigned long val, new; might_sleep(); + netdev_assert_locked(n->dev); + set_bit(NAPI_STATE_DISABLE, &n->state); val = READ_ONCE(n->state); @@ -6820,16 +7313,25 @@ void napi_disable(struct napi_struct *n) clear_bit(NAPI_STATE_DISABLE, &n->state); } -EXPORT_SYMBOL(napi_disable); +EXPORT_SYMBOL(napi_disable_locked); /** - * napi_enable - enable NAPI scheduling - * @n: NAPI context + * napi_disable() - prevent NAPI from scheduling + * @n: NAPI context * - * Resume NAPI from being scheduled on this context. - * Must be paired with napi_disable. + * Stop NAPI from being scheduled on this context. + * Waits till any outstanding processing completes. + * Takes netdev_lock() for associated net_device. 
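netif_napi_dev_list_add() keeps dev->napi_list ordered with larger NAPI IDs first, so netlink dumps stay sorted even when a NAPI inherits a pre-existing ID from its config slot: walk until the first entry whose ID is not greater than the new one, and insert in front of it. The same insertion on a singly linked list:

    #include <stddef.h>

    struct napi {
        unsigned int id;
        struct napi *next;
    };

    /* Insert so the list stays sorted by descending id. */
    static void insert_sorted(struct napi **head, struct napi *new_napi)
    {
        struct napi **pp = head;

        while (*pp && (*pp)->id > new_napi->id)
            pp = &(*pp)->next;      /* skip entries with larger ids */

        new_napi->next = *pp;       /* lands before the first id <= new */
        *pp = new_napi;
    }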
*/ -void napi_enable(struct napi_struct *n) +void napi_disable(struct napi_struct *n) +{ + netdev_lock(n->dev); + napi_disable_locked(n); + netdev_unlock(n->dev); +} +EXPORT_SYMBOL(napi_disable); + +void napi_enable_locked(struct napi_struct *n) { unsigned long new, val = READ_ONCE(n->state); @@ -6846,27 +7348,38 @@ void napi_enable(struct napi_struct *n) new |= NAPIF_STATE_THREADED; } while (!try_cmpxchg(&n->state, &val, new)); } -EXPORT_SYMBOL(napi_enable); +EXPORT_SYMBOL(napi_enable_locked); -static void flush_gro_hash(struct napi_struct *napi) +/** + * napi_enable() - enable NAPI scheduling + * @n: NAPI context + * + * Enable scheduling of a NAPI instance. + * Must be paired with napi_disable(). + * Takes netdev_lock() for associated net_device. + */ +void napi_enable(struct napi_struct *n) { - int i; - - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - struct sk_buff *skb, *n; - - list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) - kfree_skb(skb); - napi->gro_hash[i].count = 0; - } + netdev_lock(n->dev); + napi_enable_locked(n); + netdev_unlock(n->dev); } +EXPORT_SYMBOL(napi_enable); /* Must be called in process context */ -void __netif_napi_del(struct napi_struct *napi) +void __netif_napi_del_locked(struct napi_struct *napi) { + netdev_assert_locked(napi->dev); + if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) return; + /* Make sure NAPI is disabled (or was never enabled). */ + WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); + + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) + irq_set_affinity_notifier(napi->irq, NULL); + if (napi->config) { napi->index = -1; napi->config = NULL; @@ -6875,15 +7388,14 @@ void __netif_napi_del(struct napi_struct *napi) list_del_rcu(&napi->dev_list); napi_free_frags(napi); - flush_gro_hash(napi); - napi->gro_bitmask = 0; + gro_cleanup(&napi->gro); if (napi->thread) { kthread_stop(napi->thread); napi->thread = NULL; } } -EXPORT_SYMBOL(__netif_napi_del); +EXPORT_SYMBOL(__netif_napi_del_locked); static int __napi_poll(struct napi_struct *n, bool *repoll) { @@ -6935,14 +7447,9 @@ static int __napi_poll(struct napi_struct *n, bool *repoll) return work; } - if (n->gro_bitmask) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - napi_gro_flush(n, HZ >= 1000); - } - - gro_normal_list(n); + /* Flush too old packets. If HZ < 1000, flush all packets */ + gro_flush(&n->gro, HZ >= 1000); + gro_normal_list(&n->gro); /* Some drivers may have called napi_schedule * prior to exhausting their budget. @@ -6970,9 +7477,14 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) work = __napi_poll(n, &do_repoll); - if (do_repoll) + if (do_repoll) { +#if defined(CONFIG_DEBUG_NET) + if (unlikely(!napi_is_scheduled(n))) + pr_crit("repoll requested for device %s %ps but napi is not scheduled.\n", + n->dev->name, n->poll); +#endif list_add_tail(&n->poll_list, repoll); - + } netpoll_poll_unlock(have); return work; @@ -7098,7 +7610,8 @@ start: */ if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit))) { - sd->time_squeeze++; + /* Pairs with READ_ONCE() in softnet_seq_show() */ + WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1); break; } } @@ -8771,23 +9284,20 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) dev_change_rx_flags(dev, IFF_PROMISC); } - if (notify) + if (notify) { + /* The ops lock is only required to ensure consistent locking + * for `NETDEV_CHANGE` notifiers. 
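napi_disable()/napi_enable() (and __netif_napi_del() below) become thin wrappers that take the device instance lock around a _locked variant, so paths already holding netdev_lock() call the _locked form directly while everyone else keeps the old calling convention. The pattern in miniature:

    #include <pthread.h>

    struct dev {
        pthread_mutex_t lock;
    };

    /* Core operation: the caller must hold d->lock, which the kernel
     * enforces with netdev_assert_locked(). */
    static void op_locked(struct dev *d)
    {
        (void)d;                    /* ... the actual state change ... */
    }

    /* Convenience wrapper for callers that do not hold the lock. */
    static void op(struct dev *d)
    {
        pthread_mutex_lock(&d->lock);
        op_locked(d);
        pthread_mutex_unlock(&d->lock);
    }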
This function is sometimes + * called without the lock, even for devices that are ops + * locked, such as in `dev_uc_sync_multiple` when using + * bonding or teaming. + */ + netdev_ops_assert_locked(dev); __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL); + } return 0; } -/** - * dev_set_promiscuity - update promiscuity count on a device - * @dev: device - * @inc: modifier - * - * Add or remove promiscuity from a device. While the count in the device - * remains above zero the interface remains promiscuous. Once it hits zero - * the device reverts back to normal filtering operation. A negative inc - * value is used to drop promiscuity on the device. - * Return 0 if successful or a negative errno code on error. - */ -int dev_set_promiscuity(struct net_device *dev, int inc) +int netif_set_promiscuity(struct net_device *dev, int inc) { unsigned int old_flags = dev->flags; int err; @@ -8799,9 +9309,8 @@ int dev_set_promiscuity(struct net_device *dev, int inc) dev_set_rx_mode(dev); return err; } -EXPORT_SYMBOL(dev_set_promiscuity); -static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) +int netif_set_allmulti(struct net_device *dev, int inc, bool notify) { unsigned int old_flags = dev->flags, old_gflags = dev->gflags; unsigned int allmulti, flags; @@ -8836,25 +9345,6 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) return 0; } -/** - * dev_set_allmulti - update allmulti count on a device - * @dev: device - * @inc: modifier - * - * Add or remove reception of all multicast frames to a device. While the - * count in the device remains above zero the interface remains listening - * to all interfaces. Once it hits zero the device reverts back to normal - * filtering operation. A negative @inc value is used to drop the counter - * when releasing a resource needing all multicasts. - * Return 0 if successful or a negative errno code on error. - */ - -int dev_set_allmulti(struct net_device *dev, int inc) -{ - return __dev_set_allmulti(dev, inc, true); -} -EXPORT_SYMBOL(dev_set_allmulti); - /* * Upload unicast and multicast address lists to device and * configure RX filtering. When the device doesn't support unicast @@ -8970,7 +9460,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; - unsigned int old_flags = dev->flags; + old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; @@ -8987,7 +9477,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, int inc = (flags & IFF_ALLMULTI) ? 1 : -1; dev->gflags ^= IFF_ALLMULTI; - __dev_set_allmulti(dev, inc, false); + netif_set_allmulti(dev, inc, false); } return ret; @@ -9022,17 +9512,8 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, } } -/** - * dev_change_flags - change device settings - * @dev: device - * @flags: device state flags - * @extack: netlink extended ack - * - * Change settings on device based state flags. The flags are - * in the userspace exported format. 
- */ -int dev_change_flags(struct net_device *dev, unsigned int flags, - struct netlink_ext_ack *extack) +int netif_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) { int ret; unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; @@ -9045,7 +9526,6 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, __dev_notify_flags(dev, old_flags, changes, 0, NULL); return ret; } -EXPORT_SYMBOL(dev_change_flags); int __dev_set_mtu(struct net_device *dev, int new_mtu) { @@ -9077,15 +9557,15 @@ int dev_validate_mtu(struct net_device *dev, int new_mtu, } /** - * dev_set_mtu_ext - Change maximum transfer unit + * netif_set_mtu_ext - Change maximum transfer unit * @dev: device * @new_mtu: new transfer unit * @extack: netlink extended ack * * Change the maximum transfer size of the network device. */ -int dev_set_mtu_ext(struct net_device *dev, int new_mtu, - struct netlink_ext_ack *extack) +int netif_set_mtu_ext(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack) { int err, orig_mtu; @@ -9123,25 +9603,20 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu, return err; } -int dev_set_mtu(struct net_device *dev, int new_mtu) +int netif_set_mtu(struct net_device *dev, int new_mtu) { struct netlink_ext_ack extack; int err; memset(&extack, 0, sizeof(extack)); - err = dev_set_mtu_ext(dev, new_mtu, &extack); + err = netif_set_mtu_ext(dev, new_mtu, &extack); if (err && extack._msg) net_err_ratelimited("%s: %s\n", dev->name, extack._msg); return err; } -EXPORT_SYMBOL(dev_set_mtu); +EXPORT_SYMBOL(netif_set_mtu); -/** - * dev_change_tx_queue_len - Change TX queue length of a netdevice - * @dev: device - * @new_len: new tx queue length - */ -int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) +int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len) { unsigned int orig_len = dev->tx_queue_len; int res; @@ -9168,12 +9643,7 @@ err_rollback: return res; } -/** - * dev_set_group - Change group this device belongs to - * @dev: device - * @new_group: group this device should belong to - */ -void dev_set_group(struct net_device *dev, int new_group) +void netif_set_group(struct net_device *dev, int new_group) { dev->group = new_group; } @@ -9199,31 +9669,23 @@ int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr, } EXPORT_SYMBOL(dev_pre_changeaddr_notify); -/** - * dev_set_mac_address - Change Media Access Control Address - * @dev: device - * @sa: new address - * @extack: netlink extended ack - * - * Change the hardware (MAC) address of the device - */ -int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, - struct netlink_ext_ack *extack) +int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, + struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; int err; if (!ops->ndo_set_mac_address) return -EOPNOTSUPP; - if (sa->sa_family != dev->type) + if (ss->ss_family != dev->type) return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack); + err = dev_pre_changeaddr_notify(dev, ss->__data, extack); if (err) return err; - if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) { - err = ops->ndo_set_mac_address(dev, sa); + if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) { + err = ops->ndo_set_mac_address(dev, ss); if (err) return err; } @@ -9232,22 +9694,10 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, 
add_device_randomness(dev->dev_addr, dev->addr_len); return 0; } -EXPORT_SYMBOL(dev_set_mac_address); DECLARE_RWSEM(dev_addr_sem); -int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, - struct netlink_ext_ack *extack) -{ - int ret; - - down_write(&dev_addr_sem); - ret = dev_set_mac_address(dev, sa, extack); - up_write(&dev_addr_sem); - return ret; -} -EXPORT_SYMBOL(dev_set_mac_address_user); - +/* "sa" is a true struct sockaddr with limited "sa_data" member. */ int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { size_t size = sizeof(sa->sa_data_min); @@ -9276,14 +9726,7 @@ unlock: } EXPORT_SYMBOL(dev_get_mac_address); -/** - * dev_change_carrier - Change device carrier - * @dev: device - * @new_carrier: new value - * - * Change device carrier - */ -int dev_change_carrier(struct net_device *dev, bool new_carrier) +int netif_change_carrier(struct net_device *dev, bool new_carrier) { const struct net_device_ops *ops = dev->netdev_ops; @@ -9394,13 +9837,7 @@ bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b) } EXPORT_SYMBOL(netdev_port_same_parent_id); -/** - * dev_change_proto_down - set carrier according to proto_down. - * - * @dev: device - * @proto_down: new value - */ -int dev_change_proto_down(struct net_device *dev, bool proto_down) +int netif_change_proto_down(struct net_device *dev, bool proto_down) { if (!dev->change_proto_down) return -EOPNOTSUPP; @@ -9415,14 +9852,14 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } /** - * dev_change_proto_down_reason - proto down reason + * netdev_change_proto_down_reason_locked - proto down reason * * @dev: device * @mask: proto down mask * @value: proto down value */ -void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, - u32 value) +void netdev_change_proto_down_reason_locked(struct net_device *dev, + unsigned long mask, u32 value) { u32 proto_down_reason; int b; @@ -9499,11 +9936,31 @@ u8 dev_xdp_prog_count(struct net_device *dev) } EXPORT_SYMBOL_GPL(dev_xdp_prog_count); -int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) +u8 dev_xdp_sb_prog_count(struct net_device *dev) +{ + u8 count = 0; + int i; + + for (i = 0; i < __MAX_XDP_MODE; i++) + if (dev->xdp_state[i].prog && + !dev->xdp_state[i].prog->aux->xdp_has_frags) + count++; + return count; +} + +int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) { if (!dev->netdev_ops->ndo_bpf) return -EOPNOTSUPP; + if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + bpf->command == XDP_SETUP_PROG && + bpf->prog && !bpf->prog->aux->xdp_has_frags) { + NL_SET_ERR_MSG(bpf->extack, + "unable to propagate XDP to device using tcp-data-split"); + return -EBUSY; + } + if (dev_get_min_mp_channel_count(dev)) { NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider"); return -EBUSY; @@ -9511,7 +9968,7 @@ int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) return dev->netdev_ops->ndo_bpf(dev, bpf); } -EXPORT_SYMBOL_GPL(dev_xdp_propagate); +EXPORT_SYMBOL_GPL(netif_xdp_propagate); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { @@ -9541,6 +9998,14 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, struct netdev_bpf xdp; int err; + netdev_ops_assert_locked(dev); + + if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + prog && !prog->aux->xdp_has_frags) { + NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split"); + return 
-EBUSY; + } + if (dev_get_min_mp_channel_count(dev)) { NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider"); return -EBUSY; @@ -9694,6 +10159,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack NL_SET_ERR_MSG(extack, "Program bound to different device"); return -EINVAL; } + if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) { + NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode"); + return -EINVAL; + } if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); return -EINVAL; @@ -9763,7 +10232,9 @@ static void bpf_xdp_link_release(struct bpf_link *link) * already NULL, in which case link was already auto-detached */ if (xdp_link->dev) { + netdev_lock_ops(xdp_link->dev); WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + netdev_unlock_ops(xdp_link->dev); xdp_link->dev = NULL; } @@ -9845,10 +10316,12 @@ static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, goto out_unlock; } + netdev_lock_ops(xdp_link->dev); mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags); bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, xdp_link->flags, new_prog); + netdev_unlock_ops(xdp_link->dev); if (err) goto out_unlock; @@ -9901,7 +10374,9 @@ int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) goto unlock; } + netdev_lock_ops(dev); err = dev_xdp_attach_link(dev, &extack, link); + netdev_unlock_ops(dev); rtnl_unlock(); if (err) { @@ -9974,7 +10449,7 @@ u32 dev_get_min_mp_channel_count(const struct net_device *dev) { int i; - ASSERT_RTNL(); + netdev_ops_assert_locked(dev); for (i = dev->real_num_rx_queues - 1; i >= 0; i--) if (dev->_rx[i].mp_params.mp_priv) @@ -10021,6 +10496,15 @@ static void dev_index_release(struct net *net, int ifindex) WARN_ON(xa_erase(&net->dev_by_index, ifindex)); } +static bool from_cleanup_net(void) +{ +#ifdef CONFIG_NET_NS + return current == READ_ONCE(cleanup_net_task); +#else + return false; +#endif +} + /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); @@ -10063,6 +10547,7 @@ static void netdev_sync_lower_features(struct net_device *upper, if (!(features & feature) && (lower->features & feature)) { netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", &feature, lower->name); + netdev_lock_ops(lower); lower->wanted_features &= ~feature; __netdev_update_features(lower); @@ -10071,6 +10556,7 @@ static void netdev_sync_lower_features(struct net_device *upper, &feature, lower->name); else netdev_features_change(lower); + netdev_unlock_ops(lower); } } } @@ -10191,6 +10677,7 @@ int __netdev_update_features(struct net_device *dev) int err = -1; ASSERT_RTNL(); + netdev_ops_assert_locked(dev); features = netdev_get_wanted_features(dev); @@ -10617,12 +11104,16 @@ int register_netdevice(struct net_device *dev) ret = netdev_register_kobject(dev); + netdev_lock(dev); WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED); + netdev_unlock(dev); if (ret) goto err_uninit_notify; + netdev_lock_ops(dev); __netdev_update_features(dev); + netdev_unlock_ops(dev); /* * Default initial state at registry is that the @@ -10648,7 +11139,9 @@ int register_netdevice(struct net_device *dev) memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); /* Notify protocols, that a new device appeared. 
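* NETDEV_REGISTER is now delivered with the instance lock held * for ops-locked devices (see the netdev_lock_ops() pair below).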
*/ + netdev_lock_ops(dev); ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); + netdev_unlock_ops(dev); ret = notifier_to_errno(ret); if (ret) { /* Expect explicit free_netdev() on failure */ @@ -10660,8 +11153,7 @@ int register_netdevice(struct net_device *dev) * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); out: @@ -10685,26 +11177,20 @@ err_free_name: EXPORT_SYMBOL(register_netdevice); /* Initialize the core of a dummy net device. - * This is useful if you are calling this function after alloc_netdev(), - * since it does not memset the net_device fields. + * The setup steps dummy netdevs need which normal netdevs get by going + * through register_netdevice(). */ -static void init_dummy_netdev_core(struct net_device *dev) +static void init_dummy_netdev(struct net_device *dev) { /* make sure we BUG if trying to hit standard * register/unregister code path */ dev->reg_state = NETREG_DUMMY; - /* NAPI wants this */ - INIT_LIST_HEAD(&dev->napi_list); - /* a dummy interface is started by default */ set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); - /* napi_busy_loop stats accounting wants this */ - dev_net_set(dev, &init_net); - /* Note : We dont allocate pcpu_refcnt for dummy devices, * because users of this 'device' dont need to change * its refcount. @@ -10712,28 +11198,6 @@ static void init_dummy_netdev_core(struct net_device *dev) } /** - * init_dummy_netdev - init a dummy network device for NAPI - * @dev: device to init - * - * This takes a network device structure and initializes the minimum - * amount of fields so it can be used to schedule NAPI polls without - * registering a full blown interface. This is to be used by drivers - * that need to tie several hardware interfaces to a single NAPI - * poll scheduler due to HW limitations. - */ -void init_dummy_netdev(struct net_device *dev) -{ - /* Clear everything. 
Note we don't initialize spinlocks - * as they aren't supposed to be taken by any of the - * NAPI code and this dummy netdev is supposed to be - * only ever used for NAPI polls - */ - memset(dev, 0, sizeof(struct net_device)); - init_dummy_netdev_core(dev); -} -EXPORT_SYMBOL_GPL(init_dummy_netdev); - -/** * register_netdev - register a network device * @dev: device to register * @@ -10748,12 +11212,16 @@ EXPORT_SYMBOL_GPL(init_dummy_netdev); */ int register_netdev(struct net_device *dev) { + struct net *net = dev_net(dev); int err; - if (rtnl_lock_killable()) + if (rtnl_net_lock_killable(net)) return -EINTR; + err = register_netdevice(dev); - rtnl_unlock(); + + rtnl_net_unlock(net); + return err; } EXPORT_SYMBOL(register_netdev); @@ -10891,9 +11359,8 @@ void netdev_run_todo(void) list_replace_init(&net_unlink_list, &unlink_list); while (!list_empty(&unlink_list)) { - struct net_device *dev = list_first_entry(&unlink_list, - struct net_device, - unlink_list); + dev = list_first_entry(&unlink_list, struct net_device, + unlink_list); list_del_init(&dev->unlink_list); dev->nested_level = dev->lower_level - 1; } @@ -10915,7 +11382,9 @@ void netdev_run_todo(void) continue; } + netdev_lock(dev); WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); + netdev_unlock(dev); linkwatch_sync_dev(dev); } @@ -11063,6 +11532,20 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, const struct net_device_ops *ops = dev->netdev_ops; const struct net_device_core_stats __percpu *p; + /* + * IPv{4,6} and udp tunnels share common stat helpers and use + * different stat type (NETDEV_PCPU_STAT_TSTATS vs + * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent. + */ + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) != + offsetof(struct pcpu_dstats, rx_bytes)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) != + offsetof(struct pcpu_dstats, rx_packets)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) != + offsetof(struct pcpu_dstats, tx_bytes)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) != + offsetof(struct pcpu_dstats, tx_packets)); + if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); ops->ndo_get_stats64(dev, storage); @@ -11301,6 +11784,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (!dev->ethtool) goto free_all; + dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT); + if (!dev->cfg) + goto free_all; + dev->cfg_pending = dev->cfg; + napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config)); dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); if (!dev->napi_config) @@ -11330,6 +11818,22 @@ free_dev: } EXPORT_SYMBOL(alloc_netdev_mqs); +static void netdev_napi_exit(struct net_device *dev) +{ + if (!list_empty(&dev->napi_list)) { + struct napi_struct *p, *n; + + netdev_lock(dev); + list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) + __netif_napi_del_locked(p); + netdev_unlock(dev); + + synchronize_net(); + } + + kvfree(dev->napi_config); +} + /** * free_netdev - free network device * @dev: device @@ -11341,8 +11845,6 @@ EXPORT_SYMBOL(alloc_netdev_mqs); */ void free_netdev(struct net_device *dev) { - struct napi_struct *p, *n; - might_sleep(); /* When called immediately after register_netdevice() failed the unwind @@ -11355,8 +11857,8 @@ void free_netdev(struct net_device *dev) return; } - mutex_destroy(&dev->lock); - + WARN_ON(dev->cfg != dev->cfg_pending); + kfree(dev->cfg); kfree(dev->ethtool); netif_free_tx_queues(dev); netif_free_rx_queues(dev); @@ -11366,10 
+11868,9 @@ void free_netdev(struct net_device *dev) /* Flush device addresses */ dev_addr_flush(dev); - list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) - netif_napi_del(p); + netdev_napi_exit(dev); - kvfree(dev->napi_config); + netif_del_cpu_rmap(dev); ref_tracker_dir_exit(&dev->refcnt_tracker); #ifdef CONFIG_PCPU_DEV_REFCNT @@ -11383,6 +11884,8 @@ void free_netdev(struct net_device *dev) netdev_free_phy_link_topology(dev); + mutex_destroy(&dev->lock); + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED || dev->reg_state == NETREG_DUMMY) { @@ -11407,7 +11910,7 @@ EXPORT_SYMBOL(free_netdev); struct net_device *alloc_netdev_dummy(int sizeof_priv) { return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN, - init_dummy_netdev_core); + init_dummy_netdev); } EXPORT_SYMBOL_GPL(alloc_netdev_dummy); @@ -11420,7 +11923,7 @@ EXPORT_SYMBOL_GPL(alloc_netdev_dummy); void synchronize_net(void) { might_sleep(); - if (rtnl_is_locked()) + if (from_cleanup_net() || rtnl_is_locked()) synchronize_rcu_expedited(); else synchronize_rcu(); @@ -11483,6 +11986,19 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) } EXPORT_SYMBOL(unregister_netdevice_queue); +static void dev_memory_provider_uninstall(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct netdev_rx_queue *rxq = &dev->_rx[i]; + struct pp_memory_provider_params *p = &rxq->mp_params; + + if (p->mp_ops && p->mp_ops->uninstall) + p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq); + } +} + void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh) { @@ -11513,15 +12029,29 @@ void unregister_netdevice_many_notify(struct list_head *head, BUG_ON(dev->reg_state != NETREG_REGISTERED); } - /* If device is running, close it first. */ - list_for_each_entry(dev, head, unreg_list) - list_add_tail(&dev->close_list, &close_head); + /* If device is running, close it first. Start with ops locked... */ + list_for_each_entry(dev, head, unreg_list) { + if (netdev_need_ops_lock(dev)) { + list_add_tail(&dev->close_list, &close_head); + netdev_lock(dev); + } + } + dev_close_many(&close_head, true); + /* ... now unlock them and go over the rest. */ + list_for_each_entry(dev, head, unreg_list) { + if (netdev_need_ops_lock(dev)) + netdev_unlock(dev); + else + list_add_tail(&dev->close_list, &close_head); + } dev_close_many(&close_head, true); list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. */ unlist_netdevice(dev); + netdev_lock(dev); WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); + netdev_unlock(dev); } flush_all_backlogs(); @@ -11531,11 +12061,13 @@ void unregister_netdevice_many_notify(struct list_head *head, struct sk_buff *skb = NULL; /* Shutdown queueing discipline. 
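* Qdisc, TCX, XDP and memory-provider state are all torn down * under the instance lock taken just below.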
*/ + netdev_lock_ops(dev); dev_shutdown(dev); dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); + dev_memory_provider_uninstall(dev); + netdev_unlock_ops(dev); bpf_dev_bound_netdev_unregister(dev); - dev_dmabuf_uninstall(dev); netdev_offload_xstats_disable_all(dev); @@ -11544,8 +12076,7 @@ void unregister_netdevice_many_notify(struct list_head *head, */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); - if (!dev->rtnl_link_ops || - dev->rtnl_link_state == RTNL_LINK_INITIALIZED) + if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing)) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL, NULL, 0, portid, nlh); @@ -11623,30 +12154,15 @@ EXPORT_SYMBOL(unregister_netdevice_many); */ void unregister_netdev(struct net_device *dev) { - rtnl_lock(); + rtnl_net_dev_lock(dev); unregister_netdevice(dev); - rtnl_unlock(); + rtnl_net_dev_unlock(dev); } EXPORT_SYMBOL(unregister_netdev); -/** - * __dev_change_net_namespace - move device to different nethost namespace - * @dev: device - * @net: network namespace - * @pat: If not NULL name pattern to try if the current device name - * is already taken in the destination network namespace. - * @new_ifindex: If not zero, specifies device index in the target - * namespace. - * - * This function shuts down a device interface and moves it - * to a new network namespace. On success 0 is returned, on - * a failure a netagive errno code is returned. - * - * Callers must hold the rtnl semaphore. - */ - int __dev_change_net_namespace(struct net_device *dev, struct net *net, - const char *pat, int new_ifindex) + const char *pat, int new_ifindex, + struct netlink_ext_ack *extack) { struct netdev_name_node *name_node; struct net *net_old = dev_net(dev); @@ -11657,12 +12173,16 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, /* Don't allow namespace local devices to be moved. */ err = -EINVAL; - if (dev->netns_local) + if (dev->netns_immutable) { + NL_SET_ERR_MSG(extack, "The interface netns is immutable"); goto out; + } /* Ensure the device has been registered */ - if (dev->reg_state != NETREG_REGISTERED) + if (dev->reg_state != NETREG_REGISTERED) { + NL_SET_ERR_MSG(extack, "The interface isn't registered"); goto out; + } /* Get out if there is nothing todo */ err = 0; @@ -11675,30 +12195,49 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, err = -EEXIST; if (netdev_name_in_use(net, dev->name)) { /* We get here if we can't use the current device name */ - if (!pat) + if (!pat) { + NL_SET_ERR_MSG(extack, + "An interface with the same name exists in the target netns"); goto out; + } err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG_FMT(extack, + "Unable to use '%s' for the new interface name in the target netns", + pat); goto out; + } } /* Check that none of the altnames conflicts. */ err = -EEXIST; - netdev_for_each_altname(dev, name_node) - if (netdev_name_in_use(net, name_node->name)) + netdev_for_each_altname(dev, name_node) { + if (netdev_name_in_use(net, name_node->name)) { + NL_SET_ERR_MSG_FMT(extack, + "An interface with the altname %s exists in the target netns", + name_node->name); goto out; + } + } /* Check that new_ifindex isn't used yet. 
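* dev_index_reserve() fails with -EBUSY on a collision; passing * an ifindex of 0 requests any free index instead.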
*/ if (new_ifindex) { err = dev_index_reserve(net, new_ifindex); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG_FMT(extack, + "The ifindex %d is not available in the target netns", + new_ifindex); goto out; + } } else { /* If there is an ifindex conflict assign a new one */ err = dev_index_reserve(net, dev->ifindex); if (err == -EBUSY) err = dev_index_reserve(net, 0); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG(extack, + "Unable to allocate a new ifindex in the target netns"); goto out; + } new_ifindex = err; } @@ -11706,16 +12245,23 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, * And now a mini version of register_netdevice unregister_netdevice. */ + netdev_lock_ops(dev); /* If device is running close it first. */ - dev_close(dev); - + netif_close(dev); /* And unlink it from device chain */ unlist_netdevice(dev); + if (!netdev_need_ops_lock(dev)) + netdev_lock(dev); + dev->moving_ns = true; + netdev_unlock(dev); + synchronize_net(); /* Shutdown queueing discipline. */ + netdev_lock_ops(dev); dev_shutdown(dev); + netdev_unlock_ops(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. @@ -11746,7 +12292,9 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, move_netdevice_notifiers_dev_net(dev, net); /* Actually switch the network namespace */ + netdev_lock(dev); dev_net_set(dev, net); + netdev_unlock(dev); dev->ifindex = new_ifindex; if (new_name[0]) { @@ -11772,11 +12320,16 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, err = netdev_change_owner(dev, net_old, net); WARN_ON(err); + netdev_lock(dev); + dev->moving_ns = false; + if (!netdev_need_ops_lock(dev)) + netdev_unlock(dev); + /* Add the device back in the hashes */ list_netdevice(dev); - /* Notify protocols, that a new device appeared. */ call_netdevice_notifiers(NETDEV_REGISTER, dev); + netdev_unlock_ops(dev); /* * Prevent userspace races by waiting until the network @@ -11789,7 +12342,6 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, out: return err; } -EXPORT_SYMBOL_GPL(__dev_change_net_namespace); static int dev_cpu_dead(unsigned int oldcpu) { @@ -11904,7 +12456,7 @@ static struct hlist_head * __net_init netdev_create_hash(void) static int __net_init netdev_init(struct net *net) { BUILD_BUG_ON(GRO_HASH_BUCKETS > - 8 * sizeof_field(struct napi_struct, gro_bitmask)); + BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask)); INIT_LIST_HEAD(&net->dev_base_head); @@ -12039,7 +12591,7 @@ static void __net_exit default_device_exit_net(struct net *net) char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. loopback) */ - if (dev->netns_local) + if (dev->netns_immutable) continue; /* Leave virtual devices for the generic cleanup */ @@ -12171,12 +12723,19 @@ static int net_page_pool_create(int cpuid) .nid = cpu_to_mem(cpuid), }; struct page_pool *pp_ptr; + int err; pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid); if (IS_ERR(pp_ptr)) return -ENOMEM; - per_cpu(system_page_pool, cpuid) = pp_ptr; + err = xdp_reg_page_pool(pp_ptr); + if (err) { + page_pool_destroy(pp_ptr); + return err; + } + + per_cpu(system_page_pool.pool, cpuid) = pp_ptr; #endif return 0; } @@ -12241,12 +12800,13 @@ static int __init net_dev_init(void) * Initialise the packet receive queues. 
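* Each CPU gets a backlog NAPI whose GRO state now lives in * sd->backlog.gro (initialised via gro_init() below).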
*/ + flush_backlogs_fallback = flush_backlogs_alloc(); + if (!flush_backlogs_fallback) + goto out; + for_each_possible_cpu(i) { - struct work_struct *flush = per_cpu_ptr(&flush_works, i); struct softnet_data *sd = &per_cpu(softnet_data, i); - INIT_WORK(flush, flush_backlog); - skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); #ifdef CONFIG_XFRM_OFFLOAD @@ -12261,7 +12821,7 @@ static int __init net_dev_init(void) INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); spin_lock_init(&sd->defer_lock); - init_gro_hash(&sd->backlog); + gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; INIT_LIST_HEAD(&sd->backlog.poll_list); @@ -12305,12 +12865,13 @@ out: for_each_possible_cpu(i) { struct page_pool *pp_ptr; - pp_ptr = per_cpu(system_page_pool, i); + pp_ptr = per_cpu(system_page_pool.pool, i); if (!pp_ptr) continue; + xdp_unreg_page_pool(pp_ptr); page_pool_destroy(pp_ptr); - per_cpu(system_page_pool, i) = NULL; + per_cpu(system_page_pool.pool, i) = NULL; } } diff --git a/net/core/dev.h b/net/core/dev.h index deb5eae5749f..e93f36b7ddf3 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -2,9 +2,11 @@ #ifndef _NET_CORE_DEV_H #define _NET_CORE_DEV_H +#include <linux/cleanup.h> #include <linux/types.h> #include <linux/rwsem.h> #include <linux/netdevice.h> +#include <net/netdev_lock.h> struct net; struct netlink_ext_ack; @@ -13,8 +15,9 @@ struct cpumask; /* Random bits of netdevice that don't need to be exposed */ #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ struct sd_flow_limit { - u64 count; - unsigned int num_buckets; + struct rcu_head rcu; + unsigned int count; + u8 log_buckets; unsigned int history_head; u16 history[FLOW_LIMIT_HISTORY]; u8 buckets[]; @@ -22,7 +25,37 @@ struct sd_flow_limit { extern int netdev_flow_limit_table_len; -struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id); +struct napi_struct * +netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); +struct net_device *dev_get_by_napi_id(unsigned int napi_id); + +struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); +struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); +struct net_device * +netdev_xa_find_lock(struct net *net, struct net_device *dev, + unsigned long *index); + +DEFINE_FREE(netdev_unlock, struct net_device *, if (_T) netdev_unlock(_T)); + +#define for_each_netdev_lock_scoped(net, var_name, ifindex) \ + for (struct net_device *var_name __free(netdev_unlock) = NULL; \ + (var_name = netdev_xa_find_lock(net, var_name, &ifindex)); \ + ifindex++) + +struct net_device * +netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex); +struct net_device * +netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev, + unsigned long *index); + +DEFINE_FREE(netdev_unlock_ops_compat, struct net_device *, + if (_T) netdev_unlock_ops_compat(_T)); + +#define for_each_netdev_lock_ops_compat_scoped(net, var_name, ifindex) \ + for (struct net_device *var_name __free(netdev_unlock_ops_compat) = NULL; \ + (var_name = netdev_xa_find_lock_ops_compat(net, var_name, \ + &ifindex)); \ + ifindex++) #ifdef CONFIG_PROC_FS int __init dev_proc_init(void); @@ -69,6 +102,7 @@ struct netdev_name_node { }; int netdev_get_name(struct net *net, char *name, int ifindex); +int netif_change_name(struct net_device *dev, const char *newname); int dev_change_name(struct net_device *dev, const char *newname); #define netdev_for_each_altname(dev, namenode) \ @@ -82,24 
+116,28 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); int dev_validate_mtu(struct net_device *dev, int mtu, struct netlink_ext_ack *extack); -int dev_set_mtu_ext(struct net_device *dev, int mtu, - struct netlink_ext_ack *extack); +int netif_set_mtu_ext(struct net_device *dev, int new_mtu, + struct netlink_ext_ack *extack); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid); int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len); +int netif_change_proto_down(struct net_device *dev, bool proto_down); int dev_change_proto_down(struct net_device *dev, bool proto_down); -void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, - u32 value); +void netdev_change_proto_down_reason_locked(struct net_device *dev, + unsigned long mask, u32 value); typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags); +int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len); int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len); +void netif_set_group(struct net_device *dev, int new_group); void dev_set_group(struct net_device *dev, int new_group); +int netif_change_carrier(struct net_device *dev, bool new_carrier); int dev_change_carrier(struct net_device *dev, bool new_carrier); void __dev_set_rx_mode(struct net_device *dev); @@ -111,6 +149,20 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh); +static inline void netif_set_up(struct net_device *dev, bool value) +{ + if (value) + dev->flags |= IFF_UP; + else + dev->flags &= ~IFF_UP; + + if (!netdev_need_ops_lock(dev)) + netdev_lock(dev); + dev->up = value; + if (!netdev_need_ops_lock(dev)) + netdev_unlock(dev); +} + static inline void netif_set_gso_max_size(struct net_device *dev, unsigned int size) { @@ -271,6 +323,18 @@ void xdp_do_check_flushed(struct napi_struct *napi); static inline void xdp_do_check_flushed(struct napi_struct *napi) { } #endif +/* Best effort check that NAPI is not idle (can't be scheduled to run) */ +static inline void napi_assert_will_not_race(const struct napi_struct *napi) +{ + /* uninitialized instance, can't race */ + if (!napi->poll_list.next) + return; + + /* SCHED bit is set on disabled instances */ + WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); + WARN_ON(READ_ONCE(napi->list_owner) != -1); +} + void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); #define XMIT_RECURSION_LIMIT 8 @@ -311,5 +375,8 @@ static inline void dev_xmit_recursion_dec(void) int dev_set_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack); +int dev_get_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg); +int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg); #endif diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 166e404f7c03..90716bd736f3 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -242,9 +242,9 @@ static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list, __hw_addr_del_entry(from_list, ha, false, false); } -static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, - struct netdev_hw_addr_list *from_list, - int addr_len) +int __hw_addr_sync_multiple(struct 
netdev_hw_addr_list *to_list, + struct netdev_hw_addr_list *from_list, + int addr_len) { int err = 0; struct netdev_hw_addr *ha, *tmp; @@ -260,6 +260,7 @@ static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list, } return err; } +EXPORT_SYMBOL(__hw_addr_sync_multiple); /* This function only works where there is a strict 1-1 relationship * between source and destination of the sync. If you ever need to diff --git a/net/core/dev_api.c b/net/core/dev_api.c new file mode 100644 index 000000000000..1bf0153195f2 --- /dev/null +++ b/net/core/dev_api.c @@ -0,0 +1,369 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/netdevice.h> +#include <net/netdev_lock.h> + +#include "dev.h" + +/** + * dev_change_name() - change name of a device + * @dev: device + * @newname: name (or format string) must be at least IFNAMSIZ + * + * Change name of a device; format strings such as "eth%d" + * can be passed for wildcarding. + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_name(struct net_device *dev, const char *newname) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_name(dev, newname); + netdev_unlock_ops(dev); + + return ret; +} + +/** + * dev_set_alias() - change ifalias of a device + * @dev: device + * @alias: name up to IFALIASZ + * @len: limit of bytes to copy from @alias + * + * Set ifalias for a device. + * + * Return: 0 on success, -errno on failure. + */ +int dev_set_alias(struct net_device *dev, const char *alias, size_t len) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_alias(dev, alias, len); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_alias); + +/** + * dev_change_flags() - change device settings + * @dev: device + * @flags: device state flags + * @extack: netlink extended ack + * + * Change settings on the device based on state flags. The flags are + * in the userspace exported format. + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_flags(struct net_device *dev, unsigned int flags, + struct netlink_ext_ack *extack) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_flags(dev, flags, extack); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_change_flags); + +/** + * dev_set_group() - change group this device belongs to + * @dev: device + * @new_group: group this device should belong to + */ +void dev_set_group(struct net_device *dev, int new_group) +{ + netdev_lock_ops(dev); + netif_set_group(dev, new_group); + netdev_unlock_ops(dev); +} + +int dev_set_mac_address_user(struct net_device *dev, + struct sockaddr_storage *ss, + struct netlink_ext_ack *extack) +{ + int ret; + + down_write(&dev_addr_sem); + netdev_lock_ops(dev); + ret = netif_set_mac_address(dev, ss, extack); + netdev_unlock_ops(dev); + up_write(&dev_addr_sem); + + return ret; +} +EXPORT_SYMBOL(dev_set_mac_address_user); + +/** + * dev_change_net_namespace() - move device to a different network namespace + * @dev: device + * @net: network namespace + * @pat: If not NULL name pattern to try if the current device name + * is already taken in the destination network namespace. + * + * This function shuts down a device interface and moves it + * to a new network namespace. On success 0 is returned, on + * a failure a negative errno code is returned. + * + * Callers must hold the rtnl semaphore. + * + * Return: 0 on success, -errno on failure. 
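+ * + * Illustrative call (error handling elided; "net" is a namespace + * reference the caller already holds): + * + *	rtnl_lock(); + *	err = dev_change_net_namespace(dev, net, "eth%d"); + *	rtnl_unlock();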
+ */ +int dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat) +{ + return __dev_change_net_namespace(dev, net, pat, 0, NULL); +} +EXPORT_SYMBOL_GPL(dev_change_net_namespace); + +/** + * dev_change_carrier() - change device carrier + * @dev: device + * @new_carrier: new value + * + * Change device carrier + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_carrier(struct net_device *dev, bool new_carrier) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_carrier(dev, new_carrier); + netdev_unlock_ops(dev); + + return ret; +} + +/** + * dev_change_tx_queue_len() - change TX queue length of a netdevice + * @dev: device + * @new_len: new tx queue length + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_tx_queue_len(dev, new_len); + netdev_unlock_ops(dev); + + return ret; +} + +/** + * dev_change_proto_down() - set carrier according to proto_down + * @dev: device + * @proto_down: new value + * + * Return: 0 on success, -errno on failure. + */ +int dev_change_proto_down(struct net_device *dev, bool proto_down) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_change_proto_down(dev, proto_down); + netdev_unlock_ops(dev); + + return ret; +} + +/** + * dev_open() - prepare an interface for use + * @dev: device to open + * @extack: netlink extended ack + * + * Takes a device from down to up state. The device's private open + * function is invoked and then the multicast lists are loaded. Finally + * the device is moved into the up state and a %NETDEV_UP message is + * sent to the netdev notifier chain. + * + * Calling this function on an active interface is a nop. On a failure + * a negative errno code is returned. + * + * Return: 0 on success, -errno on failure. + */ +int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_open(dev, extack); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_open); + +/** + * dev_close() - shutdown an interface + * @dev: device to shutdown + * + * This function moves an active device into down state. A + * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device + * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier + * chain. + */ +void dev_close(struct net_device *dev) +{ + netdev_lock_ops(dev); + netif_close(dev); + netdev_unlock_ops(dev); +} +EXPORT_SYMBOL(dev_close); + +int dev_eth_ioctl(struct net_device *dev, + struct ifreq *ifr, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + int ret = -ENODEV; + + if (!ops->ndo_eth_ioctl) + return -EOPNOTSUPP; + + netdev_lock_ops(dev); + if (netif_device_present(dev)) + ret = ops->ndo_eth_ioctl(dev, ifr, cmd); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_eth_ioctl); + +int dev_set_mtu(struct net_device *dev, int new_mtu) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_mtu(dev, new_mtu); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_mtu); + +/** + * dev_disable_lro() - disable Large Receive Offload on a device + * @dev: device + * + * Disable Large Receive Offload (LRO) on a net device. Must be + * called under RTNL. This is needed if received packets may be + * forwarded to another interface. 
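+ * + * Sketch of a typical forwarding-path caller (illustrative only): + * + *	ASSERT_RTNL(); + *	dev_disable_lro(dev);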
+ */ +void dev_disable_lro(struct net_device *dev) +{ + netdev_lock_ops(dev); + netif_disable_lro(dev); + netdev_unlock_ops(dev); +} +EXPORT_SYMBOL(dev_disable_lro); + +/** + * dev_set_promiscuity() - update promiscuity count on a device + * @dev: device + * @inc: modifier + * + * Add or remove promiscuity from a device. While the count in the device + * remains above zero the interface remains promiscuous. Once it hits zero + * the device reverts back to normal filtering operation. A negative @inc + * value is used to drop promiscuity on the device. + * + * Return: 0 on success, -errno on failure. + */ +int dev_set_promiscuity(struct net_device *dev, int inc) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_promiscuity(dev, inc); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_promiscuity); + +/** + * dev_set_allmulti() - update allmulti count on a device + * @dev: device + * @inc: modifier + * + * Add or remove reception of all multicast frames to a device. While the + * count in the device remains above zero the interface remains listening + * to all multicast frames. Once it hits zero the device reverts back to normal + * filtering operation. A negative @inc value is used to drop the counter + * when releasing a resource needing all multicasts. + * + * Return: 0 on success, -errno on failure. + */ +int dev_set_allmulti(struct net_device *dev, int inc) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_allmulti(dev, inc, true); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_allmulti); + +/** + * dev_set_mac_address() - change Media Access Control Address + * @dev: device + * @ss: new address + * @extack: netlink extended ack + * + * Change the hardware (MAC) address of the device. + * + * Return: 0 on success, -errno on failure. + */ +int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss, + struct netlink_ext_ack *extack) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_set_mac_address(dev, ss, extack); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL(dev_set_mac_address); + +int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) +{ + int ret; + + netdev_lock_ops(dev); + ret = netif_xdp_propagate(dev, bpf); + netdev_unlock_ops(dev); + + return ret; +} +EXPORT_SYMBOL_GPL(dev_xdp_propagate); + +/** + * netdev_state_change() - device changes state + * @dev: device to cause notification + * + * Called to indicate a device has changed state. This function calls + * the notifier chains for netdev_chain and sends a NEWLINK message + * to the routing socket. 
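+ * + * Minimal usage sketch (hypothetical driver context): + * + *	rtnl_lock(); + *	netdev_state_change(dev); + *	rtnl_unlock();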
+ */ +void netdev_state_change(struct net_device *dev) +{ + netdev_lock_ops(dev); + netif_state_change(dev); + netdev_unlock_ops(dev); +} +EXPORT_SYMBOL(netdev_state_change); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 46d43b950471..616479e71466 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -6,9 +6,11 @@ #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> #include <linux/phylib_stubs.h> +#include <linux/ptp_clock_kernel.h> #include <linux/wireless.h> #include <linux/if_bridge.h> #include <net/dsa_stubs.h> +#include <net/netdev_lock.h> #include <net/wext.h> #include "dev.h" @@ -109,7 +111,7 @@ static int dev_getifmap(struct net_device *dev, struct ifreq *ifr) return 0; } -static int dev_setifmap(struct net_device *dev, struct ifreq *ifr) +static int netif_setifmap(struct net_device *dev, struct ifreq *ifr) { struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map; @@ -184,7 +186,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm return err; } -static int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg) +int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg) { enum hwtstamp_tx_types tx_type; enum hwtstamp_rx_filters rx_filter; @@ -239,20 +241,6 @@ static int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg) return 0; } -static int dev_eth_ioctl(struct net_device *dev, - struct ifreq *ifr, unsigned int cmd) -{ - const struct net_device_ops *ops = dev->netdev_ops; - - if (!ops->ndo_eth_ioctl) - return -EOPNOTSUPP; - - if (!netif_device_present(dev)) - return -ENODEV; - - return ops->ndo_eth_ioctl(dev, ifr, cmd); -} - /** * dev_get_hwtstamp_phylib() - Get hardware timestamping settings of NIC * or of attached phylib PHY @@ -266,9 +254,24 @@ static int dev_eth_ioctl(struct net_device *dev, * -EOPNOTSUPP for phylib for now, which is still more accurate than letting * the netdev handle the GET request. 
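* * A minimal caller sketch (illustrative; assumes rtnl is held, as the * hwprov dereference requires): * *	struct kernel_hwtstamp_config cfg = {}; *	int err = dev_get_hwtstamp_phylib(dev, &cfg);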
*/ -static int dev_get_hwtstamp_phylib(struct net_device *dev, - struct kernel_hwtstamp_config *cfg) +int dev_get_hwtstamp_phylib(struct net_device *dev, + struct kernel_hwtstamp_config *cfg) { + struct hwtstamp_provider *hwprov; + + hwprov = rtnl_dereference(dev->hwprov); + if (hwprov) { + cfg->qualifier = hwprov->desc.qualifier; + if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB && + hwprov->phydev) + return phy_hwtstamp_get(hwprov->phydev, cfg); + + if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) + return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg); + + return -EOPNOTSUPP; + } + if (phy_is_default_hwtstamp(dev->phydev)) return phy_hwtstamp_get(dev->phydev, cfg); @@ -289,7 +292,9 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) return -ENODEV; kernel_cfg.ifr = ifr; + netdev_lock_ops(dev); err = dev_get_hwtstamp_phylib(dev, &kernel_cfg); + netdev_unlock_ops(dev); if (err) return err; @@ -324,11 +329,32 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; - bool phy_ts = phy_is_default_hwtstamp(dev->phydev); struct kernel_hwtstamp_config old_cfg = {}; + struct hwtstamp_provider *hwprov; + struct phy_device *phydev; bool changed = false; + bool phy_ts; int err; + hwprov = rtnl_dereference(dev->hwprov); + if (hwprov) { + if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB && + hwprov->phydev) { + phy_ts = true; + phydev = hwprov->phydev; + } else if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) { + phy_ts = false; + } else { + return -EOPNOTSUPP; + } + + cfg->qualifier = hwprov->desc.qualifier; + } else { + phy_ts = phy_is_default_hwtstamp(dev->phydev); + if (phy_ts) + phydev = dev->phydev; + } + cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV; if (phy_ts && dev->see_all_hwtstamp_requests) { @@ -350,7 +376,7 @@ int dev_set_hwtstamp_phylib(struct net_device *dev, changed = kernel_hwtstamp_config_changed(&old_cfg, cfg); if (phy_ts) { - err = phy_hwtstamp_set(dev->phydev, cfg, extack); + err = phy_hwtstamp_set(phydev, cfg, extack); if (err) { if (changed) ops->ndo_hwtstamp_set(dev, &old_cfg, NULL); @@ -392,7 +418,9 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) if (!netif_device_present(dev)) return -ENODEV; + netdev_lock_ops(dev); err = dev_set_hwtstamp_phylib(dev, &kernel_cfg, &extack); + netdev_unlock_ops(dev); if (err) return err; @@ -467,10 +495,14 @@ static int dev_siocbond(struct net_device *dev, const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocbond) { + int ret = -ENODEV; + + netdev_lock_ops(dev); if (netif_device_present(dev)) - return ops->ndo_siocbond(dev, ifr, cmd); - else - return -ENODEV; + ret = ops->ndo_siocbond(dev, ifr, cmd); + netdev_unlock_ops(dev); + + return ret; } return -EOPNOTSUPP; @@ -482,10 +514,14 @@ static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr, const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocdevprivate) { + int ret = -ENODEV; + + netdev_lock_ops(dev); if (netif_device_present(dev)) - return ops->ndo_siocdevprivate(dev, ifr, data, cmd); - else - return -ENODEV; + ret = ops->ndo_siocdevprivate(dev, ifr, data, cmd); + netdev_unlock_ops(dev); + + return ret; } return -EOPNOTSUPP; @@ -496,17 +532,21 @@ static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs) const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_siocwandev) { + int ret = -ENODEV; + + netdev_lock_ops(dev); if (netif_device_present(dev)) - return 
ops->ndo_siocwandev(dev, ifs); - else - return -ENODEV; + ret = ops->ndo_siocwandev(dev, ifs); + netdev_unlock_ops(dev); + + return ret; } return -EOPNOTSUPP; } /* - * Perform the SIOCxIFxxx calls, inside rtnl_lock() + * Perform the SIOCxIFxxx calls, inside rtnl_net_lock() */ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, unsigned int cmd) @@ -514,7 +554,6 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, int err; struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); const struct net_device_ops *ops; - netdevice_tracker dev_tracker; if (!dev) return -ENODEV; @@ -533,9 +572,11 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, return dev_set_mtu(dev, ifr->ifr_mtu); case SIOCSIFHWADDR: - if (dev->addr_len > sizeof(struct sockaddr)) + if (dev->addr_len > sizeof(ifr->ifr_hwaddr)) return -EINVAL; - return dev_set_mac_address_user(dev, &ifr->ifr_hwaddr, NULL); + return dev_set_mac_address_user(dev, + (struct sockaddr_storage *)&ifr->ifr_hwaddr, + NULL); case SIOCSIFHWBROADCAST: if (ifr->ifr_hwaddr.sa_family != dev->type) @@ -543,11 +584,16 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, min(sizeof(ifr->ifr_hwaddr.sa_data_min), (size_t)dev->addr_len)); + netdev_lock_ops(dev); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + netdev_unlock_ops(dev); return 0; case SIOCSIFMAP: - return dev_setifmap(dev, ifr); + netdev_lock_ops(dev); + err = netif_setifmap(dev, ifr); + netdev_unlock_ops(dev); + return err; case SIOCADDMULTI: if (!ops->ndo_set_rx_mode || @@ -555,7 +601,10 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); + netdev_lock_ops(dev); + err = dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data); + netdev_unlock_ops(dev); + return err; case SIOCDELMULTI: if (!ops->ndo_set_rx_mode || @@ -563,7 +612,10 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, return -EINVAL; if (!netif_device_present(dev)) return -ENODEV; - return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); + netdev_lock_ops(dev); + err = dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data); + netdev_unlock_ops(dev); + return err; case SIOCSIFTXQLEN: if (ifr->ifr_qlen < 0) @@ -577,19 +629,6 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, case SIOCWANDEV: return dev_siocwandev(dev, &ifr->ifr_settings); - case SIOCBRADDIF: - case SIOCBRDELIF: - if (!netif_device_present(dev)) - return -ENODEV; - if (!netif_is_bridge_master(dev)) - return -EOPNOTSUPP; - netdev_hold(dev, &dev_tracker, GFP_KERNEL); - rtnl_unlock(); - err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL); - netdev_put(dev, &dev_tracker); - rtnl_lock(); - return err; - case SIOCDEVPRIVATE ... 
SIOCDEVPRIVATE + 15: return dev_siocdevprivate(dev, ifr, data, cmd); @@ -733,9 +772,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, dev_load(net, ifr->ifr_name); if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; - rtnl_lock(); + + rtnl_net_lock(net); ret = dev_ifsioc(net, ifr, data, cmd); - rtnl_unlock(); + rtnl_net_unlock(net); + if (colon) *colon = ':'; return ret; @@ -770,8 +811,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: - case SIOCBRADDIF: - case SIOCBRDELIF: case SIOCSHWTSTAMP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -779,9 +818,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: dev_load(net, ifr->ifr_name); - rtnl_lock(); + + rtnl_net_lock(net); ret = dev_ifsioc(net, ifr, data, cmd); - rtnl_unlock(); + rtnl_net_unlock(net); + if (need_copyout) *need_copyout = false; return ret; @@ -804,9 +845,10 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, (cmd >= SIOCDEVPRIVATE && cmd <= SIOCDEVPRIVATE + 15)) { dev_load(net, ifr->ifr_name); - rtnl_lock(); + + rtnl_net_lock(net); ret = dev_ifsioc(net, ifr, data, cmd); - rtnl_unlock(); + rtnl_net_unlock(net); return ret; } return -ENOTTY; diff --git a/net/core/devmem.c b/net/core/devmem.c index 11b91c12ee11..b3a62ca0df65 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -15,6 +15,8 @@ #include <net/netdev_queues.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> +#include <net/sock.h> #include <trace/events/page_pool.h> #include "devmem.h" @@ -23,29 +25,38 @@ /* Device memory support */ -/* Protected by rtnl_lock() */ static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); +static const struct memory_provider_ops dmabuf_devmem_ops; + +bool net_is_devmem_iov(struct net_iov *niov) +{ + return niov->type == NET_IOV_DMABUF; +} + static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, struct gen_pool_chunk *chunk, void *not_used) { struct dmabuf_genpool_chunk_owner *owner = chunk->owner; - kvfree(owner->niovs); + kvfree(owner->area.niovs); kfree(owner); } static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) { - struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov); + struct dmabuf_genpool_chunk_owner *owner; + owner = net_devmem_iov_to_chunk_owner(niov); return owner->base_dma_addr + ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); } -void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) +void __net_devmem_dmabuf_binding_free(struct work_struct *wq) { + struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w); + size_t size, avail; gen_pool_for_each_chunk(binding->chunk_pool, @@ -63,8 +74,10 @@ void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) dma_buf_detach(binding->dmabuf, binding->attachment); dma_buf_put(binding->dmabuf); xa_destroy(&binding->bound_rxqs); + kvfree(binding->tx_vec); kfree(binding); } +EXPORT_SYMBOL(__net_devmem_dmabuf_binding_free); struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) @@ -82,7 +95,7 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) offset = dma_addr - owner->base_dma_addr; index = offset / PAGE_SIZE; - niov = &owner->niovs[index]; + niov = &owner->area.niovs[index]; niov->pp_magic = 0; 
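/* Reset stale page_pool linkage before the niov is handed out for reuse. */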
niov->pp = NULL; @@ -93,7 +106,7 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) void net_devmem_free_dmabuf(struct net_iov *niov) { - struct net_devmem_dmabuf_binding *binding = net_iov_binding(niov); + struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov); unsigned long dma_addr = net_devmem_get_dma_addr(niov); if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr, @@ -109,21 +122,27 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) unsigned long xa_idx; unsigned int rxq_idx; + xa_erase(&net_devmem_dmabuf_bindings, binding->id); + + /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the + * erase. + */ + synchronize_net(); + if (binding->list.next) list_del(&binding->list); xa_for_each(&binding->bound_rxqs, xa_idx, rxq) { - WARN_ON(rxq->mp_params.mp_priv != binding); - - rxq->mp_params.mp_priv = NULL; + const struct pp_memory_provider_params mp_params = { + .mp_priv = binding, + .mp_ops = &dmabuf_devmem_ops, + }; rxq_idx = get_netdev_rx_queue_index(rxq); - WARN_ON(netdev_rx_queue_restart(binding->dev, rxq_idx)); + __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params); } - xa_erase(&net_devmem_dmabuf_bindings, binding->id); - net_devmem_dmabuf_binding_put(binding); } @@ -131,50 +150,35 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, struct net_devmem_dmabuf_binding *binding, struct netlink_ext_ack *extack) { + struct pp_memory_provider_params mp_params = { + .mp_priv = binding, + .mp_ops = &dmabuf_devmem_ops, + }; struct netdev_rx_queue *rxq; u32 xa_idx; int err; - if (rxq_idx >= dev->real_num_rx_queues) { - NL_SET_ERR_MSG(extack, "rx queue index out of range"); - return -ERANGE; - } + err = __net_mp_open_rxq(dev, rxq_idx, &mp_params, extack); + if (err) + return err; rxq = __netif_get_rx_queue(dev, rxq_idx); - if (rxq->mp_params.mp_priv) { - NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); - return -EEXIST; - } - -#ifdef CONFIG_XDP_SOCKETS - if (rxq->pool) { - NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); - return -EBUSY; - } -#endif - err = xa_alloc(&binding->bound_rxqs, &xa_idx, rxq, xa_limit_32b, GFP_KERNEL); if (err) - return err; - - rxq->mp_params.mp_priv = binding; - - err = netdev_rx_queue_restart(dev, rxq_idx); - if (err) - goto err_xa_erase; + goto err_close_rxq; return 0; -err_xa_erase: - rxq->mp_params.mp_priv = NULL; - xa_erase(&binding->bound_rxqs, xa_idx); - +err_close_rxq: + __net_mp_close_rxq(dev, rxq_idx, &mp_params); return err; } struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netdev_nl_sock *priv, struct netlink_ext_ack *extack) { struct net_devmem_dmabuf_binding *binding; @@ -197,43 +201,48 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, } binding->dev = dev; - - err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id, - binding, xa_limit_32b, &id_alloc_next, - GFP_KERNEL); - if (err < 0) - goto err_free_binding; - xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC); refcount_set(&binding->ref, 1); + mutex_init(&binding->lock); + binding->dmabuf = dmabuf; binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent); if (IS_ERR(binding->attachment)) { err = PTR_ERR(binding->attachment); NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); - goto err_free_id; + goto err_free_binding; } 
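/*
 * Illustrative sketch (editor's addition, not part of the patch):
 * net_devmem_bind_dmabuf() above follows the standard dma-buf importer
 * sequence from <linux/dma-buf.h>, now parameterized by DMA direction
 * (DMA_FROM_DEVICE for RX bindings, DMA_TO_DEVICE for the new TX
 * bindings). The sequence in isolation; a real caller keeps dmabuf and
 * att around for the later unmap/detach, as the binding struct does.
 */
static struct sg_table *example_dmabuf_import(int fd, struct device *dev,
					      enum dma_data_direction dir)
{
	struct dma_buf_attachment *att;
	struct dma_buf *dmabuf;
	struct sg_table *sgt;

	dmabuf = dma_buf_get(fd);		/* takes a ref on the fd's buffer */
	if (IS_ERR(dmabuf))
		return ERR_CAST(dmabuf);

	att = dma_buf_attach(dmabuf, dev);	/* attach the importing device */
	if (IS_ERR(att)) {
		dma_buf_put(dmabuf);
		return ERR_CAST(att);
	}

	sgt = dma_buf_map_attachment_unlocked(att, dir);
	if (IS_ERR(sgt)) {
		dma_buf_detach(dmabuf, att);
		dma_buf_put(dmabuf);
	}
	return sgt;
}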
binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, - DMA_FROM_DEVICE); + direction); if (IS_ERR(binding->sgt)) { err = PTR_ERR(binding->sgt); NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment"); goto err_detach; } + if (direction == DMA_TO_DEVICE) { + binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE, + sizeof(struct net_iov *), + GFP_KERNEL); + if (!binding->tx_vec) { + err = -ENOMEM; + goto err_unmap; + } + } + /* For simplicity we expect to make PAGE_SIZE allocations, but the * binding can be much more flexible than that. We may be able to * allocate MTU sized chunks here. Leave that for future work... */ - binding->chunk_pool = - gen_pool_create(PAGE_SHIFT, dev_to_node(&dev->dev)); + binding->chunk_pool = gen_pool_create(PAGE_SHIFT, + dev_to_node(&dev->dev)); if (!binding->chunk_pool) { err = -ENOMEM; - goto err_unmap; + goto err_tx_vec; } virtual = 0; @@ -250,9 +259,9 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, goto err_free_chunks; } - owner->base_virtual = virtual; + owner->area.base_virtual = virtual; owner->base_dma_addr = dma_addr; - owner->num_niovs = len / PAGE_SIZE; + owner->area.num_niovs = len / PAGE_SIZE; owner->binding = binding; err = gen_pool_add_owner(binding->chunk_pool, dma_addr, @@ -264,37 +273,48 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, goto err_free_chunks; } - owner->niovs = kvmalloc_array(owner->num_niovs, - sizeof(*owner->niovs), - GFP_KERNEL); - if (!owner->niovs) { + owner->area.niovs = kvmalloc_array(owner->area.num_niovs, + sizeof(*owner->area.niovs), + GFP_KERNEL); + if (!owner->area.niovs) { err = -ENOMEM; goto err_free_chunks; } - for (i = 0; i < owner->num_niovs; i++) { - niov = &owner->niovs[i]; - niov->owner = owner; + for (i = 0; i < owner->area.num_niovs; i++) { + niov = &owner->area.niovs[i]; + niov->type = NET_IOV_DMABUF; + niov->owner = &owner->area; page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); + if (direction == DMA_TO_DEVICE) + binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov; } virtual += len; } + err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id, + binding, xa_limit_32b, &id_alloc_next, + GFP_KERNEL); + if (err < 0) + goto err_free_chunks; + + list_add(&binding->list, &priv->bindings); + return binding; err_free_chunks: gen_pool_for_each_chunk(binding->chunk_pool, net_devmem_dmabuf_free_chunk_owner, NULL); gen_pool_destroy(binding->chunk_pool); +err_tx_vec: + kvfree(binding->tx_vec); err_unmap: dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt, DMA_FROM_DEVICE); err_detach: dma_buf_detach(dmabuf, binding->attachment); -err_free_id: - xa_erase(&net_devmem_dmabuf_bindings, binding->id); err_free_binding: kfree(binding); err_put_dmabuf: @@ -302,24 +322,72 @@ err_put_dmabuf: return ERR_PTR(err); } -void dev_dmabuf_uninstall(struct net_device *dev) +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) { struct net_devmem_dmabuf_binding *binding; - struct netdev_rx_queue *rxq; - unsigned long xa_idx; - unsigned int i; - for (i = 0; i < dev->real_num_rx_queues; i++) { - binding = dev->_rx[i].mp_params.mp_priv; - if (!binding) - continue; + rcu_read_lock(); + binding = xa_load(&net_devmem_dmabuf_bindings, id); + if (binding) { + if (!net_devmem_dmabuf_binding_get(binding)) + binding = NULL; + } + rcu_read_unlock(); - xa_for_each(&binding->bound_rxqs, xa_idx, rxq) - if (rxq == &dev->_rx[i]) { - xa_erase(&binding->bound_rxqs, xa_idx); - break; - } + return 
binding; +} + +void net_devmem_get_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov)); +} + +void net_devmem_put_net_iov(struct net_iov *niov) +{ + net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov)); +} + +struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, + unsigned int dmabuf_id) +{ + struct net_devmem_dmabuf_binding *binding; + struct dst_entry *dst = __sk_dst_get(sk); + int err = 0; + + binding = net_devmem_lookup_dmabuf(dmabuf_id); + if (!binding || !binding->tx_vec) { + err = -EINVAL; + goto out_err; + } + + /* The dma-addrs in this binding are only reachable to the corresponding + * net_device. + */ + if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) { + err = -ENODEV; + goto out_err; } + + return binding; + +out_err: + if (binding) + net_devmem_dmabuf_binding_put(binding); + + return ERR_PTR(err); +} + +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, + size_t virt_addr, size_t *off, size_t *size) +{ + if (virt_addr >= binding->dmabuf->size) + return NULL; + + *off = virt_addr % PAGE_SIZE; + *size = PAGE_SIZE - *off; + + return binding->tx_vec[virt_addr / PAGE_SIZE]; } /*** "Dmabuf devmem memory provider" ***/ @@ -331,11 +399,11 @@ int mp_dmabuf_devmem_init(struct page_pool *pool) if (!binding) return -EINVAL; - if (!pool->dma_map) - return -EOPNOTSUPP; - - if (pool->dma_sync) - return -EOPNOTSUPP; + /* dma-buf dma addresses do not need and should not be used with + * dma_sync_for_cpu/device. Force disable dma_sync. + */ + pool->dma_sync = false; + pool->dma_sync_for_cpu = false; if (pool->p.order != 0) return -E2BIG; @@ -387,3 +455,41 @@ bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) /* We don't want the page pool put_page()ing our net_iovs. */ return false; } + +static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq) +{ + const struct net_devmem_dmabuf_binding *binding = mp_priv; + int type = rxq ? NETDEV_A_QUEUE_DMABUF : NETDEV_A_PAGE_POOL_DMABUF; + + return nla_put_u32(rsp, type, binding->id); +} + +static void mp_dmabuf_devmem_uninstall(void *mp_priv, + struct netdev_rx_queue *rxq) +{ + struct net_devmem_dmabuf_binding *binding = mp_priv; + struct netdev_rx_queue *bound_rxq; + unsigned long xa_idx; + + xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) { + if (bound_rxq == rxq) { + xa_erase(&binding->bound_rxqs, xa_idx); + if (xa_empty(&binding->bound_rxqs)) { + mutex_lock(&binding->lock); + binding->dev = NULL; + mutex_unlock(&binding->lock); + } + break; + } + } +} + +static const struct memory_provider_ops dmabuf_devmem_ops = { + .init = mp_dmabuf_devmem_init, + .destroy = mp_dmabuf_devmem_destroy, + .alloc_netmems = mp_dmabuf_devmem_alloc_netmems, + .release_netmem = mp_dmabuf_devmem_release_page, + .nl_fill = mp_dmabuf_devmem_nl_fill, + .uninstall = mp_dmabuf_devmem_uninstall, +}; diff --git a/net/core/devmem.h b/net/core/devmem.h index 76099ef9c482..0a3b28ba5c13 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -10,6 +10,9 @@ #ifndef _NET_DEVMEM_H #define _NET_DEVMEM_H +#include <net/netmem.h> +#include <net/netdev_netlink.h> + struct netlink_ext_ack; struct net_devmem_dmabuf_binding { @@ -18,15 +21,25 @@ struct net_devmem_dmabuf_binding { struct sg_table *sgt; struct net_device *dev; struct gen_pool *chunk_pool; + /* Protect dev */ + struct mutex lock; /* The user holds a ref (via the netlink API) for as long as they want * the binding to remain alive. 
Each page pool using this binding holds - * a ref to keep the binding alive. Each allocated net_iov holds a - * ref. + * a ref to keep the binding alive. The page_pool does not release the + * ref until all the net_iovs allocated from this binding are released + * back to the page_pool. * * The binding undoes itself and unmaps the underlying dmabuf once all * those refs are dropped and the binding is no longer desired or in * use. + * + * net_devmem_get_net_iov() on dmabuf net_iovs will increment this + * reference, making sure that the binding remains alive until all the + * net_iovs are no longer used. net_iovs allocated from this binding + * that are stuck in the TX path for any reason (such as awaiting + * retransmits) hold a reference to the binding until the skb holding + * them is freed. */ refcount_t ref; @@ -42,6 +55,14 @@ struct net_devmem_dmabuf_binding { * active. */ u32 id; + + /* Array of net_iov pointers for this binding, sorted by virtual + * address. This array is convenient to map the virtual addresses to + * net_iovs in the TX path. + */ + struct net_iov **tx_vec; + + struct work_struct unbind_w; }; #if defined(CONFIG_NET_DEVMEM) @@ -51,63 +72,57 @@ struct net_devmem_dmabuf_binding { * allocations from this chunk. */ struct dmabuf_genpool_chunk_owner { - /* Offset into the dma-buf where this chunk starts. */ - unsigned long base_virtual; + struct net_iov_area area; + struct net_devmem_dmabuf_binding *binding; /* dma_addr of the start of the chunk. */ dma_addr_t base_dma_addr; - - /* Array of net_iovs for this chunk. */ - struct net_iov *niovs; - size_t num_niovs; - - struct net_devmem_dmabuf_binding *binding; }; -void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding); +void __net_devmem_dmabuf_binding_free(struct work_struct *wq); struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, struct netdev_nl_sock *priv, struct netlink_ext_ack *extack); +struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id); void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, struct net_devmem_dmabuf_binding *binding, struct netlink_ext_ack *extack); -void dev_dmabuf_uninstall(struct net_device *dev); +void net_devmem_bind_tx_release(struct sock *sk); static inline struct dmabuf_genpool_chunk_owner * -net_iov_owner(const struct net_iov *niov) +net_devmem_iov_to_chunk_owner(const struct net_iov *niov) { - return niov->owner; + struct net_iov_area *owner = net_iov_owner(niov); + + return container_of(owner, struct dmabuf_genpool_chunk_owner, area); } -static inline unsigned int net_iov_idx(const struct net_iov *niov) +static inline struct net_devmem_dmabuf_binding * +net_devmem_iov_binding(const struct net_iov *niov) { - return niov - net_iov_owner(niov)->niovs; + return net_devmem_iov_to_chunk_owner(niov)->binding; } -static inline struct net_devmem_dmabuf_binding * -net_iov_binding(const struct net_iov *niov) +static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) { - return net_iov_owner(niov)->binding; + return net_devmem_iov_binding(niov)->id; } static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) { - struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov); + struct net_iov_area *owner = net_iov_owner(niov); return owner->base_virtual + ((unsigned
long)net_iov_idx(niov) << PAGE_SHIFT); } -static inline u32 net_iov_binding_id(const struct net_iov *niov) -{ - return net_iov_owner(niov)->binding->id; -} - -static inline void +static inline bool net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) { - refcount_inc(&binding->ref); + return refcount_inc_not_zero(&binding->ref); } static inline void @@ -116,28 +131,59 @@ net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) if (!refcount_dec_and_test(&binding->ref)) return; - __net_devmem_dmabuf_binding_free(binding); + INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free); + schedule_work(&binding->unbind_w); } +void net_devmem_get_net_iov(struct net_iov *niov); +void net_devmem_put_net_iov(struct net_iov *niov); + struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void net_devmem_free_dmabuf(struct net_iov *ppiov); +bool net_is_devmem_iov(struct net_iov *niov); +struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id); +struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size); + #else struct net_devmem_dmabuf_binding; static inline void -__net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding) +net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) +{ +} + +static inline void net_devmem_get_net_iov(struct net_iov *niov) +{ +} + +static inline void net_devmem_put_net_iov(struct net_iov *niov) +{ +} + +static inline void __net_devmem_dmabuf_binding_free(struct work_struct *wq) { } static inline struct net_devmem_dmabuf_binding * -net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, +net_devmem_bind_dmabuf(struct net_device *dev, + enum dma_data_direction direction, + unsigned int dmabuf_fd, + struct netdev_nl_sock *priv, struct netlink_ext_ack *extack) { return ERR_PTR(-EOPNOTSUPP); } +static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id) +{ + return NULL; +} + static inline void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) { @@ -152,10 +198,6 @@ net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, return -EOPNOTSUPP; } -static inline void dev_dmabuf_uninstall(struct net_device *dev) -{ -} - static inline struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) { @@ -171,10 +213,34 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) return 0; } -static inline u32 net_iov_binding_id(const struct net_iov *niov) +static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) { return 0; } + +static inline bool net_is_devmem_iov(struct net_iov *niov) +{ + return false; +} + +static inline struct net_devmem_dmabuf_binding * +net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline struct net_iov * +net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr, + size_t *off, size_t *size) +{ + return NULL; +} + +static inline struct net_devmem_dmabuf_binding * +net_devmem_iov_binding(const struct net_iov *niov) +{ + return NULL; +} #endif #endif /* _NET_DEVMEM_H */ diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 6efd4cccc9dd..8a7ce640f74d 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -1088,7 +1088,7 @@ err_module_put: struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct sk_buff *skb; 
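/*
 * Illustrative sketch (editor's addition, not part of the patch): the
 * net/core/devmem.{c,h} binding lifetime scheme above combines three
 * idioms. Lookups run under RCU and only succeed via
 * refcount_inc_not_zero(), so a dying binding cannot be resurrected;
 * unbind erases the XArray entry and waits for in-flight lookups before
 * dropping the table's reference; and the final put defers the
 * heavyweight dma-buf teardown to a workqueue, recovering the object
 * with container_of(). Condensed, with hypothetical example_* names:
 */
struct example_binding {
	refcount_t ref;
	struct work_struct unbind_w;
};

static void example_free_work(struct work_struct *wq)
{
	struct example_binding *b =
		container_of(wq, struct example_binding, unbind_w);

	kfree(b);	/* the real worker also unmaps and detaches the dma-buf */
}

static struct example_binding *example_lookup(struct xarray *xa, u32 id)
{
	struct example_binding *b;

	rcu_read_lock();
	b = xa_load(xa, id);
	if (b && !refcount_inc_not_zero(&b->ref))
		b = NULL;	/* entry found, but its last ref is already gone */
	rcu_read_unlock();

	return b;
}

static void example_put(struct example_binding *b)
{
	if (!refcount_dec_and_test(&b->ref))
		return;

	INIT_WORK(&b->unbind_w, example_free_work);
	schedule_work(&b->unbind_w);	/* teardown may sleep; punt it */
}

static void example_unbind(struct xarray *xa, u32 id, struct example_binding *b)
{
	xa_erase(xa, id);
	synchronize_net();	/* no example_lookup() still running past here */
	example_put(b);		/* drop the reference the table held */
}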
- del_timer_sync(&hw_data->send_timer); + timer_delete_sync(&hw_data->send_timer); cancel_work_sync(&hw_data->dm_alert_work); while ((skb = __skb_dequeue(&hw_data->drop_queue))) { struct devlink_trap_metadata *hw_metadata; @@ -1122,7 +1122,7 @@ static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack) struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu); struct sk_buff *skb; - del_timer_sync(&hw_data->send_timer); + timer_delete_sync(&hw_data->send_timer); cancel_work_sync(&hw_data->dm_alert_work); while ((skb = __skb_dequeue(&hw_data->drop_queue))) { struct devlink_trap_metadata *hw_metadata; @@ -1183,7 +1183,7 @@ err_module_put: struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; - del_timer_sync(&data->send_timer); + timer_delete_sync(&data->send_timer); cancel_work_sync(&data->dm_alert_work); while ((skb = __skb_dequeue(&data->drop_queue))) consume_skb(skb); @@ -1211,7 +1211,7 @@ static void net_dm_trace_off_set(void) struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu); struct sk_buff *skb; - del_timer_sync(&data->send_timer); + timer_delete_sync(&data->send_timer); cancel_work_sync(&data->dm_alert_work); while ((skb = __skb_dequeue(&data->drop_queue))) consume_skb(skb); @@ -1734,30 +1734,30 @@ static int __init init_net_drop_monitor(void) return -ENOSPC; } - rc = genl_register_family(&net_drop_monitor_family); - if (rc) { - pr_err("Could not create drop monitor netlink family\n"); - return rc; + for_each_possible_cpu(cpu) { + net_dm_cpu_data_init(cpu); + net_dm_hw_cpu_data_init(cpu); } - WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = register_netdevice_notifier(&dropmon_net_notifier); if (rc < 0) { pr_crit("Failed to register netdevice notifier\n"); + return rc; + } + + rc = genl_register_family(&net_drop_monitor_family); + if (rc) { + pr_err("Could not create drop monitor netlink family\n"); goto out_unreg; } + WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = 0; - for_each_possible_cpu(cpu) { - net_dm_cpu_data_init(cpu); - net_dm_hw_cpu_data_init(cpu); - } - goto out; out_unreg: - genl_unregister_family(&net_drop_monitor_family); + WARN_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); out: return rc; } @@ -1766,19 +1766,18 @@ static void exit_net_drop_monitor(void) { int cpu; - BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); - /* * Because of the module_get/put we do in the trace state change path * we are guaranteed not to have any current users when we get here */ + BUG_ON(genl_unregister_family(&net_drop_monitor_family)); + + BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); for_each_possible_cpu(cpu) { net_dm_hw_cpu_data_fini(cpu); net_dm_cpu_data_fini(cpu); } - - BUG_ON(genl_unregister_family(&net_drop_monitor_family)); } module_init(init_net_drop_monitor); diff --git a/net/core/dst.c b/net/core/dst.c index 9552a90d4772..795ca07e28a4 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -165,6 +165,14 @@ static void dst_count_dec(struct dst_entry *dst) void dst_release(struct dst_entry *dst) { if (dst && rcuref_put(&dst->__rcuref)) { +#ifdef CONFIG_DST_CACHE + if (dst->flags & DST_METADATA) { + struct metadata_dst *md_dst = (struct metadata_dst *)dst; + + if (md_dst->type == METADATA_IP_TUNNEL) + dst_cache_reset_now(&md_dst->u.tun_info.dst_cache); + } +#endif dst_count_dec(dst); call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); } @@ -286,7 +294,8 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, { struct metadata_dst 
*md_dst; - md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); + md_dst = kmalloc(struct_size(md_dst, u.tun_info.options, optslen), + flags); if (!md_dst) return NULL; @@ -314,7 +323,8 @@ metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) int cpu; struct metadata_dst __percpu *md_dst; - md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen, + md_dst = __alloc_percpu_gfp(struct_size(md_dst, u.tun_info.options, + optslen), __alignof__(struct metadata_dst), flags); if (!md_dst) return NULL; diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index 70c634b9e7b0..93a04d18e505 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -17,6 +17,7 @@ struct dst_cache_pcpu { unsigned long refresh_ts; struct dst_entry *dst; + local_lock_t bh_lock; u32 cookie; union { struct in_addr in_saddr; @@ -65,10 +66,15 @@ fail: struct dst_entry *dst_cache_get(struct dst_cache *dst_cache) { + struct dst_entry *dst; + if (!dst_cache->cache) return NULL; - return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); + local_lock_nested_bh(&dst_cache->cache->bh_lock); + dst = dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); + local_unlock_nested_bh(&dst_cache->cache->bh_lock); + return dst; } EXPORT_SYMBOL_GPL(dst_cache_get); @@ -80,12 +86,16 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) if (!dst_cache->cache) return NULL; + local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst = dst_cache_per_cpu_get(dst_cache, idst); - if (!dst) + if (!dst) { + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return NULL; + } *saddr = idst->in_saddr.s_addr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return dst_rtable(dst); } EXPORT_SYMBOL_GPL(dst_cache_get_ip4); @@ -98,9 +108,11 @@ void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, if (!dst_cache->cache) return; + local_lock_nested_bh(&dst_cache->cache->bh_lock); idst = this_cpu_ptr(dst_cache->cache); dst_cache_per_cpu_dst_set(idst, dst, 0); idst->in_saddr.s_addr = saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); } EXPORT_SYMBOL_GPL(dst_cache_set_ip4); @@ -113,10 +125,13 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, if (!dst_cache->cache) return; + local_lock_nested_bh(&dst_cache->cache->bh_lock); + idst = this_cpu_ptr(dst_cache->cache); dst_cache_per_cpu_dst_set(idst, dst, rt6_get_cookie(dst_rt6_info(dst))); idst->in6_saddr = *saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); } EXPORT_SYMBOL_GPL(dst_cache_set_ip6); @@ -129,12 +144,17 @@ struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache, if (!dst_cache->cache) return NULL; + local_lock_nested_bh(&dst_cache->cache->bh_lock); + idst = this_cpu_ptr(dst_cache->cache); dst = dst_cache_per_cpu_get(dst_cache, idst); - if (!dst) + if (!dst) { + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return NULL; + } *saddr = idst->in6_saddr; + local_unlock_nested_bh(&dst_cache->cache->bh_lock); return dst; } EXPORT_SYMBOL_GPL(dst_cache_get_ip6); @@ -142,10 +162,14 @@ EXPORT_SYMBOL_GPL(dst_cache_get_ip6); int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) { + unsigned int i; + dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, gfp | __GFP_ZERO); if (!dst_cache->cache) return -ENOMEM; + for_each_possible_cpu(i) + local_lock_init(&per_cpu_ptr(dst_cache->cache, i)->bh_lock); dst_cache_reset(dst_cache); return 0; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 
34185d138c95..8ca634964e36 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -37,8 +37,8 @@ static const struct fib_kuid_range fib_kuid_range_unset = { bool fib_rule_matchall(const struct fib_rule *rule) { - if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id || - rule->flags) + if (READ_ONCE(rule->iifindex) || READ_ONCE(rule->oifindex) || + rule->mark || rule->tun_id || rule->flags) return false; if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) return false; @@ -257,16 +257,36 @@ static int nla_put_port_range(struct sk_buff *skb, int attrtype, return nla_put(skb, attrtype, sizeof(*range), range); } +static bool fib_rule_iif_match(const struct fib_rule *rule, int iifindex, + const struct flowi *fl) +{ + u8 iif_is_l3_master = READ_ONCE(rule->iif_is_l3_master); + + return iif_is_l3_master ? l3mdev_fib_rule_iif_match(fl, iifindex) : + fl->flowi_iif == iifindex; +} + +static bool fib_rule_oif_match(const struct fib_rule *rule, int oifindex, + const struct flowi *fl) +{ + u8 oif_is_l3_master = READ_ONCE(rule->oif_is_l3_master); + + return oif_is_l3_master ? l3mdev_fib_rule_oif_match(fl, oifindex) : + fl->flowi_oif == oifindex; +} + static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { - int ret = 0; + int iifindex, oifindex, ret = 0; - if (rule->iifindex && (rule->iifindex != fl->flowi_iif)) + iifindex = READ_ONCE(rule->iifindex); + if (iifindex && !fib_rule_iif_match(rule, iifindex, fl)) goto out; - if (rule->oifindex && (rule->oifindex != fl->flowi_oif)) + oifindex = READ_ONCE(rule->oifindex); + if (oifindex && !fib_rule_oif_match(rule, oifindex, fl)) goto out; if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) @@ -371,7 +391,8 @@ static int call_fib_rule_notifiers(struct net *net, .rule = rule, }; - ASSERT_RTNL(); + ASSERT_RTNL_NET(net); + /* Paired with READ_ONCE() in fib_rules_seq() */ WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1); return call_fib_notifiers(net, event_type, &info.info); @@ -459,9 +480,6 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops, if (rule->tun_id && r->tun_id != rule->tun_id) continue; - if (r->fr_net != rule->fr_net) - continue; - if (rule->l3mdev && r->l3mdev != rule->l3mdev) continue; @@ -481,11 +499,17 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops, &rule->sport_range)) continue; + if (rule->sport_mask && r->sport_mask != rule->sport_mask) + continue; + if (fib_rule_port_range_set(&rule->dport_range) && !fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; + if (rule->dport_mask && r->dport_mask != rule->dport_mask) + continue; + if (!ops->compare(r, frh, tb)) continue; return r; @@ -515,14 +539,40 @@ static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, } #endif -static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, +static int fib_nl2rule_port_mask(const struct nlattr *mask_attr, + const struct fib_rule_port_range *range, + u16 *port_mask, + struct netlink_ext_ack *extack) +{ + if (!fib_rule_port_range_valid(range)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, + "Cannot specify port mask without port value"); + return -EINVAL; + } + + if (fib_rule_port_is_range(range)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, + "Cannot specify port mask for port range"); + return -EINVAL; + } + + if (range->start & ~nla_get_u16(mask_attr)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, "Invalid port mask"); + return -EINVAL; + } + + *port_mask = 
nla_get_u16(mask_attr); + + return 0; +} + +static int fib_nl2rule(struct net *net, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct fib_rules_ops *ops, struct nlattr *tb[], struct fib_rule **rule, bool *user_priority) { - struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rule *nlrule = NULL; int err = -EINVAL; @@ -554,30 +604,18 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[FRA_PRIORITY]) { nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]); *user_priority = true; - } else { - nlrule->pref = fib_default_rule_pref(ops); } nlrule->proto = nla_get_u8_default(tb[FRA_PROTOCOL], RTPROT_UNSPEC); if (tb[FRA_IIFNAME]) { - struct net_device *dev; - nlrule->iifindex = -1; nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, nlrule->iifname); - if (dev) - nlrule->iifindex = dev->ifindex; } if (tb[FRA_OIFNAME]) { - struct net_device *dev; - nlrule->oifindex = -1; nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, nlrule->oifname); - if (dev) - nlrule->oifindex = dev->ifindex; } if (tb[FRA_FWMARK]) { @@ -619,11 +657,6 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, } nlrule->target = nla_get_u32(tb[FRA_GOTO]); - /* Backward jumps are prohibited to avoid endless loops */ - if (nlrule->target <= nlrule->pref) { - NL_SET_ERR_MSG(extack, "Backward goto not supported"); - goto errout_free; - } } else if (nlrule->action == FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); goto errout_free; @@ -662,6 +695,16 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, NL_SET_ERR_MSG(extack, "Invalid sport range"); goto errout_free; } + if (!fib_rule_port_is_range(&nlrule->sport_range)) + nlrule->sport_mask = U16_MAX; + } + + if (tb[FRA_SPORT_MASK]) { + err = fib_nl2rule_port_mask(tb[FRA_SPORT_MASK], + &nlrule->sport_range, + &nlrule->sport_mask, extack); + if (err) + goto errout_free; } if (tb[FRA_DPORT_RANGE]) { @@ -671,6 +714,16 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, NL_SET_ERR_MSG(extack, "Invalid dport range"); goto errout_free; } + if (!fib_rule_port_is_range(&nlrule->dport_range)) + nlrule->dport_mask = U16_MAX; + } + + if (tb[FRA_DPORT_MASK]) { + err = fib_nl2rule_port_mask(tb[FRA_DPORT_MASK], + &nlrule->dport_range, + &nlrule->dport_mask, extack); + if (err) + goto errout_free; } *rule = nlrule; @@ -683,6 +736,43 @@ errout: return err; } +static int fib_nl2rule_rtnl(struct fib_rule *nlrule, + struct fib_rules_ops *ops, + struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + if (!tb[FRA_PRIORITY]) + nlrule->pref = fib_default_rule_pref(ops); + + /* Backward jumps are prohibited to avoid endless loops */ + if (tb[FRA_GOTO] && nlrule->target <= nlrule->pref) { + NL_SET_ERR_MSG(extack, "Backward goto not supported"); + return -EINVAL; + } + + if (tb[FRA_IIFNAME]) { + struct net_device *dev; + + dev = __dev_get_by_name(nlrule->fr_net, nlrule->iifname); + if (dev) { + nlrule->iifindex = dev->ifindex; + nlrule->iif_is_l3_master = netif_is_l3_master(dev); + } + } + + if (tb[FRA_OIFNAME]) { + struct net_device *dev; + + dev = __dev_get_by_name(nlrule->fr_net, nlrule->oifname); + if (dev) { + nlrule->oifindex = dev->ifindex; + nlrule->oif_is_l3_master = netif_is_l3_master(dev); + } + } + + return 0; +} + static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule) { @@ -719,9 +809,6 @@ static int rule_exists(struct 
fib_rules_ops *ops, struct fib_rule_hdr *frh, if (r->tun_id != rule->tun_id) continue; - if (r->fr_net != rule->fr_net) - continue; - if (r->l3mdev != rule->l3mdev) continue; @@ -739,10 +826,16 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, &rule->sport_range)) continue; + if (r->sport_mask != rule->sport_mask) + continue; + if (!fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; + if (r->dport_mask != rule->dport_mask) + continue; + if (!ops->compare(r, frh, tb)) continue; return 1; @@ -770,20 +863,25 @@ static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2), + [FRA_FLOWLABEL] = { .type = NLA_BE32 }, + [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 }, + [FRA_SPORT_MASK] = { .type = NLA_U16 }, + [FRA_DPORT_MASK] = { .type = NLA_U16 }, + [FRA_DSCP_MASK] = NLA_POLICY_MASK(NLA_U8, INET_DSCP_MASK >> 2), }; -int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held) { - struct net *net = sock_net(skb->sk); - struct fib_rule_hdr *frh = nlmsg_data(nlh); - struct fib_rules_ops *ops = NULL; struct fib_rule *rule = NULL, *r, *last = NULL; - struct nlattr *tb[FRA_MAX + 1]; int err = -EINVAL, unresolved = 0; + struct fib_rules_ops *ops = NULL; + struct nlattr *tb[FRA_MAX + 1]; bool user_priority = false; + struct fib_rule_hdr *frh; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } @@ -802,10 +900,17 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority); + err = fib_nl2rule(net, nlh, extack, ops, tb, &rule, &user_priority); if (err) goto errout; + if (!rtnl_held) + rtnl_net_lock(net); + + err = fib_nl2rule_rtnl(rule, ops, tb, extack); + if (err) + goto errout_free; + if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; @@ -867,31 +972,45 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, if (rule->tun_id) ip_tunnel_need_metadata(); + fib_rule_get(rule); + + if (!rtnl_held) + rtnl_net_unlock(net); + notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); + fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); return 0; errout_free: + if (!rtnl_held) + rtnl_net_unlock(net); kfree(rule); errout: rules_ops_put(ops); return err; } -EXPORT_SYMBOL_GPL(fib_nl_newrule); +EXPORT_SYMBOL_GPL(fib_newrule); -int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - struct net *net = sock_net(skb->sk); - struct fib_rule_hdr *frh = nlmsg_data(nlh); + return fib_newrule(sock_net(skb->sk), skb, nlh, extack, false); +} + +int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held) +{ + struct fib_rule *rule = NULL, *nlrule = NULL; struct fib_rules_ops *ops = NULL; - struct fib_rule *rule = NULL, *r, *nlrule = NULL; struct nlattr *tb[FRA_MAX+1]; - int err = -EINVAL; bool user_priority = false; + struct fib_rule_hdr 
*frh; + int err = -EINVAL; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid msg length"); goto errout; } @@ -910,25 +1029,32 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority); + err = fib_nl2rule(net, nlh, extack, ops, tb, &nlrule, &user_priority); if (err) goto errout; + if (!rtnl_held) + rtnl_net_lock(net); + + err = fib_nl2rule_rtnl(nlrule, ops, tb, extack); + if (err) + goto errout_free; + rule = rule_find(ops, frh, tb, nlrule, user_priority); if (!rule) { err = -ENOENT; - goto errout; + goto errout_free; } if (rule->flags & FIB_RULE_PERMANENT) { err = -EPERM; - goto errout; + goto errout_free; } if (ops->delete) { err = ops->delete(rule); if (err) - goto errout; + goto errout_free; } if (rule->tun_id) @@ -950,7 +1076,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, * current if it is goto rule, have actually been added. */ if (ops->nr_goto_rules > 0) { - struct fib_rule *n; + struct fib_rule *n, *r; n = list_next_entry(rule, list); if (&n->list == &ops->rules_list || n->pref != rule->pref) @@ -964,22 +1090,33 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } } - call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, - NULL); - notify_rule_change(RTM_DELRULE, rule, ops, nlh, - NETLINK_CB(skb).portid); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL); + + if (!rtnl_held) + rtnl_net_unlock(net); + + notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); kfree(nlrule); return 0; -errout: +errout_free: + if (!rtnl_held) + rtnl_net_unlock(net); kfree(nlrule); +errout: rules_ops_put(ops); return err; } -EXPORT_SYMBOL_GPL(fib_nl_delrule); +EXPORT_SYMBOL_GPL(fib_delrule); + +static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return fib_delrule(sock_net(skb->sk), skb, nlh, extack, false); +} static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) @@ -998,7 +1135,9 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(1) /* FRA_PROTOCOL */ + nla_total_size(1) /* FRA_IP_PROTO */ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ - + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */ + + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_DPORT_RANGE */ + + nla_total_size(2) /* FRA_SPORT_MASK */ + + nla_total_size(2); /* FRA_DPORT_MASK */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -1039,14 +1178,14 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (rule->iifname[0]) { if (nla_put_string(skb, FRA_IIFNAME, rule->iifname)) goto nla_put_failure; - if (rule->iifindex == -1) + if (READ_ONCE(rule->iifindex) == -1) frh->flags |= FIB_RULE_IIF_DETACHED; } if (rule->oifname[0]) { if (nla_put_string(skb, FRA_OIFNAME, rule->oifname)) goto nla_put_failure; - if (rule->oifindex == -1) + if (READ_ONCE(rule->oifindex) == -1) frh->flags |= FIB_RULE_OIF_DETACHED; } @@ -1066,8 +1205,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, nla_put_uid_range(skb, &rule->uid_range)) || (fib_rule_port_range_set(&rule->sport_range) && nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || + (rule->sport_mask && nla_put_u16(skb, 
FRA_SPORT_MASK, + rule->sport_mask)) || (fib_rule_port_range_set(&rule->dport_range) && nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || + (rule->dport_mask && nla_put_u16(skb, FRA_DPORT_MASK, + rule->dport_mask)) || (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) goto nla_put_failure; @@ -1119,12 +1262,12 @@ static int fib_valid_dumprule_req(const struct nlmsghdr *nlh, { struct fib_rule_hdr *frh; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { + frh = nlmsg_payload(nlh, sizeof(*frh)); + if (!frh) { NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request"); return -EINVAL; } - frh = nlmsg_data(nlh); if (frh->dst_len || frh->src_len || frh->tos || frh->table || frh->res1 || frh->res2 || frh->action || frh->flags) { NL_SET_ERR_MSG(extack, @@ -1217,11 +1360,17 @@ static void attach_rules(struct list_head *rules, struct net_device *dev) list_for_each_entry(rule, rules, list) { if (rule->iifindex == -1 && - strcmp(dev->name, rule->iifname) == 0) - rule->iifindex = dev->ifindex; + strcmp(dev->name, rule->iifname) == 0) { + WRITE_ONCE(rule->iifindex, dev->ifindex); + WRITE_ONCE(rule->iif_is_l3_master, + netif_is_l3_master(dev)); + } if (rule->oifindex == -1 && - strcmp(dev->name, rule->oifname) == 0) - rule->oifindex = dev->ifindex; + strcmp(dev->name, rule->oifname) == 0) { + WRITE_ONCE(rule->oifindex, dev->ifindex); + WRITE_ONCE(rule->oif_is_l3_master, + netif_is_l3_master(dev)); + } } } @@ -1230,10 +1379,14 @@ static void detach_rules(struct list_head *rules, struct net_device *dev) struct fib_rule *rule; list_for_each_entry(rule, rules, list) { - if (rule->iifindex == dev->ifindex) - rule->iifindex = -1; - if (rule->oifindex == dev->ifindex) - rule->oifindex = -1; + if (rule->iifindex == dev->ifindex) { + WRITE_ONCE(rule->iifindex, -1); + WRITE_ONCE(rule->iif_is_l3_master, false); + } + if (rule->oifindex == dev->ifindex) { + WRITE_ONCE(rule->oifindex, -1); + WRITE_ONCE(rule->oif_is_l3_master, false); + } } } @@ -1291,8 +1444,10 @@ static struct pernet_operations fib_rules_net_ops = { }; static const struct rtnl_msg_handler fib_rules_rtnl_msg_handlers[] __initconst = { - {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule}, - {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule}, + {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule, + .flags = RTNL_FLAG_DOIT_PERNET}, + {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule, + .flags = RTNL_FLAG_DOIT_PERNET}, {.msgtype = RTM_GETRULE, .dumpit = fib_nl_dumprule, .flags = RTNL_FLAG_DUMP_UNLOCKED}, }; diff --git a/net/core/filter.c b/net/core/filter.c index 2fb45a86f3dd..327ca73f9cd7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -218,24 +218,36 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) return 0; } +static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset) +{ + if (likely(offset >= 0)) + return offset; + + if (offset >= SKF_NET_OFF) + return offset - SKF_NET_OFF + skb_network_offset(skb); + + if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb)) + return offset - SKF_LL_OFF + skb_mac_offset(skb); + + return INT_MIN; +} + BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - u8 tmp, *ptr; + u8 tmp; const int len = sizeof(tmp); - if (offset >= 0) { - if (headlen - offset >= len) - return *(u8 *)(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return tmp; - } else { - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return *(u8 
*)ptr; - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return *(u8 *)(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return tmp; + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, @@ -248,21 +260,19 @@ BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - __be16 tmp, *ptr; + __be16 tmp; const int len = sizeof(tmp); - if (offset >= 0) { - if (headlen - offset >= len) - return get_unaligned_be16(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return be16_to_cpu(tmp); - } else { - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return get_unaligned_be16(ptr); - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return get_unaligned_be16(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be16_to_cpu(tmp); + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, @@ -275,21 +285,19 @@ BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, data, int, headlen, int, offset) { - __be32 tmp, *ptr; + __be32 tmp; const int len = sizeof(tmp); - if (likely(offset >= 0)) { - if (headlen - offset >= len) - return get_unaligned_be32(data + offset); - if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) - return be32_to_cpu(tmp); - } else { - ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); - if (likely(ptr)) - return get_unaligned_be32(ptr); - } + offset = bpf_skb_load_helper_convert_offset(skb, offset); + if (offset == INT_MIN) + return -EFAULT; - return -EFAULT; + if (headlen - offset >= len) + return get_unaligned_be32(data + offset); + if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) + return be32_to_cpu(tmp); + else + return -EFAULT; } BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, @@ -1960,10 +1968,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, bool is_pseudo = flags & BPF_F_PSEUDO_HDR; bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; bool do_mforce = flags & BPF_F_MARK_ENFORCE; + bool is_ipv6 = flags & BPF_F_IPV6; __sum16 *ptr; if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | - BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) + BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK | BPF_F_IPV6))) return -EINVAL; if (unlikely(offset > 0xffff || offset & 1)) return -EFAULT; @@ -1979,7 +1988,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, if (unlikely(from != 0)) return -EINVAL; - inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); + inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, is_ipv6); break; case 2: inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); @@ -2501,6 +2510,7 @@ int skb_do_redirect(struct sk_buff *skb) goto out_drop; skb->dev = dev; dev_sw_netstats_rx_add(dev, skb->len); + skb_scrub_packet(skb, false); return -EAGAIN; } return flags & BPF_F_NEIGH ? 
@@ -4128,13 +4138,13 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) } static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, - struct xdp_mem_info *mem_info, bool release) + enum xdp_mem_type mem_type, bool release) { struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); if (release) { xsk_buff_del_tail(zc_frag); - __xdp_return(NULL, mem_info, false, zc_frag); + __xdp_return(0, mem_type, false, zc_frag); } else { zc_frag->data_end -= shrink; } @@ -4143,19 +4153,16 @@ static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, int shrink) { - struct xdp_mem_info *mem_info = &xdp->rxq->mem; + enum xdp_mem_type mem_type = xdp->rxq->mem.type; bool release = skb_frag_size(frag) == shrink; - if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { - bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release); + if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { + bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release); goto out; } - if (release) { - struct page *page = skb_frag_page(frag); - - __xdp_return(page_address(page), mem_info, false, NULL); - } + if (release) + __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL); out: return release; @@ -4357,9 +4364,9 @@ u32 xdp_master_redirect(struct xdp_buff *xdp) EXPORT_SYMBOL_GPL(xdp_master_redirect); static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri, - struct net_device *dev, + const struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; @@ -4380,10 +4387,10 @@ err: return err; } -static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri, - struct net_device *dev, - struct xdp_frame *xdpf, - struct bpf_prog *xdp_prog) +static __always_inline int +__xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev, + struct xdp_frame *xdpf, + const struct bpf_prog *xdp_prog) { enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; @@ -4452,7 +4459,7 @@ err: } int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; @@ -4466,7 +4473,8 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, EXPORT_SYMBOL_GPL(xdp_do_redirect); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, - struct xdp_frame *xdpf, struct bpf_prog *xdp_prog) + struct xdp_frame *xdpf, + const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; @@ -4481,9 +4489,9 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect_frame); static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog, void *fwd, - enum bpf_map_type map_type, u32 map_id, - u32 flags) + const struct bpf_prog *xdp_prog, + void *fwd, enum bpf_map_type map_type, + u32 map_id, u32 flags) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); struct bpf_map *map; @@ -4537,7 +4545,8 @@ err: } int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, - struct xdp_buff *xdp, struct bpf_prog *xdp_prog) + struct xdp_buff *xdp, + const struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); enum bpf_map_type map_type = ri->map_type; @@ -5223,6 +5232,25 @@ static const 
struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt) +{ + u32 sk_bpf_cb_flags; + + if (getopt) { + *(u32 *)optval = sk->sk_bpf_cb_flags; + return 0; + } + + sk_bpf_cb_flags = *(u32 *)optval; + + if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK) + return -EINVAL; + + sk->sk_bpf_cb_flags = sk_bpf_cb_flags; + + return 0; +} + static int sol_socket_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) @@ -5239,6 +5267,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname, case SO_MAX_PACING_RATE: case SO_BINDTOIFINDEX: case SO_TXREHASH: + case SK_BPF_CB_FLAGS: if (*optlen != sizeof(int)) return -EINVAL; break; @@ -5248,6 +5277,9 @@ static int sol_socket_sockopt(struct sock *sk, int optname, return -EINVAL; } + if (optname == SK_BPF_CB_FLAGS) + return sk_bpf_set_get_cb_flags(sk, optval, getopt); + if (getopt) { if (optname == SO_BINDTODEVICE) return -EINVAL; @@ -5260,6 +5292,38 @@ static int sol_socket_sockopt(struct sock *sk, int optname, KERNEL_SOCKPTR(optval), *optlen); } +static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname, + char *optval, int optlen) +{ + if (optlen != sizeof(int)) + return -EINVAL; + + switch (optname) { + case TCP_BPF_SOCK_OPS_CB_FLAGS: { + int cb_flags = tcp_sk(sk)->bpf_sock_ops_cb_flags; + + memcpy(optval, &cb_flags, optlen); + break; + } + case TCP_BPF_RTO_MIN: { + int rto_min_us = jiffies_to_usecs(inet_csk(sk)->icsk_rto_min); + + memcpy(optval, &rto_min_us, optlen); + break; + } + case TCP_BPF_DELACK_MAX: { + int delack_max_us = jiffies_to_usecs(inet_csk(sk)->icsk_delack_max); + + memcpy(optval, &delack_max_us, optlen); + break; + } + default: + return -EINVAL; + } + + return 0; +} + static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, char *optval, int optlen) { @@ -5383,6 +5447,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, case TCP_USER_TIMEOUT: case TCP_NOTSENT_LOWAT: case TCP_SAVE_SYN: + case TCP_RTO_MAX_MS: if (*optlen != sizeof(int)) return -EINVAL; break; @@ -5392,20 +5457,9 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, if (*optlen < 1) return -EINVAL; break; - case TCP_BPF_SOCK_OPS_CB_FLAGS: - if (*optlen != sizeof(int)) - return -EINVAL; - if (getopt) { - struct tcp_sock *tp = tcp_sk(sk); - int cb_flags = tp->bpf_sock_ops_cb_flags; - - memcpy(optval, &cb_flags, *optlen); - return 0; - } - return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); default: if (getopt) - return -EINVAL; + return bpf_sol_tcp_getsockopt(sk, optname, optval, *optlen); return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); } @@ -5501,6 +5555,11 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; } +static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock) +{ + return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB; +} + static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { @@ -5651,6 +5710,9 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } @@ -5736,6 +5798,9 @@ static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, 
optname, char *, optval, int, optlen) { + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) { int ret, copy_len = 0; @@ -5778,6 +5843,9 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, struct sock *sk = bpf_sock->sk; int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) return -EINVAL; @@ -7587,6 +7655,9 @@ BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, u8 search_kind, search_len, copy_len, magic_len; int ret; + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + /* 2 byte is the minimal option len except TCPOPT_NOP and * TCPOPT_EOL which are useless for the bpf prog to learn * and this helper disallow loading them also. @@ -7652,7 +7723,7 @@ static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -7953,10 +8024,6 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; @@ -7982,10 +8049,6 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) if (func_proto) return func_proto; - func_proto = cgroup_current_func_proto(func_id, prog); - if (func_proto) - return func_proto; - switch (func_id) { case BPF_FUNC_bind: switch (prog->expected_attach_type) { @@ -8076,6 +8139,8 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_load_bytes_relative_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; case BPF_FUNC_perf_event_output: @@ -8417,18 +8482,12 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_pop_data_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sk_msg_proto; -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif default: return bpf_sk_base_func_proto(func_id, prog); } @@ -9079,7 +9138,8 @@ static bool xdp_is_valid_access(int off, int size, return __is_valid_xdp_access(off, size); } -void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act) +void bpf_warn_invalid_xdp_action(const struct net_device *dev, + const struct bpf_prog *prog, u32 act) { const u32 act_max = XDP_REDIRECT; @@ -9635,7 +9695,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, case offsetof(struct __sk_buff, queue_mapping): if (type == BPF_WRITE) { - u32 off = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size); + u32 offset = bpf_target_off(struct sk_buff, queue_mapping, 2, 
target_size); if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) { *insn++ = BPF_JMP_A(0); /* noop */ @@ -9644,7 +9704,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, if (BPF_CLASS(si->code) == BPF_STX) *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1); - *insn++ = BPF_EMIT_STORE(BPF_H, si, off); + *insn++ = BPF_EMIT_STORE(BPF_H, si, offset); } else { *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, bpf_target_off(struct sk_buff, @@ -10358,10 +10418,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ - is_fullsock), \ + is_locked_tcp_sock), \ fullsock_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ + is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ if (si->dst_reg == si->src_reg) \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ @@ -10446,10 +10506,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ - is_fullsock), \ + is_locked_tcp_sock), \ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ + is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ @@ -12062,6 +12122,25 @@ __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk, #endif } +__bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, + u64 flags) +{ + struct sk_buff *skb; + + if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB) + return -EOPNOTSUPP; + + if (flags) + return -EINVAL; + + skb = skops->skb; + skb_shinfo(skb)->tx_flags |= SKBTX_BPF; + TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF; + skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; + + return 0; +} + __bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, @@ -12095,6 +12174,10 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) +BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) +BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) + static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_skb, @@ -12115,6 +12198,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = { .set = &bpf_kfunc_check_set_tcp_reqsk, }; +static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_sock_ops, +}; + static int __init bpf_kfunc_init(void) { int ret; @@ -12133,7 +12221,8 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); - return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); + return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops); } late_initcall(bpf_kfunc_init); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0e638a37aa09..1b61bb25ba0e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -106,7 +106,7 @@ int 
flow_dissector_bpf_prog_attach_check(struct net *net, #endif /* CONFIG_BPF_SYSCALL */ /** - * __skb_flow_get_ports - extract the upper layer ports and return them + * skb_flow_get_ports - extract the upper layer ports and return them * @skb: sk_buff to extract the ports from * @thoff: transport header offset * @ip_proto: protocol for which to get port offset @@ -116,8 +116,8 @@ int flow_dissector_bpf_prog_attach_check(struct net *net, * The function will try to retrieve the ports at offset thoff + poff where poff * is the protocol port offset returned from proto_ports_offset */ -__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, - const void *data, int hlen) +__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + const void *data, int hlen) { int poff = proto_ports_offset(ip_proto); @@ -137,7 +137,7 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, return 0; } -EXPORT_SYMBOL(__skb_flow_get_ports); +EXPORT_SYMBOL(skb_flow_get_ports); static bool icmp_has_id(u8 type) { @@ -853,23 +853,30 @@ __skb_flow_dissect_ports(const struct sk_buff *skb, void *target_container, const void *data, int nhoff, u8 ip_proto, int hlen) { - enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX; - struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_ports_range *key_ports_range = NULL; + struct flow_dissector_key_ports *key_ports = NULL; + __be32 ports; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) - dissector_ports = FLOW_DISSECTOR_KEY_PORTS; - else if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE)) - dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE; + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); - if (dissector_ports == FLOW_DISSECTOR_KEY_MAX) + if (!key_ports && !key_ports_range) return; - key_ports = skb_flow_dissector_target(flow_dissector, - dissector_ports, - target_container); - key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, - data, hlen); + ports = skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); + + if (key_ports) + key_ports->ports = ports; + + if (key_ports_range) + key_ports_range->tp.ports = ports; } static void @@ -924,6 +931,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector *flow_dissector, void *target_container) { + struct flow_dissector_key_ports_range *key_ports_range = NULL; struct flow_dissector_key_ports *key_ports = NULL; struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; @@ -968,20 +976,21 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); - else if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE)) - key_ports = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE, - target_container); - - if (key_ports) { key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } + if 
(dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE)) { + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + key_ports_range->tp.src = flow_keys->sport; + key_ports_range->tp.dst = flow_keys->dport; + } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { @@ -1108,10 +1117,12 @@ bool __skb_flow_dissect(const struct net *net, FLOW_DISSECTOR_KEY_BASIC, target_container); + rcu_read_lock(); + if (skb) { if (!net) { if (skb->dev) - net = dev_net(skb->dev); + net = dev_net_rcu(skb->dev); else if (skb->sk) net = sock_net(skb->sk); } @@ -1122,7 +1133,6 @@ bool __skb_flow_dissect(const struct net *net, enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog_array *run_array; - rcu_read_lock(); run_array = rcu_dereference(init_net.bpf.run_array[type]); if (!run_array) run_array = rcu_dereference(net->bpf.run_array[type]); @@ -1150,17 +1160,17 @@ bool __skb_flow_dissect(const struct net *net, prog = READ_ONCE(run_array->items[0].prog); result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); - if (result == BPF_FLOW_DISSECTOR_CONTINUE) - goto dissect_continue; - __skb_flow_bpf_to_target(&flow_keys, flow_dissector, - target_container); - rcu_read_unlock(); - return result == BPF_OK; + if (result != BPF_FLOW_DISSECTOR_CONTINUE) { + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, + target_container); + rcu_read_unlock(); + return result == BPF_OK; + } } -dissect_continue: - rcu_read_unlock(); } + rcu_read_unlock(); + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 412816076b8b..2b821b9a8699 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -177,7 +177,7 @@ int gen_new_estimator(struct gnet_stats_basic_sync *bstats, spin_lock_bh(lock); old = rcu_dereference_protected(*rate_est, 1); if (old) { - del_timer_sync(&old->timer); + timer_delete_sync(&old->timer); est->avbps = old->avbps; est->avpps = old->avpps; } diff --git a/net/core/gro.c b/net/core/gro.c index d1f44084e978..b350e5b69549 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -7,9 +7,6 @@ #define MAX_GRO_SKBS 8 -/* This should be increased if a protocol with a bigger head is added. 
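Returning to the flow-dissector hunks above: both FLOW_DISSECTOR_KEY_PORTS and FLOW_DISSECTOR_KEY_PORTS_RANGE can now be filled from the same skb_flow_get_ports() result because a single __be32 carries both ports in on-wire order. Roughly, paraphrasing include/net/flow_dissector.h:

/* One 32-bit load at thoff + proto_ports_offset(ip_proto) yields the
 * TCP/UDP source and destination ports back to back, so both keys can
 * share it; flow_dissector_key_ports_range embeds this struct as its
 * .tp member, which is why key_ports_range->tp.ports works above. */
struct flow_dissector_key_ports {
	union {
		__be32 ports;
		struct {
			__be16 src;
			__be16 dst;
		};
	};
};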
*/ -#define GRO_MAX_HEAD (MAX_HEADER + 128) - static DEFINE_SPINLOCK(offload_lock); /** @@ -253,8 +250,7 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) return 0; } - -static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) +static void gro_complete(struct gro_node *gro, struct sk_buff *skb) { struct list_head *head = &net_hotdata.offload_base; struct packet_offload *ptype; @@ -287,43 +283,43 @@ static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) } out: - gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); + gro_normal_one(gro, skb, NAPI_GRO_CB(skb)->count); } -static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, - bool flush_old) +static void __gro_flush_chain(struct gro_node *gro, u32 index, bool flush_old) { - struct list_head *head = &napi->gro_hash[index].list; + struct list_head *head = &gro->hash[index].list; struct sk_buff *skb, *p; list_for_each_entry_safe_reverse(skb, p, head, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; skb_list_del_init(skb); - napi_gro_complete(napi, skb); - napi->gro_hash[index].count--; + gro_complete(gro, skb); + gro->hash[index].count--; } - if (!napi->gro_hash[index].count) - __clear_bit(index, &napi->gro_bitmask); + if (!gro->hash[index].count) + __clear_bit(index, &gro->bitmask); } -/* napi->gro_hash[].list contains packets ordered by age. +/* + * gro->hash[].list contains packets ordered by age. * youngest packets at the head of it. * Complete skbs in reverse order to reduce latencies. */ -void napi_gro_flush(struct napi_struct *napi, bool flush_old) +void __gro_flush(struct gro_node *gro, bool flush_old) { - unsigned long bitmask = napi->gro_bitmask; + unsigned long bitmask = gro->bitmask; unsigned int i, base = ~0U; while ((i = ffs(bitmask)) != 0) { bitmask >>= i; base += i; - __napi_gro_flush_chain(napi, base, flush_old); + __gro_flush_chain(gro, base, flush_old); } } -EXPORT_SYMBOL(napi_gro_flush); +EXPORT_SYMBOL(__gro_flush); static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb, const struct sk_buff *p, @@ -442,7 +438,7 @@ static void gro_try_pull_from_frag0(struct sk_buff *skb) gro_pull_from_frag0(skb, grow); } -static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) +static void gro_flush_oldest(struct gro_node *gro, struct list_head *head) { struct sk_buff *oldest; @@ -458,14 +454,15 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) * SKB to the chain. 
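The bitmask walk kept by __gro_flush() above deserves a worked example, since the ffs()-based loop visits only occupied hash buckets. A standalone sketch:

/* For bitmask = 0b10010, ffs() returns 2 then 3 (the 1-based index of
 * the lowest set bit), so base becomes 1 then 4: exactly the two
 * occupied buckets, with empty ones never touched. */
static void gro_bitmask_walk_example(unsigned long bitmask)
{
	unsigned int i, base = ~0U;

	while ((i = ffs(bitmask)) != 0) {
		bitmask >>= i;
		base += i;
		pr_info("would flush bucket %u\n", base);
	}
}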
*/ skb_list_del_init(oldest); - napi_gro_complete(napi, oldest); + gro_complete(gro, oldest); } -static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +static enum gro_result dev_gro_receive(struct gro_node *gro, + struct sk_buff *skb) { u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); - struct gro_list *gro_list = &napi->gro_hash[bucket]; struct list_head *head = &net_hotdata.offload_base; + struct gro_list *gro_list = &gro->hash[bucket]; struct packet_offload *ptype; __be16 type = skb->protocol; struct sk_buff *pp = NULL; @@ -529,7 +526,7 @@ found_ptype: if (pp) { skb_list_del_init(pp); - napi_gro_complete(napi, pp); + gro_complete(gro, pp); gro_list->count--; } @@ -540,7 +537,7 @@ found_ptype: goto normal; if (unlikely(gro_list->count >= MAX_GRO_SKBS)) - gro_flush_oldest(napi, &gro_list->list); + gro_flush_oldest(gro, &gro_list->list); else gro_list->count++; @@ -554,10 +551,10 @@ found_ptype: ret = GRO_HELD; ok: if (gro_list->count) { - if (!test_bit(bucket, &napi->gro_bitmask)) - __set_bit(bucket, &napi->gro_bitmask); - } else if (test_bit(bucket, &napi->gro_bitmask)) { - __clear_bit(bucket, &napi->gro_bitmask); + if (!test_bit(bucket, &gro->bitmask)) + __set_bit(bucket, &gro->bitmask); + } else if (test_bit(bucket, &gro->bitmask)) { + __clear_bit(bucket, &gro->bitmask); } return ret; @@ -596,13 +593,12 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); -static gro_result_t napi_skb_finish(struct napi_struct *napi, - struct sk_buff *skb, - gro_result_t ret) +static gro_result_t gro_skb_finish(struct gro_node *gro, struct sk_buff *skb, + gro_result_t ret) { switch (ret) { case GRO_NORMAL: - gro_normal_one(napi, skb, 1); + gro_normal_one(gro, skb, 1); break; case GRO_MERGED_FREE: @@ -623,21 +619,21 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi, return ret; } -gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb) { gro_result_t ret; - skb_mark_napi_id(skb, napi); + __skb_mark_napi_id(skb, gro); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb, 0); - ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); + ret = gro_skb_finish(gro, skb, dev_gro_receive(gro, skb)); trace_napi_gro_receive_exit(ret); return ret; } -EXPORT_SYMBOL(napi_gro_receive); +EXPORT_SYMBOL(gro_receive_skb); static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { @@ -656,6 +652,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; + skb->ip_summed = CHECKSUM_NONE; skb_shinfo(skb)->gso_type = 0; skb_shinfo(skb)->gso_size = 0; if (unlikely(skb->slow_gro)) { @@ -693,7 +690,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); if (ret == GRO_NORMAL) - gro_normal_one(napi, skb, 1); + gro_normal_one(&napi->gro, skb, 1); break; case GRO_MERGED_FREE: @@ -762,7 +759,7 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) trace_napi_gro_frags_entry(skb); - ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + ret = napi_frags_finish(napi, skb, dev_gro_receive(&napi->gro, skb)); trace_napi_gro_frags_exit(ret); return ret; @@ -794,3 +791,37 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) return sum; } EXPORT_SYMBOL(__skb_gro_checksum_complete); + +void gro_init(struct gro_node *gro) +{ + for 
(u32 i = 0; i < GRO_HASH_BUCKETS; i++) { + INIT_LIST_HEAD(&gro->hash[i].list); + gro->hash[i].count = 0; + } + + gro->bitmask = 0; + gro->cached_napi_id = 0; + + INIT_LIST_HEAD(&gro->rx_list); + gro->rx_count = 0; +} + +void gro_cleanup(struct gro_node *gro) +{ + struct sk_buff *skb, *n; + + for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) { + list_for_each_entry_safe(skb, n, &gro->hash[i].list, list) + kfree_skb(skb); + + gro->hash[i].count = 0; + } + + gro->bitmask = 0; + gro->cached_napi_id = 0; + + list_for_each_entry_safe(skb, n, &gro->rx_list, list) + kfree_skb(skb); + + gro->rx_count = 0; +} diff --git a/net/core/hotdata.c b/net/core/hotdata.c index d0aaaaa556f2..0bc893d5f07b 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -7,7 +7,6 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), - .ptype_all = LIST_HEAD_INIT(net_hotdata.ptype_all), .gro_normal_batch = 8, .netdev_budget = 300, diff --git a/net/core/link_watch.c b/net/core/link_watch.c index cb04ef2b9807..864f3bbc3a4c 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -183,7 +183,7 @@ static void linkwatch_do_dev(struct net_device *dev) else dev_deactivate(dev); - netdev_state_change(dev); + netif_state_change(dev); } /* Note: our callers are responsible for calling netdev_tracker_free(). * This is the reason we use __dev_put() instead of dev_put(). @@ -240,7 +240,9 @@ static void __linkwatch_run_queue(int urgent_only) */ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); spin_unlock_irq(&lweventlist_lock); + netdev_lock_ops(dev); linkwatch_do_dev(dev); + netdev_unlock_ops(dev); do_dev--; spin_lock_irq(&lweventlist_lock); } @@ -253,25 +255,41 @@ static void __linkwatch_run_queue(int urgent_only) spin_unlock_irq(&lweventlist_lock); } -void linkwatch_sync_dev(struct net_device *dev) +static bool linkwatch_clean_dev(struct net_device *dev) { unsigned long flags; - int clean = 0; + bool clean = false; spin_lock_irqsave(&lweventlist_lock, flags); if (!list_empty(&dev->link_watch_list)) { list_del_init(&dev->link_watch_list); - clean = 1; + clean = true; /* We must release netdev tracker under * the spinlock protection. 
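For readers following the napi_struct to gro_node conversion above, this is the shape of the new container as it can be inferred from gro_init()/gro_cleanup() and the accessors in this patch; the authoritative definition lives in include/net/gro.h of this cycle:

/* Inferred field layout; comments reflect how this file uses them. */
struct gro_node {
	unsigned long		bitmask;	/* occupied hash buckets */
	struct gro_list		hash[GRO_HASH_BUCKETS];
	struct list_head	rx_list;	/* GRO_NORMAL batch */
	u32			rx_count;	/* skbs queued on rx_list */
	u32			cached_napi_id;	/* 0 when not napi-backed */
};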
*/ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); } spin_unlock_irqrestore(&lweventlist_lock, flags); - if (clean) + + return clean; +} + +void __linkwatch_sync_dev(struct net_device *dev) +{ + netdev_ops_assert_locked(dev); + + if (linkwatch_clean_dev(dev)) linkwatch_do_dev(dev); } +void linkwatch_sync_dev(struct net_device *dev) +{ + if (linkwatch_clean_dev(dev)) { + netdev_lock_ops(dev); + linkwatch_do_dev(dev); + netdev_unlock_ops(dev); + } +} /* Must be called with the rtnl semaphore held */ void linkwatch_run_queue(void) diff --git a/net/core/rtnl_net_debug.c b/net/core/lock_debug.c index f406045cbd0e..9e9fb25314b9 100644 --- a/net/core/rtnl_net_debug.c +++ b/net/core/lock_debug.c @@ -6,10 +6,11 @@ #include <linux/notifier.h> #include <linux/rtnetlink.h> #include <net/net_namespace.h> +#include <net/netdev_lock.h> #include <net/netns/generic.h> -static int rtnl_net_debug_event(struct notifier_block *nb, - unsigned long event, void *ptr) +int netdev_debug_event(struct notifier_block *nb, unsigned long event, + void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); @@ -17,17 +18,21 @@ static int rtnl_net_debug_event(struct notifier_block *nb, /* Keep enum and don't add default to trigger -Werror=switch */ switch (cmd) { + case NETDEV_XDP_FEAT_CHANGE: + netdev_assert_locked(dev); + fallthrough; + case NETDEV_CHANGE: + case NETDEV_REGISTER: case NETDEV_UP: + netdev_ops_assert_locked(dev); + fallthrough; case NETDEV_DOWN: case NETDEV_REBOOT: - case NETDEV_CHANGE: - case NETDEV_REGISTER: case NETDEV_UNREGISTER: case NETDEV_CHANGEMTU: case NETDEV_CHANGEADDR: case NETDEV_PRE_CHANGEADDR: case NETDEV_GOING_DOWN: - case NETDEV_CHANGENAME: case NETDEV_FEAT_CHANGE: case NETDEV_BONDING_FAILOVER: case NETDEV_PRE_UP: @@ -56,25 +61,17 @@ static int rtnl_net_debug_event(struct notifier_block *nb, case NETDEV_OFFLOAD_XSTATS_DISABLE: case NETDEV_OFFLOAD_XSTATS_REPORT_USED: case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: - case NETDEV_XDP_FEAT_CHANGE: ASSERT_RTNL(); break; - /* Once an event fully supports RTNL_NET, move it here - * and remove "if (0)" below. - * - * case NETDEV_XXX: - * ASSERT_RTNL_NET(net); - * break; - */ - } - - /* Just to avoid unused-variable error for dev and net. 
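linkwatch_do_dev() is now bracketed by netdev_lock_ops()/netdev_unlock_ops() in both call paths above. Paraphrasing include/net/netdev_lock.h (simplified, not verbatim), the helpers are conditional, so drivers that never opted into instance locking are unaffected:

/* Sketch of the ops-lock helper used around linkwatch_do_dev():
 * a no-op unless the driver requested instance locking. */
static inline void netdev_lock_ops(struct net_device *dev)
{
	if (netdev_need_ops_lock(dev))	/* e.g. dev->request_ops_lock */
		netdev_lock(dev);	/* mutex_lock(&dev->lock) */
}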
*/ - if (0) + case NETDEV_CHANGENAME: ASSERT_RTNL_NET(net); + break; + } return NOTIFY_DONE; } +EXPORT_SYMBOL_NS_GPL(netdev_debug_event, "NETDEV_INTERNAL"); static int rtnl_net_debug_net_id; @@ -83,7 +80,7 @@ static int __net_init rtnl_net_debug_net_init(struct net *net) struct notifier_block *nb; nb = net_generic(net, rtnl_net_debug_net_id); - nb->notifier_call = rtnl_net_debug_event; + nb->notifier_call = netdev_debug_event; return register_netdevice_notifier_net(net, nb); } @@ -104,14 +101,14 @@ static struct pernet_operations rtnl_net_debug_net_ops __net_initdata = { }; static struct notifier_block rtnl_net_debug_block = { - .notifier_call = rtnl_net_debug_event, + .notifier_call = netdev_debug_event, }; static int __init rtnl_net_debug_init(void) { int ret; - ret = register_pernet_device(&rtnl_net_debug_net_ops); + ret = register_pernet_subsys(&rtnl_net_debug_net_ops); if (ret) return ret; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 711cd3b4347a..f9d76d85d04f 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -23,6 +23,8 @@ #include <net/ip6_fib.h> #include <net/rtnh.h> +#include "dev.h" + DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled); @@ -158,21 +160,14 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) return ret; } - rcu_read_lock(); - ops = rcu_dereference(lwtun_encaps[encap_type]); - rcu_read_unlock(); + ops = rcu_access_pointer(lwtun_encaps[encap_type]); #ifdef CONFIG_MODULES if (!ops) { const char *encap_type_str = lwtunnel_encap_str(encap_type); if (encap_type_str) { - __rtnl_unlock(); request_module("rtnl-lwt-%s", encap_type_str); - rtnl_lock(); - - rcu_read_lock(); - ops = rcu_dereference(lwtun_encaps[encap_type]); - rcu_read_unlock(); + ops = rcu_access_pointer(lwtun_encaps[encap_type]); } } #endif @@ -206,8 +201,7 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, } encap_type = nla_get_u16(nla_entype); - if (lwtunnel_valid_encap_type(encap_type, - extack) != 0) + if (lwtunnel_valid_encap_type(encap_type, extack)) return -EOPNOTSUPP; } } @@ -325,82 +319,132 @@ EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry *dst; + int ret; + + local_bh_disable(); - if (!dst) + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; goto drop; + } + + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; + goto drop; + } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || - lwtstate->type > LWTUNNEL_ENCAP_MAX) - return 0; + lwtstate->type > LWTUNNEL_ENCAP_MAX) { + ret = 0; + goto out; + } ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->output)) + if (likely(ops && ops->output)) { + dev_xmit_recursion_inc(); ret = ops->output(net, sk, skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; - return ret; + goto out; drop: kfree_skb(skb); +out: + local_bh_enable(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_output); int lwtunnel_xmit(struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry *dst; + int ret; - if (!dst) + local_bh_disable(); + + if 
(dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; goto drop; + } + + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; + goto drop; + } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || - lwtstate->type > LWTUNNEL_ENCAP_MAX) - return 0; + lwtstate->type > LWTUNNEL_ENCAP_MAX) { + ret = 0; + goto out; + } ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->xmit)) + if (likely(ops && ops->xmit)) { + dev_xmit_recursion_inc(); ret = ops->xmit(skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) goto drop; - return ret; + goto out; drop: kfree_skb(skb); +out: + local_bh_enable(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_xmit); int lwtunnel_input(struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; struct lwtunnel_state *lwtstate; - int ret = -EINVAL; + struct dst_entry *dst; + int ret; - if (!dst) + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); + + if (dev_xmit_recursion()) { + net_crit_ratelimited("%s(): recursion limit reached on datapath\n", + __func__); + ret = -ENETDOWN; goto drop; + } + + dst = skb_dst(skb); + if (!dst) { + ret = -EINVAL; + goto drop; + } lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || @@ -410,8 +454,11 @@ int lwtunnel_input(struct sk_buff *skb) ret = -EOPNOTSUPP; rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[lwtstate->type]); - if (likely(ops && ops->input)) + if (likely(ops && ops->input)) { + dev_xmit_recursion_inc(); ret = ops->input(skb); + dev_xmit_recursion_dec(); + } rcu_read_unlock(); if (ret == -EOPNOTSUPP) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 89656d180bc6..a6e2c91ec3e7 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -309,7 +309,7 @@ static void neigh_add_timer(struct neighbour *n, unsigned long when) static int neigh_del_timer(struct neighbour *n) { if ((n->nud_state & NUD_IN_TIMER) && - del_timer(&n->timer)) { + timer_delete(&n->timer)) { neigh_release(n); return 1; } @@ -427,7 +427,7 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, pneigh_queue_purge(&tbl->proxy_queue, dev ? 
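The three lwtunnel entry points above (output/xmit/input) now share one prologue/epilogue; condensed into a single guard, the pattern looks like the sketch below. The recursion counter is the same per-CPU one the qdisc layer uses, so an encap loop, for instance a tunnel route that resolves back to itself, becomes a dropped packet instead of a stack overflow. Illustrative only:

static int lwt_guarded_call(struct sk_buff *skb,
			    int (*encap)(struct sk_buff *skb))
{
	int ret;

	local_bh_disable();		/* the counter is per-CPU */
	if (dev_xmit_recursion()) {	/* XMIT_RECURSION_LIMIT hit */
		kfree_skb(skb);
		ret = -ENETDOWN;
		goto out;
	}

	dev_xmit_recursion_inc();
	ret = encap(skb);		/* ops->output/xmit/input */
	dev_xmit_recursion_dec();
out:
	local_bh_enable();
	return ret;
}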
dev_net(dev) : NULL, tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) - del_timer_sync(&tbl->proxy_timer); + timer_delete_sync(&tbl->proxy_timer); return 0; } @@ -518,7 +518,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) if (!ret) return NULL; - hash_heads = kvzalloc(size, GFP_ATOMIC); + hash_heads = kzalloc(size, GFP_ATOMIC); if (!hash_heads) { kfree(ret); return NULL; @@ -536,7 +536,7 @@ static void neigh_hash_free_rcu(struct rcu_head *head) struct neigh_hash_table, rcu); - kvfree(nht->hash_heads); + kfree(nht->hash_heads); kfree(nht); } @@ -832,12 +832,10 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, return -ENOENT; } -static void neigh_parms_destroy(struct neigh_parms *parms); - static inline void neigh_parms_put(struct neigh_parms *parms) { if (refcount_dec_and_test(&parms->refcnt)) - neigh_parms_destroy(parms); + kfree(parms); } /* @@ -1519,7 +1517,7 @@ out: return rc; out_kfree_skb: rc = -EINVAL; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); goto out; } EXPORT_SYMBOL(neigh_resolve_output); @@ -1543,7 +1541,7 @@ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) err = dev_queue_xmit(skb); else { err = -EINVAL; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL); } return err; } @@ -1599,7 +1597,7 @@ static void neigh_proxy_process(struct timer_list *t) } else if (!sched_next || tdif < sched_next) sched_next = tdif; } - del_timer(&tbl->proxy_timer); + timer_delete(&tbl->proxy_timer); if (sched_next) mod_timer(&tbl->proxy_timer, jiffies + sched_next); spin_unlock(&tbl->proxy_queue.lock); @@ -1630,7 +1628,7 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED; spin_lock(&tbl->proxy_queue.lock); - if (del_timer(&tbl->proxy_timer)) { + if (timer_delete(&tbl->proxy_timer)) { if (time_before(tbl->proxy_timer.expires, sched_next)) sched_next = tbl->proxy_timer.expires; } @@ -1713,11 +1711,6 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) } EXPORT_SYMBOL(neigh_parms_release); -static void neigh_parms_destroy(struct neigh_parms *parms) -{ - kfree(parms); -} - static struct lock_class_key neigh_table_proxy_queue_class; static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly; @@ -1793,7 +1786,7 @@ int neigh_table_clear(int index, struct neigh_table *tbl) /* It is not clean... 
Fix it to unload IPv6 module safely */ cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); - del_timer_sync(&tbl->proxy_timer); + timer_delete_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) @@ -2250,6 +2243,7 @@ static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = { static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_IFINDEX] = { .type = NLA_U32 }, [NDTPA_QUEUE_LEN] = { .type = NLA_U32 }, + [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 }, [NDTPA_PROXY_QLEN] = { .type = NLA_U32 }, [NDTPA_APP_PROBES] = { .type = NLA_U32 }, [NDTPA_UCAST_PROBES] = { .type = NLA_U32 }, @@ -2436,12 +2430,12 @@ static int neightbl_valid_dump_info(const struct nlmsghdr *nlh, { struct ndtmsg *ndtm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndtm))) { + ndtm = nlmsg_payload(nlh, sizeof(*ndtm)); + if (!ndtm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request"); return -EINVAL; } - ndtm = nlmsg_data(nlh); if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request"); return -EINVAL; @@ -2753,12 +2747,12 @@ static int neigh_valid_dump_req(const struct nlmsghdr *nlh, if (strict_check) { struct ndmsg *ndm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request"); @@ -2861,12 +2855,12 @@ static int neigh_valid_get_req(const struct nlmsghdr *nlh, struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request"); @@ -3447,10 +3441,12 @@ static const struct seq_operations neigh_stat_seq_ops = { static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid) { - struct net *net = dev_net(n->dev); struct sk_buff *skb; int err = -ENOBUFS; + struct net *net; + rcu_read_lock(); + net = dev_net_rcu(n->dev); skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; @@ -3463,9 +3459,11 @@ static void __neigh_notify(struct neighbour *n, int type, int flags, goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); - return; + goto out; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); +out: + rcu_read_unlock(); } void neigh_app_ns(struct neighbour *n) diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index fa6d3969734a..4f0f0709a1cb 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -132,8 +132,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v) rcu_read_lock(); fl = rcu_dereference(sd->flow_limit); + /* Pairs with WRITE_ONCE() in skb_flow_limit() */ if (fl) - flow_limit_count = fl->count; + flow_limit_count = READ_ONCE(fl->count); rcu_read_unlock(); #endif @@ -144,11 +145,11 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", - 
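The neighbour netlink validators above switch to nlmsg_payload(), which folds the "length check, then nlmsg_data()" two-step into one call that returns NULL for short messages. Roughly, paraphrasing include/net/netlink.h from this cycle:

static inline void *nlmsg_payload(const struct nlmsghdr *nlh,
				  size_t payload)
{
	if (nlh->nlmsg_len < nlmsg_msg_size(payload))
		return NULL;	/* caller reports invalid header */

	return nlmsg_data(nlh);
}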
sd->processed, atomic_read(&sd->dropped), - sd->time_squeeze, 0, + READ_ONCE(sd->processed), atomic_read(&sd->dropped), + READ_ONCE(sd->time_squeeze), 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ - sd->received_rps, flow_limit_count, + READ_ONCE(sd->received_rps), flow_limit_count, input_qlen + process_qlen, (int)seq->index, input_qlen, process_qlen); return 0; @@ -185,7 +186,13 @@ static void *ptype_get_idx(struct seq_file *seq, loff_t pos) } } - list_for_each_entry_rcu(pt, &net_hotdata.ptype_all, list) { + list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_all, list) { + if (i == pos) + return pt; + ++i; + } + + list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_specific, list) { if (i == pos) return pt; ++i; @@ -210,6 +217,7 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct net *net = seq_file_net(seq); struct net_device *dev; struct packet_type *pt; struct list_head *nxt; @@ -232,15 +240,22 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) goto found; } } - - nxt = net_hotdata.ptype_all.next; - goto ptype_all; + nxt = net->ptype_all.next; + goto net_ptype_all; } - if (pt->type == htons(ETH_P_ALL)) { -ptype_all: - if (nxt != &net_hotdata.ptype_all) + if (pt->af_packet_net) { +net_ptype_all: + if (nxt != &net->ptype_all && nxt != &net->ptype_specific) goto found; + + if (nxt == &net->ptype_all) { + /* continue with ->ptype_specific if it's not empty */ + nxt = net->ptype_specific.next; + if (nxt != &net->ptype_specific) + goto found; + } + hash = 0; nxt = ptype_base[0].next; } else diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 2d9afc6e2161..1ace0cd01adc 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -23,6 +23,7 @@ #include <linux/of.h> #include <linux/of_net.h> #include <linux/cpu.h> +#include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/rps.h> @@ -36,12 +37,93 @@ static const char fmt_uint[] = "%u\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; -/* Caller holds RTNL or RCU */ +/* Caller holds RTNL, netdev->lock or RCU */ static inline int dev_isalive(const struct net_device *dev) { return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED; } +/* There is a possible ABBA deadlock between rtnl_lock and kernfs_node->active, + * when unregistering a net device and accessing associated sysfs files. The + * potential deadlock is as follows: + * + * CPU 0 CPU 1 + * + * rtnl_lock vfs_read + * unregister_netdevice_many kernfs_seq_start + * device_del / kobject_put kernfs_get_active (kn->active++) + * kernfs_drain sysfs_kf_seq_show + * wait_event( rtnl_lock + * kn->active == KN_DEACTIVATED_BIAS) -> waits on CPU 0 to release + * -> waits on CPU 1 to decrease kn->active the rtnl lock. + * + * The historical fix was to use rtnl_trylock with restart_syscall to bail out + * of sysfs operations when the lock couldn't be taken. This fixed the above + * issue as it allowed CPU 1 to bail out of the ABBA situation. + * + * But it came with performance issues, as syscalls were being restarted in + * loops when there was contention on the rtnl lock, with huge slowdowns in + * specific scenarios (e.g. lots of virtual interfaces created and userspace + * daemons querying their attributes). + * + * The idea below is to bail out of the active kernfs_node protection + * (kn->active) while trying to take the rtnl lock.
+ * + * This replaces rtnl_lock() and still has to be used with rtnl_unlock(). The + * net device is guaranteed to be alive if this returns successfully. + */ +static int sysfs_rtnl_lock(struct kobject *kobj, struct attribute *attr, + struct net_device *ndev) +{ + struct kernfs_node *kn; + int ret = 0; + + /* First, we hold a reference to the net device as the unregistration + * path might run in parallel. This will ensure the net device and the + * associated sysfs objects won't be freed while we try to take the rtnl + * lock. + */ + dev_hold(ndev); + /* sysfs_break_active_protection was introduced to allow self-removal of + * devices and their associated sysfs files by bailing out of the + * sysfs/kernfs protection. We do this here to allow the unregistration + * path to complete in parallel. The following takes a reference on the + * kobject and the kernfs_node being accessed. + * + * This works because we hold a reference onto the net device and the + * unregistration path will wait for us eventually in netdev_run_todo + * (outside an rtnl lock section). + */ + kn = sysfs_break_active_protection(kobj, attr); + /* We can now try to take the rtnl lock. This can't deadlock us as the + * unregistration path is able to drain sysfs files (kernfs_node) thanks + * to the above dance. + */ + if (rtnl_lock_interruptible()) { + ret = -ERESTARTSYS; + goto unbreak; + } + /* Check dismantle on the device hasn't started, otherwise deny the + * operation. + */ + if (!dev_isalive(ndev)) { + rtnl_unlock(); + ret = -ENODEV; + goto unbreak; + } + /* We are now sure the device dismantle hasn't started nor that it can + * start before we exit the locking section as we hold the rtnl lock. + * There's no need to keep unbreaking the sysfs protection nor to hold + * a net device reference from that point; that was only needed to take + * the rtnl lock. + */ +unbreak: + sysfs_unbreak_active_protection(kn); + dev_put(ndev); + + return ret; +} + /* use same locking rules as GIF* ioctl's */ static ssize_t netdev_show(const struct device *dev, struct device_attribute *attr, char *buf, @@ -95,16 +177,46 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, if (ret) goto err; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + goto err; + + ret = (*set)(netdev, new); + if (ret == 0) + ret = len; + + rtnl_unlock(); + err: + return ret; +} + +/* Same as netdev_store() but takes netdev_lock() instead of rtnl_lock() */ +static ssize_t +netdev_lock_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len, + int (*set)(struct net_device *, unsigned long)) +{ + struct net_device *netdev = to_net_dev(dev); + struct net *net = dev_net(netdev); + unsigned long new; + int ret; + + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + ret = kstrtoul(buf, 0, &new); + if (ret) + return ret; + + netdev_lock(netdev); if (dev_isalive(netdev)) { ret = (*set)(netdev, new); if (ret == 0) ret = len; } - rtnl_unlock(); - err: + netdev_unlock(netdev); + return ret; } @@ -190,7 +302,7 @@ static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, struct net_device *netdev = to_net_dev(dev); /* The check is also done in change_carrier; this helps returning early - * without hitting the trylock/restart in netdev_store. + * without hitting the locking section in netdev_store. 
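Every rtnl_trylock()/restart_syscall() pair in this file is converted to the sysfs_rtnl_lock() helper shown above, which returns either with the rtnl lock held and the device guaranteed alive, or with -ERESTARTSYS/-ENODEV and nothing for the caller to undo. A minimal sketch of the resulting show() shape (attr_show_example is illustrative, not a function from the patch):

static ssize_t attr_show_example(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct net_device *netdev = to_net_dev(dev);
	ssize_t ret;

	ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
	if (ret)
		return ret;	/* interrupted, or device dismantling */

	ret = sysfs_emit(buf, "%d\n", netif_running(netdev));

	rtnl_unlock();
	return ret;
}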
*/ if (!netdev->netdev_ops->ndo_change_carrier) return -EOPNOTSUPP; @@ -202,11 +314,13 @@ static ssize_t carrier_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); - int ret = -EINVAL; + int ret; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; + ret = -EINVAL; if (netif_running(netdev)) { /* Synchronize carrier state with link watch, * see also rtnl_getlink(). @@ -215,8 +329,8 @@ static ssize_t carrier_show(struct device *dev, ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); } - rtnl_unlock(); + rtnl_unlock(); return ret; } static DEVICE_ATTR_RW(carrier); @@ -228,14 +342,16 @@ static ssize_t speed_show(struct device *dev, int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps - * returning early without hitting the trylock/restart below. + * returning early without hitting the locking section below. */ if (!netdev->ethtool_ops->get_link_ksettings) return ret; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; + ret = -EINVAL; if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; @@ -254,14 +370,16 @@ static ssize_t duplex_show(struct device *dev, int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps - * returning early without hitting the trylock/restart below. + * returning early without hitting the locking section below. */ if (!netdev->ethtool_ops->get_link_ksettings) return ret; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; + ret = -EINVAL; if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; @@ -420,7 +538,7 @@ static ssize_t gro_flush_timeout_store(struct device *dev, if (!capable(CAP_NET_ADMIN)) return -EPERM; - return netdev_store(dev, attr, buf, len, change_gro_flush_timeout); + return netdev_lock_store(dev, attr, buf, len, change_gro_flush_timeout); } NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); @@ -440,7 +558,8 @@ static ssize_t napi_defer_hard_irqs_store(struct device *dev, if (!capable(CAP_NET_ADMIN)) return -EPERM; - return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs); + return netdev_lock_store(dev, attr, buf, len, + change_napi_defer_hard_irqs); } NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_uint); @@ -450,7 +569,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); size_t count = len; - ssize_t ret = 0; + ssize_t ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -459,16 +578,15 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, if (len > 0 && buf[len - 1] == '\n') --count; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - ret = dev_set_alias(netdev, buf, count); - if (ret < 0) - goto err; - ret = len; - netdev_state_change(netdev); - } + ret = dev_set_alias(netdev, buf, count); + if (ret < 0) + goto err; + ret = len; + netdev_state_change(netdev); err: rtnl_unlock(); @@ -480,7 +598,7 @@ static ssize_t ifalias_show(struct device *dev, { const struct net_device *netdev = to_net_dev(dev); char tmp[IFALIASZ]; - ssize_t ret = 0; + ssize_t ret; ret = dev_get_alias(netdev, 
tmp, sizeof(tmp)); if (ret > 0) @@ -520,24 +638,23 @@ static ssize_t phys_port_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); - ssize_t ret = -EINVAL; + struct netdev_phys_item_id ppid; + ssize_t ret; /* The check is also done in dev_get_phys_port_id; this helps returning - * early without hitting the trylock/restart below. + * early without hitting the locking section below. */ if (!netdev->netdev_ops->ndo_get_phys_port_id) return -EOPNOTSUPP; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - struct netdev_phys_item_id ppid; + ret = dev_get_phys_port_id(netdev, &ppid); + if (!ret) + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - ret = dev_get_phys_port_id(netdev, &ppid); - if (!ret) - ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - } rtnl_unlock(); return ret; @@ -548,25 +665,24 @@ static ssize_t phys_port_name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); - ssize_t ret = -EINVAL; + char name[IFNAMSIZ]; + ssize_t ret; /* The checks are also done in dev_get_phys_port_name; this helps - * returning early without hitting the trylock/restart below. + * returning early without hitting the locking section below. */ if (!netdev->netdev_ops->ndo_get_phys_port_name && !netdev->devlink_port) return -EOPNOTSUPP; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - char name[IFNAMSIZ]; + ret = dev_get_phys_port_name(netdev, name, sizeof(name)); + if (!ret) + ret = sysfs_emit(buf, "%s\n", name); - ret = dev_get_phys_port_name(netdev, name, sizeof(name)); - if (!ret) - ret = sysfs_emit(buf, "%s\n", name); - } rtnl_unlock(); return ret; @@ -577,26 +693,25 @@ static ssize_t phys_switch_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); - ssize_t ret = -EINVAL; + struct netdev_phys_item_id ppid = { }; + ssize_t ret; /* The checks are also done in dev_get_phys_port_name; this helps - * returning early without hitting the trylock/restart below. This works + * returning early without hitting the locking section below. This works * because recurse is false when calling dev_get_port_parent_id. 
*/ if (!netdev->netdev_ops->ndo_get_port_parent_id && !netdev->devlink_port) return -EOPNOTSUPP; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - struct netdev_phys_item_id ppid = { }; + ret = dev_get_port_parent_id(netdev, &ppid, false); + if (!ret) + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - ret = dev_get_port_parent_id(netdev, &ppid, false); - if (!ret) - ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - } rtnl_unlock(); return ret; @@ -638,7 +753,7 @@ static ssize_t threaded_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { - return netdev_store(dev, attr, buf, len, modify_napi_threaded); + return netdev_lock_store(dev, attr, buf, len, modify_napi_threaded); } static DEVICE_ATTR_RW(threaded); @@ -941,7 +1056,7 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, rcu_read_lock(); flow_table = rcu_dereference(queue->rps_flow_table); if (flow_table) - val = (unsigned long)flow_table->mask + 1; + val = 1UL << flow_table->log; rcu_read_unlock(); return sysfs_emit(buf, "%lu\n", val); @@ -994,7 +1109,7 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, if (!table) return -ENOMEM; - table->mask = mask; + table->log = ilog2(mask) + 1; for (count = 0; count <= mask; count++) table->flows[count].cpu = RPS_NO_CPU; } else { @@ -1077,7 +1192,6 @@ static void rx_queue_get_ownership(const struct kobject *kobj, static const struct kobj_type rx_queue_ktype = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, - .default_groups = rx_queue_default_groups, .namespace = rx_queue_namespace, .get_ownership = rx_queue_get_ownership, }; @@ -1100,6 +1214,22 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) struct kobject *kobj = &queue->kobj; int error = 0; + /* Rx queues are cleared in rx_queue_release to allow later + * re-registration. This is triggered when their kobj refcount is + * dropped. + * + * If a queue is removed while both a read (or write) operation and + * the re-addition of the same queue are pending (waiting on rtnl_lock) + * it might happen that the re-addition will execute before the read, + * making the initial removal never happen (queue's kobj refcount + * won't drop enough because of the pending read). In such a rare case, + * return to allow the removal operation to complete.
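A worked example of the rps_flow_table mask to log conversion above: table sizes are powers of two, so storing the log is lossless. For a 256-entry table the old code stored mask = 255; the new code stores log = ilog2(255) + 1 = 7 + 1 = 8, and readers recover the count as 1UL << 8 == 256:

#include <linux/log2.h>

static unsigned long rps_flow_cnt_example(void)
{
	unsigned long mask = 255;	/* old representation */
	u32 log = ilog2(mask) + 1;	/* new representation: 8 */

	return 1UL << log;		/* 256 entries, as before */
}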
+ */ + if (unlikely(kobj->state_initialized)) { + netdev_warn_once(dev, "Cannot re-add rx queues before their removal completed"); + return -EAGAIN; + } + /* Kobject_put later will trigger rx_queue_release call which * decreases dev refcount: Take that reference here */ @@ -1111,20 +1241,27 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) if (error) goto err; + queue->groups = rx_queue_default_groups; + error = sysfs_create_groups(kobj, queue->groups); + if (error) + goto err; + if (dev->sysfs_rx_queue_group) { error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); if (error) - goto err; + goto err_default_groups; } error = rx_queue_default_mask(dev, queue); if (error) - goto err; + goto err_default_groups; kobject_uevent(kobj, KOBJ_ADD); return error; +err_default_groups: + sysfs_remove_groups(kobj, queue->groups); err: kobject_put(kobj); return error; @@ -1169,12 +1306,14 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) } while (--i >= new_num) { - struct kobject *kobj = &dev->_rx[i].kobj; + struct netdev_rx_queue *queue = &dev->_rx[i]; + struct kobject *kobj = &queue->kobj; if (!refcount_read(&dev_net(dev)->ns.count)) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); + sysfs_remove_groups(kobj, queue->groups); kobject_put(kobj); } @@ -1213,9 +1352,11 @@ static int net_rx_queue_change_owner(struct net_device *dev, int num, */ struct netdev_queue_attribute { struct attribute attr; - ssize_t (*show)(struct netdev_queue *queue, char *buf); - ssize_t (*store)(struct netdev_queue *queue, - const char *buf, size_t len); + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len); }; #define to_netdev_queue_attr(_attr) \ container_of(_attr, struct netdev_queue_attribute, attr) @@ -1232,7 +1373,7 @@ static ssize_t netdev_queue_attr_show(struct kobject *kobj, if (!attribute->show) return -EIO; - return attribute->show(queue, buf); + return attribute->show(kobj, attr, queue, buf); } static ssize_t netdev_queue_attr_store(struct kobject *kobj, @@ -1246,7 +1387,7 @@ static ssize_t netdev_queue_attr_store(struct kobject *kobj, if (!attribute->store) return -EIO; - return attribute->store(queue, buf, count); + return attribute->store(kobj, attr, queue, buf, count); } static const struct sysfs_ops netdev_queue_sysfs_ops = { @@ -1254,7 +1395,8 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = { .store = netdev_queue_attr_store, }; -static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf) +static ssize_t tx_timeout_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout); @@ -1272,18 +1414,18 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue) return i; } -static ssize_t traffic_class_show(struct netdev_queue *queue, - char *buf) +static ssize_t traffic_class_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; - int num_tc, tc; - int index; + int num_tc, tc, index, ret; if (!netif_is_multiqueue(dev)) return -ENOENT; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, queue->dev); + if (ret) + return ret; index = get_netdev_queue_index(queue); @@ -1310,24 
+1452,25 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, } #ifdef CONFIG_XPS -static ssize_t tx_maxrate_show(struct netdev_queue *queue, - char *buf) +static ssize_t tx_maxrate_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { return sysfs_emit(buf, "%lu\n", queue->tx_maxrate); } -static ssize_t tx_maxrate_store(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t tx_maxrate_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { - struct net_device *dev = queue->dev; int err, index = get_netdev_queue_index(queue); + struct net_device *dev = queue->dev; u32 rate = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; /* The check is also done later; this helps returning early without - * hitting the trylock/restart below. + * hitting the locking section below. */ if (!dev->netdev_ops->ndo_set_tx_maxrate) return -EOPNOTSUPP; @@ -1336,18 +1479,23 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue, if (err < 0) return err; - if (!rtnl_trylock()) - return restart_syscall(); + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) + return err; err = -EOPNOTSUPP; + netdev_lock_ops(dev); if (dev->netdev_ops->ndo_set_tx_maxrate) err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate); + netdev_unlock_ops(dev); - rtnl_unlock(); if (!err) { queue->tx_maxrate = rate; + rtnl_unlock(); return len; } + + rtnl_unlock(); return err; } @@ -1391,16 +1539,17 @@ static ssize_t bql_set(const char *buf, const size_t count, return count; } -static ssize_t bql_show_hold_time(struct netdev_queue *queue, - char *buf) +static ssize_t bql_show_hold_time(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); } -static ssize_t bql_set_hold_time(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t bql_set_hold_time(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct dql *dql = &queue->dql; unsigned int value; @@ -1419,15 +1568,17 @@ static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init = __ATTR(hold_time, 0644, bql_show_hold_time, bql_set_hold_time); -static ssize_t bql_show_stall_thrs(struct netdev_queue *queue, char *buf) +static ssize_t bql_show_stall_thrs(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs)); } -static ssize_t bql_set_stall_thrs(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t bql_set_stall_thrs(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct dql *dql = &queue->dql; unsigned int value; @@ -1453,13 +1604,15 @@ static ssize_t bql_set_stall_thrs(struct netdev_queue *queue, static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init = __ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs); -static ssize_t bql_show_stall_max(struct netdev_queue *queue, char *buf) +static ssize_t bql_show_stall_max(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max)); } -static ssize_t bql_set_stall_max(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t 
bql_set_stall_max(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { WRITE_ONCE(queue->dql.stall_max, 0); return len; @@ -1468,7 +1621,8 @@ static ssize_t bql_set_stall_max(struct netdev_queue *queue, static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init = __ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max); -static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf) +static ssize_t bql_show_stall_cnt(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; @@ -1478,8 +1632,8 @@ static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf) static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init = __ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL); -static ssize_t bql_show_inflight(struct netdev_queue *queue, - char *buf) +static ssize_t bql_show_inflight(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; @@ -1490,13 +1644,16 @@ static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = __ATTR(inflight, 0444, bql_show_inflight, NULL); #define BQL_ATTR(NAME, FIELD) \ -static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ - char *buf) \ +static ssize_t bql_show_ ## NAME(struct kobject *kobj, \ + struct attribute *attr, \ + struct netdev_queue *queue, char *buf) \ { \ return bql_show(buf, queue->dql.FIELD); \ } \ \ -static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ +static ssize_t bql_set_ ## NAME(struct kobject *kobj, \ + struct attribute *attr, \ + struct netdev_queue *queue, \ const char *buf, size_t len) \ { \ return bql_set(buf, len, &queue->dql.FIELD); \ @@ -1582,19 +1739,21 @@ out_no_maps: return len < PAGE_SIZE ? len : -EINVAL; } -static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) +static ssize_t xps_cpus_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; - int len, tc; + int len, tc, ret; if (!netif_is_multiqueue(dev)) return -ENOENT; index = get_netdev_queue_index(queue); - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, queue->dev); + if (ret) + return ret; /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; @@ -1605,18 +1764,21 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) return -EINVAL; } - /* Make sure the subordinate device can't be freed */ - get_device(&dev->dev); + /* Increase the net device refcnt to make sure it won't be freed while + * xps_queue_show is running. 
+ */ + dev_hold(dev); rtnl_unlock(); len = xps_queue_show(dev, index, tc, buf, XPS_CPUS); - put_device(&dev->dev); + dev_put(dev); return len; } -static ssize_t xps_cpus_store(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t xps_cpus_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct net_device *dev = queue->dev; unsigned int index; @@ -1640,9 +1802,10 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, return err; } - if (!rtnl_trylock()) { + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) { free_cpumask_var(mask); - return restart_syscall(); + return err; } err = netif_set_xps_queue(dev, mask, index); @@ -1656,26 +1819,34 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init = __ATTR_RW(xps_cpus); -static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) +static ssize_t xps_rxqs_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; - int tc; + int tc, ret; index = get_netdev_queue_index(queue); - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, dev); + if (ret) + return ret; tc = netdev_txq_to_tc(dev, index); + + /* Increase the net device refcnt to make sure it won't be freed while + * xps_queue_show is running. + */ + dev_hold(dev); rtnl_unlock(); - if (tc < 0) - return -EINVAL; - return xps_queue_show(dev, index, tc, buf, XPS_RXQS); + ret = tc >= 0 ? xps_queue_show(dev, index, tc, buf, XPS_RXQS) : -EINVAL; + dev_put(dev); + return ret; } -static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, +static ssize_t xps_rxqs_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, size_t len) { struct net_device *dev = queue->dev; @@ -1699,9 +1870,10 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, return err; } - if (!rtnl_trylock()) { + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) { bitmap_free(mask); - return restart_syscall(); + return err; } cpus_read_lock(); @@ -1761,7 +1933,6 @@ static void netdev_queue_get_ownership(const struct kobject *kobj, static const struct kobj_type netdev_queue_ktype = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, - .default_groups = netdev_queue_default_groups, .namespace = netdev_queue_namespace, .get_ownership = netdev_queue_get_ownership, }; @@ -1780,6 +1951,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) struct kobject *kobj = &queue->kobj; int error = 0; + /* Tx queues are cleared in netdev_queue_release to allow later + * re-registration. This is triggered when their kobj refcount is + * dropped. + * + * If a queue is removed while both a read (or write) operation and + * the re-addition of the same queue are pending (waiting on rtnl_lock) + * it might happen that the re-addition will execute before the read, + * making the initial removal never happen (queue's kobj refcount + * won't drop enough because of the pending read). In such a rare case, + * return to allow the removal operation to complete.
+ */ + if (unlikely(kobj->state_initialized)) { + netdev_warn_once(dev, "Cannot re-add tx queues before their removal completed"); + return -EAGAIN; + } + /* Kobject_put later will trigger netdev_queue_release call * which decreases dev refcount: Take that reference here */ @@ -1791,15 +1978,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) if (error) goto err; + queue->groups = netdev_queue_default_groups; + error = sysfs_create_groups(kobj, queue->groups); + if (error) + goto err; + if (netdev_uses_bql(dev)) { error = sysfs_create_group(kobj, &dql_group); if (error) - goto err; + goto err_default_groups; } kobject_uevent(kobj, KOBJ_ADD); return 0; +err_default_groups: + sysfs_remove_groups(kobj, queue->groups); err: kobject_put(kobj); return error; @@ -1854,6 +2048,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) if (netdev_uses_bql(dev)) sysfs_remove_group(&queue->kobj, &dql_group); + sysfs_remove_groups(&queue->kobj, queue->groups); kobject_put(&queue->kobj); } @@ -1953,8 +2148,10 @@ static void remove_queue_kobjects(struct net_device *dev) net_rx_queue_update_kobjects(dev, real_rx, 0); netdev_queue_update_kobjects(dev, real_tx, 0); + netdev_lock_ops(dev); dev->real_num_rx_queues = 0; dev->real_num_tx_queues = 0; + netdev_unlock_ops(dev); #ifdef CONFIG_SYSFS kset_unregister(dev->queues_kset); #endif diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index b5cd3ae4f04c..ae54f26709ca 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -163,16 +163,45 @@ static void ops_pre_exit_list(const struct pernet_operations *ops, } } +static void ops_exit_rtnl_list(const struct list_head *ops_list, + const struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + const struct pernet_operations *saved_ops = ops; + LIST_HEAD(dev_kill_list); + struct net *net; + + rtnl_lock(); + + list_for_each_entry(net, net_exit_list, exit_list) { + __rtnl_net_lock(net); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) { + if (ops->exit_rtnl) + ops->exit_rtnl(net, &dev_kill_list); + } + + __rtnl_net_unlock(net); + } + + unregister_netdevice_many(&dev_kill_list); + + rtnl_unlock(); +} + static void ops_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { - struct net *net; if (ops->exit) { + struct net *net; + list_for_each_entry(net, net_exit_list, exit_list) { ops->exit(net); cond_resched(); } } + if (ops->exit_batch) ops->exit_batch(net_exit_list); } @@ -188,6 +217,56 @@ static void ops_free_list(const struct pernet_operations *ops, } } +static void ops_undo_list(const struct list_head *ops_list, + const struct pernet_operations *ops, + struct list_head *net_exit_list, + bool expedite_rcu) +{ + const struct pernet_operations *saved_ops; + bool hold_rtnl = false; + + if (!ops) + ops = list_entry(ops_list, typeof(*ops), list); + + saved_ops = ops; + + list_for_each_entry_continue_reverse(ops, ops_list, list) { + hold_rtnl |= !!ops->exit_rtnl; + ops_pre_exit_list(ops, net_exit_list); + } + + /* Another CPU might be rcu-iterating the list, wait for it. + * This needs to be before calling the exit() notifiers, so the + * rcu_barrier() after ops_undo_list() isn't sufficient alone. + * Also the pre_exit() and exit() methods need this barrier. 
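The ops_exit_rtnl_list()/ops_undo_list() consolidation above pairs with a new .exit_rtnl pernet hook: the core takes rtnl_lock plus the per-netns lock and passes one shared kill list, so implementations only queue devices and a single unregister_netdevice_many() batches the teardown. A hedged sketch of a converted subsystem (example_link_ops is a stand-in, not from this patch):

static void __net_exit example_exit_rtnl(struct net *net,
					 struct list_head *dev_to_kill)
{
	struct net_device *dev, *next;

	/* Only queue; the core unregisters the whole list at once. */
	for_each_netdev_safe(net, dev, next)
		if (dev->rtnl_link_ops == &example_link_ops)
			unregister_netdevice_queue(dev, dev_to_kill);
}

static struct pernet_operations example_net_ops = {
	.exit_rtnl = example_exit_rtnl,
};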
+ */ + if (expedite_rcu) + synchronize_rcu_expedited(); + else + synchronize_rcu(); + + if (hold_rtnl) + ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) + ops_exit_list(ops, net_exit_list); + + ops = saved_ops; + list_for_each_entry_continue_reverse(ops, ops_list, list) + ops_free_list(ops, net_exit_list); +} + +static void ops_undo_single(struct pernet_operations *ops, + struct list_head *net_exit_list) +{ + LIST_HEAD(ops_list); + + list_add(&ops->list, &ops_list); + ops_undo_list(&ops_list, NULL, net_exit_list, false); + list_del(&ops->list); +} + /* should be called with nsid_lock held */ static int alloc_netid(struct net *net, struct net *peer, int reqid) { @@ -340,6 +419,8 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL); #endif + INIT_LIST_HEAD(&net->ptype_all); + INIT_LIST_HEAD(&net->ptype_specific); preinit_net_sysctl(net); } @@ -349,9 +430,8 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ static __net_init int setup_net(struct net *net) { /* Must be called with pernet_ops_rwsem held */ - const struct pernet_operations *ops, *saved_ops; + const struct pernet_operations *ops; LIST_HEAD(net_exit_list); - LIST_HEAD(dev_kill_list); int error = 0; preempt_disable(); @@ -374,29 +454,7 @@ out_undo: * for the pernet modules whose init functions did not fail. */ list_add(&net->exit_list, &net_exit_list); - saved_ops = ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_pre_exit_list(ops, &net_exit_list); - - synchronize_rcu(); - - ops = saved_ops; - rtnl_lock(); - list_for_each_entry_continue_reverse(ops, &pernet_list, list) { - if (ops->exit_batch_rtnl) - ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); - } - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - - ops = saved_ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_exit_list(ops, &net_exit_list); - - ops = saved_ops; - list_for_each_entry_continue_reverse(ops, &pernet_list, list) - ops_free_list(ops, &net_exit_list); - + ops_undo_list(&pernet_list, ops, &net_exit_list, false); rcu_barrier(); goto out; } @@ -464,7 +522,7 @@ static void net_complete_free(void) } -static void net_free(struct net *net) +void net_passive_dec(struct net *net) { if (refcount_dec_and_test(&net->passive)) { kfree(rcu_access_pointer(net->gen)); @@ -482,7 +540,7 @@ void net_drop_ns(void *p) struct net *net = (struct net *)p; if (net) - net_free(net); + net_passive_dec(net); } struct net *copy_net_ns(unsigned long flags, @@ -523,7 +581,7 @@ put_userns: key_remove_domain(net->key_domain); #endif put_user_ns(user_ns); - net_free(net); + net_passive_dec(net); dec_ucounts: dec_net_namespaces(ucounts); return ERR_PTR(rv); @@ -588,13 +646,15 @@ static void unhash_nsid(struct net *net, struct net *last) static LLIST_HEAD(cleanup_list); +struct task_struct *cleanup_net_task; + static void cleanup_net(struct work_struct *work) { - const struct pernet_operations *ops; - struct net *net, *tmp, *last; struct llist_node *net_kill_list; + struct net *net, *tmp, *last; LIST_HEAD(net_exit_list); - LIST_HEAD(dev_kill_list); + + WRITE_ONCE(cleanup_net_task, current); /* Atomically snapshot the list of namespaces to cleanup */ net_kill_list = llist_del_all(&cleanup_list); @@ -623,33 +683,7 @@ static void cleanup_net(struct work_struct *work) list_add_tail(&net->exit_list, &net_exit_list); } - /* Run 
all of the network namespace pre_exit methods */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_pre_exit_list(ops, &net_exit_list); - - /* - * Another CPU might be rcu-iterating the list, wait for it. - * This needs to be before calling the exit() notifiers, so - * the rcu_barrier() below isn't sufficient alone. - * Also the pre_exit() and exit() methods need this barrier. - */ - synchronize_rcu_expedited(); - - rtnl_lock(); - list_for_each_entry_reverse(ops, &pernet_list, list) { - if (ops->exit_batch_rtnl) - ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); - } - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - - /* Run all of the network namespace exit methods */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_exit_list(ops, &net_exit_list); - - /* Free the net generic variables */ - list_for_each_entry_reverse(ops, &pernet_list, list) - ops_free_list(ops, &net_exit_list); + ops_undo_list(&pernet_list, NULL, &net_exit_list, true); up_read(&pernet_ops_rwsem); @@ -668,8 +702,9 @@ static void cleanup_net(struct work_struct *work) key_remove_domain(net->key_domain); #endif put_user_ns(net->user_ns); - net_free(net); + net_passive_dec(net); } + WRITE_ONCE(cleanup_net_task, NULL); } /** @@ -1232,31 +1267,13 @@ void __init net_ns_init(void) rtnl_register_many(net_ns_rtnl_msg_handlers); } -static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) -{ - ops_pre_exit_list(ops, net_exit_list); - synchronize_rcu(); - - if (ops->exit_batch_rtnl) { - LIST_HEAD(dev_kill_list); - - rtnl_lock(); - ops->exit_batch_rtnl(net_exit_list, &dev_kill_list); - unregister_netdevice_many(&dev_kill_list); - rtnl_unlock(); - } - ops_exit_list(ops, net_exit_list); - - ops_free_list(ops, net_exit_list); -} - #ifdef CONFIG_NET_NS static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) { + LIST_HEAD(net_exit_list); struct net *net; int error; - LIST_HEAD(net_exit_list); list_add_tail(&ops->list, list); if (ops->init || ops->id) { @@ -1275,21 +1292,21 @@ static int __register_pernet_operations(struct list_head *list, out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); - free_exit_list(ops, &net_exit_list); + ops_undo_single(ops, &net_exit_list); return error; } static void __unregister_pernet_operations(struct pernet_operations *ops) { - struct net *net; LIST_HEAD(net_exit_list); + struct net *net; - list_del(&ops->list); /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); - free_exit_list(ops, &net_exit_list); + list_del(&ops->list); + ops_undo_single(ops, &net_exit_list); } #else @@ -1311,8 +1328,9 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) list_del(&ops->list); } else { LIST_HEAD(net_exit_list); + list_add(&init_net.exit_list, &net_exit_list); - free_exit_list(ops, &net_exit_list); + ops_undo_single(ops, &net_exit_list); } } diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index 996ac6a449eb..4fc44587f493 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -9,7 +9,7 @@ #include "netdev-genl-gen.h" #include <uapi/linux/netdev.h> -#include <linux/list.h> +#include <net/netdev_netlink.h> /* Integer value ranges */ static const struct netlink_range_validation netdev_a_page_pool_id_range = { @@ -99,6 +99,12 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_IRQ_SUSPE [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = 
{ .type = NLA_UINT, }, }; +/* NETDEV_CMD_BIND_TX - do */ +static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { + [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), + [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, +}; + /* Ops table for netdev */ static const struct genl_split_ops netdev_nl_ops[] = { { @@ -190,6 +196,13 @@ static const struct genl_split_ops netdev_nl_ops[] = { .maxattr = NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, + { + .cmd = NETDEV_CMD_BIND_TX, + .doit = netdev_nl_bind_tx_doit, + .policy = netdev_bind_tx_nl_policy, + .maxattr = NETDEV_A_DMABUF_FD, + .flags = GENL_CMD_CAP_DO, + }, }; static const struct genl_multicast_group netdev_nl_mcgrps[] = { @@ -217,7 +230,7 @@ struct genl_family netdev_nl_family __ro_after_init = { .n_split_ops = ARRAY_SIZE(netdev_nl_ops), .mcgrps = netdev_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(netdev_nl_mcgrps), - .sock_priv_size = sizeof(struct list_head), + .sock_priv_size = sizeof(struct netdev_nl_sock), .sock_priv_init = __netdev_nl_sock_priv_init, .sock_priv_destroy = __netdev_nl_sock_priv_destroy, }; diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index e09dd7539ff2..cf3fad74511f 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -10,7 +10,7 @@ #include <net/genetlink.h> #include <uapi/linux/netdev.h> -#include <linux/list.h> +#include <net/netdev_netlink.h> /* Common nested types */ extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; @@ -34,6 +34,7 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info); enum { NETDEV_NLGRP_MGMT, @@ -42,7 +43,7 @@ enum { extern struct genl_family netdev_nl_family; -void netdev_nl_sock_priv_init(struct list_head *priv); -void netdev_nl_sock_priv_destroy(struct list_head *priv); +void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv); +void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv); #endif /* _LINUX_NETDEV_GEN_H */ diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index a3bdaf075b6b..2afa7b2141aa 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -10,6 +10,7 @@ #include <net/sock.h> #include <net/xdp.h> #include <net/xdp_sock.h> +#include <net/page_pool/memory_provider.h> #include "dev.h" #include "devmem.h" @@ -37,6 +38,8 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, u64 xdp_rx_meta = 0; void *hdr; + netdev_assert_locked(netdev); /* note: rtnl_lock may not be held! 
*/ + hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; @@ -52,6 +55,8 @@ XDP_METADATA_KFUNC_xxx xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; + if (netdev->xsk_tx_metadata_ops->tmo_request_launch_time) + xsk_features |= NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO; } if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || @@ -119,15 +124,14 @@ int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info) if (!rsp) return -ENOMEM; - rtnl_lock(); - - netdev = __dev_get_by_index(genl_info_net(info), ifindex); - if (netdev) - err = netdev_nl_dev_fill(netdev, rsp, info); - else + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { err = -ENODEV; + goto err_free_msg; + } - rtnl_unlock(); + err = netdev_nl_dev_fill(netdev, rsp, info); + netdev_unlock(netdev); if (err) goto err_free_msg; @@ -143,18 +147,15 @@ int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb); struct net *net = sock_net(skb->sk); - struct net_device *netdev; - int err = 0; + int err; - rtnl_lock(); - for_each_netdev_dump(net, netdev, ctx->ifindex) { + for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb)); if (err < 0) - break; + return err; } - rtnl_unlock(); - return err; + return 0; } static int @@ -167,7 +168,7 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, void *hdr; pid_t pid; - if (!(napi->dev->flags & IFF_UP)) + if (!napi->dev->up) return 0; hdr = genlmsg_iput(rsp, info); @@ -229,20 +230,15 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) if (!rsp) return -ENOMEM; - rtnl_lock(); - rcu_read_lock(); - - napi = netdev_napi_by_id(genl_info_net(info), napi_id); + napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_fill_one(rsp, napi, info); + netdev_unlock(napi->dev); } else { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); err = -ENOENT; } - rcu_read_unlock(); - rtnl_unlock(); - if (err) { goto err_free_msg; } else if (!rsp->len) { @@ -263,14 +259,21 @@ netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, struct netdev_nl_dump_ctx *ctx) { struct napi_struct *napi; + unsigned int prev_id; int err = 0; - if (!(netdev->flags & IFF_UP)) + if (!netdev->up) return err; + prev_id = UINT_MAX; list_for_each_entry(napi, &netdev->napi_list, dev_list) { - if (napi->napi_id < MIN_NAPI_ID) + if (!napi_id_valid(napi->napi_id)) continue; + + /* Dump continuation below depends on the list being sorted */ + WARN_ON_ONCE(napi->napi_id >= prev_id); + prev_id = napi->napi_id; + if (ctx->napi_id && napi->napi_id >= ctx->napi_id) continue; @@ -294,22 +297,22 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) if (info->attrs[NETDEV_A_NAPI_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]); - rtnl_lock(); if (ifindex) { - netdev = __dev_get_by_index(net, ifindex); - if (netdev) + netdev = netdev_get_by_index_lock(net, ifindex); + if (netdev) { err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); - else + netdev_unlock(netdev); + } else { err = -ENODEV; + } } else { - for_each_netdev_dump(net, netdev, ctx->ifindex) { + for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) { err = netdev_nl_napi_dump_one(netdev, skb, info, ctx); if (err < 0) break; ctx->napi_id = 0; } } - rtnl_unlock(); 
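+	/* No matching unlock is needed here: each device was locked and
+	 * unlocked individually above, either via netdev_get_by_index_lock()
+	 * or by for_each_netdev_lock_scoped().
+	 */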
return err; } @@ -350,28 +353,30 @@ int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]); - rtnl_lock(); - rcu_read_lock(); - - napi = netdev_napi_by_id(genl_info_net(info), napi_id); + napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_set_config(napi, info); + netdev_unlock(napi->dev); } else { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]); err = -ENOENT; } - rcu_read_unlock(); - rtnl_unlock(); - return err; } +static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi) +{ + if (napi && napi_id_valid(napi->napi_id)) + return nla_put_u32(skb, NETDEV_A_QUEUE_NAPI_ID, napi->napi_id); + return 0; +} + static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { - struct net_devmem_dmabuf_binding *binding; + struct pp_memory_provider_params *params; struct netdev_rx_queue *rxq; struct netdev_queue *txq; void *hdr; @@ -388,21 +393,30 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, switch (q_type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(netdev, q_idx); - if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, - rxq->napi->napi_id)) + if (nla_put_napi_id(rsp, rxq->napi)) goto nla_put_failure; - binding = rxq->mp_params.mp_priv; - if (binding && - nla_put_u32(rsp, NETDEV_A_QUEUE_DMABUF, binding->id)) + params = &rxq->mp_params; + if (params->mp_ops && + params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) goto nla_put_failure; +#ifdef CONFIG_XDP_SOCKETS + if (rxq->pool) + if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) + goto nla_put_failure; +#endif break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); - if (txq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, - txq->napi->napi_id)) + if (nla_put_napi_id(rsp, txq->napi)) goto nla_put_failure; +#ifdef CONFIG_XDP_SOCKETS + if (txq->pool) + if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) + goto nla_put_failure; +#endif + break; } genlmsg_end(rsp, hdr); @@ -435,7 +449,7 @@ netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, { int err; - if (!(netdev->flags & IFF_UP)) + if (!netdev->up) return -ENOENT; err = netdev_nl_queue_validate(netdev, q_idx, q_type); @@ -465,15 +479,14 @@ int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info) if (!rsp) return -ENOMEM; - rtnl_lock(); - - netdev = __dev_get_by_index(genl_info_net(info), ifindex); - if (netdev) + netdev = netdev_get_by_index_lock_ops_compat(genl_info_net(info), + ifindex); + if (netdev) { err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info); - else + netdev_unlock_ops_compat(netdev); + } else { err = -ENODEV; - - rtnl_unlock(); + } if (err) goto err_free_msg; @@ -492,7 +505,7 @@ netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, { int err = 0; - if (!(netdev->flags & IFF_UP)) + if (!netdev->up) return err; for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) { @@ -523,15 +536,17 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) if (info->attrs[NETDEV_A_QUEUE_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); - rtnl_lock(); if (ifindex) { - netdev = __dev_get_by_index(net, ifindex); - if (netdev) + netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); + if (netdev) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); - else + 
netdev_unlock_ops_compat(netdev); + } else { err = -ENODEV; + } } else { - for_each_netdev_dump(net, netdev, ctx->ifindex) { + for_each_netdev_lock_ops_compat_scoped(net, netdev, + ctx->ifindex) { err = netdev_nl_queue_dump_one(netdev, skb, info, ctx); if (err < 0) break; @@ -539,7 +554,6 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) ctx->txq_idx = 0; } } - rtnl_unlock(); return err; } @@ -575,6 +589,7 @@ netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx) netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_COMPLETE, rx->csum_complete) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) || @@ -687,25 +702,66 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, return 0; } +/** + * netdev_stat_queue_sum() - add up queue stats from range of queues + * @netdev: net_device + * @rx_start: index of the first Rx queue to query + * @rx_end: index after the last Rx queue (first *not* to query) + * @rx_sum: output Rx stats, should be already initialized + * @tx_start: index of the first Tx queue to query + * @tx_end: index after the last Tx queue (first *not* to query) + * @tx_sum: output Tx stats, should be already initialized + * + * Add stats from [start, end) range of queue IDs to *x_sum structs. + * The sum structs must be already initialized. Usually this + * helper is invoked from the .get_base_stats callbacks of drivers + * to account for stats of disabled queues. In that case the ranges + * are usually [netdev->real_num_*x_queues, netdev->num_*x_queues). 
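+ *
+ * A sketch of such a caller (hypothetical driver, all names assumed;
+ * only the counters the driver actually reports are zero-initialized):
+ *
+ *	static void foo_get_base_stats(struct net_device *netdev,
+ *				       struct netdev_queue_stats_rx *rx,
+ *				       struct netdev_queue_stats_tx *tx)
+ *	{
+ *		rx->packets = 0;
+ *		rx->bytes = 0;
+ *		tx->packets = 0;
+ *		tx->bytes = 0;
+ *
+ *		netdev_stat_queue_sum(netdev,
+ *				      netdev->real_num_rx_queues,
+ *				      netdev->num_rx_queues, rx,
+ *				      netdev->real_num_tx_queues,
+ *				      netdev->num_tx_queues, tx);
+ *	}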
+ */ +void netdev_stat_queue_sum(struct net_device *netdev, + int rx_start, int rx_end, + struct netdev_queue_stats_rx *rx_sum, + int tx_start, int tx_end, + struct netdev_queue_stats_tx *tx_sum) +{ + const struct netdev_stat_ops *ops; + struct netdev_queue_stats_rx rx; + struct netdev_queue_stats_tx tx; + int i; + + ops = netdev->stat_ops; + + for (i = rx_start; i < rx_end; i++) { + memset(&rx, 0xff, sizeof(rx)); + if (ops->get_queue_stats_rx) + ops->get_queue_stats_rx(netdev, i, &rx); + netdev_nl_stats_add(rx_sum, &rx, sizeof(rx)); + } + for (i = tx_start; i < tx_end; i++) { + memset(&tx, 0xff, sizeof(tx)); + if (ops->get_queue_stats_tx) + ops->get_queue_stats_tx(netdev, i, &tx); + netdev_nl_stats_add(tx_sum, &tx, sizeof(tx)); + } +} +EXPORT_SYMBOL(netdev_stat_queue_sum); + static int netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp, const struct genl_info *info) { - struct netdev_queue_stats_rx rx_sum, rx; - struct netdev_queue_stats_tx tx_sum, tx; - const struct netdev_stat_ops *ops; + struct netdev_queue_stats_rx rx_sum; + struct netdev_queue_stats_tx tx_sum; void *hdr; - int i; - ops = netdev->stat_ops; /* Netdev can't guarantee any complete counters */ - if (!ops->get_base_stats) + if (!netdev->stat_ops->get_base_stats) return 0; memset(&rx_sum, 0xff, sizeof(rx_sum)); memset(&tx_sum, 0xff, sizeof(tx_sum)); - ops->get_base_stats(netdev, &rx_sum, &tx_sum); + netdev->stat_ops->get_base_stats(netdev, &rx_sum, &tx_sum); /* The op was there, but nothing reported, don't bother */ if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) && @@ -718,18 +774,8 @@ netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp, if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex)) goto nla_put_failure; - for (i = 0; i < netdev->real_num_rx_queues; i++) { - memset(&rx, 0xff, sizeof(rx)); - if (ops->get_queue_stats_rx) - ops->get_queue_stats_rx(netdev, i, &rx); - netdev_nl_stats_add(&rx_sum, &rx, sizeof(rx)); - } - for (i = 0; i < netdev->real_num_tx_queues; i++) { - memset(&tx, 0xff, sizeof(tx)); - if (ops->get_queue_stats_tx) - ops->get_queue_stats_tx(netdev, i, &tx); - netdev_nl_stats_add(&tx_sum, &tx, sizeof(tx)); - } + netdev_stat_queue_sum(netdev, 0, netdev->real_num_rx_queues, &rx_sum, + 0, netdev->real_num_tx_queues, &tx_sum); if (netdev_nl_stats_write_rx(rsp, &rx_sum) || netdev_nl_stats_write_tx(rsp, &tx_sum)) @@ -780,26 +826,31 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, if (info->attrs[NETDEV_A_QSTATS_IFINDEX]) ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]); - rtnl_lock(); if (ifindex) { - netdev = __dev_get_by_index(net, ifindex); - if (netdev && netdev->stat_ops) { - err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, - info, ctx); - } else { + netdev = netdev_get_by_index_lock_ops_compat(net, ifindex); + if (!netdev) { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QSTATS_IFINDEX]); - err = netdev ? 
-EOPNOTSUPP : -ENODEV; + return -ENODEV; } - } else { - for_each_netdev_dump(net, netdev, ctx->ifindex) { + if (netdev->stat_ops) { err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, info, ctx); - if (err < 0) - break; + } else { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_QSTATS_IFINDEX]); + err = -EOPNOTSUPP; } + netdev_unlock_ops_compat(netdev); + return err; + } + + for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) { + err = netdev_nl_qstats_get_dump_one(netdev, scope, skb, + info, ctx); + if (err < 0) + break; } - rtnl_unlock(); return err; } @@ -808,8 +859,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; struct net_devmem_dmabuf_binding *binding; - struct list_head *sock_binding_list; u32 ifindex, dmabuf_fd, rxq_idx; + struct netdev_nl_sock *priv; struct net_device *netdev; struct sk_buff *rsp; struct nlattr *attr; @@ -824,10 +875,9 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); - sock_binding_list = genl_sk_priv_get(&netdev_nl_family, - NETLINK_CB(skb).sk); - if (IS_ERR(sock_binding_list)) - return PTR_ERR(sock_binding_list); + priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); + if (IS_ERR(priv)) + return PTR_ERR(priv); rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) @@ -839,21 +889,26 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_genlmsg_free; } - rtnl_lock(); + mutex_lock(&priv->lock); - netdev = __dev_get_by_index(genl_info_net(info), ifindex); - if (!netdev || !netif_device_present(netdev)) { + err = 0; + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { err = -ENODEV; - goto err_unlock; + goto err_unlock_sock; } - - if (dev_xdp_prog_count(netdev)) { - NL_SET_ERR_MSG(info->extack, "unable to bind dmabuf to device with XDP program attached"); - err = -EEXIST; + if (!netif_device_present(netdev)) + err = -ENODEV; + else if (!netdev_need_ops_lock(netdev)) + err = -EOPNOTSUPP; + if (err) { + NL_SET_BAD_ATTR(info->extack, + info->attrs[NETDEV_A_DEV_IFINDEX]); goto err_unlock; } - binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack); + binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd, + priv, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock; @@ -888,8 +943,6 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unbind; } - list_add(&binding->list, sock_binding_list); - nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); genlmsg_end(rsp, hdr); @@ -897,34 +950,129 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) if (err) goto err_unbind; - rtnl_unlock(); + netdev_unlock(netdev); + + mutex_unlock(&priv->lock); return 0; err_unbind: net_devmem_unbind_dmabuf(binding); err_unlock: - rtnl_unlock(); + netdev_unlock(netdev); +err_unlock_sock: + mutex_unlock(&priv->lock); +err_genlmsg_free: + nlmsg_free(rsp); + return err; +} + +int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net_devmem_dmabuf_binding *binding; + struct netdev_nl_sock *priv; + struct net_device *netdev; + u32 ifindex, dmabuf_fd; + struct sk_buff *rsp; + int err = 0; + void *hdr; + + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || + GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD)) + return -EINVAL; + + ifindex = 
nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]); + dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]); + + priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk); + if (IS_ERR(priv)) + return PTR_ERR(priv); + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_genlmsg_free; + } + + mutex_lock(&priv->lock); + + netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); + if (!netdev) { + err = -ENODEV; + goto err_unlock_sock; + } + + if (!netif_device_present(netdev)) { + err = -ENODEV; + goto err_unlock_netdev; + } + + if (!netdev->netmem_tx) { + err = -EOPNOTSUPP; + NL_SET_ERR_MSG(info->extack, + "Driver does not support netmem TX"); + goto err_unlock_netdev; + } + + binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, priv, + info->extack); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + goto err_unlock_netdev; + } + + nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id); + genlmsg_end(rsp, hdr); + + netdev_unlock(netdev); + mutex_unlock(&priv->lock); + + return genlmsg_reply(rsp, info); + +err_unlock_netdev: + netdev_unlock(netdev); +err_unlock_sock: + mutex_unlock(&priv->lock); err_genlmsg_free: nlmsg_free(rsp); return err; } -void netdev_nl_sock_priv_init(struct list_head *priv) +void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv) { - INIT_LIST_HEAD(priv); + INIT_LIST_HEAD(&priv->bindings); + mutex_init(&priv->lock); } -void netdev_nl_sock_priv_destroy(struct list_head *priv) +void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv) { struct net_devmem_dmabuf_binding *binding; struct net_devmem_dmabuf_binding *temp; + netdevice_tracker dev_tracker; + struct net_device *dev; + + mutex_lock(&priv->lock); + list_for_each_entry_safe(binding, temp, &priv->bindings, list) { + mutex_lock(&binding->lock); + dev = binding->dev; + if (!dev) { + mutex_unlock(&binding->lock); + net_devmem_unbind_dmabuf(binding); + continue; + } + netdev_hold(dev, &dev_tracker, GFP_KERNEL); + mutex_unlock(&binding->lock); - list_for_each_entry_safe(binding, temp, priv, list) { - rtnl_lock(); + netdev_lock(dev); net_devmem_unbind_dmabuf(binding); - rtnl_unlock(); + netdev_unlock(dev); + netdev_put(dev, &dev_tracker); } + mutex_unlock(&priv->lock); } static int netdev_genl_netdevice_event(struct notifier_block *nb, @@ -934,10 +1082,14 @@ static int netdev_genl_netdevice_event(struct notifier_block *nb, switch (event) { case NETDEV_REGISTER: + netdev_lock_ops_to_full(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF); + netdev_unlock_full_to_ops(netdev); break; case NETDEV_UNREGISTER: + netdev_lock(netdev); netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF); + netdev_unlock(netdev); break; case NETDEV_XDP_FEAT_CHANGE: netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF); diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index e217a5838c87..d126f10197bf 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -1,36 +1,38 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/ethtool_netlink.h> #include <linux/netdevice.h> +#include <net/netdev_lock.h> #include <net/netdev_queues.h> #include <net/netdev_rx_queue.h> +#include <net/page_pool/memory_provider.h> #include "page_pool_priv.h" int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) { struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx); + const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops; void 
*new_mem, *old_mem; int err; - if (!dev->queue_mgmt_ops || !dev->queue_mgmt_ops->ndo_queue_stop || - !dev->queue_mgmt_ops->ndo_queue_mem_free || - !dev->queue_mgmt_ops->ndo_queue_mem_alloc || - !dev->queue_mgmt_ops->ndo_queue_start) + if (!qops || !qops->ndo_queue_stop || !qops->ndo_queue_mem_free || + !qops->ndo_queue_mem_alloc || !qops->ndo_queue_start) return -EOPNOTSUPP; - ASSERT_RTNL(); + netdev_assert_locked(dev); - new_mem = kvzalloc(dev->queue_mgmt_ops->ndo_queue_mem_size, GFP_KERNEL); + new_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL); if (!new_mem) return -ENOMEM; - old_mem = kvzalloc(dev->queue_mgmt_ops->ndo_queue_mem_size, GFP_KERNEL); + old_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL); if (!old_mem) { err = -ENOMEM; goto err_free_new_mem; } - err = dev->queue_mgmt_ops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx); + err = qops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx); if (err) goto err_free_old_mem; @@ -38,15 +40,19 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) if (err) goto err_free_new_queue_mem; - err = dev->queue_mgmt_ops->ndo_queue_stop(dev, old_mem, rxq_idx); - if (err) - goto err_free_new_queue_mem; + if (netif_running(dev)) { + err = qops->ndo_queue_stop(dev, old_mem, rxq_idx); + if (err) + goto err_free_new_queue_mem; - err = dev->queue_mgmt_ops->ndo_queue_start(dev, new_mem, rxq_idx); - if (err) - goto err_start_queue; + err = qops->ndo_queue_start(dev, new_mem, rxq_idx); + if (err) + goto err_start_queue; + } else { + swap(new_mem, old_mem); + } - dev->queue_mgmt_ops->ndo_queue_mem_free(dev, old_mem); + qops->ndo_queue_mem_free(dev, old_mem); kvfree(old_mem); kvfree(new_mem); @@ -61,15 +67,15 @@ err_start_queue: * WARN if we fail to recover the old rx queue, and at least free * old_mem so we don't also leak that. */ - if (dev->queue_mgmt_ops->ndo_queue_start(dev, old_mem, rxq_idx)) { + if (qops->ndo_queue_start(dev, old_mem, rxq_idx)) { WARN(1, "Failed to restart old queue in error path. 
RX queue %d may be unhealthy.", rxq_idx); - dev->queue_mgmt_ops->ndo_queue_mem_free(dev, old_mem); + qops->ndo_queue_mem_free(dev, old_mem); } err_free_new_queue_mem: - dev->queue_mgmt_ops->ndo_queue_mem_free(dev, new_mem); + qops->ndo_queue_mem_free(dev, new_mem); err_free_old_mem: kvfree(old_mem); @@ -79,3 +85,103 @@ err_free_new_mem: return err; } +EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL"); + +int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + const struct pp_memory_provider_params *p, + struct netlink_ext_ack *extack) +{ + struct netdev_rx_queue *rxq; + int ret; + + if (!netdev_need_ops_lock(dev)) + return -EOPNOTSUPP; + + if (rxq_idx >= dev->real_num_rx_queues) { + NL_SET_ERR_MSG(extack, "rx queue index out of range"); + return -ERANGE; + } + rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); + + if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { + NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); + return -EINVAL; + } + if (dev->cfg->hds_thresh) { + NL_SET_ERR_MSG(extack, "hds-thresh is not zero"); + return -EINVAL; + } + if (dev_xdp_prog_count(dev)) { + NL_SET_ERR_MSG(extack, "unable to attach custom memory provider to device with XDP program attached"); + return -EEXIST; + } + + rxq = __netif_get_rx_queue(dev, rxq_idx); + if (rxq->mp_params.mp_ops) { + NL_SET_ERR_MSG(extack, "designated queue already bound to a memory provider"); + return -EEXIST; + } +#ifdef CONFIG_XDP_SOCKETS + if (rxq->pool) { + NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP"); + return -EBUSY; + } +#endif + + rxq->mp_params = *p; + ret = netdev_rx_queue_restart(dev, rxq_idx); + if (ret) { + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + } + return ret; +} + +int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, + struct pp_memory_provider_params *p) +{ + int ret; + + netdev_lock(dev); + ret = __net_mp_open_rxq(dev, rxq_idx, p, NULL); + netdev_unlock(dev); + return ret; +} + +void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, + const struct pp_memory_provider_params *old_p) +{ + struct netdev_rx_queue *rxq; + int err; + + if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) + return; + + rxq = __netif_get_rx_queue(dev, ifq_idx); + + /* Callers holding a netdev ref may get here after we already + * went through shutdown via dev_memory_provider_uninstall().
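+ * In that case dev_memory_provider_uninstall() already cleared the
+ * queue's mp_params on unregister, so a NULL mp_ops together with a
+ * reg_state past NETREG_REGISTERED means there is nothing left to
+ * undo here.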
+ */ + if (dev->reg_state > NETREG_REGISTERED && + !rxq->mp_params.mp_ops) + return; + + if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops || + rxq->mp_params.mp_priv != old_p->mp_priv)) + return; + + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + err = netdev_rx_queue_restart(dev, ifq_idx); + WARN_ON(err && err != -ENETDOWN); +} + +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p) +{ + netdev_lock(dev); + __net_mp_close_rxq(dev, ifq_idx, old_p); + netdev_unlock(dev); +} diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index 7eadb8393e00..cd95394399b4 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -5,7 +5,7 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) { - return __netmem_clear_lsb(netmem)->pp_magic; + return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; } static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) @@ -15,9 +15,16 @@ static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) static inline void netmem_clear_pp_magic(netmem_ref netmem) { + WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK); + __netmem_clear_lsb(netmem)->pp_magic = 0; } +static inline bool netmem_is_pp(netmem_ref netmem) +{ + return (netmem_get_pp_magic(netmem) & PP_MAGIC_MASK) == PP_SIGNATURE; +} + static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) { __netmem_clear_lsb(netmem)->pp = pool; @@ -28,4 +35,28 @@ static inline void netmem_set_dma_addr(netmem_ref netmem, { __netmem_clear_lsb(netmem)->dma_addr = dma_addr; } + +static inline unsigned long netmem_get_dma_index(netmem_ref netmem) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return 0; + + magic = __netmem_clear_lsb(netmem)->pp_magic; + + return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT; +} + +static inline void netmem_set_dma_index(netmem_ref netmem, + unsigned long id) +{ + unsigned long magic; + + if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) + return; + + magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT); + __netmem_clear_lsb(netmem)->pp_magic = magic; +} #endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 96a6ed37d4cc..4ddb7490df4b 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -284,12 +284,13 @@ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve) struct sk_buff *skb; zap_completion_queue(); - refill_skbs(np); repeat: skb = alloc_skb(len, GFP_ATOMIC); - if (!skb) + if (!skb) { skb = skb_dequeue(&np->skb_pool); + schedule_work(&np->refill_wq); + } if (!skb) { if (++count < 10) { @@ -319,6 +320,7 @@ static int netpoll_owner_active(struct net_device *dev) static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; + netdev_tx_t ret = NET_XMIT_DROP; struct net_device *dev; unsigned long tries; /* It is up to the caller to keep npinfo alive. 
*/ @@ -327,11 +329,12 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) lockdep_assert_irqs_disabled(); dev = np->dev; + rcu_read_lock(); npinfo = rcu_dereference_bh(dev->npinfo); if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); - return NET_XMIT_DROP; + goto out; } /* don't get messages out of order, and no recursion */ @@ -370,7 +373,10 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } - return NETDEV_TX_OK; + ret = NETDEV_TX_OK; +out: + rcu_read_unlock(); + return ret; } netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) @@ -390,7 +396,7 @@ netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) } EXPORT_SYMBOL(netpoll_send_skb); -void netpoll_send_udp(struct netpoll *np, const char *msg, int len) +int netpoll_send_udp(struct netpoll *np, const char *msg, int len) { int total_len, ip_len, udp_len; struct sk_buff *skb; @@ -414,7 +420,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) skb = find_skb(np, total_len + np->dev->needed_tailroom, total_len - len); if (!skb) - return; + return -ENOMEM; skb_copy_to_linear_data(skb, msg, len); skb_put(skb, len); @@ -427,7 +433,6 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) udph->len = htons(udp_len); if (np->ipv6) { - udph->check = 0; udph->check = csum_ipv6_magic(&np->local_ip.in6, &np->remote_ip.in6, udp_len, IPPROTO_UDP, @@ -490,7 +495,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) skb->dev = np->dev; - netpoll_send_skb(np, skb); + return (int)netpoll_send_skb(np, skb); } EXPORT_SYMBOL(netpoll_send_udp); @@ -501,7 +506,8 @@ void netpoll_print_options(struct netpoll *np) np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6); else np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip); - np_info(np, "interface '%s'\n", np->dev_name); + np_info(np, "interface name '%s'\n", np->dev_name); + np_info(np, "local ethernet address '%pM'\n", np->dev_mac); np_info(np, "remote port %d\n", np->remote_port); if (np->ipv6) np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6); @@ -535,6 +541,7 @@ static void skb_pool_flush(struct netpoll *np) { struct sk_buff_head *skb_pool; + cancel_work_sync(&np->refill_wq); skb_pool = &np->skb_pool; skb_queue_purge_reason(skb_pool, SKB_CONSUMED); } @@ -570,11 +577,18 @@ int netpoll_parse_options(struct netpoll *np, char *opt) cur++; if (*cur != ',') { - /* parse out dev name */ + /* parse out dev_name or dev_mac */ if ((delim = strchr(cur, ',')) == NULL) goto parse_failed; *delim = 0; - strscpy(np->dev_name, cur, sizeof(np->dev_name)); + + np->dev_name[0] = '\0'; + eth_broadcast_addr(np->dev_mac); + if (!strchr(cur, ':')) + strscpy(np->dev_name, cur, sizeof(np->dev_name)); + else if (!mac_pton(cur, np->dev_mac)) + goto parse_failed; + cur = delim; } cur++; @@ -621,6 +635,14 @@ int netpoll_parse_options(struct netpoll *np, char *opt) } EXPORT_SYMBOL(netpoll_parse_options); +static void refill_skbs_work_handler(struct work_struct *work) +{ + struct netpoll *np = + container_of(work, struct netpoll, refill_wq); + + refill_skbs(np); +} + int __netpoll_setup(struct netpoll *np, struct net_device *ndev) { struct netpoll_info *npinfo; @@ -636,7 +658,8 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) goto out; } - if (!rcu_access_pointer(ndev->npinfo)) { + npinfo = rtnl_dereference(ndev->npinfo); + if 
(!npinfo) { npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL); if (!npinfo) { err = -ENOMEM; goto out; } @@ -656,7 +679,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) goto free_npinfo; } } else { - npinfo = rtnl_dereference(ndev->npinfo); refcount_inc(&npinfo->refcnt); } @@ -666,6 +688,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) /* fill up the skb queue */ refill_skbs(np); + INIT_WORK(&np->refill_wq, refill_skbs_work_handler); /* last thing to do is link it to the net device structure */ rcu_assign_pointer(ndev->npinfo, npinfo); @@ -679,27 +702,45 @@ out: } EXPORT_SYMBOL_GPL(__netpoll_setup); +/* + * Returns a pointer to a string representation of the identifier used + * to select the egress interface for the given netpoll instance. buf + * must be a buffer of length at least MAC_ADDR_STR_LEN + 1. + */ +static char *egress_dev(struct netpoll *np, char *buf) +{ + if (np->dev_name[0]) + return np->dev_name; + + snprintf(buf, MAC_ADDR_STR_LEN + 1, "%pM", np->dev_mac); + return buf; +} + int netpoll_setup(struct netpoll *np) { + struct net *net = current->nsproxy->net_ns; + char buf[MAC_ADDR_STR_LEN + 1]; struct net_device *ndev = NULL; bool ip_overwritten = false; struct in_device *in_dev; int err; rtnl_lock(); - if (np->dev_name[0]) { - struct net *net = current->nsproxy->net_ns; + if (np->dev_name[0]) ndev = __dev_get_by_name(net, np->dev_name); - } + else if (is_valid_ether_addr(np->dev_mac)) + ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac); + if (!ndev) { - np_err(np, "%s doesn't exist, aborting\n", np->dev_name); + np_err(np, "%s doesn't exist, aborting\n", egress_dev(np, buf)); err = -ENODEV; goto unlock; } netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL); if (netdev_master_upper_dev_get(ndev)) { - np_err(np, "%s is a slave device, aborting\n", np->dev_name); + np_err(np, "%s is a slave device, aborting\n", + egress_dev(np, buf)); err = -EBUSY; goto put; } @@ -707,7 +748,8 @@ int netpoll_setup(struct netpoll *np) if (!netif_running(ndev)) { unsigned long atmost; - np_info(np, "device %s not up yet, forcing it\n", np->dev_name); + np_info(np, "device %s not up yet, forcing it\n", + egress_dev(np, buf)); err = dev_open(ndev, NULL); @@ -741,7 +783,7 @@ int netpoll_setup(struct netpoll *np) if (!ifa) { put_noaddr: np_err(np, "no IP address for %s, aborting\n", - np->dev_name); + egress_dev(np, buf)); err = -EDESTADDRREQ; goto put; } @@ -772,13 +814,13 @@ put_noaddr: } if (err) { np_err(np, "no IPv6 address for %s, aborting\n", - np->dev_name); + egress_dev(np, buf)); goto put; } else np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6); #else np_err(np, "IPv6 is not supported %s, aborting\n", - np->dev_name); + egress_dev(np, buf)); err = -EINVAL; goto put; #endif diff --git a/net/core/page_pool.c b/net/core/page_pool.c index f89cf93f6eb4..ba7cf3e3c32f 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -11,8 +11,10 @@ #include <linux/slab.h> #include <linux/device.h> +#include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> #include <net/xdp.h> #include <linux/dma-direction.h> @@ -25,6 +27,7 @@ #include <trace/events/page_pool.h> +#include "dev.h" #include "mp_dmabuf_devmem.h" #include "netmem_priv.h" #include "page_pool_priv.h" @@ -150,9 +153,9 @@ u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) EXPORT_SYMBOL(page_pool_ethtool_stats_get); #else -#define alloc_stat_inc(pool, __stat) -#define recycle_stat_inc(pool, __stat) -#define
recycle_stat_add(pool, __stat, val) +#define alloc_stat_inc(...) do { } while (0) +#define recycle_stat_inc(...) do { } while (0) +#define recycle_stat_add(...) do { } while (0) #endif static bool page_pool_producer_lock(struct page_pool *pool) @@ -201,6 +204,7 @@ static int page_pool_init(struct page_pool *pool, memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); pool->cpuid = cpuid; + pool->dma_sync_for_cpu = true; /* Validate only known flags were used */ if (pool->slow.flags & ~PP_FLAG_ALL) @@ -272,22 +276,26 @@ static int page_pool_init(struct page_pool *pool, /* Driver calling page_pool_create() also call page_pool_destroy() */ refcount_set(&pool->user_cnt, 1); - if (pool->dma_map) - get_device(pool->p.dev); + xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { - /* We rely on rtnl_lock()ing to make sure netdev_rx_queue - * configuration doesn't change while we're initializing - * the page_pool. - */ - ASSERT_RTNL(); + netdev_assert_locked(pool->slow.netdev); rxq = __netif_get_rx_queue(pool->slow.netdev, pool->slow.queue_idx); pool->mp_priv = rxq->mp_params.mp_priv; + pool->mp_ops = rxq->mp_params.mp_ops; } - if (pool->mp_priv) { - err = mp_dmabuf_devmem_init(pool); + if (pool->mp_ops) { + if (!pool->dma_map || !pool->dma_sync) + return -EOPNOTSUPP; + + if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { + err = -EFAULT; + goto free_ptr_ring; + } + + err = pool->mp_ops->init(pool); if (err) { pr_warn("%s() mem-provider init failed %d\n", __func__, err); @@ -311,9 +319,7 @@ free_ptr_ring: static void page_pool_uninit(struct page_pool *pool) { ptr_ring_cleanup(&pool->ring, NULL); - - if (pool->dma_map) - put_device(pool->p.dev); + xa_destroy(&pool->dma_mapped); #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) @@ -454,13 +460,21 @@ page_pool_dma_sync_for_device(const struct page_pool *pool, netmem_ref netmem, u32 dma_sync_size) { - if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) - __page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); + if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) { + rcu_read_lock(); + /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ + if (pool->dma_sync) + __page_pool_dma_sync_for_device(pool, netmem, + dma_sync_size); + rcu_read_unlock(); + } } -static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) +static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { dma_addr_t dma; + int err; + u32 id; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit @@ -474,15 +488,30 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem) if (dma_mapping_error(pool->p.dev, dma)) return false; - if (page_pool_set_dma_addr_netmem(netmem, dma)) + if (page_pool_set_dma_addr_netmem(netmem, dma)) { + WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); goto unmap_failed; + } + if (in_softirq()) + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + else + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + if (err) { + WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + goto unset_failed; + } + + netmem_set_dma_index(netmem, id); page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); return true; +unset_failed: + page_pool_set_dma_addr_netmem(netmem, 0); unmap_failed: - WARN_ONCE(1, "unexpected DMA 
address, please report to netdev@"); dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); @@ -499,7 +528,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, if (unlikely(!page)) return NULL; - if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page)))) { + if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) { put_page(page); return NULL; } @@ -532,12 +561,11 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, if (unlikely(pool->alloc.count > 0)) return pool->alloc.cache[--pool->alloc.count]; - /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */ + /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); - nr_pages = alloc_pages_bulk_array_node(gfp, - pool->p.nid, bulk, - (struct page **)pool->alloc.cache); + nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, + (struct page **)pool->alloc.cache); if (unlikely(!nr_pages)) return 0; @@ -546,7 +574,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, */ for (i = 0; i < nr_pages; i++) { netmem = pool->alloc.cache[i]; - if (dma_map && unlikely(!page_pool_dma_map(pool, netmem))) { + if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) { put_page(netmem_to_page(netmem)); continue; } @@ -574,7 +602,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, /* For using page_pool replace: alloc_pages() API calls, but provide * synchronization guarantee for allocation side. */ -netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp) +netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) { netmem_ref netmem; @@ -584,20 +612,20 @@ netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp) return netmem; /* Slow-path: cache empty, do real allocation */ - if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) - netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp); + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + netmem = pool->mp_ops->alloc_netmems(pool, gfp); else netmem = __page_pool_alloc_pages_slow(pool, gfp); return netmem; } -EXPORT_SYMBOL(page_pool_alloc_netmem); +EXPORT_SYMBOL(page_pool_alloc_netmems); +ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL); struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) { - return netmem_to_page(page_pool_alloc_netmem(pool, gfp)); + return netmem_to_page(page_pool_alloc_netmems(pool, gfp)); } EXPORT_SYMBOL(page_pool_alloc_pages); -ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL); /* Calculate distance between two u32 values, valid if distance is below 2^(31) * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution @@ -648,6 +676,8 @@ void page_pool_clear_pp_info(netmem_ref netmem) static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, netmem_ref netmem) { + struct page *old, *page = netmem_to_page(netmem); + unsigned long id; dma_addr_t dma; if (!pool->dma_map) @@ -656,6 +686,17 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, */ return; + id = netmem_get_dma_index(netmem); + if (!id) + return; + + if (in_softirq()) + old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); + else + old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); + if (old != page) + return; + dma = 
page_pool_get_dma_addr_netmem(netmem); /* When page is unmapped, it cannot be returned to our pool */ @@ -663,6 +704,7 @@ static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr_netmem(netmem, 0); + netmem_set_dma_index(netmem, 0); } /* Disconnects a page (from a page_pool). API users can have a need @@ -676,8 +718,8 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) bool put; put = true; - if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) - put = mp_dmabuf_devmem_release_page(pool, netmem); + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + put = pool->mp_ops->release_netmem(pool, netmem); else __page_pool_release_page_dma(pool, netmem); @@ -699,19 +741,16 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem) { - int ret; - /* BH protection not needed if current is softirq */ - if (in_softirq()) - ret = ptr_ring_produce(&pool->ring, (__force void *)netmem); - else - ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem); + bool in_softirq, ret; - if (!ret) { + /* BH protection not needed if current is softirq */ + in_softirq = page_pool_producer_lock(pool); + ret = !__ptr_ring_produce(&pool->ring, (__force void *)netmem); + if (ret) recycle_stat_inc(pool, ring); - return true; - } + page_pool_producer_unlock(pool, in_softirq); - return false; + return ret; } /* Only allow direct recycling in special circumstances, into the @@ -797,6 +836,10 @@ static bool page_pool_napi_local(const struct page_pool *pool) const struct napi_struct *napi; u32 cpuid; + /* On PREEMPT_RT the softirq can be preempted by the consumer */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return false; + if (unlikely(!in_softirq())) return false; @@ -821,8 +864,8 @@ void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem, if (!allow_direct) allow_direct = page_pool_napi_local(pool); - netmem = - __page_pool_put_page(pool, netmem, dma_sync_size, allow_direct); + netmem = __page_pool_put_page(pool, netmem, dma_sync_size, + allow_direct); if (netmem && !page_pool_recycle_in_ring(pool, netmem)) { /* Cache full, fallback to free pages */ recycle_stat_inc(pool, ring_full); @@ -839,69 +882,104 @@ void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, } EXPORT_SYMBOL(page_pool_put_unrefed_page); -/** - * page_pool_put_page_bulk() - release references on multiple pages - * @pool: pool from which pages were allocated - * @data: array holding page pointers - * @count: number of pages in @data - * - * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring - * producer lock. If the ptr_ring is full, page_pool_put_page_bulk() - * will release leftover pages to the page allocator. - * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx - * completion loop for the XDP_REDIRECT use case. - * - * Please note the caller must not use data area after running - * page_pool_put_page_bulk(), as this function overwrites it. 
- */ -void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count) +static void page_pool_recycle_ring_bulk(struct page_pool *pool, + netmem_ref *bulk, + u32 bulk_len) { - int i, bulk_len = 0; - bool allow_direct; bool in_softirq; + u32 i; - allow_direct = page_pool_napi_local(pool); - - for (i = 0; i < count; i++) { - netmem_ref netmem = page_to_netmem(virt_to_head_page(data[i])); - - /* It is not the last user for the page frag case */ - if (!page_pool_is_last_ref(netmem)) - continue; - - netmem = __page_pool_put_page(pool, netmem, -1, allow_direct); - /* Approved for bulk recycling in ptr_ring cache */ - if (netmem) - data[bulk_len++] = (__force void *)netmem; - } - - if (!bulk_len) - return; - - /* Bulk producer into ptr_ring page_pool cache */ + /* Bulk produce into ptr_ring page_pool cache */ in_softirq = page_pool_producer_lock(pool); + for (i = 0; i < bulk_len; i++) { - if (__ptr_ring_produce(&pool->ring, data[i])) { + if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) { /* ring full */ recycle_stat_inc(pool, ring_full); break; } } - recycle_stat_add(pool, ring, i); + page_pool_producer_unlock(pool, in_softirq); + recycle_stat_add(pool, ring, i); - /* Hopefully all pages was return into ptr_ring */ + /* Hopefully all pages were returned into ptr_ring */ if (likely(i == bulk_len)) return; - /* ptr_ring cache full, free remaining pages outside producer lock - * since put_page() with refcnt == 1 can be an expensive operation + /* + * ptr_ring cache is full, free remaining pages outside producer lock + * since put_page() with refcnt == 1 can be an expensive operation. */ for (; i < bulk_len; i++) - page_pool_return_page(pool, (__force netmem_ref)data[i]); + page_pool_return_page(pool, bulk[i]); } -EXPORT_SYMBOL(page_pool_put_page_bulk); + +/** + * page_pool_put_netmem_bulk() - release references on multiple netmems + * @data: array holding netmem references + * @count: number of entries in @data + * + * Tries to refill a number of netmems into the ptr_ring cache holding ptr_ring + * producer lock. If the ptr_ring is full, page_pool_put_netmem_bulk() + * will release leftover netmems to the memory provider. + * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx + * completion loop for the XDP_REDIRECT use case. + * + * Please note the caller must not use data area after running + * page_pool_put_netmem_bulk(), as this function overwrites it. + */ +void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) +{ + u32 bulk_len = 0; + + for (u32 i = 0; i < count; i++) { + netmem_ref netmem = netmem_compound_head(data[i]); + + if (page_pool_unref_and_test(netmem)) + data[bulk_len++] = netmem; + } + + count = bulk_len; + while (count) { + netmem_ref bulk[XDP_BULK_QUEUE_SIZE]; + struct page_pool *pool = NULL; + bool allow_direct; + u32 foreign = 0; + + bulk_len = 0; + + for (u32 i = 0; i < count; i++) { + struct page_pool *netmem_pp; + netmem_ref netmem = data[i]; + + netmem_pp = netmem_get_pp(netmem); + if (unlikely(!pool)) { + pool = netmem_pp; + allow_direct = page_pool_napi_local(pool); + } else if (netmem_pp != pool) { + /* + * If the netmem belongs to a different + * page_pool, save it for another round. 
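+ * Draining exactly one pool per pass of the outer loop keeps the
+ * ptr_ring producer lock scoped to a single page_pool at a time;
+ * the deferred netmems are picked up on the next pass.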
+ */ + data[foreign++] = netmem; + continue; + } + + netmem = __page_pool_put_page(pool, netmem, -1, + allow_direct); + /* Approved for bulk recycling in ptr_ring cache */ + if (netmem) + bulk[bulk_len++] = netmem; + } + + if (bulk_len) + page_pool_recycle_ring_bulk(pool, bulk, bulk_len); + + count = foreign; + } +} +EXPORT_SYMBOL(page_pool_put_netmem_bulk); static netmem_ref page_pool_drain_frag(struct page_pool *pool, netmem_ref netmem) @@ -957,7 +1035,7 @@ netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, } if (!netmem) { - netmem = page_pool_alloc_netmem(pool, gfp); + netmem = page_pool_alloc_netmems(pool, gfp); if (unlikely(!netmem)) { pool->frag_page = 0; return 0; @@ -1010,8 +1088,8 @@ static void __page_pool_destroy(struct page_pool *pool) page_pool_unlist(pool); page_pool_uninit(pool); - if (pool->mp_priv) { - mp_dmabuf_devmem_destroy(pool); + if (pool->mp_ops) { + pool->mp_ops->destroy(pool); static_branch_dec(&page_pool_mem_providers); } @@ -1037,8 +1115,29 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool) static void page_pool_scrub(struct page_pool *pool) { + unsigned long id; + void *ptr; + page_pool_empty_alloc_cache_once(pool); - pool->destroy_cnt++; + if (!pool->destroy_cnt++ && pool->dma_map) { + if (pool->dma_sync) { + /* Disable page_pool_dma_sync_for_device() */ + pool->dma_sync = false; + + /* Make sure all concurrent returns that may see the old + * value of dma_sync (and thus perform a sync) have + * finished before doing the unmapping below. Skip the + * wait if the device doesn't actually need syncing, or + * if there are no outstanding mapped pages. + */ + if (dma_dev_need_sync(pool->p.dev) && + !xa_empty(&pool->dma_mapped)) + synchronize_net(); + } + + xa_for_each(&pool->dma_mapped, id, ptr) + __page_pool_release_page_dma(pool, page_to_netmem(ptr)); + } /* No more consumers should exist, but producers could still * be in-flight. @@ -1048,10 +1147,14 @@ static void page_pool_scrub(struct page_pool *pool) static int page_pool_release(struct page_pool *pool) { + bool in_softirq; int inflight; page_pool_scrub(pool); inflight = page_pool_inflight(pool, true); + /* Acquire producer lock to make sure producers have exited. */ + in_softirq = page_pool_producer_lock(pool); + page_pool_producer_unlock(pool, in_softirq); if (!inflight) __page_pool_destroy(pool); @@ -1066,7 +1169,13 @@ static void page_pool_release_retry(struct work_struct *wq) int inflight; inflight = page_pool_release(pool); - if (!inflight) + /* In rare cases, a driver bug may cause inflight to go negative. + * Don't reschedule release if inflight is 0 or negative. + * - If 0, the page_pool has been destroyed + * - if negative, we will never recover + * in both cases no reschedule is necessary. + */ + if (inflight <= 0) return; /* Periodic warning for page pools the user can't see */ @@ -1102,13 +1211,11 @@ void page_pool_disable_direct_recycling(struct page_pool *pool) if (!pool->p.napi) return; - /* To avoid races with recycling and additional barriers make sure - * pool and NAPI are unlinked when NAPI is disabled. 
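The retry logic above stops rescheduling the release work once inflight is zero or negative: zero means the pool was destroyed, and a negative count (a driver releasing more pages than it held) can never recover, so rescheduling would spin forever. A small sketch of that stop condition, with plain counters standing in for the pool's hold/release counters; names are illustrative:

#include <stdio.h>

/* stand-ins for pages_state_hold_cnt / pages_state_release_cnt */
static long held, released;

static long inflight(void)
{
    return held - released;     /* may go negative on an over-release bug */
}

static int try_destroy(void)
{
    long n = inflight();

    /*
     * Mirror the rule above: stop when nothing is in flight, and also
     * stop if the count went negative, since that state never recovers.
     */
    if (n <= 0)
        return 1;               /* destroyed, or unrecoverable */

    fprintf(stderr, "still %ld objects in flight, retrying later\n", n);
    return 0;                   /* caller reschedules the retry work */
}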
- */ - WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state)); - WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1); + napi_assert_will_not_race(pool->p.napi); + mutex_lock(&page_pools_lock); WRITE_ONCE(pool->p.napi, NULL); + mutex_unlock(&page_pools_lock); } EXPORT_SYMBOL(page_pool_disable_direct_recycling); @@ -1150,3 +1257,31 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); + +bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr) +{ + return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr); +} + +/* Associate a niov with a page pool. Should follow with a matching + * net_mp_niov_clear_page_pool() + */ +void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_set_pp_info(pool, netmem); + + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); +} + +/* Disassociate a niov from a page pool. Should only be used in the + * ->release_netmem() path. + */ +void net_mp_niov_clear_page_pool(struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_clear_pp_info(netmem); +} diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h index 57439787b9c2..2fb06d5f6d55 100644 --- a/net/core/page_pool_priv.h +++ b/net/core/page_pool_priv.h @@ -7,6 +7,8 @@ #include "netmem_priv.h" +extern struct mutex page_pools_lock; + s32 page_pool_inflight(const struct page_pool *pool, bool strict); int page_pool_list(struct page_pool *pool); diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index 48335766c1bf..c82a95beceff 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -3,21 +3,23 @@ #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/xarray.h> +#include <net/busy_poll.h> #include <net/net_debug.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> #include <net/page_pool/types.h> +#include <net/page_pool/memory_provider.h> #include <net/sock.h> -#include "devmem.h" #include "page_pool_priv.h" #include "netdev-genl-gen.h" static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1); -/* Protects: page_pools, netdevice->page_pools, pool->slow.netdev, pool->user. +/* Protects: page_pools, netdevice->page_pools, pool->p.napi, pool->slow.netdev, + * pool->user. * Ordering: inside rtnl_lock */ -static DEFINE_MUTEX(page_pools_lock); +DEFINE_MUTEX(page_pools_lock); /* Page pools are only reachable from user space (via netlink) if they are * linked to a netdev at creation time. Following page pool "visibility" @@ -214,8 +216,8 @@ static int page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info) { - struct net_devmem_dmabuf_binding *binding = pool->mp_priv; size_t inflight, refsz; + unsigned int napi_id; void *hdr; hdr = genlmsg_iput(rsp, info); @@ -229,8 +231,10 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, pool->slow.netdev->ifindex)) goto err_cancel; - if (pool->user.napi_id && - nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, pool->user.napi_id)) + + napi_id = pool->p.napi ? 
READ_ONCE(pool->p.napi->napi_id) : 0; + if (napi_id_valid(napi_id) && + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, napi_id)) goto err_cancel; inflight = page_pool_inflight(pool, false); @@ -244,7 +248,7 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, pool->user.detach_time)) goto err_cancel; - if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id)) + if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL)) goto err_cancel; genlmsg_end(rsp, hdr); @@ -319,8 +323,6 @@ int page_pool_list(struct page_pool *pool) if (pool->slow.netdev) { hlist_add_head(&pool->user.list, &pool->slow.netdev->page_pools); - pool->user.napi_id = pool->p.napi ? pool->p.napi->napi_id : 0; - netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF); } @@ -353,7 +355,7 @@ void page_pool_unlist(struct page_pool *pool) int page_pool_check_memory_provider(struct net_device *dev, struct netdev_rx_queue *rxq) { - struct net_devmem_dmabuf_binding *binding = rxq->mp_params.mp_priv; + void *binding = rxq->mp_params.mp_priv; struct page_pool *pool; struct hlist_node *n; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 4cb547fae91f..0ebe5461d4d9 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -158,9 +158,7 @@ #include <net/udp.h> #include <net/ip6_checksum.h> #include <net/addrconf.h> -#ifdef CONFIG_XFRM #include <net/xfrm.h> -#endif #include <net/netns/generic.h> #include <asm/byteorder.h> #include <linux/rcupdate.h> @@ -179,7 +177,7 @@ #define MAX_IMIX_ENTRIES 20 #define IMIX_PRECISION 100 /* Precision of IMIX distribution */ -#define func_enter() pr_debug("entering %s\n", __func__); +#define func_enter() pr_debug("entering %s\n", __func__) #define PKT_FLAGS \ pf(IPV6) /* Interface in IPV6 Mode */ \ @@ -229,12 +227,12 @@ static char *pkt_flag_names[] = { /* Xmit modes */ #define M_START_XMIT 0 /* Default normal TX */ -#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ +#define M_NETIF_RECEIVE 1 /* Inject packets into stack */ #define M_QUEUE_XMIT 2 /* Inject packet into qdisc */ /* If lock -- protects updating of if_list */ -#define if_lock(t) mutex_lock(&(t->if_lock)); -#define if_unlock(t) mutex_unlock(&(t->if_lock)); +#define if_lock(t) mutex_lock(&(t->if_lock)) +#define if_unlock(t) mutex_unlock(&(t->if_lock)) /* Used to help with determining the pkts on receive */ #define PKTGEN_MAGIC 0xbe9be955 @@ -285,7 +283,8 @@ struct pktgen_dev { int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ int nfrags; int removal_mark; /* non-zero => the device is marked for - * removal by worker thread */ + * removal by worker thread + */ struct page *page; u64 delay; /* nano-seconds */ @@ -348,10 +347,12 @@ struct pktgen_dev { __u16 udp_dst_max; /* exclusive, dest UDP port */ /* DSCP + ECN */ - __u8 tos; /* six MSB of (former) IPv4 TOS - are for dscp codepoint */ - __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 - (see RFC 3260, sec. 4) */ + __u8 tos; /* six MSB of (former) IPv4 TOS + * are for dscp codepoint + */ + __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 + * (see RFC 3260, sec. 
4) + */ /* IMIX */ unsigned int n_imix_entries; @@ -391,12 +392,12 @@ struct pktgen_dev { __u8 hh[14]; /* = { - 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, - - We fill in SRC address later - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x08, 0x00 - }; + * 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, + * + * We fill in SRC address later + * 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + * 0x08, 0x00 + * }; */ __u16 pad; /* pad out the hh struct to an even 16 bytes */ @@ -460,7 +461,8 @@ struct pktgen_thread { char result[512]; /* Field for thread to receive "posted" events terminate, - stop ifs etc. */ + * stop ifs etc. + */ u32 control; int cpu; @@ -474,8 +476,7 @@ struct pktgen_thread { #define FIND 0 static const char version[] = - "Packet Generator for packet performance testing. " - "Version: " VERSION "\n"; + "Packet Generator for packet performance testing. Version: " VERSION "\n"; static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i); static int pktgen_add_device(struct pktgen_thread *t, const char *ifname); @@ -517,21 +518,23 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char data[128]; + size_t max; struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id); if (!capable(CAP_NET_ADMIN)) return -EPERM; - if (count == 0) + if (count < 1) return -EINVAL; - if (count > sizeof(data)) - count = sizeof(data); - - if (copy_from_user(data, buf, count)) + max = min(count, sizeof(data) - 1); + if (copy_from_user(data, buf, max)) return -EFAULT; - data[count - 1] = 0; /* Strip trailing '\n' and terminate string */ + if (data[max - 1] == '\n') + data[max - 1] = 0; /* strip trailing '\n', terminate string */ + else + data[max] = 0; /* terminate string */ if (!strcmp(data, "stop")) pktgen_stop_all_threads(pn); @@ -624,8 +627,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_printf(seq, "%pM\n", pkt_dev->dst_mac); seq_printf(seq, - " udp_src_min: %d udp_src_max: %d" - " udp_dst_min: %d udp_dst_max: %d\n", + " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n", pkt_dev->udp_src_min, pkt_dev->udp_src_max, pkt_dev->udp_dst_min, pkt_dev->udp_dst_max); @@ -744,34 +746,37 @@ static int pktgen_if_show(struct seq_file *seq, void *v) } -static int hex32_arg(const char __user *user_buffer, unsigned long maxlen, - __u32 *num) +static ssize_t hex32_arg(const char __user *user_buffer, size_t maxlen, + __u32 *num) { - int i = 0; + size_t i = 0; + *num = 0; for (; i < maxlen; i++) { int value; char c; - *num <<= 4; + if (get_user(c, &user_buffer[i])) return -EFAULT; value = hex_to_bin(c); - if (value >= 0) + if (value >= 0) { + *num <<= 4; *num |= value; - else + } else { break; + } } return i; } -static int count_trail_chars(const char __user * user_buffer, - unsigned int maxlen) +static ssize_t count_trail_chars(const char __user *user_buffer, size_t maxlen) { - int i; + size_t i; for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; switch (c) { @@ -790,14 +795,15 @@ done: return i; } -static long num_arg(const char __user *user_buffer, unsigned long maxlen, - unsigned long *num) +static ssize_t num_arg(const char __user *user_buffer, size_t maxlen, + unsigned long *num) { - int i; + size_t i; *num = 0; for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; if ((c >= '0') && (c <= '9')) { @@ -809,12 +815,13 @@ static long num_arg(const char __user *user_buffer, unsigned long maxlen, return i; } -static int strn_len(const char __user * 
user_buffer, unsigned int maxlen) +static ssize_t strn_len(const char __user *user_buffer, size_t maxlen) { - int i; + size_t i; for (i = 0; i < maxlen; i++) { char c; + if (get_user(c, &user_buffer[i])) return -EFAULT; switch (c) { @@ -823,6 +830,7 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen) case '\r': case '\t': case ' ': + case '=': goto done_str; default: break; @@ -838,11 +846,11 @@ done_str: * "size1,weight_1 size2,weight_2 ... size_n,weight_n" for example. */ static ssize_t get_imix_entries(const char __user *buffer, + size_t maxlen, struct pktgen_dev *pkt_dev) { - const int max_digits = 10; - int i = 0; - long len; + size_t i = 0, max; + ssize_t len; char c; pkt_dev->n_imix_entries = 0; @@ -854,21 +862,30 @@ static ssize_t get_imix_entries(const char __user *buffer, if (pkt_dev->n_imix_entries >= MAX_IMIX_ENTRIES) return -E2BIG; - len = num_arg(&buffer[i], max_digits, &size); + if (i >= maxlen) + return -EINVAL; + + max = min(10, maxlen - i); + len = num_arg(&buffer[i], max, &size); if (len < 0) return len; i += len; + if (i >= maxlen) + return -EINVAL; if (get_user(c, &buffer[i])) return -EFAULT; /* Check for comma between size_i and weight_i */ if (c != ',') return -EINVAL; i++; + if (i >= maxlen) + return -EINVAL; if (size < 14 + 20 + 8) size = 14 + 20 + 8; - len = num_arg(&buffer[i], max_digits, &weight); + max = min(10, maxlen - i); + len = num_arg(&buffer[i], max, &weight); if (len < 0) return len; if (weight <= 0) @@ -878,39 +895,55 @@ static ssize_t get_imix_entries(const char __user *buffer, pkt_dev->imix_entries[pkt_dev->n_imix_entries].weight = weight; i += len; + pkt_dev->n_imix_entries++; + + if (i >= maxlen) + break; if (get_user(c, &buffer[i])) return -EFAULT; - i++; - pkt_dev->n_imix_entries++; } while (c == ' '); return i; } -static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) +static ssize_t get_labels(const char __user *buffer, + size_t maxlen, struct pktgen_dev *pkt_dev) { unsigned int n = 0; + size_t i = 0, max; + ssize_t len; char c; - ssize_t i = 0; - int len; pkt_dev->nr_labels = 0; do { __u32 tmp; - len = hex32_arg(&buffer[i], 8, &tmp); - if (len <= 0) + + if (n >= MAX_MPLS_LABELS) + return -E2BIG; + + if (i >= maxlen) + return -EINVAL; + + max = min(8, maxlen - i); + len = hex32_arg(&buffer[i], max, &tmp); + if (len < 0) return len; + + /* return empty list in case of invalid input or zero value */ + if (len == 0 || tmp == 0) + return maxlen; + pkt_dev->labels[n] = htonl(tmp); if (pkt_dev->labels[n] & MPLS_STACK_BOTTOM) pkt_dev->flags |= F_MPLS_RND; i += len; + n++; + if (i >= maxlen) + break; if (get_user(c, &buffer[i])) return -EFAULT; i++; - n++; - if (n >= MAX_MPLS_LABELS) - return -E2BIG; } while (c == ','); pkt_dev->nr_labels = n; @@ -947,16 +980,16 @@ static __u32 pktgen_read_flag(const char *f, bool *disable) } static ssize_t pktgen_if_write(struct file *file, - const char __user * user_buffer, size_t count, - loff_t * offset) + const char __user *user_buffer, size_t count, + loff_t *offset) { struct seq_file *seq = file->private_data; struct pktgen_dev *pkt_dev = seq->private; - int i, max, len; + size_t i, max; + ssize_t len; char name[16], valstr[32]; unsigned long value = 0; char *pg_result = NULL; - int tmp = 0; char buf[128]; pg_result = &(pkt_dev->result[0]); @@ -967,16 +1000,16 @@ static ssize_t pktgen_if_write(struct file *file, } max = count; - tmp = count_trail_chars(user_buffer, max); - if (tmp < 0) { + len = count_trail_chars(user_buffer, max); + if (len < 0) { 
pr_warn("illegal format\n"); - return tmp; + return len; } - i = tmp; + i = len; /* Read variable name */ - - len = strn_len(&user_buffer[i], sizeof(name) - 1); + max = min(sizeof(name) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1004,11 +1037,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "min_pkt_size")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; if (value != pkt_dev->min_pkt_size) { @@ -1021,11 +1054,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "max_pkt_size")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; if (value != pkt_dev->max_pkt_size) { @@ -1040,11 +1073,11 @@ static ssize_t pktgen_if_write(struct file *file, /* Shortcut for min = max */ if (!strcmp(name, "pkt_size")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; if (value != pkt_dev->min_pkt_size) { @@ -1060,43 +1093,43 @@ static ssize_t pktgen_if_write(struct file *file, if (pkt_dev->clone_skb > 0) return -EINVAL; - len = get_imix_entries(&user_buffer[i], pkt_dev); + max = count - i; + len = get_imix_entries(&user_buffer[i], max, pkt_dev); if (len < 0) return len; fill_imix_distribution(pkt_dev); - i += len; return count; } if (!strcmp(name, "debug")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; debug = value; sprintf(pg_result, "OK: debug=%u", debug); return count; } if (!strcmp(name, "frags")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->nfrags = value; sprintf(pg_result, "OK: frags=%d", pkt_dev->nfrags); return count; } if (!strcmp(name, "delay")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value == 0x7FFFFFFF) pkt_dev->delay = ULLONG_MAX; else @@ -1107,13 +1140,13 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "rate")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (!value) - return len; + return -EINVAL; pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value; if (debug) pr_info("Delay set at: %llu ns\n", pkt_dev->delay); @@ -1122,13 +1155,13 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "ratep")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (!value) - return len; + return -EINVAL; pkt_dev->delay = NSEC_PER_SEC/value; if (debug) pr_info("Delay set at: %llu ns\n", pkt_dev->delay); @@ -1137,11 +1170,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_src_min")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = 
num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_src_min) { pkt_dev->udp_src_min = value; pkt_dev->cur_udp_src = value; @@ -1150,11 +1183,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_dst_min")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_dst_min) { pkt_dev->udp_dst_min = value; pkt_dev->cur_udp_dst = value; @@ -1163,11 +1196,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_src_max")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_src_max) { pkt_dev->udp_src_max = value; pkt_dev->cur_udp_src = value; @@ -1176,11 +1209,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_dst_max")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_dst_max) { pkt_dev->udp_dst_max = value; pkt_dev->cur_udp_dst = value; @@ -1189,7 +1222,8 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "clone_skb")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; /* clone_skb is not supported for netif_receive xmit_mode and @@ -1198,34 +1232,33 @@ static ssize_t pktgen_if_write(struct file *file, if ((value > 0) && ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) || !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) - return -ENOTSUPP; + return -EOPNOTSUPP; if (value > 0 && (pkt_dev->n_imix_entries > 0 || !(pkt_dev->flags & F_SHARED))) return -EINVAL; - i += len; pkt_dev->clone_skb = value; sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb); return count; } if (!strcmp(name, "count")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->count = value; sprintf(pg_result, "OK: count=%llu", (unsigned long long)pkt_dev->count); return count; } if (!strcmp(name, "src_mac_count")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (pkt_dev->src_mac_count != value) { pkt_dev->src_mac_count = value; pkt_dev->cur_src_mac_offset = 0; @@ -1235,11 +1268,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "dst_mac_count")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (pkt_dev->dst_mac_count != value) { pkt_dev->dst_mac_count = value; pkt_dev->cur_dst_mac_offset = 0; @@ -1249,16 +1282,16 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "burst")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value > 1) && ((pkt_dev->xmit_mode == M_QUEUE_XMIT) || ((pkt_dev->xmit_mode == M_START_XMIT) && (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))))) - return -ENOTSUPP; + return 
-EOPNOTSUPP; if (value > 1 && !(pkt_dev->flags & F_SHARED)) return -EINVAL; @@ -1268,12 +1301,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "node")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; - if (node_possible(value)) { pkt_dev->node = value; sprintf(pg_result, "OK: node=%d", pkt_dev->node); @@ -1281,29 +1313,29 @@ static ssize_t pktgen_if_write(struct file *file, put_page(pkt_dev->page); pkt_dev->page = NULL; } - } - else + } else { sprintf(pg_result, "ERROR: node not possible"); + } return count; } if (!strcmp(name, "xmit_mode")) { char f[32]; - memset(f, 0, 32); - len = strn_len(&user_buffer[i], sizeof(f) - 1); + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; + memset(f, 0, sizeof(f)); if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; - i += len; if (strcmp(f, "start_xmit") == 0) { pkt_dev->xmit_mode = M_START_XMIT; } else if (strcmp(f, "netif_receive") == 0) { /* clone_skb set earlier, not supported in this mode */ if (pkt_dev->clone_skb > 0) - return -ENOTSUPP; + return -EOPNOTSUPP; pkt_dev->xmit_mode = M_NETIF_RECEIVE; @@ -1329,14 +1361,14 @@ static ssize_t pktgen_if_write(struct file *file, char f[32]; char *end; - memset(f, 0, 32); - len = strn_len(&user_buffer[i], sizeof(f) - 1); + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; + memset(f, 0, 32); if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; - i += len; flag = pktgen_read_flag(f, &disable); if (flag) { @@ -1378,7 +1410,8 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1); + max = min(sizeof(pkt_dev->dst_min) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1386,19 +1419,19 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->dst_min) != 0) { - memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min)); - strcpy(pkt_dev->dst_min, buf); + strscpy_pad(pkt_dev->dst_min, buf); pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); pkt_dev->cur_daddr = pkt_dev->daddr_min; } if (debug) pr_debug("dst_min set to: %s\n", pkt_dev->dst_min); - i += len; + sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min); return count; } if (!strcmp(name, "dst_max")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1); + max = min(sizeof(pkt_dev->dst_max) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1406,19 +1439,19 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->dst_max) != 0) { - memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max)); - strcpy(pkt_dev->dst_max, buf); + strscpy_pad(pkt_dev->dst_max, buf); pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); pkt_dev->cur_daddr = pkt_dev->daddr_max; } if (debug) pr_debug("dst_max set to: %s\n", pkt_dev->dst_max); - i += len; + sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max); return count; } if (!strcmp(name, "dst6")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1436,12 +1469,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("dst6 set to: 
%s\n", buf); - i += len; sprintf(pg_result, "OK: dst6=%s", buf); return count; } if (!strcmp(name, "dst6_min")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1458,12 +1491,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("dst6_min set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: dst6_min=%s", buf); return count; } if (!strcmp(name, "dst6_max")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1479,12 +1512,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("dst6_max set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: dst6_max=%s", buf); return count; } if (!strcmp(name, "src6")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1502,12 +1535,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("src6 set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: src6=%s", buf); return count; } if (!strcmp(name, "src_min")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1); + max = min(sizeof(pkt_dev->src_min) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1515,19 +1548,19 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->src_min) != 0) { - memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min)); - strcpy(pkt_dev->src_min, buf); + strscpy_pad(pkt_dev->src_min, buf); pkt_dev->saddr_min = in_aton(pkt_dev->src_min); pkt_dev->cur_saddr = pkt_dev->saddr_min; } if (debug) pr_debug("src_min set to: %s\n", pkt_dev->src_min); - i += len; + sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min); return count; } if (!strcmp(name, "src_max")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1); + max = min(sizeof(pkt_dev->src_max) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1535,19 +1568,19 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; if (strcmp(buf, pkt_dev->src_max) != 0) { - memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max)); - strcpy(pkt_dev->src_max, buf); + strscpy_pad(pkt_dev->src_max, buf); pkt_dev->saddr_max = in_aton(pkt_dev->src_max); pkt_dev->cur_saddr = pkt_dev->saddr_max; } if (debug) pr_debug("src_max set to: %s\n", pkt_dev->src_max); - i += len; + sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max); return count; } if (!strcmp(name, "dst_mac")) { - len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + max = min(sizeof(valstr) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1564,7 +1597,8 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "src_mac")) { - len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + max = min(sizeof(valstr) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1588,11 +1622,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "flows")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value > MAX_CFLOWS) value = MAX_CFLOWS; @@ -1602,44 +1636,44 @@ static ssize_t pktgen_if_write(struct file *file, 
} #ifdef CONFIG_XFRM if (!strcmp(name, "spi")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->spi = value; sprintf(pg_result, "OK: spi=%u", pkt_dev->spi); return count; } #endif if (!strcmp(name, "flowlen")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->lflow = value; sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow); return count; } if (!strcmp(name, "queue_map_min")) { - len = num_arg(&user_buffer[i], 5, &value); + max = min(5, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->queue_map_min = value; sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min); return count; } if (!strcmp(name, "queue_map_max")) { - len = num_arg(&user_buffer[i], 5, &value); + max = min(5, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->queue_map_max = value; sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max); return count; @@ -1648,10 +1682,11 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "mpls")) { unsigned int n, cnt; - len = get_labels(&user_buffer[i], pkt_dev); + max = count - i; + len = get_labels(&user_buffer[i], max, pkt_dev); if (len < 0) return len; - i += len; + cnt = sprintf(pg_result, "OK: mpls="); for (n = 0; n < pkt_dev->nr_labels; n++) cnt += sprintf(pg_result + cnt, @@ -1669,11 +1704,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "vlan_id")) { - len = num_arg(&user_buffer[i], 4, &value); + max = min(4, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value <= 4095) { pkt_dev->vlan_id = value; /* turn on VLAN */ @@ -1696,11 +1731,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "vlan_p")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) { pkt_dev->vlan_p = value; sprintf(pg_result, "OK: vlan_p=%u", pkt_dev->vlan_p); @@ -1711,11 +1746,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "vlan_cfi")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) { pkt_dev->vlan_cfi = value; sprintf(pg_result, "OK: vlan_cfi=%u", pkt_dev->vlan_cfi); @@ -1726,11 +1761,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "svlan_id")) { - len = num_arg(&user_buffer[i], 4, &value); + max = min(4, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) { pkt_dev->svlan_id = value; /* turn on SVLAN */ @@ -1753,11 +1788,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "svlan_p")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) { pkt_dev->svlan_p = value; sprintf(pg_result, "OK: svlan_p=%u", pkt_dev->svlan_p); @@ -1768,11 +1803,11 @@ static ssize_t pktgen_if_write(struct file 
*file, } if (!strcmp(name, "svlan_cfi")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) { pkt_dev->svlan_cfi = value; sprintf(pg_result, "OK: svlan_cfi=%u", pkt_dev->svlan_cfi); @@ -1783,12 +1818,13 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "tos")) { - __u32 tmp_value = 0; - len = hex32_arg(&user_buffer[i], 2, &tmp_value); + __u32 tmp_value; + + max = min(2, count - i); + len = hex32_arg(&user_buffer[i], max, &tmp_value); if (len < 0) return len; - i += len; if (len == 2) { pkt_dev->tos = tmp_value; sprintf(pg_result, "OK: tos=0x%02x", pkt_dev->tos); @@ -1799,12 +1835,13 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "traffic_class")) { - __u32 tmp_value = 0; - len = hex32_arg(&user_buffer[i], 2, &tmp_value); + __u32 tmp_value; + + max = min(2, count - i); + len = hex32_arg(&user_buffer[i], max, &tmp_value); if (len < 0) return len; - i += len; if (len == 2) { pkt_dev->traffic_class = tmp_value; sprintf(pg_result, "OK: traffic_class=0x%02x", pkt_dev->traffic_class); @@ -1815,11 +1852,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "skb_priority")) { - len = num_arg(&user_buffer[i], 9, &value); + max = min(9, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->skb_priority = value; sprintf(pg_result, "OK: skb_priority=%i", pkt_dev->skb_priority); @@ -1874,12 +1911,13 @@ static int pktgen_thread_show(struct seq_file *seq, void *v) } static ssize_t pktgen_thread_write(struct file *file, - const char __user * user_buffer, - size_t count, loff_t * offset) + const char __user *user_buffer, + size_t count, loff_t *offset) { struct seq_file *seq = file->private_data; struct pktgen_thread *t = seq->private; - int i, max, len, ret; + size_t i, max; + ssize_t len, ret; char name[40]; char *pg_result; @@ -1896,8 +1934,8 @@ static ssize_t pktgen_thread_write(struct file *file, i = len; /* Read variable name */ - - len = strn_len(&user_buffer[i], sizeof(name) - 1); + max = min(sizeof(name) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1926,15 +1964,17 @@ static ssize_t pktgen_thread_write(struct file *file, if (!strcmp(name, "add_device")) { char f[32]; + memset(f, 0, 32); - len = strn_len(&user_buffer[i], sizeof(f) - 1); + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) { ret = len; goto out; } if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; - i += len; + mutex_lock(&pktgen_thread_lock); ret = pktgen_add_device(t, f); mutex_unlock(&pktgen_thread_lock); @@ -2358,15 +2398,16 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) } -#ifdef CONFIG_XFRM /* If there was already an IPSEC SA, we keep it as is, else * we go look for it ... 
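All of the pktgen_if_write() handlers above now follow one pattern: clamp the parser's window to what is left of the user buffer (max = min(width, count - i)) and treat the helper's return value as bytes consumed, so no helper can scan past count. A userspace sketch of such a bounded parser; get_user() and the __user annotation are dropped here since this reads a plain buffer, and the name merely mirrors the kernel helper:

#include <stddef.h>
#include <sys/types.h>

static ssize_t num_arg(const char *buf, size_t maxlen, unsigned long *num)
{
    size_t i;

    *num = 0;
    for (i = 0; i < maxlen; i++) {
        char c = buf[i];

        if (c < '0' || c > '9')
            break;              /* stop at the first non-digit */
        *num = *num * 10 + (c - '0');
    }
    return i;                   /* bytes consumed, never more than maxlen */
}

A caller at offset i in a count-byte buffer would invoke it as num_arg(&buf[i], maxlen - i < 10 ? maxlen - i : 10, &value), matching the min(10, count - i) clamps repeated throughout the hunks above.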
-*/ + */ #define DUMMY_MARK 0 static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) { +#ifdef CONFIG_XFRM struct xfrm_state *x = pkt_dev->flows[flow].x; struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id); + if (!x) { if (pkt_dev->spi) { @@ -2390,16 +2431,16 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) } } -} #endif +} static void set_cur_queue_map(struct pktgen_dev *pkt_dev) { - if (pkt_dev->flags & F_QUEUE_MAP_CPU) pkt_dev->cur_queue_map = smp_processor_id(); else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { __u16 t; + if (pkt_dev->flags & F_QUEUE_MAP_RND) { t = get_random_u32_inclusive(pkt_dev->queue_map_min, pkt_dev->queue_map_max); @@ -2481,6 +2522,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->flags & F_MPLS_RND) { unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) pkt_dev->labels[i] = MPLS_STACK_BOTTOM | @@ -2525,6 +2567,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) imx = ntohl(pkt_dev->saddr_max); if (imn < imx) { __u32 t; + if (pkt_dev->flags & F_IPSRC_RND) t = get_random_u32_inclusive(imn, imx - 1); else { @@ -2545,6 +2588,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (imn < imx) { __u32 t; __be32 s; + if (pkt_dev->flags & F_IPDST_RND) { do { @@ -2569,10 +2613,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->flows[flow].flags |= F_INIT; pkt_dev->flows[flow].cur_daddr = pkt_dev->cur_daddr; -#ifdef CONFIG_XFRM if (pkt_dev->flags & F_IPSEC) get_ipsec_sa(pkt_dev, flow); -#endif pkt_dev->nflows++; } } @@ -2594,6 +2636,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { __u32 t; + if (pkt_dev->flags & F_TXSIZE_RND) { t = get_random_u32_inclusive(pkt_dev->min_pkt_size, pkt_dev->max_pkt_size - 1); @@ -2660,7 +2703,8 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) if (!x) return 0; /* XXX: we dont support tunnel mode for now until * we resolve the dst issue */ + * we resolve the dst issue + */ if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0)) return 0; @@ -2695,8 +2739,10 @@ static void free_SAs(struct pktgen_dev *pkt_dev) if (pkt_dev->cflows) { /* let go of the SAs if we have them */ int i; + for (i = 0; i < pkt_dev->cflows; i++) { struct xfrm_state *x = pkt_dev->flows[i].x; + if (x) { xfrm_state_put(x); pkt_dev->flows[i].x = NULL; @@ -2711,6 +2757,7 @@ static int process_ipsec(struct pktgen_dev *pkt_dev, if (pkt_dev->flags & F_IPSEC) { struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int nhead = 0; + if (x) { struct ethhdr *eth; struct iphdr *iph; @@ -2754,6 +2801,7 @@ err: static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev) { unsigned int i; + for (i = 0; i < pkt_dev->nr_labels; i++) *mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM; @@ -2866,7 +2914,7 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, skb->dev = dev; } } else { - skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); + skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); } /* the caller pre-fetches from skb->data and reserves for the mac hdr */ @@ -2947,7 +2995,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb->priority = pkt_dev->skb_priority; memcpy(eth, pkt_dev->hh, 12); - *(__be16 *) & eth[12] = protocol; + *(__be16 *)&eth[12] = protocol; /* Eth + IPh + UDPh + mpls */ datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 - @@ -3176,11 +3224,11 @@ static void
pktgen_run(struct pktgen_thread *t) set_pkt_overhead(pkt_dev); - strcpy(pkt_dev->result, "Starting"); + strscpy(pkt_dev->result, "Starting"); pkt_dev->running = 1; /* Cranke yeself! */ started++; } else - strcpy(pkt_dev->result, "Error starting"); + strscpy(pkt_dev->result, "Error starting"); } rcu_read_unlock(); if (started) @@ -3439,6 +3487,7 @@ static void pktgen_rem_thread(struct pktgen_thread *t) static void pktgen_resched(struct pktgen_dev *pkt_dev) { ktime_t idle_start = ktime_get(); + schedule(); pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start)); } @@ -3754,7 +3803,8 @@ static int add_dev_to_thread(struct pktgen_thread *t, * userspace on another CPU than the kthread. The if_lock() * is used here to sync with concurrent instances of * _rem_dev_from_if_list() invoked via kthread, which is also - * updating the if_list */ + * updating the if_list + */ if_lock(t); if (pkt_dev->pg_thread) { @@ -3792,7 +3842,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) if (!pkt_dev) return -ENOMEM; - strcpy(pkt_dev->odevname, ifname); + strscpy(pkt_dev->odevname, ifname); pkt_dev->flows = vzalloc_node(array_size(MAX_CFLOWS, sizeof(struct flow_state)), node); @@ -3883,17 +3933,14 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn) list_add_tail(&t->th_list, &pn->pktgen_threads); init_completion(&t->start_done); - p = kthread_create_on_node(pktgen_thread_worker, - t, - cpu_to_node(cpu), - "kpktgend_%d", cpu); + p = kthread_create_on_cpu(pktgen_thread_worker, t, cpu, "kpktgend_%d"); if (IS_ERR(p)) { pr_err("kthread_create_on_node() failed for cpu %d\n", t->cpu); list_del(&t->th_list); kfree(t); return PTR_ERR(p); } - kthread_bind(p, cpu); + t->tsk = p; pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir, @@ -3952,7 +3999,8 @@ static int pktgen_remove_device(struct pktgen_thread *t, /* Remove proc before if_list entry, because add_device uses * list to determine if interface already exist, avoid race - * with proc_create_data() */ + * with proc_create_data() + */ proc_remove(pkt_dev->entry); /* And update the thread if_list */ diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d9f959c619d9..c57692eb8da9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -53,6 +53,7 @@ #include <net/fib_rules.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> +#include <net/netdev_lock.h> #include <net/devlink.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/addrconf.h> @@ -80,11 +81,15 @@ void rtnl_lock(void) } EXPORT_SYMBOL(rtnl_lock); +int rtnl_lock_interruptible(void) +{ + return mutex_lock_interruptible(&rtnl_mutex); +} + int rtnl_lock_killable(void) { return mutex_lock_killable(&rtnl_mutex); } -EXPORT_SYMBOL(rtnl_lock_killable); static struct sk_buff *defer_kfree_skb_list; void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail) @@ -221,6 +226,16 @@ int rtnl_net_trylock(struct net *net) } EXPORT_SYMBOL(rtnl_net_trylock); +int rtnl_net_lock_killable(struct net *net) +{ + int ret = rtnl_lock_killable(); + + if (!ret) + __rtnl_net_lock(net); + + return ret; +} + static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b) { if (net_eq(net_a, net_b)) @@ -1028,7 +1043,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); -void netdev_set_operstate(struct net_device *dev, int newstate) +void netif_set_operstate(struct net_device *dev, int newstate) { unsigned int old = READ_ONCE(dev->operstate); @@ -1037,9 +1052,9 @@ void 
netdev_set_operstate(struct net_device *dev, int newstate) return; } while (!try_cmpxchg(&dev->operstate, &old, newstate)); - netdev_state_change(dev); + netif_state_change(dev); } -EXPORT_SYMBOL(netdev_set_operstate); +EXPORT_SYMBOL(netif_set_operstate); static void set_operstate(struct net_device *dev, unsigned char transition) { @@ -1065,7 +1080,7 @@ static void set_operstate(struct net_device *dev, unsigned char transition) break; } - netdev_set_operstate(dev, operstate); + netif_set_operstate(dev, operstate); } static unsigned int rtnl_dev_get_flags(const struct net_device *dev) @@ -1162,6 +1177,9 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, /* IFLA_VF_STATS_TX_DROPPED */ nla_total_size_64bit(sizeof(__u64))); } + if (dev->netdev_ops->ndo_get_vf_guid) + size += num_vfs * 2 * + nla_total_size(sizeof(struct ifla_vf_guid)); return size; } else return 0; @@ -1278,6 +1296,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + + nla_total_size(1) /* IFLA_NETNS_IMMUTABLE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ + nla_total_size(4) /* IFLA_LINK_NETNSID */ + nla_total_size(4) /* IFLA_GROUP */ @@ -2032,6 +2051,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN) || nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) || + nla_put_u8(skb, IFLA_NETNS_IMMUTABLE, dev->netns_immutable) || nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) || nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) || @@ -2220,6 +2240,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_ALLMULTI] = { .type = NLA_REJECT }, [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2369,12 +2390,12 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, if (strict_check) { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for link dump"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); @@ -2895,12 +2916,19 @@ static int do_set_master(struct net_device *dev, int ifindex, const struct net_device_ops *ops; int err; + /* Release the lower lock, the upper is responsible for locking + * the lower if needed. None of the existing upper devices + * use netdev instance lock, so don't grab it. 
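do_set_master() above drops the lower device's instance lock around ndo_add_slave()/ndo_del_slave() because lock ordering makes the upper device responsible for any locking of its own. The shape of that unlock-call-relock dance, sketched with a pthread mutex standing in for the netdev instance lock; the types and names are illustrative only:

#include <pthread.h>

struct toy_dev {
    pthread_mutex_t ops_lock;   /* stands in for the netdev instance lock */
};

static int call_unlocked(struct toy_dev *lower,
                         int (*cb)(struct toy_dev *lower))
{
    int err;

    /* drop the lower lock: the callback may take other devices' locks */
    pthread_mutex_unlock(&lower->ops_lock);
    err = cb(lower);
    pthread_mutex_lock(&lower->ops_lock);
    return err;                 /* caller still holds the lower lock */
}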
+ */ + if (upper_dev) { if (upper_dev->ifindex == ifindex) return 0; ops = upper_dev->netdev_ops; if (ops->ndo_del_slave) { + netdev_unlock_ops(dev); err = ops->ndo_del_slave(upper_dev, dev); + netdev_lock_ops(dev); if (err) return err; } else { @@ -2914,7 +2942,9 @@ static int do_set_master(struct net_device *dev, int ifindex, return -EINVAL; ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { + netdev_unlock_ops(dev); err = ops->ndo_add_slave(upper_dev, dev, extack); + netdev_lock_ops(dev); if (err) return err; } else { @@ -2964,7 +2994,7 @@ static int do_set_proto_down(struct net_device *dev, if (pdreason[IFLA_PROTO_DOWN_REASON_MASK]) mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]); - dev_change_proto_down_reason(dev, mask, value); + netdev_change_proto_down_reason_locked(dev, mask, value); } if (nl_proto_down) { @@ -2975,8 +3005,7 @@ static int do_set_proto_down(struct net_device *dev, NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons"); return -EBUSY; } - err = dev_change_proto_down(dev, - proto_down); + err = netif_change_proto_down(dev, proto_down); if (err) return err; } @@ -2998,7 +3027,7 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, err = validate_linkmsg(dev, tb, extack); if (err < 0) - goto errout; + return err; if (tb[IFLA_IFNAME]) nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); @@ -3011,13 +3040,16 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, new_ifindex = nla_get_s32_default(tb[IFLA_NEW_IFINDEX], 0); - err = __dev_change_net_namespace(dev, tgt_net, pat, new_ifindex); + err = __dev_change_net_namespace(dev, tgt_net, pat, + new_ifindex, extack); if (err) - goto errout; + return err; status |= DO_SETLINK_MODIFIED; } + netdev_lock_ops(dev); + if (tb[IFLA_MAP]) { struct rtnl_link_ifmap *u_map; struct ifmap k_map; @@ -3048,35 +3080,35 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, } if (tb[IFLA_ADDRESS]) { - struct sockaddr *sa; - int len; - - len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len, - sizeof(*sa)); - sa = kmalloc(len, GFP_KERNEL); - if (!sa) { - err = -ENOMEM; + struct sockaddr_storage ss = { }; + + netdev_unlock_ops(dev); + + /* dev_addr_sem is an outer lock, enforce proper ordering */ + down_write(&dev_addr_sem); + netdev_lock_ops(dev); + + ss.ss_family = dev->type; + memcpy(ss.__data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); + err = netif_set_mac_address(dev, &ss, extack); + if (err) { + up_write(&dev_addr_sem); goto errout; } - sa->sa_family = dev->type; - memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), - dev->addr_len); - err = dev_set_mac_address_user(dev, sa, extack); - kfree(sa); - if (err) - goto errout; status |= DO_SETLINK_MODIFIED; + + up_write(&dev_addr_sem); } if (tb[IFLA_MTU]) { - err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack); + err = netif_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GROUP]) { - dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); status |= DO_SETLINK_NOTIFY; } @@ -3086,15 +3118,15 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, * requested. 
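The IFLA_ADDRESS handling above replaces a variable-length kmalloc'd sockaddr with a stack struct sockaddr_storage, which is by definition large enough for any address family, so the error path for a failed allocation disappears. A userspace approximation of the same fill; the kernel struct exposes a named __data flexible area, userspace has no portable equivalent, so this sketch assumes the family field leads the struct, as it does on Linux:

#include <string.h>
#include <sys/socket.h>

static int fill_hw_addr(struct sockaddr_storage *ss, sa_family_t family,
                        const void *addr, size_t addr_len)
{
    if (addr_len > sizeof(*ss) - sizeof(sa_family_t))
        return -1;              /* would not fit even sockaddr_storage */

    memset(ss, 0, sizeof(*ss));
    ss->ss_family = family;
    /* copy past the family field; the kernel writes into ss.__data */
    memcpy((char *)ss + sizeof(sa_family_t), addr, addr_len);
    return 0;
}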
*/ if (ifm->ifi_index > 0 && ifname[0]) { - err = dev_change_name(dev, ifname); + err = netif_change_name(dev, ifname); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_IFALIAS]) { - err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), - nla_len(tb[IFLA_IFALIAS])); + err = netif_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), + nla_len(tb[IFLA_IFALIAS])); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; @@ -3106,8 +3138,8 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, } if (ifm->ifi_flags || ifm->ifi_change) { - err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), - extack); + err = netif_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + extack); if (err < 0) goto errout; } @@ -3120,7 +3152,7 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, } if (tb[IFLA_CARRIER]) { - err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); + err = netif_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) goto errout; status |= DO_SETLINK_MODIFIED; @@ -3129,7 +3161,7 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, if (tb[IFLA_TXQLEN]) { unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); - err = dev_change_tx_queue_len(dev, value); + err = netif_change_tx_queue_len(dev, value); if (err) goto errout; status |= DO_SETLINK_MODIFIED; @@ -3353,13 +3385,15 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, errout: if (status & DO_SETLINK_MODIFIED) { if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY) - netdev_state_change(dev); + netif_state_change(dev); if (err < 0) net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n", dev->name); } + netdev_unlock_ops(dev); + return err; } @@ -3423,6 +3457,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, err = -ENODEV; rtnl_nets_unlock(&rtnl_nets); + rtnl_nets_destroy(&rtnl_nets); errout: return err; } @@ -3534,7 +3569,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh) { - unsigned int old_flags; + unsigned int old_flags, changed; int err; old_flags = dev->flags; @@ -3545,12 +3580,13 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, return err; } - if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { - __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags), portid, nlh); - } else { - dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - __dev_notify_flags(dev, old_flags, ~0U, portid, nlh); + changed = old_flags ^ dev->flags; + if (dev->rtnl_link_initializing) { + dev->rtnl_link_initializing = false; + changed = ~0U; } + + __dev_notify_flags(dev, old_flags, changed, portid, nlh); return 0; } EXPORT_SYMBOL(rtnl_configure_link); @@ -3608,7 +3644,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, dev_net_set(dev, net); dev->rtnl_link_ops = ops; - dev->rtnl_link_state = RTNL_LINK_INITIALIZING; + dev->rtnl_link_initializing = true; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); @@ -3635,7 +3671,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, if (tb[IFLA_LINKMODE]) dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) - dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); if 
(tb[IFLA_GSO_MAX_SIZE]) netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); if (tb[IFLA_GSO_MAX_SEGS]) @@ -3752,7 +3788,13 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, struct netlink_ext_ack *extack) { unsigned char name_assign_type = NET_NAME_USER; - struct net *net = sock_net(skb->sk); + struct rtnl_newlink_params params = { + .src_net = sock_net(skb->sk), + .link_net = link_net, + .peer_net = peer_net, + .tb = tb, + .data = data, + }; u32 portid = NETLINK_CB(skb).portid; struct net_device *dev; char ifname[IFNAMSIZ]; @@ -3768,8 +3810,8 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, name_assign_type = NET_NAME_ENUM; } - dev = rtnl_create_link(link_net ? : tgt_net, ifname, - name_assign_type, ops, tb, extack); + dev = rtnl_create_link(tgt_net, ifname, name_assign_type, ops, tb, + extack); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out; @@ -3777,13 +3819,8 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, dev->ifindex = ifm->ifi_index; - if (link_net) - net = link_net; - if (peer_net) - net = peer_net; - if (ops->newlink) - err = ops->newlink(net, dev, tb, data, extack); + err = ops->newlink(dev, &params, extack); else err = register_netdevice(dev); if (err < 0) { @@ -3791,22 +3828,22 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, goto out; } + netdev_lock_ops(dev); + err = rtnl_configure_link(dev, ifm, portid, nlh); if (err < 0) goto out_unregister; - if (link_net) { - err = dev_change_net_namespace(dev, tgt_net, ifname); - if (err < 0) - goto out_unregister; - } if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto out_unregister; } + + netdev_unlock_ops(dev); out: return err; out_unregister: + netdev_unlock_ops(dev); if (ops->newlink) { LIST_HEAD(list_kill); @@ -3852,20 +3889,26 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, { struct nlattr ** const tb = tbs->tb; struct net *net = sock_net(skb->sk); + struct net *device_net; struct net_device *dev; struct ifinfomsg *ifm; bool link_specified; + /* When creating, lookup for existing device in target net namespace */ + device_net = (nlh->nlmsg_flags & NLM_F_CREATE) && + (nlh->nlmsg_flags & NLM_F_EXCL) ?
+ tgt_net : net; + ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) { link_specified = true; - dev = __dev_get_by_index(net, ifm->ifi_index); + dev = __dev_get_by_index(device_net, ifm->ifi_index); } else if (ifm->ifi_index < 0) { NL_SET_ERR_MSG(extack, "ifindex can't be negative"); return -EINVAL; } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) { link_specified = true; - dev = rtnl_dev_get(net, tb); + dev = rtnl_dev_get(device_net, tb); } else { link_specified = false; dev = NULL; @@ -4030,7 +4073,8 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, struct ifinfomsg *ifm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for get link"); return -EINVAL; } @@ -4039,7 +4083,6 @@ static int rtnl_valid_getlink_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for get link request"); @@ -4765,15 +4808,16 @@ static int nlmsg_populate_fdb(struct sk_buff *skb, int *idx, struct netdev_hw_addr_list *list) { + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; struct netdev_hw_addr *ha; - int err; u32 portid, seq; + int err; portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; list_for_each_entry(ha, &list->list, list) { - if (*idx < cb->args[2]) + if (*idx < ctx->fdb_idx) goto skip; err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, @@ -4829,12 +4873,12 @@ static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request"); @@ -4912,18 +4956,16 @@ static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh, static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { - struct net_device *dev; - struct net_device *br_dev = NULL; - const struct net_device_ops *ops = NULL; - const struct net_device_ops *cops = NULL; + const struct net_device_ops *ops = NULL, *cops = NULL; + struct ndo_fdb_dump_context *ctx = (void *)cb->ctx; + struct net_device *dev, *br_dev = NULL; struct net *net = sock_net(skb->sk); - struct hlist_head *head; int brport_idx = 0; int br_idx = 0; - int h, s_h; - int idx = 0, s_idx; - int err = 0; int fidx = 0; + int err; + + NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context); if (cb->strict_check) err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx, @@ -4942,70 +4984,51 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) ops = br_dev->netdev_ops; } - s_h = cb->args[0]; - s_idx = cb->args[1]; - - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &net->dev_index_head[h]; - hlist_for_each_entry(dev, head, index_hlist) { - - if (brport_idx && (dev->ifindex != brport_idx)) - continue; - - if (!br_idx) { /* user did not specify a specific bridge */ - if (netif_is_bridge_port(dev)) { - br_dev = netdev_master_upper_dev_get(dev); - cops = br_dev->netdev_ops; - } - } else { - if (dev != br_dev && - !netif_is_bridge_port(dev)) - continue; + for_each_netdev_dump(net, dev, ctx->ifindex) { + if 
(brport_idx && (dev->ifindex != brport_idx)) + continue; - if (br_dev != netdev_master_upper_dev_get(dev) && - !netif_is_bridge_master(dev)) - continue; - cops = ops; + if (!br_idx) { /* user did not specify a specific bridge */ + if (netif_is_bridge_port(dev)) { + br_dev = netdev_master_upper_dev_get(dev); + cops = br_dev->netdev_ops; } + } else { + if (dev != br_dev && + !netif_is_bridge_port(dev)) + continue; - if (idx < s_idx) - goto cont; + if (br_dev != netdev_master_upper_dev_get(dev) && + !netif_is_bridge_master(dev)) + continue; + cops = ops; + } - if (netif_is_bridge_port(dev)) { - if (cops && cops->ndo_fdb_dump) { - err = cops->ndo_fdb_dump(skb, cb, - br_dev, dev, - &fidx); - if (err == -EMSGSIZE) - goto out; - } + if (netif_is_bridge_port(dev)) { + if (cops && cops->ndo_fdb_dump) { + err = cops->ndo_fdb_dump(skb, cb, br_dev, dev, + &fidx); + if (err == -EMSGSIZE) + break; } + } - if (dev->netdev_ops->ndo_fdb_dump) - err = dev->netdev_ops->ndo_fdb_dump(skb, cb, - dev, NULL, - &fidx); - else - err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, - &fidx); - if (err == -EMSGSIZE) - goto out; + if (dev->netdev_ops->ndo_fdb_dump) + err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, + &fidx); + else + err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx); + if (err == -EMSGSIZE) + break; - cops = NULL; + cops = NULL; - /* reset fdb offset to 0 for rest of the interfaces */ - cb->args[2] = 0; - fidx = 0; -cont: - idx++; - } + /* reset fdb offset to 0 for rest of the interfaces */ + ctx->fdb_idx = 0; + fidx = 0; } -out: - cb->args[0] = h; - cb->args[1] = idx; - cb->args[2] = fidx; + ctx->fdb_idx = fidx; return skb->len; } @@ -5018,12 +5041,12 @@ static int valid_fdb_get_strict(const struct nlmsghdr *nlh, struct ndmsg *ndm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { + ndm = nlmsg_payload(nlh, sizeof(*ndm)); + if (!ndm) { NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); return -EINVAL; } - ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); @@ -5290,12 +5313,12 @@ static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, if (strict_check) { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request"); @@ -6187,7 +6210,8 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, { struct if_stats_msg *ifsm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifsm))) { + ifsm = nlmsg_payload(nlh, sizeof(*ifsm)); + if (!ifsm) { NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); return -EINVAL; } @@ -6195,8 +6219,6 @@ static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, if (!strict_check) return 0; - ifsm = nlmsg_data(nlh); - /* only requests using strict checks can pass data to influence * the dump. The legacy exception is filter_mask. 
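
The rewritten loop above swaps the old hash-bucket walk and raw cb->args[] cursor for for_each_netdev_dump() plus a typed context overlaid on cb->ctx. A minimal sketch of that dump pattern follows; the context struct here is illustrative (the real one is struct ndo_fdb_dump_context) and the per-device fill step is elided.

struct example_dump_ctx {
	unsigned long ifindex;	/* resume cursor for for_each_netdev_dump() */
	int entry_idx;		/* offset within the current device */
};

static int example_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct example_dump_ctx *ctx = (void *)cb->ctx;
	struct net *net = sock_net(skb->sk);
	struct net_device *dev;

	NL_ASSERT_CTX_FITS(struct example_dump_ctx);	/* compile-time size check */

	for_each_netdev_dump(net, dev, ctx->ifindex) {
		/* emit entries for @dev; on -EMSGSIZE simply break --
		 * the cursor left in ctx->ifindex resumes the walk on
		 * the next invocation, ctx->entry_idx the partial device
		 */
	}
	return skb->len;
}
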
*/ @@ -6424,12 +6446,12 @@ static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh, { struct br_port_msg *bpm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*bpm))) { + bpm = nlmsg_payload(nlh, sizeof(*bpm)); + if (!bpm) { NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request"); return -EINVAL; } - bpm = nlmsg_data(nlh); if (bpm->ifindex) { NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request"); return -EINVAL; diff --git a/net/core/scm.c b/net/core/scm.c index 4f6a14babe5a..0225bd94170f 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -282,6 +282,16 @@ efault: } EXPORT_SYMBOL(put_cmsg); +int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len, + void *data) +{ + /* Don't produce truncated CMSGs */ + if (!msg->msg_control || msg->msg_controllen < CMSG_LEN(len)) + return -ETOOSMALL; + + return put_cmsg(msg, level, type, len, data); +} + void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) { struct scm_timestamping64 tss; @@ -394,3 +404,125 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) return new_fpl; } EXPORT_SYMBOL(scm_fp_dup); + +#ifdef CONFIG_SECURITY_NETWORK +static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm) +{ + struct lsm_context ctx; + int err; + + if (sk->sk_scm_security) { + err = security_secid_to_secctx(scm->secid, &ctx); + + if (err >= 0) { + put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len, + ctx.context); + + security_release_secctx(&ctx); + } + } +} + +static bool scm_has_secdata(struct sock *sk) +{ + return sk->sk_scm_security; +} +#else +static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm) +{ +} + +static bool scm_has_secdata(struct sock *sk) +{ + return false; +} +#endif + +static void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm) +{ + struct file *pidfd_file = NULL; + int len, pidfd; + + /* put_cmsg() doesn't return an error if CMSG is truncated, + * that's why we need to opencode these checks here. 
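
The open-coded length check above exists because put_cmsg() silently truncates; for SCM_PIDFD a clipped payload would install an fd the receiver never sees. A hedged userspace sketch of the matching receive side (socket name illustrative):

#include <string.h>
#include <sys/socket.h>

static int recv_pidfd(int sock_fd)
{
	union {
		char buf[CMSG_SPACE(sizeof(int))];	/* room for one full int cmsg */
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_control	= u.buf,
		.msg_controllen	= sizeof(u.buf),
	};
	struct cmsghdr *c;
	int pidfd = -1;

	if (recvmsg(sock_fd, &msg, 0) < 0 || (msg.msg_flags & MSG_CTRUNC))
		return -1;	/* control buffer too small: no pidfd was delivered */

	for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c))
		if (c->cmsg_level == SOL_SOCKET && c->cmsg_type == SCM_PIDFD)
			memcpy(&pidfd, CMSG_DATA(c), sizeof(pidfd));

	return pidfd;
}
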
+ */ + if (msg->msg_flags & MSG_CMSG_COMPAT) + len = sizeof(struct compat_cmsghdr) + sizeof(int); + else + len = sizeof(struct cmsghdr) + sizeof(int); + + if (msg->msg_controllen < len) { + msg->msg_flags |= MSG_CTRUNC; + return; + } + + if (!scm->pid) + return; + + pidfd = pidfd_prepare(scm->pid, 0, &pidfd_file); + + if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) { + if (pidfd_file) { + put_unused_fd(pidfd); + fput(pidfd_file); + } + + return; + } + + if (pidfd_file) + fd_install(pidfd, pidfd_file); +} + +static bool __scm_recv_common(struct sock *sk, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!msg->msg_control) { + if (sk->sk_scm_credentials || sk->sk_scm_pidfd || + scm->fp || scm_has_secdata(sk)) + msg->msg_flags |= MSG_CTRUNC; + + scm_destroy(scm); + return false; + } + + if (sk->sk_scm_credentials) { + struct user_namespace *current_ns = current_user_ns(); + struct ucred ucreds = { + .pid = scm->creds.pid, + .uid = from_kuid_munged(current_ns, scm->creds.uid), + .gid = from_kgid_munged(current_ns, scm->creds.gid), + }; + + put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); + } + + scm_passec(sk, msg, scm); + + if (scm->fp) + scm_detach_fds(msg, scm); + + return true; +} + +void scm_recv(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!__scm_recv_common(sock->sk, msg, scm, flags)) + return; + + scm_destroy_cred(scm); +} +EXPORT_SYMBOL(scm_recv); + +void scm_recv_unix(struct socket *sock, struct msghdr *msg, + struct scm_cookie *scm, int flags) +{ + if (!__scm_recv_common(sock->sk, msg, scm, flags)) + return; + + if (sock->sk->sk_scm_pidfd) + scm_pidfd_recv(msg, scm); + + scm_destroy_cred(scm); +} diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index b0ff6153be62..9a3965680451 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -71,7 +71,7 @@ u32 secure_tcpv6_ts_off(const struct net *net, return siphash(&combined, offsetofend(typeof(combined), daddr), &ts_secret); } -EXPORT_SYMBOL(secure_tcpv6_ts_off); +EXPORT_IPV6_MOD(secure_tcpv6_ts_off); u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport) @@ -156,45 +156,3 @@ u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) } EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); #endif - -#if IS_ENABLED(CONFIG_IP_DCCP) -u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport) -{ - u64 seq; - net_secret_init(); - seq = siphash_3u32((__force u32)saddr, (__force u32)daddr, - (__force u32)sport << 16 | (__force u32)dport, - &net_secret); - seq += ktime_get_real_ns(); - seq &= (1ull << 48) - 1; - return seq; -} -EXPORT_SYMBOL(secure_dccp_sequence_number); - -#if IS_ENABLED(CONFIG_IPV6) -u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, - __be16 sport, __be16 dport) -{ - const struct { - struct in6_addr saddr; - struct in6_addr daddr; - __be16 sport; - __be16 dport; - } __aligned(SIPHASH_ALIGNMENT) combined = { - .saddr = *(struct in6_addr *)saddr, - .daddr = *(struct in6_addr *)daddr, - .sport = sport, - .dport = dport - }; - u64 seq; - net_secret_init(); - seq = siphash(&combined, offsetofend(typeof(combined), dport), - &net_secret); - seq += ktime_get_real_ns(); - seq &= (1ull << 48) - 1; - return seq; -} -EXPORT_SYMBOL(secure_dccpv6_sequence_number); -#endif -#endif diff --git a/net/core/selftests.c b/net/core/selftests.c index 8f801e6e3b91..35f807ea9952 100644 --- a/net/core/selftests.c +++ b/net/core/selftests.c @@ -100,10 +100,10 
@@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, ehdr->h_proto = htons(ETH_P_IP); if (attr->tcp) { + memset(thdr, 0, sizeof(*thdr)); thdr->source = htons(attr->sport); thdr->dest = htons(attr->dport); thdr->doff = sizeof(struct tcphdr) / 4; - thdr->check = 0; } else { uhdr->source = htons(attr->sport); uhdr->dest = htons(attr->dport); @@ -144,10 +144,18 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, attr->id = net_test_next_id; shdr->id = net_test_next_id++; - if (attr->size) - skb_put(skb, attr->size); - if (attr->max_size && attr->max_size > skb->len) - skb_put(skb, attr->max_size - skb->len); + if (attr->size) { + void *payload = skb_put(skb, attr->size); + + memset(payload, 0, attr->size); + } + + if (attr->max_size && attr->max_size > skb->len) { + size_t pad_len = attr->max_size - skb->len; + void *pad = skb_put(skb, pad_len); + + memset(pad, 0, pad_len); + } skb->csum = 0; skb->ip_summed = CHECKSUM_PARTIAL; @@ -299,7 +307,7 @@ static int net_test_phy_loopback_enable(struct net_device *ndev) if (!ndev->phydev) return -EOPNOTSUPP; - return phy_loopback(ndev->phydev, true); + return phy_loopback(ndev->phydev, true, 0); } static int net_test_phy_loopback_disable(struct net_device *ndev) @@ -307,7 +315,7 @@ static int net_test_phy_loopback_disable(struct net_device *ndev) if (!ndev->phydev) return -EOPNOTSUPP; - return phy_loopback(ndev->phydev, false); + return phy_loopback(ndev->phydev, false, 0); } static int net_test_phy_loopback_udp(struct net_device *ndev) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6841e61a6bd0..85fc82f72d26 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -64,11 +64,13 @@ #include <linux/mpls.h> #include <linux/kcov.h> #include <linux/iov_iter.h> +#include <linux/crc32.h> #include <net/protocol.h> #include <net/dst.h> #include <net/sock.h> #include <net/checksum.h> +#include <net/gro.h> #include <net/gso.h> #include <net/hotdata.h> #include <net/ip6_checksum.h> @@ -88,6 +90,7 @@ #include <linux/textsearch.h> #include "dev.h" +#include "devmem.h" #include "netmem_priv.h" #include "sock_destructor.h" @@ -95,7 +98,9 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init; #endif -#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER) +#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN) +#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \ + GRO_MAX_HEAD_PAD)) /* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique @@ -220,67 +225,9 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) #define NAPI_SKB_CACHE_BULK 16 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) -#if PAGE_SIZE == SZ_4K - -#define NAPI_HAS_SMALL_PAGE_FRAG 1 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) - -/* specialized page frag allocator using a single order 0 page - * and slicing it into 1K sized fragment. 
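
The memset() calls added above ensure the generated test frame carries no uninitialized payload. skb_put_zero() is the kernel's one-call form of the same skb_put()-then-clear pattern, shown here as an equivalent sketch:

	/* equivalent to the skb_put() + memset() pairs in the hunk above */
	if (attr->size)
		skb_put_zero(skb, attr->size);
	if (attr->max_size && attr->max_size > skb->len)
		skb_put_zero(skb, attr->max_size - skb->len);
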
Constrained to systems
- * with a very limited amount of 1K fragments fitting a single
- * page - to avoid excessive truesize underestimation
- */
-
-struct page_frag_1k {
-	void *va;
-	u16 offset;
-	bool pfmemalloc;
-};
-
-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
-{
-	struct page *page;
-	int offset;
-
-	offset = nc->offset - SZ_1K;
-	if (likely(offset >= 0))
-		goto use_frag;
-
-	page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
-	if (!page)
-		return NULL;
-
-	nc->va = page_address(page);
-	nc->pfmemalloc = page_is_pfmemalloc(page);
-	offset = PAGE_SIZE - SZ_1K;
-	page_ref_add(page, offset / SZ_1K);
-
-use_frag:
-	nc->offset = offset;
-	return nc->va + offset;
-}
-#else
-
-/* the small page is actually unused in this build; add dummy helpers
- * to please the compiler and avoid later preprocessor's conditionals
- */
-#define NAPI_HAS_SMALL_PAGE_FRAG	0
-#define NAPI_SMALL_PAGE_PFMEMALLOC(nc)	false
-
-struct page_frag_1k {
-};
-
-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
-{
-	return NULL;
-}
-
-#endif
-
 struct napi_alloc_cache {
 	local_lock_t bh_lock;
 	struct page_frag_cache page;
-	struct page_frag_1k page_small;
 	unsigned int skb_count;
 	void *skb_cache[NAPI_SKB_CACHE_SIZE];
 };
@@ -290,23 +237,6 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
 	.bh_lock = INIT_LOCAL_LOCK(bh_lock),
 };
 
-/* Double check that napi_get_frags() allocates skbs with
- * skb->head being backed by slab, not a page fragment.
- * This is to make sure bug fixed in 3226b158e67c
- * ("net: avoid 32 x truesize under-estimation for tiny skbs")
- * does not accidentally come back.
- */
-void napi_get_frags_check(struct napi_struct *napi)
-{
-	struct sk_buff *skb;
-
-	local_bh_disable();
-	skb = napi_get_frags(napi);
-	WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
-	napi_free_frags(napi);
-	local_bh_enable();
-}
-
 void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
@@ -367,6 +297,68 @@ static struct sk_buff *napi_skb_cache_get(void)
 	return skb;
 }
 
+/**
+ * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache
+ * @skbs: pointer to an at least @n-sized array to fill with skb pointers
+ * @n: number of entries to provide
+ *
+ * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes
+ * the pointers into the provided array @skbs. If there are fewer entries
+ * available, tries to replenish the cache and bulk-allocates the diff from
+ * the MM layer if needed.
+ * The heads are zeroed with either memset() or %__GFP_ZERO, so they are
+ * ready for {,__}build_skb_around() and don't have any data buffers attached.
+ * Must be called *only* from the BH context.
+ *
+ * Return: number of successfully allocated skbs (@n if no actual allocation
+ * needed or kmem_cache_alloc_bulk() didn't fail).
+ */
+u32 napi_skb_cache_get_bulk(void **skbs, u32 n)
+{
+	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+	u32 bulk, total = n;
+
+	local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+
+	if (nc->skb_count >= n)
+		goto get;
+
+	/* Not enough cached skbs. Try refilling the cache first */
+	bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK);
+	nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
					       GFP_ATOMIC | __GFP_NOWARN, bulk,
					       &nc->skb_cache[nc->skb_count]);
+	if (likely(nc->skb_count >= n))
+		goto get;
+
+	/* Still not enough.
Bulk-allocate the missing part directly, zeroed */ + n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN, + n - nc->skb_count, &skbs[nc->skb_count]); + if (likely(nc->skb_count >= n)) + goto get; + + /* kmem_cache didn't allocate the number we need, limit the output */ + total -= n - nc->skb_count; + n = nc->skb_count; + +get: + for (u32 base = nc->skb_count - n, i = 0; i < n; i++) { + u32 cache_size = kmem_cache_size(net_hotdata.skbuff_cache); + + skbs[i] = nc->skb_cache[base + i]; + + kasan_mempool_unpoison_object(skbs[i], cache_size); + memset(skbs[i], 0, offsetof(struct sk_buff, tail)); + } + + nc->skb_count -= n; + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); + + return total; +} +EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk); + static inline void __finalize_skb_around(struct sk_buff *skb, void *data, unsigned int size) { @@ -736,7 +728,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. */ - if (len <= SKB_WITH_OVERHEAD(1024) || + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); @@ -813,10 +805,8 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. - * When the small frag allocator is available, prefer it over kmalloc - * for small fragments */ - if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, @@ -826,32 +816,16 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) goto skb_success; } + len = SKB_HEAD_ALIGN(len); + if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache); - if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { - /* we are artificially inflating the allocation size, but - * that is not as bad as it may look like, as: - * - 'len' less than GRO_MAX_HEAD makes little sense - * - On most systems, larger 'len' values lead to fragment - * size above 512 bytes - * - kmalloc would use the kmalloc-1k slab for such values - * - Builds with smaller GRO_MAX_HEAD will very likely do - * little networking, as that implies no WiFi and no - * tunnels support, and 32 bits arches. 
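
A consumer-side sketch of napi_skb_cache_get_bulk() from the hunk above, e.g. materializing a batch of skb heads inside a NAPI poll; the batch size and surrounding names are illustrative:

#define RX_BATCH	16

static u32 example_get_heads(void **skbs, u32 wanted)
{
	u32 got;

	/* BH (NAPI poll) context only; may return fewer than requested
	 * under memory pressure
	 */
	got = napi_skb_cache_get_bulk(skbs, min(wanted, (u32)RX_BATCH));

	/* each head comes back zeroed with no data buffer attached,
	 * ready to be paired with one via __build_skb_around()
	 */
	return got;
}
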
- */ - len = SZ_1K; - - data = page_frag_alloc_1k(&nc->page_small, gfp_mask); - pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); - } else { - len = SKB_HEAD_ALIGN(len); - data = page_frag_alloc(&nc->page, len, gfp_mask); - pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); - } + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!data)) @@ -921,11 +895,6 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } -static bool is_pp_netmem(netmem_ref netmem) -{ - return (netmem_get_pp_magic(netmem) & ~0x3UL) == PP_SIGNATURE; -} - int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, unsigned int headroom) { @@ -1009,7 +978,7 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, EXPORT_SYMBOL(skb_pp_cow_data); int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, - struct bpf_prog *prog) + const struct bpf_prog *prog) { if (!prog->aux->xdp_has_frags) return -EINVAL; @@ -1023,14 +992,7 @@ bool napi_pp_put_page(netmem_ref netmem) { netmem = netmem_compound_head(netmem); - /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation - * in order to preserve any existing bits, such as bit 0 for the - * head page of compound page and bit 1 for pfmemalloc page, so - * mask those bits for freeing side when doing below checking, - * and page_is_pfmemalloc() is checked in __page_pool_put_page() - * to avoid recycling the pfmemalloc page. - */ - if (unlikely(!is_pp_netmem(netmem))) + if (unlikely(!netmem_is_pp(netmem))) return false; page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); @@ -1070,7 +1032,7 @@ static int skb_pp_frag_ref(struct sk_buff *skb) for (i = 0; i < shinfo->nr_frags; i++) { head_netmem = netmem_compound_head(shinfo->frags[i].netmem); - if (likely(is_pp_netmem(head_netmem))) + if (likely(netmem_is_pp(head_netmem))) page_pool_ref_netmem(head_netmem); else page_ref_inc(netmem_to_page(head_netmem)); @@ -1694,7 +1656,8 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp) } EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); -static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) +static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size, + bool devmem) { struct ubuf_info_msgzc *uarg; struct sk_buff *skb; @@ -1709,7 +1672,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) uarg = (void *)skb->cb; uarg->mmp.user = NULL; - if (mm_account_pinned_pages(&uarg->mmp, size)) { + if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) { kfree_skb(skb); return NULL; } @@ -1732,7 +1695,7 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) } struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, - struct ubuf_info *uarg) + struct ubuf_info *uarg, bool devmem) { if (uarg) { struct ubuf_info_msgzc *uarg_zc; @@ -1762,7 +1725,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, next = (u32)atomic_read(&sk->sk_zckey); if ((u32)(uarg_zc->id + uarg_zc->len) == next) { - if (mm_account_pinned_pages(&uarg_zc->mmp, size)) + if (likely(!devmem) && + mm_account_pinned_pages(&uarg_zc->mmp, size)) return NULL; uarg_zc->len++; uarg_zc->bytelen = bytelen; @@ -1777,7 +1741,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, } new_alloc: - return msg_zerocopy_alloc(sk, size); + return msg_zerocopy_alloc(sk, size, devmem); } EXPORT_SYMBOL_GPL(msg_zerocopy_realloc); @@ 
-1881,7 +1845,8 @@ EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, - struct ubuf_info *uarg) + struct ubuf_info *uarg, + struct net_devmem_dmabuf_binding *binding) { int err, orig_len = skb->len; @@ -1900,7 +1865,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return -EEXIST; } - err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); + err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len, + binding); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { struct sock *save_sk = skb->sk; @@ -3267,7 +3233,7 @@ static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg) typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg); static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, - int len, sendmsg_func sendmsg) + int len, sendmsg_func sendmsg, int flags) { unsigned int orig_len = len; struct sk_buff *head = skb; @@ -3285,7 +3251,7 @@ do_frag_list: kv.iov_base = skb->data + offset; kv.iov_len = slen; memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT; + msg.msg_flags = MSG_DONTWAIT | flags; iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen); ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked, @@ -3322,7 +3288,8 @@ do_frag_list: while (slen) { struct bio_vec bvec; struct msghdr msg = { - .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT, + .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT | + flags, }; bvec_set_page(&bvec, skb_frag_page(frag), slen, @@ -3368,14 +3335,21 @@ error: int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, sendmsg_locked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0); } EXPORT_SYMBOL_GPL(skb_send_sock_locked); +int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb, + int offset, int len, int flags) +{ + return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags); +} +EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags); + /* Send skb data on a socket. Socket must be unlocked. */ int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) { - return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked); + return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0); } /** @@ -3471,8 +3445,7 @@ fault: EXPORT_SYMBOL(skb_store_bits); /* Checksum skb data. 
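
The _with_flags variant introduced above simply ORs caller-supplied MSG_* flags into every internal sendmsg. A hedged sketch of the intended use, batching segments with MSG_MORE under the caller-held socket lock:

	int err;

	lock_sock(sk);
	/* hint to the transport that more data follows this skb */
	err = skb_send_sock_locked_with_flags(sk, skb, 0, skb->len,
					      more_to_come ? MSG_MORE : 0);
	release_sock(sk);
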
*/ -__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, - __wsum csum, const struct skb_checksum_ops *ops) +__wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum) { int start = skb_headlen(skb); int i, copy = start - offset; @@ -3483,8 +3456,7 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, if (copy > 0) { if (copy > len) copy = len; - csum = INDIRECT_CALL_1(ops->update, csum_partial_ext, - skb->data + offset, copy, csum); + csum = csum_partial(skb->data + offset, copy, csum); if ((len -= copy) == 0) return csum; offset += copy; @@ -3514,13 +3486,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, skb_frag_off(frag) + offset - start, copy, p, p_off, p_len, copied) { vaddr = kmap_atomic(p); - csum2 = INDIRECT_CALL_1(ops->update, - csum_partial_ext, - vaddr + p_off, p_len, 0); + csum2 = csum_partial(vaddr + p_off, p_len, 0); kunmap_atomic(vaddr); - csum = INDIRECT_CALL_1(ops->combine, - csum_block_add_ext, csum, - csum2, pos, p_len); + csum = csum_block_add(csum, csum2, pos); pos += p_len; } @@ -3541,10 +3509,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum2; if (copy > len) copy = len; - csum2 = __skb_checksum(frag_iter, offset - start, - copy, 0, ops); - csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext, - csum, csum2, pos, copy); + csum2 = skb_checksum(frag_iter, offset - start, copy, + 0); + csum = csum_block_add(csum, csum2, pos); if ((len -= copy) == 0) return csum; offset += copy; @@ -3556,18 +3523,6 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len, return csum; } -EXPORT_SYMBOL(__skb_checksum); - -__wsum skb_checksum(const struct sk_buff *skb, int offset, - int len, __wsum csum) -{ - const struct skb_checksum_ops ops = { - .update = csum_partial_ext, - .combine = csum_block_add_ext, - }; - - return __skb_checksum(skb, offset, len, csum, &ops); -} EXPORT_SYMBOL(skb_checksum); /* Both of above in one bottle. 
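
With the ops indirection gone, each algorithm gets its own entry point: skb_checksum() folds with csum_partial()/csum_block_add(), and the following hunk adds skb_crc32c() for the CRC32C users. A hedged before/after sketch of a caller migrating (the "before" names are the ones deleted above; the seed values are illustrative):

	/* before: algorithm selected through an ops table */
	csum = __skb_checksum(skb, offset, len, 0, crc32c_csum_stub);

	/* after: one dedicated helper per algorithm, no indirect calls */
	csum = skb_checksum(skb, offset, len, 0);	/* Internet checksum */
	crc  = skb_crc32c(skb, offset, len, ~0);	/* CRC32C, e.g. SCTP seeds with ~0 */
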
*/ @@ -3660,6 +3615,78 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, } EXPORT_SYMBOL(skb_copy_and_csum_bits); +#ifdef CONFIG_NET_CRC32C +u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + if (copy > 0) { + copy = min(copy, len); + crc = crc32c(crc, skb->data + offset, copy); + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + + if (WARN_ON_ONCE(!skb_frags_readable(skb))) + return 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + copy = end - offset; + if (copy > 0) { + u32 p_off, p_len, copied; + struct page *p; + u8 *vaddr; + + copy = min(copy, len); + skb_frag_foreach_page(frag, + skb_frag_off(frag) + offset - start, + copy, p, p_off, p_len, copied) { + vaddr = kmap_atomic(p); + crc = crc32c(crc, vaddr + p_off, p_len); + kunmap_atomic(vaddr); + } + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + copy = end - offset; + if (copy > 0) { + copy = min(copy, len); + crc = skb_crc32c(frag_iter, offset - start, copy, crc); + len -= copy; + if (len == 0) + return crc; + offset += copy; + } + start = end; + } + BUG_ON(len); + + return crc; +} +EXPORT_SYMBOL(skb_crc32c); +#endif /* CONFIG_NET_CRC32C */ + __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len) { __sum16 sum; @@ -3719,32 +3746,6 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb) } EXPORT_SYMBOL(__skb_checksum_complete); -static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum) -{ - net_warn_ratelimited( - "%s: attempt to compute crc32c without libcrc32c.ko\n", - __func__); - return 0; -} - -static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2, - int offset, int len) -{ - net_warn_ratelimited( - "%s: attempt to compute crc32c without libcrc32c.ko\n", - __func__); - return 0; -} - -static const struct skb_checksum_ops default_crc32c_ops = { - .update = warn_crc32c_csum_update, - .combine = warn_crc32c_csum_combine, -}; - -const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = - &default_crc32c_ops; -EXPORT_SYMBOL(crc32c_csum_stub); - /** * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() * @from: source buffer @@ -5539,6 +5540,54 @@ err: } EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); +static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps, + int tstype) +{ + switch (tstype) { + case SCM_TSTAMP_SCHED: + return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP; + case SCM_TSTAMP_SND: + return skb_shinfo(skb)->tx_flags & (hwtstamps ? 
SKBTX_HW_TSTAMP_NOBPF : + SKBTX_SW_TSTAMP); + case SCM_TSTAMP_ACK: + return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK; + case SCM_TSTAMP_COMPLETION: + return skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP; + } + + return false; +} + +static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, + int tstype) +{ + int op; + + switch (tstype) { + case SCM_TSTAMP_SCHED: + op = BPF_SOCK_OPS_TSTAMP_SCHED_CB; + break; + case SCM_TSTAMP_SND: + if (hwtstamps) { + op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB; + *skb_hwtstamps(skb) = *hwtstamps; + } else { + op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB; + } + break; + case SCM_TSTAMP_ACK: + op = BPF_SOCK_OPS_TSTAMP_ACK_CB; + break; + default: + return; + } + + bpf_skops_tx_timestamping(sk, skb, op); +} + void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb, struct skb_shared_hwtstamps *hwtstamps, @@ -5551,6 +5600,13 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (!sk) return; + if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF) + skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps, + sk, tstype); + + if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype)) + return; + tsflags = READ_ONCE(sk->sk_tsflags); if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) @@ -6123,11 +6179,11 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->offload_fwd_mark = 0; skb->offload_l3_fwd_mark = 0; #endif + ipvs_reset(skb); if (!xnet) return; - ipvs_reset(skb); skb->mark = 0; skb_clear_tstamp(skb); } @@ -7290,3 +7346,32 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, return false; } EXPORT_SYMBOL(csum_and_copy_from_iter_full); + +void get_netmem(netmem_ref netmem) +{ + struct net_iov *niov; + + if (netmem_is_net_iov(netmem)) { + niov = netmem_to_net_iov(netmem); + if (net_is_devmem_iov(niov)) + net_devmem_get_net_iov(netmem_to_net_iov(netmem)); + return; + } + get_page(netmem_to_page(netmem)); +} +EXPORT_SYMBOL(get_netmem); + +void put_netmem(netmem_ref netmem) +{ + struct net_iov *niov; + + if (netmem_is_net_iov(netmem)) { + niov = netmem_to_net_iov(netmem); + if (net_is_devmem_iov(niov)) + net_devmem_put_net_iov(netmem_to_net_iov(netmem)); + return; + } + + put_page(netmem_to_page(netmem)); +} +EXPORT_SYMBOL(put_netmem); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 61f3f3d4e528..34c51eb1a14f 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -530,16 +530,22 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, u32 off, u32 len, struct sk_psock *psock, struct sock *sk, - struct sk_msg *msg) + struct sk_msg *msg, + bool take_ref) { int num_sge, copied; + /* skb_to_sgvec will fail when the total number of fragments in + * frag_list and frags exceeds MAX_MSG_FRAGS. For example, the + * caller may aggregate multiple skbs. + */ num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); if (num_sge < 0) { /* skb linearize may fail with ENOMEM, but lets simply try again * later if this happens. Under memory pressure we don't want to * drop the skb. We need to linearize the skb so that the mapping * in skb_to_sgvec can not error. + * Note that skb_linearize requires the skb not to be shared. 
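
That added note is the precondition the take_ref plumbing in the next hunk preserves: skb_linearize() reallocates the head, which is only legal on an skb the caller owns exclusively. As a guard sketch:

	if (skb_shared(skb))
		return -EAGAIN;		/* another holder exists: must not linearize */
	if (skb_linearize(skb))
		return -EAGAIN;		/* ENOMEM: keep the skb queued, retry later */
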
 */
+	if (skb_linearize(skb))
 		return -EAGAIN;
@@ -549,11 +555,14 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
 		return num_sge;
 	}
 
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+	psock->ingress_bytes += len;
+#endif
 	copied = len;
 	msg->sg.start = 0;
 	msg->sg.size = copied;
 	msg->sg.end = num_sge;
-	msg->skb = skb;
+	msg->skb = take_ref ? skb_get(skb) : skb;
 
 	sk_psock_queue_msg(psock, msg);
 	sk_psock_data_ready(sk, psock);
@@ -561,7 +570,7 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
 }
 
 static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
-				     u32 off, u32 len);
+				     u32 off, u32 len, bool take_ref);
 
 static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
 				u32 off, u32 len)
@@ -575,7 +584,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
 	 * correctly.
 	 */
 	if (unlikely(skb->sk == sk))
-		return sk_psock_skb_ingress_self(psock, skb, off, len);
+		return sk_psock_skb_ingress_self(psock, skb, off, len, true);
 	msg = sk_psock_create_ingress_msg(sk, skb);
 	if (!msg)
 		return -EAGAIN;
@@ -587,7 +596,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
 	 * into user buffers.
 	 */
 	skb_set_owner_r(skb, sk);
-	err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
+	err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true);
 	if (err < 0)
 		kfree(msg);
 	return err;
@@ -598,7 +607,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
  * because the skb is already accounted for here.
  */
 static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
-				     u32 off, u32 len)
+				     u32 off, u32 len, bool take_ref)
 {
 	struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC);
 	struct sock *sk = psock->sk;
@@ -607,7 +616,7 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
 	if (unlikely(!msg))
 		return -EAGAIN;
 	skb_set_owner_r(skb, sk);
-	err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg);
+	err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref);
 	if (err < 0)
 		kfree(msg);
 	return err;
@@ -616,18 +625,13 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb
 static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 			       u32 off, u32 len, bool ingress)
 {
-	int err = 0;
-
 	if (!ingress) {
 		if (!sock_writeable(psock->sk))
 			return -EAGAIN;
 		return skb_send_sock(psock->sk, skb, off, len);
 	}
-	skb_get(skb);
-	err = sk_psock_skb_ingress(psock, skb, off, len);
-	if (err < 0)
-		kfree_skb(skb);
-	return err;
+
+	return sk_psock_skb_ingress(psock, skb, off, len);
 }
 
 static void sk_psock_skb_state(struct sk_psock *psock,
@@ -652,12 +656,14 @@ static void sk_psock_backlog(struct work_struct *work)
 	bool ingress;
 	int ret;
 
+	/* Increment the psock refcnt to synchronize with close(fd) path in
+	 * sock_map_close(), ensuring we wait for backlog thread completion
+	 * before sk_socket is freed. If refcnt increment fails, it indicates
+	 * sock_map_close() completed with sk_socket potentially already freed.
+ */ + if (!sk_psock_get(psock->sk)) + return; mutex_lock(&psock->work_mutex); - if (unlikely(state->len)) { - len = state->len; - off = state->off; - } - while ((skb = skb_peek(&psock->ingress_skb))) { len = skb->len; off = 0; @@ -667,6 +673,13 @@ static void sk_psock_backlog(struct work_struct *work) off = stm->offset; len = stm->full_len; } + + /* Resume processing from previous partial state */ + if (unlikely(state->len)) { + len = state->len; + off = state->off; + } + ingress = skb_bpf_ingress(skb); skb_bpf_redirect_clear(skb); do { @@ -677,7 +690,8 @@ static void sk_psock_backlog(struct work_struct *work) if (ret <= 0) { if (ret == -EAGAIN) { sk_psock_skb_state(psock, state, len, off); - + /* Restore redir info we cleared before */ + skb_bpf_set_redir(skb, psock->sk, ingress); /* Delay slightly to prioritize any * other work that might be here. */ @@ -694,11 +708,14 @@ static void sk_psock_backlog(struct work_struct *work) len -= ret; } while (len); + /* The entire skb sent, clear state */ + sk_psock_skb_state(psock, state, 0, 0); skb = skb_dequeue(&psock->ingress_skb); kfree_skb(skb); } end: mutex_unlock(&psock->work_mutex); + sk_psock_put(psock->sk, psock); } struct sk_psock *sk_psock_init(struct sock *sk, int node) @@ -1011,7 +1028,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, off = stm->offset; len = stm->full_len; } - err = sk_psock_skb_ingress_self(psock, skb, off, len); + err = sk_psock_skb_ingress_self(psock, skb, off, len, false); } if (err < 0) { spin_lock_bh(&psock->ingress_lock); @@ -1144,6 +1161,10 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) if (!ret) sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED); + if (sk_is_tcp(sk)) { + psock->strp.cb.read_sock = tcp_bpf_strp_read_sock; + psock->copied_seq = tcp_sk(sk)->copied_seq; + } return ret; } diff --git a/net/core/sock.c b/net/core/sock.c index be84885f9290..3b409bc8ef6d 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -148,6 +148,8 @@ #include <linux/ethtool.h> +#include <uapi/linux/pidfd.h> + #include "dev.h" static DEFINE_MUTEX(proto_list_mutex); @@ -454,6 +456,13 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, return 0; } +static bool sk_set_prio_allowed(const struct sock *sk, int val) +{ + return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); +} + static bool sock_needs_netstamp(const struct sock *sk) { switch (sk->sk_family) { @@ -931,6 +940,7 @@ int sock_set_timestamping(struct sock *sk, int optname, WRITE_ONCE(sk->sk_tsflags, val); sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY)); if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, @@ -941,6 +951,20 @@ int sock_set_timestamping(struct sock *sk, int optname, return 0; } +#if defined(CONFIG_CGROUP_BPF) +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) +{ + struct bpf_sock_ops_kern sock_ops; + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + sock_ops.op = op; + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + bpf_skops_init_skb(&sock_ops, skb, 0); + __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS); +} +#endif + void sock_set_keepalive(struct sock *sk) { lock_sock(sk); @@ -1193,22 +1217,11 @@ int sk_setsockopt(struct sock *sk, int level, int optname, /* handle options which do 
not require locking the socket. */ switch (optname) { case SO_PRIORITY: - if ((val >= 0 && val <= 6) || - sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || - sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + if (sk_set_prio_allowed(sk, val)) { sock_set_priority(sk, val); return 0; } return -EPERM; - case SO_PASSSEC: - assign_bit(SOCK_PASSSEC, &sock->flags, valbool); - return 0; - case SO_PASSCRED: - assign_bit(SOCK_PASSCRED, &sock->flags, valbool); - return 0; - case SO_PASSPIDFD: - assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); - return 0; case SO_TYPE: case SO_PROTOCOL: case SO_DOMAIN: @@ -1256,6 +1269,8 @@ int sk_setsockopt(struct sock *sk, int level, int optname, return 0; } case SO_TXREHASH: + if (!sk_is_tcp(sk)) + return -EOPNOTSUPP; if (val < -1 || val > 1) return -EINVAL; if ((u8)val == SOCK_TXREHASH_DEFAULT) @@ -1517,6 +1532,10 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_RCVMARK, valbool); break; + case SO_RCVPRIORITY: + sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool); + break; + case SO_RXQ_OVFL: sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); break; @@ -1533,6 +1552,33 @@ set_sndbuf: sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); break; + case SO_PASSCRED: + if (sk_may_scm_recv(sk)) + sk->sk_scm_credentials = valbool; + else + ret = -EOPNOTSUPP; + break; + + case SO_PASSSEC: + if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk)) + sk->sk_scm_security = valbool; + else + ret = -EOPNOTSUPP; + break; + + case SO_PASSPIDFD: + if (sk_is_unix(sk)) + sk->sk_scm_pidfd = valbool; + else + ret = -EOPNOTSUPP; + break; + + case SO_PASSRIGHTS: + if (sk_is_unix(sk)) + sk->sk_scm_rights = valbool; + else + ret = -EOPNOTSUPP; + break; case SO_INCOMING_CPU: reuseport_update_incoming_cpu(sk, val); @@ -1829,11 +1875,24 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PASSCRED: - v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); + if (!sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_credentials; break; case SO_PASSPIDFD: - v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags); + if (!sk_is_unix(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_pidfd; + break; + + case SO_PASSRIGHTS: + if (!sk_is_unix(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_rights; break; case SO_PEERCRED: @@ -1855,6 +1914,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, { struct pid *peer_pid; struct file *pidfd_file = NULL; + unsigned int flags = 0; int pidfd; if (len > sizeof(pidfd)) @@ -1867,7 +1927,14 @@ int sk_getsockopt(struct sock *sk, int level, int optname, if (!peer_pid) return -ENODATA; - pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file); + /* The use of PIDFD_STALE requires stashing of struct pid + * on pidfs with pidfs_register_pid() and only AF_UNIX + * were prepared for this. 
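
The SO_PASS* options above now flip per-socket booleans instead of bits on struct socket, and sockets that cannot receive SCM messages get EOPNOTSUPP. From userspace the calls are unchanged; an illustrative AF_UNIX setup:

	int on = 1;

	setsockopt(ufd, SOL_SOCKET, SO_PASSCRED,   &on, sizeof(on)); /* SCM_CREDENTIALS */
	setsockopt(ufd, SOL_SOCKET, SO_PASSSEC,    &on, sizeof(on)); /* SCM_SECURITY */
	setsockopt(ufd, SOL_SOCKET, SO_PASSPIDFD,  &on, sizeof(on)); /* SCM_PIDFD, AF_UNIX only */
	setsockopt(ufd, SOL_SOCKET, SO_PASSRIGHTS, &on, sizeof(on)); /* SCM_RIGHTS, AF_UNIX only */
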
+ */ + if (sk->sk_family == AF_UNIX) + flags = PIDFD_STALE; + + pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file); put_pid(peer_pid); if (pidfd < 0) return pidfd; @@ -1930,7 +1997,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_PASSSEC: - v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); + if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) + return -EOPNOTSUPP; + + v.val = sk->sk_scm_security; break; case SO_PEERSEC: @@ -1945,6 +2015,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname, v.val = sock_flag(sk, SOCK_RCVMARK); break; + case SO_RCVPRIORITY: + v.val = sock_flag(sk, SOCK_RCVPRIORITY); + break; + case SO_RXQ_OVFL: v.val = sock_flag(sk, SOCK_RXQ_OVFL); break; @@ -2028,7 +2102,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, v.val = READ_ONCE(sk->sk_napi_id); /* aggregate non-NAPI IDs down to 0 */ - if (v.val < MIN_NAPI_ID) + if (!napi_id_valid(v.val)) v.val = 0; break; @@ -2074,6 +2148,9 @@ int sk_getsockopt(struct sock *sk, int level, int optname, break; case SO_TXREHASH: + if (!sk_is_tcp(sk)) + return -EOPNOTSUPP; + /* Paired with WRITE_ONCE() in sk_setsockopt() */ v.val = READ_ONCE(sk->sk_txrehash); break; @@ -2102,6 +2179,8 @@ lenout: */ static inline void sock_lock_init(struct sock *sk) { + sk_owner_clear(sk); + if (sk->sk_kern_sock) sock_lock_init_class_and_name( sk, @@ -2198,6 +2277,9 @@ static void sk_prot_free(struct proto *prot, struct sock *sk) cgroup_sk_free(&sk->sk_cgrp_data); mem_cgroup_sk_free(sk); security_sk_free(sk); + + sk_owner_put(sk); + if (slab != NULL) kmem_cache_free(slab, sk); else @@ -2233,6 +2315,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, get_net_track(net, &sk->ns_tracker, priority); sock_inuse_add(net, 1); } else { + net_passive_inc(net); __netns_tracker_alloc(net, &sk->ns_tracker, false, priority); } @@ -2257,6 +2340,7 @@ EXPORT_SYMBOL(sk_alloc); static void __sk_destruct(struct rcu_head *head) { struct sock *sk = container_of(head, struct sock, sk_rcu); + struct net *net = sock_net(sk); struct sk_filter *filter; if (sk->sk_destruct) @@ -2288,14 +2372,28 @@ static void __sk_destruct(struct rcu_head *head) put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); - if (likely(sk->sk_net_refcnt)) - put_net_track(sock_net(sk), &sk->ns_tracker); - else - __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); - + if (likely(sk->sk_net_refcnt)) { + put_net_track(net, &sk->ns_tracker); + } else { + __netns_tracker_free(net, &sk->ns_tracker, false); + net_passive_dec(net); + } sk_prot_free(sk->sk_prot_creator, sk); } +void sk_net_refcnt_upgrade(struct sock *sk) +{ + struct net *net = sock_net(sk); + + WARN_ON_ONCE(sk->sk_net_refcnt); + __netns_tracker_free(net, &sk->ns_tracker, false); + net_passive_dec(net); + sk->sk_net_refcnt = 1; + get_net_track(net, &sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); +} +EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade); + void sk_destruct(struct sock *sk) { bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); @@ -2392,6 +2490,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) * is not properly dismantling its kernel sockets at netns * destroy time. 
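
sk_net_refcnt_upgrade() above converts a kernel socket, which only holds a passive netns reference, into a fully tracked one. A hedged sketch of the intended call site, e.g. just before an in-kernel socket is exposed to userspace:

	struct socket *ksock;
	int err;

	/* kernel socket: sk_net_refcnt == 0, netns held only passively */
	err = sock_create_kern(net, AF_INET, SOCK_STREAM, IPPROTO_TCP, &ksock);
	if (err)
		return err;

	/* about to hand it to userspace: take a real netns reference */
	sk_net_refcnt_upgrade(ksock->sk);
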
*/ + net_passive_inc(sock_net(newsk)); __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, false, priority); } @@ -2444,17 +2543,14 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ if (!is_charged) RCU_INIT_POINTER(newsk->sk_filter, NULL); - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; + + goto free; } + RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); - if (bpf_sk_storage_clone(sk, newsk)) { - sk_free_unlock_clone(newsk); - newsk = NULL; - goto out; - } + if (bpf_sk_storage_clone(sk, newsk)) + goto free; /* Clear sk_user_data if parent had the pointer tagged * as not suitable for copying when cloning. @@ -2484,18 +2580,17 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) net_enable_timestamp(); out: return newsk; -} -EXPORT_SYMBOL_GPL(sk_clone_lock); - -void sk_free_unlock_clone(struct sock *sk) -{ +free: /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - sk->sk_destruct = NULL; - bh_unlock_sock(sk); - sk_free(sk); + * destructor and make plain sk_free() + */ + newsk->sk_destruct = NULL; + bh_unlock_sock(newsk); + sk_free(newsk); + newsk = NULL; + goto out; } -EXPORT_SYMBOL_GPL(sk_free_unlock_clone); +EXPORT_SYMBOL_GPL(sk_clone_lock); static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) { @@ -2520,8 +2615,12 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) u32 max_segs = 1; sk->sk_route_caps = dst->dev->features; - if (sk_is_tcp(sk)) + if (sk_is_tcp(sk)) { + struct inet_connection_sock *icsk = inet_csk(sk); + sk->sk_route_caps |= NETIF_F_GSO; + icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK); + } if (sk->sk_route_caps & NETIF_F_GSO) sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; if (unlikely(sk->sk_gso_disabled)) @@ -2791,6 +2890,22 @@ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) } EXPORT_SYMBOL(sock_kmalloc); +/* + * Duplicate the input "src" memory block using the socket's + * option memory buffer. + */ +void *sock_kmemdup(struct sock *sk, const void *src, + int size, gfp_t priority) +{ + void *mem; + + mem = sock_kmalloc(sk, size, priority); + if (mem) + memcpy(mem, src, size); + return mem; +} +EXPORT_SYMBOL(sock_kmemdup); + /* Free an option memory block. Note, we actually want the inline * here as this allows gcc to detect the nullify and fold away the * condition entirely. @@ -2945,6 +3060,18 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, case SCM_RIGHTS: case SCM_CREDENTIALS: break; + case SO_PRIORITY: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg))) + return -EPERM; + sockc->priority = *(u32 *)CMSG_DATA(cmsg); + break; + case SCM_DEVMEM_DMABUF: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg); + break; default: return -EINVAL; } @@ -3157,16 +3284,16 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL; struct proto *prot = sk->sk_prot; - bool charged = false; + bool charged = true; long allocated; sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); if (memcg) { - if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge())) + charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()); + if (!charged) goto suppress_allocation; - charged = true; } /* Under limit. 
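
__sock_cmsg_send() above now also accepts a per-message SO_PRIORITY (gated by the same sk_set_prio_allowed() check as the setsockopt) and a devmem dmabuf id. A hedged userspace sketch of the priority path; fd and payload names are illustrative:

#include <stdint.h>
#include <string.h>
#include <sys/socket.h>

	char cbuf[CMSG_SPACE(sizeof(uint32_t))];
	struct iovec iov = { .iov_base = payload, .iov_len = payload_len };
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= cbuf,
		.msg_controllen	= sizeof(cbuf),
	};
	struct cmsghdr *c = CMSG_FIRSTHDR(&msg);
	uint32_t prio = 4;	/* values above 6 need CAP_NET_ADMIN, as with setsockopt() */

	c->cmsg_level = SOL_SOCKET;
	c->cmsg_type  = SO_PRIORITY;
	c->cmsg_len   = CMSG_LEN(sizeof(prio));
	memcpy(CMSG_DATA(c), &prio, sizeof(prio));

	sendmsg(fd, &msg, 0);
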
*/ @@ -3251,7 +3378,7 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); - if (charged) + if (memcg && charged) mem_cgroup_uncharge_skmem(memcg, amt); return 0; @@ -3526,14 +3653,14 @@ EXPORT_SYMBOL(sk_reset_timer); void sk_stop_timer(struct sock *sk, struct timer_list* timer) { - if (del_timer(timer)) + if (timer_delete(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer); void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) { - if (del_timer_sync(timer)) + if (timer_delete_sync(timer)) __sock_put(sk); } EXPORT_SYMBOL(sk_stop_timer_sync); @@ -3861,7 +3988,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); - mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk); + mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc); mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); @@ -3927,7 +4054,7 @@ static int assign_proto_idx(struct proto *prot) { prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); - if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { + if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) { pr_err("PROTO_INUSE_NR exhausted\n"); return -ENOSPC; } @@ -3938,7 +4065,7 @@ static int assign_proto_idx(struct proto *prot) static void release_proto_idx(struct proto *prot) { - if (prot->inuse_idx != PROTO_INUSE_NR - 1) + if (prot->inuse_idx != PROTO_INUSE_NR) clear_bit(prot->inuse_idx, proto_inuse_idx); } #else diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index a08eed9b9142..b23594c767f2 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -264,8 +264,6 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, switch (nlh->nlmsg_type) { case TCPDIAG_GETSOCK: - case DCCPDIAG_GETSOCK: - if (!rcu_access_pointer(inet_rcv_compat)) sock_load_diag_module(AF_INET, 0); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index f1b9b3958792..82a14f131d00 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -303,7 +303,10 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) write_lock_bh(&sk->sk_callback_lock); if (stream_parser && stream_verdict && !psock->saved_data_ready) { - ret = sk_psock_init_strp(sk, psock); + if (sk_is_tcp(sk)) + ret = sk_psock_init_strp(sk, psock); + else + ret = -EOPNOTSUPP; if (ret) { write_unlock_bh(&sk->sk_callback_lock); sk_psock_put(sk, psock); @@ -541,6 +544,9 @@ static bool sock_map_sk_state_allowed(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); if (sk_is_stream_unix(sk)) return (1 << sk->sk_state) & TCPF_ESTABLISHED; + if (sk_is_vsock(sk) && + (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) + return (1 << sk->sk_state) & TCPF_ESTABLISHED; return true; } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cb8d32e5c14e..5dbb2c6f371d 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -34,6 +34,7 @@ static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE; +static int netdev_budget_usecs_min = 2 * USEC_PER_SEC / HZ; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -200,7 +201,7 @@ static int rps_sock_flow_sysctl(const struct ctl_table *table, int write, 
if (orig_sock_table) { static_branch_dec(&rps_needed); static_branch_dec(&rfs_needed); - kvfree_rcu_mightsleep(orig_sock_table); + kvfree_rcu(orig_sock_table, rcu); } } } @@ -238,7 +239,7 @@ static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); - kfree_rcu_mightsleep(cur); + kfree_rcu(cur, rcu); } else if (!cur && cpumask_test_cpu(i, mask)) { cur = kzalloc_node(len, GFP_KERNEL, cpu_to_node(i)); @@ -247,7 +248,7 @@ static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write, ret = -ENOMEM; goto write_unlock; } - cur->num_buckets = netdev_flow_limit_table_len; + cur->log_buckets = ilog2(netdev_flow_limit_table_len); rcu_assign_pointer(sd->flow_limit, cur); } } @@ -319,7 +320,7 @@ static int proc_do_dev_weight(const struct ctl_table *table, int write, int ret, weight; mutex_lock(&dev_weight_mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { weight = READ_ONCE(weight_p); WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias); @@ -412,6 +413,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_rx_bias", @@ -419,6 +421,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_tx_bias", @@ -426,6 +429,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "netdev_max_backlog", @@ -584,7 +588,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .extra1 = &netdev_budget_usecs_min, }, { .procname = "fb_tunnels_only_for_init_net", diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 3717fb152ecc..a50a7ef49ae8 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -9,6 +9,7 @@ #include <linux/ptp_classify.h> #include <linux/skbuff.h> #include <linux/export.h> +#include <linux/ptp_clock_kernel.h> static unsigned int classify(const struct sk_buff *skb) { @@ -21,19 +22,39 @@ static unsigned int classify(const struct sk_buff *skb) void skb_clone_tx_timestamp(struct sk_buff *skb) { + struct hwtstamp_provider *hwprov; struct mii_timestamper *mii_ts; + struct phy_device *phydev; struct sk_buff *clone; unsigned int type; - if (!skb->sk || !skb->dev || - !phy_is_default_hwtstamp(skb->dev->phydev)) + if (!skb->sk || !skb->dev) return; + rcu_read_lock(); + hwprov = rcu_dereference(skb->dev->hwprov); + if (hwprov) { + if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB || + !hwprov->phydev) { + rcu_read_unlock(); + return; + } + + phydev = hwprov->phydev; + } else { + phydev = skb->dev->phydev; + if (!phy_is_default_hwtstamp(phydev)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + type = classify(skb); if (type == PTP_CLASS_NONE) return; - mii_ts = skb->dev->phydev->mii_ts; + mii_ts = phydev->mii_ts; if (likely(mii_ts->txtstamp)) { clone = skb_clone_sk(skb); if (!clone) @@ -45,12 +66,33 @@ EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp); bool skb_defer_rx_timestamp(struct sk_buff *skb) { + struct hwtstamp_provider *hwprov; struct mii_timestamper *mii_ts; + struct 
phy_device *phydev; unsigned int type; - if (!skb->dev || !phy_is_default_hwtstamp(skb->dev->phydev)) + if (!skb->dev) return false; + rcu_read_lock(); + hwprov = rcu_dereference(skb->dev->hwprov); + if (hwprov) { + if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB || + !hwprov->phydev) { + rcu_read_unlock(); + return false; + } + + phydev = hwprov->phydev; + } else { + phydev = skb->dev->phydev; + if (!phy_is_default_hwtstamp(phydev)) { + rcu_read_unlock(); + return false; + } + } + rcu_read_unlock(); + if (skb_headroom(skb) < ETH_HLEN) return false; @@ -63,7 +105,7 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) if (type == PTP_CLASS_NONE) return false; - mii_ts = skb->dev->phydev->mii_ts; + mii_ts = phydev->mii_ts; if (likely(mii_ts->rxtstamp)) return mii_ts->rxtstamp(mii_ts, skb, type); diff --git a/net/core/utils.c b/net/core/utils.c index 27f4cffaae05..5e63b0ea21f3 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -399,9 +399,9 @@ int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af, } EXPORT_SYMBOL(inet_pton_with_scope); -bool inet_addr_is_any(struct sockaddr *addr) +bool inet_addr_is_any(struct sockaddr_storage *addr) { - if (addr->sa_family == AF_INET6) { + if (addr->ss_family == AF_INET6) { struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr; const struct sockaddr_in6 in6_any = { .sin6_addr = IN6ADDR_ANY_INIT }; @@ -409,13 +409,13 @@ bool inet_addr_is_any(struct sockaddr *addr) if (!memcmp(in6->sin6_addr.s6_addr, in6_any.sin6_addr.s6_addr, 16)) return true; - } else if (addr->sa_family == AF_INET) { + } else if (addr->ss_family == AF_INET) { struct sockaddr_in *in = (struct sockaddr_in *)addr; if (in->sin_addr.s_addr == htonl(INADDR_ANY)) return true; } else { - pr_warn("unexpected address family %u\n", addr->sa_family); + pr_warn("unexpected address family %u\n", addr->ss_family); } return false; @@ -473,11 +473,11 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, EXPORT_SYMBOL(inet_proto_csum_replace16); void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, - __wsum diff, bool pseudohdr) + __wsum diff, bool pseudohdr, bool ipv6) { if (skb->ip_summed != CHECKSUM_PARTIAL) { csum_replace_by_diff(sum, diff); - if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr && !ipv6) skb->csum = ~csum_sub(diff, skb->csum); } else if (pseudohdr) { *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum))); diff --git a/net/core/xdp.c b/net/core/xdp.c index 2315feed94ef..491334b9b8be 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -17,6 +17,7 @@ #include <net/page_pool/helpers.h> #include <net/hotdata.h> +#include <net/netdev_lock.h> #include <net/xdp.h> #include <net/xdp_priv.h> /* struct xdp_mem_allocator */ #include <trace/events/xdp.h> @@ -357,6 +358,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, if (IS_ERR(xdp_alloc)) return PTR_ERR(xdp_alloc); + if (type == MEM_TYPE_XSK_BUFF_POOL && allocator) + xsk_pool_set_rxq_info(allocator, xdp_rxq); + if (trace_mem_connect_enabled() && xdp_alloc) trace_mem_connect(xdp_alloc, xdp_rxq); return 0; @@ -364,33 +368,87 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); +/** + * xdp_reg_page_pool - register &page_pool as a memory provider for XDP + * @pool: &page_pool to register + * + * Can be used to register pools manually without connecting to any XDP RxQ + * info, so that the XDP layer will be aware of them. 
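
Putting the helpers documented above together, the manual registration lifecycle looks roughly as follows; error handling is elided, the rxq/pool variables are illustrative, and the detach step assumes the existing xdp_rxq_info_detach_mem_model():

	pool = page_pool_create(&pp_params);
	err = xdp_reg_page_pool(pool);			/* make XDP aware of the pool */
	xdp_rxq_info_attach_page_pool(&rxq->xdp_rxq, pool);

	/* ... datapath runs ... */

	xdp_rxq_info_detach_mem_model(&rxq->xdp_rxq);	/* detach before unregistering */
	xdp_unreg_page_pool(pool);
	page_pool_destroy(pool);
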
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 2315feed94ef..491334b9b8be 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -17,6 +17,7 @@
 #include <net/page_pool/helpers.h>
 
 #include <net/hotdata.h>
+#include <net/netdev_lock.h>
 #include <net/xdp.h>
 #include <net/xdp_priv.h> /* struct xdp_mem_allocator */
 #include <trace/events/xdp.h>
@@ -357,6 +358,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 	if (IS_ERR(xdp_alloc))
 		return PTR_ERR(xdp_alloc);
 
+	if (type == MEM_TYPE_XSK_BUFF_POOL && allocator)
+		xsk_pool_set_rxq_info(allocator, xdp_rxq);
+
 	if (trace_mem_connect_enabled() && xdp_alloc)
 		trace_mem_connect(xdp_alloc, xdp_rxq);
 	return 0;
@@ -364,33 +368,87 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
 
 EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
 
+/**
+ * xdp_reg_page_pool - register &page_pool as a memory provider for XDP
+ * @pool: &page_pool to register
+ *
+ * Can be used to register pools manually without connecting to any XDP RxQ
+ * info, so that the XDP layer will be aware of them. Then, they can be
+ * attached to an RxQ info manually via xdp_rxq_info_attach_page_pool().
+ *
+ * Return: %0 on success, -errno on error.
+ */
+int xdp_reg_page_pool(struct page_pool *pool)
+{
+	struct xdp_mem_info mem;
+
+	return xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pool);
+}
+EXPORT_SYMBOL_GPL(xdp_reg_page_pool);
+
+/**
+ * xdp_unreg_page_pool - unregister &page_pool from the memory providers list
+ * @pool: &page_pool to unregister
+ *
+ * A shorthand for manual unregistering page pools. If the pool was previously
+ * attached to an RxQ info, it must be detached first.
+ */
+void xdp_unreg_page_pool(const struct page_pool *pool)
+{
+	struct xdp_mem_info mem = {
+		.type	= MEM_TYPE_PAGE_POOL,
+		.id	= pool->xdp_mem_id,
+	};
+
+	xdp_unreg_mem_model(&mem);
+}
+EXPORT_SYMBOL_GPL(xdp_unreg_page_pool);
+
+/**
+ * xdp_rxq_info_attach_page_pool - attach registered pool to RxQ info
+ * @xdp_rxq: XDP RxQ info to attach the pool to
+ * @pool: pool to attach
+ *
+ * If the pool was registered manually, this function must be called instead
+ * of xdp_rxq_info_reg_mem_model() to connect it to the RxQ info.
+ */
+void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq,
+				   const struct page_pool *pool)
+{
+	struct xdp_mem_info mem = {
+		.type	= MEM_TYPE_PAGE_POOL,
+		.id	= pool->xdp_mem_id,
+	};
+
+	xdp_rxq_info_attach_mem_model(xdp_rxq, &mem);
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool);
+
 /* XDP RX runs under NAPI protection, and in different delivery error
  * scenarios (e.g. queue full), it is possible to return the xdp_frame
  * while still leveraging this protection. The @napi_direct boolean
  * is used for those calls sites. Thus, allowing for faster recycling
  * of xdp_frames/pages in those cases.
  */
-void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
-		  struct xdp_buff *xdp)
+void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type,
+		  bool napi_direct, struct xdp_buff *xdp)
 {
-	struct page *page;
-
-	switch (mem->type) {
+	switch (mem_type) {
 	case MEM_TYPE_PAGE_POOL:
-		page = virt_to_head_page(data);
+		netmem = netmem_compound_head(netmem);
 		if (napi_direct && xdp_return_frame_no_direct())
 			napi_direct = false;
-		/* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE)
-		 * as mem->type knows this a page_pool page
+		/* No need to check netmem_is_pp() as mem->type knows this is
+		 * a page_pool page
 		 */
-		page_pool_put_full_page(page->pp, page, napi_direct);
+		page_pool_put_full_netmem(netmem_get_pp(netmem), netmem,
+					  napi_direct);
 		break;
 	case MEM_TYPE_PAGE_SHARED:
-		page_frag_free(data);
+		page_frag_free(__netmem_address(netmem));
 		break;
 	case MEM_TYPE_PAGE_ORDER0:
-		page = virt_to_page(data); /* Assumes order0 page*/
-		put_page(page);
+		put_page(__netmem_to_page(netmem));
 		break;
 	case MEM_TYPE_XSK_BUFF_POOL:
 		/* NB! Only valid from an xdp_buff! */
@@ -398,7 +456,7 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
 		break;
 	default:
 		/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
-		WARN(1, "Incorrect XDP memory type (%d) usage", mem->type);
+		WARN(1, "Incorrect XDP memory type (%d) usage", mem_type);
 		break;
 	}
 }
@@ -406,38 +464,34 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
 
 void xdp_return_frame(struct xdp_frame *xdpf)
 {
 	struct skb_shared_info *sinfo;
-	int i;
 
 	if (likely(!xdp_frame_has_frags(xdpf)))
 		goto out;
 
 	sinfo = xdp_get_shared_info_from_frame(xdpf);
-	for (i = 0; i < sinfo->nr_frags; i++) {
-		struct page *page = skb_frag_page(&sinfo->frags[i]);
+	for (u32 i = 0; i < sinfo->nr_frags; i++)
+		__xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
+			     false, NULL);
 
-		__xdp_return(page_address(page), &xdpf->mem, false, NULL);
-	}
 out:
-	__xdp_return(xdpf->data, &xdpf->mem, false, NULL);
+	__xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame);
 
 void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
 {
 	struct skb_shared_info *sinfo;
-	int i;
 
 	if (likely(!xdp_frame_has_frags(xdpf)))
 		goto out;
 
 	sinfo = xdp_get_shared_info_from_frame(xdpf);
-	for (i = 0; i < sinfo->nr_frags; i++) {
-		struct page *page = skb_frag_page(&sinfo->frags[i]);
+	for (u32 i = 0; i < sinfo->nr_frags; i++)
+		__xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
+			     true, NULL);
 
-		__xdp_return(page_address(page), &xdpf->mem, true, NULL);
-	}
 out:
-	__xdp_return(xdpf->data, &xdpf->mem, true, NULL);
+	__xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
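The three page-pool helpers above split memory-model registration into two steps. A minimal queue-setup sketch using them (rxq_setup() is hypothetical, assuming the driver owns one page_pool per Rx queue):

	static int rxq_setup(struct xdp_rxq_info *rxq, struct page_pool *pool)
	{
		int err;

		/* make the XDP layer aware of the pool ... */
		err = xdp_reg_page_pool(pool);
		if (err)
			return err;

		/* ... then bind it to this Rx queue's info */
		xdp_rxq_info_attach_page_pool(rxq, pool);
		return 0;
	}

Teardown runs in reverse: detach from the RxQ info first, then xdp_unreg_page_pool().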
@@ -451,46 +505,19 @@ EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
  * xdp_frame_bulk is usually stored/allocated on the function
  * call-stack to avoid locking penalties.
  */
-void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq)
-{
-	struct xdp_mem_allocator *xa = bq->xa;
-
-	if (unlikely(!xa || !bq->count))
-		return;
-
-	page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count);
-	/* bq->xa is not cleared to save lookup, if mem.id same in next bulk */
-	bq->count = 0;
-}
-EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk);
 
 /* Must be called with rcu_read_lock held */
 void xdp_return_frame_bulk(struct xdp_frame *xdpf,
 			   struct xdp_frame_bulk *bq)
 {
-	struct xdp_mem_info *mem = &xdpf->mem;
-	struct xdp_mem_allocator *xa;
-
-	if (mem->type != MEM_TYPE_PAGE_POOL) {
+	if (xdpf->mem_type != MEM_TYPE_PAGE_POOL) {
 		xdp_return_frame(xdpf);
 		return;
 	}
 
-	xa = bq->xa;
-	if (unlikely(!xa)) {
-		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
-		bq->count = 0;
-		bq->xa = xa;
-	}
-
 	if (bq->count == XDP_BULK_QUEUE_SIZE)
 		xdp_flush_frame_bulk(bq);
 
-	if (unlikely(mem->id != xa->mem.id)) {
-		xdp_flush_frame_bulk(bq);
-		bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
-	}
-
 	if (unlikely(xdp_frame_has_frags(xdpf))) {
 		struct skb_shared_info *sinfo;
 		int i;
@@ -499,31 +526,40 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf,
 		for (i = 0; i < sinfo->nr_frags; i++) {
 			skb_frag_t *frag = &sinfo->frags[i];
 
-			bq->q[bq->count++] = skb_frag_address(frag);
+			bq->q[bq->count++] = skb_frag_netmem(frag);
 			if (bq->count == XDP_BULK_QUEUE_SIZE)
 				xdp_flush_frame_bulk(bq);
 		}
 	}
-	bq->q[bq->count++] = xdpf->data;
+	bq->q[bq->count++] = virt_to_netmem(xdpf->data);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
 
+/**
+ * xdp_return_frag -- free one XDP frag or decrement its refcount
+ * @netmem: network memory reference to release
+ * @xdp: &xdp_buff to release the frag for
+ */
+void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp)
+{
+	__xdp_return(netmem, xdp->rxq->mem.type, true, NULL);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frag);
+
 void xdp_return_buff(struct xdp_buff *xdp)
 {
 	struct skb_shared_info *sinfo;
-	int i;
 
 	if (likely(!xdp_buff_has_frags(xdp)))
 		goto out;
 
 	sinfo = xdp_get_shared_info_from_buff(xdp);
-	for (i = 0; i < sinfo->nr_frags; i++) {
-		struct page *page = skb_frag_page(&sinfo->frags[i]);
+	for (u32 i = 0; i < sinfo->nr_frags; i++)
+		__xdp_return(skb_frag_netmem(&sinfo->frags[i]),
+			     xdp->rxq->mem.type, true, xdp);
 
-		__xdp_return(page_address(page), &xdp->rxq->mem, true, xdp);
-	}
 out:
-	__xdp_return(xdp->data, &xdp->rxq->mem, true, xdp);
+	__xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp);
 }
 EXPORT_SYMBOL_GPL(xdp_return_buff);
 
@@ -569,7 +605,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
 	xdpf->headroom = 0;
 	xdpf->metasize = metasize;
 	xdpf->frame_sz = PAGE_SIZE;
-	xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
+	xdpf->mem_type = MEM_TYPE_PAGE_ORDER0;
 
 	xsk_buff_free(xdp);
 	return xdpf;
@@ -583,15 +619,177 @@ void xdp_warn(const char *msg, const char *func, const int line)
 };
 EXPORT_SYMBOL_GPL(xdp_warn);
 
-int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp)
+/**
+ * xdp_build_skb_from_buff - create an skb from &xdp_buff
+ * @xdp: &xdp_buff to convert to an skb
+ *
+ * Perform common operations to create a new skb to pass up the stack from
+ * &xdp_buff: allocate an skb head from the NAPI percpu cache, initialize
+ * skb data pointers and offsets, set the recycle bit if the buff is
+ * PP-backed, record the Rx queue index, set the protocol and update
+ * frags info.
+ *
+ * Return: new &sk_buff on success, %NULL on error.
+ */
+struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp)
 {
-	n_skb = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, n_skb, skbs);
-	if (unlikely(!n_skb))
-		return -ENOMEM;
+	const struct xdp_rxq_info *rxq = xdp->rxq;
+	const struct skb_shared_info *sinfo;
+	struct sk_buff *skb;
+	u32 nr_frags = 0;
+	int metalen;
 
-	return 0;
+	if (unlikely(xdp_buff_has_frags(xdp))) {
+		sinfo = xdp_get_shared_info_from_buff(xdp);
+		nr_frags = sinfo->nr_frags;
+	}
+
+	skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz);
+	if (unlikely(!skb))
+		return NULL;
+
+	skb_reserve(skb, xdp->data - xdp->data_hard_start);
+	__skb_put(skb, xdp->data_end - xdp->data);
+
+	metalen = xdp->data - xdp->data_meta;
+	if (metalen > 0)
+		skb_metadata_set(skb, metalen);
+
+	if (rxq->mem.type == MEM_TYPE_PAGE_POOL)
+		skb_mark_for_recycle(skb);
+
+	skb_record_rx_queue(skb, rxq->queue_index);
+
+	if (unlikely(nr_frags)) {
+		u32 tsize;
+
+		tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz;
+		xdp_update_skb_shared_info(skb, nr_frags,
+					   sinfo->xdp_frags_size, tsize,
+					   xdp_buff_is_frag_pfmemalloc(xdp));
+	}
+
+	skb->protocol = eth_type_trans(skb, rxq->dev);
+
+	return skb;
+}
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_buff);
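xdp_build_skb_from_buff() folds the napi_build_skb()/skb_reserve()/__skb_put()/eth_type_trans() boilerplate drivers used to open-code. A hypothetical XDP_PASS path sketch, not taken from any particular driver:

	static void rx_pass_to_stack(struct napi_struct *napi, struct xdp_buff *xdp)
	{
		struct sk_buff *skb;

		skb = xdp_build_skb_from_buff(xdp);
		if (unlikely(!skb)) {
			/* head and any frags go back to their memory provider */
			xdp_return_buff(xdp);
			return;
		}

		napi_gro_receive(napi, skb);
	}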
+
+/**
+ * xdp_copy_frags_from_zc - copy frags from XSk buff to skb
+ * @skb: skb to copy frags to
+ * @xdp: XSk &xdp_buff from which the frags will be copied
+ * @pp: &page_pool backing page allocation, if available
+ *
+ * Copy all frags from XSk &xdp_buff to the skb to pass it up the stack.
+ * Allocate a new buffer for each frag, copy it and attach to the skb.
+ *
+ * Return: true on success, false on netmem allocation fail.
+ */
+static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb,
+					    const struct xdp_buff *xdp,
+					    struct page_pool *pp)
+{
+	struct skb_shared_info *sinfo = skb_shinfo(skb);
+	const struct skb_shared_info *xinfo;
+	u32 nr_frags, tsize = 0;
+	bool pfmemalloc = false;
+
+	xinfo = xdp_get_shared_info_from_buff(xdp);
+	nr_frags = xinfo->nr_frags;
+
+	for (u32 i = 0; i < nr_frags; i++) {
+		const skb_frag_t *frag = &xinfo->frags[i];
+		u32 len = skb_frag_size(frag);
+		u32 offset, truesize = len;
+		struct page *page;
+
+		page = page_pool_dev_alloc(pp, &offset, &truesize);
+		if (unlikely(!page)) {
+			sinfo->nr_frags = i;
+			return false;
+		}
+
+		memcpy(page_address(page) + offset, skb_frag_address(frag),
+		       LARGEST_ALIGN(len));
+		__skb_fill_page_desc_noacc(sinfo, i, page, offset, len);
+
+		tsize += truesize;
+		pfmemalloc |= page_is_pfmemalloc(page);
+	}
+
+	xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size,
+				   tsize, pfmemalloc);
+
+	return true;
+}
+
+/**
+ * xdp_build_skb_from_zc - create an skb from XSk &xdp_buff
+ * @xdp: source XSk buff
+ *
+ * Similar to xdp_build_skb_from_buff(), but for XSk frames. Allocate an skb
+ * head, new buffer for the head, copy the data and initialize the skb fields.
+ * If there are frags, allocate new buffers for them and copy.
+ * Buffers are allocated from the system percpu pools to try recycling them.
+ * If a new skb was built successfully, @xdp is returned to the XSk pool's
+ * freelist. On error, it remains untouched and the caller must take care
+ * of this.
+ *
+ * Return: new &sk_buff on success, %NULL on error.
+ */
+struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp)
+{
+	const struct xdp_rxq_info *rxq = xdp->rxq;
+	u32 len = xdp->data_end - xdp->data_meta;
+	u32 truesize = xdp->frame_sz;
+	struct sk_buff *skb = NULL;
+	struct page_pool *pp;
+	int metalen;
+	void *data;
+
+	if (!IS_ENABLED(CONFIG_PAGE_POOL))
+		return NULL;
+
+	local_lock_nested_bh(&system_page_pool.bh_lock);
+	pp = this_cpu_read(system_page_pool.pool);
+	data = page_pool_dev_alloc_va(pp, &truesize);
+	if (unlikely(!data))
+		goto out;
+
+	skb = napi_build_skb(data, truesize);
+	if (unlikely(!skb)) {
+		page_pool_free_va(pp, data, true);
+		goto out;
+	}
+
+	skb_mark_for_recycle(skb);
+	skb_reserve(skb, xdp->data_meta - xdp->data_hard_start);
+
+	memcpy(__skb_put(skb, len), xdp->data_meta, LARGEST_ALIGN(len));
+
+	metalen = xdp->data - xdp->data_meta;
+	if (metalen > 0) {
+		skb_metadata_set(skb, metalen);
+		__skb_pull(skb, metalen);
+	}
+
+	skb_record_rx_queue(skb, rxq->queue_index);
+
+	if (unlikely(xdp_buff_has_frags(xdp)) &&
+	    unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) {
+		napi_consume_skb(skb, true);
+		skb = NULL;
+		goto out;
+	}
+
+	xsk_buff_free(xdp);
+
+	skb->protocol = eth_type_trans(skb, rxq->dev);
+
+out:
+	local_unlock_nested_bh(&system_page_pool.bh_lock);
+	return skb;
 }
-EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk);
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc);
 
 struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 					   struct sk_buff *skb,
@@ -639,7 +837,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
 	 * - RX ring dev queue index	(skb_record_rx_queue)
 	 */
 
-	if (xdpf->mem.type == MEM_TYPE_PAGE_POOL)
+	if (xdpf->mem_type == MEM_TYPE_PAGE_POOL)
 		skb_mark_for_recycle(skb);
 
 	/* Allow SKB to reuse area used by xdp_frame */
@@ -686,8 +884,7 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
 	nxdpf = addr;
 	nxdpf->data = addr + headroom;
 	nxdpf->frame_sz = PAGE_SIZE;
-	nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
-	nxdpf->mem.id = 0;
+	nxdpf->mem_type = MEM_TYPE_PAGE_ORDER0;
 
 	return nxdpf;
 }
@@ -800,34 +997,60 @@ static int __init xdp_metadata_init(void)
 }
 late_initcall(xdp_metadata_init);
 
-void xdp_set_features_flag(struct net_device *dev, xdp_features_t val)
+void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val)
 {
 	val &= NETDEV_XDP_ACT_MASK;
 	if (dev->xdp_features == val)
 		return;
 
+	netdev_assert_locked_or_invisible(dev);
 	dev->xdp_features = val;
 
 	if (dev->reg_state == NETREG_REGISTERED)
 		call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev);
 }
+EXPORT_SYMBOL_GPL(xdp_set_features_flag_locked);
+
+void xdp_set_features_flag(struct net_device *dev, xdp_features_t val)
+{
+	netdev_lock(dev);
+	xdp_set_features_flag_locked(dev, val);
+	netdev_unlock(dev);
+}
 EXPORT_SYMBOL_GPL(xdp_set_features_flag);
 
-void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg)
+void xdp_features_set_redirect_target_locked(struct net_device *dev,
+					     bool support_sg)
 {
 	xdp_features_t val = (dev->xdp_features | NETDEV_XDP_ACT_NDO_XMIT);
 
 	if (support_sg)
 		val |= NETDEV_XDP_ACT_NDO_XMIT_SG;
-	xdp_set_features_flag(dev, val);
+	xdp_set_features_flag_locked(dev, val);
+}
+EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target_locked);
+
+void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg)
+{
+	netdev_lock(dev);
+	xdp_features_set_redirect_target_locked(dev, support_sg);
+	netdev_unlock(dev);
 }
 EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target);
 
-void xdp_features_clear_redirect_target(struct net_device *dev)
+void xdp_features_clear_redirect_target_locked(struct net_device *dev)
 {
 	xdp_features_t val = dev->xdp_features;
 
 	val &= ~(NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG);
-	xdp_set_features_flag(dev, val);
+	xdp_set_features_flag_locked(dev, val);
+}
+EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target_locked);
+
+void xdp_features_clear_redirect_target(struct net_device *dev)
+{
+	netdev_lock(dev);
+	xdp_features_clear_redirect_target_locked(dev);
+	netdev_unlock(dev);
 }
 EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target);
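The _locked variants exist for paths where the caller already holds the netdev instance lock. A hypothetical sketch of a driver setting its capabilities from such a context (the flag values are illustrative, and holding the lock is an assumption of the sketch):

	static void drv_set_xdp_caps(struct net_device *dev)
	{
		/* caller is assumed to hold the instance lock, e.g. during setup */
		xdp_set_features_flag_locked(dev, NETDEV_XDP_ACT_BASIC |
						  NETDEV_XDP_ACT_REDIRECT);
		xdp_features_set_redirect_target_locked(dev, false);
	}

The unlocked wrappers simply take netdev_lock()/netdev_unlock() around the same calls.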
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
deleted file mode 100644
index 0c7d2f66ba27..000000000000
--- a/net/dccp/Kconfig
+++ /dev/null
@@ -1,46 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-menuconfig IP_DCCP
-	tristate "The DCCP Protocol"
-	depends on INET
-	help
-	  Datagram Congestion Control Protocol (RFC 4340)
-
-	  From https://www.ietf.org/rfc/rfc4340.txt:
-
-	  The Datagram Congestion Control Protocol (DCCP) is a transport
-	  protocol that implements bidirectional, unicast connections of
-	  congestion-controlled, unreliable datagrams. It should be suitable
-	  for use by applications such as streaming media, Internet telephony,
-	  and on-line games.
-
-	  To compile this protocol support as a module, choose M here: the
-	  module will be called dccp.
-
-	  If in doubt, say N.
-
-if IP_DCCP
-
-config INET_DCCP_DIAG
-	depends on INET_DIAG
-	def_tristate y if (IP_DCCP = y && INET_DIAG = y)
-	def_tristate m
-
-source "net/dccp/ccids/Kconfig"
-
-menu "DCCP Kernel Hacking"
-	depends on DEBUG_KERNEL=y
-
-config IP_DCCP_DEBUG
-	bool "DCCP debug messages"
-	help
-	  Only use this if you're hacking DCCP.
-
-	  When compiling DCCP as a module, this debugging output can be toggled
-	  by setting the parameter dccp_debug of the `dccp' module to 0 or 1.
-
-	  Just say N.
-
-
-endmenu
-
-endif # IP_DDCP
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
deleted file mode 100644
index 5b4ff37bc806..000000000000
--- a/net/dccp/Makefile
+++ /dev/null
@@ -1,30 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
-
-dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \
-	  qpolicy.o
-#
-# CCID algorithms to be used by dccp.ko
-#
-# CCID-2 is default (RFC 4340, p. 77) and has Ack Vectors as dependency
-dccp-y += ccids/ccid2.o ackvec.o
-dccp-$(CONFIG_IP_DCCP_CCID3) += ccids/ccid3.o
-dccp-$(CONFIG_IP_DCCP_TFRC_LIB) += ccids/lib/tfrc.o \
-				   ccids/lib/tfrc_equation.o \
-				   ccids/lib/packet_history.o \
-				   ccids/lib/loss_interval.o
-
-dccp_ipv4-y := ipv4.o
-
-# build dccp_ipv6 as module whenever either IPv6 or DCCP is a module
-obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o
-dccp_ipv6-y := ipv6.o
-
-obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
-
-dccp-$(CONFIG_SYSCTL) += sysctl.o
-
-dccp_diag-y := diag.o
-
-# build with local directory for trace.h
-CFLAGS_proto.o := -I$(src)
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
deleted file mode 100644
index 1cba001bb4c8..000000000000
--- a/net/dccp/ackvec.c
+++ /dev/null
@@ -1,403 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  net/dccp/ackvec.c
- *
- *  An implementation of Ack Vectors for the DCCP protocol
- *  Copyright (c) 2007 University of Aberdeen, Scotland, UK
- *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
- */
-#include "dccp.h"
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-
-static struct kmem_cache *dccp_ackvec_slab;
-static struct kmem_cache *dccp_ackvec_record_slab;
-
-struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
-{
-	struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority);
-
-	if (av != NULL) {
-		av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1;
-		INIT_LIST_HEAD(&av->av_records);
-	}
-	return av;
-}
-
-static void dccp_ackvec_purge_records(struct dccp_ackvec *av)
-{
-	struct dccp_ackvec_record *cur, *next;
-
-	list_for_each_entry_safe(cur, next, &av->av_records, avr_node)
-		kmem_cache_free(dccp_ackvec_record_slab, cur);
-	INIT_LIST_HEAD(&av->av_records);
-}
-
-void dccp_ackvec_free(struct dccp_ackvec *av)
-{
-	if (likely(av != NULL)) {
-		dccp_ackvec_purge_records(av);
-		kmem_cache_free(dccp_ackvec_slab, av);
-	}
-}
-
-/**
- * dccp_ackvec_update_records  -  Record information about sent Ack Vectors
- * @av:		Ack Vector records to update
- * @seqno:	Sequence number of the packet carrying the Ack Vector just sent
- * @nonce_sum:	The sum of all buffer nonces contained in the Ack Vector
- */
-int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
-{
-	struct dccp_ackvec_record *avr;
-
-	avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
-	if (avr == NULL)
-		return -ENOBUFS;
-
-	avr->avr_ack_seqno  = seqno;
-	avr->avr_ack_ptr    = av->av_buf_head;
-	avr->avr_ack_ackno  = av->av_buf_ackno;
-	avr->avr_ack_nonce  = nonce_sum;
-	avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
-	/*
-	 * When the buffer overflows, we keep no more than one record. This is
-	 * the simplest way of disambiguating sender-Acks dating from before the
-	 * overflow from sender-Acks which refer to after the overflow; a simple
-	 * solution is preferable here since we are handling an exception.
-	 */
-	if (av->av_overflow)
-		dccp_ackvec_purge_records(av);
-	/*
-	 * Since GSS is incremented for each packet, the list is automatically
-	 * arranged in descending order of @ack_seqno.
-	 */
-	list_add(&avr->avr_node, &av->av_records);
-
-	dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n",
-		      (unsigned long long)avr->avr_ack_seqno,
-		      (unsigned long long)avr->avr_ack_ackno,
-		      avr->avr_ack_runlen);
-	return 0;
-}
-
-static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list,
-						     const u64 ackno)
-{
-	struct dccp_ackvec_record *avr;
-	/*
-	 * Exploit that records are inserted in descending order of sequence
-	 * number, start with the oldest record first. If @ackno is `before'
-	 * the earliest ack_ackno, the packet is too old to be considered.
-	 */
-	list_for_each_entry_reverse(avr, av_list, avr_node) {
-		if (avr->avr_ack_seqno == ackno)
-			return avr;
-		if (before48(ackno, avr->avr_ack_seqno))
-			break;
-	}
-	return NULL;
-}
-
-/*
- * Buffer index and length computation using modulo-buffersize arithmetic.
- * Note that, as pointers move from right to left, head is `before' tail.
- */
-static inline u16 __ackvec_idx_add(const u16 a, const u16 b)
-{
-	return (a + b) % DCCPAV_MAX_ACKVEC_LEN;
-}
-
-static inline u16 __ackvec_idx_sub(const u16 a, const u16 b)
-{
-	return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b);
-}
-
-u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
-{
-	if (unlikely(av->av_overflow))
-		return DCCPAV_MAX_ACKVEC_LEN;
-	return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
-}
-
-/**
- * dccp_ackvec_update_old  -  Update previous state as per RFC 4340, 11.4.1
- * @av:		non-empty buffer to update
- * @distance:	negative or zero distance of @seqno from buf_ackno downward
- * @seqno:	the (old) sequence number whose record is to be updated
- * @state:	state in which packet carrying @seqno was received
- */
-static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance,
-				   u64 seqno, enum dccp_ackvec_states state)
-{
-	u16 ptr = av->av_buf_head;
-
-	BUG_ON(distance > 0);
-	if (unlikely(dccp_ackvec_is_empty(av)))
-		return;
-
-	do {
-		u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr);
-
-		if (distance + runlen >= 0) {
-			/*
-			 * Only update the state if packet has not been received
-			 * yet. This is OK as per the second table in RFC 4340,
-			 * 11.4.1; i.e. here we are using the following table:
-			 *                     RECEIVED
-			 *                      0   1   3
-			 *              S     +---+---+---+
-			 *              T   0 | 0 | 0 | 0 |
-			 *              O     +---+---+---+
-			 *              R   1 | 1 | 1 | 1 |
-			 *              E     +---+---+---+
-			 *              D   3 | 0 | 1 | 3 |
-			 *                    +---+---+---+
-			 * The "Not Received" state was set by reserve_seats().
-			 */
-			if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED)
-				av->av_buf[ptr] = state;
-			else
-				dccp_pr_debug("Not changing %llu state to %u\n",
-					      (unsigned long long)seqno, state);
-			break;
-		}
-
-		distance += runlen + 1;
-		ptr	  = __ackvec_idx_add(ptr, 1);
-
-	} while (ptr != av->av_buf_tail);
-}
-
-/* Mark @num entries after buf_head as "Not yet received". */
-static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num)
-{
-	u16 start = __ackvec_idx_add(av->av_buf_head, 1),
-	    len	  = DCCPAV_MAX_ACKVEC_LEN - start;
-
-	/* check for buffer wrap-around */
-	if (num > len) {
-		memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len);
-		start = 0;
-		num  -= len;
-	}
-	if (num)
-		memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num);
-}
-
-/**
- * dccp_ackvec_add_new  -  Record one or more new entries in Ack Vector buffer
- * @av: container of buffer to update (can be empty or non-empty)
- * @num_packets: number of packets to register (must be >= 1)
- * @seqno: sequence number of the first packet in @num_packets
- * @state: state in which packet carrying @seqno was received
- */
-static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
-				u64 seqno, enum dccp_ackvec_states state)
-{
-	u32 num_cells = num_packets;
-
-	if (num_packets > DCCPAV_BURST_THRESH) {
-		u32 lost_packets = num_packets - 1;
-
-		DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets);
-		/*
-		 * We received 1 packet and have a loss of size "num_packets-1"
-		 * which we squeeze into num_cells-1 rather than reserving an
-		 * entire byte for each lost packet.
-		 * The reason is that the vector grows in O(burst_length); when
-		 * it grows too large there will no room left for the payload.
-		 * This is a trade-off: if a few packets out of the burst show
-		 * up later, their state will not be changed; it is simply too
-		 * costly to reshuffle/reallocate/copy the buffer each time.
-		 * Should such problems persist, we will need to switch to a
-		 * different underlying data structure.
-		 */
-		for (num_packets = num_cells = 1; lost_packets; ++num_cells) {
-			u8 len = min_t(u32, lost_packets, DCCPAV_MAX_RUNLEN);
-
-			av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1);
-			av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len;
-
-			lost_packets -= len;
-		}
-	}
-
-	if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) {
-		DCCP_CRIT("Ack Vector buffer overflow: dropping old entries");
-		av->av_overflow = true;
-	}
-
-	av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets);
-	if (av->av_overflow)
-		av->av_buf_tail = av->av_buf_head;
-
-	av->av_buf[av->av_buf_head] = state;
-	av->av_buf_ackno	    = seqno;
-
-	if (num_packets > 1)
-		dccp_ackvec_reserve_seats(av, num_packets - 1);
-}
-
-/**
- * dccp_ackvec_input  -  Register incoming packet in the buffer
- * @av: Ack Vector to register packet to
- * @skb: Packet to register
- */
-void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
-{
-	u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
-	enum dccp_ackvec_states state = DCCPAV_RECEIVED;
-
-	if (dccp_ackvec_is_empty(av)) {
-		dccp_ackvec_add_new(av, 1, seqno, state);
-		av->av_tail_ackno = seqno;
-
-	} else {
-		s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno);
-		u8 *current_head = av->av_buf + av->av_buf_head;
-
-		if (num_packets == 1 &&
-		    dccp_ackvec_state(current_head) == state &&
-		    dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) {
-
-			*current_head += 1;
-			av->av_buf_ackno = seqno;
-
-		} else if (num_packets > 0) {
-			dccp_ackvec_add_new(av, num_packets, seqno, state);
-		} else {
-			dccp_ackvec_update_old(av, num_packets, seqno, state);
-		}
-	}
-}
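All of the buffer arithmetic above is modulo DCCPAV_MAX_ACKVEC_LEN, with the head moving towards lower indices as new packets arrive. A self-contained userspace sketch of the two index helpers (AV_LEN is a stand-in value, not the kernel constant):

	#include <stdint.h>
	#include <stdio.h>

	#define AV_LEN 506	/* stand-in for DCCPAV_MAX_ACKVEC_LEN */

	static uint16_t idx_add(uint16_t a, uint16_t b) { return (a + b) % AV_LEN; }
	static uint16_t idx_sub(uint16_t a, uint16_t b) { return idx_add(a, AV_LEN - b); }

	int main(void)
	{
		printf("%u\n", idx_sub(0, 1));	  /* 505: head wraps around           */
		printf("%u\n", idx_sub(10, 500)); /* 16: buflen for tail=10, head=500 */
		return 0;
	}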
-
-/**
- * dccp_ackvec_clear_state  -  Perform house-keeping / garbage-collection
- * @av: Ack Vector record to clean
- * @ackno: last Ack Vector which has been acknowledged
- *
- * This routine is called when the peer acknowledges the receipt of Ack Vectors
- * up to and including @ackno. While based on section A.3 of RFC 4340, here
- * are additional precautions to prevent corrupted buffer state. In particular,
- * we use tail_ackno to identify outdated records; it always marks the earliest
- * packet of group (2) in 11.4.2.
- */
-void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno)
-{
-	struct dccp_ackvec_record *avr, *next;
-	u8 runlen_now, eff_runlen;
-	s64 delta;
-
-	avr = dccp_ackvec_lookup(&av->av_records, ackno);
-	if (avr == NULL)
-		return;
-	/*
-	 * Deal with outdated acknowledgments: this arises when e.g. there are
-	 * several old records and the acks from the peer come in slowly. In
-	 * that case we may still have records that pre-date tail_ackno.
-	 */
-	delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno);
-	if (delta < 0)
-		goto free_records;
-	/*
-	 * Deal with overlapping Ack Vectors: don't subtract more than the
-	 * number of packets between tail_ackno and ack_ackno.
-	 */
-	eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen;
-
-	runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr);
-	/*
-	 * The run length of Ack Vector cells does not decrease over time. If
-	 * the run length is the same as at the time the Ack Vector was sent, we
-	 * free the ack_ptr cell. That cell can however not be freed if the run
-	 * length has increased: in this case we need to move the tail pointer
-	 * backwards (towards higher indices), to its next-oldest neighbour.
-	 */
-	if (runlen_now > eff_runlen) {
-
-		av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1;
-		av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1);
-
-		/* This move may not have cleared the overflow flag. */
-		if (av->av_overflow)
-			av->av_overflow = (av->av_buf_head == av->av_buf_tail);
-	} else {
-		av->av_buf_tail = avr->avr_ack_ptr;
-		/*
-		 * We have made sure that avr points to a valid cell within the
-		 * buffer. This cell is either older than head, or equals head
-		 * (empty buffer): in both cases we no longer have any overflow.
-		 */
-		av->av_overflow = 0;
-	}
-
-	/*
-	 * The peer has acknowledged up to and including ack_ackno. Hence the
-	 * first packet in group (2) of 11.4.2 is the successor of ack_ackno.
-	 */
-	av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1);
-
-free_records:
-	list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
-		list_del(&avr->avr_node);
-		kmem_cache_free(dccp_ackvec_record_slab, avr);
-	}
-}
-
-/*
- *	Routines to keep track of Ack Vectors received in an skb
- */
-int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce)
-{
-	struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC);
-
-	if (new == NULL)
-		return -ENOBUFS;
-	new->vec   = vec;
-	new->len   = len;
-	new->nonce = nonce;
-
-	list_add_tail(&new->node, head);
-	return 0;
-}
-EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add);
-
-void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks)
-{
-	struct dccp_ackvec_parsed *cur, *next;
-
-	list_for_each_entry_safe(cur, next, parsed_chunks, node)
-		kfree(cur);
-	INIT_LIST_HEAD(parsed_chunks);
-}
-EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup);
-
-int __init dccp_ackvec_init(void)
-{
-	dccp_ackvec_slab = KMEM_CACHE(dccp_ackvec, SLAB_HWCACHE_ALIGN);
-	if (dccp_ackvec_slab == NULL)
-		goto out_err;
-
-	dccp_ackvec_record_slab = KMEM_CACHE(dccp_ackvec_record, SLAB_HWCACHE_ALIGN);
-	if (dccp_ackvec_record_slab == NULL)
-		goto out_destroy_slab;
-
-	return 0;
-
-out_destroy_slab:
-	kmem_cache_destroy(dccp_ackvec_slab);
-	dccp_ackvec_slab = NULL;
-out_err:
-	DCCP_CRIT("Unable to create Ack Vector slab cache");
-	return -ENOBUFS;
-}
-
-void dccp_ackvec_exit(void)
-{
-	kmem_cache_destroy(dccp_ackvec_slab);
-	dccp_ackvec_slab = NULL;
-	kmem_cache_destroy(dccp_ackvec_record_slab);
-	dccp_ackvec_record_slab = NULL;
-}
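dccp_ackvec_clear_state() leans on 48-bit sequence arithmetic (dccp_delta_seqno(), before48() and ADD48() from dccp.h, which are not shown in this diff). A userspace sketch of the underlying wrap-around math, not the kernel's exact macros:

	#include <stdint.h>
	#include <stdio.h>

	/* delta from a to b in 48-bit space, sign-extended */
	static int64_t delta48(uint64_t a, uint64_t b)
	{
		return (int64_t)((b - a) << 16) >> 16;
	}

	int main(void)
	{
		uint64_t gss = 5, old = (1ULL << 48) - 2;	/* seqno that wrapped */

		printf("%lld\n", (long long)delta48(old, gss));	/* 7: old is older */
		printf("%d\n", delta48(gss, old) < 0);		/* 1: before48()   */
		return 0;
	}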
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
deleted file mode 100644
index d2c4220fb377..000000000000
--- a/net/dccp/ackvec.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-#ifndef _ACKVEC_H
-#define _ACKVEC_H
-/*
- *  net/dccp/ackvec.h
- *
- *  An implementation of Ack Vectors for the DCCP protocol
- *  Copyright (c) 2007 University of Aberdeen, Scotland, UK
- *  Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
- */
-
-#include <linux/dccp.h>
-#include <linux/compiler.h>
-#include <linux/list.h>
-#include <linux/types.h>
-
-/*
- * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN,
- * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1
- * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives
- * more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
- * The maximum value is bounded by the u16 types for indices and functions.
- */
-#define DCCPAV_NUM_ACKVECS	2
-#define DCCPAV_MAX_ACKVEC_LEN	(DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
-
-/* Estimated minimum average Ack Vector length - used for updating MPS */
-#define DCCPAV_MIN_OPTLEN	16
-
-/* Threshold for coping with large bursts of losses */
-#define DCCPAV_BURST_THRESH	(DCCPAV_MAX_ACKVEC_LEN / 8)
-
-enum dccp_ackvec_states {
-	DCCPAV_RECEIVED =	0x00,
-	DCCPAV_ECN_MARKED =	0x40,
-	DCCPAV_RESERVED =	0x80,
-	DCCPAV_NOT_RECEIVED =	0xC0
-};
-#define DCCPAV_MAX_RUNLEN	0x3F
-
-static inline u8 dccp_ackvec_runlen(const u8 *cell)
-{
-	return *cell & DCCPAV_MAX_RUNLEN;
-}
-
-static inline u8 dccp_ackvec_state(const u8 *cell)
-{
-	return *cell & ~DCCPAV_MAX_RUNLEN;
-}
-
-/**
- * struct dccp_ackvec - Ack Vector main data structure
- *
- * This implements a fixed-size circular buffer within an array and is largely
- * based on Appendix A of RFC 4340.
- *
- * @av_buf:	   circular buffer storage area
- * @av_buf_head:   head index; begin of live portion in @av_buf
- * @av_buf_tail:   tail index; first index _after_ the live portion in @av_buf
- * @av_buf_ackno:  highest seqno of acknowledgeable packet recorded in @av_buf
- * @av_tail_ackno: lowest  seqno of acknowledgeable packet recorded in @av_buf
- * @av_buf_nonce:  ECN nonce sums, each covering subsequent segments of up to
- *		   %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
- * @av_overflow:   if 1 then buf_head == buf_tail indicates buffer wraparound
- * @av_records:	   list of %dccp_ackvec_record (Ack Vectors sent previously)
- */
-struct dccp_ackvec {
-	u8			av_buf[DCCPAV_MAX_ACKVEC_LEN];
-	u16			av_buf_head;
-	u16			av_buf_tail;
-	u64			av_buf_ackno:48;
-	u64			av_tail_ackno:48;
-	bool			av_buf_nonce[DCCPAV_NUM_ACKVECS];
-	u8			av_overflow:1;
-	struct list_head	av_records;
-};
-
-/**
- * struct dccp_ackvec_record - Records information about sent Ack Vectors
- *
- * These list entries define the additional information which the HC-Receiver
- * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A.
- *
- * @avr_node:	    the list node in @av_records
- * @avr_ack_seqno:  sequence number of the packet the Ack Vector was sent on
- * @avr_ack_ackno:  the Ack number that this record/Ack Vector refers to
- * @avr_ack_ptr:    pointer into @av_buf where this record starts
- * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending
- * @avr_ack_nonce:  the sum of @av_buf_nonce's at the time this record was sent
- *
- * The list as a whole is sorted in descending order by @avr_ack_seqno.
- */
-struct dccp_ackvec_record {
-	struct list_head avr_node;
-	u64		 avr_ack_seqno:48;
-	u64		 avr_ack_ackno:48;
-	u16		 avr_ack_ptr;
-	u8		 avr_ack_runlen;
-	u8		 avr_ack_nonce:1;
-};
-
-int dccp_ackvec_init(void);
-void dccp_ackvec_exit(void);
-
-struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
-void dccp_ackvec_free(struct dccp_ackvec *av);
-
-void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb);
-int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
-void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
-u16 dccp_ackvec_buflen(const struct dccp_ackvec *av);
-
-static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
-{
-	return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
-}
-
-/**
- * struct dccp_ackvec_parsed  -  Record offsets of Ack Vectors in skb
- * @vec:	start of vector (offset into skb)
- * @len:	length of @vec
- * @nonce:	whether @vec had an ECN nonce of 0 or 1
- * @node:	FIFO - arranged in descending order of ack_ackno
- *
- * This structure is used by CCIDs to access Ack Vectors in a received skb.
- */
-struct dccp_ackvec_parsed {
-	u8		 *vec,
-			 len,
-			 nonce:1;
-	struct list_head node;
-};
-
-int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce);
-void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks);
-#endif /* _ACKVEC_H */
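Each Ack Vector cell packs the receive state into the top two bits and a run length into the low six, so one byte covers up to 64 consecutive packets in the same state. A self-contained sketch of the encoding, mirroring dccp_ackvec_runlen()/dccp_ackvec_state() above:

	#include <stdint.h>
	#include <stdio.h>

	#define MAX_RUNLEN	0x3F	/* low 6 bits, as DCCPAV_MAX_RUNLEN */
	#define ECN_MARKED	0x40	/* one of the dccp_ackvec_states    */

	int main(void)
	{
		uint8_t cell = ECN_MARKED | 5;	/* state + run length 5 */

		printf("state=0x%02x, covers %u packets\n",
		       cell & ~MAX_RUNLEN, (cell & MAX_RUNLEN) + 1);
		return 0;
	}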
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
deleted file mode 100644
index 6beac5d348e2..000000000000
--- a/net/dccp/ccid.c
+++ /dev/null
@@ -1,219 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  net/dccp/ccid.c
- *
- *  An implementation of the DCCP protocol
- *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- *  CCID infrastructure
- */
-
-#include <linux/slab.h>
-
-#include "ccid.h"
-#include "ccids/lib/tfrc.h"
-
-static struct ccid_operations *ccids[] = {
-	&ccid2_ops,
-#ifdef CONFIG_IP_DCCP_CCID3
-	&ccid3_ops,
-#endif
-};
-
-static struct ccid_operations *ccid_by_number(const u8 id)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(ccids); i++)
-		if (ccids[i]->ccid_id == id)
-			return ccids[i];
-	return NULL;
-}
-
-/* check that up to @array_len members in @ccid_array are supported */
-bool ccid_support_check(u8 const *ccid_array, u8 array_len)
-{
-	while (array_len > 0)
-		if (ccid_by_number(ccid_array[--array_len]) == NULL)
-			return false;
-	return true;
-}
-
-/**
- * ccid_get_builtin_ccids  -  Populate a list of built-in CCIDs
- * @ccid_array: pointer to copy into
- * @array_len: value to return length into
- *
- * This function allocates memory - caller must see that it is freed after use.
- */
-int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len)
-{
-	*ccid_array = kmalloc(ARRAY_SIZE(ccids), gfp_any());
-	if (*ccid_array == NULL)
-		return -ENOBUFS;
-
-	for (*array_len = 0; *array_len < ARRAY_SIZE(ccids); *array_len += 1)
-		(*ccid_array)[*array_len] = ccids[*array_len]->ccid_id;
-	return 0;
-}
-
-int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
-				  char __user *optval, int __user *optlen)
-{
-	u8 *ccid_array, array_len;
-	int err = 0;
-
-	if (ccid_get_builtin_ccids(&ccid_array, &array_len))
-		return -ENOBUFS;
-
-	if (put_user(array_len, optlen))
-		err = -EFAULT;
-	else if (len > 0 && copy_to_user(optval, ccid_array,
-					 len > array_len ? array_len : len))
-		err = -EFAULT;
-
-	kfree(ccid_array);
-	return err;
-}
-
-static __printf(3, 4) struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...)
-{
-	struct kmem_cache *slab;
-	va_list args;
-
-	va_start(args, fmt);
-	vsnprintf(slab_name_fmt, CCID_SLAB_NAME_LENGTH, fmt, args);
-	va_end(args);
-
-	slab = kmem_cache_create(slab_name_fmt, sizeof(struct ccid) + obj_size, 0,
-				 SLAB_HWCACHE_ALIGN, NULL);
-	return slab;
-}
-
-static void ccid_kmem_cache_destroy(struct kmem_cache *slab)
-{
-	kmem_cache_destroy(slab);
-}
-
-static int __init ccid_activate(struct ccid_operations *ccid_ops)
-{
-	int err = -ENOBUFS;
-
-	ccid_ops->ccid_hc_rx_slab =
-			ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size,
-					       ccid_ops->ccid_hc_rx_slab_name,
-					       "ccid%u_hc_rx_sock",
-					       ccid_ops->ccid_id);
-	if (ccid_ops->ccid_hc_rx_slab == NULL)
-		goto out;
-
-	ccid_ops->ccid_hc_tx_slab =
-			ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size,
-					       ccid_ops->ccid_hc_tx_slab_name,
-					       "ccid%u_hc_tx_sock",
-					       ccid_ops->ccid_id);
-	if (ccid_ops->ccid_hc_tx_slab == NULL)
-		goto out_free_rx_slab;
-
-	pr_info("DCCP: Activated CCID %d (%s)\n",
-		ccid_ops->ccid_id, ccid_ops->ccid_name);
-	err = 0;
-out:
-	return err;
-out_free_rx_slab:
-	ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
-	ccid_ops->ccid_hc_rx_slab = NULL;
-	goto out;
-}
-
-static void ccid_deactivate(struct ccid_operations *ccid_ops)
-{
-	ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab);
-	ccid_ops->ccid_hc_tx_slab = NULL;
-	ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
-	ccid_ops->ccid_hc_rx_slab = NULL;
-
-	pr_info("DCCP: Deactivated CCID %d (%s)\n",
-		ccid_ops->ccid_id, ccid_ops->ccid_name);
-}
-
-struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx)
-{
-	struct ccid_operations *ccid_ops = ccid_by_number(id);
-	struct ccid *ccid = NULL;
-
-	if (ccid_ops == NULL)
-		goto out;
-
-	ccid = kmem_cache_alloc(rx ? ccid_ops->ccid_hc_rx_slab :
-				     ccid_ops->ccid_hc_tx_slab, gfp_any());
-	if (ccid == NULL)
-		goto out;
-	ccid->ccid_ops = ccid_ops;
-	if (rx) {
-		memset(ccid + 1, 0, ccid_ops->ccid_hc_rx_obj_size);
-		if (ccid->ccid_ops->ccid_hc_rx_init != NULL &&
-		    ccid->ccid_ops->ccid_hc_rx_init(ccid, sk) != 0)
-			goto out_free_ccid;
-	} else {
-		memset(ccid + 1, 0, ccid_ops->ccid_hc_tx_obj_size);
-		if (ccid->ccid_ops->ccid_hc_tx_init != NULL &&
-		    ccid->ccid_ops->ccid_hc_tx_init(ccid, sk) != 0)
-			goto out_free_ccid;
-	}
-out:
-	return ccid;
-out_free_ccid:
-	kmem_cache_free(rx ? ccid_ops->ccid_hc_rx_slab :
-			     ccid_ops->ccid_hc_tx_slab, ccid);
-	ccid = NULL;
-	goto out;
-}
-
-void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk)
-{
-	if (ccid != NULL) {
-		if (ccid->ccid_ops->ccid_hc_rx_exit != NULL)
-			ccid->ccid_ops->ccid_hc_rx_exit(sk);
-		kmem_cache_free(ccid->ccid_ops->ccid_hc_rx_slab, ccid);
-	}
-}
-
-void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk)
-{
-	if (ccid != NULL) {
-		if (ccid->ccid_ops->ccid_hc_tx_exit != NULL)
-			ccid->ccid_ops->ccid_hc_tx_exit(sk);
-		kmem_cache_free(ccid->ccid_ops->ccid_hc_tx_slab, ccid);
-	}
-}
-
-int __init ccid_initialize_builtins(void)
-{
-	int i, err = tfrc_lib_init();
-
-	if (err)
-		return err;
-
-	for (i = 0; i < ARRAY_SIZE(ccids); i++) {
-		err = ccid_activate(ccids[i]);
-		if (err)
-			goto unwind_registrations;
-	}
-	return 0;
-
-unwind_registrations:
-	while(--i >= 0)
-		ccid_deactivate(ccids[i]);
-	tfrc_lib_exit();
-	return err;
-}
-
-void ccid_cleanup_builtins(void)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(ccids); i++)
-		ccid_deactivate(ccids[i]);
-	tfrc_lib_exit();
-}
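ccid_support_check() is the guard used during feature negotiation: a peer's CCID preference list is acceptable only if every entry resolves via ccid_by_number(). A hypothetical call-site sketch (validate_peer_ccids() is illustrative, not from the removed code):

	static int validate_peer_ccids(const u8 *list, u8 len)
	{
		if (!ccid_support_check(list, len))
			return -EOPNOTSUPP;	/* unknown CCID in the list */
		return 0;
	}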
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
deleted file mode 100644
index 105f3734dadb..000000000000
--- a/net/dccp/ccid.h
+++ /dev/null
@@ -1,262 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-#ifndef _CCID_H
-#define _CCID_H
-/*
- *  net/dccp/ccid.h
- *
- *  An implementation of the DCCP protocol
- *  Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- *  CCID infrastructure
- */
-
-#include <net/sock.h>
-#include <linux/compiler.h>
-#include <linux/dccp.h>
-#include <linux/list.h>
-#include <linux/module.h>
-
-/* maximum value for a CCID (RFC 4340, 19.5) */
-#define CCID_MAX		255
-#define CCID_SLAB_NAME_LENGTH	32
-
-struct tcp_info;
-
-/**
- *  struct ccid_operations  -  Interface to Congestion-Control Infrastructure
- *
- *  @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.)
- *  @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled)
- *  @ccid_name: alphabetical identifier string for @ccid_id
- *  @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection
- *  @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket
- *
- *  @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup)
- *  @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction)
- *  @ccid_hc_rx_packet_recv: implements the HC-receiver side
- *  @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options
- *  @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options
- *  @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender
- *  @ccid_hc_tx_send_packet: implements the sending part of the HC-sender
- *  @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender
- *  @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender
- *  @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender
- */
-struct ccid_operations {
-	unsigned char		ccid_id;
-	__u32			ccid_ccmps;
-	const char		*ccid_name;
-	struct kmem_cache	*ccid_hc_rx_slab,
-				*ccid_hc_tx_slab;
-	char			ccid_hc_rx_slab_name[CCID_SLAB_NAME_LENGTH];
-	char			ccid_hc_tx_slab_name[CCID_SLAB_NAME_LENGTH];
-	__u32			ccid_hc_rx_obj_size,
-				ccid_hc_tx_obj_size;
-	/* Interface Routines */
-	int		(*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk);
-	int		(*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk);
-	void		(*ccid_hc_rx_exit)(struct sock *sk);
-	void		(*ccid_hc_tx_exit)(struct sock *sk);
-	void		(*ccid_hc_rx_packet_recv)(struct sock *sk,
-						  struct sk_buff *skb);
-	int		(*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt,
-						    u8 opt, u8 *val, u8 len);
-	int		(*ccid_hc_rx_insert_options)(struct sock *sk,
-						     struct sk_buff *skb);
-	void		(*ccid_hc_tx_packet_recv)(struct sock *sk,
-						  struct sk_buff *skb);
-	int		(*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt,
-						    u8 opt, u8 *val, u8 len);
-	int		(*ccid_hc_tx_send_packet)(struct sock *sk,
-						  struct sk_buff *skb);
-	void		(*ccid_hc_tx_packet_sent)(struct sock *sk,
-						  unsigned int len);
-	void		(*ccid_hc_rx_get_info)(struct sock *sk,
-					       struct tcp_info *info);
-	void		(*ccid_hc_tx_get_info)(struct sock *sk,
-					       struct tcp_info *info);
-	int		(*ccid_hc_rx_getsockopt)(struct sock *sk,
-						 const int optname, int len,
-						 u32 __user *optval,
-						 int __user *optlen);
-	int		(*ccid_hc_tx_getsockopt)(struct sock *sk,
-						 const int optname, int len,
-						 u32 __user *optval,
-						 int __user *optlen);
-};
-
-extern struct ccid_operations ccid2_ops;
-#ifdef CONFIG_IP_DCCP_CCID3
-extern struct ccid_operations ccid3_ops;
-#endif
-
-int ccid_initialize_builtins(void);
-void ccid_cleanup_builtins(void);
-
-struct ccid {
-	struct ccid_operations *ccid_ops;
-	char		       ccid_priv[];
-};
-
-static inline void *ccid_priv(const struct ccid *ccid)
-{
-	return (void *)ccid->ccid_priv;
-}
-
-bool ccid_support_check(u8 const *ccid_array, u8 array_len);
-int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len);
-int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
-				  char __user *, int __user *);
-
-struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx);
-
-static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp)
-{
-	struct ccid *ccid = dp->dccps_hc_rx_ccid;
-
-	if (ccid == NULL || ccid->ccid_ops == NULL)
-		return -1;
-	return ccid->ccid_ops->ccid_id;
-}
-
-static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
-{
-	struct ccid *ccid = dp->dccps_hc_tx_ccid;
-
-	if (ccid == NULL || ccid->ccid_ops == NULL)
-		return -1;
-	return ccid->ccid_ops->ccid_id;
-}
-
-void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
-void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
-
-/*
- * Congestion control of queued data packets via CCID decision.
- *
- * The TX CCID performs its congestion-control by indicating whether and when a
- * queued packet may be sent, using the return code of ccid_hc_tx_send_packet().
- * The following modes are supported via the symbolic constants below:
- *  - timer-based pacing    (CCID returns a delay value in milliseconds);
- *  - autonomous dequeueing (CCID internally schedules dccps_xmitlet).
- */
-
-enum ccid_dequeueing_decision {
-	CCID_PACKET_SEND_AT_ONCE =	 0x00000,  /* "green light": no delay */
-	CCID_PACKET_DELAY_MAX =		 0x0FFFF,  /* maximum delay in msecs  */
-	CCID_PACKET_DELAY =		 0x10000,  /* CCID msec-delay mode    */
-	CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000,  /* CCID autonomous mode    */
-	CCID_PACKET_ERR =		 0xF0000,  /* error condition         */
-};
-
-static inline int ccid_packet_dequeue_eval(const int return_code)
-{
-	if (return_code < 0)
-		return CCID_PACKET_ERR;
-	if (return_code == 0)
-		return CCID_PACKET_SEND_AT_ONCE;
-	if (return_code <= CCID_PACKET_DELAY_MAX)
-		return CCID_PACKET_DELAY;
-	return return_code;
-}
-
-static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
-					 struct sk_buff *skb)
-{
-	if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
-		return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
-	return CCID_PACKET_SEND_AT_ONCE;
-}
-
-static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
-					  unsigned int len)
-{
-	if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
-		ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len);
-}
-
-static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
-					  struct sk_buff *skb)
-{
-	if (ccid->ccid_ops->ccid_hc_rx_packet_recv != NULL)
-		ccid->ccid_ops->ccid_hc_rx_packet_recv(sk, skb);
-}
-
-static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
-					  struct sk_buff *skb)
-{
-	if (ccid->ccid_ops->ccid_hc_tx_packet_recv != NULL)
-		ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
-}
-
-/**
- * ccid_hc_tx_parse_options  -  Parse CCID-specific options sent by the receiver
- * @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
- * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
- * @val: value of @opt
- * @len: length of @val in bytes
- */
-static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
-					   u8 pkt, u8 opt, u8 *val, u8 len)
-{
-	if (!ccid || !ccid->ccid_ops->ccid_hc_tx_parse_options)
-		return 0;
-	return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
-}
-
-/**
- * ccid_hc_rx_parse_options  -  Parse CCID-specific options sent by the sender
- * Arguments are analogous to ccid_hc_tx_parse_options()
- */
-static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
-					   u8 pkt, u8 opt, u8 *val, u8 len)
-{
-	if (!ccid || !ccid->ccid_ops->ccid_hc_rx_parse_options)
-		return 0;
-	return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
-}
-
-static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
-					    struct sk_buff *skb)
-{
-	if (ccid->ccid_ops->ccid_hc_rx_insert_options != NULL)
-		return ccid->ccid_ops->ccid_hc_rx_insert_options(sk, skb);
-	return 0;
-}
-
-static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk,
-				       struct tcp_info *info)
-{
-	if (ccid->ccid_ops->ccid_hc_rx_get_info != NULL)
-		ccid->ccid_ops->ccid_hc_rx_get_info(sk, info);
-}
-
-static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk,
-				       struct tcp_info *info)
-{
-	if (ccid->ccid_ops->ccid_hc_tx_get_info != NULL)
-		ccid->ccid_ops->ccid_hc_tx_get_info(sk, info);
-}
-
-static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk,
-					const int optname, int len,
-					u32 __user *optval, int __user *optlen)
-{
-	int rc = -ENOPROTOOPT;
-	if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL)
-		rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len,
-						 optval, optlen);
-	return rc;
-}
-
-static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk,
-					const int optname, int len,
-					u32 __user *optval, int __user *optlen)
-{
-	int rc = -ENOPROTOOPT;
-	if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL)
-		rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len,
-						 optval, optlen);
-	return rc;
-}
-#endif /* _CCID_H */
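ccid_packet_dequeue_eval() folds the overloaded return code of ccid_hc_tx_send_packet() into one of the symbolic decisions above. A hypothetical dispatch sketch (the commented actions stand in for the real DCCP output path):

	static void dequeue_decide(struct sock *sk, struct ccid *ccid,
				   struct sk_buff *skb)
	{
		int rc = ccid_hc_tx_send_packet(ccid, sk, skb);

		switch (ccid_packet_dequeue_eval(rc)) {
		case CCID_PACKET_SEND_AT_ONCE:		/* rc == 0 */
			/* transmit immediately */
			break;
		case CCID_PACKET_DELAY:			/* 0 < rc <= 0xFFFF msecs */
			/* arm a pacing timer for rc milliseconds */
			break;
		case CCID_PACKET_WILL_DEQUEUE_LATER:	/* CCID schedules xmitlet */
			break;
		default:				/* CCID_PACKET_ERR */
			/* drop the packet */
			break;
		}
	}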
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
deleted file mode 100644
index e3d388c33d25..000000000000
--- a/net/dccp/ccids/Kconfig
+++ /dev/null
@@ -1,55 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-menu "DCCP CCIDs Configuration"
-
-config IP_DCCP_CCID2_DEBUG
-	bool "CCID-2 debugging messages"
-	help
-	  Enable CCID-2 specific debugging messages.
-
-	  The debugging output can additionally be toggled by setting the
-	  ccid2_debug parameter to 0 or 1.
-
-	  If in doubt, say N.
-
-config IP_DCCP_CCID3
-	bool "CCID-3 (TCP-Friendly)"
-	default IP_DCCP = y || IP_DCCP = m
-	help
-	  CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based
-	  rate-controlled congestion control mechanism. TFRC is designed to
-	  be reasonably fair when competing for bandwidth with TCP-like flows,
-	  where a flow is "reasonably fair" if its sending rate is generally
-	  within a factor of two of the sending rate of a TCP flow under the
-	  same conditions. However, TFRC has a much lower variation of
-	  throughput over time compared with TCP, which makes CCID-3 more
-	  suitable than CCID-2 for applications such streaming media where a
-	  relatively smooth sending rate is of importance.
-
-	  CCID-3 is further described in RFC 4342,
-	  https://www.ietf.org/rfc/rfc4342.txt
-
-	  The TFRC congestion control algorithms were initially described in
-	  RFC 5348.
-
-	  This text was extracted from RFC 4340 (sec. 10.2),
-	  https://www.ietf.org/rfc/rfc4340.txt
-
-	  If in doubt, say N.
-
-config IP_DCCP_CCID3_DEBUG
-	bool "CCID-3 debugging messages"
-	depends on IP_DCCP_CCID3
-	help
-	  Enable CCID-3 specific debugging messages.
-
-	  The debugging output can additionally be toggled by setting the
-	  ccid3_debug parameter to 0 or 1.
-
-	  If in doubt, say N.
-
-config IP_DCCP_TFRC_LIB
-	def_bool y if IP_DCCP_CCID3
-
-config IP_DCCP_TFRC_DEBUG
-	def_bool y if IP_DCCP_CCID3_DEBUG
-endmenu
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
deleted file mode 100644
index d6b30700af67..000000000000
--- a/net/dccp/ccids/ccid2.c
+++ /dev/null
@@ -1,794 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- *  Copyright (c) 2005, 2006 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
- *
- *  Changes to meet Linux coding standards, and DCCP infrastructure fixes.
- *
- *  Copyright (c) 2006 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- */
-
-/*
- * This implementation should follow RFC 4341
- */
-#include <linux/slab.h>
-#include "../feat.h"
-#include "ccid2.h"
-
-
-#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
-static bool ccid2_debug;
-#define ccid2_pr_debug(format, a...)	DCCP_PR_DEBUG(ccid2_debug, format, ##a)
-#else
-#define ccid2_pr_debug(format, a...)
-#endif
-
-static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc)
-{
-	struct ccid2_seq *seqp;
-	int i;
-
-	/* check if we have space to preserve the pointer to the buffer */
-	if (hc->tx_seqbufc >= (sizeof(hc->tx_seqbuf) /
-			       sizeof(struct ccid2_seq *)))
-		return -ENOMEM;
-
-	/* allocate buffer and initialize linked list */
-	seqp = kmalloc_array(CCID2_SEQBUF_LEN, sizeof(struct ccid2_seq),
-			     gfp_any());
-	if (seqp == NULL)
-		return -ENOMEM;
-
-	for (i = 0; i < (CCID2_SEQBUF_LEN - 1); i++) {
-		seqp[i].ccid2s_next = &seqp[i + 1];
-		seqp[i + 1].ccid2s_prev = &seqp[i];
-	}
-	seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = seqp;
-	seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
-
-	/* This is the first allocation.  Initiate the head and tail.  */
-	if (hc->tx_seqbufc == 0)
-		hc->tx_seqh = hc->tx_seqt = seqp;
-	else {
-		/* link the existing list with the one we just created */
-		hc->tx_seqh->ccid2s_next = seqp;
-		seqp->ccid2s_prev = hc->tx_seqh;
-
-		hc->tx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
-		seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hc->tx_seqt;
-	}
-
-	/* store the original pointer to the buffer so we can free it */
-	hc->tx_seqbuf[hc->tx_seqbufc] = seqp;
-	hc->tx_seqbufc++;
-
-	return 0;
-}
-
-static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
-{
-	if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk)))
-		return CCID_PACKET_WILL_DEQUEUE_LATER;
-	return CCID_PACKET_SEND_AT_ONCE;
-}
-
-static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
-{
-	u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->tx_cwnd, 2);
-
-	/*
-	 * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from
-	 * RFC 4341, 6.1.2. We ignore the statement that Ack Ratio 2 is always
-	 * acceptable since this causes starvation/deadlock whenever cwnd < 2.
-	 * The same problem arises when Ack Ratio is 0 (ie. Ack Ratio disabled).
-	 */
-	if (val == 0 || val > max_ratio) {
-		DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio);
-		val = max_ratio;
-	}
-	dccp_feat_signal_nn_change(sk, DCCPF_ACK_RATIO,
-				   min_t(u32, val, DCCPF_ACK_RATIO_MAX));
-}
-
-static void ccid2_check_l_ack_ratio(struct sock *sk)
-{
-	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-
-	/*
-	 * After a loss, idle period, application limited period, or RTO we
-	 * need to check that the ack ratio is still less than the congestion
-	 * window. Otherwise, we will send an entire congestion window of
-	 * packets and got no response because we haven't sent ack ratio
-	 * packets yet.
-	 * If the ack ratio does need to be reduced, we reduce it to half of
-	 * the congestion window (or 1 if that's zero) instead of to the
-	 * congestion window. This prevents problems if one ack is lost.
-	 */
-	if (dccp_feat_nn_get(sk, DCCPF_ACK_RATIO) > hc->tx_cwnd)
-		ccid2_change_l_ack_ratio(sk, hc->tx_cwnd/2 ? : 1U);
-}
-
-static void ccid2_change_l_seq_window(struct sock *sk, u64 val)
-{
-	dccp_feat_signal_nn_change(sk, DCCPF_SEQUENCE_WINDOW,
-				   clamp_val(val, DCCPF_SEQ_WMIN,
-						  DCCPF_SEQ_WMAX));
-}
-
-static void dccp_tasklet_schedule(struct sock *sk)
-{
-	struct tasklet_struct *t = &dccp_sk(sk)->dccps_xmitlet;
-
-	if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
-		sock_hold(sk);
-		__tasklet_schedule(t);
-	}
-}
-
-static void ccid2_hc_tx_rto_expire(struct timer_list *t)
-{
-	struct ccid2_hc_tx_sock *hc = from_timer(hc, t, tx_rtotimer);
-	struct sock *sk = hc->sk;
-	const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
-
-	bh_lock_sock(sk);
-	if (sock_owned_by_user(sk)) {
-		sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + HZ / 5);
-		goto out;
-	}
-
-	ccid2_pr_debug("RTO_EXPIRE\n");
-
-	if (sk->sk_state == DCCP_CLOSED)
-		goto out;
-
-	/* back-off timer */
-	hc->tx_rto <<= 1;
-	if (hc->tx_rto > DCCP_RTO_MAX)
-		hc->tx_rto = DCCP_RTO_MAX;
-
-	/* adjust pipe, cwnd etc */
-	hc->tx_ssthresh = hc->tx_cwnd / 2;
-	if (hc->tx_ssthresh < 2)
-		hc->tx_ssthresh = 2;
-	hc->tx_cwnd	= 1;
-	hc->tx_pipe	= 0;
-
-	/* clear state about stuff we sent */
-	hc->tx_seqt = hc->tx_seqh;
-	hc->tx_packets_acked = 0;
-
-	/* clear ack ratio state. */
-	hc->tx_rpseq    = 0;
-	hc->tx_rpdupack = -1;
-	ccid2_change_l_ack_ratio(sk, 1);
-
-	/* if we were blocked before, we may now send cwnd=1 packet */
-	if (sender_was_blocked)
-		dccp_tasklet_schedule(sk);
-	/* restart backed-off timer */
-	sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
-out:
-	bh_unlock_sock(sk);
-	sock_put(sk);
-}
-
-/*
- *	Congestion window validation (RFC 2861).
- */
-static bool ccid2_do_cwv = true;
-module_param(ccid2_do_cwv, bool, 0644);
-MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation");
- */ -static void ccid2_update_used_window(struct ccid2_hc_tx_sock *hc, u32 new_wnd) -{ - hc->tx_expected_wnd = (3 * hc->tx_expected_wnd + new_wnd) / 4; -} - -/* This borrows the code of tcp_cwnd_application_limited() */ -static void ccid2_cwnd_application_limited(struct sock *sk, const u32 now) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - /* don't reduce cwnd below the initial window (IW) */ - u32 init_win = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache), - win_used = max(hc->tx_cwnd_used, init_win); - - if (win_used < hc->tx_cwnd) { - hc->tx_ssthresh = max(hc->tx_ssthresh, - (hc->tx_cwnd >> 1) + (hc->tx_cwnd >> 2)); - hc->tx_cwnd = (hc->tx_cwnd + win_used) >> 1; - } - hc->tx_cwnd_used = 0; - hc->tx_cwnd_stamp = now; - - ccid2_check_l_ack_ratio(sk); -} - -/* This borrows the code of tcp_cwnd_restart() */ -static void ccid2_cwnd_restart(struct sock *sk, const u32 now) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - u32 cwnd = hc->tx_cwnd, restart_cwnd, - iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache); - s32 delta = now - hc->tx_lsndtime; - - hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2)); - - /* don't reduce cwnd below the initial window (IW) */ - restart_cwnd = min(cwnd, iwnd); - - while ((delta -= hc->tx_rto) >= 0 && cwnd > restart_cwnd) - cwnd >>= 1; - hc->tx_cwnd = max(cwnd, restart_cwnd); - hc->tx_cwnd_stamp = now; - hc->tx_cwnd_used = 0; - - ccid2_check_l_ack_ratio(sk); -} - -static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - const u32 now = ccid2_jiffies32; - struct ccid2_seq *next; - - /* slow-start after idle periods (RFC 2581, RFC 2861) */ - if (ccid2_do_cwv && !hc->tx_pipe && - (s32)(now - hc->tx_lsndtime) >= hc->tx_rto) - ccid2_cwnd_restart(sk, now); - - hc->tx_lsndtime = now; - hc->tx_pipe += 1; - - /* see whether cwnd was fully used (RFC 2861), update expected window */ - if (ccid2_cwnd_network_limited(hc)) { - ccid2_update_used_window(hc, hc->tx_cwnd); - hc->tx_cwnd_used = 0; - hc->tx_cwnd_stamp = now; - } else { - if (hc->tx_pipe > hc->tx_cwnd_used) - hc->tx_cwnd_used = hc->tx_pipe; - - ccid2_update_used_window(hc, hc->tx_cwnd_used); - - if (ccid2_do_cwv && (s32)(now - hc->tx_cwnd_stamp) >= hc->tx_rto) - ccid2_cwnd_application_limited(sk, now); - } - - hc->tx_seqh->ccid2s_seq = dp->dccps_gss; - hc->tx_seqh->ccid2s_acked = 0; - hc->tx_seqh->ccid2s_sent = now; - - next = hc->tx_seqh->ccid2s_next; - /* check if we need to alloc more space */ - if (next == hc->tx_seqt) { - if (ccid2_hc_tx_alloc_seq(hc)) { - DCCP_CRIT("packet history - out of memory!"); - /* FIXME: find a more graceful way to bail out */ - return; - } - next = hc->tx_seqh->ccid2s_next; - BUG_ON(next == hc->tx_seqt); - } - hc->tx_seqh = next; - - ccid2_pr_debug("cwnd=%d pipe=%d\n", hc->tx_cwnd, hc->tx_pipe); - - /* - * FIXME: The code below is broken and the variables have been removed - * from the socket struct. The `ackloss' variable was always set to 0, - * and with arsent there are several problems: - * (i) it doesn't just count the number of Acks, but all sent packets; - * (ii) it is expressed in # of packets, not # of windows, so the - * comparison below uses the wrong formula: Appendix A of RFC 4341 - * comes up with the number K = cwnd / (R^2 - R) of consecutive windows - * of data with no lost or marked Ack packets. 
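
[Editor's note] A worked instance of the Appendix-A arithmetic quoted in this FIXME (the threshold form is derived in the sentences just below); cwnd and R are made-up values:

#include <stdio.h>

/*
 * With Ack Ratio R and congestion window cwnd, RFC 4341 App. A allows
 * Ack Ratio to drop by 1 after K * cwnd / R = cwnd^2 / (R^3 - R^2)
 * consecutive clean Acks. Illustrative numbers: cwnd = 20, R = 2.
 */
int main(void)
{
        unsigned cwnd = 20, R = 2;
        unsigned K = cwnd / (R * R - R);          /* 10 windows          */
        unsigned acks_per_win = cwnd / R;         /* 10 Acks per window  */
        unsigned threshold = (cwnd * cwnd) / (R * R * R - R * R);

        /* threshold == K * acks_per_win == 100 consecutive clean Acks */
        printf("K=%u, Acks/window=%u, threshold=%u\n",
               K, acks_per_win, threshold);
        return 0;
}
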
If arsent were the # of - * consecutive Acks received without loss, then Ack Ratio needs to be - * decreased by 1 when - * arsent >= K * cwnd / R = cwnd^2 / (R^3 - R^2) - * where cwnd / R is the number of Acks received per window of data - * (cf. RFC 4341, App. A). The problems are that - * - arsent counts other packets as well; - * - the comparison uses a formula different from RFC 4341; - * - computing a cubic/quadratic equation each time is too complicated. - * Hence a different algorithm is needed. - */ -#if 0 - /* Ack Ratio. Need to maintain a concept of how many windows we sent */ - hc->tx_arsent++; - /* We had an ack loss in this window... */ - if (hc->tx_ackloss) { - if (hc->tx_arsent >= hc->tx_cwnd) { - hc->tx_arsent = 0; - hc->tx_ackloss = 0; - } - } else { - /* No acks lost up to now... */ - /* decrease ack ratio if enough packets were sent */ - if (dp->dccps_l_ack_ratio > 1) { - /* XXX don't calculate denominator each time */ - int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - - dp->dccps_l_ack_ratio; - - denom = hc->tx_cwnd * hc->tx_cwnd / denom; - - if (hc->tx_arsent >= denom) { - ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); - hc->tx_arsent = 0; - } - } else { - /* we can't increase ack ratio further [1] */ - hc->tx_arsent = 0; /* or maybe set it to cwnd*/ - } - } -#endif - - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); - -#ifdef CONFIG_IP_DCCP_CCID2_DEBUG - do { - struct ccid2_seq *seqp = hc->tx_seqt; - - while (seqp != hc->tx_seqh) { - ccid2_pr_debug("out seq=%llu acked=%d time=%u\n", - (unsigned long long)seqp->ccid2s_seq, - seqp->ccid2s_acked, seqp->ccid2s_sent); - seqp = seqp->ccid2s_next; - } - } while (0); - ccid2_pr_debug("=========\n"); -#endif -} - -/** - * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm - * @sk: socket to perform estimator on - * @mrtt: measured RTT - * - * This code is almost identical with TCP's tcp_rtt_estimator(), since - * - it has a higher sampling frequency (recommended by RFC 1323), - * - the RTO does not collapse into RTT due to RTTVAR going towards zero, - * - it is simple (cf. more complex proposals such as Eifel timer or research - * which suggests that the gain should be set according to window size), - * - in tests it was found to work well with CCID2 [gerrit]. - */ -static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - long m = mrtt ? : 1; - - if (hc->tx_srtt == 0) { - /* First measurement m */ - hc->tx_srtt = m << 3; - hc->tx_mdev = m << 1; - - hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk)); - hc->tx_rttvar = hc->tx_mdev_max; - - hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; - } else { - /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ - m -= (hc->tx_srtt >> 3); - hc->tx_srtt += m; - - /* Similarly, update scaled mdev with regard to |m| */ - if (m < 0) { - m = -m; - m -= (hc->tx_mdev >> 2); - /* - * This neutralises RTO increase when RTT < SRTT - mdev - * (see P. Sarolahti, A. Kuznetsov,"Congestion Control - * in Linux TCP", USENIX 2002, pp. 49-62). 
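
[Editor's note] A simplified userspace rendering of this estimator, keeping the scaled arithmetic (SRTT scaled by 8, mdev by 4, RTO = SRTT/8 + RTTVAR) but omitting the per-flight mdev_max/RTTVAR decay, the damping step for large negative deviations, and the RTO cap; rto_min is a made-up stand-in for tcp_rto_min():

#include <stdio.h>

struct rtt_est { long srtt, mdev, rttvar, rto; };

static void rtt_sample(struct rtt_est *e, long m, long rto_min)
{
        if (m <= 0)
                m = 1;
        if (e->srtt == 0) {                     /* first measurement */
                e->srtt = m << 3;
                e->mdev = m << 1;
                e->rttvar = e->mdev > rto_min ? e->mdev : rto_min;
        } else {
                m -= e->srtt >> 3;              /* m' = m - srtt/8 */
                e->srtt += m;                   /* srtt += m'      */
                if (m < 0)
                        m = -m;
                m -= e->mdev >> 2;
                e->mdev += m;                   /* mdev += |m'| - mdev/4 */
                if (e->mdev > e->rttvar)
                        e->rttvar = e->mdev;
        }
        e->rto = (e->srtt >> 3) + e->rttvar;
}

int main(void)
{
        struct rtt_est e = { 0, 0, 0, 0 };
        long samples[] = { 100, 120, 90, 110 }; /* invented, in jiffies */

        for (int i = 0; i < 4; i++) {
                rtt_sample(&e, samples[i], 50);
                printf("sample=%ld rto=%ld\n", samples[i], e.rto);
        }
        return 0;
}
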
- */ - if (m > 0) - m >>= 3; - } else { - m -= (hc->tx_mdev >> 2); - } - hc->tx_mdev += m; - - if (hc->tx_mdev > hc->tx_mdev_max) { - hc->tx_mdev_max = hc->tx_mdev; - if (hc->tx_mdev_max > hc->tx_rttvar) - hc->tx_rttvar = hc->tx_mdev_max; - } - - /* - * Decay RTTVAR at most once per flight, exploiting that - * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) - * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) - * GAR is a useful bound for FlightSize = pipe. - * AWL is probably too low here, as it over-estimates pipe. - */ - if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) { - if (hc->tx_mdev_max < hc->tx_rttvar) - hc->tx_rttvar -= (hc->tx_rttvar - - hc->tx_mdev_max) >> 2; - hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; - hc->tx_mdev_max = tcp_rto_min(sk); - } - } - - /* - * Set RTO from SRTT and RTTVAR - * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms. - * This agrees with RFC 4341, 5: - * "Because DCCP does not retransmit data, DCCP does not require - * TCP's recommended minimum timeout of one second". - */ - hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar; - - if (hc->tx_rto > DCCP_RTO_MAX) - hc->tx_rto = DCCP_RTO_MAX; -} - -static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, - unsigned int *maxincr) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - int r_seq_used = hc->tx_cwnd / dp->dccps_l_ack_ratio; - - if (hc->tx_cwnd < dp->dccps_l_seq_win && - r_seq_used < dp->dccps_r_seq_win) { - if (hc->tx_cwnd < hc->tx_ssthresh) { - if (*maxincr > 0 && ++hc->tx_packets_acked >= 2) { - hc->tx_cwnd += 1; - *maxincr -= 1; - hc->tx_packets_acked = 0; - } - } else if (++hc->tx_packets_acked >= hc->tx_cwnd) { - hc->tx_cwnd += 1; - hc->tx_packets_acked = 0; - } - } - - /* - * Adjust the local sequence window and the ack ratio to allow about - * 5 times the number of packets in the network (RFC 4340 7.5.2) - */ - if (r_seq_used * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_r_seq_win) - ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio * 2); - else if (r_seq_used * CCID2_WIN_CHANGE_FACTOR < dp->dccps_r_seq_win/2) - ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio / 2 ? : 1U); - - if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_l_seq_win) - ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win * 2); - else if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR < dp->dccps_l_seq_win/2) - ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win / 2); - - /* - * FIXME: RTT is sampled several times per acknowledgment (for each - * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). - * This causes the RTT to be over-estimated, since the older entries - * in the Ack Vector have earlier sending times. - * The cleanest solution is to not use the ccid2s_sent field at all - * and instead use DCCP timestamps: requires changes in other places. - */ - ccid2_rtt_estimator(sk, ccid2_jiffies32 - seqp->ccid2s_sent); -} - -static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) { - ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); - return; - } - - hc->tx_last_cong = ccid2_jiffies32; - - hc->tx_cwnd = hc->tx_cwnd / 2 ? 
: 1U; - hc->tx_ssthresh = max(hc->tx_cwnd, 2U); - - ccid2_check_l_ack_ratio(sk); -} - -static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - switch (option) { - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen, - option - DCCPO_ACK_VECTOR_0); - } - return 0; -} - -static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - const bool sender_was_blocked = ccid2_cwnd_network_limited(hc); - struct dccp_ackvec_parsed *avp; - u64 ackno, seqno; - struct ccid2_seq *seqp; - int done = 0; - unsigned int maxincr = 0; - - /* check reverse path congestion */ - seqno = DCCP_SKB_CB(skb)->dccpd_seq; - - /* XXX this whole "algorithm" is broken. Need to fix it to keep track - * of the seqnos of the dupacks so that rpseq and rpdupack are correct - * -sorbo. - */ - /* need to bootstrap */ - if (hc->tx_rpdupack == -1) { - hc->tx_rpdupack = 0; - hc->tx_rpseq = seqno; - } else { - /* check if packet is consecutive */ - if (dccp_delta_seqno(hc->tx_rpseq, seqno) == 1) - hc->tx_rpseq = seqno; - /* it's a later packet */ - else if (after48(seqno, hc->tx_rpseq)) { - hc->tx_rpdupack++; - - /* check if we got enough dupacks */ - if (hc->tx_rpdupack >= NUMDUPACK) { - hc->tx_rpdupack = -1; /* XXX lame */ - hc->tx_rpseq = 0; -#ifdef __CCID2_COPES_GRACEFULLY_WITH_ACK_CONGESTION_CONTROL__ - /* - * FIXME: Ack Congestion Control is broken; in - * the current state instabilities occurred with - * Ack Ratios greater than 1; causing hang-ups - * and long RTO timeouts. This needs to be fixed - * before opening up dynamic changes. -- gerrit - */ - ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); -#endif - } - } - } - - /* check forward path congestion */ - if (dccp_packet_without_ack(skb)) - return; - - /* still didn't send out new data packets */ - if (hc->tx_seqh == hc->tx_seqt) - goto done; - - ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; - if (after48(ackno, hc->tx_high_ack)) - hc->tx_high_ack = ackno; - - seqp = hc->tx_seqt; - while (before48(seqp->ccid2s_seq, ackno)) { - seqp = seqp->ccid2s_next; - if (seqp == hc->tx_seqh) { - seqp = hc->tx_seqh->ccid2s_prev; - break; - } - } - - /* - * In slow-start, cwnd can increase up to a maximum of Ack Ratio/2 - * packets per acknowledgement. Rounding up avoids that cwnd is not - * advanced when Ack Ratio is 1 and gives a slight edge otherwise. - */ - if (hc->tx_cwnd < hc->tx_ssthresh) - maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); - - /* go through all ack vectors */ - list_for_each_entry(avp, &hc->tx_av_chunks, node) { - /* go through this ack vector */ - for (; avp->len--; avp->vec++) { - u64 ackno_end_rl = SUB48(ackno, - dccp_ackvec_runlen(avp->vec)); - - ccid2_pr_debug("ackvec %llu |%u,%u|\n", - (unsigned long long)ackno, - dccp_ackvec_state(avp->vec) >> 6, - dccp_ackvec_runlen(avp->vec)); - /* if the seqno we are analyzing is larger than the - * current ackno, then move towards the tail of our - * seqnos. 
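
[Editor's note] The run-length walk in this loop can be tried in isolation. A sketch of decoding Ack Vector cells (RFC 4340, 11.4) over plain unsigned seqnos instead of the real 48-bit circular space; the vector bytes are invented:

#include <stdio.h>

/*
 * Each Ack Vector byte carries a 2-bit state and a 6-bit run length,
 * covering run+1 consecutive seqnos ending at the current ackno.
 */
int main(void)
{
        const unsigned char vec[] = { 0x00, 0xC3, 0x01 };
        unsigned ackno = 100;

        for (unsigned i = 0; i < sizeof(vec); i++) {
                unsigned state = vec[i] >> 6;   /* 0=received, 3=not received */
                unsigned run = vec[i] & 0x3f;

                printf("seqnos %u..%u: state %u\n",
                       ackno - run, ackno, state);
                ackno = ackno - run - 1;        /* next run ends just below */
        }
        return 0;
}
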
- */ - while (after48(seqp->ccid2s_seq, ackno)) { - if (seqp == hc->tx_seqt) { - done = 1; - break; - } - seqp = seqp->ccid2s_prev; - } - if (done) - break; - - /* check all seqnos in the range of the vector - * run length - */ - while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { - const u8 state = dccp_ackvec_state(avp->vec); - - /* new packet received or marked */ - if (state != DCCPAV_NOT_RECEIVED && - !seqp->ccid2s_acked) { - if (state == DCCPAV_ECN_MARKED) - ccid2_congestion_event(sk, - seqp); - else - ccid2_new_ack(sk, seqp, - &maxincr); - - seqp->ccid2s_acked = 1; - ccid2_pr_debug("Got ack for %llu\n", - (unsigned long long)seqp->ccid2s_seq); - hc->tx_pipe--; - } - if (seqp == hc->tx_seqt) { - done = 1; - break; - } - seqp = seqp->ccid2s_prev; - } - if (done) - break; - - ackno = SUB48(ackno_end_rl, 1); - } - if (done) - break; - } - - /* The state about what is acked should be correct now - * Check for NUMDUPACK - */ - seqp = hc->tx_seqt; - while (before48(seqp->ccid2s_seq, hc->tx_high_ack)) { - seqp = seqp->ccid2s_next; - if (seqp == hc->tx_seqh) { - seqp = hc->tx_seqh->ccid2s_prev; - break; - } - } - done = 0; - while (1) { - if (seqp->ccid2s_acked) { - done++; - if (done == NUMDUPACK) - break; - } - if (seqp == hc->tx_seqt) - break; - seqp = seqp->ccid2s_prev; - } - - /* If there are at least 3 acknowledgements, anything unacknowledged - * below the last sequence number is considered lost - */ - if (done == NUMDUPACK) { - struct ccid2_seq *last_acked = seqp; - - /* check for lost packets */ - while (1) { - if (!seqp->ccid2s_acked) { - ccid2_pr_debug("Packet lost: %llu\n", - (unsigned long long)seqp->ccid2s_seq); - /* XXX need to traverse from tail -> head in - * order to detect multiple congestion events in - * one ack vector. - */ - ccid2_congestion_event(sk, seqp); - hc->tx_pipe--; - } - if (seqp == hc->tx_seqt) - break; - seqp = seqp->ccid2s_prev; - } - - hc->tx_seqt = last_acked; - } - - /* trim acked packets in tail */ - while (hc->tx_seqt != hc->tx_seqh) { - if (!hc->tx_seqt->ccid2s_acked) - break; - - hc->tx_seqt = hc->tx_seqt->ccid2s_next; - } - - /* restart RTO timer if not all outstanding data has been acked */ - if (hc->tx_pipe == 0) - sk_stop_timer(sk, &hc->tx_rtotimer); - else - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); -done: - /* check if incoming Acks allow pending packets to be sent */ - if (sender_was_blocked && !ccid2_cwnd_network_limited(hc)) - dccp_tasklet_schedule(sk); - dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); -} - -static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid_priv(ccid); - struct dccp_sock *dp = dccp_sk(sk); - u32 max_ratio; - - /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ - hc->tx_ssthresh = ~0U; - - /* Use larger initial windows (RFC 4341, section 5). */ - hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); - hc->tx_expected_wnd = hc->tx_cwnd; - - /* Make sure that Ack Ratio is enabled and within bounds. */ - max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2); - if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) - dp->dccps_l_ack_ratio = max_ratio; - - /* XXX init ~ to window size... 
*/ - if (ccid2_hc_tx_alloc_seq(hc)) - return -ENOMEM; - - hc->tx_rto = DCCP_TIMEOUT_INIT; - hc->tx_rpdupack = -1; - hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_jiffies32; - hc->tx_cwnd_used = 0; - hc->sk = sk; - timer_setup(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, 0); - INIT_LIST_HEAD(&hc->tx_av_chunks); - return 0; -} - -static void ccid2_hc_tx_exit(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - int i; - - sk_stop_timer(sk, &hc->tx_rtotimer); - - for (i = 0; i < hc->tx_seqbufc; i++) - kfree(hc->tx_seqbuf[i]); - hc->tx_seqbufc = 0; - dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); -} - -static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct ccid2_hc_rx_sock *hc = ccid2_hc_rx_sk(sk); - - if (!dccp_data_packet(skb)) - return; - - if (++hc->rx_num_data_pkts >= dccp_sk(sk)->dccps_r_ack_ratio) { - dccp_send_ack(sk); - hc->rx_num_data_pkts = 0; - } -} - -struct ccid_operations ccid2_ops = { - .ccid_id = DCCPC_CCID2, - .ccid_name = "TCP-like", - .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), - .ccid_hc_tx_init = ccid2_hc_tx_init, - .ccid_hc_tx_exit = ccid2_hc_tx_exit, - .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, - .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, - .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, - .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, - .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), - .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, -}; - -#ifdef CONFIG_IP_DCCP_CCID2_DEBUG -module_param(ccid2_debug, bool, 0644); -MODULE_PARM_DESC(ccid2_debug, "Enable CCID-2 debug messages"); -#endif diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h deleted file mode 100644 index 330c7b4ec001..000000000000 --- a/net/dccp/ccids/ccid2.h +++ /dev/null @@ -1,121 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> - */ -#ifndef _DCCP_CCID2_H_ -#define _DCCP_CCID2_H_ - -#include <linux/timer.h> -#include <linux/types.h> -#include "../ccid.h" -#include "../dccp.h" - -/* - * CCID-2 timestamping faces the same issues as TCP timestamping. - * Hence we reuse/share as much of the code as possible. - */ -#define ccid2_jiffies32 ((u32)jiffies) - -/* NUMDUPACK parameter from RFC 4341, p. 
6 */ -#define NUMDUPACK 3 - -struct ccid2_seq { - u64 ccid2s_seq; - u32 ccid2s_sent; - int ccid2s_acked; - struct ccid2_seq *ccid2s_prev; - struct ccid2_seq *ccid2s_next; -}; - -#define CCID2_SEQBUF_LEN 1024 -#define CCID2_SEQBUF_MAX 128 - -/* - * Multiple of congestion window to keep the sequence window at - * (RFC 4340 7.5.2) - */ -#define CCID2_WIN_CHANGE_FACTOR 5 - -/** - * struct ccid2_hc_tx_sock - CCID2 TX half connection - * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 - * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465) - * @tx_srtt: smoothed RTT estimate, scaled by 2^3 - * @tx_mdev: smoothed RTT variation, scaled by 2^2 - * @tx_mdev_max: maximum of @mdev during one flight - * @tx_rttvar: moving average/maximum of @mdev_max - * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) - * @tx_rtt_seq: to decay RTTVAR at most once per flight - * @tx_cwnd_used: actually used cwnd, W_used of RFC 2861 - * @tx_expected_wnd: moving average of @tx_cwnd_used - * @tx_cwnd_stamp: to track idle periods in CWV - * @tx_lsndtime: last time (in jiffies) a data packet was sent - * @tx_rpseq: last consecutive seqno - * @tx_rpdupack: dupacks since rpseq - * @tx_av_chunks: list of Ack Vectors received on current skb - */ -struct ccid2_hc_tx_sock { - u32 tx_cwnd; - u32 tx_ssthresh; - u32 tx_pipe; - u32 tx_packets_acked; - struct ccid2_seq *tx_seqbuf[CCID2_SEQBUF_MAX]; - int tx_seqbufc; - struct ccid2_seq *tx_seqh; - struct ccid2_seq *tx_seqt; - - /* RTT measurement: variables/principles are the same as in TCP */ - u32 tx_srtt, - tx_mdev, - tx_mdev_max, - tx_rttvar, - tx_rto; - u64 tx_rtt_seq:48; - struct timer_list tx_rtotimer; - struct sock *sk; - - /* Congestion Window validation (optional, RFC 2861) */ - u32 tx_cwnd_used, - tx_expected_wnd, - tx_cwnd_stamp, - tx_lsndtime; - - u64 tx_rpseq; - int tx_rpdupack; - u32 tx_last_cong; - u64 tx_high_ack; - struct list_head tx_av_chunks; -}; - -static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc) -{ - return hc->tx_pipe >= hc->tx_cwnd; -} - -/* - * Convert RFC 3390 larger initial window into an equivalent number of packets. - * This is based on the numbers specified in RFC 5681, 3.1. - */ -static inline u32 rfc3390_bytes_to_packets(const u32 smss) -{ - return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3); -} - -/** - * struct ccid2_hc_rx_sock - Receiving end of CCID-2 half-connection - * @rx_num_data_pkts: number of data packets received since last feedback - */ -struct ccid2_hc_rx_sock { - u32 rx_num_data_pkts; -}; - -static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) -{ - return ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid); -} - -static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk) -{ - return ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid); -} -#endif /* _DCCP_CCID2_H_ */ diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c deleted file mode 100644 index f349d16dd8f6..000000000000 --- a/net/dccp/ccids/ccid3.c +++ /dev/null @@ -1,866 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz> - * - * An implementation of the DCCP protocol - * - * This code has been developed by the University of Waikato WAND - * research group. 
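
[Editor's note] The rfc3390_bytes_to_packets() helper shown above is self-contained enough to experiment with directly. A userspace reproduction with a few sample MSS values; the mapping follows RFC 5681, 3.1:

#include <stdio.h>

/* RFC 3390 initial window in packets, from SMSS in bytes. */
static unsigned rfc3390_bytes_to_packets(unsigned smss)
{
        return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
}

int main(void)
{
        const unsigned sizes[] = { 536, 1095, 1460, 2190, 4096 };

        for (unsigned i = 0; i < 5; i++)
                printf("SMSS %4u -> IW %u packets\n",
                       sizes[i], rfc3390_bytes_to_packets(sizes[i]));
        /* prints 4, 4, 3, 3, 2 */
        return 0;
}
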
For further information please see https://www.wand.net.nz/ - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo <acme@conectiva.com.br>. - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ -#include "../dccp.h" -#include "ccid3.h" - -#include <linux/unaligned.h> - -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -static bool ccid3_debug; -#define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a) -#else -#define ccid3_pr_debug(format, a...) -#endif - -/* - * Transmitter Half-Connection Routines - */ -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) -{ - static const char *const ccid3_state_names[] = { - [TFRC_SSTATE_NO_SENT] = "NO_SENT", - [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", - [TFRC_SSTATE_FBACK] = "FBACK", - }; - - return ccid3_state_names[state]; -} -#endif - -static void ccid3_hc_tx_set_state(struct sock *sk, - enum ccid3_hc_tx_states state) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - enum ccid3_hc_tx_states oldstate = hc->tx_state; - - ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", - dccp_role(sk), sk, ccid3_tx_state_name(oldstate), - ccid3_tx_state_name(state)); - WARN_ON(state == oldstate); - hc->tx_state = state; -} - -/* - * Compute the initial sending rate X_init in the manner of RFC 3390: - * - * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT - * - * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis - * (rev-02) clarifies the use of RFC 3390 with regard to the above formula. - * For consistency with other parts of the code, X_init is scaled by 2^6. - */ -static inline u64 rfc3390_initial_rate(struct sock *sk) -{ - const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - const __u32 w_init = clamp_t(__u32, 4380U, 2 * hc->tx_s, 4 * hc->tx_s); - - return scaled_div(w_init << 6, hc->tx_rtt); -} - -/** - * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst - * @hc: socket to have the send interval updated - * - * This respects the granularity of X_inst (64 * bytes/second). - */ -static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc) -{ - hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x); - - DCCP_BUG_ON(hc->tx_t_ipi == 0); - ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi, - hc->tx_s, (unsigned int)(hc->tx_x >> 6)); -} - -static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now) -{ - u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count); - - return delta / hc->tx_rtt; -} - -/** - * ccid3_hc_tx_update_x - Update allowed sending rate X - * @sk: socket to be updated - * @stamp: most recent time if available - can be left NULL. - * - * This function tracks draft rfc3448bis, check there for latest details. - * - * Note: X and X_recv are both stored in units of 64 * bytes/second, to support - * fine-grained resolution of sending rates. This requires scaling by 2^6 - * throughout the code. Only X_calc is unscaled (in bytes/second). - * - */ -static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - __u64 min_rate = 2 * hc->tx_x_recv; - const __u64 old_x = hc->tx_x; - ktime_t now = stamp ? 
*stamp : ktime_get_real(); - - /* - * Handle IDLE periods: do not reduce below RFC3390 initial sending rate - * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis: - * a sender is idle if it has not sent anything over a 2-RTT-period. - * For consistency with X and X_recv, min_rate is also scaled by 2^6. - */ - if (ccid3_hc_tx_idle_rtt(hc, now) >= 2) { - min_rate = rfc3390_initial_rate(sk); - min_rate = max(min_rate, 2 * hc->tx_x_recv); - } - - if (hc->tx_p > 0) { - - hc->tx_x = min(((__u64)hc->tx_x_calc) << 6, min_rate); - hc->tx_x = max(hc->tx_x, (((__u64)hc->tx_s) << 6) / TFRC_T_MBI); - - } else if (ktime_us_delta(now, hc->tx_t_ld) - (s64)hc->tx_rtt >= 0) { - - hc->tx_x = min(2 * hc->tx_x, min_rate); - hc->tx_x = max(hc->tx_x, - scaled_div(((__u64)hc->tx_s) << 6, hc->tx_rtt)); - hc->tx_t_ld = now; - } - - if (hc->tx_x != old_x) { - ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " - "X_recv=%u\n", (unsigned int)(old_x >> 6), - (unsigned int)(hc->tx_x >> 6), hc->tx_x_calc, - (unsigned int)(hc->tx_x_recv >> 6)); - - ccid3_update_send_interval(hc); - } -} - -/** - * ccid3_hc_tx_update_s - Track the mean packet size `s' - * @hc: socket to be updated - * @len: DCCP packet payload size in bytes - * - * cf. RFC 4342, 5.3 and RFC 3448, 4.1 - */ -static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hc, int len) -{ - const u16 old_s = hc->tx_s; - - hc->tx_s = tfrc_ewma(hc->tx_s, len, 9); - - if (hc->tx_s != old_s) - ccid3_update_send_interval(hc); -} - -/* - * Update Window Counter using the algorithm from [RFC 4342, 8.1]. - * As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt(). - */ -static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hc, - ktime_t now) -{ - u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count), - quarter_rtts = (4 * delta) / hc->tx_rtt; - - if (quarter_rtts > 0) { - hc->tx_t_last_win_count = now; - hc->tx_last_win_count += min(quarter_rtts, 5U); - hc->tx_last_win_count &= 0xF; /* mod 16 */ - } -} - -static void ccid3_hc_tx_no_feedback_timer(struct timer_list *t) -{ - struct ccid3_hc_tx_sock *hc = from_timer(hc, t, tx_no_feedback_timer); - struct sock *sk = hc->sk; - unsigned long t_nfb = USEC_PER_SEC / 5; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later. */ - /* XXX: set some sensible MIB */ - goto restart_timer; - } - - ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk, - ccid3_tx_state_name(hc->tx_state)); - - /* Ignore and do not restart after leaving the established state */ - if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) - goto out; - - /* Reset feedback state to "no feedback received" */ - if (hc->tx_state == TFRC_SSTATE_FBACK) - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - - /* - * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 - * RTO is 0 if and only if no feedback has been received yet. 
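
[Editor's note] A standalone sketch of the quarter-RTT window-counter update implemented by ccid3_hc_tx_update_win_count() above (RFC 4342, 8.1); microsecond inputs, invented values:

#include <stdio.h>

/* Advance the 4-bit CCVal counter by elapsed quarter-RTTs, at most 5. */
static unsigned win_count_update(unsigned ctr, unsigned delta_us,
                                 unsigned rtt_us)
{
        unsigned quarter_rtts = (4 * delta_us) / rtt_us;

        if (quarter_rtts > 0) {
                ctr += quarter_rtts < 5 ? quarter_rtts : 5;
                ctr &= 0xF;             /* counter wraps mod 16 */
        }
        return ctr;
}

int main(void)
{
        printf("%u\n", win_count_update(14, 75000, 100000)); /* 14+3 -> 1 */
        printf("%u\n", win_count_update(2, 500000, 100000)); /* capped: 7 */
        return 0;
}
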
- */ - if (hc->tx_t_rto == 0 || hc->tx_p == 0) { - - /* halve send rate directly */ - hc->tx_x = max(hc->tx_x / 2, - (((__u64)hc->tx_s) << 6) / TFRC_T_MBI); - ccid3_update_send_interval(hc); - } else { - /* - * Modify the cached value of X_recv - * - * If (X_calc > 2 * X_recv) - * X_recv = max(X_recv / 2, s / (2 * t_mbi)); - * Else - * X_recv = X_calc / 4; - * - * Note that X_recv is scaled by 2^6 while X_calc is not - */ - if (hc->tx_x_calc > (hc->tx_x_recv >> 5)) - hc->tx_x_recv = - max(hc->tx_x_recv / 2, - (((__u64)hc->tx_s) << 6) / (2*TFRC_T_MBI)); - else { - hc->tx_x_recv = hc->tx_x_calc; - hc->tx_x_recv <<= 4; - } - ccid3_hc_tx_update_x(sk, NULL); - } - ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", - (unsigned long long)hc->tx_x); - - /* - * Set new timeout for the nofeedback timer. - * See comments in packet_recv() regarding the value of t_RTO. - */ - if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */ - t_nfb = TFRC_INITIAL_TIMEOUT; - else - t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); - -restart_timer: - sk_reset_timer(sk, &hc->tx_no_feedback_timer, - jiffies + usecs_to_jiffies(t_nfb)); -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -/** - * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets - * @sk: socket to send packet from - * @skb: next packet candidate to send on @sk - * - * This function uses the convention of ccid_packet_dequeue_eval() and - * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. - */ -static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - ktime_t now = ktime_get_real(); - s64 delay; - - /* - * This function is called only for Data and DataAck packets. Sending - * zero-sized Data(Ack)s is theoretically possible, but for congestion - * control this case is pathological - ignore it. - */ - if (unlikely(skb->len == 0)) - return -EBADMSG; - - if (hc->tx_state == TFRC_SSTATE_NO_SENT) { - sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies + - usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); - hc->tx_last_win_count = 0; - hc->tx_t_last_win_count = now; - - /* Set t_0 for initial packet */ - hc->tx_t_nom = now; - - hc->tx_s = skb->len; - - /* - * Use initial RTT sample when available: recommended by erratum - * to RFC 4342. This implements the initialisation procedure of - * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6. - */ - if (dp->dccps_syn_rtt) { - ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); - hc->tx_rtt = dp->dccps_syn_rtt; - hc->tx_x = rfc3390_initial_rate(sk); - hc->tx_t_ld = now; - } else { - /* - * Sender does not have RTT sample: - * - set fallback RTT (RFC 4340, 3.4) since a RTT value - * is needed in several parts (e.g. window counter); - * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. - */ - hc->tx_rtt = DCCP_FALLBACK_RTT; - hc->tx_x = hc->tx_s; - hc->tx_x <<= 6; - } - ccid3_update_send_interval(hc); - - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - - } else { - delay = ktime_us_delta(hc->tx_t_nom, now); - ccid3_pr_debug("delay=%ld\n", (long)delay); - /* - * Scheduling of packet transmissions (RFC 5348, 8.3) - * - * if (t_now > t_nom - delta) - * // send the packet now - * else - * // send the packet in (t_nom - t_now) milliseconds. - */ - if (delay >= TFRC_T_DELTA) - return (u32)delay / USEC_PER_MSEC; - - ccid3_hc_tx_update_win_count(hc, now); - } - - /* prepare to send now (add options etc.) 
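
[Editor's note] The scheduling rule above reduces to a small pure function. A sketch assuming the HZ >= 500 case, where t_delta is a constant 1000 us; times and values below are invented:

#include <stdio.h>

/* Returns 0 to send at once, otherwise the delay in milliseconds. */
static long long tx_delay_ms(long long t_nom_us, long long now_us)
{
        long long delay = t_nom_us - now_us;

        return delay >= 1000 ? delay / 1000 : 0;
}

int main(void)
{
        printf("%lld\n", tx_delay_ms(5000, 1000)); /* 4 ms in the future */
        printf("%lld\n", tx_delay_ms(5000, 4500)); /* within t_delta: 0  */
        return 0;
}
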
*/ - dp->dccps_hc_tx_insert_options = 1; - DCCP_SKB_CB(skb)->dccpd_ccval = hc->tx_last_win_count; - - /* set the nominal send time for the next following packet */ - hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi); - return CCID_PACKET_SEND_AT_ONCE; -} - -static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - - ccid3_hc_tx_update_s(hc, len); - - if (tfrc_tx_hist_add(&hc->tx_hist, dccp_sk(sk)->dccps_gss)) - DCCP_CRIT("packet history - out of memory!"); -} - -static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct tfrc_tx_hist_entry *acked; - ktime_t now; - unsigned long t_nfb; - u32 r_sample; - - /* we are only interested in ACKs */ - if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || - DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) - return; - /* - * Locate the acknowledged packet in the TX history. - * - * Returning "entry not found" here can for instance happen when - * - the host has not sent out anything (e.g. a passive server), - * - the Ack is outdated (packet with higher Ack number was received), - * - it is a bogus Ack (for a packet not sent on this connection). - */ - acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb)); - if (acked == NULL) - return; - /* For the sake of RTT sampling, ignore/remove all older entries */ - tfrc_tx_hist_purge(&acked->next); - - /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ - now = ktime_get_real(); - r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); - hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9); - - /* - * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 - */ - if (hc->tx_state == TFRC_SSTATE_NO_FBACK) { - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); - - if (hc->tx_t_rto == 0) { - /* - * Initial feedback packet: Larger Initial Windows (4.2) - */ - hc->tx_x = rfc3390_initial_rate(sk); - hc->tx_t_ld = now; - - ccid3_update_send_interval(hc); - - goto done_computing_x; - } else if (hc->tx_p == 0) { - /* - * First feedback after nofeedback timer expiry (4.3) - */ - goto done_computing_x; - } - } - - /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ - if (hc->tx_p > 0) - hc->tx_x_calc = tfrc_calc_x(hc->tx_s, hc->tx_rtt, hc->tx_p); - ccid3_hc_tx_update_x(sk, &now); - -done_computing_x: - ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " - "p=%u, X_calc=%u, X_recv=%u, X=%u\n", - dccp_role(sk), sk, hc->tx_rtt, r_sample, - hc->tx_s, hc->tx_p, hc->tx_x_calc, - (unsigned int)(hc->tx_x_recv >> 6), - (unsigned int)(hc->tx_x >> 6)); - - /* unschedule no feedback timer */ - sk_stop_timer(sk, &hc->tx_no_feedback_timer); - - /* - * As we have calculated new ipi, delta, t_nom it is possible - * that we now can send a packet, so wake up dccp_wait_for_ccid - */ - sk->sk_write_space(sk); - - /* - * Update timeout interval for the nofeedback timer. In order to control - * rate halving on networks with very low RTTs (<= 1 ms), use per-route - * tunable RTAX_RTO_MIN value as the lower bound. 
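
[Editor's note] A worked instance of the two bounds computed below — t_RTO = max(4 * RTT, floor) and t_nfb = max(t_RTO, 2 * t_ipi) — with a made-up 200 ms floor standing in for the per-route RTAX_RTO_MIN value; all times in microseconds:

#include <stdio.h>

int main(void)
{
        unsigned rtt = 30000, t_ipi = 120000, floor_us = 200000;
        unsigned t_rto = 4 * rtt > floor_us ? 4 * rtt : floor_us;
        unsigned t_nfb = t_rto > 2 * t_ipi ? t_rto : 2 * t_ipi;

        /* 4*RTT = 120000 < floor, so t_RTO = 200000; t_nfb = 240000 */
        printf("t_RTO=%uus t_nfb=%uus\n", t_rto, t_nfb);
        return 0;
}
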
- */ - hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, - USEC_PER_SEC/HZ * tcp_rto_min(sk)); - /* - * Schedule no feedback timer to expire in - * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) - */ - t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); - - ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " - "expire in %lu jiffies (%luus)\n", - dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); - - sk_reset_timer(sk, &hc->tx_no_feedback_timer, - jiffies + usecs_to_jiffies(t_nfb)); -} - -static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, - u8 option, u8 *optval, u8 optlen) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - __be32 opt_val; - - switch (option) { - case TFRC_OPT_RECEIVE_RATE: - case TFRC_OPT_LOSS_EVENT_RATE: - /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */ - if (packet_type == DCCP_PKT_DATA) - break; - if (unlikely(optlen != 4)) { - DCCP_WARN("%s(%p), invalid len %d for %u\n", - dccp_role(sk), sk, optlen, option); - return -EINVAL; - } - opt_val = ntohl(get_unaligned((__be32 *)optval)); - - if (option == TFRC_OPT_RECEIVE_RATE) { - /* Receive Rate is kept in units of 64 bytes/second */ - hc->tx_x_recv = opt_val; - hc->tx_x_recv <<= 6; - - ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", - dccp_role(sk), sk, opt_val); - } else { - /* Update the fixpoint Loss Event Rate fraction */ - hc->tx_p = tfrc_invert_loss_event_rate(opt_val); - - ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", - dccp_role(sk), sk, opt_val); - } - } - return 0; -} - -static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) -{ - struct ccid3_hc_tx_sock *hc = ccid_priv(ccid); - - hc->tx_state = TFRC_SSTATE_NO_SENT; - hc->tx_hist = NULL; - hc->sk = sk; - timer_setup(&hc->tx_no_feedback_timer, - ccid3_hc_tx_no_feedback_timer, 0); - return 0; -} - -static void ccid3_hc_tx_exit(struct sock *sk) -{ - struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - - sk_stop_timer(sk, &hc->tx_no_feedback_timer); - tfrc_tx_hist_purge(&hc->tx_hist); -} - -static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) -{ - info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto; - info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt; -} - -static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct tfrc_tx_info tfrc; - const void *val; - - switch (optname) { - case DCCP_SOCKOPT_CCID_TX_INFO: - if (len < sizeof(tfrc)) - return -EINVAL; - memset(&tfrc, 0, sizeof(tfrc)); - tfrc.tfrctx_x = hc->tx_x; - tfrc.tfrctx_x_recv = hc->tx_x_recv; - tfrc.tfrctx_x_calc = hc->tx_x_calc; - tfrc.tfrctx_rtt = hc->tx_rtt; - tfrc.tfrctx_p = hc->tx_p; - tfrc.tfrctx_rto = hc->tx_t_rto; - tfrc.tfrctx_ipi = hc->tx_t_ipi; - len = sizeof(tfrc); - val = &tfrc; - break; - default: - return -ENOPROTOOPT; - } - - if (put_user(len, optlen) || copy_to_user(optval, val, len)) - return -EFAULT; - - return 0; -} - -/* - * Receiver Half-Connection Routines - */ - -/* CCID3 feedback types */ -enum ccid3_fback_type { - CCID3_FBACK_NONE = 0, - CCID3_FBACK_INITIAL, - CCID3_FBACK_PERIODIC, - CCID3_FBACK_PARAM_CHANGE -}; - -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) -{ - static const char *const ccid3_rx_state_names[] = { - [TFRC_RSTATE_NO_DATA] = "NO_DATA", - [TFRC_RSTATE_DATA] = "DATA", - }; - - return ccid3_rx_state_names[state]; -} -#endif - -static void ccid3_hc_rx_set_state(struct sock *sk, - enum ccid3_hc_rx_states state) -{ - struct ccid3_hc_rx_sock *hc 
= ccid3_hc_rx_sk(sk); - enum ccid3_hc_rx_states oldstate = hc->rx_state; - - ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", - dccp_role(sk), sk, ccid3_rx_state_name(oldstate), - ccid3_rx_state_name(state)); - WARN_ON(state == oldstate); - hc->rx_state = state; -} - -static void ccid3_hc_rx_send_feedback(struct sock *sk, - const struct sk_buff *skb, - enum ccid3_fback_type fbtype) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - ktime_t now = ktime_get(); - s64 delta = 0; - - switch (fbtype) { - case CCID3_FBACK_INITIAL: - hc->rx_x_recv = 0; - hc->rx_pinv = ~0U; /* see RFC 4342, 8.5 */ - break; - case CCID3_FBACK_PARAM_CHANGE: - /* - * When parameters change (new loss or p > p_prev), we do not - * have a reliable estimate for R_m of [RFC 3448, 6.2] and so - * need to reuse the previous value of X_recv. However, when - * X_recv was 0 (due to early loss), this would kill X down to - * s/t_mbi (i.e. one packet in 64 seconds). - * To avoid such drastic reduction, we approximate X_recv as - * the number of bytes since last feedback. - * This is a safe fallback, since X is bounded above by X_calc. - */ - if (hc->rx_x_recv > 0) - break; - fallthrough; - case CCID3_FBACK_PERIODIC: - delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback); - if (delta <= 0) - delta = 1; - hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta); - break; - default: - return; - } - - ccid3_pr_debug("Interval %lldusec, X_recv=%u, 1/p=%u\n", delta, - hc->rx_x_recv, hc->rx_pinv); - - hc->rx_tstamp_last_feedback = now; - hc->rx_last_counter = dccp_hdr(skb)->dccph_ccval; - hc->rx_bytes_recv = 0; - - dp->dccps_hc_rx_insert_options = 1; - dccp_send_ack(sk); -} - -static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) -{ - const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - __be32 x_recv, pinv; - - if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) - return 0; - - if (dccp_packet_without_ack(skb)) - return 0; - - x_recv = htonl(hc->rx_x_recv); - pinv = htonl(hc->rx_pinv); - - if (dccp_insert_option(skb, TFRC_OPT_LOSS_EVENT_RATE, - &pinv, sizeof(pinv)) || - dccp_insert_option(skb, TFRC_OPT_RECEIVE_RATE, - &x_recv, sizeof(x_recv))) - return -1; - - return 0; -} - -/** - * ccid3_first_li - Implements [RFC 5348, 6.3.1] - * @sk: socket to calculate loss interval for - * - * Determine the length of the first loss interval via inverse lookup. - * Assume that X_recv can be computed by the throughput equation - * s - * X_recv = -------- - * R * fval - * Find some p such that f(p) = fval; return 1/p (scaled). - */ -static u32 ccid3_first_li(struct sock *sk) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - u32 x_recv, p; - s64 delta; - u64 fval; - - if (hc->rx_rtt == 0) { - DCCP_WARN("No RTT estimate available, using fallback RTT\n"); - hc->rx_rtt = DCCP_FALLBACK_RTT; - } - - delta = ktime_us_delta(ktime_get(), hc->rx_tstamp_last_feedback); - if (delta <= 0) - delta = 1; - x_recv = scaled_div32(hc->rx_bytes_recv, delta); - if (x_recv == 0) { /* would also trigger divide-by-zero */ - DCCP_WARN("X_recv==0\n"); - if (hc->rx_x_recv == 0) { - DCCP_BUG("stored value of X_recv is zero"); - return ~0U; - } - x_recv = hc->rx_x_recv; - } - - fval = scaled_div(hc->rx_s, hc->rx_rtt); - fval = scaled_div32(fval, x_recv); - p = tfrc_calc_x_reverse_lookup(fval); - - ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " - "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); - - return p == 0 ? 
~0U : scaled_div(1, p); -} - -static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; - const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; - const bool is_data_packet = dccp_data_packet(skb); - - if (unlikely(hc->rx_state == TFRC_RSTATE_NO_DATA)) { - if (is_data_packet) { - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; - do_feedback = CCID3_FBACK_INITIAL; - ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); - hc->rx_s = payload; - /* - * Not necessary to update rx_bytes_recv here, - * since X_recv = 0 for the first feedback packet (cf. - * RFC 3448, 6.3) -- gerrit - */ - } - goto update_records; - } - - if (tfrc_rx_hist_duplicate(&hc->rx_hist, skb)) - return; /* done receiving */ - - if (is_data_packet) { - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; - /* - * Update moving-average of s and the sum of received payload bytes - */ - hc->rx_s = tfrc_ewma(hc->rx_s, payload, 9); - hc->rx_bytes_recv += payload; - } - - /* - * Perform loss detection and handle pending losses - */ - if (tfrc_rx_handle_loss(&hc->rx_hist, &hc->rx_li_hist, - skb, ndp, ccid3_first_li, sk)) { - do_feedback = CCID3_FBACK_PARAM_CHANGE; - goto done_receiving; - } - - if (tfrc_rx_hist_loss_pending(&hc->rx_hist)) - return; /* done receiving */ - - /* - * Handle data packets: RTT sampling and monitoring p - */ - if (unlikely(!is_data_packet)) - goto update_records; - - if (!tfrc_lh_is_initialised(&hc->rx_li_hist)) { - const u32 sample = tfrc_rx_hist_sample_rtt(&hc->rx_hist, skb); - /* - * Empty loss history: no loss so far, hence p stays 0. - * Sample RTT values, since an RTT estimate is required for the - * computation of p when the first loss occurs; RFC 3448, 6.3.1. - */ - if (sample != 0) - hc->rx_rtt = tfrc_ewma(hc->rx_rtt, sample, 9); - - } else if (tfrc_lh_update_i_mean(&hc->rx_li_hist, skb)) { - /* - * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean - * has decreased (resp. p has increased), send feedback now. 
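
[Editor's note] The periodic-feedback test applied a few lines below compares the mod-16 window-counter distance against 3 (RFC 4342, 10.3). A minimal sketch with illustrative counter values:

#include <stdio.h>

static unsigned sub16(unsigned a, unsigned b)
{
        return (a - b) & 0xF;           /* mod-16 counter distance */
}

int main(void)
{
        unsigned last_fb = 14;

        for (unsigned ccval = 14; ccval != 6; ccval = (ccval + 1) & 0xF)
                printf("ccval=%2u dist=%2u feedback=%s\n", ccval,
                       sub16(ccval, last_fb),
                       sub16(ccval, last_fb) > 3 ? "yes" : "no");
        return 0;
}
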
- */ - do_feedback = CCID3_FBACK_PARAM_CHANGE; - } - - /* - * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 - */ - if (SUB16(dccp_hdr(skb)->dccph_ccval, hc->rx_last_counter) > 3) - do_feedback = CCID3_FBACK_PERIODIC; - -update_records: - tfrc_rx_hist_add_packet(&hc->rx_hist, skb, ndp); - -done_receiving: - if (do_feedback) - ccid3_hc_rx_send_feedback(sk, skb, do_feedback); -} - -static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) -{ - struct ccid3_hc_rx_sock *hc = ccid_priv(ccid); - - hc->rx_state = TFRC_RSTATE_NO_DATA; - tfrc_lh_init(&hc->rx_li_hist); - return tfrc_rx_hist_alloc(&hc->rx_hist); -} - -static void ccid3_hc_rx_exit(struct sock *sk) -{ - struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - - tfrc_rx_hist_purge(&hc->rx_hist); - tfrc_lh_cleanup(&hc->rx_li_hist); -} - -static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) -{ - info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state; - info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt; -} - -static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, - u32 __user *optval, int __user *optlen) -{ - const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - struct tfrc_rx_info rx_info; - const void *val; - - switch (optname) { - case DCCP_SOCKOPT_CCID_RX_INFO: - if (len < sizeof(rx_info)) - return -EINVAL; - rx_info.tfrcrx_x_recv = hc->rx_x_recv; - rx_info.tfrcrx_rtt = hc->rx_rtt; - rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv); - len = sizeof(rx_info); - val = &rx_info; - break; - default: - return -ENOPROTOOPT; - } - - if (put_user(len, optlen) || copy_to_user(optval, val, len)) - return -EFAULT; - - return 0; -} - -struct ccid_operations ccid3_ops = { - .ccid_id = DCCPC_CCID3, - .ccid_name = "TCP-Friendly Rate Control", - .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock), - .ccid_hc_tx_init = ccid3_hc_tx_init, - .ccid_hc_tx_exit = ccid3_hc_tx_exit, - .ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet, - .ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent, - .ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv, - .ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options, - .ccid_hc_rx_obj_size = sizeof(struct ccid3_hc_rx_sock), - .ccid_hc_rx_init = ccid3_hc_rx_init, - .ccid_hc_rx_exit = ccid3_hc_rx_exit, - .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options, - .ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv, - .ccid_hc_rx_get_info = ccid3_hc_rx_get_info, - .ccid_hc_tx_get_info = ccid3_hc_tx_get_info, - .ccid_hc_rx_getsockopt = ccid3_hc_rx_getsockopt, - .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, -}; - -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -module_param(ccid3_debug, bool, 0644); -MODULE_PARM_DESC(ccid3_debug, "Enable CCID-3 debug messages"); -#endif diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h deleted file mode 100644 index 02e0fc9f6334..000000000000 --- a/net/dccp/ccids/ccid3.h +++ /dev/null @@ -1,148 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * - * An implementation of the DCCP protocol - * - * This code has been developed by the University of Waikato WAND - * research group. 
For further information please see https://www.wand.net.nz/ - * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo <acme@conectiva.com.br>. - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ -#ifndef _DCCP_CCID3_H_ -#define _DCCP_CCID3_H_ - -#include <linux/ktime.h> -#include <linux/list.h> -#include <linux/types.h> -#include <linux/tfrc.h> -#include "lib/tfrc.h" -#include "../ccid.h" - -/* Two seconds as per RFC 5348, 4.2 */ -#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) - -/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ -#define TFRC_T_MBI 64 - -/* - * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are - * rounded down to 0, since sk_reset_timer() here uses millisecond granularity. - * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse - * resolution of HZ < 500 means that the error is below one timer tick (t_gran) - * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ). - */ -#if (HZ >= 500) -# define TFRC_T_DELTA USEC_PER_MSEC -#else -# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ)) -#endif - -enum ccid3_options { - TFRC_OPT_LOSS_EVENT_RATE = 192, - TFRC_OPT_LOSS_INTERVALS = 193, - TFRC_OPT_RECEIVE_RATE = 194, -}; - -/* TFRC sender states */ -enum ccid3_hc_tx_states { - TFRC_SSTATE_NO_SENT = 1, - TFRC_SSTATE_NO_FBACK, - TFRC_SSTATE_FBACK, -}; - -/** - * struct ccid3_hc_tx_sock - CCID3 sender half-connection socket - * @tx_x: Current sending rate in 64 * bytes per second - * @tx_x_recv: Receive rate in 64 * bytes per second - * @tx_x_calc: Calculated rate in bytes per second - * @tx_rtt: Estimate of current round trip time in usecs - * @tx_p: Current loss event rate (0-1) scaled by 1000000 - * @tx_s: Packet size in bytes - * @tx_t_rto: Nofeedback Timer setting in usecs - * @tx_t_ipi: Interpacket (send) interval (RFC 3448, 4.6) in usecs - * @tx_state: Sender state, one of %ccid3_hc_tx_states - * @tx_last_win_count: Last window counter sent - * @tx_t_last_win_count: Timestamp of earliest packet - * with last_win_count value sent - * @tx_no_feedback_timer: Handle to no feedback timer - * @tx_t_ld: Time last doubled during slow start - * @tx_t_nom: Nominal send time of next packet - * @tx_hist: Packet history - */ -struct ccid3_hc_tx_sock { - u64 tx_x; - u64 tx_x_recv; - u32 tx_x_calc; - u32 tx_rtt; - u32 tx_p; - u32 tx_t_rto; - u32 tx_t_ipi; - u16 tx_s; - enum ccid3_hc_tx_states tx_state:8; - u8 tx_last_win_count; - ktime_t tx_t_last_win_count; - struct timer_list tx_no_feedback_timer; - struct sock *sk; - ktime_t tx_t_ld; - ktime_t tx_t_nom; - struct tfrc_tx_hist_entry *tx_hist; -}; - -static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) -{ - struct ccid3_hc_tx_sock *hctx = ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid); - BUG_ON(hctx == NULL); - return hctx; -} - -/* TFRC receiver states */ -enum ccid3_hc_rx_states { - TFRC_RSTATE_NO_DATA = 1, - TFRC_RSTATE_DATA, -}; - -/** - * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket - * @rx_last_counter: Tracks window counter (RFC 4342, 8.1) - * @rx_state: Receiver state, one of %ccid3_hc_rx_states - * @rx_bytes_recv: Total sum of DCCP 
payload bytes - * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3) - * @rx_rtt: Receiver estimate of RTT - * @rx_tstamp_last_feedback: Time at which last feedback was sent - * @rx_hist: Packet history (loss detection + RTT sampling) - * @rx_li_hist: Loss Interval database - * @rx_s: Received packet size in bytes - * @rx_pinv: Inverse of Loss Event Rate (RFC 4342, sec. 8.5) - */ -struct ccid3_hc_rx_sock { - u8 rx_last_counter:4; - enum ccid3_hc_rx_states rx_state:8; - u32 rx_bytes_recv; - u32 rx_x_recv; - u32 rx_rtt; - ktime_t rx_tstamp_last_feedback; - struct tfrc_rx_hist rx_hist; - struct tfrc_loss_hist rx_li_hist; - u16 rx_s; -#define rx_pinv rx_li_hist.i_mean -}; - -static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) -{ - struct ccid3_hc_rx_sock *hcrx = ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid); - BUG_ON(hcrx == NULL); - return hcrx; -} - -#endif /* _DCCP_CCID3_H_ */ diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c deleted file mode 100644 index da95319842bb..000000000000 --- a/net/dccp/ccids/lib/loss_interval.c +++ /dev/null @@ -1,184 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz> - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ -#include <net/sock.h> -#include "tfrc.h" - -static struct kmem_cache *tfrc_lh_slab __read_mostly; -/* Loss Interval weights from [RFC 3448, 5.4], scaled by 10 */ -static const int tfrc_lh_weights[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 }; - -/* implements LIFO semantics on the array */ -static inline u8 LIH_INDEX(const u8 ctr) -{ - return LIH_SIZE - 1 - (ctr % LIH_SIZE); -} - -/* the `counter' index always points at the next entry to be populated */ -static inline struct tfrc_loss_interval *tfrc_lh_peek(struct tfrc_loss_hist *lh) -{ - return lh->counter ? 
lh->ring[LIH_INDEX(lh->counter - 1)] : NULL; -} - -/* given i with 0 <= i <= k, return I_i as per the rfc3448bis notation */ -static inline u32 tfrc_lh_get_interval(struct tfrc_loss_hist *lh, const u8 i) -{ - BUG_ON(i >= lh->counter); - return lh->ring[LIH_INDEX(lh->counter - i - 1)]->li_length; -} - -/* - * On-demand allocation and de-allocation of entries - */ -static struct tfrc_loss_interval *tfrc_lh_demand_next(struct tfrc_loss_hist *lh) -{ - if (lh->ring[LIH_INDEX(lh->counter)] == NULL) - lh->ring[LIH_INDEX(lh->counter)] = kmem_cache_alloc(tfrc_lh_slab, - GFP_ATOMIC); - return lh->ring[LIH_INDEX(lh->counter)]; -} - -void tfrc_lh_cleanup(struct tfrc_loss_hist *lh) -{ - if (!tfrc_lh_is_initialised(lh)) - return; - - for (lh->counter = 0; lh->counter < LIH_SIZE; lh->counter++) - if (lh->ring[LIH_INDEX(lh->counter)] != NULL) { - kmem_cache_free(tfrc_lh_slab, - lh->ring[LIH_INDEX(lh->counter)]); - lh->ring[LIH_INDEX(lh->counter)] = NULL; - } -} - -static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) -{ - u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0; - int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */ - - if (k <= 0) - return; - - for (i = 0; i <= k; i++) { - i_i = tfrc_lh_get_interval(lh, i); - - if (i < k) { - i_tot0 += i_i * tfrc_lh_weights[i]; - w_tot += tfrc_lh_weights[i]; - } - if (i > 0) - i_tot1 += i_i * tfrc_lh_weights[i-1]; - } - - lh->i_mean = max(i_tot0, i_tot1) / w_tot; -} - -/** - * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 - * @lh: histogram to update - * @skb: received socket triggering loss interval update - * - * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev - */ -u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) -{ - struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); - u32 old_i_mean = lh->i_mean; - s64 len; - - if (cur == NULL) /* not initialised */ - return 0; - - len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; - - if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ - return 0; - - if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) - /* - * Implements RFC 4342, 10.2: - * If a packet S (skb) exists whose seqno comes `after' the one - * starting the current loss interval (cur) and if the modulo-16 - * distance from C(cur) to C(S) is greater than 4, consider all - * subsequent packets as belonging to a new loss interval. This - * test is necessary since CCVal may wrap between intervals. - */ - cur->li_is_closed = 1; - - if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ - return 0; - - cur->li_length = len; - tfrc_lh_calc_i_mean(lh); - - return lh->i_mean < old_i_mean; -} - -/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ -static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, - struct tfrc_rx_hist_entry *new_loss) -{ - return dccp_delta_seqno(cur->li_seqno, new_loss->tfrchrx_seqno) > 0 && - (cur->li_is_closed || SUB16(new_loss->tfrchrx_ccval, cur->li_ccval) > 4); -} - -/** - * tfrc_lh_interval_add - Insert new record into the Loss Interval database - * @lh: Loss Interval database - * @rh: Receive history containing a fresh loss event - * @calc_first_li: Caller-dependent routine to compute length of first interval - * @sk: Used by @calc_first_li in caller-specific way (subtyping) - * - * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. 
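
[Editor's note] For reference, a standalone rendering of the weighted I_mean computation from tfrc_lh_calc_i_mean() above, using the RFC 3448, 5.4 weights scaled by 10; the interval lengths are invented:

#include <stdio.h>

static const int weights[8] = { 10, 10, 10, 10, 8, 6, 4, 2 };

int main(void)
{
        /* I_0 (open, most recent) first, then I_1 .. I_k */
        unsigned I[] = { 40, 100, 120, 80, 90, 110, 95, 130, 105 };
        int k = 8;                      /* index of the oldest interval */
        unsigned tot0 = 0, tot1 = 0, w_tot = 0;

        for (int i = 0; i <= k; i++) {
                if (i < k) {            /* sum including the open I_0 */
                        tot0 += I[i] * weights[i];
                        w_tot += weights[i];
                }
                if (i > 0)              /* sum excluding I_0 */
                        tot1 += I[i] * weights[i - 1];
        }
        /* the larger of the two weighted sums wins, as in the kernel code */
        printf("I_mean = %u\n", (tot0 > tot1 ? tot0 : tot1) / w_tot);
        return 0;
}
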
- */ -int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, - u32 (*calc_first_li)(struct sock *), struct sock *sk) -{ - struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; - - if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) - return 0; - - new = tfrc_lh_demand_next(lh); - if (unlikely(new == NULL)) { - DCCP_CRIT("Cannot allocate/add loss record."); - return 0; - } - - new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; - new->li_ccval = tfrc_rx_hist_loss_prev(rh)->tfrchrx_ccval; - new->li_is_closed = 0; - - if (++lh->counter == 1) - lh->i_mean = new->li_length = (*calc_first_li)(sk); - else { - cur->li_length = dccp_delta_seqno(cur->li_seqno, new->li_seqno); - new->li_length = dccp_delta_seqno(new->li_seqno, - tfrc_rx_hist_last_rcv(rh)->tfrchrx_seqno) + 1; - if (lh->counter > (2*LIH_SIZE)) - lh->counter -= LIH_SIZE; - - tfrc_lh_calc_i_mean(lh); - } - return 1; -} - -int __init tfrc_li_init(void) -{ - tfrc_lh_slab = kmem_cache_create("tfrc_li_hist", - sizeof(struct tfrc_loss_interval), 0, - SLAB_HWCACHE_ALIGN, NULL); - return tfrc_lh_slab == NULL ? -ENOBUFS : 0; -} - -void tfrc_li_exit(void) -{ - if (tfrc_lh_slab != NULL) { - kmem_cache_destroy(tfrc_lh_slab); - tfrc_lh_slab = NULL; - } -} diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h deleted file mode 100644 index c3d95f85e43b..000000000000 --- a/net/dccp/ccids/lib/loss_interval.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _DCCP_LI_HIST_ -#define _DCCP_LI_HIST_ -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz> - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ -#include <linux/ktime.h> -#include <linux/list.h> -#include <linux/slab.h> - -/* - * Number of loss intervals (RFC 4342, 8.6.1). The history size is one more than - * NINTERVAL, since the `open' interval I_0 is always stored as the first entry. 
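
[Editor's note] The loss-interval ring is managed LIFO through LIH_INDEX(), with LIH_SIZE = 9 as defined just below. A small demonstration of how the growing counter maps onto ring slots (illustrative values):

#include <stdio.h>

#define LIH_SIZE 9

static unsigned lih_index(unsigned ctr)
{
        return LIH_SIZE - 1 - (ctr % LIH_SIZE);
}

int main(void)
{
        for (unsigned ctr = 0; ctr < 12; ctr++)
                printf("counter=%2u -> slot %u\n", ctr, lih_index(ctr));
        /* 0->8, 1->7, ... 8->0, then wraps: 9->8, 10->7, ... so the
         * most recent entry is always at lih_index(counter - 1) */
        return 0;
}
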
- */ -#define NINTERVAL 8 -#define LIH_SIZE (NINTERVAL + 1) - -/** - * tfrc_loss_interval - Loss history record for TFRC-based protocols - * @li_seqno: Highest received seqno before the start of loss - * @li_ccval: The CCVal belonging to @li_seqno - * @li_is_closed: Whether @li_seqno is older than 1 RTT - * @li_length: Loss interval sequence length - */ -struct tfrc_loss_interval { - u64 li_seqno:48, - li_ccval:4, - li_is_closed:1; - u32 li_length; -}; - -/** - * tfrc_loss_hist - Loss record database - * @ring: Circular queue managed in LIFO manner - * @counter: Current count of entries (can be more than %LIH_SIZE) - * @i_mean: Current Average Loss Interval [RFC 3448, 5.4] - */ -struct tfrc_loss_hist { - struct tfrc_loss_interval *ring[LIH_SIZE]; - u8 counter; - u32 i_mean; -}; - -static inline void tfrc_lh_init(struct tfrc_loss_hist *lh) -{ - memset(lh, 0, sizeof(struct tfrc_loss_hist)); -} - -static inline u8 tfrc_lh_is_initialised(struct tfrc_loss_hist *lh) -{ - return lh->counter > 0; -} - -static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh) -{ - return min(lh->counter, (u8)LIH_SIZE); -} - -struct tfrc_rx_hist; - -int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, - u32 (*first_li)(struct sock *), struct sock *); -u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); -void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); - -#endif /* _DCCP_LI_HIST_ */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c deleted file mode 100644 index 0cdda3c66fb5..000000000000 --- a/net/dccp/ccids/lib/packet_history.c +++ /dev/null @@ -1,439 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand. - * - * An implementation of the DCCP protocol - * - * This code has been developed by the University of Waikato WAND - * research group. For further information please see https://www.wand.net.nz/ - * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo <acme@conectiva.com.br>. - * - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#include <linux/string.h> -#include <linux/slab.h> -#include "packet_history.h" -#include "../../dccp.h" - -/* - * Transmitter History Routines - */ -static struct kmem_cache *tfrc_tx_hist_slab; - -int __init tfrc_tx_packet_history_init(void) -{ - tfrc_tx_hist_slab = kmem_cache_create("tfrc_tx_hist", - sizeof(struct tfrc_tx_hist_entry), - 0, SLAB_HWCACHE_ALIGN, NULL); - return tfrc_tx_hist_slab == NULL ? 
-ENOBUFS : 0; -} - -void tfrc_tx_packet_history_exit(void) -{ - if (tfrc_tx_hist_slab != NULL) { - kmem_cache_destroy(tfrc_tx_hist_slab); - tfrc_tx_hist_slab = NULL; - } -} - -int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) -{ - struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); - - if (entry == NULL) - return -ENOBUFS; - entry->seqno = seqno; - entry->stamp = ktime_get_real(); - entry->next = *headp; - *headp = entry; - return 0; -} - -void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) -{ - struct tfrc_tx_hist_entry *head = *headp; - - while (head != NULL) { - struct tfrc_tx_hist_entry *next = head->next; - - kmem_cache_free(tfrc_tx_hist_slab, head); - head = next; - } - - *headp = NULL; -} - -/* - * Receiver History Routines - */ -static struct kmem_cache *tfrc_rx_hist_slab; - -int __init tfrc_rx_packet_history_init(void) -{ - tfrc_rx_hist_slab = kmem_cache_create("tfrc_rxh_cache", - sizeof(struct tfrc_rx_hist_entry), - 0, SLAB_HWCACHE_ALIGN, NULL); - return tfrc_rx_hist_slab == NULL ? -ENOBUFS : 0; -} - -void tfrc_rx_packet_history_exit(void) -{ - if (tfrc_rx_hist_slab != NULL) { - kmem_cache_destroy(tfrc_rx_hist_slab); - tfrc_rx_hist_slab = NULL; - } -} - -static inline void tfrc_rx_hist_entry_from_skb(struct tfrc_rx_hist_entry *entry, - const struct sk_buff *skb, - const u64 ndp) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - - entry->tfrchrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq; - entry->tfrchrx_ccval = dh->dccph_ccval; - entry->tfrchrx_type = dh->dccph_type; - entry->tfrchrx_ndp = ndp; - entry->tfrchrx_tstamp = ktime_get_real(); -} - -void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, - const struct sk_buff *skb, - const u64 ndp) -{ - struct tfrc_rx_hist_entry *entry = tfrc_rx_hist_last_rcv(h); - - tfrc_rx_hist_entry_from_skb(entry, skb, ndp); -} - -/* has the packet contained in skb been seen before? */ -int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) -{ - const u64 seq = DCCP_SKB_CB(skb)->dccpd_seq; - int i; - - if (dccp_delta_seqno(tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, seq) <= 0) - return 1; - - for (i = 1; i <= h->loss_count; i++) - if (tfrc_rx_hist_entry(h, i)->tfrchrx_seqno == seq) - return 1; - - return 0; -} - -static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) -{ - const u8 idx_a = tfrc_rx_hist_index(h, a), - idx_b = tfrc_rx_hist_index(h, b); - - swap(h->ring[idx_a], h->ring[idx_b]); -} - -/* - * Private helper functions for loss detection. - * - * In the descriptions, `Si' refers to the sequence number of entry number i, - * whose NDP count is `Ni' (lower case is used for variables). - * Note: All __xxx_loss functions expect that a test against duplicates has been - * performed already: the seqno of the skb must not be less than the seqno - * of loss_prev; and it must not equal that of any valid history entry. 
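The duplicate test above and the __xxx_loss helpers that follow all reduce to the condition of RFC 4340, 7.7.1: the span from S1 to S2 is loss-free iff the sequence distance equals the NDP count plus one, since NDP counts the non-data packets sent in between. A standalone sketch using plain 64-bit integers (ignoring the mod-2^48 wrap that dccp_delta_seqno() handles in the real code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* lost data packets between s1 and s2, given s2's NDP count */
static uint64_t loss_count(uint64_t s1, uint64_t s2, uint64_t ndp)
{
	int64_t delta = (int64_t)(s2 - s1) - (int64_t)(ndp + 1);

	return delta > 0 ? (uint64_t)delta : 0;
}

static bool loss_free(uint64_t s1, uint64_t s2, uint64_t ndp)
{
	return loss_count(s1, s2, ndp) == 0;
}

int main(void)
{
	/* 100 -> 101: consecutive, trivially loss-free */
	printf("%d\n", loss_free(100, 101, 0));			/* 1 */
	/* 100 -> 103, NDP 2: both missing packets were non-data */
	printf("%d\n", loss_free(100, 103, 2));			/* 1 */
	/* 100 -> 103, NDP 0: two data packets actually lost */
	printf("%llu\n", (unsigned long long)loss_count(100, 103, 0)); /* 2 */
	return 0;
}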
- */ -static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1) -{ - u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, - s1 = DCCP_SKB_CB(skb)->dccpd_seq; - - if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ - h->loss_count = 1; - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1); - } -} - -static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) -{ - u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, - s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, - s2 = DCCP_SKB_CB(skb)->dccpd_seq; - - if (likely(dccp_delta_seqno(s1, s2) > 0)) { /* S1 < S2 */ - h->loss_count = 2; - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n2); - return; - } - - /* S0 < S2 < S1 */ - - if (dccp_loss_free(s0, s2, n2)) { - u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp; - - if (dccp_loss_free(s2, s1, n1)) { - /* hole is filled: S0, S2, and S1 are consecutive */ - h->loss_count = 0; - h->loss_start = tfrc_rx_hist_index(h, 1); - } else - /* gap between S2 and S1: just update loss_prev */ - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); - - } else { /* gap between S0 and S2 */ - /* - * Reorder history to insert S2 between S0 and S1 - */ - tfrc_rx_hist_swap(h, 0, 3); - h->loss_start = tfrc_rx_hist_index(h, 3); - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n2); - h->loss_count = 2; - } -} - -/* return 1 if a new loss event has been identified */ -static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3) -{ - u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, - s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, - s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno, - s3 = DCCP_SKB_CB(skb)->dccpd_seq; - - if (likely(dccp_delta_seqno(s2, s3) > 0)) { /* S2 < S3 */ - h->loss_count = 3; - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 3), skb, n3); - return 1; - } - - /* S3 < S2 */ - - if (dccp_delta_seqno(s1, s3) > 0) { /* S1 < S3 < S2 */ - /* - * Reorder history to insert S3 between S1 and S2 - */ - tfrc_rx_hist_swap(h, 2, 3); - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n3); - h->loss_count = 3; - return 1; - } - - /* S0 < S3 < S1 */ - - if (dccp_loss_free(s0, s3, n3)) { - u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp; - - if (dccp_loss_free(s3, s1, n1)) { - /* hole between S0 and S1 filled by S3 */ - u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp; - - if (dccp_loss_free(s1, s2, n2)) { - /* entire hole filled by S0, S3, S1, S2 */ - h->loss_start = tfrc_rx_hist_index(h, 2); - h->loss_count = 0; - } else { - /* gap remains between S1 and S2 */ - h->loss_start = tfrc_rx_hist_index(h, 1); - h->loss_count = 1; - } - - } else /* gap exists between S3 and S1, loss_count stays at 2 */ - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n3); - - return 0; - } - - /* - * The remaining case: S0 < S3 < S1 < S2; gap between S0 and S3 - * Reorder history to insert S3 between S0 and S1. - */ - tfrc_rx_hist_swap(h, 0, 3); - h->loss_start = tfrc_rx_hist_index(h, 3); - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n3); - h->loss_count = 3; - - return 1; -} - -/* recycle RX history records to continue loss detection if necessary */ -static void __three_after_loss(struct tfrc_rx_hist *h) -{ - /* - * At this stage we know already that there is a gap between S0 and S1 - * (since S0 was the highest sequence number received before detecting - * the loss). To recycle the loss record, it is thus only necessary to - * check for other possible gaps between S1/S2 and between S2/S3. 
- */ - u64 s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno, - s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno, - s3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_seqno; - u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp, - n3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_ndp; - - if (dccp_loss_free(s1, s2, n2)) { - - if (dccp_loss_free(s2, s3, n3)) { - /* no gap between S2 and S3: entire hole is filled */ - h->loss_start = tfrc_rx_hist_index(h, 3); - h->loss_count = 0; - } else { - /* gap between S2 and S3 */ - h->loss_start = tfrc_rx_hist_index(h, 2); - h->loss_count = 1; - } - - } else { /* gap between S1 and S2 */ - h->loss_start = tfrc_rx_hist_index(h, 1); - h->loss_count = 2; - } -} - -/** - * tfrc_rx_handle_loss - Loss detection and further processing - * @h: The non-empty RX history object - * @lh: Loss Intervals database to update - * @skb: Currently received packet - * @ndp: The NDP count belonging to @skb - * @calc_first_li: Caller-dependent computation of first loss interval in @lh - * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) - * - * Chooses action according to pending loss, updates LI database when a new - * loss was detected, and does required post-processing. Returns 1 when caller - * should send feedback, 0 otherwise. - * Since it also takes care of reordering during loss detection and updates the - * records accordingly, the caller should not perform any more RX history - * operations when loss_count is greater than 0 after calling this function. - */ -int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, - struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*calc_first_li)(struct sock *), struct sock *sk) -{ - int is_new_loss = 0; - - if (h->loss_count == 0) { - __do_track_loss(h, skb, ndp); - } else if (h->loss_count == 1) { - __one_after_loss(h, skb, ndp); - } else if (h->loss_count != 2) { - DCCP_BUG("invalid loss_count %d", h->loss_count); - } else if (__two_after_loss(h, skb, ndp)) { - /* - * Update Loss Interval database and recycle RX records - */ - is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); - __three_after_loss(h); - } - return is_new_loss; -} - -int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) -{ - int i; - - for (i = 0; i <= TFRC_NDUPACK; i++) { - h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); - if (h->ring[i] == NULL) - goto out_free; - } - - h->loss_count = h->loss_start = 0; - return 0; - -out_free: - while (i-- != 0) { - kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); - h->ring[i] = NULL; - } - return -ENOBUFS; -} - -void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) -{ - int i; - - for (i = 0; i <= TFRC_NDUPACK; ++i) - if (h->ring[i] != NULL) { - kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); - h->ring[i] = NULL; - } -} - -/** - * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against - * @h: The non-empty RX history object - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) -{ - return h->ring[0]; -} - -/** - * tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry - * @h: The non-empty RX history object - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) -{ - return h->ring[h->rtt_sample_prev]; -} - -/** - * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal - * @h: receive histogram - * @skb: packet containing timestamp. - * - * Based on ideas presented in RFC 4342, 8.1. 
Returns 0 if it was not able - * to compute a sample with given data - calling function should check this. - */ -u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) -{ - u32 sample = 0, - delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); - - if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */ - if (h->rtt_sample_prev == 2) { /* previous candidate stored */ - sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); - if (sample) - sample = 4 / sample * - ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp); - else /* - * FIXME: This condition is in principle not - * possible but occurs when CCID is used for - * two-way data traffic. I have tried to trace - * it, but the cause does not seem to be here. - */ - DCCP_BUG("please report to dccp@vger.kernel.org" - " => prev = %u, last = %u", - tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); - } else if (delta_v < 1) { - h->rtt_sample_prev = 1; - goto keep_ref_for_next_time; - } - - } else if (delta_v == 4) /* optimal match */ - sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); - else { /* suboptimal match */ - h->rtt_sample_prev = 2; - goto keep_ref_for_next_time; - } - - if (unlikely(sample > DCCP_SANE_RTT_MAX)) { - DCCP_WARN("RTT sample %u too large, using max\n", sample); - sample = DCCP_SANE_RTT_MAX; - } - - h->rtt_sample_prev = 0; /* use current entry as next reference */ -keep_ref_for_next_time: - - return sample; -} diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h deleted file mode 100644 index 159cc9326eab..000000000000 --- a/net/dccp/ccids/lib/packet_history.h +++ /dev/null @@ -1,142 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * Packet RX/TX history data structures and routines for TFRC-based protocols. - * - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. - * - * This code has been developed by the University of Waikato WAND - * research group. For further information please see https://www.wand.net.nz/ - * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz - * - * This code also uses code from Lulea University, rereleased as GPL by its - * authors: - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - * - * Changes to meet Linux coding standards, to make it meet latest ccid3 draft - * and to make it work as a loadable module in the DCCP stack written by - * Arnaldo Carvalho de Melo <acme@conectiva.com.br>. 
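The sampling rule in tfrc_rx_hist_sample_rtt() above rests on the CCVal window counter advancing once per quarter-RTT (RFC 4342, 8.1): two packets whose counters differ by delta_v quarters, received dt microseconds apart, give RTT ~= 4 * dt / delta_v. A simplified sketch of just that rule, without the fallback-candidate bookkeeping and the DCCP_SANE_RTT_MAX clamp of the real code:

#include <stdint.h>
#include <stdio.h>

/* mod-16 window-counter distance, as in SUB16() */
#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)

/*
 * RTT estimate from two received packets: delta_v quarter-RTTs
 * elapsed over dt_us microseconds. Returns 0 when the counter
 * distance is unusable (outside 1..4, per RFC 4342, 8.1).
 */
static uint32_t rtt_from_ccval(uint8_t ccval1, uint8_t ccval2, uint32_t dt_us)
{
	uint32_t delta_v = SUB16(ccval2, ccval1);

	if (delta_v < 1 || delta_v > 4)
		return 0;
	return 4 * dt_us / delta_v;
}

int main(void)
{
	/* counters 4 apart: dt spans exactly one RTT */
	printf("%u us\n", rtt_from_ccval(3, 7, 120000));	/* 120000 */
	/* counters 2 apart: dt spans half an RTT */
	printf("%u us\n", rtt_from_ccval(3, 5, 60000));		/* 120000 */
	return 0;
}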
- * - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#ifndef _DCCP_PKT_HIST_ -#define _DCCP_PKT_HIST_ - -#include <linux/list.h> -#include <linux/slab.h> -#include "tfrc.h" - -/** - * tfrc_tx_hist_entry - Simple singly-linked TX history list - * @next: next oldest entry (LIFO order) - * @seqno: sequence number of this entry - * @stamp: send time of packet with sequence number @seqno - */ -struct tfrc_tx_hist_entry { - struct tfrc_tx_hist_entry *next; - u64 seqno; - ktime_t stamp; -}; - -static inline struct tfrc_tx_hist_entry * - tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) -{ - while (head != NULL && head->seqno != seqno) - head = head->next; - return head; -} - -int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); -void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); - -/* Subtraction a-b modulo-16, respects circular wrap-around */ -#define SUB16(a, b) (((a) + 16 - (b)) & 0xF) - -/* Number of packets to wait after a missing packet (RFC 4342, 6.1) */ -#define TFRC_NDUPACK 3 - -/** - * tfrc_rx_hist_entry - Store information about a single received packet - * @tfrchrx_seqno: DCCP packet sequence number - * @tfrchrx_ccval: window counter value of packet (RFC 4342, 8.1) - * @tfrchrx_ndp: the NDP count (if any) of the packet - * @tfrchrx_tstamp: actual receive time of packet - */ -struct tfrc_rx_hist_entry { - u64 tfrchrx_seqno:48, - tfrchrx_ccval:4, - tfrchrx_type:4; - u64 tfrchrx_ndp:48; - ktime_t tfrchrx_tstamp; -}; - -/** - * tfrc_rx_hist - RX history structure for TFRC-based protocols - * @ring: Packet history for RTT sampling and loss detection - * @loss_count: Number of entries in circular history - * @loss_start: Movable index (for loss detection) - * @rtt_sample_prev: Used during RTT sampling, points to candidate entry - */ -struct tfrc_rx_hist { - struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; - u8 loss_count:2, - loss_start:2; -#define rtt_sample_prev loss_start -}; - -/** - * tfrc_rx_hist_index - index to reach n-th entry after loss_start - */ -static inline u8 tfrc_rx_hist_index(const struct tfrc_rx_hist *h, const u8 n) -{ - return (h->loss_start + n) & TFRC_NDUPACK; -} - -/** - * tfrc_rx_hist_last_rcv - entry with highest-received-seqno so far - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_last_rcv(const struct tfrc_rx_hist *h) -{ - return h->ring[tfrc_rx_hist_index(h, h->loss_count)]; -} - -/** - * tfrc_rx_hist_entry - return the n-th history entry after loss_start - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_entry(const struct tfrc_rx_hist *h, const u8 n) -{ - return h->ring[tfrc_rx_hist_index(h, n)]; -} - -/** - * tfrc_rx_hist_loss_prev - entry with highest-received-seqno before loss was detected - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_loss_prev(const struct tfrc_rx_hist *h) -{ - return h->ring[h->loss_start]; -} - -/* indicate whether previously a packet was detected missing */ -static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h) -{ - return h->loss_count > 0; -} - -void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb, - const u64 ndp); - -int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); - -struct tfrc_loss_hist; -int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*first_li)(struct sock *sk), struct sock *sk); -u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb); -int 
tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); -void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); - -#endif /* _DCCP_PKT_HIST_ */ diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c deleted file mode 100644 index d7f265e1f50c..000000000000 --- a/net/dccp/ccids/lib/tfrc.c +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * TFRC library initialisation - * - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2007 Arnaldo Carvalho de Melo <acme@redhat.com> - */ -#include <linux/moduleparam.h> -#include "tfrc.h" - -#ifdef CONFIG_IP_DCCP_TFRC_DEBUG -bool tfrc_debug; -module_param(tfrc_debug, bool, 0644); -MODULE_PARM_DESC(tfrc_debug, "Enable TFRC debug messages"); -#endif - -int __init tfrc_lib_init(void) -{ - int rc = tfrc_li_init(); - - if (rc) - goto out; - - rc = tfrc_tx_packet_history_init(); - if (rc) - goto out_free_loss_intervals; - - rc = tfrc_rx_packet_history_init(); - if (rc) - goto out_free_tx_history; - return 0; - -out_free_tx_history: - tfrc_tx_packet_history_exit(); -out_free_loss_intervals: - tfrc_li_exit(); -out: - return rc; -} - -void tfrc_lib_exit(void) -{ - tfrc_rx_packet_history_exit(); - tfrc_tx_packet_history_exit(); - tfrc_li_exit(); -} diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h deleted file mode 100644 index 0a63e8750cc5..000000000000 --- a/net/dccp/ccids/lib/tfrc.h +++ /dev/null @@ -1,73 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _TFRC_H_ -#define _TFRC_H_ -/* - * Copyright (c) 2007 The University of Aberdeen, Scotland, UK - * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz> - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - */ -#include <linux/types.h> -#include <linux/math64.h> -#include "../../dccp.h" - -/* internal includes that this library exports: */ -#include "loss_interval.h" -#include "packet_history.h" - -#ifdef CONFIG_IP_DCCP_TFRC_DEBUG -extern bool tfrc_debug; -#define tfrc_pr_debug(format, a...) DCCP_PR_DEBUG(tfrc_debug, format, ##a) -#else -#define tfrc_pr_debug(format, a...) -#endif - -/* integer-arithmetic divisions of type (a * 1000000)/b */ -static inline u64 scaled_div(u64 a, u64 b) -{ - BUG_ON(b == 0); - return div64_u64(a * 1000000, b); -} - -static inline u32 scaled_div32(u64 a, u64 b) -{ - u64 result = scaled_div(a, b); - - if (result > UINT_MAX) { - DCCP_CRIT("Overflow: %llu/%llu > UINT_MAX", - (unsigned long long)a, (unsigned long long)b); - return UINT_MAX; - } - return result; -} - -/** - * tfrc_ewma - Exponentially weighted moving average - * @weight: Weight to be used as damping factor, in units of 1/10 - */ -static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight) -{ - return avg ? 
(weight * avg + (10 - weight) * newval) / 10 : newval; -} - -u32 tfrc_calc_x(u16 s, u32 R, u32 p); -u32 tfrc_calc_x_reverse_lookup(u32 fvalue); -u32 tfrc_invert_loss_event_rate(u32 loss_event_rate); - -int tfrc_tx_packet_history_init(void); -void tfrc_tx_packet_history_exit(void); -int tfrc_rx_packet_history_init(void); -void tfrc_rx_packet_history_exit(void); - -int tfrc_li_init(void); -void tfrc_li_exit(void); - -#ifdef CONFIG_IP_DCCP_TFRC_LIB -int tfrc_lib_init(void); -void tfrc_lib_exit(void); -#else -#define tfrc_lib_init() (0) -#define tfrc_lib_exit() -#endif -#endif /* _TFRC_H_ */ diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c deleted file mode 100644 index 92a8c6bea316..000000000000 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ /dev/null @@ -1,702 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. - * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz> - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon - */ - -#include <linux/module.h> -#include "../../dccp.h" -#include "tfrc.h" - -#define TFRC_CALC_X_ARRSIZE 500 -#define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */ -#define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE) - -/* - TFRC TCP Reno Throughput Equation Lookup Table for f(p) - - The following two-column lookup table implements a part of the TCP throughput - equation from [RFC 3448, sec. 3.1]: - - s - X_calc = -------------------------------------------------------------- - R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3)) - - Where: - X is the transmit rate in bytes/second - s is the packet size in bytes - R is the round trip time in seconds - p is the loss event rate, between 0 and 1.0, of the number of loss - events as a fraction of the number of packets transmitted - t_RTO is the TCP retransmission timeout value in seconds - b is the number of packets acknowledged by a single TCP ACK - - We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes: - - s - X_calc = ------------------------------------------------------- - R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3)) - - which we can break down into: - - s - X_calc = --------- - R * f(p) - - where f(p) is given for 0 < p <= 1 by: - - f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3) - - Since this is kernel code, floating-point arithmetic is avoided in favour of - integer arithmetic. This means that nearly all fractional parameters are - scaled by 1000000: - * the parameters p and R - * the return result f(p) - The lookup table therefore actually tabulates the following function g(q): - - g(q) = 1000000 * f(q/1000000) - - Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer - granularity for the practically more relevant case of small values of p (up to - 5%), the second column is used; the first one ranges up to 100%. This split - corresponds to the value of q = TFRC_CALC_X_SPLIT. 
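For reference, the tabulated function can be evaluated directly in floating point; the corner values below reproduce the first and last entries of the table that follows. A small user-space sketch (link with -lm) — the kernel table exists precisely so that this floating-point math never runs in kernel context:

#include <math.h>
#include <stdio.h>

/* f(p) = sqrt(2p/3) + 12 * sqrt(3p/8) * (p + 32p^3), for 0 < p <= 1 */
static double f(double p)
{
	return sqrt(2.0 * p / 3.0) +
	       12.0 * sqrt(3.0 * p / 8.0) * (p + 32.0 * p * p * p);
}

int main(void)
{
	/* g(q) = 1000000 * f(q/1000000); compare with the table corners */
	printf("g(2000)    = %.0f\n", 1e6 * f(0.002)); /* 37172     = lookup[0][0] */
	printf("g(50000)   = %.0f\n", 1e6 * f(0.05));  /* 271305    = lookup[M][1] */
	printf("g(1000000) = %.0f\n", 1e6 * f(1.0));   /* 243315981 = lookup[M][0] */
	return 0;
}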
At the same time this also - determines the smallest resolution possible with this lookup table: - - TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE - - The entire table is generated by: - for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) { - lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE); - lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE); - } - - With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1, - lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%) - lookup[M][0] = g(1000000) = 1000000 * f(100%) - lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%) - lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%) - - In summary, the two columns represent f(p) for the following ranges: - * The first column is for 0.002 <= p <= 1.0 - * The second column is for 0.0001 <= p <= 0.05 - Where the columns overlap, the second (finer-grained) is given preference, - i.e. the first column is used only for p >= 0.05. - */ -static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { - { 37172, 8172 }, - { 53499, 11567 }, - { 66664, 14180 }, - { 78298, 16388 }, - { 89021, 18339 }, - { 99147, 20108 }, - { 108858, 21738 }, - { 118273, 23260 }, - { 127474, 24693 }, - { 136520, 26052 }, - { 145456, 27348 }, - { 154316, 28589 }, - { 163130, 29783 }, - { 171919, 30935 }, - { 180704, 32049 }, - { 189502, 33130 }, - { 198328, 34180 }, - { 207194, 35202 }, - { 216114, 36198 }, - { 225097, 37172 }, - { 234153, 38123 }, - { 243294, 39055 }, - { 252527, 39968 }, - { 261861, 40864 }, - { 271305, 41743 }, - { 280866, 42607 }, - { 290553, 43457 }, - { 300372, 44293 }, - { 310333, 45117 }, - { 320441, 45929 }, - { 330705, 46729 }, - { 341131, 47518 }, - { 351728, 48297 }, - { 362501, 49066 }, - { 373460, 49826 }, - { 384609, 50577 }, - { 395958, 51320 }, - { 407513, 52054 }, - { 419281, 52780 }, - { 431270, 53499 }, - { 443487, 54211 }, - { 455940, 54916 }, - { 468635, 55614 }, - { 481581, 56306 }, - { 494785, 56991 }, - { 508254, 57671 }, - { 521996, 58345 }, - { 536019, 59014 }, - { 550331, 59677 }, - { 564939, 60335 }, - { 579851, 60988 }, - { 595075, 61636 }, - { 610619, 62279 }, - { 626491, 62918 }, - { 642700, 63553 }, - { 659253, 64183 }, - { 676158, 64809 }, - { 693424, 65431 }, - { 711060, 66050 }, - { 729073, 66664 }, - { 747472, 67275 }, - { 766266, 67882 }, - { 785464, 68486 }, - { 805073, 69087 }, - { 825103, 69684 }, - { 845562, 70278 }, - { 866460, 70868 }, - { 887805, 71456 }, - { 909606, 72041 }, - { 931873, 72623 }, - { 954614, 73202 }, - { 977839, 73778 }, - { 1001557, 74352 }, - { 1025777, 74923 }, - { 1050508, 75492 }, - { 1075761, 76058 }, - { 1101544, 76621 }, - { 1127867, 77183 }, - { 1154739, 77741 }, - { 1182172, 78298 }, - { 1210173, 78852 }, - { 1238753, 79405 }, - { 1267922, 79955 }, - { 1297689, 80503 }, - { 1328066, 81049 }, - { 1359060, 81593 }, - { 1390684, 82135 }, - { 1422947, 82675 }, - { 1455859, 83213 }, - { 1489430, 83750 }, - { 1523671, 84284 }, - { 1558593, 84817 }, - { 1594205, 85348 }, - { 1630518, 85878 }, - { 1667543, 86406 }, - { 1705290, 86932 }, - { 1743770, 87457 }, - { 1782994, 87980 }, - { 1822973, 88501 }, - { 1863717, 89021 }, - { 1905237, 89540 }, - { 1947545, 90057 }, - { 1990650, 90573 }, - { 2034566, 91087 }, - { 2079301, 91600 }, - { 2124869, 92111 }, - { 2171279, 92622 }, - { 2218543, 93131 }, - { 2266673, 93639 }, - { 2315680, 94145 }, - { 2365575, 94650 }, - { 2416371, 95154 }, - { 2468077, 95657 }, - { 2520707, 96159 }, - { 2574271, 96660 }, - { 2628782, 97159 }, - { 2684250, 97658 }, - { 2740689, 98155 }, - { 2798110, 
98651 }, - { 2856524, 99147 }, - { 2915944, 99641 }, - { 2976382, 100134 }, - { 3037850, 100626 }, - { 3100360, 101117 }, - { 3163924, 101608 }, - { 3228554, 102097 }, - { 3294263, 102586 }, - { 3361063, 103073 }, - { 3428966, 103560 }, - { 3497984, 104045 }, - { 3568131, 104530 }, - { 3639419, 105014 }, - { 3711860, 105498 }, - { 3785467, 105980 }, - { 3860253, 106462 }, - { 3936229, 106942 }, - { 4013410, 107422 }, - { 4091808, 107902 }, - { 4171435, 108380 }, - { 4252306, 108858 }, - { 4334431, 109335 }, - { 4417825, 109811 }, - { 4502501, 110287 }, - { 4588472, 110762 }, - { 4675750, 111236 }, - { 4764349, 111709 }, - { 4854283, 112182 }, - { 4945564, 112654 }, - { 5038206, 113126 }, - { 5132223, 113597 }, - { 5227627, 114067 }, - { 5324432, 114537 }, - { 5422652, 115006 }, - { 5522299, 115474 }, - { 5623389, 115942 }, - { 5725934, 116409 }, - { 5829948, 116876 }, - { 5935446, 117342 }, - { 6042439, 117808 }, - { 6150943, 118273 }, - { 6260972, 118738 }, - { 6372538, 119202 }, - { 6485657, 119665 }, - { 6600342, 120128 }, - { 6716607, 120591 }, - { 6834467, 121053 }, - { 6953935, 121514 }, - { 7075025, 121976 }, - { 7197752, 122436 }, - { 7322131, 122896 }, - { 7448175, 123356 }, - { 7575898, 123815 }, - { 7705316, 124274 }, - { 7836442, 124733 }, - { 7969291, 125191 }, - { 8103877, 125648 }, - { 8240216, 126105 }, - { 8378321, 126562 }, - { 8518208, 127018 }, - { 8659890, 127474 }, - { 8803384, 127930 }, - { 8948702, 128385 }, - { 9095861, 128840 }, - { 9244875, 129294 }, - { 9395760, 129748 }, - { 9548529, 130202 }, - { 9703198, 130655 }, - { 9859782, 131108 }, - { 10018296, 131561 }, - { 10178755, 132014 }, - { 10341174, 132466 }, - { 10505569, 132917 }, - { 10671954, 133369 }, - { 10840345, 133820 }, - { 11010757, 134271 }, - { 11183206, 134721 }, - { 11357706, 135171 }, - { 11534274, 135621 }, - { 11712924, 136071 }, - { 11893673, 136520 }, - { 12076536, 136969 }, - { 12261527, 137418 }, - { 12448664, 137867 }, - { 12637961, 138315 }, - { 12829435, 138763 }, - { 13023101, 139211 }, - { 13218974, 139658 }, - { 13417071, 140106 }, - { 13617407, 140553 }, - { 13819999, 140999 }, - { 14024862, 141446 }, - { 14232012, 141892 }, - { 14441465, 142339 }, - { 14653238, 142785 }, - { 14867346, 143230 }, - { 15083805, 143676 }, - { 15302632, 144121 }, - { 15523842, 144566 }, - { 15747453, 145011 }, - { 15973479, 145456 }, - { 16201939, 145900 }, - { 16432847, 146345 }, - { 16666221, 146789 }, - { 16902076, 147233 }, - { 17140429, 147677 }, - { 17381297, 148121 }, - { 17624696, 148564 }, - { 17870643, 149007 }, - { 18119154, 149451 }, - { 18370247, 149894 }, - { 18623936, 150336 }, - { 18880241, 150779 }, - { 19139176, 151222 }, - { 19400759, 151664 }, - { 19665007, 152107 }, - { 19931936, 152549 }, - { 20201564, 152991 }, - { 20473907, 153433 }, - { 20748982, 153875 }, - { 21026807, 154316 }, - { 21307399, 154758 }, - { 21590773, 155199 }, - { 21876949, 155641 }, - { 22165941, 156082 }, - { 22457769, 156523 }, - { 22752449, 156964 }, - { 23049999, 157405 }, - { 23350435, 157846 }, - { 23653774, 158287 }, - { 23960036, 158727 }, - { 24269236, 159168 }, - { 24581392, 159608 }, - { 24896521, 160049 }, - { 25214642, 160489 }, - { 25535772, 160929 }, - { 25859927, 161370 }, - { 26187127, 161810 }, - { 26517388, 162250 }, - { 26850728, 162690 }, - { 27187165, 163130 }, - { 27526716, 163569 }, - { 27869400, 164009 }, - { 28215234, 164449 }, - { 28564236, 164889 }, - { 28916423, 165328 }, - { 29271815, 165768 }, - { 29630428, 166208 }, - { 29992281, 166647 }, - { 30357392, 167087 }, - { 30725779, 
167526 }, - { 31097459, 167965 }, - { 31472452, 168405 }, - { 31850774, 168844 }, - { 32232445, 169283 }, - { 32617482, 169723 }, - { 33005904, 170162 }, - { 33397730, 170601 }, - { 33792976, 171041 }, - { 34191663, 171480 }, - { 34593807, 171919 }, - { 34999428, 172358 }, - { 35408544, 172797 }, - { 35821174, 173237 }, - { 36237335, 173676 }, - { 36657047, 174115 }, - { 37080329, 174554 }, - { 37507197, 174993 }, - { 37937673, 175433 }, - { 38371773, 175872 }, - { 38809517, 176311 }, - { 39250924, 176750 }, - { 39696012, 177190 }, - { 40144800, 177629 }, - { 40597308, 178068 }, - { 41053553, 178507 }, - { 41513554, 178947 }, - { 41977332, 179386 }, - { 42444904, 179825 }, - { 42916290, 180265 }, - { 43391509, 180704 }, - { 43870579, 181144 }, - { 44353520, 181583 }, - { 44840352, 182023 }, - { 45331092, 182462 }, - { 45825761, 182902 }, - { 46324378, 183342 }, - { 46826961, 183781 }, - { 47333531, 184221 }, - { 47844106, 184661 }, - { 48358706, 185101 }, - { 48877350, 185541 }, - { 49400058, 185981 }, - { 49926849, 186421 }, - { 50457743, 186861 }, - { 50992759, 187301 }, - { 51531916, 187741 }, - { 52075235, 188181 }, - { 52622735, 188622 }, - { 53174435, 189062 }, - { 53730355, 189502 }, - { 54290515, 189943 }, - { 54854935, 190383 }, - { 55423634, 190824 }, - { 55996633, 191265 }, - { 56573950, 191706 }, - { 57155606, 192146 }, - { 57741621, 192587 }, - { 58332014, 193028 }, - { 58926806, 193470 }, - { 59526017, 193911 }, - { 60129666, 194352 }, - { 60737774, 194793 }, - { 61350361, 195235 }, - { 61967446, 195677 }, - { 62589050, 196118 }, - { 63215194, 196560 }, - { 63845897, 197002 }, - { 64481179, 197444 }, - { 65121061, 197886 }, - { 65765563, 198328 }, - { 66414705, 198770 }, - { 67068508, 199213 }, - { 67726992, 199655 }, - { 68390177, 200098 }, - { 69058085, 200540 }, - { 69730735, 200983 }, - { 70408147, 201426 }, - { 71090343, 201869 }, - { 71777343, 202312 }, - { 72469168, 202755 }, - { 73165837, 203199 }, - { 73867373, 203642 }, - { 74573795, 204086 }, - { 75285124, 204529 }, - { 76001380, 204973 }, - { 76722586, 205417 }, - { 77448761, 205861 }, - { 78179926, 206306 }, - { 78916102, 206750 }, - { 79657310, 207194 }, - { 80403571, 207639 }, - { 81154906, 208084 }, - { 81911335, 208529 }, - { 82672880, 208974 }, - { 83439562, 209419 }, - { 84211402, 209864 }, - { 84988421, 210309 }, - { 85770640, 210755 }, - { 86558080, 211201 }, - { 87350762, 211647 }, - { 88148708, 212093 }, - { 88951938, 212539 }, - { 89760475, 212985 }, - { 90574339, 213432 }, - { 91393551, 213878 }, - { 92218133, 214325 }, - { 93048107, 214772 }, - { 93883493, 215219 }, - { 94724314, 215666 }, - { 95570590, 216114 }, - { 96422343, 216561 }, - { 97279594, 217009 }, - { 98142366, 217457 }, - { 99010679, 217905 }, - { 99884556, 218353 }, - { 100764018, 218801 }, - { 101649086, 219250 }, - { 102539782, 219698 }, - { 103436128, 220147 }, - { 104338146, 220596 }, - { 105245857, 221046 }, - { 106159284, 221495 }, - { 107078448, 221945 }, - { 108003370, 222394 }, - { 108934074, 222844 }, - { 109870580, 223294 }, - { 110812910, 223745 }, - { 111761087, 224195 }, - { 112715133, 224646 }, - { 113675069, 225097 }, - { 114640918, 225548 }, - { 115612702, 225999 }, - { 116590442, 226450 }, - { 117574162, 226902 }, - { 118563882, 227353 }, - { 119559626, 227805 }, - { 120561415, 228258 }, - { 121569272, 228710 }, - { 122583219, 229162 }, - { 123603278, 229615 }, - { 124629471, 230068 }, - { 125661822, 230521 }, - { 126700352, 230974 }, - { 127745083, 231428 }, - { 128796039, 231882 }, - { 129853241, 232336 }, - { 
130916713, 232790 }, - { 131986475, 233244 }, - { 133062553, 233699 }, - { 134144966, 234153 }, - { 135233739, 234608 }, - { 136328894, 235064 }, - { 137430453, 235519 }, - { 138538440, 235975 }, - { 139652876, 236430 }, - { 140773786, 236886 }, - { 141901190, 237343 }, - { 143035113, 237799 }, - { 144175576, 238256 }, - { 145322604, 238713 }, - { 146476218, 239170 }, - { 147636442, 239627 }, - { 148803298, 240085 }, - { 149976809, 240542 }, - { 151156999, 241000 }, - { 152343890, 241459 }, - { 153537506, 241917 }, - { 154737869, 242376 }, - { 155945002, 242835 }, - { 157158929, 243294 }, - { 158379673, 243753 }, - { 159607257, 244213 }, - { 160841704, 244673 }, - { 162083037, 245133 }, - { 163331279, 245593 }, - { 164586455, 246054 }, - { 165848586, 246514 }, - { 167117696, 246975 }, - { 168393810, 247437 }, - { 169676949, 247898 }, - { 170967138, 248360 }, - { 172264399, 248822 }, - { 173568757, 249284 }, - { 174880235, 249747 }, - { 176198856, 250209 }, - { 177524643, 250672 }, - { 178857621, 251136 }, - { 180197813, 251599 }, - { 181545242, 252063 }, - { 182899933, 252527 }, - { 184261908, 252991 }, - { 185631191, 253456 }, - { 187007807, 253920 }, - { 188391778, 254385 }, - { 189783129, 254851 }, - { 191181884, 255316 }, - { 192588065, 255782 }, - { 194001698, 256248 }, - { 195422805, 256714 }, - { 196851411, 257181 }, - { 198287540, 257648 }, - { 199731215, 258115 }, - { 201182461, 258582 }, - { 202641302, 259050 }, - { 204107760, 259518 }, - { 205581862, 259986 }, - { 207063630, 260454 }, - { 208553088, 260923 }, - { 210050262, 261392 }, - { 211555174, 261861 }, - { 213067849, 262331 }, - { 214588312, 262800 }, - { 216116586, 263270 }, - { 217652696, 263741 }, - { 219196666, 264211 }, - { 220748520, 264682 }, - { 222308282, 265153 }, - { 223875978, 265625 }, - { 225451630, 266097 }, - { 227035265, 266569 }, - { 228626905, 267041 }, - { 230226576, 267514 }, - { 231834302, 267986 }, - { 233450107, 268460 }, - { 235074016, 268933 }, - { 236706054, 269407 }, - { 238346244, 269881 }, - { 239994613, 270355 }, - { 241651183, 270830 }, - { 243315981, 271305 } -}; - -/* return largest index i such that fval <= lookup[i][small] */ -static inline u32 tfrc_binsearch(u32 fval, u8 small) -{ - u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1; - - while (low < high) { - try = (low + high) / 2; - if (fval <= tfrc_calc_x_lookup[try][small]) - high = try; - else - low = try + 1; - } - return high; -} - -/** - * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448 - * @s: packet size in bytes - * @R: RTT scaled by 1000000 (i.e., microseconds) - * @p: loss ratio estimate scaled by 1000000 - * - * Returns X_calc in bytes per second (not scaled). - */ -u32 tfrc_calc_x(u16 s, u32 R, u32 p) -{ - u16 index; - u32 f; - u64 result; - - /* check against invalid parameters and divide-by-zero */ - BUG_ON(p > 1000000); /* p must not exceed 100% */ - BUG_ON(p == 0); /* f(0) = 0, divide by zero */ - if (R == 0) { /* possible divide by zero */ - DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc."); - return ~0U; - } - - if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ - if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ - DCCP_WARN("Value of p (%d) below resolution. 
" - "Substituting %d\n", p, TFRC_SMALLEST_P); - index = 0; - } else /* 0.0001 <= p <= 0.05 */ - index = p/TFRC_SMALLEST_P - 1; - - f = tfrc_calc_x_lookup[index][1]; - - } else { /* 0.05 < p <= 1.00 */ - index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1; - - f = tfrc_calc_x_lookup[index][0]; - } - - /* - * Compute X = s/(R*f(p)) in bytes per second. - * Since f(p) and R are both scaled by 1000000, we need to multiply by - * 1000000^2. To avoid overflow, the result is computed in two stages. - * This works under almost all reasonable operational conditions, for a - * wide range of parameters. Yet, should some strange combination of - * parameters result in overflow, the use of scaled_div32 will catch - * this and return UINT_MAX - which is a logically adequate consequence. - */ - result = scaled_div(s, R); - return scaled_div32(result, f); -} - -/** - * tfrc_calc_x_reverse_lookup - try to find p given f(p) - * @fvalue: function value to match, scaled by 1000000 - * - * Returns closest match for p, also scaled by 1000000 - */ -u32 tfrc_calc_x_reverse_lookup(u32 fvalue) -{ - int index; - - if (fvalue == 0) /* f(p) = 0 whenever p = 0 */ - return 0; - - /* Error cases. */ - if (fvalue < tfrc_calc_x_lookup[0][1]) { - DCCP_WARN("fvalue %u smaller than resolution\n", fvalue); - return TFRC_SMALLEST_P; - } - if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) { - DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue); - return 1000000; - } - - if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) { - index = tfrc_binsearch(fvalue, 1); - return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE; - } - - /* else ... it must be in the coarse-grained column */ - index = tfrc_binsearch(fvalue, 0); - return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; -} - -/** - * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% - * @loss_event_rate: loss event rate to invert - * When @loss_event_rate is large, there is a chance that p is truncated to 0. - * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. - */ -u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) -{ - if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ - return 0; - if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ - return 1000000; - return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); -} diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h deleted file mode 100644 index 1f748ed1279d..000000000000 --- a/net/dccp/dccp.h +++ /dev/null @@ -1,483 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _DCCP_H -#define _DCCP_H -/* - * net/dccp/dccp.h - * - * An implementation of the DCCP protocol - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> - * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz> - */ - -#include <linux/dccp.h> -#include <linux/ktime.h> -#include <net/snmp.h> -#include <net/sock.h> -#include <net/tcp.h> -#include "ackvec.h" - -/* - * DCCP - specific warning and debugging macros. - */ -#define DCCP_WARN(fmt, ...) \ - net_warn_ratelimited("%s: " fmt, __func__, ##__VA_ARGS__) -#define DCCP_CRIT(fmt, a...) printk(KERN_CRIT fmt " at %s:%d/%s()\n", ##a, \ - __FILE__, __LINE__, __func__) -#define DCCP_BUG(a...) do { DCCP_CRIT("BUG: " a); dump_stack(); } while(0) -#define DCCP_BUG_ON(cond) do { if (unlikely((cond) != 0)) \ - DCCP_BUG("\"%s\" holds (exception!)", \ - __stringify(cond)); \ - } while (0) - -#define DCCP_PRINTK(enable, fmt, args...) 
do { if (enable) \ - printk(fmt, ##args); \ - } while(0) -#define DCCP_PR_DEBUG(enable, fmt, a...) DCCP_PRINTK(enable, KERN_DEBUG \ - "%s: " fmt, __func__, ##a) - -#ifdef CONFIG_IP_DCCP_DEBUG -extern bool dccp_debug; -#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) -#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) -#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) -#else -#define dccp_pr_debug(format, a...) do {} while (0) -#define dccp_pr_debug_cat(format, a...) do {} while (0) -#define dccp_debug(format, a...) do {} while (0) -#endif - -extern struct inet_hashinfo dccp_hashinfo; - -DECLARE_PER_CPU(unsigned int, dccp_orphan_count); - -void dccp_time_wait(struct sock *sk, int state, int timeo); - -/* - * Set safe upper bounds for header and option length. Since Data Offset is 8 - * bits (RFC 4340, sec. 5.1), the total header length can never be more than - * 4 * 255 = 1020 bytes. The largest possible header length is 28 bytes (X=1): - * - DCCP-Response with ACK Subheader and 4 bytes of Service code OR - * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields - * Hence a safe upper bound for the maximum option length is 1020-28 = 992 - */ -#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) -#define DCCP_MAX_PACKET_HDR 28 -#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) -#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) - -/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ -#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) - -#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT - * state, about 60 seconds */ - -/* RFC 1122, 4.2.3.1 initial RTO value */ -#define DCCP_TIMEOUT_INIT ((unsigned int)(3 * HZ)) - -/* - * The maximum back-off value for retransmissions. This is needed for - * - retransmitting client-Requests (sec. 8.1.1), - * - retransmitting Close/CloseReq when closing (sec. 8.3), - * - feature-negotiation retransmission (sec. 6.6.3), - * - Acks in client-PARTOPEN state (sec. 8.1.5). - */ -#define DCCP_RTO_MAX ((unsigned int)(64 * HZ)) - -/* - * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 - */ -#define DCCP_SANE_RTT_MIN 100 -#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) -#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) - -/* sysctl variables for DCCP */ -extern int sysctl_dccp_request_retries; -extern int sysctl_dccp_retries1; -extern int sysctl_dccp_retries2; -extern int sysctl_dccp_tx_qlen; -extern int sysctl_dccp_sync_ratelimit; - -/* - * 48-bit sequence number arithmetic (signed and unsigned) - */ -#define INT48_MIN 0x800000000000LL /* 2^47 */ -#define UINT48_MAX 0xFFFFFFFFFFFFLL /* 2^48 - 1 */ -#define COMPLEMENT48(x) (0x1000000000000LL - (x)) /* 2^48 - x */ -#define TO_SIGNED48(x) (((x) < INT48_MIN)? (x) : -COMPLEMENT48( (x))) -#define TO_UNSIGNED48(x) (((x) >= 0)? (x) : COMPLEMENT48(-(x))) -#define ADD48(a, b) (((a) + (b)) & UINT48_MAX) -#define SUB48(a, b) ADD48((a), COMPLEMENT48(b)) - -static inline void dccp_inc_seqno(u64 *seqno) -{ - *seqno = ADD48(*seqno, 1); -} - -/* signed mod-2^48 distance: pos. if seqno1 < seqno2, neg. if seqno1 > seqno2 */ -static inline s64 dccp_delta_seqno(const u64 seqno1, const u64 seqno2) -{ - u64 delta = SUB48(seqno2, seqno1); - - return TO_SIGNED48(delta); -} - -/* is seq1 < seq2 ? */ -static inline int before48(const u64 seq1, const u64 seq2) -{ - return (s64)((seq2 << 16) - (seq1 << 16)) > 0; -} - -/* is seq1 > seq2 ? 
*/ -#define after48(seq1, seq2) before48(seq2, seq1) - -/* is seq2 <= seq1 <= seq3 ? */ -static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3) -{ - return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16); -} - -/** - * dccp_loss_count - Approximate the number of lost data packets in a burst loss - * @s1: last known sequence number before the loss ('hole') - * @s2: first sequence number seen after the 'hole' - * @ndp: NDP count on packet with sequence number @s2 - */ -static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp) -{ - s64 delta = dccp_delta_seqno(s1, s2); - - WARN_ON(delta < 0); - delta -= ndp + 1; - - return delta > 0 ? delta : 0; -} - -/** - * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1 - */ -static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp) -{ - return dccp_loss_count(s1, s2, ndp) == 0; -} - -enum { - DCCP_MIB_NUM = 0, - DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */ - DCCP_MIB_ESTABRESETS, /* EstabResets */ - DCCP_MIB_CURRESTAB, /* CurrEstab */ - DCCP_MIB_OUTSEGS, /* OutSegs */ - DCCP_MIB_OUTRSTS, - DCCP_MIB_ABORTONTIMEOUT, - DCCP_MIB_TIMEOUTS, - DCCP_MIB_ABORTFAILED, - DCCP_MIB_PASSIVEOPENS, - DCCP_MIB_ATTEMPTFAILS, - DCCP_MIB_OUTDATAGRAMS, - DCCP_MIB_INERRS, - DCCP_MIB_OPTMANDATORYERROR, - DCCP_MIB_INVALIDOPT, - __DCCP_MIB_MAX -}; - -#define DCCP_MIB_MAX __DCCP_MIB_MAX -struct dccp_mib { - unsigned long mibs[DCCP_MIB_MAX]; -}; - -DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics); -#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field) -#define __DCCP_INC_STATS(field) __SNMP_INC_STATS(dccp_statistics, field) -#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field) - -/* - * Checksumming routines - */ -static inline unsigned int dccp_csum_coverage(const struct sk_buff *skb) -{ - const struct dccp_hdr* dh = dccp_hdr(skb); - - if (dh->dccph_cscov == 0) - return skb->len; - return (dh->dccph_doff + dh->dccph_cscov - 1) * sizeof(u32); -} - -static inline void dccp_csum_outgoing(struct sk_buff *skb) -{ - unsigned int cov = dccp_csum_coverage(skb); - - if (cov >= skb->len) - dccp_hdr(skb)->dccph_cscov = 0; - - skb->csum = skb_checksum(skb, 0, (cov > skb->len)? 
skb->len : cov, 0); -} - -void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb); - -int dccp_retransmit_skb(struct sock *sk); - -void dccp_send_ack(struct sock *sk); -void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, - struct request_sock *rsk); - -void dccp_send_sync(struct sock *sk, const u64 seq, - const enum dccp_pkt_type pkt_type); - -/* - * TX Packet Dequeueing Interface - */ -void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); -bool dccp_qpolicy_full(struct sock *sk); -void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); -struct sk_buff *dccp_qpolicy_top(struct sock *sk); -struct sk_buff *dccp_qpolicy_pop(struct sock *sk); -bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param); - -/* - * TX Packet Output and TX Timers - */ -void dccp_write_xmit(struct sock *sk); -void dccp_write_space(struct sock *sk); -void dccp_flush_write_queue(struct sock *sk, long *time_budget); - -void dccp_init_xmit_timers(struct sock *sk); -static inline void dccp_clear_xmit_timers(struct sock *sk) -{ - inet_csk_clear_xmit_timers(sk); -} - -unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu); - -const char *dccp_packet_name(const int type); - -void dccp_set_state(struct sock *sk, const int state); -void dccp_done(struct sock *sk); - -int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, - struct sk_buff const *skb); - -int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); - -struct sock *dccp_create_openreq_child(const struct sock *sk, - const struct request_sock *req, - const struct sk_buff *skb); - -int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); - -struct sock *dccp_v4_request_recv_sock(const struct sock *sk, struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst, - struct request_sock *req_unhash, - bool *own_req); -struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, - struct request_sock *req); - -int dccp_child_process(struct sock *parent, struct sock *child, - struct sk_buff *skb); -int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - struct dccp_hdr *dh, unsigned int len); -int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned int len); - -void dccp_destruct_common(struct sock *sk); -int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized); -void dccp_destroy_sock(struct sock *sk); - -void dccp_close(struct sock *sk, long timeout); -struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst, - struct request_sock *req); - -int dccp_connect(struct sock *sk); -int dccp_disconnect(struct sock *sk, int flags); -int dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen); -int dccp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen); -int dccp_ioctl(struct sock *sk, int cmd, int *karg); -int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); -int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, - int *addr_len); -void dccp_shutdown(struct sock *sk, int how); -int inet_dccp_listen(struct socket *sock, int backlog); -__poll_t dccp_poll(struct file *file, struct socket *sock, - poll_table *wait); -int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); -void dccp_req_err(struct sock *sk, u64 seq); - -struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *skb); -int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); -void 
dccp_send_close(struct sock *sk, const int active); -int dccp_invalid_packet(struct sk_buff *skb); -u32 dccp_sample_rtt(struct sock *sk, long delta); - -static inline bool dccp_bad_service_code(const struct sock *sk, - const __be32 service) -{ - const struct dccp_sock *dp = dccp_sk(sk); - - if (dp->dccps_service == service) - return false; - return !dccp_list_has_service(dp->dccps_service_list, service); -} - -/** - * dccp_skb_cb - DCCP per-packet control information - * @dccpd_type: one of %dccp_pkt_type (or unknown) - * @dccpd_ccval: CCVal field (5.1), see e.g. RFC 4342, 8.1 - * @dccpd_reset_code: one of %dccp_reset_codes - * @dccpd_reset_data: Data1..3 fields (depend on @dccpd_reset_code) - * @dccpd_opt_len: total length of all options (5.8) in the packet - * @dccpd_seq: sequence number - * @dccpd_ack_seq: acknowledgment number subheader field value - * - * This is used for transmission as well as for reception. - */ -struct dccp_skb_cb { - union { - struct inet_skb_parm h4; -#if IS_ENABLED(CONFIG_IPV6) - struct inet6_skb_parm h6; -#endif - } header; - __u8 dccpd_type:4; - __u8 dccpd_ccval:4; - __u8 dccpd_reset_code, - dccpd_reset_data[3]; - __u16 dccpd_opt_len; - __u64 dccpd_seq; - __u64 dccpd_ack_seq; -}; - -#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0])) - -/* RFC 4340, sec. 7.7 */ -static inline int dccp_non_data_packet(const struct sk_buff *skb) -{ - const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; - - return type == DCCP_PKT_ACK || - type == DCCP_PKT_CLOSE || - type == DCCP_PKT_CLOSEREQ || - type == DCCP_PKT_RESET || - type == DCCP_PKT_SYNC || - type == DCCP_PKT_SYNCACK; -} - -/* RFC 4340, sec. 7.7 */ -static inline int dccp_data_packet(const struct sk_buff *skb) -{ - const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; - - return type == DCCP_PKT_DATA || - type == DCCP_PKT_DATAACK || - type == DCCP_PKT_REQUEST || - type == DCCP_PKT_RESPONSE; -} - -static inline int dccp_packet_without_ack(const struct sk_buff *skb) -{ - const __u8 type = DCCP_SKB_CB(skb)->dccpd_type; - - return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST; -} - -#define DCCP_PKT_WITHOUT_ACK_SEQ (UINT48_MAX << 2) - -static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss) -{ - struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh + - sizeof(*dh)); - dh->dccph_seq2 = 0; - dh->dccph_seq = htons((gss >> 32) & 0xfffff); - dhx->dccph_seq_low = htonl(gss & 0xffffffff); -} - -static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, - const u64 gsr) -{ - dhack->dccph_reserved1 = 0; - dhack->dccph_ack_nr_high = htons(gsr >> 32); - dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff); -} - -static inline void dccp_update_gsr(struct sock *sk, u64 seq) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (after48(seq, dp->dccps_gsr)) - dp->dccps_gsr = seq; - /* Sequence validity window depends on remote Sequence Window (7.5.1) */ - dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); - /* - * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, - * 7.5.1 we perform this check beyond the initial handshake: W/W' are - * always > 32, so for the first W/W' packets in the lifetime of a - * connection we always have to adjust SWL. - * A second reason why we are doing this is that the window depends on - * the feature-remote value of Sequence Window: nothing stops the peer - * from updating this value while we are busy adjusting SWL for the - * first W packets (we would have to count from scratch again then). 
- * Therefore it is safer to always make sure that the Sequence Window - * is not artificially extended by a peer who grows SWL downwards by - * continually updating the feature-remote Sequence-Window. - * If sequence numbers wrap it is bad luck. But that will take a while - * (48 bit), and this measure prevents Sequence-number attacks. - */ - if (before48(dp->dccps_swl, dp->dccps_isr)) - dp->dccps_swl = dp->dccps_isr; - dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); -} - -static inline void dccp_update_gss(struct sock *sk, u64 seq) -{ - struct dccp_sock *dp = dccp_sk(sk); - - dp->dccps_gss = seq; - /* Ack validity window depends on local Sequence Window value (7.5.1) */ - dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); - /* Adjust AWL so that it is not below ISS - see comment above for SWL */ - if (before48(dp->dccps_awl, dp->dccps_iss)) - dp->dccps_awl = dp->dccps_iss; - dp->dccps_awh = dp->dccps_gss; -} - -static inline int dccp_ackvec_pending(const struct sock *sk) -{ - return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL && - !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec); -} - -static inline int dccp_ack_pending(const struct sock *sk) -{ - return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk); -} - -int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val); -int dccp_feat_finalise_settings(struct dccp_sock *dp); -int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); -int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, - struct sk_buff *skb); -int dccp_feat_activate_values(struct sock *sk, struct list_head *fn); -void dccp_feat_list_purge(struct list_head *fn_list); - -int dccp_insert_options(struct sock *sk, struct sk_buff *skb); -int dccp_insert_options_rsk(struct dccp_request_sock *, struct sk_buff *); -u32 dccp_timestamp(void); -void dccp_timestamping_init(void); -int dccp_insert_option(struct sk_buff *skb, unsigned char option, - const void *value, unsigned char len); - -#ifdef CONFIG_SYSCTL -int dccp_sysctl_init(void); -void dccp_sysctl_exit(void); -#else -static inline int dccp_sysctl_init(void) -{ - return 0; -} - -static inline void dccp_sysctl_exit(void) -{ -} -#endif - -#endif /* _DCCP_H */ diff --git a/net/dccp/diag.c b/net/dccp/diag.c deleted file mode 100644 index f5019d95c3ae..000000000000 --- a/net/dccp/diag.c +++ /dev/null @@ -1,85 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/diag.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@mandriva.com> - */ - - -#include <linux/module.h> -#include <linux/inet_diag.h> - -#include "ccid.h" -#include "dccp.h" - -static void dccp_get_info(struct sock *sk, struct tcp_info *info) -{ - struct dccp_sock *dp = dccp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); - - memset(info, 0, sizeof(*info)); - - info->tcpi_state = sk->sk_state; - info->tcpi_retransmits = icsk->icsk_retransmits; - info->tcpi_probes = icsk->icsk_probes_out; - info->tcpi_backoff = icsk->icsk_backoff; - info->tcpi_pmtu = icsk->icsk_pmtu_cookie; - - if (dp->dccps_hc_rx_ackvec != NULL) - info->tcpi_options |= TCPI_OPT_SACK; - - if (dp->dccps_hc_rx_ccid != NULL) - ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); - - if (dp->dccps_hc_tx_ccid != NULL) - ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info); -} - -static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, - void *_info) -{ - r->idiag_rqueue = r->idiag_wqueue = 0; - - if (_info != NULL) - dccp_get_info(sk, _info); -} - 
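The SWL/SWH and AWL/AWH updates in dccp_update_gsr()/dccp_update_gss() above are pure mod-2^48 arithmetic built on the ADD48()/SUB48() macros. A self-contained sketch with hypothetical GSR and Sequence Window values, chosen so that SWH wraps cleanly past the 48-bit boundary:

#include <stdint.h>
#include <stdio.h>

#define UINT48_MAX	0xFFFFFFFFFFFFULL
#define ADD48(a, b)	(((a) + (b)) & UINT48_MAX)
#define COMPLEMENT48(x)	(0x1000000000000ULL - (x))
#define SUB48(a, b)	ADD48((a), COMPLEMENT48(b))

int main(void)
{
	/* hypothetical state: GSR near the wrap point, W = 100 */
	uint64_t gsr = UINT48_MAX - 5, seq_win = 100;

	/* SWL = GSR + 1 - W/4, SWH = GSR + 3W/4 (RFC 4340, 7.5.1) */
	uint64_t swl = SUB48(ADD48(gsr, 1), seq_win / 4);
	uint64_t swh = ADD48(gsr, (3 * seq_win) / 4);

	/* prints swl = ffffffffffe2, swh = 45: SWH has wrapped */
	printf("swl = %llx, swh = %llx\n",
	       (unsigned long long)swl, (unsigned long long)swh);
	return 0;
}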
-static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, - const struct inet_diag_req_v2 *r) -{ - inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r); -} - -static int dccp_diag_dump_one(struct netlink_callback *cb, - const struct inet_diag_req_v2 *req) -{ - return inet_diag_dump_one_icsk(&dccp_hashinfo, cb, req); -} - -static const struct inet_diag_handler dccp_diag_handler = { - .owner = THIS_MODULE, - .dump = dccp_diag_dump, - .dump_one = dccp_diag_dump_one, - .idiag_get_info = dccp_diag_get_info, - .idiag_type = IPPROTO_DCCP, - .idiag_info_size = sizeof(struct tcp_info), -}; - -static int __init dccp_diag_init(void) -{ - return inet_diag_register(&dccp_diag_handler); -} - -static void __exit dccp_diag_fini(void) -{ - inet_diag_unregister(&dccp_diag_handler); -} - -module_init(dccp_diag_init); -module_exit(dccp_diag_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>"); -MODULE_DESCRIPTION("DCCP inet_diag handler"); -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-33 /* AF_INET - IPPROTO_DCCP */); diff --git a/net/dccp/feat.c b/net/dccp/feat.c deleted file mode 100644 index f7554dcdaaba..000000000000 --- a/net/dccp/feat.c +++ /dev/null @@ -1,1581 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/feat.c - * - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) - * - * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk> - * Rewrote from scratch, some bits from earlier code by - * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> - * - * ASSUMPTIONS - * ----------- - * o Feature negotiation is coordinated with connection setup (as in TCP), wild - * changes of parameters of an established connection are not supported. - * o Changing non-negotiable (NN) values is supported in state OPEN/PARTOPEN. - * o All currently known SP features have 1-byte quantities. If in the future - * extensions of RFCs 4340..42 define features with item lengths larger than - * one byte, a feature-specific extension of the code will be required. - */ -#include <linux/module.h> -#include <linux/slab.h> -#include "ccid.h" -#include "feat.h" - -/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ -unsigned long sysctl_dccp_sequence_window __read_mostly = 100; -int sysctl_dccp_rx_ccid __read_mostly = 2, - sysctl_dccp_tx_ccid __read_mostly = 2; - -/* - * Feature activation handlers. - * - * These all use an u64 argument, to provide enough room for NN/SP features. At - * this stage the negotiated values have been checked to be within their range. 
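Before reading the individual handlers, it may help to see the dispatch shape they plug into: each row of the table further below pairs a feature number with its RFC 4340, 6.4 default and an optional activation callback, and activation falls back to the default when nothing was negotiated. A simplified stand-in in plain C (invented names, two rows only, not the kernel table):

#include <stdio.h>

typedef int (*hdlr_t)(unsigned long long val, int rx);

static int hdlr_seq_win(unsigned long long val, int rx)
{
	printf("%s sequence window = %llu\n", rx ? "RX" : "TX", val);
	return 0;
}

static const struct {
	unsigned char feat;		/* DCCPF_* feature number     */
	unsigned long long deflt;	/* default from RFC 4340, 6.4 */
	hdlr_t activate;		/* NULL: nothing to activate  */
} table[] = {
	{ 3 /* Sequence Window */, 100, hdlr_seq_win },
	{ 4 /* ECN Incapable   */, 0,   NULL },
};

static int activate(int idx, const unsigned long long *negotiated, int rx)
{
	unsigned long long val = negotiated ? *negotiated : table[idx].deflt;

	return table[idx].activate ? table[idx].activate(val, rx) : 0;
}

int main(void)
{
	unsigned long long agreed = 1000;

	activate(0, &agreed, 1);	/* negotiated value wins    */
	activate(0, NULL, 0);		/* NULL: fall back to 100   */
	return activate(1, NULL, 1);	/* no handler: silent no-op */
}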
- */ -static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct ccid *new_ccid = ccid_new(ccid, sk, rx); - - if (new_ccid == NULL) - return -ENOMEM; - - if (rx) { - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = new_ccid; - } else { - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_tx_ccid = new_ccid; - } - return 0; -} - -static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (rx) { - dp->dccps_r_seq_win = seq_win; - /* propagate changes to update SWL/SWH */ - dccp_update_gsr(sk, dp->dccps_gsr); - } else { - dp->dccps_l_seq_win = seq_win; - /* propagate changes to update AWL */ - dccp_update_gss(sk, dp->dccps_gss); - } - return 0; -} - -static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) -{ - if (rx) - dccp_sk(sk)->dccps_r_ack_ratio = ratio; - else - dccp_sk(sk)->dccps_l_ack_ratio = ratio; - return 0; -} - -static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (rx) { - if (enable && dp->dccps_hc_rx_ackvec == NULL) { - dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any()); - if (dp->dccps_hc_rx_ackvec == NULL) - return -ENOMEM; - } else if (!enable) { - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - } - } - return 0; -} - -static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) -{ - if (!rx) - dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); - return 0; -} - -/* - * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that - * `rx' holds when the sending peer informs about his partial coverage via a - * ChangeR() option. In the other case, we are the sender and the receiver - * announces its coverage via ChangeL() options. The policy here is to honour - * such communication by enabling the corresponding partial coverage - but only - * if it has not been set manually before; the warning here means that all - * packets will be dropped. - */ -static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx) -{ - struct dccp_sock *dp = dccp_sk(sk); - - if (rx) - dp->dccps_pcrlen = cscov; - else { - if (dp->dccps_pcslen == 0) - dp->dccps_pcslen = cscov; - else if (cscov > dp->dccps_pcslen) - DCCP_WARN("CsCov %u too small, peer requires >= %u\n", - dp->dccps_pcslen, (u8)cscov); - } - return 0; -} - -static const struct { - u8 feat_num; /* DCCPF_xxx */ - enum dccp_feat_type rxtx; /* RX or TX */ - enum dccp_feat_type reconciliation; /* SP or NN */ - u8 default_value; /* as in 6.4 */ - int (*activation_hdlr)(struct sock *sk, u64 val, bool rx); -/* - * Lookup table for location and type of features (from RFC 4340/4342) - * +--------------------------+----+-----+----+----+---------+-----------+ - * | Feature | Location | Reconc. 
| Initial | Section | - * | | RX | TX | SP | NN | Value | Reference | - * +--------------------------+----+-----+----+----+---------+-----------+ - * | DCCPF_CCID | | X | X | | 2 | 10 | - * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 | - * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 | - * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 | - * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 | - * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 | - * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 | - * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 | - * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 | - * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 | - * +--------------------------+----+-----+----+----+---------+-----------+ - */ -} dccp_feat_table[] = { - { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid }, - { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL }, - { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win }, - { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL }, - { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio}, - { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec }, - { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp }, - { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov}, - { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL }, - { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL }, -}; -#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table) - -/** - * dccp_feat_index - Hash function to map feature number into array position - * @feat_num: feature to hash, one of %dccp_feature_numbers - * - * Returns consecutive array index or -1 if the feature is not understood. - */ -static int dccp_feat_index(u8 feat_num) -{ - /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */ - if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) - return feat_num - 1; - - /* - * Other features: add cases for new feature types here after adding - * them to the above table. - */ - switch (feat_num) { - case DCCPF_SEND_LEV_RATE: - return DCCP_FEAT_SUPPORTED_MAX - 1; - } - return -1; -} - -static u8 dccp_feat_type(u8 feat_num) -{ - int idx = dccp_feat_index(feat_num); - - if (idx < 0) - return FEAT_UNKNOWN; - return dccp_feat_table[idx].reconciliation; -} - -static int dccp_feat_default_value(u8 feat_num) -{ - int idx = dccp_feat_index(feat_num); - /* - * There are no default values for unknown features, so encountering a - * negative index here indicates a serious problem somewhere else. - */ - DCCP_BUG_ON(idx < 0); - - return idx < 0 ? 0 : dccp_feat_table[idx].default_value; -} - -/* - * Debugging and verbose-printing section - */ -static const char *dccp_feat_fname(const u8 feat) -{ - static const char *const feature_names[] = { - [DCCPF_RESERVED] = "Reserved", - [DCCPF_CCID] = "CCID", - [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", - [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", - [DCCPF_ECN_INCAPABLE] = "ECN Incapable", - [DCCPF_ACK_RATIO] = "Ack Ratio", - [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", - [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", - [DCCPF_MIN_CSUM_COVER] = "Min. 
Csum Coverage", - [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", - }; - if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) - return feature_names[DCCPF_RESERVED]; - - if (feat == DCCPF_SEND_LEV_RATE) - return "Send Loss Event Rate"; - if (feat >= DCCPF_MIN_CCID_SPECIFIC) - return "CCID-specific"; - - return feature_names[feat]; -} - -static const char *const dccp_feat_sname[] = { - "DEFAULT", "INITIALISING", "CHANGING", "UNSTABLE", "STABLE", -}; - -#ifdef CONFIG_IP_DCCP_DEBUG -static const char *dccp_feat_oname(const u8 opt) -{ - switch (opt) { - case DCCPO_CHANGE_L: return "Change_L"; - case DCCPO_CONFIRM_L: return "Confirm_L"; - case DCCPO_CHANGE_R: return "Change_R"; - case DCCPO_CONFIRM_R: return "Confirm_R"; - } - return NULL; -} - -static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val) -{ - u8 i, type = dccp_feat_type(feat_num); - - if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL)) - dccp_pr_debug_cat("(NULL)"); - else if (type == FEAT_SP) - for (i = 0; i < val->sp.len; i++) - dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]); - else if (type == FEAT_NN) - dccp_pr_debug_cat("%llu", (unsigned long long)val->nn); - else - dccp_pr_debug_cat("unknown type %u", type); -} - -static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len) -{ - u8 type = dccp_feat_type(feat_num); - dccp_feat_val fval = { .sp.vec = list, .sp.len = len }; - - if (type == FEAT_NN) - fval.nn = dccp_decode_value_var(list, len); - dccp_feat_printval(feat_num, &fval); -} - -static void dccp_feat_print_entry(struct dccp_feat_entry const *entry) -{ - dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote", - dccp_feat_fname(entry->feat_num)); - dccp_feat_printval(entry->feat_num, &entry->val); - dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state], - entry->needs_confirm ? "(Confirm pending)" : ""); -} - -#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \ - dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\ - dccp_feat_printvals(feat, val, len); \ - dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0) - -#define dccp_feat_print_fnlist(fn_list) { \ - const struct dccp_feat_entry *___entry; \ - \ - dccp_pr_debug("List Dump:\n"); \ - list_for_each_entry(___entry, fn_list, node) \ - dccp_feat_print_entry(___entry); \ -} -#else /* ! CONFIG_IP_DCCP_DEBUG */ -#define dccp_feat_print_opt(opt, feat, val, len, mandatory) -#define dccp_feat_print_fnlist(fn_list) -#endif - -static int __dccp_feat_activate(struct sock *sk, const int idx, - const bool is_local, dccp_feat_val const *fval) -{ - bool rx; - u64 val; - - if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX) - return -1; - if (dccp_feat_table[idx].activation_hdlr == NULL) - return 0; - - if (fval == NULL) { - val = dccp_feat_table[idx].default_value; - } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) { - if (fval->sp.vec == NULL) { - /* - * This can happen when an empty Confirm is sent - * for an SP (i.e. known) feature. In this case - * we would be using the default anyway. - */ - DCCP_CRIT("Feature #%d undefined: using default", idx); - val = dccp_feat_table[idx].default_value; - } else { - val = fval->sp.vec[0]; - } - } else { - val = fval->nn; - } - - /* Location is RX if this is a local-RX or remote-TX feature */ - rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); - - dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX", - dccp_feat_fname(dccp_feat_table[idx].feat_num), - fval ? 
"" : "default ", (unsigned long long)val); - - return dccp_feat_table[idx].activation_hdlr(sk, val, rx); -} - -/** - * dccp_feat_activate - Activate feature value on socket - * @sk: fully connected DCCP socket (after handshake is complete) - * @feat_num: feature to activate, one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is meant - * @fval: the value (SP or NN) to activate, or NULL to use the default value - * - * For general use this function is preferable over __dccp_feat_activate(). - */ -static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local, - dccp_feat_val const *fval) -{ - return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval); -} - -/* Test for "Req'd" feature (RFC 4340, 6.4) */ -static inline int dccp_feat_must_be_understood(u8 feat_num) -{ - return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS || - feat_num == DCCPF_SEQUENCE_WINDOW; -} - -/* copy constructor, fval must not already contain allocated memory */ -static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) -{ - fval->sp.len = len; - if (fval->sp.len > 0) { - fval->sp.vec = kmemdup(val, len, gfp_any()); - if (fval->sp.vec == NULL) { - fval->sp.len = 0; - return -ENOMEM; - } - } - return 0; -} - -static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val) -{ - if (unlikely(val == NULL)) - return; - if (dccp_feat_type(feat_num) == FEAT_SP) - kfree(val->sp.vec); - memset(val, 0, sizeof(*val)); -} - -static struct dccp_feat_entry * - dccp_feat_clone_entry(struct dccp_feat_entry const *original) -{ - struct dccp_feat_entry *new; - u8 type = dccp_feat_type(original->feat_num); - - if (type == FEAT_UNKNOWN) - return NULL; - - new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any()); - if (new == NULL) - return NULL; - - if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val, - original->val.sp.vec, - original->val.sp.len)) { - kfree(new); - return NULL; - } - return new; -} - -static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) -{ - if (entry != NULL) { - dccp_feat_val_destructor(entry->feat_num, &entry->val); - kfree(entry); - } -} - -/* - * List management functions - * - * Feature negotiation lists rely on and maintain the following invariants: - * - each feat_num in the list is known, i.e. we know its type and default value - * - each feat_num/is_local combination is unique (old entries are overwritten) - * - SP values are always freshly allocated - * - list is sorted in increasing order of feature number (faster lookup) - */ -static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list, - u8 feat_num, bool is_local) -{ - struct dccp_feat_entry *entry; - - list_for_each_entry(entry, fn_list, node) { - if (entry->feat_num == feat_num && entry->is_local == is_local) - return entry; - else if (entry->feat_num > feat_num) - break; - } - return NULL; -} - -/** - * dccp_feat_entry_new - Central list update routine (called by all others) - * @head: list to add to - * @feat: feature number - * @local: whether the local (1) or remote feature with number @feat is meant - * - * This is the only constructor and serves to ensure the above invariants. 
- */ -static struct dccp_feat_entry * - dccp_feat_entry_new(struct list_head *head, u8 feat, bool local) -{ - struct dccp_feat_entry *entry; - - list_for_each_entry(entry, head, node) - if (entry->feat_num == feat && entry->is_local == local) { - dccp_feat_val_destructor(entry->feat_num, &entry->val); - return entry; - } else if (entry->feat_num > feat) { - head = &entry->node; - break; - } - - entry = kmalloc(sizeof(*entry), gfp_any()); - if (entry != NULL) { - entry->feat_num = feat; - entry->is_local = local; - list_add_tail(&entry->node, head); - } - return entry; -} - -/** - * dccp_feat_push_change - Add/overwrite a Change option in the list - * @fn_list: feature-negotiation list to update - * @feat: one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is meant - * @mandatory: whether to use Mandatory feature negotiation options - * @fval: pointer to NN/SP value to be inserted (will be copied) - */ -static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local, - u8 mandatory, dccp_feat_val *fval) -{ - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); - - if (new == NULL) - return -ENOMEM; - - new->feat_num = feat; - new->is_local = local; - new->state = FEAT_INITIALISING; - new->needs_confirm = false; - new->empty_confirm = false; - new->val = *fval; - new->needs_mandatory = mandatory; - - return 0; -} - -/** - * dccp_feat_push_confirm - Add a Confirm entry to the FN list - * @fn_list: feature-negotiation list to add to - * @feat: one of %dccp_feature_numbers - * @local: whether local (1) or remote (0) @feat_num is being confirmed - * @fval: pointer to NN/SP value to be inserted or NULL - * - * Returns 0 on success, a Reset code for further processing otherwise. - */ -static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local, - dccp_feat_val *fval) -{ - struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); - - if (new == NULL) - return DCCP_RESET_CODE_TOO_BUSY; - - new->feat_num = feat; - new->is_local = local; - new->state = FEAT_STABLE; /* transition in 6.6.2 */ - new->needs_confirm = true; - new->empty_confirm = (fval == NULL); - new->val.nn = 0; /* zeroes the whole structure */ - if (!new->empty_confirm) - new->val = *fval; - new->needs_mandatory = false; - - return 0; -} - -static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local) -{ - return dccp_feat_push_confirm(fn_list, feat, local, NULL); -} - -static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry) -{ - list_del(&entry->node); - dccp_feat_entry_destructor(entry); -} - -void dccp_feat_list_purge(struct list_head *fn_list) -{ - struct dccp_feat_entry *entry, *next; - - list_for_each_entry_safe(entry, next, fn_list, node) - dccp_feat_entry_destructor(entry); - INIT_LIST_HEAD(fn_list); -} -EXPORT_SYMBOL_GPL(dccp_feat_list_purge); - -/* generate @to as full clone of @from - @to must not contain any nodes */ -int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) -{ - struct dccp_feat_entry *entry, *new; - - INIT_LIST_HEAD(to); - list_for_each_entry(entry, from, node) { - new = dccp_feat_clone_entry(entry); - if (new == NULL) - goto cloning_failed; - list_add_tail(&new->node, to); - } - return 0; - -cloning_failed: - dccp_feat_list_purge(to); - return -ENOMEM; -} - -/** - * dccp_feat_valid_nn_length - Enforce length constraints on NN options - * @feat_num: feature to return length of, one of %dccp_feature_numbers - * - * Length is between 0 and %DCCP_OPTVAL_MAXLEN. 
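The list invariants described above all funnel through that single find-or-insert constructor. A compact userspace sketch of the same pattern, with simplified types, a plain singly-linked list instead of list_head, and OOM reduced to abort():

#include <stdio.h>
#include <stdlib.h>

struct entry {
	int feat, is_local, val;
	struct entry *next;
};

/* Find-or-insert: keep ascending feature order and at most one entry
 * per (feat, is_local) pair, like dccp_feat_entry_new() above. */
static struct entry *entry_new(struct entry **head, int feat, int is_local)
{
	struct entry **p, *e;

	for (p = head; *p; p = &(*p)->next) {
		if ((*p)->feat == feat && (*p)->is_local == is_local)
			return *p;	/* reuse: old value is clobbered */
		if ((*p)->feat > feat)
			break;		/* insertion point found */
	}
	e = calloc(1, sizeof(*e));
	if (!e)
		abort();		/* demo only: treat OOM as fatal */
	e->feat = feat;
	e->is_local = is_local;
	e->next = *p;
	*p = e;
	return e;
}

int main(void)
{
	struct entry *head = NULL, *e;

	entry_new(&head, 5, 1)->val = 2;
	entry_new(&head, 1, 1)->val = 3;
	entry_new(&head, 5, 1)->val = 4;	/* overwrite, not duplicate */

	for (e = head; e; e = e->next)		/* prints feat 1, then feat 5 */
		printf("feat %d local %d val %d\n", e->feat, e->is_local, e->val);
	return 0;
}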
Used for outgoing packets only, - * incoming options are accepted as long as their values are valid. - */ -static u8 dccp_feat_valid_nn_length(u8 feat_num) -{ - if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */ - return 2; - if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */ - return 6; - return 0; -} - -static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) -{ - switch (feat_num) { - case DCCPF_ACK_RATIO: - return val <= DCCPF_ACK_RATIO_MAX; - case DCCPF_SEQUENCE_WINDOW: - return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX; - } - return 0; /* feature unknown - so we can't tell */ -} - -/* check that SP values are within the ranges defined in RFC 4340 */ -static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val) -{ - switch (feat_num) { - case DCCPF_CCID: - return val == DCCPC_CCID2 || val == DCCPC_CCID3; - /* Type-check Boolean feature values: */ - case DCCPF_SHORT_SEQNOS: - case DCCPF_ECN_INCAPABLE: - case DCCPF_SEND_ACK_VECTOR: - case DCCPF_SEND_NDP_COUNT: - case DCCPF_DATA_CHECKSUM: - case DCCPF_SEND_LEV_RATE: - return val < 2; - case DCCPF_MIN_CSUM_COVER: - return val < 16; - } - return 0; /* feature unknown */ -} - -static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) -{ - if (sp_list == NULL || sp_len < 1) - return 0; - while (sp_len--) - if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++)) - return 0; - return 1; -} - -/** - * dccp_feat_insert_opts - Generate FN options from current list state - * @skb: next sk_buff to be sent to the peer - * @dp: for client during handshake and general negotiation - * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND) - */ -int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq, - struct sk_buff *skb) -{ - struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; - struct dccp_feat_entry *pos, *next; - u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN]; - bool rpt; - - /* put entries into @skb in the order they appear in the list */ - list_for_each_entry_safe_reverse(pos, next, fn, node) { - opt = dccp_feat_genopt(pos); - type = dccp_feat_type(pos->feat_num); - rpt = false; - - if (pos->empty_confirm) { - len = 0; - ptr = NULL; - } else { - if (type == FEAT_SP) { - len = pos->val.sp.len; - ptr = pos->val.sp.vec; - rpt = pos->needs_confirm; - } else if (type == FEAT_NN) { - len = dccp_feat_valid_nn_length(pos->feat_num); - ptr = nn_in_nbo; - dccp_encode_value_var(pos->val.nn, ptr, len); - } else { - DCCP_BUG("unknown feature %u", pos->feat_num); - return -1; - } - } - dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0); - - if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) - return -1; - if (pos->needs_mandatory && dccp_insert_option_mandatory(skb)) - return -1; - - if (skb->sk->sk_state == DCCP_OPEN && - (opt == DCCPO_CONFIRM_R || opt == DCCPO_CONFIRM_L)) { - /* - * Confirms don't get retransmitted (6.6.3) once the - * connection is in state OPEN - */ - dccp_feat_list_pop(pos); - } else { - /* - * Enter CHANGING after transmitting the Change - * option (6.6.2). - */ - if (pos->state == FEAT_INITIALISING) - pos->state = FEAT_CHANGING; - } - } - return 0; -} - -/** - * __feat_register_nn - Register new NN value on socket - * @fn: feature-negotiation list to register with - * @feat: an NN feature from %dccp_feature_numbers - * @mandatory: use Mandatory option if 1 - * @nn_val: value to register (restricted to 4 bytes) - * - * Note that NN features are local by definition (RFC 4340, 6.3.2). 
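The fixed lengths returned here (2 bytes for Ack Ratio, 6 for Sequence Window) are consumed by dccp_encode_value_var() in the insertion routine above, which emits exactly that many bytes most-significant first. A behaviourally equivalent userspace sketch of the encode/decode pair:

#include <stdint.h>
#include <stdio.h>

#define OPTVAL_MAXLEN 6		/* largest NN payload: Sequence Window */

/* Write exactly `len` bytes of `value`, most-significant byte first. */
static void encode_value_var(uint64_t value, uint8_t *to, uint8_t len)
{
	while (len--)
		*to++ = (value >> (8 * len)) & 0xff;
}

static uint64_t decode_value_var(const uint8_t *bf, uint8_t len)
{
	uint64_t value = 0;

	while (len--)
		value = (value << 8) | *bf++;
	return value;
}

int main(void)
{
	uint8_t buf[OPTVAL_MAXLEN];

	encode_value_var(1000, buf, 6);		/* a Sequence Window value */
	printf("round trip: %llu\n",		/* 1000 */
	       (unsigned long long)decode_value_var(buf, 6));
	return 0;
}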
- */ -static int __feat_register_nn(struct list_head *fn, u8 feat, - u8 mandatory, u64 nn_val) -{ - dccp_feat_val fval = { .nn = nn_val }; - - if (dccp_feat_type(feat) != FEAT_NN || - !dccp_feat_is_valid_nn_val(feat, nn_val)) - return -EINVAL; - - /* Don't bother with default values, they will be activated anyway. */ - if (nn_val - (u64)dccp_feat_default_value(feat) == 0) - return 0; - - return dccp_feat_push_change(fn, feat, 1, mandatory, &fval); -} - -/** - * __feat_register_sp - Register new SP value/list on socket - * @fn: feature-negotiation list to register with - * @feat: an SP feature from %dccp_feature_numbers - * @is_local: whether the local (1) or the remote (0) @feat is meant - * @mandatory: use Mandatory option if 1 - * @sp_val: SP value followed by optional preference list - * @sp_len: length of @sp_val in bytes - */ -static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, - u8 mandatory, u8 const *sp_val, u8 sp_len) -{ - dccp_feat_val fval; - - if (dccp_feat_type(feat) != FEAT_SP || - !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) - return -EINVAL; - - /* Avoid negotiating alien CCIDs by only advertising supported ones */ - if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) - return -EOPNOTSUPP; - - if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) - return -ENOMEM; - - if (dccp_feat_push_change(fn, feat, is_local, mandatory, &fval)) { - kfree(fval.sp.vec); - return -ENOMEM; - } - - return 0; -} - -/** - * dccp_feat_register_sp - Register requests to change SP feature values - * @sk: client or listening socket - * @feat: one of %dccp_feature_numbers - * @is_local: whether the local (1) or remote (0) @feat is meant - * @list: array of preferred values, in descending order of preference - * @len: length of @list in bytes - */ -int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, - u8 const *list, u8 len) -{ /* any changes must be registered before establishing the connection */ - if (sk->sk_state != DCCP_CLOSED) - return -EISCONN; - if (dccp_feat_type(feat) != FEAT_SP) - return -EINVAL; - return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local, - 0, list, len); -} - -/** - * dccp_feat_nn_get - Query current/pending value of NN feature - * @sk: DCCP socket of an established connection - * @feat: NN feature number from %dccp_feature_numbers - * - * For a known NN feature, returns value currently being negotiated, or - * current (confirmed) value if no negotiation is going on. - */ -u64 dccp_feat_nn_get(struct sock *sk, u8 feat) -{ - if (dccp_feat_type(feat) == FEAT_NN) { - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_feat_entry *entry; - - entry = dccp_feat_list_lookup(&dp->dccps_featneg, feat, 1); - if (entry != NULL) - return entry->val.nn; - - switch (feat) { - case DCCPF_ACK_RATIO: - return dp->dccps_l_ack_ratio; - case DCCPF_SEQUENCE_WINDOW: - return dp->dccps_l_seq_win; - } - } - DCCP_BUG("attempt to look up unsupported feature %u", feat); - return 0; -} -EXPORT_SYMBOL_GPL(dccp_feat_nn_get); - -/** - * dccp_feat_signal_nn_change - Update NN values for an established connection - * @sk: DCCP socket of an established connection - * @feat: NN feature number from %dccp_feature_numbers - * @nn_val: the new value to use - * - * This function is used to communicate NN updates out-of-band. 
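For context on how dccp_feat_register_sp() was reached from userspace: an application could seed the CCID preference list with setsockopt() on a still-unconnected DCCP socket (the -EISCONN check above enforces that ordering). A hedged sketch follows; the numeric constants are quoted from the now-deleted uapi headers as an assumption, and on a kernel carrying this patch the calls will simply fail:

#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef SOCK_DCCP
#define SOCK_DCCP 6
#endif
#ifndef IPPROTO_DCCP
#define IPPROTO_DCCP 33
#endif
#ifndef SOL_DCCP
#define SOL_DCCP 269			/* assumption: from removed headers */
#endif
#ifndef DCCP_SOCKOPT_CCID
#define DCCP_SOCKOPT_CCID 13		/* assumption: sets TX and RX lists */
#endif

int main(void)
{
	unsigned char prefs[] = { 3, 2 };	/* prefer CCID-3, accept CCID-2 */
	int fd = socket(AF_INET, SOCK_DCCP, IPPROTO_DCCP);

	if (fd < 0 || setsockopt(fd, SOL_DCCP, DCCP_SOCKOPT_CCID,
				 prefs, sizeof(prefs)) < 0)
		perror("dccp");			/* expected after this removal */
	if (fd >= 0)
		close(fd);
	return 0;
}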
- */ -int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - dccp_feat_val fval = { .nn = nn_val }; - struct dccp_feat_entry *entry; - - if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN) - return 0; - - if (dccp_feat_type(feat) != FEAT_NN || - !dccp_feat_is_valid_nn_val(feat, nn_val)) - return -EINVAL; - - if (nn_val == dccp_feat_nn_get(sk, feat)) - return 0; /* already set or negotiation under way */ - - entry = dccp_feat_list_lookup(fn, feat, 1); - if (entry != NULL) { - dccp_pr_debug("Clobbering existing NN entry %llu -> %llu\n", - (unsigned long long)entry->val.nn, - (unsigned long long)nn_val); - dccp_feat_list_pop(entry); - } - - inet_csk_schedule_ack(sk); - return dccp_feat_push_change(fn, feat, 1, 0, &fval); -} -EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change); - -/* - * Tracking features whose value depend on the choice of CCID - * - * This is designed with an extension in mind so that a list walk could be done - * before activating any features. However, the existing framework was found to - * work satisfactorily up until now, the automatic verification is left open. - * When adding new CCIDs, add a corresponding dependency table here. - */ -static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local) -{ - static const struct ccid_dependency ccid2_dependencies[2][2] = { - /* - * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX - * feature and Send Ack Vector is an RX feature, `is_local' - * needs to be reversed. - */ - { /* Dependencies of the receiver-side (remote) CCID2 */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = true, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 } - }, - { /* Dependencies of the sender-side (local) CCID2 */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 } - } - }; - static const struct ccid_dependency ccid3_dependencies[2][5] = { - { /* - * Dependencies of the receiver-side CCID3 - */ - { /* locally disable Ack Vectors */ - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = true, - .is_mandatory = false, - .val = 0 - }, - { /* see below why Send Loss Event Rate is on */ - .dependent_feat = DCCPF_SEND_LEV_RATE, - .is_local = true, - .is_mandatory = true, - .val = 1 - }, - { /* NDP Count is needed as per RFC 4342, 6.1.1 */ - .dependent_feat = DCCPF_SEND_NDP_COUNT, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { 0, 0, 0, 0 }, - }, - { /* - * CCID3 at the TX side: we request that the HC-receiver - * will not send Ack Vectors (they will be ignored, so - * Mandatory is not set); we enable Send Loss Event Rate - * (Mandatory since the implementation does not support - * the Loss Intervals option of RFC 4342, 8.6). - * The last two options are for peer's information only. 
- */ - { - .dependent_feat = DCCPF_SEND_ACK_VECTOR, - .is_local = false, - .is_mandatory = false, - .val = 0 - }, - { - .dependent_feat = DCCPF_SEND_LEV_RATE, - .is_local = false, - .is_mandatory = true, - .val = 1 - }, - { /* this CCID does not support Ack Ratio */ - .dependent_feat = DCCPF_ACK_RATIO, - .is_local = true, - .is_mandatory = false, - .val = 0 - }, - { /* tell receiver we are sending NDP counts */ - .dependent_feat = DCCPF_SEND_NDP_COUNT, - .is_local = true, - .is_mandatory = false, - .val = 1 - }, - { 0, 0, 0, 0 } - } - }; - switch (ccid) { - case DCCPC_CCID2: - return ccid2_dependencies[is_local]; - case DCCPC_CCID3: - return ccid3_dependencies[is_local]; - default: - return NULL; - } -} - -/** - * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID - * @fn: feature-negotiation list to update - * @id: CCID number to track - * @is_local: whether TX CCID (1) or RX CCID (0) is meant - * - * This function needs to be called after registering all other features. - */ -static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) -{ - const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local); - int i, rc = (table == NULL); - - for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++) - if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP) - rc = __feat_register_sp(fn, table[i].dependent_feat, - table[i].is_local, - table[i].is_mandatory, - &table[i].val, 1); - else - rc = __feat_register_nn(fn, table[i].dependent_feat, - table[i].is_mandatory, - table[i].val); - return rc; -} - -/** - * dccp_feat_finalise_settings - Finalise settings before starting negotiation - * @dp: client or listening socket (settings will be inherited) - * - * This is called after all registrations (socket initialisation, sysctls, and - * sockopt calls), and before sending the first packet containing Change options - * (ie. client-Request or server-Response), to ensure internal consistency. - */ -int dccp_feat_finalise_settings(struct dccp_sock *dp) -{ - struct list_head *fn = &dp->dccps_featneg; - struct dccp_feat_entry *entry; - int i = 2, ccids[2] = { -1, -1 }; - - /* - * Propagating CCIDs: - * 1) not useful to propagate CCID settings if this host advertises more - * than one CCID: the choice of CCID may still change - if this is - * the client, or if this is the server and the client sends - * singleton CCID values. - * 2) since is that propagate_ccid changes the list, we defer changing - * the sorted list until after the traversal. - */ - list_for_each_entry(entry, fn, node) - if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1) - ccids[entry->is_local] = entry->val.sp.vec[0]; - while (i--) - if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) - return -1; - dccp_feat_print_fnlist(fn); - return 0; -} - -/** - * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features - * @dreq: server socket to resolve - * - * It is the server which resolves the dependencies once the CCID has been - * fully negotiated. If no CCID has been negotiated, it uses the default CCID. 
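The dependency walk above that consumes these tables is straightforward: iterate until the all-zero terminator and register each entry, failing fast on error. A sketch with simplified types and a stubbed-out registration helper (invented names):

#include <stdbool.h>
#include <stdio.h>

struct dep {
	unsigned char feat;	/* dependent feature number */
	bool local, mandatory;
	unsigned char val;
};

/* Sender-side CCID-2 needs the peer to send Ack Vectors (RFC 4341, 4.) */
static const struct dep ccid2_tx_deps[] = {
	{ 6 /* Send Ack Vector */, false, true, 1 },
	{ 0, 0, 0, 0 }		/* terminator, as in the tables above */
};

static int register_dep(const struct dep *d)	/* stand-in for the SP/NN */
{						/* registration helpers   */
	printf("feat %u (%s, %s) = %u\n", d->feat,
	       d->local ? "local" : "remote",
	       d->mandatory ? "mandatory" : "optional", d->val);
	return 0;
}

int main(void)
{
	for (const struct dep *d = ccid2_tx_deps; d->feat; d++)
		if (register_dep(d))
			return 1;	/* fail fast, like propagate_ccid */
	return 0;
}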
- */ -int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq) -{ - struct list_head *fn = &dreq->dreq_featneg; - struct dccp_feat_entry *entry; - u8 is_local, ccid; - - for (is_local = 0; is_local <= 1; is_local++) { - entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local); - - if (entry != NULL && !entry->empty_confirm) - ccid = entry->val.sp.vec[0]; - else - ccid = dccp_feat_default_value(DCCPF_CCID); - - if (dccp_feat_propagate_ccid(fn, ccid, is_local)) - return -1; - } - return 0; -} - -/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ -static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) -{ - u8 c, s; - - for (s = 0; s < slen; s++) - for (c = 0; c < clen; c++) - if (servlist[s] == clilist[c]) - return servlist[s]; - return -1; -} - -/** - * dccp_feat_prefer - Move preferred entry to the start of array - * @preferred_value: entry to move to start of array - * @array: array of preferred entries - * @array_len: size of the array - * - * Reorder the @array_len elements in @array so that @preferred_value comes - * first. Returns >0 to indicate that @preferred_value does occur in @array. - */ -static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len) -{ - u8 i, does_occur = 0; - - if (array != NULL) { - for (i = 0; i < array_len; i++) - if (array[i] == preferred_value) { - array[i] = array[0]; - does_occur++; - } - if (does_occur) - array[0] = preferred_value; - } - return does_occur; -} - -/** - * dccp_feat_reconcile - Reconcile SP preference lists - * @fv: SP list to reconcile into - * @arr: received SP preference list - * @len: length of @arr in bytes - * @is_server: whether this side is the server (and @fv is the server's list) - * @reorder: whether to reorder the list in @fv after reconciling with @arr - * When successful, > 0 is returned and the reconciled list is in @fval. - * A value of 0 means that negotiation failed (no shared entry). - */ -static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len, - bool is_server, bool reorder) -{ - int rc; - - if (!fv->sp.vec || !arr) { - DCCP_CRIT("NULL feature value or array"); - return 0; - } - - if (is_server) - rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len); - else - rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len); - - if (!reorder) - return rc; - if (rc < 0) - return 0; - - /* - * Reorder list: used for activating features and in dccp_insert_fn_opt. - */ - return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len); -} - -/** - * dccp_feat_change_recv - Process incoming ChangeL/R options - * @fn: feature-negotiation list to update - * @is_mandatory: whether the Change was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R - * @feat: one of %dccp_feature_numbers - * @val: NN value or SP value/preference list - * @len: length of @val in bytes - * @server: whether this node is the server (1) or the client (0) - */ -static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, - u8 feat, u8 *val, u8 len, const bool server) -{ - u8 defval, type = dccp_feat_type(feat); - const bool local = (opt == DCCPO_CHANGE_R); - struct dccp_feat_entry *entry; - dccp_feat_val fval; - - if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ - goto unknown_feature_or_value; - - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); - - /* - * Negotiation of NN features: Change R is invalid, so there is no - * simultaneous negotiation; hence we do not look up in the list. 
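The server-priority rule (RFC 4340, 6.3.1) implemented by the matcher above is small enough to run standalone: scan the server's ordered list and return the first value the client also offers. A minimal sketch:

#include <stdio.h>

static int preflist_match(const unsigned char *srv, int slen,
			  const unsigned char *cli, int clen)
{
	for (int s = 0; s < slen; s++)
		for (int c = 0; c < clen; c++)
			if (srv[s] == cli[c])
				return srv[s];
	return -1;		/* no shared entry: negotiation fails */
}

int main(void)
{
	unsigned char server[] = { 3, 2 };	/* server prefers CCID-3 */
	unsigned char client[] = { 2, 3 };	/* client prefers CCID-2 */

	/* server priority wins: 3, although the client lists 2 first */
	printf("agreed: %d\n", preflist_match(server, 2, client, 2));
	return 0;
}

With the roles swapped, the client's list is scanned in the outer loop instead, which is exactly the is_server branch of dccp_feat_reconcile() above; dccp_feat_prefer() then moves the winner to vec[0] so later code can read the agreed value without re-running the match.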
- */ - if (type == FEAT_NN) { - if (local || len > sizeof(fval.nn)) - goto unknown_feature_or_value; - - /* 6.3.2: "The feature remote MUST accept any valid value..." */ - fval.nn = dccp_decode_value_var(val, len); - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) - goto unknown_feature_or_value; - - return dccp_feat_push_confirm(fn, feat, local, &fval); - } - - /* - * Unidirectional/simultaneous negotiation of SP features (6.3.1) - */ - entry = dccp_feat_list_lookup(fn, feat, local); - if (entry == NULL) { - /* - * No particular preferences have been registered. We deal with - * this situation by assuming that all valid values are equally - * acceptable, and apply the following checks: - * - if the peer's list is a singleton, we accept a valid value; - * - if we are the server, we first try to see if the peer (the - * client) advertises the default value. If yes, we use it, - * otherwise we accept the preferred value; - * - else if we are the client, we use the first list element. - */ - if (dccp_feat_clone_sp_val(&fval, val, 1)) - return DCCP_RESET_CODE_TOO_BUSY; - - if (len > 1 && server) { - defval = dccp_feat_default_value(feat); - if (dccp_feat_preflist_match(&defval, 1, val, len) > -1) - fval.sp.vec[0] = defval; - } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) { - kfree(fval.sp.vec); - goto unknown_feature_or_value; - } - - /* Treat unsupported CCIDs like invalid values */ - if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) { - kfree(fval.sp.vec); - goto not_valid_or_not_known; - } - - if (dccp_feat_push_confirm(fn, feat, local, &fval)) { - kfree(fval.sp.vec); - return DCCP_RESET_CODE_TOO_BUSY; - } - - return 0; - } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ - return 0; - } - - if (dccp_feat_reconcile(&entry->val, val, len, server, true)) { - entry->empty_confirm = false; - } else if (is_mandatory) { - return DCCP_RESET_CODE_MANDATORY_ERROR; - } else if (entry->state == FEAT_INITIALISING) { - /* - * Failed simultaneous negotiation (server only): try to `save' - * the connection by checking whether entry contains the default - * value for @feat. If yes, send an empty Confirm to signal that - * the received Change was not understood - which implies using - * the default value. - * If this also fails, we use Reset as the last resort. - */ - WARN_ON(!server); - defval = dccp_feat_default_value(feat); - if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true)) - return DCCP_RESET_CODE_OPTION_ERROR; - entry->empty_confirm = true; - } - entry->needs_confirm = true; - entry->needs_mandatory = false; - entry->state = FEAT_STABLE; - return 0; - -unknown_feature_or_value: - if (!is_mandatory) - return dccp_push_empty_confirm(fn, feat, local); - -not_valid_or_not_known: - return is_mandatory ? 
DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; -} - -/** - * dccp_feat_confirm_recv - Process received Confirm options - * @fn: feature-negotiation list to update - * @is_mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R - * @feat: one of %dccp_feature_numbers - * @val: NN value or SP value/preference list - * @len: length of @val in bytes - * @server: whether this node is server (1) or client (0) - */ -static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, - u8 feat, u8 *val, u8 len, const bool server) -{ - u8 *plist, plen, type = dccp_feat_type(feat); - const bool local = (opt == DCCPO_CONFIRM_R); - struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); - - dccp_feat_print_opt(opt, feat, val, len, is_mandatory); - - if (entry == NULL) { /* nothing queued: ignore or handle error */ - if (is_mandatory && type == FEAT_UNKNOWN) - return DCCP_RESET_CODE_MANDATORY_ERROR; - - if (!local && type == FEAT_NN) /* 6.3.2 */ - goto confirmation_failed; - return 0; - } - - if (entry->state != FEAT_CHANGING) /* 6.6.2 */ - return 0; - - if (len == 0) { - if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */ - goto confirmation_failed; - /* - * Empty Confirm during connection setup: this means reverting - * to the `old' value, which in this case is the default. Since - * we handle default values automatically when no other values - * have been set, we revert to the old value by removing this - * entry from the list. - */ - dccp_feat_list_pop(entry); - return 0; - } - - if (type == FEAT_NN) { - if (len > sizeof(entry->val.nn)) - goto confirmation_failed; - - if (entry->val.nn == dccp_decode_value_var(val, len)) - goto confirmation_succeeded; - - DCCP_WARN("Bogus Confirm for non-existing value\n"); - goto confirmation_failed; - } - - /* - * Parsing SP Confirms: the first element of @val is the preferred - * SP value which the peer confirms, the remainder depends on @len. - * Note that only the confirmed value need to be a valid SP value. - */ - if (!dccp_feat_is_valid_sp_val(feat, *val)) - goto confirmation_failed; - - if (len == 1) { /* peer didn't supply a preference list */ - plist = val; - plen = len; - } else { /* preferred value + preference list */ - plist = val + 1; - plen = len - 1; - } - - /* Check whether the peer got the reconciliation right (6.6.8) */ - if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) { - DCCP_WARN("Confirm selected the wrong value %u\n", *val); - return DCCP_RESET_CODE_OPTION_ERROR; - } - entry->val.sp.vec[0] = *val; - -confirmation_succeeded: - entry->state = FEAT_STABLE; - return 0; - -confirmation_failed: - DCCP_WARN("Confirmation failed\n"); - return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; -} - -/** - * dccp_feat_handle_nn_established - Fast-path reception of NN options - * @sk: socket of an established DCCP connection - * @mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only) - * @feat: NN number, one of %dccp_feature_numbers - * @val: NN value - * @len: length of @val in bytes - * - * This function combines the functionality of change_recv/confirm_recv, with - * the following differences (reset codes are the same): - * - cleanup after receiving the Confirm; - * - values are directly activated after successful parsing; - * - deliberately restricted to NN features. 
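One wire-format detail in the SP-Confirm parsing above is easy to miss: byte 0 of the option value carries the value the peer selected, and any remaining bytes echo its preference list. A tiny sketch of that split, assuming a raw option payload:

#include <stdio.h>

int main(void)
{
	unsigned char val[] = { 2, 2, 3 };	/* selected=2, prefs={2,3} */
	unsigned char len = sizeof(val);
	const unsigned char *plist = len == 1 ? val : val + 1;
	unsigned char plen = len == 1 ? len : len - 1;

	printf("confirmed %u, %u-byte preference list starting with %u\n",
	       val[0], plen, plist[0]);
	return 0;
}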
- * The restriction to NN features is essential since SP features can have non- - * predictable outcomes (depending on the remote configuration), and are inter- - * dependent (CCIDs for instance cause further dependencies). - */ -static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt, - u8 feat, u8 *val, u8 len) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - const bool local = (opt == DCCPO_CONFIRM_R); - struct dccp_feat_entry *entry; - u8 type = dccp_feat_type(feat); - dccp_feat_val fval; - - dccp_feat_print_opt(opt, feat, val, len, mandatory); - - /* Ignore non-mandatory unknown and non-NN features */ - if (type == FEAT_UNKNOWN) { - if (local && !mandatory) - return 0; - goto fast_path_unknown; - } else if (type != FEAT_NN) { - return 0; - } - - /* - * We don't accept empty Confirms, since in fast-path feature - * negotiation the values are enabled immediately after sending - * the Change option. - * Empty Changes on the other hand are invalid (RFC 4340, 6.1). - */ - if (len == 0 || len > sizeof(fval.nn)) - goto fast_path_unknown; - - if (opt == DCCPO_CHANGE_L) { - fval.nn = dccp_decode_value_var(val, len); - if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) - goto fast_path_unknown; - - if (dccp_feat_push_confirm(fn, feat, local, &fval) || - dccp_feat_activate(sk, feat, local, &fval)) - return DCCP_RESET_CODE_TOO_BUSY; - - /* set the `Ack Pending' flag to piggyback a Confirm */ - inet_csk_schedule_ack(sk); - - } else if (opt == DCCPO_CONFIRM_R) { - entry = dccp_feat_list_lookup(fn, feat, local); - if (entry == NULL || entry->state != FEAT_CHANGING) - return 0; - - fval.nn = dccp_decode_value_var(val, len); - /* - * Just ignore a value that doesn't match our current value. - * If the option changes twice within two RTTs, then at least - * one CONFIRM will be received for the old value after a - * new CHANGE was sent. - */ - if (fval.nn != entry->val.nn) - return 0; - - /* Only activate after receiving the Confirm option (6.6.1). */ - dccp_feat_activate(sk, feat, local, &fval); - - /* It has been confirmed - so remove the entry */ - dccp_feat_list_pop(entry); - - } else { - DCCP_WARN("Received illegal option %u\n", opt); - goto fast_path_failed; - } - return 0; - -fast_path_unknown: - if (!mandatory) - return dccp_push_empty_confirm(fn, feat, local); - -fast_path_failed: - return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR - : DCCP_RESET_CODE_OPTION_ERROR; -} - -/** - * dccp_feat_parse_options - Process Feature-Negotiation Options - * @sk: for general use and used by the client during connection setup - * @dreq: used by the server during connection setup - * @mandatory: whether @opt was preceded by a Mandatory option - * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R - * @feat: one of %dccp_feature_numbers - * @val: value contents of @opt - * @len: length of @val in bytes - * - * Returns 0 on success, a Reset code for ending the connection otherwise. - */ -int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct list_head *fn = dreq ? 
&dreq->dreq_featneg : &dp->dccps_featneg; - bool server = false; - - switch (sk->sk_state) { - /* - * Negotiation during connection setup - */ - case DCCP_LISTEN: - server = true; - fallthrough; - case DCCP_REQUESTING: - switch (opt) { - case DCCPO_CHANGE_L: - case DCCPO_CHANGE_R: - return dccp_feat_change_recv(fn, mandatory, opt, feat, - val, len, server); - case DCCPO_CONFIRM_R: - case DCCPO_CONFIRM_L: - return dccp_feat_confirm_recv(fn, mandatory, opt, feat, - val, len, server); - } - break; - /* - * Support for exchanging NN options on an established connection. - */ - case DCCP_OPEN: - case DCCP_PARTOPEN: - return dccp_feat_handle_nn_established(sk, mandatory, opt, feat, - val, len); - } - return 0; /* ignore FN options in all other states */ -} - -/** - * dccp_feat_init - Seed feature negotiation with host-specific defaults - * @sk: Socket to initialize. - * - * This initialises global defaults, depending on the value of the sysctls. - * These can later be overridden by registering changes via setsockopt calls. - * The last link in the chain is finalise_settings, to make sure that between - * here and the start of actual feature negotiation no inconsistencies enter. - * - * All features not appearing below use either defaults or are otherwise - * later adjusted through dccp_feat_finalise_settings(). - */ -int dccp_feat_init(struct sock *sk) -{ - struct list_head *fn = &dccp_sk(sk)->dccps_featneg; - u8 on = 1, off = 0; - int rc; - struct { - u8 *val; - u8 len; - } tx, rx; - - /* Non-negotiable (NN) features */ - rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, - sysctl_dccp_sequence_window); - if (rc) - return rc; - - /* Server-priority (SP) features */ - - /* Advertise that short seqnos are not supported (7.6.1) */ - rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); - if (rc) - return rc; - - /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ - rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); - if (rc) - return rc; - - /* - * We advertise the available list of CCIDs and reorder according to - * preferences, to avoid failure resulting from negotiating different - * singleton values (which always leads to failure). - * These settings can still (later) be overridden via sockopts. - */ - if (ccid_get_builtin_ccids(&tx.val, &tx.len)) - return -ENOBUFS; - if (ccid_get_builtin_ccids(&rx.val, &rx.len)) { - kfree(tx.val); - return -ENOBUFS; - } - - if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || - !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) - goto free_ccid_lists; - - rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); - if (rc) - goto free_ccid_lists; - - rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); - -free_ccid_lists: - kfree(tx.val); - kfree(rx.val); - return rc; -} - -int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_feat_entry *cur, *next; - int idx; - dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = { - [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL } - }; - - list_for_each_entry(cur, fn_list, node) { - /* - * An empty Confirm means that either an unknown feature type - * or an invalid value was present. In the first case there is - * nothing to activate, in the other the default value is used. 
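The activation pass begun just above works in two steps: collect the negotiated value (if any) per table index, then activate from the highest index downwards so that the CCIDs at index 0 run last, after every feature they may depend on. A sketch of that ordering, simplified to a plain int array:

#include <stdio.h>

#define NFEAT 10

static int activate(int idx, int val)
{
	printf("activate idx %d = %d\n", idx, val);
	return 0;		/* handlers report failure as nonzero */
}

int main(void)
{
	/* slot 0 is the CCID; the rest are the other features' values */
	int vals[NFEAT] = { 2, 0, 100, 0, 2, 0, 0, 0, 0, 0 };

	for (int idx = NFEAT; --idx >= 0; )	/* 9, 8, ..., 0 */
		if (activate(idx, vals[idx]))
			return 1;	/* caller tears the socket down */
	return 0;
}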
- */ - if (cur->empty_confirm) - continue; - - idx = dccp_feat_index(cur->feat_num); - if (idx < 0) { - DCCP_BUG("Unknown feature %u", cur->feat_num); - goto activation_failed; - } - if (cur->state != FEAT_STABLE) { - DCCP_CRIT("Negotiation of %s %s failed in state %s", - cur->is_local ? "local" : "remote", - dccp_feat_fname(cur->feat_num), - dccp_feat_sname[cur->state]); - goto activation_failed; - } - fvals[idx][cur->is_local] = &cur->val; - } - - /* - * Activate in decreasing order of index, so that the CCIDs are always - * activated as the last feature. This avoids the case where a CCID - * relies on the initialisation of one or more features that it depends - * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features). - */ - for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;) - if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) || - __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) { - DCCP_CRIT("Could not activate %d", idx); - goto activation_failed; - } - - /* Clean up Change options which have been confirmed already */ - list_for_each_entry_safe(cur, next, fn_list, node) - if (!cur->needs_confirm) - dccp_feat_list_pop(cur); - - dccp_pr_debug("Activation OK\n"); - return 0; - -activation_failed: - /* - * We clean up everything that may have been allocated, since - * it is difficult to track at which stage negotiation failed. - * This is ok, since all allocation functions below are robust - * against NULL arguments. - */ - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - return -1; -} diff --git a/net/dccp/feat.h b/net/dccp/feat.h deleted file mode 100644 index 57d9c026aa3f..000000000000 --- a/net/dccp/feat.h +++ /dev/null @@ -1,133 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _DCCP_FEAT_H -#define _DCCP_FEAT_H -/* - * net/dccp/feat.h - * - * Feature negotiation for the DCCP protocol (RFC 4340, section 6) - * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk> - * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> - */ -#include <linux/types.h> -#include "dccp.h" - -/* - * Known limit values - */ -/* Ack Ratio takes 2-byte integer values (11.3) */ -#define DCCPF_ACK_RATIO_MAX 0xFFFF -/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ -#define DCCPF_SEQ_WMIN 32 -#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull -/* Maximum number of SP values that fit in a single (Confirm) option */ -#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) - -enum dccp_feat_type { - FEAT_AT_RX = 1, /* located at RX side of half-connection */ - FEAT_AT_TX = 2, /* located at TX side of half-connection */ - FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */ - FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */ - FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */ -}; - -enum dccp_feat_state { - FEAT_DEFAULT = 0, /* using default values from 6.4 */ - FEAT_INITIALISING, /* feature is being initialised */ - FEAT_CHANGING, /* Change sent but not confirmed yet */ - FEAT_UNSTABLE, /* local modification in state CHANGING */ - FEAT_STABLE /* both ends (think they) agree */ -}; - -/** - * dccp_feat_val - Container for SP or NN feature values - * @nn: single NN value - * @sp.vec: single SP value plus optional preference list - * @sp.len: length of @sp.vec in bytes - */ -typedef union { - u64 nn; - struct { - u8 *vec; - u8 len; - } sp; -} dccp_feat_val; - -/** - * struct feat_entry - Data structure to 
perform feature negotiation - * @val: feature's current value (SP features may have preference list) - * @state: feature's current state - * @feat_num: one of %dccp_feature_numbers - * @needs_mandatory: whether Mandatory options should be sent - * @needs_confirm: whether to send a Confirm instead of a Change - * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm) - * @is_local: feature location (1) or feature-remote (0) - * @node: list pointers, entries arranged in FIFO order - */ -struct dccp_feat_entry { - dccp_feat_val val; - enum dccp_feat_state state:8; - u8 feat_num; - - bool needs_mandatory, - needs_confirm, - empty_confirm, - is_local; - - struct list_head node; -}; - -static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry) -{ - if (entry->needs_confirm) - return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R; - return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R; -} - -/** - * struct ccid_dependency - Track changes resulting from choosing a CCID - * @dependent_feat: one of %dccp_feature_numbers - * @is_local: local (1) or remote (0) @dependent_feat - * @is_mandatory: whether presence of @dependent_feat is mission-critical or not - * @val: corresponding default value for @dependent_feat (u8 is sufficient here) - */ -struct ccid_dependency { - u8 dependent_feat; - bool is_local:1, - is_mandatory:1; - u8 val; -}; - -/* - * Sysctls to seed defaults for feature negotiation - */ -extern unsigned long sysctl_dccp_sequence_window; -extern int sysctl_dccp_rx_ccid; -extern int sysctl_dccp_tx_ccid; - -int dccp_feat_init(struct sock *sk); -int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, - u8 const *list, u8 len); -int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, - u8 mand, u8 opt, u8 feat, u8 *val, u8 len); -int dccp_feat_clone_list(struct list_head const *, struct list_head *); - -/* - * Encoding variable-length options and their maximum length. - * - * This affects NN options (SP options are all u8) and other variable-length - * options (see table 3 in RFC 4340). The limit is currently given the Sequence - * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other - * options consume less than 6 bytes (timestamps are 4 bytes). - * When updating this constant (e.g. due to new internet drafts / RFCs), make - * sure that you also update all code which refers to it. 
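dccp_feat_genopt() above folds the four feature-negotiation option codes into two booleans. The mapping is worth seeing with numbers attached; the values 32..35 are the option types from RFC 4340, quoted here for illustration:

#include <stdbool.h>
#include <stdio.h>

enum {
	OPT_CHANGE_L  = 32,
	OPT_CONFIRM_L = 33,
	OPT_CHANGE_R  = 34,
	OPT_CONFIRM_R = 35,
};

static int genopt(bool needs_confirm, bool is_local)
{
	if (needs_confirm)
		return is_local ? OPT_CONFIRM_L : OPT_CONFIRM_R;
	return is_local ? OPT_CHANGE_L : OPT_CHANGE_R;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       genopt(false, true),	/* 32: Change L  */
	       genopt(true,  true),	/* 33: Confirm L */
	       genopt(false, false),	/* 34: Change R  */
	       genopt(true,  false));	/* 35: Confirm R */
	return 0;
}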
- */ -#define DCCP_OPTVAL_MAXLEN 6 - -void dccp_encode_value_var(const u64 value, u8 *to, const u8 len); -u64 dccp_decode_value_var(const u8 *bf, const u8 len); -u64 dccp_feat_nn_get(struct sock *sk, u8 feat); - -int dccp_insert_option_mandatory(struct sk_buff *skb); -int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, u8 *val, u8 len, - bool repeat_first); -#endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/input.c b/net/dccp/input.c deleted file mode 100644 index 2cbb757a894f..000000000000 --- a/net/dccp/input.c +++ /dev/null @@ -1,739 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/input.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#include <linux/dccp.h> -#include <linux/skbuff.h> -#include <linux/slab.h> - -#include <net/sock.h> - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" - -/* rate-limit for syncs in reply to sequence-invalid packets; RFC 4340, 7.5.4 */ -int sysctl_dccp_sync_ratelimit __read_mostly = HZ / 8; - -static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb) -{ - __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4); - __skb_queue_tail(&sk->sk_receive_queue, skb); - skb_set_owner_r(skb, sk); - sk->sk_data_ready(sk); -} - -static void dccp_fin(struct sock *sk, struct sk_buff *skb) -{ - /* - * On receiving Close/CloseReq, both RD/WR shutdown are performed. - * RFC 4340, 8.3 says that we MAY send further Data/DataAcks after - * receiving the closing segment, but there is no guarantee that such - * data will be processed at all. - */ - sk->sk_shutdown = SHUTDOWN_MASK; - sock_set_flag(sk, SOCK_DONE); - dccp_enqueue_skb(sk, skb); -} - -static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb) -{ - int queued = 0; - - switch (sk->sk_state) { - /* - * We ignore Close when received in one of the following states: - * - CLOSED (may be a late or duplicate packet) - * - PASSIVE_CLOSEREQ (the peer has sent a CloseReq earlier) - * - RESPOND (already handled by dccp_check_req) - */ - case DCCP_CLOSING: - /* - * Simultaneous-close: receiving a Close after sending one. This - * can happen if both client and server perform active-close and - * will result in an endless ping-pong of crossing and retrans- - * mitted Close packets, which only terminates when one of the - * nodes times out (min. 64 seconds). Quicker convergence can be - * achieved when one of the nodes acts as tie-breaker. - * This is ok as both ends are done with data transfer and each - * end is just waiting for the other to acknowledge termination. - */ - if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) - break; - fallthrough; - case DCCP_REQUESTING: - case DCCP_ACTIVE_CLOSEREQ: - dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); - dccp_done(sk); - break; - case DCCP_OPEN: - case DCCP_PARTOPEN: - /* Give waiting application a chance to read pending data */ - queued = 1; - dccp_fin(sk, skb); - dccp_set_state(sk, DCCP_PASSIVE_CLOSE); - fallthrough; - case DCCP_PASSIVE_CLOSE: - /* - * Retransmitted Close: we have already enqueued the first one. 
- */ - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - } - return queued; -} - -static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb) -{ - int queued = 0; - - /* - * Step 7: Check for unexpected packet types - * If (S.is_server and P.type == CloseReq) - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) { - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC); - return queued; - } - - /* Step 13: process relevant Client states < CLOSEREQ */ - switch (sk->sk_state) { - case DCCP_REQUESTING: - dccp_send_close(sk, 0); - dccp_set_state(sk, DCCP_CLOSING); - break; - case DCCP_OPEN: - case DCCP_PARTOPEN: - /* Give waiting application a chance to read pending data */ - queued = 1; - dccp_fin(sk, skb); - dccp_set_state(sk, DCCP_PASSIVE_CLOSEREQ); - fallthrough; - case DCCP_PASSIVE_CLOSEREQ: - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - } - return queued; -} - -static u16 dccp_reset_code_convert(const u8 code) -{ - static const u16 error_code[] = { - [DCCP_RESET_CODE_CLOSED] = 0, /* normal termination */ - [DCCP_RESET_CODE_UNSPECIFIED] = 0, /* nothing known */ - [DCCP_RESET_CODE_ABORTED] = ECONNRESET, - - [DCCP_RESET_CODE_NO_CONNECTION] = ECONNREFUSED, - [DCCP_RESET_CODE_CONNECTION_REFUSED] = ECONNREFUSED, - [DCCP_RESET_CODE_TOO_BUSY] = EUSERS, - [DCCP_RESET_CODE_AGGRESSION_PENALTY] = EDQUOT, - - [DCCP_RESET_CODE_PACKET_ERROR] = ENOMSG, - [DCCP_RESET_CODE_BAD_INIT_COOKIE] = EBADR, - [DCCP_RESET_CODE_BAD_SERVICE_CODE] = EBADRQC, - [DCCP_RESET_CODE_OPTION_ERROR] = EILSEQ, - [DCCP_RESET_CODE_MANDATORY_ERROR] = EOPNOTSUPP, - }; - - return code >= DCCP_MAX_RESET_CODES ? 0 : error_code[code]; -} - -static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb) -{ - u16 err = dccp_reset_code_convert(dccp_hdr_reset(skb)->dccph_reset_code); - - sk->sk_err = err; - - /* Queue the equivalent of TCP fin so that dccp_recvmsg exits the loop */ - dccp_fin(sk, skb); - - if (err && !sock_flag(sk, SOCK_DEAD)) - sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); - dccp_time_wait(sk, DCCP_TIME_WAIT, 0); -} - -static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; - - if (av == NULL) - return; - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); - dccp_ackvec_input(av, skb); -} - -static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) -{ - const struct dccp_sock *dp = dccp_sk(sk); - - /* Don't deliver to RX CCID when node has shut down read end. */ - if (!(sk->sk_shutdown & RCV_SHUTDOWN)) - ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); - /* - * Until the TX queue has been drained, we can not honour SHUT_WR, since - * we need received feedback as input to adjust congestion control. - */ - if (sk->sk_write_queue.qlen > 0 || !(sk->sk_shutdown & SEND_SHUTDOWN)) - ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); -} - -static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - struct dccp_sock *dp = dccp_sk(sk); - u64 lswl, lawl, seqno = DCCP_SKB_CB(skb)->dccpd_seq, - ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; - - /* - * Step 5: Prepare sequence numbers for Sync - * If P.type == Sync or P.type == SyncAck, - * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL, - * / * P is valid, so update sequence number variables - * accordingly. 
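The sequence-validity tests implemented below rest on 48-bit circular arithmetic (RFC 4340, sec. 7.5); the real helpers (between48(), ADD48(), dccp_delta_seqno()) are defined in net/dccp/dccp.h. A rough sketch of the underlying idea, with illustrative names: shifting the 48-bit number into the upper bits of a u64 lets plain two's-complement arithmetic produce the signed circular distance:

	static inline s64 delta_seqno48(u64 s1, u64 s2)
	{
		/* signed distance from s1 to s2, modulo 2^48 */
		return (s64)((s2 << 16) - (s1 << 16)) >> 16;
	}

	static inline bool in_window48(u64 seq, u64 lo, u64 hi)
	{
		/* lo <= seq <= hi on the 48-bit circle */
		return delta_seqno48(lo, seq) >= 0 && delta_seqno48(seq, hi) >= 0;
	}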
After this update, P will pass the tests - * in Step 6. A SyncAck is generated if necessary in - * Step 15 * / - * Update S.GSR, S.SWL, S.SWH - * Otherwise, - * Drop packet and return - */ - if (dh->dccph_type == DCCP_PKT_SYNC || - dh->dccph_type == DCCP_PKT_SYNCACK) { - if (between48(ackno, dp->dccps_awl, dp->dccps_awh) && - dccp_delta_seqno(dp->dccps_swl, seqno) >= 0) - dccp_update_gsr(sk, seqno); - else - return -1; - } - - /* - * Step 6: Check sequence numbers - * Let LSWL = S.SWL and LAWL = S.AWL - * If P.type == CloseReq or P.type == Close or P.type == Reset, - * LSWL := S.GSR + 1, LAWL := S.GAR - * If LSWL <= P.seqno <= S.SWH - * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH), - * Update S.GSR, S.SWL, S.SWH - * If P.type != Sync, - * Update S.GAR - */ - lswl = dp->dccps_swl; - lawl = dp->dccps_awl; - - if (dh->dccph_type == DCCP_PKT_CLOSEREQ || - dh->dccph_type == DCCP_PKT_CLOSE || - dh->dccph_type == DCCP_PKT_RESET) { - lswl = ADD48(dp->dccps_gsr, 1); - lawl = dp->dccps_gar; - } - - if (between48(seqno, lswl, dp->dccps_swh) && - (ackno == DCCP_PKT_WITHOUT_ACK_SEQ || - between48(ackno, lawl, dp->dccps_awh))) { - dccp_update_gsr(sk, seqno); - - if (dh->dccph_type != DCCP_PKT_SYNC && - ackno != DCCP_PKT_WITHOUT_ACK_SEQ && - after48(ackno, dp->dccps_gar)) - dp->dccps_gar = ackno; - } else { - unsigned long now = jiffies; - /* - * Step 6: Check sequence numbers - * Otherwise, - * If P.type == Reset, - * Send Sync packet acknowledging S.GSR - * Otherwise, - * Send Sync packet acknowledging P.seqno - * Drop packet and return - * - * These Syncs are rate-limited as per RFC 4340, 7.5.4: - * at most 1 / (dccp_sync_rate_limit * HZ) Syncs per second. - */ - if (time_before(now, (dp->dccps_rate_last + - sysctl_dccp_sync_ratelimit))) - return -1; - - DCCP_WARN("Step 6 failed for %s packet, " - "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and " - "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), " - "sending SYNC...\n", dccp_packet_name(dh->dccph_type), - (unsigned long long) lswl, (unsigned long long) seqno, - (unsigned long long) dp->dccps_swh, - (ackno == DCCP_PKT_WITHOUT_ACK_SEQ) ? 
"doesn't exist" - : "exists", - (unsigned long long) lawl, (unsigned long long) ackno, - (unsigned long long) dp->dccps_awh); - - dp->dccps_rate_last = now; - - if (dh->dccph_type == DCCP_PKT_RESET) - seqno = dp->dccps_gsr; - dccp_send_sync(sk, seqno, DCCP_PKT_SYNC); - return -1; - } - - return 0; -} - -static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - - switch (dccp_hdr(skb)->dccph_type) { - case DCCP_PKT_DATAACK: - case DCCP_PKT_DATA: - /* - * FIXME: schedule DATA_DROPPED (RFC 4340, 11.7.2) if and when - * - sk_shutdown == RCV_SHUTDOWN, use Code 1, "Not Listening" - * - sk_receive_queue is full, use Code 2, "Receive Buffer" - */ - dccp_enqueue_skb(sk, skb); - return 0; - case DCCP_PKT_ACK: - goto discard; - case DCCP_PKT_RESET: - /* - * Step 9: Process Reset - * If P.type == Reset, - * Tear down connection - * S.state := TIMEWAIT - * Set TIMEWAIT timer - * Drop packet and return - */ - dccp_rcv_reset(sk, skb); - return 0; - case DCCP_PKT_CLOSEREQ: - if (dccp_rcv_closereq(sk, skb)) - return 0; - goto discard; - case DCCP_PKT_CLOSE: - if (dccp_rcv_close(sk, skb)) - return 0; - goto discard; - case DCCP_PKT_REQUEST: - /* Step 7 - * or (S.is_server and P.type == Response) - * or (S.is_client and P.type == Request) - * or (S.state >= OPEN and P.type == Request - * and P.seqno >= S.OSR) - * or (S.state >= OPEN and P.type == Response - * and P.seqno >= S.OSR) - * or (S.state == RESPOND and P.type == Data), - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - if (dp->dccps_role != DCCP_ROLE_LISTEN) - goto send_sync; - goto check_seq; - case DCCP_PKT_RESPONSE: - if (dp->dccps_role != DCCP_ROLE_CLIENT) - goto send_sync; -check_seq: - if (dccp_delta_seqno(dp->dccps_osr, - DCCP_SKB_CB(skb)->dccpd_seq) >= 0) { -send_sync: - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_PKT_SYNC); - } - break; - case DCCP_PKT_SYNC: - dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_PKT_SYNCACK); - /* - * From RFC 4340, sec. 5.7 - * - * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets - * MAY have non-zero-length application data areas, whose - * contents receivers MUST ignore. 
- */ - goto discard; - } - - DCCP_INC_STATS(DCCP_MIB_INERRS); -discard: - __kfree_skb(skb); - return 0; -} - -int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned int len) -{ - if (dccp_check_seqno(sk, skb)) - goto discard; - - if (dccp_parse_options(sk, NULL, skb)) - return 1; - - dccp_handle_ackvec_processing(sk, skb); - dccp_deliver_input_to_ccids(sk, skb); - - return __dccp_rcv_established(sk, skb, dh, len); -discard: - __kfree_skb(skb); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_rcv_established); - -static int dccp_rcv_request_sent_state_process(struct sock *sk, - struct sk_buff *skb, - const struct dccp_hdr *dh, - const unsigned int len) -{ - /* - * Step 4: Prepare sequence numbers in REQUEST - * If S.state == REQUEST, - * If (P.type == Response or P.type == Reset) - * and S.AWL <= P.ackno <= S.AWH, - * / * Set sequence number variables corresponding to the - * other endpoint, so P will pass the tests in Step 6 * / - * Set S.GSR, S.ISR, S.SWL, S.SWH - * / * Response processing continues in Step 10; Reset - * processing continues in Step 9 * / - */ - if (dh->dccph_type == DCCP_PKT_RESPONSE) { - const struct inet_connection_sock *icsk = inet_csk(sk); - struct dccp_sock *dp = dccp_sk(sk); - long tstamp = dccp_timestamp(); - - if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, - dp->dccps_awl, dp->dccps_awh)) { - dccp_pr_debug("invalid ackno: S.AWL=%llu, " - "P.ackno=%llu, S.AWH=%llu\n", - (unsigned long long)dp->dccps_awl, - (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq, - (unsigned long long)dp->dccps_awh); - goto out_invalid_packet; - } - - /* - * If option processing (Step 8) failed, return 1 here so that - * dccp_v4_do_rcv() sends a Reset. The Reset code depends on - * the option type and is set in dccp_parse_options(). - */ - if (dccp_parse_options(sk, NULL, skb)) - return 1; - - /* Obtain usec RTT sample from SYN exchange (used by TFRC). */ - if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) - dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - - dp->dccps_options_received.dccpor_timestamp_echo)); - - /* Stop the REQUEST timer */ - inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); - WARN_ON(sk->sk_send_head == NULL); - kfree_skb(sk->sk_send_head); - sk->sk_send_head = NULL; - - /* - * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect - * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH - * is done as part of activating the feature values below, since - * these settings depend on the local/remote Sequence Window - * features, which were undefined or not confirmed until now. - */ - dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; - - dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); - - /* - * Step 10: Process REQUEST state (second part) - * If S.state == REQUEST, - * / * If we get here, P is a valid Response from the - * server (see Step 4), and we should move to - * PARTOPEN state. PARTOPEN means send an Ack, - * don't send Data packets, retransmit Acks - * periodically, and always include any Init Cookie - * from the Response * / - * S.state := PARTOPEN - * Set PARTOPEN timer - * Continue with S.state == PARTOPEN - * / * Step 12 will send the Ack completing the - * three-way handshake * / - */ - dccp_set_state(sk, DCCP_PARTOPEN); - - /* - * If feature negotiation was successful, activate features now; - * an activation failure means that this host could not activate - * one or more features (e.g. insufficient memory), which would - * leave at least one feature in an undefined state.
- */ -if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) - goto unable_to_proceed; - - /* Make sure socket is routed, for correct metrics. */ - icsk->icsk_af_ops->rebuild_header(sk); - - if (!sock_flag(sk, SOCK_DEAD)) { - sk->sk_state_change(sk); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); - } - - if (sk->sk_write_pending || inet_csk_in_pingpong_mode(sk) || - icsk->icsk_accept_queue.rskq_defer_accept) { - /* Save one ACK. Data will be ready after - * several ticks, if write_pending is set. - * - * It may be deleted, but with this feature tcpdumps - * look so _wonderfully_ clever, that I was not able - * to stand against the temptation 8) --ANK - */ - /* - * OK, in DCCP we can as well do a similar trick, it's - * even in the draft, but there is no need for us to - * schedule an ack here, as dccp_sendmsg does this for - * us, also stated in the draft. -acme - */ - __kfree_skb(skb); - return 0; - } - dccp_send_ack(sk); - return -1; - } - -out_invalid_packet: - /* dccp_v4_do_rcv will send a reset */ - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; - return 1; - -unable_to_proceed: - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; - /* - * We mark this socket as no longer usable, so that the loop in - * dccp_sendmsg() terminates and the application gets notified. - */ - dccp_set_state(sk, DCCP_CLOSED); - sk->sk_err = ECOMM; - return 1; -} - -static int dccp_rcv_respond_partopen_state_process(struct sock *sk, - struct sk_buff *skb, - const struct dccp_hdr *dh, - const unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - u32 sample = dp->dccps_options_received.dccpor_timestamp_echo; - int queued = 0; - - switch (dh->dccph_type) { - case DCCP_PKT_RESET: - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); - break; - case DCCP_PKT_DATA: - if (sk->sk_state == DCCP_RESPOND) - break; - fallthrough; - case DCCP_PKT_DATAACK: - case DCCP_PKT_ACK: - /* - * FIXME: we should be resetting the PARTOPEN (DELACK) timer - * here, but only if we haven't used the DELACK timer for - * something else, like sending a delayed ack for a TIMESTAMP - * echo, etc; for now we're not clearing it, as sending an extra - * ACK when there is nothing else to do in DELACK is not a big - * deal after all. - */ - - /* Stop the PARTOPEN timer */ - if (sk->sk_state == DCCP_PARTOPEN) - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); - - /* Obtain usec RTT sample from SYN exchange (used by TFRC). */ - if (likely(sample)) { - long delta = dccp_timestamp() - sample; - - dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * delta); - } - - dp->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq; - dccp_set_state(sk, DCCP_OPEN); - - if (dh->dccph_type == DCCP_PKT_DATAACK || - dh->dccph_type == DCCP_PKT_DATA) { - __dccp_rcv_established(sk, skb, dh, len); - queued = 1; /* packet was queued - (by __dccp_rcv_established) */ - } - break; - } - - return queued; -} - -int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, - struct dccp_hdr *dh, unsigned int len) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const int old_state = sk->sk_state; - bool acceptable; - int queued = 0; - - /* - * Step 3: Process LISTEN state - * - * If S.state == LISTEN, - * If P.type == Request or P contains a valid Init Cookie option, - * (* Must scan the packet's options to check for Init - * Cookies. Only Init Cookies are processed here, - * however; other options are processed in Step 8.
This - * scan need only be performed if the endpoint uses Init - * Cookies *) - * (* Generate a new socket and switch to that socket *) - * Set S := new socket for this port pair - * S.state = RESPOND - * Choose S.ISS (initial seqno) or set from Init Cookies - * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init - * Cookies Continue with S.state == RESPOND - * (* A Response packet will be generated in Step 11 *) - * Otherwise, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (sk->sk_state == DCCP_LISTEN) { - if (dh->dccph_type == DCCP_PKT_REQUEST) { - /* It is possible that we process SYN packets from backlog, - * so we need to make sure to disable BH and RCU right there. - */ - rcu_read_lock(); - local_bh_disable(); - acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0; - local_bh_enable(); - rcu_read_unlock(); - if (!acceptable) - return 1; - consume_skb(skb); - return 0; - } - if (dh->dccph_type == DCCP_PKT_RESET) - goto discard; - - /* Caller (dccp_v4_do_rcv) will send Reset */ - dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - return 1; - } else if (sk->sk_state == DCCP_CLOSED) { - dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - return 1; - } - - /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */ - if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb)) - goto discard; - - /* - * Step 7: Check for unexpected packet types - * If (S.is_server and P.type == Response) - * or (S.is_client and P.type == Request) - * or (S.state == RESPOND and P.type == Data), - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - if ((dp->dccps_role != DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_RESPONSE) || - (dp->dccps_role == DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_REQUEST) || - (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { - dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); - goto discard; - } - - /* Step 8: Process options */ - if (dccp_parse_options(sk, NULL, skb)) - return 1; - - /* - * Step 9: Process Reset - * If P.type == Reset, - * Tear down connection - * S.state := TIMEWAIT - * Set TIMEWAIT timer - * Drop packet and return - */ - if (dh->dccph_type == DCCP_PKT_RESET) { - dccp_rcv_reset(sk, skb); - return 0; - } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */ - if (dccp_rcv_closereq(sk, skb)) - return 0; - goto discard; - } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */ - if (dccp_rcv_close(sk, skb)) - return 0; - goto discard; - } - - switch (sk->sk_state) { - case DCCP_REQUESTING: - queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); - if (queued >= 0) - return queued; - - __kfree_skb(skb); - return 0; - - case DCCP_PARTOPEN: - /* Step 8: if using Ack Vectors, mark packet acknowledgeable */ - dccp_handle_ackvec_processing(sk, skb); - dccp_deliver_input_to_ccids(sk, skb); - fallthrough; - case DCCP_RESPOND: - queued = dccp_rcv_respond_partopen_state_process(sk, skb, - dh, len); - break; - } - - if (dh->dccph_type == DCCP_PKT_ACK || - dh->dccph_type == DCCP_PKT_DATAACK) { - switch (old_state) { - case DCCP_PARTOPEN: - sk->sk_state_change(sk); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); - break; - } - } else if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) { - dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNCACK); - goto discard; - } - - if (!queued) { -discard: - __kfree_skb(skb); - } - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_rcv_state_process); - -/** - * 
dccp_sample_rtt - Validate and finalise computation of RTT sample - * @sk: socket structure - * @delta: number of microseconds between packet and acknowledgment - * - * The routine is kept generic to work in different contexts. It should be - * called immediately when the ACK used for the RTT sample arrives. - */ -u32 dccp_sample_rtt(struct sock *sk, long delta) -{ - /* dccpor_elapsed_time is either zeroed out or set and > 0 */ - delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; - - if (unlikely(delta <= 0)) { - DCCP_WARN("unusable RTT sample %ld, using min\n", delta); - return DCCP_SANE_RTT_MIN; - } - if (unlikely(delta > DCCP_SANE_RTT_MAX)) { - DCCP_WARN("RTT sample %ld too large, using max\n", delta); - return DCCP_SANE_RTT_MAX; - } - - return delta; -} diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c deleted file mode 100644 index 5926159a6f20..000000000000 --- a/net/dccp/ipv4.c +++ /dev/null @@ -1,1105 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/ipv4.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#include <linux/dccp.h> -#include <linux/icmp.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/skbuff.h> -#include <linux/random.h> - -#include <net/icmp.h> -#include <net/inet_common.h> -#include <net/inet_hashtables.h> -#include <net/inet_sock.h> -#include <net/protocol.h> -#include <net/sock.h> -#include <net/timewait_sock.h> -#include <net/tcp_states.h> -#include <net/xfrm.h> -#include <net/secure_seq.h> -#include <net/netns/generic.h> -#include <net/rstreason.h> - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -struct dccp_v4_pernet { - struct sock *v4_ctl_sk; -}; - -static unsigned int dccp_v4_pernet_id __read_mostly; - -/* - * The per-net v4_ctl_sk socket is used for responding to - * the Out-of-the-blue (OOTB) packets. A control sock will be created - * for this socket at the initialization time. 
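Note the factor of 10 applied by the callers seen earlier (10 * delta): DCCP timestamps advance in units of 10 microseconds (RFC 4340, sec. 13.1), while dccp_sample_rtt() works in plain microseconds and clamps the result into [DCCP_SANE_RTT_MIN, DCCP_SANE_RTT_MAX] from dccp.h. A hypothetical caller-side helper, shown only to make the unit conversion explicit:

	/* Illustrative only -- not a kernel function. */
	static u32 syn_rtt_sample(struct sock *sk, long now_ticks, long echo_ticks)
	{
		/* tick delta (10us units) scaled to usec, then validated above */
		return dccp_sample_rtt(sk, 10 * (now_ticks - echo_ticks));
	}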
- */ - -int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) -{ - const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; - struct inet_sock *inet = inet_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - __be16 orig_sport, orig_dport; - __be32 daddr, nexthop; - struct flowi4 *fl4; - struct rtable *rt; - int err; - struct ip_options_rcu *inet_opt; - - dp->dccps_role = DCCP_ROLE_CLIENT; - - if (addr_len < sizeof(struct sockaddr_in)) - return -EINVAL; - - if (usin->sin_family != AF_INET) - return -EAFNOSUPPORT; - - nexthop = daddr = usin->sin_addr.s_addr; - - inet_opt = rcu_dereference_protected(inet->inet_opt, - lockdep_sock_is_held(sk)); - if (inet_opt != NULL && inet_opt->opt.srr) { - if (daddr == 0) - return -EINVAL; - nexthop = inet_opt->opt.faddr; - } - - orig_sport = inet->inet_sport; - orig_dport = usin->sin_port; - fl4 = &inet->cork.fl.u.ip4; - rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, - sk->sk_bound_dev_if, IPPROTO_DCCP, orig_sport, - orig_dport, sk); - if (IS_ERR(rt)) - return PTR_ERR(rt); - - if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { - ip_rt_put(rt); - return -ENETUNREACH; - } - - if (inet_opt == NULL || !inet_opt->opt.srr) - daddr = fl4->daddr; - - if (inet->inet_saddr == 0) { - err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); - if (err) { - ip_rt_put(rt); - return err; - } - } else { - sk_rcv_saddr_set(sk, inet->inet_saddr); - } - - inet->inet_dport = usin->sin_port; - sk_daddr_set(sk, daddr); - - inet_csk(sk)->icsk_ext_hdr_len = 0; - if (inet_opt) - inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; - /* - * Socket identity is still unknown (sport may be zero). - * However, we set the state to DCCP_REQUESTING and, without releasing the - * socket lock, select a source port, enter ourselves into the hash tables - * and complete initialization after this. - */ - dccp_set_state(sk, DCCP_REQUESTING); - err = inet_hash_connect(&dccp_death_row, sk); - if (err != 0) - goto failure; - - rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, - inet->inet_sport, inet->inet_dport, sk); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - rt = NULL; - goto failure; - } - /* OK, now commit destination to socket. */ - sk_setup_caps(sk, &rt->dst); - - dp->dccps_iss = secure_dccp_sequence_number(inet->inet_saddr, - inet->inet_daddr, - inet->inet_sport, - inet->inet_dport); - atomic_set(&inet->inet_id, get_random_u16()); - - err = dccp_connect(sk); - rt = NULL; - if (err != 0) - goto failure; -out: - return err; -failure: - /* - * This unhashes the socket and releases the local port, if necessary. - */ - dccp_set_state(sk, DCCP_CLOSED); - inet_bhash2_reset_saddr(sk); - ip_rt_put(rt); - sk->sk_route_caps = 0; - inet->inet_dport = 0; - goto out; -} -EXPORT_SYMBOL_GPL(dccp_v4_connect); - -/* - * This routine does path MTU discovery as defined in RFC 1191. - */ -static inline void dccp_do_pmtu_discovery(struct sock *sk, - const struct iphdr *iph, - u32 mtu) -{ - struct dst_entry *dst; - const struct inet_sock *inet = inet_sk(sk); - const struct dccp_sock *dp = dccp_sk(sk); - - /* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs - * sent out by Linux are always < 576 bytes, so they should go through - * unfragmented). - */ - if (sk->sk_state == DCCP_LISTEN) - return; - - dst = inet_csk_update_pmtu(sk, mtu); - if (!dst) - return; - - /* Something is about to be wrong... Remember the soft error - * in case this connection will not be able to recover.
- */ - if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) - WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); - - mtu = dst_mtu(dst); - - if (inet->pmtudisc != IP_PMTUDISC_DONT && - ip_sk_accept_pmtu(sk) && - inet_csk(sk)->icsk_pmtu_cookie > mtu) { - dccp_sync_mss(sk, mtu); - - /* - * From RFC 4340, sec. 14.1: - * - * DCCP-Sync packets are the best choice for upward - * probing, since DCCP-Sync probes do not risk application - * data loss. - */ - dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); - } /* else let the usual retransmit timer handle it */ -} - -static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk) -{ - struct dst_entry *dst = __sk_dst_check(sk, 0); - - if (dst) - dst->ops->redirect(dst, sk, skb); -} - -void dccp_req_err(struct sock *sk, u64 seq) -{ - struct request_sock *req = inet_reqsk(sk); - struct net *net = sock_net(sk); - - /* - * ICMPs are not backlogged, hence we cannot get an established - * socket here. - */ - if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) { - __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); - } else { - /* - * Still in RESPOND, just remove it silently. - * There is no good way to pass the error to the newly - * created socket, and POSIX does not want network - * errors returned from accept(). - */ - inet_csk_reqsk_queue_drop(req->rsk_listener, req); - } - reqsk_put(req); -} -EXPORT_SYMBOL(dccp_req_err); - -/* - * This routine is called by the ICMP module when it gets some sort of error - * condition. If err < 0 then the socket should be closed and the error - * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code. - * After adjustment, the header points to the first 8 bytes of the DCCP header. - * We need to find the appropriate port. - * - * The locking strategy used here is very "optimistic". When someone else - * accesses the socket, the ICMP is just dropped, and for some paths there is - * no check at all. A more general error queue to queue errors for later - * handling is probably better. - */ -static int dccp_v4_err(struct sk_buff *skb, u32 info) -{ - const struct iphdr *iph = (struct iphdr *)skb->data; - const u8 offset = iph->ihl << 2; - const struct dccp_hdr *dh; - struct dccp_sock *dp; - const int type = icmp_hdr(skb)->type; - const int code = icmp_hdr(skb)->code; - struct sock *sk; - __u64 seq; - int err; - struct net *net = dev_net(skb->dev); - - if (!pskb_may_pull(skb, offset + sizeof(*dh))) - return -EINVAL; - dh = (struct dccp_hdr *)(skb->data + offset); - if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) - return -EINVAL; - iph = (struct iphdr *)skb->data; - dh = (struct dccp_hdr *)(skb->data + offset); - - sk = __inet_lookup_established(net, &dccp_hashinfo, - iph->daddr, dh->dccph_dport, - iph->saddr, ntohs(dh->dccph_sport), - inet_iif(skb), 0); - if (!sk) { - __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return -ENOENT; - } - - if (sk->sk_state == DCCP_TIME_WAIT) { - inet_twsk_put(inet_twsk(sk)); - return 0; - } - seq = dccp_hdr_seq(dh); - if (sk->sk_state == DCCP_NEW_SYN_RECV) { - dccp_req_err(sk, seq); - return 0; - } - - bh_lock_sock(sk); - /* If too many ICMPs get dropped on busy - * servers, this needs to be solved differently.
- */ - if (sock_owned_by_user(sk)) - __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); - - if (sk->sk_state == DCCP_CLOSED) - goto out; - - dp = dccp_sk(sk); - if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && - !between48(seq, dp->dccps_awl, dp->dccps_awh)) { - __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); - goto out; - } - - switch (type) { - case ICMP_REDIRECT: - if (!sock_owned_by_user(sk)) - dccp_do_redirect(skb, sk); - goto out; - case ICMP_SOURCE_QUENCH: - /* Just silently ignore these. */ - goto out; - case ICMP_PARAMETERPROB: - err = EPROTO; - break; - case ICMP_DEST_UNREACH: - if (code > NR_ICMP_UNREACH) - goto out; - - if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC 1191) */ - if (!sock_owned_by_user(sk)) - dccp_do_pmtu_discovery(sk, iph, info); - goto out; - } - - err = icmp_err_convert[code].errno; - break; - case ICMP_TIME_EXCEEDED: - err = EHOSTUNREACH; - break; - default: - goto out; - } - - switch (sk->sk_state) { - case DCCP_REQUESTING: - case DCCP_RESPOND: - if (!sock_owned_by_user(sk)) { - __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); - sk->sk_err = err; - - sk_error_report(sk); - - dccp_done(sk); - } else { - WRITE_ONCE(sk->sk_err_soft, err); - } - goto out; - } - - /* If we've already connected, we will keep trying - * until we time out, or the user gives up. - * - * RFC 1122, 4.2.3.9 allows considering as hard errors - * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, - * but it is obsoleted by PMTU discovery). - * - * Note that in the modern internet, where routing is unreliable - * and broken firewalls sit in each dark corner sending random - * errors ordered by their masters, even these two messages finally lose - * their original sense (even Linux sends invalid PORT_UNREACHs) - * - * Now we are in compliance with RFCs. - * --ANK (980905) - */ - - if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) { - sk->sk_err = err; - sk_error_report(sk); - } else { /* Only an error on timeout */ - WRITE_ONCE(sk->sk_err_soft, err); - } -out: - bh_unlock_sock(sk); - sock_put(sk); - return 0; -} - -static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb, - __be32 src, __be32 dst) -{ - return csum_tcpudp_magic(src, dst, skb->len, IPPROTO_DCCP, skb->csum); -} - -void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb) -{ - const struct inet_sock *inet = inet_sk(sk); - struct dccp_hdr *dh = dccp_hdr(skb); - - dccp_csum_outgoing(skb); - dh->dccph_checksum = dccp_v4_csum_finish(skb, - inet->inet_saddr, - inet->inet_daddr); -} -EXPORT_SYMBOL_GPL(dccp_v4_send_check); - -static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb) -{ - return secure_dccp_sequence_number(ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr, - dccp_hdr(skb)->dccph_dport, - dccp_hdr(skb)->dccph_sport); -} - -/* - * The three-way handshake has completed - we got a valid ACK or DATAACK - - * now create the new socket.
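For orientation, the handshake these functions complete on the kernel side is driven from userspace through ordinary socket calls. A minimal client sketch (the address, port and service code are made-up examples; the constants come from the standard socket headers and <linux/dccp.h>):

	#include <arpa/inet.h>
	#include <linux/dccp.h>
	#include <netinet/in.h>
	#include <sys/socket.h>
	#include <unistd.h>

	int main(void)
	{
		struct sockaddr_in sa = { .sin_family = AF_INET,
					  .sin_port = htons(5001) };
		__u32 service = htonl(42);	/* example service code */
		int fd = socket(AF_INET, SOCK_DCCP, IPPROTO_DCCP);

		if (fd < 0)
			return 1;
		inet_pton(AF_INET, "192.0.2.1", &sa.sin_addr);
		/* the listener validates this in dccp_bad_service_code() */
		setsockopt(fd, SOL_DCCP, DCCP_SOCKOPT_SERVICE,
			   &service, sizeof(service));
		if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) == 0)
			write(fd, "hi", 2);
		close(fd);
		return 0;
	}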
- * - * This is the equivalent of TCP's tcp_v4_syn_recv_sock - */ -struct sock *dccp_v4_request_recv_sock(const struct sock *sk, - struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst, - struct request_sock *req_unhash, - bool *own_req) -{ - struct inet_request_sock *ireq; - struct inet_sock *newinet; - struct sock *newsk; - - if (sk_acceptq_is_full(sk)) - goto exit_overflow; - - newsk = dccp_create_openreq_child(sk, req, skb); - if (newsk == NULL) - goto exit_nonewsk; - - newinet = inet_sk(newsk); - ireq = inet_rsk(req); - sk_daddr_set(newsk, ireq->ir_rmt_addr); - sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); - newinet->inet_saddr = ireq->ir_loc_addr; - RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt)); - newinet->mc_index = inet_iif(skb); - newinet->mc_ttl = ip_hdr(skb)->ttl; - atomic_set(&newinet->inet_id, get_random_u16()); - - if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) - goto put_and_exit; - - sk_setup_caps(newsk, dst); - - dccp_sync_mss(newsk, dst_mtu(dst)); - - if (__inet_inherit_port(sk, newsk) < 0) - goto put_and_exit; - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); - if (*own_req) - ireq->ireq_opt = NULL; - else - newinet->inet_opt = NULL; - return newsk; - -exit_overflow: - __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); -exit_nonewsk: - dst_release(dst); -exit: - __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); - return NULL; -put_and_exit: - newinet->inet_opt = NULL; - inet_csk_prepare_forced_close(newsk); - dccp_done(newsk); - goto exit; -} -EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); - -static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, - struct sk_buff *skb) -{ - struct rtable *rt; - const struct iphdr *iph = ip_hdr(skb); - struct flowi4 fl4 = { - .flowi4_oif = inet_iif(skb), - .daddr = iph->saddr, - .saddr = iph->daddr, - .flowi4_tos = ip_sock_rt_tos(sk), - .flowi4_scope = ip_sock_rt_scope(sk), - .flowi4_proto = sk->sk_protocol, - .fl4_sport = dccp_hdr(skb)->dccph_dport, - .fl4_dport = dccp_hdr(skb)->dccph_sport, - }; - - security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); - rt = ip_route_output_flow(net, &fl4, sk); - if (IS_ERR(rt)) { - IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); - return NULL; - } - - return &rt->dst; -} - -static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req) -{ - int err = -1; - struct sk_buff *skb; - struct dst_entry *dst; - struct flowi4 fl4; - - dst = inet_csk_route_req(sk, &fl4, req); - if (dst == NULL) - goto out; - - skb = dccp_make_response(sk, dst, req); - if (skb != NULL) { - const struct inet_request_sock *ireq = inet_rsk(req); - struct dccp_hdr *dh = dccp_hdr(skb); - - dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr, - ireq->ir_rmt_addr); - rcu_read_lock(); - err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr, - ireq->ir_rmt_addr, - rcu_dereference(ireq->ireq_opt), - READ_ONCE(inet_sk(sk)->tos)); - rcu_read_unlock(); - err = net_xmit_eval(err); - } - -out: - dst_release(dst); - return err; -} - -static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb, - enum sk_rst_reason reason) -{ - int err; - const struct iphdr *rxiph; - struct sk_buff *skb; - struct dst_entry *dst; - struct net *net = dev_net(skb_dst(rxskb)->dev); - struct dccp_v4_pernet *pn; - struct sock *ctl_sk; - - /* Never send a reset in response to a reset. 
*/ - if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) - return; - - if (skb_rtable(rxskb)->rt_type != RTN_LOCAL) - return; - - pn = net_generic(net, dccp_v4_pernet_id); - ctl_sk = pn->v4_ctl_sk; - dst = dccp_v4_route_skb(net, ctl_sk, rxskb); - if (dst == NULL) - return; - - skb = dccp_ctl_make_reset(ctl_sk, rxskb); - if (skb == NULL) - goto out; - - rxiph = ip_hdr(rxskb); - dccp_hdr(skb)->dccph_checksum = dccp_v4_csum_finish(skb, rxiph->saddr, - rxiph->daddr); - skb_dst_set(skb, dst_clone(dst)); - - local_bh_disable(); - bh_lock_sock(ctl_sk); - err = ip_build_and_send_pkt(skb, ctl_sk, - rxiph->daddr, rxiph->saddr, NULL, - inet_sk(ctl_sk)->tos); - bh_unlock_sock(ctl_sk); - - if (net_xmit_eval(err) == 0) { - __DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - __DCCP_INC_STATS(DCCP_MIB_OUTRSTS); - } - local_bh_enable(); -out: - dst_release(dst); -} - -static void dccp_v4_reqsk_destructor(struct request_sock *req) -{ - dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); - kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); -} - -void dccp_syn_ack_timeout(const struct request_sock *req) -{ -} -EXPORT_SYMBOL(dccp_syn_ack_timeout); - -static struct request_sock_ops dccp_request_sock_ops __read_mostly = { - .family = PF_INET, - .obj_size = sizeof(struct dccp_request_sock), - .rtx_syn_ack = dccp_v4_send_response, - .send_ack = dccp_reqsk_send_ack, - .destructor = dccp_v4_reqsk_destructor, - .send_reset = dccp_v4_ctl_send_reset, - .syn_ack_timeout = dccp_syn_ack_timeout, -}; - -int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) -{ - struct inet_request_sock *ireq; - struct request_sock *req; - struct dccp_request_sock *dreq; - const __be32 service = dccp_hdr_request(skb)->dccph_req_service; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - - /* Never answer DCCP_PKT_REQUESTs sent to broadcast or multicast */ - if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) - return 0; /* discard, don't send a reset here */ - - if (dccp_bad_service_code(sk, service)) { - dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; - goto drop; - } - /* - * TW buckets are converted to open requests without - * limitations; they conserve resources and the peer is - * evidently a real one. - */ - dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; - if (inet_csk_reqsk_queue_is_full(sk)) - goto drop; - - if (sk_acceptq_is_full(sk)) - goto drop; - - req = inet_reqsk_alloc(&dccp_request_sock_ops, sk, true); - if (req == NULL) - goto drop; - - if (dccp_reqsk_init(req, dccp_sk(sk), skb)) - goto drop_and_free; - - dreq = dccp_rsk(req); - if (dccp_parse_options(sk, dreq, skb)) - goto drop_and_free; - - ireq = inet_rsk(req); - sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); - sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); - ireq->ir_mark = inet_request_mark(sk, skb); - ireq->ireq_family = AF_INET; - ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if); - - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_free; - - /* - * Step 3: Process LISTEN state - * - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie - * - * Setting S.SWL/S.SWH is deferred to dccp_create_openreq_child().
- */ - dreq->dreq_isr = dcb->dccpd_seq; - dreq->dreq_gsr = dreq->dreq_isr; - dreq->dreq_iss = dccp_v4_init_sequence(skb); - dreq->dreq_gss = dreq->dreq_iss; - dreq->dreq_service = service; - - if (dccp_v4_send_response(sk, req)) - goto drop_and_free; - - if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT))) - reqsk_free(req); - else - reqsk_put(req); - - return 0; - -drop_and_free: - reqsk_free(req); -drop: - __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); - return -1; -} -EXPORT_SYMBOL_GPL(dccp_v4_conn_request); - -int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_hdr *dh = dccp_hdr(skb); - - if (sk->sk_state == DCCP_OPEN) { /* Fast path */ - if (dccp_rcv_established(sk, skb, dh, skb->len)) - goto reset; - return 0; - } - - /* - * Step 3: Process LISTEN state - * If P.type == Request or P contains a valid Init Cookie option, - * (* Must scan the packet's options to check for Init - * Cookies. Only Init Cookies are processed here, - * however; other options are processed in Step 8. This - * scan need only be performed if the endpoint uses Init - * Cookies *) - * (* Generate a new socket and switch to that socket *) - * Set S := new socket for this port pair - * S.state = RESPOND - * Choose S.ISS (initial seqno) or set from Init Cookies - * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies - * Continue with S.state == RESPOND - * (* A Response packet will be generated in Step 11 *) - * Otherwise, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - * - * NOTE: the check for the packet types is done in - * dccp_rcv_state_process - */ - - if (dccp_rcv_state_process(sk, skb, dh, skb->len)) - goto reset; - return 0; - -reset: - dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - kfree_skb(skb); - return 0; -} -EXPORT_SYMBOL_GPL(dccp_v4_do_rcv); - -/** - * dccp_invalid_packet - check for malformed packets - * @skb: Packet to validate - * - * Implements RFC 4340, 8.5: Step 1: Check header basics - * Packets that fail these checks are ignored and do not receive Resets. 
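The partial-checksum rule that dccp_invalid_packet() enforces below follows RFC 4340, sec. 9.2: a CsCov of 0 means the checksum covers the whole packet; otherwise it covers the header area plus (CsCov - 1) 32-bit words of application data. A sketch consistent with the dccp_csum_coverage() helper used below (the real definition lives in net/dccp/dccp.h):

	static inline unsigned int csum_coverage(const struct sk_buff *skb)
	{
		const struct dccp_hdr *dh = dccp_hdr(skb);

		if (dh->dccph_cscov == 0)	/* 0: cover the entire packet */
			return skb->len;
		return (dh->dccph_doff + dh->dccph_cscov - 1) * sizeof(u32);
	}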
- */ -int dccp_invalid_packet(struct sk_buff *skb) -{ - const struct dccp_hdr *dh; - unsigned int cscov; - u8 dccph_doff; - - if (skb->pkt_type != PACKET_HOST) - return 1; - - /* If the packet is shorter than 12 bytes, drop packet and return */ - if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) { - DCCP_WARN("pskb_may_pull failed\n"); - return 1; - } - - dh = dccp_hdr(skb); - - /* If P.type is not understood, drop packet and return */ - if (dh->dccph_type >= DCCP_PKT_INVALID) { - DCCP_WARN("invalid packet type\n"); - return 1; - } - - /* - * If P.Data Offset is too small for packet type, drop packet and return - */ - dccph_doff = dh->dccph_doff; - if (dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) { - DCCP_WARN("P.Data Offset(%u) too small\n", dccph_doff); - return 1; - } - /* - * If P.Data Offset is too large for packet, drop packet and return - */ - if (!pskb_may_pull(skb, dccph_doff * sizeof(u32))) { - DCCP_WARN("P.Data Offset(%u) too large\n", dccph_doff); - return 1; - } - dh = dccp_hdr(skb); - /* - * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet - * has short sequence numbers), drop packet and return - */ - if ((dh->dccph_type < DCCP_PKT_DATA || - dh->dccph_type > DCCP_PKT_DATAACK) && dh->dccph_x == 0) { - DCCP_WARN("P.type (%s) not Data || [Data]Ack, while P.X == 0\n", - dccp_packet_name(dh->dccph_type)); - return 1; - } - - /* - * If P.CsCov is too large for the packet size, drop packet and return. - * This must come _before_ checksumming (not as RFC 4340 suggests). - */ - cscov = dccp_csum_coverage(skb); - if (cscov > skb->len) { - DCCP_WARN("P.CsCov %u exceeds packet length %d\n", - dh->dccph_cscov, skb->len); - return 1; - } - - /* If header checksum is incorrect, drop packet and return. - * (This step is completed in the AF-dependent functions.) */ - skb->csum = skb_checksum(skb, 0, cscov, 0); - - return 0; -} -EXPORT_SYMBOL_GPL(dccp_invalid_packet); - -/* this is called when real data arrives */ -static int dccp_v4_rcv(struct sk_buff *skb) -{ - const struct dccp_hdr *dh; - const struct iphdr *iph; - bool refcounted; - struct sock *sk; - int min_cov; - - /* Step 1: Check header basics */ - - if (dccp_invalid_packet(skb)) - goto discard_it; - - iph = ip_hdr(skb); - /* Step 1: If header checksum is incorrect, drop packet and return */ - if (dccp_v4_csum_finish(skb, iph->saddr, iph->daddr)) { - DCCP_WARN("dropped packet with invalid checksum\n"); - goto discard_it; - } - - dh = dccp_hdr(skb); - - DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh); - DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; - - dccp_pr_debug("%8.8s src=%pI4@%-5d dst=%pI4@%-5d seq=%llu", - dccp_packet_name(dh->dccph_type), - &iph->saddr, ntohs(dh->dccph_sport), - &iph->daddr, ntohs(dh->dccph_dport), - (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq); - - if (dccp_packet_without_ack(skb)) { - DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; - dccp_pr_debug_cat("\n"); - } else { - DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); - dccp_pr_debug_cat(", ack=%llu\n", (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq); - } - -lookup: - sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), - dh->dccph_sport, dh->dccph_dport, 0, &refcounted); - if (!sk) { - dccp_pr_debug("failed to look up flow ID in table and " - "get corresponding socket\n"); - goto no_dccp_socket; - } - - /* - * Step 2: - * ... 
or S.state == TIMEWAIT, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (sk->sk_state == DCCP_TIME_WAIT) { - dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n"); - inet_twsk_put(inet_twsk(sk)); - goto no_dccp_socket; - } - - if (sk->sk_state == DCCP_NEW_SYN_RECV) { - struct request_sock *req = inet_reqsk(sk); - struct sock *nsk; - - sk = req->rsk_listener; - if (unlikely(sk->sk_state != DCCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; - } - sock_hold(sk); - refcounted = true; - nsk = dccp_check_req(sk, skb, req); - if (!nsk) { - reqsk_put(req); - goto discard_and_relse; - } - if (nsk == sk) { - reqsk_put(req); - } else if (dccp_child_process(sk, nsk, skb)) { - dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - goto discard_and_relse; - } else { - sock_put(sk); - return 0; - } - } - /* - * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage - * o if MinCsCov = 0, only packets with CsCov = 0 are accepted - * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov - */ - min_cov = dccp_sk(sk)->dccps_pcrlen; - if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) { - dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n", - dh->dccph_cscov, min_cov); - /* FIXME: "Such packets SHOULD be reported using Data Dropped - * options (Section 11.7) with Drop Code 0, Protocol - * Constraints." */ - goto discard_and_relse; - } - - if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) - goto discard_and_relse; - nf_reset_ct(skb); - - return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted); - -no_dccp_socket: - if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto discard_it; - /* - * Step 2: - * If no socket ... - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (dh->dccph_type != DCCP_PKT_RESET) { - DCCP_SKB_CB(skb)->dccpd_reset_code = - DCCP_RESET_CODE_NO_CONNECTION; - dccp_v4_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - } - -discard_it: - kfree_skb(skb); - return 0; - -discard_and_relse: - if (refcounted) - sock_put(sk); - goto discard_it; -} - -static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { - .queue_xmit = ip_queue_xmit, - .send_check = dccp_v4_send_check, - .rebuild_header = inet_sk_rebuild_header, - .conn_request = dccp_v4_conn_request, - .syn_recv_sock = dccp_v4_request_recv_sock, - .net_header_len = sizeof(struct iphdr), - .setsockopt = ip_setsockopt, - .getsockopt = ip_getsockopt, - .addr2sockaddr = inet_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct sockaddr_in), -}; - -static int dccp_v4_init_sock(struct sock *sk) -{ - static __u8 dccp_v4_ctl_sock_initialized; - int err = dccp_init_sock(sk, dccp_v4_ctl_sock_initialized); - - if (err == 0) { - if (unlikely(!dccp_v4_ctl_sock_initialized)) - dccp_v4_ctl_sock_initialized = 1; - inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops; - } - - return err; -} - -static struct timewait_sock_ops dccp_timewait_sock_ops = { - .twsk_obj_size = sizeof(struct inet_timewait_sock), -}; - -static struct proto dccp_v4_prot = { - .name = "DCCP", - .owner = THIS_MODULE, - .close = dccp_close, - .connect = dccp_v4_connect, - .disconnect = dccp_disconnect, - .ioctl = dccp_ioctl, - .init = dccp_v4_init_sock, - .setsockopt = dccp_setsockopt, - .getsockopt = dccp_getsockopt, - .sendmsg = dccp_sendmsg, - .recvmsg = dccp_recvmsg, - .backlog_rcv = dccp_v4_do_rcv, - .hash = inet_hash, - .unhash = inet_unhash, - .accept = inet_csk_accept, - .get_port = inet_csk_get_port, 
- .shutdown = dccp_shutdown, - .destroy = dccp_destroy_sock, - .orphan_count = &dccp_orphan_count, - .max_header = MAX_DCCP_HEADER, - .obj_size = sizeof(struct dccp_sock), - .slab_flags = SLAB_TYPESAFE_BY_RCU, - .rsk_prot = &dccp_request_sock_ops, - .twsk_prot = &dccp_timewait_sock_ops, - .h.hashinfo = &dccp_hashinfo, -}; - -static const struct net_protocol dccp_v4_protocol = { - .handler = dccp_v4_rcv, - .err_handler = dccp_v4_err, - .no_policy = 1, - .icmp_strict_tag_validation = 1, -}; - -static const struct proto_ops inet_dccp_ops = { - .family = PF_INET, - .owner = THIS_MODULE, - .release = inet_release, - .bind = inet_bind, - .connect = inet_stream_connect, - .socketpair = sock_no_socketpair, - .accept = inet_accept, - .getname = inet_getname, - /* FIXME: work on tcp_poll to rename it to inet_csk_poll */ - .poll = dccp_poll, - .ioctl = inet_ioctl, - .gettstamp = sock_gettstamp, - /* FIXME: work on inet_listen to rename it to sock_common_listen */ - .listen = inet_dccp_listen, - .shutdown = inet_shutdown, - .setsockopt = sock_common_setsockopt, - .getsockopt = sock_common_getsockopt, - .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, - .mmap = sock_no_mmap, -}; - -static struct inet_protosw dccp_v4_protosw = { - .type = SOCK_DCCP, - .protocol = IPPROTO_DCCP, - .prot = &dccp_v4_prot, - .ops = &inet_dccp_ops, - .flags = INET_PROTOSW_ICSK, -}; - -static int __net_init dccp_v4_init_net(struct net *net) -{ - struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id); - - if (dccp_hashinfo.bhash == NULL) - return -ESOCKTNOSUPPORT; - - return inet_ctl_sock_create(&pn->v4_ctl_sk, PF_INET, - SOCK_DCCP, IPPROTO_DCCP, net); -} - -static void __net_exit dccp_v4_exit_net(struct net *net) -{ - struct dccp_v4_pernet *pn = net_generic(net, dccp_v4_pernet_id); - - inet_ctl_sock_destroy(pn->v4_ctl_sk); -} - -static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list) -{ - inet_twsk_purge(&dccp_hashinfo); -} - -static struct pernet_operations dccp_v4_ops = { - .init = dccp_v4_init_net, - .exit = dccp_v4_exit_net, - .exit_batch = dccp_v4_exit_batch, - .id = &dccp_v4_pernet_id, - .size = sizeof(struct dccp_v4_pernet), -}; - -static int __init dccp_v4_init(void) -{ - int err = proto_register(&dccp_v4_prot, 1); - - if (err) - goto out; - - inet_register_protosw(&dccp_v4_protosw); - - err = register_pernet_subsys(&dccp_v4_ops); - if (err) - goto out_destroy_ctl_sock; - - err = inet_add_protocol(&dccp_v4_protocol, IPPROTO_DCCP); - if (err) - goto out_proto_unregister; - -out: - return err; -out_proto_unregister: - unregister_pernet_subsys(&dccp_v4_ops); -out_destroy_ctl_sock: - inet_unregister_protosw(&dccp_v4_protosw); - proto_unregister(&dccp_v4_prot); - goto out; -} - -static void __exit dccp_v4_exit(void) -{ - inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP); - unregister_pernet_subsys(&dccp_v4_ops); - inet_unregister_protosw(&dccp_v4_protosw); - proto_unregister(&dccp_v4_prot); -} - -module_init(dccp_v4_init); -module_exit(dccp_v4_exit); - -/* - * __stringify doesn't like enums, so use the SOCK_DCCP (6) and IPPROTO_DCCP (33) - * values directly. Also cover the case where the protocol is not specified, - * i.e.
net-pf-PF_INET-proto-0-type-SOCK_DCCP - */ -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 33, 6); -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 0, 6); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>"); -MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c deleted file mode 100644 index d6649246188d..000000000000 --- a/net/dccp/ipv6.c +++ /dev/null @@ -1,1181 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * DCCP over IPv6 - * Linux INET6 implementation - * - * Based on net/dccp6/ipv6.c - * - * Arnaldo Carvalho de Melo <acme@ghostprotocols.net> - */ - -#include <linux/module.h> -#include <linux/random.h> -#include <linux/slab.h> -#include <linux/xfrm.h> -#include <linux/string.h> - -#include <net/addrconf.h> -#include <net/inet_common.h> -#include <net/inet_hashtables.h> -#include <net/inet_sock.h> -#include <net/inet6_connection_sock.h> -#include <net/inet6_hashtables.h> -#include <net/ip6_route.h> -#include <net/ipv6.h> -#include <net/protocol.h> -#include <net/transp_v6.h> -#include <net/ip6_checksum.h> -#include <net/xfrm.h> -#include <net/secure_seq.h> -#include <net/netns/generic.h> -#include <net/sock.h> -#include <net/rstreason.h> - -#include "dccp.h" -#include "ipv6.h" -#include "feat.h" - -struct dccp_v6_pernet { - struct sock *v6_ctl_sk; -}; - -static unsigned int dccp_v6_pernet_id __read_mostly; - -/* The per-net v6_ctl_sk is used for sending RSTs and ACKs */ - -static const struct inet_connection_sock_af_ops dccp_ipv6_mapped; -static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops; - -/* add pseudo-header to DCCP checksum stored in skb->csum */ -static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb, - const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - return csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_DCCP, skb->csum); -} - -static inline void dccp_v6_send_check(struct sock *sk, struct sk_buff *skb) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct dccp_hdr *dh = dccp_hdr(skb); - - dccp_csum_outgoing(skb); - dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &sk->sk_v6_daddr); -} - -static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb) -{ - return secure_dccpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, - ipv6_hdr(skb)->saddr.s6_addr32, - dccp_hdr(skb)->dccph_dport, - dccp_hdr(skb)->dccph_sport ); - -} - -static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) -{ - const struct ipv6hdr *hdr; - const struct dccp_hdr *dh; - struct dccp_sock *dp; - struct ipv6_pinfo *np; - struct sock *sk; - int err; - __u64 seq; - struct net *net = dev_net(skb->dev); - - if (!pskb_may_pull(skb, offset + sizeof(*dh))) - return -EINVAL; - dh = (struct dccp_hdr *)(skb->data + offset); - if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) - return -EINVAL; - hdr = (const struct ipv6hdr *)skb->data; - dh = (struct dccp_hdr *)(skb->data + offset); - - sk = __inet6_lookup_established(net, &dccp_hashinfo, - &hdr->daddr, dh->dccph_dport, - &hdr->saddr, ntohs(dh->dccph_sport), - inet6_iif(skb), 0); - - if (!sk) { - __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), - ICMP6_MIB_INERRORS); - return -ENOENT; - } - - if (sk->sk_state == DCCP_TIME_WAIT) { - inet_twsk_put(inet_twsk(sk)); - return 0; - } - seq = dccp_hdr_seq(dh); - if (sk->sk_state == DCCP_NEW_SYN_RECV) { - dccp_req_err(sk, seq); - return 0; - } - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) - __NET_INC_STATS(net, 
LINUX_MIB_LOCKDROPPEDICMPS); - - if (sk->sk_state == DCCP_CLOSED) - goto out; - - dp = dccp_sk(sk); - if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && - !between48(seq, dp->dccps_awl, dp->dccps_awh)) { - __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); - goto out; - } - - np = inet6_sk(sk); - - if (type == NDISC_REDIRECT) { - if (!sock_owned_by_user(sk)) { - struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); - - if (dst) - dst->ops->redirect(dst, sk, skb); - } - goto out; - } - - if (type == ICMPV6_PKT_TOOBIG) { - struct dst_entry *dst = NULL; - - if (!ip6_sk_accept_pmtu(sk)) - goto out; - - if (sock_owned_by_user(sk)) - goto out; - if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED)) - goto out; - - dst = inet6_csk_update_pmtu(sk, ntohl(info)); - if (!dst) - goto out; - - if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) - dccp_sync_mss(sk, dst_mtu(dst)); - goto out; - } - - icmpv6_err_convert(type, code, &err); - - /* Might be for a request_sock */ - switch (sk->sk_state) { - case DCCP_REQUESTING: - case DCCP_RESPOND: /* Cannot happen. - It can, if SYNs are crossed. --ANK */ - if (!sock_owned_by_user(sk)) { - __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); - sk->sk_err = err; - /* - * Wake people up to see the error - * (see connect in sock.c) - */ - sk_error_report(sk); - dccp_done(sk); - } else { - WRITE_ONCE(sk->sk_err_soft, err); - } - goto out; - } - - if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) { - sk->sk_err = err; - sk_error_report(sk); - } else { - WRITE_ONCE(sk->sk_err_soft, err); - } -out: - bh_unlock_sock(sk); - sock_put(sk); - return 0; -} - - -static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req) -{ - struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *np = inet6_sk(sk); - struct sk_buff *skb; - struct in6_addr *final_p, final; - struct flowi6 fl6; - int err = -1; - struct dst_entry *dst; - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_DCCP; - fl6.daddr = ireq->ir_v6_rmt_addr; - fl6.saddr = ireq->ir_v6_loc_addr; - fl6.flowlabel = 0; - fl6.flowi6_oif = ireq->ir_iif; - fl6.fl6_dport = ireq->ir_rmt_port; - fl6.fl6_sport = htons(ireq->ir_num); - security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); - - - rcu_read_lock(); - final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); - rcu_read_unlock(); - - dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); - if (IS_ERR(dst)) { - err = PTR_ERR(dst); - dst = NULL; - goto done; - } - - skb = dccp_make_response(sk, dst, req); - if (skb != NULL) { - struct dccp_hdr *dh = dccp_hdr(skb); - struct ipv6_txoptions *opt; - - dh->dccph_checksum = dccp_v6_csum_finish(skb, - &ireq->ir_v6_loc_addr, - &ireq->ir_v6_rmt_addr); - fl6.daddr = ireq->ir_v6_rmt_addr; - rcu_read_lock(); - opt = ireq->ipv6_opt; - if (!opt) - opt = rcu_dereference(np->opt); - err = ip6_xmit(sk, skb, &fl6, READ_ONCE(sk->sk_mark), opt, - np->tclass, READ_ONCE(sk->sk_priority)); - rcu_read_unlock(); - err = net_xmit_eval(err); - } - -done: - dst_release(dst); - return err; -} - -static void dccp_v6_reqsk_destructor(struct request_sock *req) -{ - dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); - kfree(inet_rsk(req)->ipv6_opt); - kfree_skb(inet_rsk(req)->pktopts); -} - -static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb, - enum sk_rst_reason reason) -{ - const struct ipv6hdr *rxip6h; - struct sk_buff *skb; - struct flowi6 fl6; - struct net *net = dev_net(skb_dst(rxskb)->dev); - struct dccp_v6_pernet *pn; - struct sock
*ctl_sk; - struct dst_entry *dst; - - if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) - return; - - if (!ipv6_unicast_destination(rxskb)) - return; - - pn = net_generic(net, dccp_v6_pernet_id); - ctl_sk = pn->v6_ctl_sk; - skb = dccp_ctl_make_reset(ctl_sk, rxskb); - if (skb == NULL) - return; - - rxip6h = ipv6_hdr(rxskb); - dccp_hdr(skb)->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr, - &rxip6h->daddr); - - memset(&fl6, 0, sizeof(fl6)); - fl6.daddr = rxip6h->saddr; - fl6.saddr = rxip6h->daddr; - - fl6.flowi6_proto = IPPROTO_DCCP; - fl6.flowi6_oif = inet6_iif(rxskb); - fl6.fl6_dport = dccp_hdr(skb)->dccph_dport; - fl6.fl6_sport = dccp_hdr(skb)->dccph_sport; - security_skb_classify_flow(rxskb, flowi6_to_flowi_common(&fl6)); - - /* sk = NULL, but it is safe for now. RST socket required. */ - dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); - if (!IS_ERR(dst)) { - skb_dst_set(skb, dst); - ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0, 0); - DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - DCCP_INC_STATS(DCCP_MIB_OUTRSTS); - return; - } - - kfree_skb(skb); -} - -static struct request_sock_ops dccp6_request_sock_ops = { - .family = AF_INET6, - .obj_size = sizeof(struct dccp6_request_sock), - .rtx_syn_ack = dccp_v6_send_response, - .send_ack = dccp_reqsk_send_ack, - .destructor = dccp_v6_reqsk_destructor, - .send_reset = dccp_v6_ctl_send_reset, - .syn_ack_timeout = dccp_syn_ack_timeout, -}; - -static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) -{ - struct request_sock *req; - struct dccp_request_sock *dreq; - struct inet_request_sock *ireq; - struct ipv6_pinfo *np = inet6_sk(sk); - const __be32 service = dccp_hdr_request(skb)->dccph_req_service; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - - if (skb->protocol == htons(ETH_P_IP)) - return dccp_v4_conn_request(sk, skb); - - if (!ipv6_unicast_destination(skb)) - return 0; /* discard, don't send a reset here */ - - if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { - __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); - return 0; - } - - if (dccp_bad_service_code(sk, service)) { - dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; - goto drop; - } - /* - * There are no SYN attacks on IPv6, yet... - */ - dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; - if (inet_csk_reqsk_queue_is_full(sk)) - goto drop; - - if (sk_acceptq_is_full(sk)) - goto drop; - - req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk, true); - if (req == NULL) - goto drop; - - if (dccp_reqsk_init(req, dccp_sk(sk), skb)) - goto drop_and_free; - - dreq = dccp_rsk(req); - if (dccp_parse_options(sk, dreq, skb)) - goto drop_and_free; - - ireq = inet_rsk(req); - ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; - ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; - ireq->ireq_family = AF_INET6; - ireq->ir_mark = inet_request_mark(sk, skb); - - if (security_inet_conn_request(sk, skb, req)) - goto drop_and_free; - - if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) || - np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || - np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { - refcount_inc(&skb->users); - ireq->pktopts = skb; - } - ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if); - - /* So that link-locals have meaning */ - if (!ireq->ir_iif && - ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) - ireq->ir_iif = inet6_iif(skb); - - /* - * Step 3: Process LISTEN state - * - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie - * - * Setting S.SWL/S.SWH is deferred to dccp_create_openreq_child().
- */ - dreq->dreq_isr = dcb->dccpd_seq; - dreq->dreq_gsr = dreq->dreq_isr; - dreq->dreq_iss = dccp_v6_init_sequence(skb); - dreq->dreq_gss = dreq->dreq_iss; - dreq->dreq_service = service; - - if (dccp_v6_send_response(sk, req)) - goto drop_and_free; - - if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT))) - reqsk_free(req); - else - reqsk_put(req); - - return 0; - -drop_and_free: - reqsk_free(req); -drop: - __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); - return -1; -} - -static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, - struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst, - struct request_sock *req_unhash, - bool *own_req) -{ - struct inet_request_sock *ireq = inet_rsk(req); - struct ipv6_pinfo *newnp; - const struct ipv6_pinfo *np = inet6_sk(sk); - struct ipv6_txoptions *opt; - struct inet_sock *newinet; - struct dccp6_sock *newdp6; - struct sock *newsk; - - if (skb->protocol == htons(ETH_P_IP)) { - /* - * v6 mapped - */ - newsk = dccp_v4_request_recv_sock(sk, skb, req, dst, - req_unhash, own_req); - if (newsk == NULL) - return NULL; - - newdp6 = (struct dccp6_sock *)newsk; - newinet = inet_sk(newsk); - newinet->pinet6 = &newdp6->inet6; - newnp = inet6_sk(newsk); - - memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - - newnp->saddr = newsk->sk_v6_rcv_saddr; - - inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped; - newsk->sk_backlog_rcv = dccp_v4_do_rcv; - newnp->pktoptions = NULL; - newnp->opt = NULL; - newnp->ipv6_mc_list = NULL; - newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; - newnp->mcast_oif = inet_iif(skb); - newnp->mcast_hops = ip_hdr(skb)->ttl; - - /* - * No need to charge this sock to the relevant IPv6 refcnt debug socks count - * here, dccp_create_openreq_child now does this for us, see the comment in - * that function for the gory details. -acme - */ - - /* It is tricky place. Until this moment IPv4 tcp - worked with IPv6 icsk.icsk_af_ops. - Sync it now. - */ - dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); - - return newsk; - } - - - if (sk_acceptq_is_full(sk)) - goto out_overflow; - - if (!dst) { - struct flowi6 fl6; - - dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_DCCP); - if (!dst) - goto out; - } - - newsk = dccp_create_openreq_child(sk, req, skb); - if (newsk == NULL) - goto out_nonewsk; - - /* - * No need to charge this sock to the relevant IPv6 refcnt debug socks - * count here, dccp_create_openreq_child now does this for us, see the - * comment in that function for the gory details. -acme - */ - - ip6_dst_store(newsk, dst, NULL, NULL); - newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | - NETIF_F_TSO); - newdp6 = (struct dccp6_sock *)newsk; - newinet = inet_sk(newsk); - newinet->pinet6 = &newdp6->inet6; - newnp = inet6_sk(newsk); - - memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - - newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; - newnp->saddr = ireq->ir_v6_loc_addr; - newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; - newsk->sk_bound_dev_if = ireq->ir_iif; - - /* Now IPv6 options... - - First: no IPv4 options. 
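
/*
 * User-space view of the rxopt bits cloned just below: ancillary-data
 * requests enabled on the listener via setsockopt() are what
 * newnp->rxopt.all carries over to the accepted socket. A sketch only;
 * error handling is trimmed and the fd is assumed to be an AF_INET6
 * DCCP listener.
 */
#include <netinet/in.h>
#include <sys/socket.h>

static int enable_rx_opts(int listen_fd)
{
	int on = 1;

	/* sets np->rxopt.bits.rxinfo on the listening socket */
	if (setsockopt(listen_fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
		       &on, sizeof(on)) < 0)
		return -1;
	/* sets np->rxopt.bits.rxhlim */
	return setsockopt(listen_fd, IPPROTO_IPV6, IPV6_RECVHOPLIMIT,
			  &on, sizeof(on));
}
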
- */ - newinet->inet_opt = NULL; - - /* Clone RX bits */ - newnp->rxopt.all = np->rxopt.all; - - newnp->ipv6_mc_list = NULL; - newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; - newnp->pktoptions = NULL; - newnp->opt = NULL; - newnp->mcast_oif = inet6_iif(skb); - newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; - - /* - * Clone native IPv6 options from listening socket (if any) - * - * Yes, keeping reference count would be much more clever, but we make - * one more one thing there: reattach optmem to newsk. - */ - opt = ireq->ipv6_opt; - if (!opt) - opt = rcu_dereference(np->opt); - if (opt) { - opt = ipv6_dup_options(newsk, opt); - RCU_INIT_POINTER(newnp->opt, opt); - } - inet_csk(newsk)->icsk_ext_hdr_len = 0; - if (opt) - inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + - opt->opt_flen; - - dccp_sync_mss(newsk, dst_mtu(dst)); - - newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; - newinet->inet_rcv_saddr = LOOPBACK4_IPV6; - - if (__inet_inherit_port(sk, newsk) < 0) { - inet_csk_prepare_forced_close(newsk); - dccp_done(newsk); - goto out; - } - *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); - /* Clone pktoptions received with SYN, if we own the req */ - if (*own_req && ireq->pktopts) { - newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); - consume_skb(ireq->pktopts); - ireq->pktopts = NULL; - } - - return newsk; - -out_overflow: - __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); -out_nonewsk: - dst_release(dst); -out: - __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); - return NULL; -} - -/* The socket must have it's spinlock held when we get - * here. - * - * We have a potential double-lock case here, so even when - * doing backlog processing we use the BH locking scheme. - * This is because we cannot sleep with the original spinlock - * held. - */ -static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct sk_buff *opt_skb = NULL; - - /* Imagine: socket is IPv6. IPv4 packet arrives, - goes to IPv4 receive handler and backlogged. - From backlog it always goes here. Kerboom... - Fortunately, dccp_rcv_established and rcv_established - handle them correctly, but it is not case with - dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK - */ - - if (skb->protocol == htons(ETH_P_IP)) - return dccp_v4_do_rcv(sk, skb); - - if (sk_filter(sk, skb)) - goto discard; - - /* - * socket locking is here for SMP purposes as backlog rcv is currently - * called with bh processing disabled. - */ - - /* Do Stevens' IPV6_PKTOPTIONS. - - Yes, guys, it is the only place in our code, where we - may make it not affecting IPv4. - The rest of code is protocol independent, - and I do not like idea to uglify IPv4. - - Actually, all the idea behind IPV6_PKTOPTIONS - looks not very well thought. For now we latch - options, received in the last packet, enqueued - by tcp. Feel free to propose better solution. - --ANK (980728) - */ - if (np->rxopt.all && sk->sk_state != DCCP_LISTEN) - opt_skb = skb_clone_and_charge_r(skb, sk); - - if (sk->sk_state == DCCP_OPEN) { /* Fast path */ - if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len)) - goto reset; - if (opt_skb) - goto ipv6_pktoptions; - return 0; - } - - /* - * Step 3: Process LISTEN state - * If S.state == LISTEN, - * If P.type == Request or P contains a valid Init Cookie option, - * (* Must scan the packet's options to check for Init - * Cookies. Only Init Cookies are processed here, - * however; other options are processed in Step 8. 
This - * scan need only be performed if the endpoint uses Init - * Cookies *) - * (* Generate a new socket and switch to that socket *) - * Set S := new socket for this port pair - * S.state = RESPOND - * Choose S.ISS (initial seqno) or set from Init Cookies - * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies - * Continue with S.state == RESPOND - * (* A Response packet will be generated in Step 11 *) - * Otherwise, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - * - * NOTE: the check for the packet types is done in - * dccp_rcv_state_process - */ - - if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len)) - goto reset; - if (opt_skb) - goto ipv6_pktoptions; - return 0; - -reset: - dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); -discard: - if (opt_skb != NULL) - __kfree_skb(opt_skb); - kfree_skb(skb); - return 0; - -/* Handling IPV6_PKTOPTIONS skb the similar - * way it's done for net/ipv6/tcp_ipv6.c - */ -ipv6_pktoptions: - if (!((1 << sk->sk_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) { - if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) - WRITE_ONCE(np->mcast_oif, inet6_iif(opt_skb)); - if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) - WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit); - if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) - np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); - if (inet6_test_bit(REPFLOW, sk)) - np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); - if (ipv6_opt_accepted(sk, opt_skb, - &DCCP_SKB_CB(opt_skb)->header.h6)) { - memmove(IP6CB(opt_skb), - &DCCP_SKB_CB(opt_skb)->header.h6, - sizeof(struct inet6_skb_parm)); - opt_skb = xchg(&np->pktoptions, opt_skb); - } else { - __kfree_skb(opt_skb); - opt_skb = xchg(&np->pktoptions, NULL); - } - } - - kfree_skb(opt_skb); - return 0; -} - -static int dccp_v6_rcv(struct sk_buff *skb) -{ - const struct dccp_hdr *dh; - bool refcounted; - struct sock *sk; - int min_cov; - - /* Step 1: Check header basics */ - - if (dccp_invalid_packet(skb)) - goto discard_it; - - /* Step 1: If header checksum is incorrect, drop packet and return. */ - if (dccp_v6_csum_finish(skb, &ipv6_hdr(skb)->saddr, - &ipv6_hdr(skb)->daddr)) { - DCCP_WARN("dropped packet with invalid checksum\n"); - goto discard_it; - } - - dh = dccp_hdr(skb); - - DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh); - DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; - - if (dccp_packet_without_ack(skb)) - DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; - else - DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); - -lookup: - sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), - dh->dccph_sport, dh->dccph_dport, - inet6_iif(skb), 0, &refcounted); - if (!sk) { - dccp_pr_debug("failed to look up flow ID in table and " - "get corresponding socket\n"); - goto no_dccp_socket; - } - - /* - * Step 2: - * ... 
or S.state == TIMEWAIT, - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (sk->sk_state == DCCP_TIME_WAIT) { - dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n"); - inet_twsk_put(inet_twsk(sk)); - goto no_dccp_socket; - } - - if (sk->sk_state == DCCP_NEW_SYN_RECV) { - struct request_sock *req = inet_reqsk(sk); - struct sock *nsk; - - sk = req->rsk_listener; - if (unlikely(sk->sk_state != DCCP_LISTEN)) { - inet_csk_reqsk_queue_drop_and_put(sk, req); - goto lookup; - } - sock_hold(sk); - refcounted = true; - nsk = dccp_check_req(sk, skb, req); - if (!nsk) { - reqsk_put(req); - goto discard_and_relse; - } - if (nsk == sk) { - reqsk_put(req); - } else if (dccp_child_process(sk, nsk, skb)) { - dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - goto discard_and_relse; - } else { - sock_put(sk); - return 0; - } - } - /* - * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage - * o if MinCsCov = 0, only packets with CsCov = 0 are accepted - * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov - */ - min_cov = dccp_sk(sk)->dccps_pcrlen; - if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) { - dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n", - dh->dccph_cscov, min_cov); - /* FIXME: send Data Dropped option (see also dccp_v4_rcv) */ - goto discard_and_relse; - } - - if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) - goto discard_and_relse; - nf_reset_ct(skb); - - return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, - refcounted) ? -1 : 0; - -no_dccp_socket: - if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto discard_it; - /* - * Step 2: - * If no socket ... - * Generate Reset(No Connection) unless P.type == Reset - * Drop packet and return - */ - if (dh->dccph_type != DCCP_PKT_RESET) { - DCCP_SKB_CB(skb)->dccpd_reset_code = - DCCP_RESET_CODE_NO_CONNECTION; - dccp_v6_ctl_send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - } - -discard_it: - kfree_skb(skb); - return 0; - -discard_and_relse: - if (refcounted) - sock_put(sk); - goto discard_it; -} - -static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, - int addr_len) -{ - struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr; - struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - struct in6_addr *saddr = NULL, *final_p, final; - struct ipv6_txoptions *opt; - struct flowi6 fl6; - struct dst_entry *dst; - int addr_type; - int err; - - dp->dccps_role = DCCP_ROLE_CLIENT; - - if (addr_len < SIN6_LEN_RFC2133) - return -EINVAL; - - if (usin->sin6_family != AF_INET6) - return -EAFNOSUPPORT; - - memset(&fl6, 0, sizeof(fl6)); - - if (inet6_test_bit(SNDFLOW, sk)) { - fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; - IP6_ECN_flow_init(fl6.flowlabel); - if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) { - struct ip6_flowlabel *flowlabel; - flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); - if (IS_ERR(flowlabel)) - return -EINVAL; - fl6_sock_release(flowlabel); - } - } - /* - * connect() to INADDR_ANY means loopback (BSD'ism). - */ - if (ipv6_addr_any(&usin->sin6_addr)) - usin->sin6_addr.s6_addr[15] = 1; - - addr_type = ipv6_addr_type(&usin->sin6_addr); - - if (addr_type & IPV6_ADDR_MULTICAST) - return -ENETUNREACH; - - if (addr_type & IPV6_ADDR_LINKLOCAL) { - if (addr_len >= sizeof(struct sockaddr_in6) && - usin->sin6_scope_id) { - /* If interface is set while binding, indices - * must coincide. 
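
/*
 * A user-space sketch of the link-local rule enforced here: connect()
 * to an fe80::/10 peer is ambiguous without an interface, so
 * sin6_scope_id must name one. Assumes a kernel that still provides
 * SOCK_DCCP/IPPROTO_DCCP; error handling is trimmed for brevity.
 */
#include <arpa/inet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>

static int dccp_connect_linklocal(const char *addr, const char *ifname,
				  uint16_t port)
{
	struct sockaddr_in6 sin6;
	int fd = socket(AF_INET6, SOCK_DCCP, IPPROTO_DCCP);

	if (fd < 0)
		return -1;
	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_port = htons(port);
	inet_pton(AF_INET6, addr, &sin6.sin6_addr);
	sin6.sin6_scope_id = if_nametoindex(ifname);	/* bind the scope */
	return connect(fd, (struct sockaddr *)&sin6, sizeof(sin6));
}
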
- */ - if (sk->sk_bound_dev_if && - sk->sk_bound_dev_if != usin->sin6_scope_id) - return -EINVAL; - - sk->sk_bound_dev_if = usin->sin6_scope_id; - } - - /* Connect to link-local address requires an interface */ - if (!sk->sk_bound_dev_if) - return -EINVAL; - } - - sk->sk_v6_daddr = usin->sin6_addr; - np->flow_label = fl6.flowlabel; - - /* - * DCCP over IPv4 - */ - if (addr_type == IPV6_ADDR_MAPPED) { - u32 exthdrlen = icsk->icsk_ext_hdr_len; - struct sockaddr_in sin; - - net_dbg_ratelimited("connect: ipv4 mapped\n"); - - if (ipv6_only_sock(sk)) - return -ENETUNREACH; - - sin.sin_family = AF_INET; - sin.sin_port = usin->sin6_port; - sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; - - icsk->icsk_af_ops = &dccp_ipv6_mapped; - sk->sk_backlog_rcv = dccp_v4_do_rcv; - - err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); - if (err) { - icsk->icsk_ext_hdr_len = exthdrlen; - icsk->icsk_af_ops = &dccp_ipv6_af_ops; - sk->sk_backlog_rcv = dccp_v6_do_rcv; - goto failure; - } - np->saddr = sk->sk_v6_rcv_saddr; - return err; - } - - if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) - saddr = &sk->sk_v6_rcv_saddr; - - fl6.flowi6_proto = IPPROTO_DCCP; - fl6.daddr = sk->sk_v6_daddr; - fl6.saddr = saddr ? *saddr : np->saddr; - fl6.flowi6_oif = sk->sk_bound_dev_if; - fl6.fl6_dport = usin->sin6_port; - fl6.fl6_sport = inet->inet_sport; - security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); - - opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); - final_p = fl6_update_dst(&fl6, opt, &final); - - dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); - if (IS_ERR(dst)) { - err = PTR_ERR(dst); - goto failure; - } - - if (saddr == NULL) { - saddr = &fl6.saddr; - - err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); - if (err) - goto failure; - } - - /* set the source address */ - np->saddr = *saddr; - inet->inet_rcv_saddr = LOOPBACK4_IPV6; - - ip6_dst_store(sk, dst, NULL, NULL); - - icsk->icsk_ext_hdr_len = 0; - if (opt) - icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; - - inet->inet_dport = usin->sin6_port; - - dccp_set_state(sk, DCCP_REQUESTING); - err = inet6_hash_connect(&dccp_death_row, sk); - if (err) - goto late_failure; - - dp->dccps_iss = secure_dccpv6_sequence_number(np->saddr.s6_addr32, - sk->sk_v6_daddr.s6_addr32, - inet->inet_sport, - inet->inet_dport); - err = dccp_connect(sk); - if (err) - goto late_failure; - - return 0; - -late_failure: - dccp_set_state(sk, DCCP_CLOSED); - inet_bhash2_reset_saddr(sk); - __sk_dst_reset(sk); -failure: - inet->inet_dport = 0; - sk->sk_route_caps = 0; - return err; -} - -static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = { - .queue_xmit = inet6_csk_xmit, - .send_check = dccp_v6_send_check, - .rebuild_header = inet6_sk_rebuild_header, - .conn_request = dccp_v6_conn_request, - .syn_recv_sock = dccp_v6_request_recv_sock, - .net_header_len = sizeof(struct ipv6hdr), - .setsockopt = ipv6_setsockopt, - .getsockopt = ipv6_getsockopt, - .addr2sockaddr = inet6_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct sockaddr_in6), -}; - -/* - * DCCP over IPv4 via INET6 API - */ -static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = { - .queue_xmit = ip_queue_xmit, - .send_check = dccp_v4_send_check, - .rebuild_header = inet_sk_rebuild_header, - .conn_request = dccp_v6_conn_request, - .syn_recv_sock = dccp_v6_request_recv_sock, - .net_header_len = sizeof(struct iphdr), - .setsockopt = ipv6_setsockopt, - .getsockopt = ipv6_getsockopt, - .addr2sockaddr = inet6_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct 
sockaddr_in6), -}; - -static void dccp_v6_sk_destruct(struct sock *sk) -{ - dccp_destruct_common(sk); - inet6_sock_destruct(sk); -} - -/* NOTE: A lot of things set to zero explicitly by call to - * sk_alloc() so need not be done here. - */ -static int dccp_v6_init_sock(struct sock *sk) -{ - static __u8 dccp_v6_ctl_sock_initialized; - int err = dccp_init_sock(sk, dccp_v6_ctl_sock_initialized); - - if (err == 0) { - if (unlikely(!dccp_v6_ctl_sock_initialized)) - dccp_v6_ctl_sock_initialized = 1; - inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops; - sk->sk_destruct = dccp_v6_sk_destruct; - } - - return err; -} - -static struct timewait_sock_ops dccp6_timewait_sock_ops = { - .twsk_obj_size = sizeof(struct dccp6_timewait_sock), -}; - -static struct proto dccp_v6_prot = { - .name = "DCCPv6", - .owner = THIS_MODULE, - .close = dccp_close, - .connect = dccp_v6_connect, - .disconnect = dccp_disconnect, - .ioctl = dccp_ioctl, - .init = dccp_v6_init_sock, - .setsockopt = dccp_setsockopt, - .getsockopt = dccp_getsockopt, - .sendmsg = dccp_sendmsg, - .recvmsg = dccp_recvmsg, - .backlog_rcv = dccp_v6_do_rcv, - .hash = inet6_hash, - .unhash = inet_unhash, - .accept = inet_csk_accept, - .get_port = inet_csk_get_port, - .shutdown = dccp_shutdown, - .destroy = dccp_destroy_sock, - .orphan_count = &dccp_orphan_count, - .max_header = MAX_DCCP_HEADER, - .obj_size = sizeof(struct dccp6_sock), - .ipv6_pinfo_offset = offsetof(struct dccp6_sock, inet6), - .slab_flags = SLAB_TYPESAFE_BY_RCU, - .rsk_prot = &dccp6_request_sock_ops, - .twsk_prot = &dccp6_timewait_sock_ops, - .h.hashinfo = &dccp_hashinfo, -}; - -static const struct inet6_protocol dccp_v6_protocol = { - .handler = dccp_v6_rcv, - .err_handler = dccp_v6_err, - .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, -}; - -static const struct proto_ops inet6_dccp_ops = { - .family = PF_INET6, - .owner = THIS_MODULE, - .release = inet6_release, - .bind = inet6_bind, - .connect = inet_stream_connect, - .socketpair = sock_no_socketpair, - .accept = inet_accept, - .getname = inet6_getname, - .poll = dccp_poll, - .ioctl = inet6_ioctl, - .gettstamp = sock_gettstamp, - .listen = inet_dccp_listen, - .shutdown = inet_shutdown, - .setsockopt = sock_common_setsockopt, - .getsockopt = sock_common_getsockopt, - .sendmsg = inet_sendmsg, - .recvmsg = sock_common_recvmsg, - .mmap = sock_no_mmap, -#ifdef CONFIG_COMPAT - .compat_ioctl = inet6_compat_ioctl, -#endif -}; - -static struct inet_protosw dccp_v6_protosw = { - .type = SOCK_DCCP, - .protocol = IPPROTO_DCCP, - .prot = &dccp_v6_prot, - .ops = &inet6_dccp_ops, - .flags = INET_PROTOSW_ICSK, -}; - -static int __net_init dccp_v6_init_net(struct net *net) -{ - struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id); - - if (dccp_hashinfo.bhash == NULL) - return -ESOCKTNOSUPPORT; - - return inet_ctl_sock_create(&pn->v6_ctl_sk, PF_INET6, - SOCK_DCCP, IPPROTO_DCCP, net); -} - -static void __net_exit dccp_v6_exit_net(struct net *net) -{ - struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id); - - inet_ctl_sock_destroy(pn->v6_ctl_sk); -} - -static struct pernet_operations dccp_v6_ops = { - .init = dccp_v6_init_net, - .exit = dccp_v6_exit_net, - .id = &dccp_v6_pernet_id, - .size = sizeof(struct dccp_v6_pernet), -}; - -static int __init dccp_v6_init(void) -{ - int err = proto_register(&dccp_v6_prot, 1); - - if (err) - goto out; - - inet6_register_protosw(&dccp_v6_protosw); - - err = register_pernet_subsys(&dccp_v6_ops); - if (err) - goto out_destroy_ctl_sock; - - err = inet6_add_protocol(&dccp_v6_protocol, 
IPPROTO_DCCP); - if (err) - goto out_unregister_proto; - -out: - return err; -out_unregister_proto: - unregister_pernet_subsys(&dccp_v6_ops); -out_destroy_ctl_sock: - inet6_unregister_protosw(&dccp_v6_protosw); - proto_unregister(&dccp_v6_prot); - goto out; -} - -static void __exit dccp_v6_exit(void) -{ - inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP); - unregister_pernet_subsys(&dccp_v6_ops); - inet6_unregister_protosw(&dccp_v6_protosw); - proto_unregister(&dccp_v6_prot); -} - -module_init(dccp_v6_init); -module_exit(dccp_v6_exit); - -/* - * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33) - * values directly, Also cover the case where the protocol is not specified, - * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP - */ -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 33, 6); -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 0, 6); -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>"); -MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h deleted file mode 100644 index c5d14c48def1..000000000000 --- a/net/dccp/ipv6.h +++ /dev/null @@ -1,27 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _DCCP_IPV6_H -#define _DCCP_IPV6_H -/* - * net/dccp/ipv6.h - * - * An implementation of the DCCP protocol - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> - */ - -#include <linux/dccp.h> -#include <linux/ipv6.h> - -struct dccp6_sock { - struct dccp_sock dccp; - struct ipv6_pinfo inet6; -}; - -struct dccp6_request_sock { - struct dccp_request_sock dccp; -}; - -struct dccp6_timewait_sock { - struct inet_timewait_sock inet; -}; - -#endif /* _DCCP_IPV6_H */ diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c deleted file mode 100644 index fecc8190064f..000000000000 --- a/net/dccp/minisocks.c +++ /dev/null @@ -1,266 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/minisocks.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#include <linux/dccp.h> -#include <linux/gfp.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/timer.h> - -#include <net/sock.h> -#include <net/xfrm.h> -#include <net/inet_timewait_sock.h> -#include <net/rstreason.h> - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -struct inet_timewait_death_row dccp_death_row = { - .tw_refcount = REFCOUNT_INIT(1), - .sysctl_max_tw_buckets = NR_FILE * 2, - .hashinfo = &dccp_hashinfo, -}; - -EXPORT_SYMBOL_GPL(dccp_death_row); - -void dccp_time_wait(struct sock *sk, int state, int timeo) -{ - struct inet_timewait_sock *tw; - - tw = inet_twsk_alloc(sk, &dccp_death_row, state); - - if (tw != NULL) { - const struct inet_connection_sock *icsk = inet_csk(sk); - const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); -#if IS_ENABLED(CONFIG_IPV6) - if (tw->tw_family == PF_INET6) { - tw->tw_v6_daddr = sk->sk_v6_daddr; - tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - tw->tw_ipv6only = sk->sk_ipv6only; - } -#endif - - /* Get the TIME_WAIT timeout firing. */ - if (timeo < rto) - timeo = rto; - - if (state == DCCP_TIME_WAIT) - timeo = DCCP_TIMEWAIT_LEN; - - /* Linkage updates. - * Note that access to tw after this point is illegal. - */ - inet_twsk_hashdance_schedule(tw, sk, &dccp_hashinfo, timeo); - } else { - /* Sorry, if we're out of memory, just CLOSE this - * socket up. We've got bigger problems than - * non-graceful socket closings. 
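
/*
 * The timeout logic in dccp_time_wait() above picks max(timeo, 3.5 * RTO),
 * where (rto << 2) - (rto >> 1) == 4*RTO - RTO/2, and pins TIME_WAIT proper
 * to DCCP_TIMEWAIT_LEN. A small sketch of that arithmetic; the HZ value
 * below is an assumption for the example.
 */
#include <stdio.h>

#define HZ			100		/* assumed for the example */
#define DCCP_TIMEWAIT_LEN	(60 * HZ)	/* mirrors the kernel constant */

static long timewait_timeout(long timeo, long icsk_rto, int is_time_wait)
{
	long rto = (icsk_rto << 2) - (icsk_rto >> 1);	/* 3.5 * RTO */

	if (timeo < rto)
		timeo = rto;
	return is_time_wait ? DCCP_TIMEWAIT_LEN : timeo;
}

int main(void)
{
	printf("%ld\n", timewait_timeout(100, 200, 0));	/* prints 700 */
	return 0;
}
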
- */ - DCCP_WARN("time wait bucket table overflow\n"); - } - - dccp_done(sk); -} - -struct sock *dccp_create_openreq_child(const struct sock *sk, - const struct request_sock *req, - const struct sk_buff *skb) -{ - /* - * Step 3: Process LISTEN state - * - * (* Generate a new socket and switch to that socket *) - * Set S := new socket for this port pair - */ - struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); - - if (newsk != NULL) { - struct dccp_request_sock *dreq = dccp_rsk(req); - struct inet_connection_sock *newicsk = inet_csk(newsk); - struct dccp_sock *newdp = dccp_sk(newsk); - - newdp->dccps_role = DCCP_ROLE_SERVER; - newdp->dccps_hc_rx_ackvec = NULL; - newdp->dccps_service_list = NULL; - newdp->dccps_hc_rx_ccid = NULL; - newdp->dccps_hc_tx_ccid = NULL; - newdp->dccps_service = dreq->dreq_service; - newdp->dccps_timestamp_echo = dreq->dreq_timestamp_echo; - newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; - newicsk->icsk_rto = DCCP_TIMEOUT_INIT; - - INIT_LIST_HEAD(&newdp->dccps_featneg); - /* - * Step 3: Process LISTEN state - * - * Choose S.ISS (initial seqno) or set from Init Cookies - * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR from packet (or Init Cookies) - * - * Setting AWL/AWH and SWL/SWH happens as part of the feature - * activation below, as these windows all depend on the local - * and remote Sequence Window feature values (7.5.2). - */ - newdp->dccps_iss = dreq->dreq_iss; - newdp->dccps_gss = dreq->dreq_gss; - newdp->dccps_gar = newdp->dccps_iss; - newdp->dccps_isr = dreq->dreq_isr; - newdp->dccps_gsr = dreq->dreq_gsr; - - /* - * Activate features: initialise CCIDs, sequence windows etc. - */ - if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { - sk_free_unlock_clone(newsk); - return NULL; - } - dccp_init_xmit_timers(newsk); - - __DCCP_INC_STATS(DCCP_MIB_PASSIVEOPENS); - } - return newsk; -} - -EXPORT_SYMBOL_GPL(dccp_create_openreq_child); - -/* - * Process an incoming packet for RESPOND sockets represented - * as an request_sock. - */ -struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, - struct request_sock *req) -{ - struct sock *child = NULL; - struct dccp_request_sock *dreq = dccp_rsk(req); - bool own_req; - - /* TCP/DCCP listeners became lockless. - * DCCP stores complex state in its request_sock, so we need - * a protection for them, now this code runs without being protected - * by the parent (listener) lock. - */ - spin_lock_bh(&dreq->dreq_lock); - - /* Check for retransmitted REQUEST */ - if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { - - if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_gsr)) { - dccp_pr_debug("Retransmitted REQUEST\n"); - dreq->dreq_gsr = DCCP_SKB_CB(skb)->dccpd_seq; - /* - * Send another RESPONSE packet - * To protect against Request floods, increment retrans - * counter (backoff, monitored by dccp_response_timer). 
- */ - inet_rtx_syn_ack(sk, req); - } - /* Network Duplicate, discard packet */ - goto out; - } - - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; - - if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK && - dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK) - goto drop; - - /* Invalid ACK */ - if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq, - dreq->dreq_iss, dreq->dreq_gss)) { - dccp_pr_debug("Invalid ACK number: ack_seq=%llu, " - "dreq_iss=%llu, dreq_gss=%llu\n", - (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq, - (unsigned long long) dreq->dreq_iss, - (unsigned long long) dreq->dreq_gss); - goto drop; - } - - if (dccp_parse_options(sk, dreq, skb)) - goto drop; - - child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, - req, &own_req); - if (child) { - child = inet_csk_complete_hashdance(sk, child, req, own_req); - goto out; - } - - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; -drop: - if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET) - req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_NOT_SPECIFIED); - - inet_csk_reqsk_queue_drop(sk, req); -out: - spin_unlock_bh(&dreq->dreq_lock); - return child; -} - -EXPORT_SYMBOL_GPL(dccp_check_req); - -/* - * Queue segment on the new socket if the new socket is active, - * otherwise we just shortcircuit this and continue with - * the new socket. - */ -int dccp_child_process(struct sock *parent, struct sock *child, - struct sk_buff *skb) - __releases(child) -{ - int ret = 0; - const int state = child->sk_state; - - if (!sock_owned_by_user(child)) { - ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb), - skb->len); - - /* Wakeup parent, send SIGIO */ - if (state == DCCP_RESPOND && child->sk_state != state) - parent->sk_data_ready(parent); - } else { - /* Alas, it is possible again, because we do lookup - * in main socket hash table and lock on listening - * socket does not protect us more. 
- */ - __sk_add_backlog(child, skb); - } - - bh_unlock_sock(child); - sock_put(child); - return ret; -} - -EXPORT_SYMBOL_GPL(dccp_child_process); - -void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, - struct request_sock *rsk) -{ - DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state"); -} - -EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); - -int dccp_reqsk_init(struct request_sock *req, - struct dccp_sock const *dp, struct sk_buff const *skb) -{ - struct dccp_request_sock *dreq = dccp_rsk(req); - - spin_lock_init(&dreq->dreq_lock); - inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport; - inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport); - inet_rsk(req)->acked = 0; - dreq->dreq_timestamp_echo = 0; - - /* inherit feature negotiation options from listening socket */ - return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg); -} - -EXPORT_SYMBOL_GPL(dccp_reqsk_init); diff --git a/net/dccp/options.c b/net/dccp/options.c deleted file mode 100644 index db62d4767024..000000000000 --- a/net/dccp/options.c +++ /dev/null @@ -1,609 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/options.c - * - * An implementation of the DCCP protocol - * Copyright (c) 2005 Aristeu Sergio Rozanski Filho <aris@cathedrallabs.org> - * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> - * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz> - */ -#include <linux/dccp.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/unaligned.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -u64 dccp_decode_value_var(const u8 *bf, const u8 len) -{ - u64 value = 0; - - if (len >= DCCP_OPTVAL_MAXLEN) - value += ((u64)*bf++) << 40; - if (len > 4) - value += ((u64)*bf++) << 32; - if (len > 3) - value += ((u64)*bf++) << 24; - if (len > 2) - value += ((u64)*bf++) << 16; - if (len > 1) - value += ((u64)*bf++) << 8; - if (len > 0) - value += *bf; - - return value; -} - -/** - * dccp_parse_options - Parse DCCP options present in @skb - * @sk: client|server|listening dccp socket (when @dreq != NULL) - * @dreq: request socket to use during connection setup, or NULL - * @skb: frame to parse - */ -int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, - struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - const struct dccp_hdr *dh = dccp_hdr(skb); - const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; - unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); - unsigned char *opt_ptr = options; - const unsigned char *opt_end = (unsigned char *)dh + - (dh->dccph_doff * 4); - struct dccp_options_received *opt_recv = &dp->dccps_options_received; - unsigned char opt, len; - unsigned char *value; - u32 elapsed_time; - __be32 opt_val; - int rc; - int mandatory = 0; - - memset(opt_recv, 0, sizeof(*opt_recv)); - - opt = len = 0; - while (opt_ptr != opt_end) { - opt = *opt_ptr++; - len = 0; - value = NULL; - - /* Check if this isn't a single byte option */ - if (opt > DCCPO_MAX_RESERVED) { - if (opt_ptr == opt_end) - goto out_nonsensical_length; - - len = *opt_ptr++; - if (len < 2) - goto out_nonsensical_length; - /* - * Remove the type and len fields, leaving - * just the value size - */ - len -= 2; - value = opt_ptr; - opt_ptr += len; - - if (opt_ptr > opt_end) - goto out_nonsensical_length; - } - - /* - * CCID-specific options are ignored during connection setup, as - * negotiation may still be in progress (see RFC 
4340, 10.3). - * The same applies to Ack Vectors, as these depend on the CCID. - */ - if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || - opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) - goto ignore_option; - - switch (opt) { - case DCCPO_PADDING: - break; - case DCCPO_MANDATORY: - if (mandatory) - goto out_invalid_option; - if (pkt_type != DCCP_PKT_DATA) - mandatory = 1; - break; - case DCCPO_NDP_COUNT: - if (len > 6) - goto out_invalid_option; - - opt_recv->dccpor_ndp = dccp_decode_value_var(value, len); - dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), - (unsigned long long)opt_recv->dccpor_ndp); - break; - case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: - if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ - break; - if (len == 0) - goto out_invalid_option; - rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, - *value, value + 1, len - 1); - if (rc) - goto out_featneg_failed; - break; - case DCCPO_TIMESTAMP: - if (len != 4) - goto out_invalid_option; - /* - * RFC 4340 13.1: "The precise time corresponding to - * Timestamp Value zero is not specified". We use - * zero to indicate absence of a meaningful timestamp. - */ - opt_val = get_unaligned((__be32 *)value); - if (unlikely(opt_val == 0)) { - DCCP_WARN("Timestamp with zero value\n"); - break; - } - - if (dreq != NULL) { - dreq->dreq_timestamp_echo = ntohl(opt_val); - dreq->dreq_timestamp_time = dccp_timestamp(); - } else { - opt_recv->dccpor_timestamp = - dp->dccps_timestamp_echo = ntohl(opt_val); - dp->dccps_timestamp_time = dccp_timestamp(); - } - dccp_pr_debug("%s rx opt: TIMESTAMP=%u, ackno=%llu\n", - dccp_role(sk), ntohl(opt_val), - (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq); - /* schedule an Ack in case this sender is quiescent */ - inet_csk_schedule_ack(sk); - break; - case DCCPO_TIMESTAMP_ECHO: - if (len != 4 && len != 6 && len != 8) - goto out_invalid_option; - - opt_val = get_unaligned((__be32 *)value); - opt_recv->dccpor_timestamp_echo = ntohl(opt_val); - - dccp_pr_debug("%s rx opt: TIMESTAMP_ECHO=%u, len=%d, " - "ackno=%llu", dccp_role(sk), - opt_recv->dccpor_timestamp_echo, - len + 2, - (unsigned long long) - DCCP_SKB_CB(skb)->dccpd_ack_seq); - - value += 4; - - if (len == 4) { /* no elapsed time included */ - dccp_pr_debug_cat("\n"); - break; - } - - if (len == 6) { /* 2-byte elapsed time */ - __be16 opt_val2 = get_unaligned((__be16 *)value); - elapsed_time = ntohs(opt_val2); - } else { /* 4-byte elapsed time */ - opt_val = get_unaligned((__be32 *)value); - elapsed_time = ntohl(opt_val); - } - - dccp_pr_debug_cat(", ELAPSED_TIME=%u\n", elapsed_time); - - /* Give precedence to the biggest ELAPSED_TIME */ - if (elapsed_time > opt_recv->dccpor_elapsed_time) - opt_recv->dccpor_elapsed_time = elapsed_time; - break; - case DCCPO_ELAPSED_TIME: - if (dccp_packet_without_ack(skb)) /* RFC 4340, 13.2 */ - break; - - if (len == 2) { - __be16 opt_val2 = get_unaligned((__be16 *)value); - elapsed_time = ntohs(opt_val2); - } else if (len == 4) { - opt_val = get_unaligned((__be32 *)value); - elapsed_time = ntohl(opt_val); - } else { - goto out_invalid_option; - } - - if (elapsed_time > opt_recv->dccpor_elapsed_time) - opt_recv->dccpor_elapsed_time = elapsed_time; - - dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", - dccp_role(sk), elapsed_time); - break; - case DCCPO_MIN_RX_CCID_SPECIFIC ... 
DCCPO_MAX_RX_CCID_SPECIFIC: - if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, - pkt_type, opt, value, len)) - goto out_invalid_option; - break; - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ - break; - /* - * Ack vectors are processed by the TX CCID if it is - * interested. The RX CCID need not parse Ack Vectors, - * since it is only interested in clearing old state. - */ - fallthrough; - case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: - if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, - pkt_type, opt, value, len)) - goto out_invalid_option; - break; - default: - DCCP_CRIT("DCCP(%p): option %d(len=%d) not " - "implemented, ignoring", sk, opt, len); - break; - } -ignore_option: - if (opt != DCCPO_MANDATORY) - mandatory = 0; - } - - /* mandatory was the last byte in option list -> reset connection */ - if (mandatory) - goto out_invalid_option; - -out_nonsensical_length: - /* RFC 4340, 5.8: ignore option and all remaining option space */ - return 0; - -out_invalid_option: - DCCP_INC_STATS(DCCP_MIB_INVALIDOPT); - rc = DCCP_RESET_CODE_OPTION_ERROR; -out_featneg_failed: - DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); - DCCP_SKB_CB(skb)->dccpd_reset_code = rc; - DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; - DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; - DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; - return -1; -} - -EXPORT_SYMBOL_GPL(dccp_parse_options); - -void dccp_encode_value_var(const u64 value, u8 *to, const u8 len) -{ - if (len >= DCCP_OPTVAL_MAXLEN) - *to++ = (value & 0xFF0000000000ull) >> 40; - if (len > 4) - *to++ = (value & 0xFF00000000ull) >> 32; - if (len > 3) - *to++ = (value & 0xFF000000) >> 24; - if (len > 2) - *to++ = (value & 0xFF0000) >> 16; - if (len > 1) - *to++ = (value & 0xFF00) >> 8; - if (len > 0) - *to++ = (value & 0xFF); -} - -static inline u8 dccp_ndp_len(const u64 ndp) -{ - if (likely(ndp <= 0xFF)) - return 1; - return likely(ndp <= USHRT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6); -} - -int dccp_insert_option(struct sk_buff *skb, const unsigned char option, - const void *value, const unsigned char len) -{ - unsigned char *to; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2; - - to = skb_push(skb, len + 2); - *to++ = option; - *to++ = len + 2; - - memcpy(to, value, len); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_insert_option); - -static int dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - u64 ndp = dp->dccps_ndp_count; - - if (dccp_non_data_packet(skb)) - ++dp->dccps_ndp_count; - else - dp->dccps_ndp_count = 0; - - if (ndp > 0) { - unsigned char *ptr; - const int ndp_len = dccp_ndp_len(ndp); - const int len = ndp_len + 2; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len; - - ptr = skb_push(skb, len); - *ptr++ = DCCPO_NDP_COUNT; - *ptr++ = len; - dccp_encode_value_var(ndp, ptr, ndp_len); - } - - return 0; -} - -static inline int dccp_elapsed_time_len(const u32 elapsed_time) -{ - return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 
2 : 4; -} - -static int dccp_insert_option_timestamp(struct sk_buff *skb) -{ - __be32 now = htonl(dccp_timestamp()); - /* yes this will overflow but that is the point as we want a - * 10 usec 32 bit timer which mean it wraps every 11.9 hours */ - - return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now)); -} - -static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, - struct dccp_request_sock *dreq, - struct sk_buff *skb) -{ - __be32 tstamp_echo; - unsigned char *to; - u32 elapsed_time, elapsed_time_len, len; - - if (dreq != NULL) { - elapsed_time = dccp_timestamp() - dreq->dreq_timestamp_time; - tstamp_echo = htonl(dreq->dreq_timestamp_echo); - dreq->dreq_timestamp_echo = 0; - } else { - elapsed_time = dccp_timestamp() - dp->dccps_timestamp_time; - tstamp_echo = htonl(dp->dccps_timestamp_echo); - dp->dccps_timestamp_echo = 0; - } - - elapsed_time_len = dccp_elapsed_time_len(elapsed_time); - len = 6 + elapsed_time_len; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len; - - to = skb_push(skb, len); - *to++ = DCCPO_TIMESTAMP_ECHO; - *to++ = len; - - memcpy(to, &tstamp_echo, 4); - to += 4; - - if (elapsed_time_len == 2) { - const __be16 var16 = htons((u16)elapsed_time); - memcpy(to, &var16, 2); - } else if (elapsed_time_len == 4) { - const __be32 var32 = htonl(elapsed_time); - memcpy(to, &var32, 4); - } - - return 0; -} - -static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const u16 buflen = dccp_ackvec_buflen(av); - /* Figure out how many options do we need to represent the ackvec */ - const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN); - u16 len = buflen + 2 * nr_opts; - u8 i, nonce = 0; - const unsigned char *tail, *from; - unsigned char *to; - - if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { - DCCP_WARN("Lacking space for %u bytes on %s packet\n", len, - dccp_packet_name(dcb->dccpd_type)); - return -1; - } - /* - * Since Ack Vectors are variable-length, we can not always predict - * their size. To catch exception cases where the space is running out - * on the skb, a separate Sync is scheduled to carry the Ack Vector. - */ - if (len > DCCPAV_MIN_OPTLEN && - len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) { - DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), " - "MPS=%u ==> reduce payload size?\n", len, skb->len, - dcb->dccpd_opt_len, dp->dccps_mss_cache); - dp->dccps_sync_scheduled = 1; - return 0; - } - dcb->dccpd_opt_len += len; - - to = skb_push(skb, len); - len = buflen; - from = av->av_buf + av->av_buf_head; - tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; - - for (i = 0; i < nr_opts; ++i) { - int copylen = len; - - if (len > DCCP_SINGLE_OPT_MAXLEN) - copylen = DCCP_SINGLE_OPT_MAXLEN; - - /* - * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via - * its type; ack_nonce is the sum of all individual buf_nonce's. 
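
/*
 * Sketch of the Nonce Echo rule described above: each Ack Vector option
 * carries a one-bit nonce in its type (DCCPO_ACK_VECTOR_0 or _1), and the
 * packet's aggregate nonce is the sum mod 2, i.e. the XOR, of the
 * per-option nonces. Illustrative; 253 mirrors the kernel's
 * DCCP_SINGLE_OPT_MAXLEN.
 */
#include <stddef.h>
#include <stdint.h>

#define DCCP_SINGLE_OPT_MAXLEN 253

static uint8_t ackvec_aggregate_nonce(const uint8_t *chunk_nonce,
				      size_t buflen)
{
	size_t nr_opts = (buflen + DCCP_SINGLE_OPT_MAXLEN - 1) /
			 DCCP_SINGLE_OPT_MAXLEN;	/* DIV_ROUND_UP */
	uint8_t nonce = 0;

	for (size_t i = 0; i < nr_opts; i++)
		nonce ^= chunk_nonce[i] & 1;
	return nonce;
}
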
- */ - nonce ^= av->av_buf_nonce[i]; - - *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; - *to++ = copylen + 2; - - /* Check if buf_head wraps */ - if (from + copylen > tail) { - const u16 tailsize = tail - from; - - memcpy(to, from, tailsize); - to += tailsize; - len -= tailsize; - copylen -= tailsize; - from = av->av_buf; - } - - memcpy(to, from, copylen); - from += copylen; - to += copylen; - len -= copylen; - } - /* - * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. - */ - if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce)) - return -ENOBUFS; - return 0; -} - -/** - * dccp_insert_option_mandatory - Mandatory option (5.8.2) - * @skb: frame into which to insert option - * - * Note that since we are using skb_push, this function needs to be called - * _after_ inserting the option it is supposed to influence (stack order). - */ -int dccp_insert_option_mandatory(struct sk_buff *skb) -{ - if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len++; - *(u8 *)skb_push(skb, 1) = DCCPO_MANDATORY; - return 0; -} - -/** - * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb - * @skb: frame to insert feature negotiation option into - * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R - * @feat: one out of %dccp_feature_numbers - * @val: NN value or SP array (preferred element first) to copy - * @len: true length of @val in bytes (excluding first element repetition) - * @repeat_first: whether to copy the first element of @val twice - * - * The last argument is used to construct Confirm options, where the preferred - * value and the preference list appear separately (RFC 4340, 6.3.1). Preference - * lists are kept such that the preferred entry is always first, so we only need - * to copy twice, and avoid the overhead of cloning into a bigger array. - */ -int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, - u8 *val, u8 len, bool repeat_first) -{ - u8 tot_len, *to; - - /* take the `Feature' field and possible repetition into account */ - if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) { - DCCP_WARN("length %u for feature %u too large\n", len, feat); - return -1; - } - - if (unlikely(val == NULL || len == 0)) - len = repeat_first = false; - tot_len = 3 + repeat_first + len; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) { - DCCP_WARN("packet too small for feature %d option!\n", feat); - return -1; - } - DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len; - - to = skb_push(skb, tot_len); - *to++ = type; - *to++ = tot_len; - *to++ = feat; - - if (repeat_first) - *to++ = *val; - if (len) - memcpy(to, val, len); - return 0; -} - -/* The length of all options needs to be a multiple of 4 (5.8) */ -static void dccp_insert_option_padding(struct sk_buff *skb) -{ - int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4; - - if (padding != 0) { - padding = 4 - padding; - memset(skb_push(skb, padding), 0, padding); - DCCP_SKB_CB(skb)->dccpd_opt_len += padding; - } -} - -int dccp_insert_options(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - - DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - - if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) - return -1; - - if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { - - /* Feature Negotiation */ - if (dccp_feat_insert_opts(dp, NULL, skb)) - return -1; - - if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) { - /* - * Obtain RTT sample from Request/Response exchange. 
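
/*
 * Sketch of the RTT sample that the Timestamp/Timestamp Echo exchange
 * above yields: timestamps are 32-bit counters in 10-microsecond units
 * (RFC 4340, 13.1), so the math is modulo 2^32 via unsigned arithmetic,
 * with the peer's reported Elapsed Time subtracted out. Illustrative
 * only, not the kernel's TFRC plumbing.
 */
#include <stdint.h>

/* all arguments in 10 us ticks; wrap-around handled by unsigned math */
static uint32_t dccp_sample_rtt_ticks(uint32_t now, uint32_t tstamp_echo,
				      uint32_t elapsed_time)
{
	return now - tstamp_echo - elapsed_time;
}
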
- * This is currently used for TFRC initialisation. - */ - if (dccp_insert_option_timestamp(skb)) - return -1; - - } else if (dccp_ackvec_pending(sk) && - dccp_insert_option_ackvec(sk, skb)) { - return -1; - } - } - - if (dp->dccps_hc_rx_insert_options) { - if (ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb)) - return -1; - dp->dccps_hc_rx_insert_options = 0; - } - - if (dp->dccps_timestamp_echo != 0 && - dccp_insert_option_timestamp_echo(dp, NULL, skb)) - return -1; - - dccp_insert_option_padding(skb); - return 0; -} - -int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) -{ - DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - - if (dccp_feat_insert_opts(NULL, dreq, skb)) - return -1; - - /* Obtain RTT sample from Response/Ack exchange (used by TFRC). */ - if (dccp_insert_option_timestamp(skb)) - return -1; - - if (dreq->dreq_timestamp_echo != 0 && - dccp_insert_option_timestamp_echo(NULL, dreq, skb)) - return -1; - - dccp_insert_option_padding(skb); - return 0; -} diff --git a/net/dccp/output.c b/net/dccp/output.c deleted file mode 100644 index 5c2e24f3c39b..000000000000 --- a/net/dccp/output.c +++ /dev/null @@ -1,709 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/output.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#include <linux/dccp.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/slab.h> -#include <linux/sched/signal.h> - -#include <net/inet_sock.h> -#include <net/sock.h> - -#include "ackvec.h" -#include "ccid.h" -#include "dccp.h" - -static inline void dccp_event_ack_sent(struct sock *sk) -{ - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); -} - -/* enqueue @skb on sk_send_head for retransmission, return clone to send now */ -static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb) -{ - skb_set_owner_w(skb, sk); - WARN_ON(sk->sk_send_head); - sk->sk_send_head = skb; - return skb_clone(sk->sk_send_head, gfp_any()); -} - -/* - * All SKB's seen here are completely headerless. It is our - * job to build the DCCP header, and pass the packet down to - * IP so it can do the same plus pass the packet off to the - * device. - */ -static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) -{ - if (likely(skb != NULL)) { - struct inet_sock *inet = inet_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - struct dccp_hdr *dh; - /* XXX For now we're using only 48 bits sequence numbers */ - const u32 dccp_header_size = sizeof(*dh) + - sizeof(struct dccp_hdr_ext) + - dccp_packet_hdr_len(dcb->dccpd_type); - int err, set_ack = 1; - u64 ackno = dp->dccps_gsr; - /* - * Increment GSS here already in case the option code needs it. - * Update GSS for real only if option processing below succeeds. - */ - dcb->dccpd_seq = ADD48(dp->dccps_gss, 1); - - switch (dcb->dccpd_type) { - case DCCP_PKT_DATA: - set_ack = 0; - fallthrough; - case DCCP_PKT_DATAACK: - case DCCP_PKT_RESET: - break; - - case DCCP_PKT_REQUEST: - set_ack = 0; - /* Use ISS on the first (non-retransmitted) Request. */ - if (icsk->icsk_retransmits == 0) - dcb->dccpd_seq = dp->dccps_iss; - fallthrough; - - case DCCP_PKT_SYNC: - case DCCP_PKT_SYNCACK: - ackno = dcb->dccpd_ack_seq; - fallthrough; - default: - /* - * Set owner/destructor: some skbs are allocated via - * alloc_skb (e.g. when retransmission may happen). 
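
/*
 * Sketch of the data-offset arithmetic used when the header is built just
 * below: dccph_doff counts 32-bit words and covers the 12-byte generic
 * header, the 4-byte extended-sequence-number block (x == 1), the fixed
 * header for the packet type, and the options, which 5.8 requires to be
 * padded to a multiple of 4. The sizes are assumptions matching the
 * on-wire layout, not kernel structs.
 */
#include <stdint.h>

static uint8_t dccp_doff(uint32_t type_hdr_len, uint32_t padded_opt_len)
{
	uint32_t hdr = 12 + 4 + type_hdr_len;	/* generic + ext header */

	return (uint8_t)((hdr + padded_opt_len) / 4);
}
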
- * Only Data, DataAck, and Reset packets should come - * through here with skb->sk set. - */ - WARN_ON(skb->sk); - skb_set_owner_w(skb, sk); - break; - } - - if (dccp_insert_options(sk, skb)) { - kfree_skb(skb); - return -EPROTO; - } - - - /* Build DCCP header and checksum it. */ - dh = dccp_zeroed_hdr(skb, dccp_header_size); - dh->dccph_type = dcb->dccpd_type; - dh->dccph_sport = inet->inet_sport; - dh->dccph_dport = inet->inet_dport; - dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4; - dh->dccph_ccval = dcb->dccpd_ccval; - dh->dccph_cscov = dp->dccps_pcslen; - /* XXX For now we're using only 48 bits sequence numbers */ - dh->dccph_x = 1; - - dccp_update_gss(sk, dcb->dccpd_seq); - dccp_hdr_set_seq(dh, dp->dccps_gss); - if (set_ack) - dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno); - - switch (dcb->dccpd_type) { - case DCCP_PKT_REQUEST: - dccp_hdr_request(skb)->dccph_req_service = - dp->dccps_service; - /* - * Limit Ack window to ISS <= P.ackno <= GSS, so that - * only Responses to Requests we sent are considered. - */ - dp->dccps_awl = dp->dccps_iss; - break; - case DCCP_PKT_RESET: - dccp_hdr_reset(skb)->dccph_reset_code = - dcb->dccpd_reset_code; - break; - } - - icsk->icsk_af_ops->send_check(sk, skb); - - if (set_ack) - dccp_event_ack_sent(sk); - - DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - - err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); - return net_xmit_eval(err); - } - return -ENOBUFS; -} - -/** - * dccp_determine_ccmps - Find out about CCID-specific packet-size limits - * @dp: socket to find packet size limits of - * - * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.), - * since the RX CCID is restricted to feedback packets (Acks), which are small - * in comparison with the data traffic. A value of 0 means "no current CCMPS". - */ -static u32 dccp_determine_ccmps(const struct dccp_sock *dp) -{ - const struct ccid *tx_ccid = dp->dccps_hc_tx_ccid; - - if (tx_ccid == NULL || tx_ccid->ccid_ops == NULL) - return 0; - return tx_ccid->ccid_ops->ccid_ccmps; -} - -unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct dccp_sock *dp = dccp_sk(sk); - u32 ccmps = dccp_determine_ccmps(dp); - u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; - - /* Account for header lengths and IPv4/v6 option overhead */ - cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + - sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); - - /* - * Leave enough headroom for common DCCP header options. - * This only considers options which may appear on DCCP-Data packets, as - * per table 3 in RFC 4340, 5.8. When running out of space for other - * options (eg. Ack Vector which can take up to 255 bytes), it is better - * to schedule a separate Ack. Thus we leave headroom for the following: - * - 1 byte for Slow Receiver (11.6) - * - 6 bytes for Timestamp (13.1) - * - 10 bytes for Timestamp Echo (13.3) - * - 8 bytes for NDP count (7.7, when activated) - * - 6 bytes for Data Checksum (9.3) - * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled) - */ - cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + - (dp->dccps_hc_rx_ackvec ? 
DCCPAV_MIN_OPTLEN : 0), 4); - - /* And store cached results */ - icsk->icsk_pmtu_cookie = pmtu; - WRITE_ONCE(dp->dccps_mss_cache, cur_mps); - - return cur_mps; -} - -EXPORT_SYMBOL_GPL(dccp_sync_mss); - -void dccp_write_space(struct sock *sk) -{ - struct socket_wq *wq; - - rcu_read_lock(); - wq = rcu_dereference(sk->sk_wq); - if (skwq_has_sleeper(wq)) - wake_up_interruptible(&wq->wait); - /* Should agree with poll, otherwise some programs break */ - if (sock_writeable(sk)) - sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); - - rcu_read_unlock(); -} - -/** - * dccp_wait_for_ccid - Await CCID send permission - * @sk: socket to wait for - * @delay: timeout in jiffies - * - * This is used by CCIDs which need to delay the send time in process context. - */ -static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) -{ - DEFINE_WAIT(wait); - long remaining; - - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); - sk->sk_write_pending++; - release_sock(sk); - - remaining = schedule_timeout(delay); - - lock_sock(sk); - sk->sk_write_pending--; - finish_wait(sk_sleep(sk), &wait); - - if (signal_pending(current) || sk->sk_err) - return -1; - return remaining; -} - -/** - * dccp_xmit_packet - Send data packet under control of CCID - * @sk: socket to send data packet on - * - * Transmits next-queued payload and informs CCID to account for the packet. - */ -static void dccp_xmit_packet(struct sock *sk) -{ - int err, len; - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb = dccp_qpolicy_pop(sk); - - if (unlikely(skb == NULL)) - return; - len = skb->len; - - if (sk->sk_state == DCCP_PARTOPEN) { - const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; - /* - * See 8.1.5 - Handshake Completion. - * - * For robustness we resend Confirm options until the client has - * entered OPEN. During the initial feature negotiation, the MPS - * is smaller than usual, reduced by the Change/Confirm options. - */ - if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { - DCCP_WARN("Payload too large (%d) for featneg.\n", len); - dccp_send_ack(sk); - dccp_feat_list_purge(&dp->dccps_featneg); - } - - inet_csk_schedule_ack(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - inet_csk(sk)->icsk_rto, - DCCP_RTO_MAX); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; - } else if (dccp_ack_pending(sk)) { - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; - } else { - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; - } - - err = dccp_transmit_skb(sk, skb); - if (err) - dccp_pr_debug("transmit_skb() returned err=%d\n", err); - /* - * Register this one as sent even if an error occurred. To the remote - * end a local packet drop is indistinguishable from network loss, i.e. - * any local drop will eventually be reported via receiver feedback. - */ - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); - - /* - * If the CCID needs to transfer additional header options out-of-band - * (e.g. Ack Vectors or feature-negotiation options), it activates this - * flag to schedule a Sync. The Sync will automatically incorporate all - * currently pending header options, thus clearing the backlog. - */ - if (dp->dccps_sync_scheduled) - dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); -} - -/** - * dccp_flush_write_queue - Drain queue at end of connection - * @sk: socket to be drained - * @time_budget: time allowed to drain the queue - * - * Since dccp_sendmsg queues packets without waiting for them to be sent, it may - * happen that the TX queue is not empty at the end of a connection. 
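
/*
 * Sketch of the MPS computation in dccp_sync_mss() above: start from
 * min(PMTU, CCMPS) when the CCID imposes a ceiling, subtract network and
 * DCCP header overhead, then reserve rounded-up headroom for the options
 * that may accompany Data packets. The constants 12, 4 and 16 stand in
 * for the generic header, its extension and DCCPAV_MIN_OPTLEN;
 * illustrative only.
 */
#include <stdint.h>

#define ROUNDUP4(x) (((x) + 3u) & ~3u)

static uint32_t dccp_mps(uint32_t pmtu, uint32_t ccmps,
			 uint32_t net_hdr_len,	/* 20 for IPv4, 40 for IPv6 */
			 uint32_t ext_hdr_len, int send_ndp, int use_ackvec)
{
	uint32_t mps = ccmps ? (pmtu < ccmps ? pmtu : ccmps) : pmtu;

	mps -= net_hdr_len + ext_hdr_len + 12 + 4;
	/* Slow Receiver + Timestamp + Timestamp Echo + NDP + Data Checksum */
	mps -= ROUNDUP4(1 + 6 + 10 + (send_ndp ? 8 : 0) + 6 +
			(use_ackvec ? 16 : 0));
	return mps;
}
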
We give the - * HC-sender CCID a grace period of up to @time_budget jiffies. If this function - * returns with a non-empty write queue, it will be purged later. - */ -void dccp_flush_write_queue(struct sock *sk, long *time_budget) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - long delay, rc; - - while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { - rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - - switch (ccid_packet_dequeue_eval(rc)) { - case CCID_PACKET_WILL_DEQUEUE_LATER: - /* - * If the CCID determines when to send, the next sending - * time is unknown or the CCID may not even send again - * (e.g. remote host crashes or lost Ack packets). - */ - DCCP_WARN("CCID did not manage to send all packets\n"); - return; - case CCID_PACKET_DELAY: - delay = msecs_to_jiffies(rc); - if (delay > *time_budget) - return; - rc = dccp_wait_for_ccid(sk, delay); - if (rc < 0) - return; - *time_budget -= (delay - rc); - /* check again if we can send now */ - break; - case CCID_PACKET_SEND_AT_ONCE: - dccp_xmit_packet(sk); - break; - case CCID_PACKET_ERR: - skb_dequeue(&sk->sk_write_queue); - kfree_skb(skb); - dccp_pr_debug("packet discarded due to err=%ld\n", rc); - } - } -} - -void dccp_write_xmit(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - - while ((skb = dccp_qpolicy_top(sk))) { - int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - - switch (ccid_packet_dequeue_eval(rc)) { - case CCID_PACKET_WILL_DEQUEUE_LATER: - return; - case CCID_PACKET_DELAY: - sk_reset_timer(sk, &dp->dccps_xmit_timer, - jiffies + msecs_to_jiffies(rc)); - return; - case CCID_PACKET_SEND_AT_ONCE: - dccp_xmit_packet(sk); - break; - case CCID_PACKET_ERR: - dccp_qpolicy_drop(sk, skb); - dccp_pr_debug("packet discarded due to err=%d\n", rc); - } - } -} - -/** - * dccp_retransmit_skb - Retransmit Request, Close, or CloseReq packets - * @sk: socket to perform retransmit on - * - * There are only four retransmittable packet types in DCCP: - * - Request in client-REQUEST state (sec. 8.1.1), - * - CloseReq in server-CLOSEREQ state (sec. 8.3), - * - Close in node-CLOSING state (sec. 8.3), - * - Acks in client-PARTOPEN state (sec. 8.1.5, handled by dccp_delack_timer()). - * This function expects sk->sk_send_head to contain the original skb. - */ -int dccp_retransmit_skb(struct sock *sk) -{ - WARN_ON(sk->sk_send_head == NULL); - - if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0) - return -EHOSTUNREACH; /* Routing failure or similar. */ - - /* this count is used to distinguish original and retransmitted skb */ - inet_csk(sk)->icsk_retransmits++; - - return dccp_transmit_skb(sk, skb_clone(sk->sk_send_head, GFP_ATOMIC)); -} - -struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst, - struct request_sock *req) -{ - struct dccp_hdr *dh; - struct dccp_request_sock *dreq; - const u32 dccp_header_size = sizeof(struct dccp_hdr) + - sizeof(struct dccp_hdr_ext) + - sizeof(struct dccp_hdr_response); - struct sk_buff *skb; - - /* sk is marked const to clearly express we dont hold socket lock. - * sock_wmalloc() will atomically change sk->sk_wmem_alloc, - * it is safe to promote sk to non const. 
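
/*
 * Sketch of the retransmission rule applied a few lines below: the first
 * Response goes out with dreq_gss as allocated and `acked' is set; each
 * retransmitted Response then takes the next 48-bit sequence number.
 * Illustrative state machine, not kernel code.
 */
#include <stdint.h>

#define SEQ48_MASK 0xFFFFFFFFFFFFULL

struct resp_state {
	uint64_t gss;	/* greatest seqno sent for this request */
	int acked;	/* a Response has already gone out */
};

static uint64_t next_response_seqno(struct resp_state *st)
{
	if (st->acked)	/* retransmission: bump GSS */
		st->gss = (st->gss + 1) & SEQ48_MASK;
	st->acked = 1;
	return st->gss;
}
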
- */ - skb = sock_wmalloc((struct sock *)sk, MAX_DCCP_HEADER, 1, - GFP_ATOMIC); - if (!skb) - return NULL; - - skb_reserve(skb, MAX_DCCP_HEADER); - - skb_dst_set(skb, dst_clone(dst)); - - dreq = dccp_rsk(req); - if (inet_rsk(req)->acked) /* increase GSS upon retransmission */ - dccp_inc_seqno(&dreq->dreq_gss); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; - DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_gss; - - /* Resolve feature dependencies resulting from choice of CCID */ - if (dccp_feat_server_ccid_dependencies(dreq)) - goto response_failed; - - if (dccp_insert_options_rsk(dreq, skb)) - goto response_failed; - - /* Build and checksum header */ - dh = dccp_zeroed_hdr(skb, dccp_header_size); - - dh->dccph_sport = htons(inet_rsk(req)->ir_num); - dh->dccph_dport = inet_rsk(req)->ir_rmt_port; - dh->dccph_doff = (dccp_header_size + - DCCP_SKB_CB(skb)->dccpd_opt_len) / 4; - dh->dccph_type = DCCP_PKT_RESPONSE; - dh->dccph_x = 1; - dccp_hdr_set_seq(dh, dreq->dreq_gss); - dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_gsr); - dccp_hdr_response(skb)->dccph_resp_service = dreq->dreq_service; - - dccp_csum_outgoing(skb); - - /* We use `acked' to remember that a Response was already sent. */ - inet_rsk(req)->acked = 1; - DCCP_INC_STATS(DCCP_MIB_OUTSEGS); - return skb; -response_failed: - kfree_skb(skb); - return NULL; -} - -EXPORT_SYMBOL_GPL(dccp_make_response); - -/* answer offending packet in @rcv_skb with Reset from control socket @ctl */ -struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *rcv_skb) -{ - struct dccp_hdr *rxdh = dccp_hdr(rcv_skb), *dh; - struct dccp_skb_cb *dcb = DCCP_SKB_CB(rcv_skb); - const u32 dccp_hdr_reset_len = sizeof(struct dccp_hdr) + - sizeof(struct dccp_hdr_ext) + - sizeof(struct dccp_hdr_reset); - struct dccp_hdr_reset *dhr; - struct sk_buff *skb; - - skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC); - if (skb == NULL) - return NULL; - - skb_reserve(skb, sk->sk_prot->max_header); - - /* Swap the send and the receive. */ - dh = dccp_zeroed_hdr(skb, dccp_hdr_reset_len); - dh->dccph_type = DCCP_PKT_RESET; - dh->dccph_sport = rxdh->dccph_dport; - dh->dccph_dport = rxdh->dccph_sport; - dh->dccph_doff = dccp_hdr_reset_len / 4; - dh->dccph_x = 1; - - dhr = dccp_hdr_reset(skb); - dhr->dccph_reset_code = dcb->dccpd_reset_code; - - switch (dcb->dccpd_reset_code) { - case DCCP_RESET_CODE_PACKET_ERROR: - dhr->dccph_reset_data[0] = rxdh->dccph_type; - break; - case DCCP_RESET_CODE_OPTION_ERROR: - case DCCP_RESET_CODE_MANDATORY_ERROR: - memcpy(dhr->dccph_reset_data, dcb->dccpd_reset_data, 3); - break; - } - /* - * From RFC 4340, 8.3.1: - * If P.ackno exists, set R.seqno := P.ackno + 1. - * Else set R.seqno := 0. - */ - if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_hdr_set_seq(dh, ADD48(dcb->dccpd_ack_seq, 1)); - dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dcb->dccpd_seq); - - dccp_csum_outgoing(skb); - return skb; -} - -EXPORT_SYMBOL_GPL(dccp_ctl_make_reset); - -/* send Reset on established socket, to close or abort the connection */ -int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code) -{ - struct sk_buff *skb; - /* - * FIXME: what if rebuild_header fails? - * Should we be doing a rebuild_header here? - */ - int err = inet_csk(sk)->icsk_af_ops->rebuild_header(sk); - - if (err != 0) - return err; - - skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1, GFP_ATOMIC); - if (skb == NULL) - return -ENOBUFS; - - /* Reserve space for headers and prepare control bits. 
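dccp_ctl_make_reset() above applies the RFC 4340, 8.3.1 rule R.seqno := P.ackno + 1 with ADD48(), since DCCP sequence numbers are 48 bits wide and wrap modulo 2^48. A self-contained sketch of that arithmetic:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SEQ48_MASK ((UINT64_C(1) << 48) - 1)

    /* 48-bit modular addition, as ADD48() performs in the kernel */
    static uint64_t add48(uint64_t seq, uint64_t n)
    {
        return (seq + n) & SEQ48_MASK;
    }

    int main(void)
    {
        uint64_t ackno = SEQ48_MASK;               /* largest 48-bit seqno */

        printf("%" PRIu64 "\n", add48(ackno, 1));  /* prints 0: wrapped */
        return 0;
    }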
*/ - skb_reserve(skb, sk->sk_prot->max_header); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET; - DCCP_SKB_CB(skb)->dccpd_reset_code = code; - - return dccp_transmit_skb(sk, skb); -} - -/* - * Do all connect socket setups that can be done AF independent. - */ -int dccp_connect(struct sock *sk) -{ - struct sk_buff *skb; - struct dccp_sock *dp = dccp_sk(sk); - struct dst_entry *dst = __sk_dst_get(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - - sk->sk_err = 0; - sock_reset_flag(sk, SOCK_DONE); - - dccp_sync_mss(sk, dst_mtu(dst)); - - /* do not connect if feature negotiation setup fails */ - if (dccp_feat_finalise_settings(dccp_sk(sk))) - return -EPROTO; - - /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ - dp->dccps_gar = dp->dccps_iss; - - skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); - if (unlikely(skb == NULL)) - return -ENOBUFS; - - /* Reserve space for headers. */ - skb_reserve(skb, sk->sk_prot->max_header); - - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; - - dccp_transmit_skb(sk, dccp_skb_entail(sk, skb)); - DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); - - /* Timer for repeating the REQUEST until an answer. */ - icsk->icsk_retransmits = 0; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - icsk->icsk_rto, DCCP_RTO_MAX); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_connect); - -void dccp_send_ack(struct sock *sk) -{ - /* If we have been reset, we may not send again. */ - if (sk->sk_state != DCCP_CLOSED) { - struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, - GFP_ATOMIC); - - if (skb == NULL) { - inet_csk_schedule_ack(sk); - inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - TCP_DELACK_MAX, - DCCP_RTO_MAX); - return; - } - - /* Reserve space for headers */ - skb_reserve(skb, sk->sk_prot->max_header); - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK; - dccp_transmit_skb(sk, skb); - } -} - -EXPORT_SYMBOL_GPL(dccp_send_ack); - -#if 0 -/* FIXME: Is this still necessary (11.3) - currently nowhere used by DCCP. */ -void dccp_send_delayed_ack(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - /* - * FIXME: tune this timer. elapsed time fixes the skew, so no problem - * with using 2s, and active senders also piggyback the ACK into a - * DATAACK packet, so this is really for quiescent senders. - */ - unsigned long timeout = jiffies + 2 * HZ; - - /* Use new timeout only if there wasn't a older one earlier. */ - if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { - /* If delack timer was blocked or is about to expire, - * send ACK now. - * - * FIXME: check the "about to expire" part - */ - if (icsk->icsk_ack.blocked) { - dccp_send_ack(sk); - return; - } - - if (!time_before(timeout, icsk->icsk_ack.timeout)) - timeout = icsk->icsk_ack.timeout; - } - icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; - icsk->icsk_ack.timeout = timeout; - sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); -} -#endif - -void dccp_send_sync(struct sock *sk, const u64 ackno, - const enum dccp_pkt_type pkt_type) -{ - /* - * We are not putting this on the write queue, so - * dccp_transmit_skb() will set the ownership to this - * sock. - */ - struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC); - - if (skb == NULL) { - /* FIXME: how to make sure the sync is sent? */ - DCCP_CRIT("could not send %s", dccp_packet_name(pkt_type)); - return; - } - - /* Reserve space for headers and prepare control bits. 
*/ - skb_reserve(skb, sk->sk_prot->max_header); - DCCP_SKB_CB(skb)->dccpd_type = pkt_type; - DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; - - /* - * Clear the flag in case the Sync was scheduled for out-of-band data, - * such as carrying a long Ack Vector. - */ - dccp_sk(sk)->dccps_sync_scheduled = 0; - - dccp_transmit_skb(sk, skb); -} - -EXPORT_SYMBOL_GPL(dccp_send_sync); - -/* - * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This - * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under - * any circumstances. - */ -void dccp_send_close(struct sock *sk, const int active) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - const gfp_t prio = active ? GFP_KERNEL : GFP_ATOMIC; - - skb = alloc_skb(sk->sk_prot->max_header, prio); - if (skb == NULL) - return; - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, sk->sk_prot->max_header); - if (dp->dccps_role == DCCP_ROLE_SERVER && !dp->dccps_server_timewait) - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSEREQ; - else - DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; - - if (active) { - skb = dccp_skb_entail(sk, skb); - /* - * Retransmission timer for active-close: RFC 4340, 8.3 requires - * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ - * state can be left. The initial timeout is 2 RTTs. - * Since RTT measurement is done by the CCIDs, there is no easy - * way to get an RTT sample. The fallback RTT from RFC 4340, 3.4 - * is too low (200ms); we use a high value to avoid unnecessary - * retransmissions when the link RTT is > 0.2 seconds. - * FIXME: Let main module sample RTTs and use that instead. - */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); - } - dccp_transmit_skb(sk, skb); -} diff --git a/net/dccp/proto.c b/net/dccp/proto.c deleted file mode 100644 index fcc5c9d64f46..000000000000 --- a/net/dccp/proto.c +++ /dev/null @@ -1,1293 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/proto.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#include <linux/dccp.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/sched.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/in.h> -#include <linux/if_arp.h> -#include <linux/init.h> -#include <linux/random.h> -#include <linux/slab.h> -#include <net/checksum.h> - -#include <net/inet_sock.h> -#include <net/inet_common.h> -#include <net/sock.h> -#include <net/xfrm.h> - -#include <asm/ioctls.h> -#include <linux/spinlock.h> -#include <linux/timer.h> -#include <linux/delay.h> -#include <linux/poll.h> - -#include "ccid.h" -#include "dccp.h" -#include "feat.h" - -#define CREATE_TRACE_POINTS -#include "trace.h" - -DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; - -EXPORT_SYMBOL_GPL(dccp_statistics); - -DEFINE_PER_CPU(unsigned int, dccp_orphan_count); -EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count); - -struct inet_hashinfo dccp_hashinfo; -EXPORT_SYMBOL_GPL(dccp_hashinfo); - -/* the maximum queue length for tx in packets. 
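The comment in dccp_send_close() above explains why the initial Close/CloseReq retransmit timer cannot be the RFC-recommended 2 RTTs: the CCIDs own the RTT samples, so the core falls back to a fixed conservative timeout. A sketch of that policy; the fallback constant here is assumed for illustration and is not necessarily the kernel's exact DCCP_TIMEOUT_INIT:

    #include <stdio.h>

    /* 2 RTTs when a sample exists (RFC 4340, 8.3), else a fixed value */
    static long close_rto_ms(long rtt_ms)
    {
        return rtt_ms > 0 ? 2 * rtt_ms : 3000;  /* 3000 ms assumed */
    }

    int main(void)
    {
        printf("rtt known: %ld ms, unknown: %ld ms\n",
               close_rto_ms(120), close_rto_ms(0));
        return 0;
    }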
0 is no limit */ -int sysctl_dccp_tx_qlen __read_mostly = 5; - -#ifdef CONFIG_IP_DCCP_DEBUG -static const char *dccp_state_name(const int state) -{ - static const char *const dccp_state_names[] = { - [DCCP_OPEN] = "OPEN", - [DCCP_REQUESTING] = "REQUESTING", - [DCCP_PARTOPEN] = "PARTOPEN", - [DCCP_LISTEN] = "LISTEN", - [DCCP_RESPOND] = "RESPOND", - [DCCP_CLOSING] = "CLOSING", - [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ", - [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE", - [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ", - [DCCP_TIME_WAIT] = "TIME_WAIT", - [DCCP_CLOSED] = "CLOSED", - }; - - if (state >= DCCP_MAX_STATES) - return "INVALID STATE!"; - else - return dccp_state_names[state]; -} -#endif - -void dccp_set_state(struct sock *sk, const int state) -{ - const int oldstate = sk->sk_state; - - dccp_pr_debug("%s(%p) %s --> %s\n", dccp_role(sk), sk, - dccp_state_name(oldstate), dccp_state_name(state)); - WARN_ON(state == oldstate); - - switch (state) { - case DCCP_OPEN: - if (oldstate != DCCP_OPEN) - DCCP_INC_STATS(DCCP_MIB_CURRESTAB); - /* Client retransmits all Confirm options until entering OPEN */ - if (oldstate == DCCP_PARTOPEN) - dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); - break; - - case DCCP_CLOSED: - if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ || - oldstate == DCCP_CLOSING) - DCCP_INC_STATS(DCCP_MIB_ESTABRESETS); - - sk->sk_prot->unhash(sk); - if (inet_csk(sk)->icsk_bind_hash != NULL && - !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) - inet_put_port(sk); - fallthrough; - default: - if (oldstate == DCCP_OPEN) - DCCP_DEC_STATS(DCCP_MIB_CURRESTAB); - } - - /* Change state AFTER socket is unhashed to avoid closed - * socket sitting in hash tables. - */ - inet_sk_set_state(sk, state); -} - -EXPORT_SYMBOL_GPL(dccp_set_state); - -static void dccp_finish_passive_close(struct sock *sk) -{ - switch (sk->sk_state) { - case DCCP_PASSIVE_CLOSE: - /* Node (client or server) has received Close packet. */ - dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED); - dccp_set_state(sk, DCCP_CLOSED); - break; - case DCCP_PASSIVE_CLOSEREQ: - /* - * Client received CloseReq. We set the `active' flag so that - * dccp_send_close() retransmits the Close as per RFC 4340, 8.3. 
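dccp_set_state() above keeps two MIB gauges consistent across every transition: CURRESTAB follows entry to and exit from OPEN, and ESTABRESETS counts established connections that are torn down. Reduced to its counter discipline, with invented names and the state set trimmed down:

    #include <stdio.h>

    enum st { ST_OPEN, ST_CLOSING, ST_CLOSED };

    static int currestab, estabresets;

    static void set_state(enum st *cur, enum st next)
    {
        if (next == ST_OPEN && *cur != ST_OPEN)
            currestab++;                    /* entering OPEN */
        else if (*cur == ST_OPEN)
            currestab--;                    /* leaving OPEN */
        if (next == ST_CLOSED && (*cur == ST_OPEN || *cur == ST_CLOSING))
            estabresets++;                  /* established teardown */
        *cur = next;
    }

    int main(void)
    {
        enum st s = ST_CLOSED;

        set_state(&s, ST_OPEN);
        set_state(&s, ST_CLOSED);
        printf("currestab=%d estabresets=%d\n", currestab, estabresets);
        return 0;
    }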
- */ - dccp_send_close(sk, 1); - dccp_set_state(sk, DCCP_CLOSING); - } -} - -void dccp_done(struct sock *sk) -{ - dccp_set_state(sk, DCCP_CLOSED); - dccp_clear_xmit_timers(sk); - - sk->sk_shutdown = SHUTDOWN_MASK; - - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_state_change(sk); - else - inet_csk_destroy_sock(sk); -} - -EXPORT_SYMBOL_GPL(dccp_done); - -const char *dccp_packet_name(const int type) -{ - static const char *const dccp_packet_names[] = { - [DCCP_PKT_REQUEST] = "REQUEST", - [DCCP_PKT_RESPONSE] = "RESPONSE", - [DCCP_PKT_DATA] = "DATA", - [DCCP_PKT_ACK] = "ACK", - [DCCP_PKT_DATAACK] = "DATAACK", - [DCCP_PKT_CLOSEREQ] = "CLOSEREQ", - [DCCP_PKT_CLOSE] = "CLOSE", - [DCCP_PKT_RESET] = "RESET", - [DCCP_PKT_SYNC] = "SYNC", - [DCCP_PKT_SYNCACK] = "SYNCACK", - }; - - if (type >= DCCP_NR_PKT_TYPES) - return "INVALID"; - else - return dccp_packet_names[type]; -} - -EXPORT_SYMBOL_GPL(dccp_packet_name); - -void dccp_destruct_common(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_tx_ccid = NULL; -} -EXPORT_SYMBOL_GPL(dccp_destruct_common); - -static void dccp_sk_destruct(struct sock *sk) -{ - dccp_destruct_common(sk); - inet_sock_destruct(sk); -} - -int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - - pr_warn_once("DCCP is deprecated and scheduled to be removed in 2025, " - "please contact the netdev mailing list\n"); - - icsk->icsk_rto = DCCP_TIMEOUT_INIT; - icsk->icsk_syn_retries = sysctl_dccp_request_retries; - sk->sk_state = DCCP_CLOSED; - sk->sk_write_space = dccp_write_space; - sk->sk_destruct = dccp_sk_destruct; - icsk->icsk_sync_mss = dccp_sync_mss; - dp->dccps_mss_cache = 536; - dp->dccps_rate_last = jiffies; - dp->dccps_role = DCCP_ROLE_UNDEFINED; - dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; - dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; - - dccp_init_xmit_timers(sk); - - INIT_LIST_HEAD(&dp->dccps_featneg); - /* control socket doesn't need feat nego */ - if (likely(ctl_sock_initialized)) - return dccp_feat_init(sk); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_init_sock); - -void dccp_destroy_sock(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - - __skb_queue_purge(&sk->sk_write_queue); - if (sk->sk_send_head != NULL) { - kfree_skb(sk->sk_send_head); - sk->sk_send_head = NULL; - } - - /* Clean up a referenced DCCP bind bucket. */ - if (inet_csk(sk)->icsk_bind_hash != NULL) - inet_put_port(sk); - - kfree(dp->dccps_service_list); - dp->dccps_service_list = NULL; - - if (dp->dccps_hc_rx_ackvec != NULL) { - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - } - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = NULL; - - /* clean up feature negotiation state */ - dccp_feat_list_purge(&dp->dccps_featneg); -} - -EXPORT_SYMBOL_GPL(dccp_destroy_sock); - -static inline int dccp_need_reset(int state) -{ - return state != DCCP_CLOSED && state != DCCP_LISTEN && - state != DCCP_REQUESTING; -} - -int dccp_disconnect(struct sock *sk, int flags) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_sock *inet = inet_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - const int old_state = sk->sk_state; - - if (old_state != DCCP_CLOSED) - dccp_set_state(sk, DCCP_CLOSED); - - /* - * This corresponds to the ABORT function of RFC793, sec. 3.8 - * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted". 
- */ - if (old_state == DCCP_LISTEN) { - inet_csk_listen_stop(sk); - } else if (dccp_need_reset(old_state)) { - dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); - sk->sk_err = ECONNRESET; - } else if (old_state == DCCP_REQUESTING) - sk->sk_err = ECONNRESET; - - dccp_clear_xmit_timers(sk); - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = NULL; - - __skb_queue_purge(&sk->sk_receive_queue); - __skb_queue_purge(&sk->sk_write_queue); - if (sk->sk_send_head != NULL) { - __kfree_skb(sk->sk_send_head); - sk->sk_send_head = NULL; - } - - inet->inet_dport = 0; - - inet_bhash2_reset_saddr(sk); - - sk->sk_shutdown = 0; - sock_reset_flag(sk, SOCK_DONE); - - icsk->icsk_backoff = 0; - inet_csk_delack_init(sk); - __sk_dst_reset(sk); - - WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); - - sk_error_report(sk); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_disconnect); - -/* - * Wait for a DCCP event. - * - * Note that we don't need to lock the socket, as the upper poll layers - * take care of normal races (between the test and the event) and we don't - * go look at any of the socket buffers directly. - */ -__poll_t dccp_poll(struct file *file, struct socket *sock, - poll_table *wait) -{ - struct sock *sk = sock->sk; - __poll_t mask; - u8 shutdown; - int state; - - sock_poll_wait(file, sock, wait); - - state = inet_sk_state_load(sk); - if (state == DCCP_LISTEN) - return inet_csk_listen_poll(sk); - - /* Socket is not locked. We are protected from async events - by poll logic and correct handling of state changes - made by another threads is impossible in any case. - */ - - mask = 0; - if (READ_ONCE(sk->sk_err)) - mask = EPOLLERR; - shutdown = READ_ONCE(sk->sk_shutdown); - - if (shutdown == SHUTDOWN_MASK || state == DCCP_CLOSED) - mask |= EPOLLHUP; - if (shutdown & RCV_SHUTDOWN) - mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - - /* Connected? */ - if ((1 << state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) { - if (atomic_read(&sk->sk_rmem_alloc) > 0) - mask |= EPOLLIN | EPOLLRDNORM; - - if (!(shutdown & SEND_SHUTDOWN)) { - if (sk_stream_is_writeable(sk)) { - mask |= EPOLLOUT | EPOLLWRNORM; - } else { /* send SIGIO later */ - sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); - set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - - /* Race breaker. If space is freed after - * wspace test but before the flags are set, - * IO signal will be lost. - */ - if (sk_stream_is_writeable(sk)) - mask |= EPOLLOUT | EPOLLWRNORM; - } - } - } - return mask; -} -EXPORT_SYMBOL_GPL(dccp_poll); - -int dccp_ioctl(struct sock *sk, int cmd, int *karg) -{ - int rc = -ENOTCONN; - - lock_sock(sk); - - if (sk->sk_state == DCCP_LISTEN) - goto out; - - switch (cmd) { - case SIOCOUTQ: { - *karg = sk_wmem_alloc_get(sk); - /* Using sk_wmem_alloc here because sk_wmem_queued is not used by DCCP and - * always 0, comparably to UDP. - */ - - rc = 0; - } - break; - case SIOCINQ: { - struct sk_buff *skb; - *karg = 0; - - skb = skb_peek(&sk->sk_receive_queue); - if (skb != NULL) { - /* - * We will only return the amount of this packet since - * that is all that will be read. 
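The SIOCINQ handling here is a deliberate departure from TCP: because one recvmsg() consumes exactly one packet, the ioctl reports only the head packet's length rather than the total number of queued bytes. From user space the query is ordinary; on Linux, SIOCINQ and FIONREAD name the same request:

    #include <stdio.h>
    #include <sys/ioctl.h>

    /* How many bytes the next read/recvmsg() on fd would return */
    static int next_read_len(int fd)
    {
        int n = 0;

        if (ioctl(fd, FIONREAD, &n) < 0)
            return -1;
        return n;
    }

    int main(void)
    {
        printf("readable on stdin: %d bytes\n", next_read_len(0));
        return 0;
    }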
- */ - *karg = skb->len; - } - rc = 0; - } - break; - default: - rc = -ENOIOCTLCMD; - break; - } -out: - release_sock(sk); - return rc; -} - -EXPORT_SYMBOL_GPL(dccp_ioctl); - -static int dccp_setsockopt_service(struct sock *sk, const __be32 service, - sockptr_t optval, unsigned int optlen) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_service_list *sl = NULL; - - if (service == DCCP_SERVICE_INVALID_VALUE || - optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32)) - return -EINVAL; - - if (optlen > sizeof(service)) { - sl = kmalloc(optlen, GFP_KERNEL); - if (sl == NULL) - return -ENOMEM; - - sl->dccpsl_nr = optlen / sizeof(u32) - 1; - if (copy_from_sockptr_offset(sl->dccpsl_list, optval, - sizeof(service), optlen - sizeof(service)) || - dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) { - kfree(sl); - return -EFAULT; - } - } - - lock_sock(sk); - dp->dccps_service = service; - - kfree(dp->dccps_service_list); - - dp->dccps_service_list = sl; - release_sock(sk); - return 0; -} - -static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) -{ - u8 *list, len; - int i, rc; - - if (cscov < 0 || cscov > 15) - return -EINVAL; - /* - * Populate a list of permissible values, in the range cscov...15. This - * is necessary since feature negotiation of single values only works if - * both sides incidentally choose the same value. Since the list starts - * lowest-value first, negotiation will pick the smallest shared value. - */ - if (cscov == 0) - return 0; - len = 16 - cscov; - - list = kmalloc(len, GFP_KERNEL); - if (list == NULL) - return -ENOBUFS; - - for (i = 0; i < len; i++) - list[i] = cscov++; - - rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); - - if (rc == 0) { - if (rx) - dccp_sk(sk)->dccps_pcrlen = cscov; - else - dccp_sk(sk)->dccps_pcslen = cscov; - } - kfree(list); - return rc; -} - -static int dccp_setsockopt_ccid(struct sock *sk, int type, - sockptr_t optval, unsigned int optlen) -{ - u8 *val; - int rc = 0; - - if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) - return -EINVAL; - - val = memdup_sockptr(optval, optlen); - if (IS_ERR(val)) - return PTR_ERR(val); - - lock_sock(sk); - if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); - - if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) - rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); - release_sock(sk); - - kfree(val); - return rc; -} - -static int do_dccp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen) -{ - struct dccp_sock *dp = dccp_sk(sk); - int val, err = 0; - - switch (optname) { - case DCCP_SOCKOPT_PACKET_SIZE: - DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_CHANGE_L: - case DCCP_SOCKOPT_CHANGE_R: - DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_CCID: - case DCCP_SOCKOPT_RX_CCID: - case DCCP_SOCKOPT_TX_CCID: - return dccp_setsockopt_ccid(sk, optname, optval, optlen); - } - - if (optlen < (int)sizeof(int)) - return -EINVAL; - - if (copy_from_sockptr(&val, optval, sizeof(int))) - return -EFAULT; - - if (optname == DCCP_SOCKOPT_SERVICE) - return dccp_setsockopt_service(sk, val, optval, optlen); - - lock_sock(sk); - switch (optname) { - case DCCP_SOCKOPT_SERVER_TIMEWAIT: - if (dp->dccps_role != DCCP_ROLE_SERVER) - err = -EOPNOTSUPP; - else - dp->dccps_server_timewait = (val != 0); - break; - case DCCP_SOCKOPT_SEND_CSCOV: - err = 
dccp_setsockopt_cscov(sk, val, false); - break; - case DCCP_SOCKOPT_RECV_CSCOV: - err = dccp_setsockopt_cscov(sk, val, true); - break; - case DCCP_SOCKOPT_QPOLICY_ID: - if (sk->sk_state != DCCP_CLOSED) - err = -EISCONN; - else if (val < 0 || val >= DCCPQ_POLICY_MAX) - err = -EINVAL; - else - dp->dccps_qpolicy = val; - break; - case DCCP_SOCKOPT_QPOLICY_TXQLEN: - if (val < 0) - err = -EINVAL; - else - dp->dccps_tx_qlen = val; - break; - default: - err = -ENOPROTOOPT; - break; - } - release_sock(sk); - - return err; -} - -int dccp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, - unsigned int optlen) -{ - if (level != SOL_DCCP) - return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, - optname, optval, - optlen); - return do_dccp_setsockopt(sk, level, optname, optval, optlen); -} - -EXPORT_SYMBOL_GPL(dccp_setsockopt); - -static int dccp_getsockopt_service(struct sock *sk, int len, - __be32 __user *optval, - int __user *optlen) -{ - const struct dccp_sock *dp = dccp_sk(sk); - const struct dccp_service_list *sl; - int err = -ENOENT, slen = 0, total_len = sizeof(u32); - - lock_sock(sk); - if ((sl = dp->dccps_service_list) != NULL) { - slen = sl->dccpsl_nr * sizeof(u32); - total_len += slen; - } - - err = -EINVAL; - if (total_len > len) - goto out; - - err = 0; - if (put_user(total_len, optlen) || - put_user(dp->dccps_service, optval) || - (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen))) - err = -EFAULT; -out: - release_sock(sk); - return err; -} - -static int do_dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - struct dccp_sock *dp; - int val, len; - - if (get_user(len, optlen)) - return -EFAULT; - - if (len < (int)sizeof(int)) - return -EINVAL; - - dp = dccp_sk(sk); - - switch (optname) { - case DCCP_SOCKOPT_PACKET_SIZE: - DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); - return 0; - case DCCP_SOCKOPT_SERVICE: - return dccp_getsockopt_service(sk, len, - (__be32 __user *)optval, optlen); - case DCCP_SOCKOPT_GET_CUR_MPS: - val = READ_ONCE(dp->dccps_mss_cache); - break; - case DCCP_SOCKOPT_AVAILABLE_CCIDS: - return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); - case DCCP_SOCKOPT_TX_CCID: - val = ccid_get_current_tx_ccid(dp); - if (val < 0) - return -ENOPROTOOPT; - break; - case DCCP_SOCKOPT_RX_CCID: - val = ccid_get_current_rx_ccid(dp); - if (val < 0) - return -ENOPROTOOPT; - break; - case DCCP_SOCKOPT_SERVER_TIMEWAIT: - val = dp->dccps_server_timewait; - break; - case DCCP_SOCKOPT_SEND_CSCOV: - val = dp->dccps_pcslen; - break; - case DCCP_SOCKOPT_RECV_CSCOV: - val = dp->dccps_pcrlen; - break; - case DCCP_SOCKOPT_QPOLICY_ID: - val = dp->dccps_qpolicy; - break; - case DCCP_SOCKOPT_QPOLICY_TXQLEN: - val = dp->dccps_tx_qlen; - break; - case 128 ... 191: - return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, - len, (u32 __user *)optval, optlen); - case 192 ... 
255: - return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname, - len, (u32 __user *)optval, optlen); - default: - return -ENOPROTOOPT; - } - - len = sizeof(val); - if (put_user(len, optlen) || copy_to_user(optval, &val, len)) - return -EFAULT; - - return 0; -} - -int dccp_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) -{ - if (level != SOL_DCCP) - return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, - optname, optval, - optlen); - return do_dccp_getsockopt(sk, level, optname, optval, optlen); -} - -EXPORT_SYMBOL_GPL(dccp_getsockopt); - -static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) -{ - struct cmsghdr *cmsg; - - /* - * Assign an (opaque) qpolicy priority value to skb->priority. - * - * We are overloading this skb field for use with the qpolicy subystem. - * The skb->priority is normally used for the SO_PRIORITY option, which - * is initialised from sk_priority. Since the assignment of sk_priority - * to skb->priority happens later (on layer 3), we overload this field - * for use with queueing priorities as long as the skb is on layer 4. - * The default priority value (if nothing is set) is 0. - */ - skb->priority = 0; - - for_each_cmsghdr(cmsg, msg) { - if (!CMSG_OK(msg, cmsg)) - return -EINVAL; - - if (cmsg->cmsg_level != SOL_DCCP) - continue; - - if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && - !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) - return -EINVAL; - - switch (cmsg->cmsg_type) { - case DCCP_SCM_PRIORITY: - if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) - return -EINVAL; - skb->priority = *(__u32 *)CMSG_DATA(cmsg); - break; - default: - return -EINVAL; - } - } - return 0; -} - -int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) -{ - const struct dccp_sock *dp = dccp_sk(sk); - const int flags = msg->msg_flags; - const int noblock = flags & MSG_DONTWAIT; - struct sk_buff *skb; - int rc, size; - long timeo; - - trace_dccp_probe(sk, len); - - if (len > READ_ONCE(dp->dccps_mss_cache)) - return -EMSGSIZE; - - lock_sock(sk); - - timeo = sock_sndtimeo(sk, noblock); - - /* - * We have to use sk_stream_wait_connect here to set sk_write_pending, - * so that the trick in dccp_rcv_request_sent_state_process. - */ - /* Wait for a connection to finish. */ - if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) - if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0) - goto out_release; - - size = sk->sk_prot->max_header + len; - release_sock(sk); - skb = sock_alloc_send_skb(sk, size, noblock, &rc); - lock_sock(sk); - if (skb == NULL) - goto out_release; - - if (dccp_qpolicy_full(sk)) { - rc = -EAGAIN; - goto out_discard; - } - - if (sk->sk_state == DCCP_CLOSED) { - rc = -ENOTCONN; - goto out_discard; - } - - /* We need to check dccps_mss_cache after socket is locked. */ - if (len > dp->dccps_mss_cache) { - rc = -EMSGSIZE; - goto out_discard; - } - - skb_reserve(skb, sk->sk_prot->max_header); - rc = memcpy_from_msg(skb_put(skb, len), msg, len); - if (rc != 0) - goto out_discard; - - rc = dccp_msghdr_parse(msg, skb); - if (rc != 0) - goto out_discard; - - dccp_qpolicy_push(sk, skb); - /* - * The xmit_timer is set if the TX CCID is rate-based and will expire - * when congestion control permits to release further packets into the - * network. Window-based CCIDs do not use this timer. - */ - if (!timer_pending(&dp->dccps_xmit_timer)) - dccp_write_xmit(sk); -out_release: - release_sock(sk); - return rc ? 
: len; -out_discard: - kfree_skb(skb); - goto out_release; -} - -EXPORT_SYMBOL_GPL(dccp_sendmsg); - -int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, - int *addr_len) -{ - const struct dccp_hdr *dh; - long timeo; - - lock_sock(sk); - - if (sk->sk_state == DCCP_LISTEN) { - len = -ENOTCONN; - goto out; - } - - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - - do { - struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - - if (skb == NULL) - goto verify_sock_status; - - dh = dccp_hdr(skb); - - switch (dh->dccph_type) { - case DCCP_PKT_DATA: - case DCCP_PKT_DATAACK: - goto found_ok_skb; - - case DCCP_PKT_CLOSE: - case DCCP_PKT_CLOSEREQ: - if (!(flags & MSG_PEEK)) - dccp_finish_passive_close(sk); - fallthrough; - case DCCP_PKT_RESET: - dccp_pr_debug("found fin (%s) ok!\n", - dccp_packet_name(dh->dccph_type)); - len = 0; - goto found_fin_ok; - default: - dccp_pr_debug("packet_type=%s\n", - dccp_packet_name(dh->dccph_type)); - sk_eat_skb(sk, skb); - } -verify_sock_status: - if (sock_flag(sk, SOCK_DONE)) { - len = 0; - break; - } - - if (sk->sk_err) { - len = sock_error(sk); - break; - } - - if (sk->sk_shutdown & RCV_SHUTDOWN) { - len = 0; - break; - } - - if (sk->sk_state == DCCP_CLOSED) { - if (!sock_flag(sk, SOCK_DONE)) { - /* This occurs when user tries to read - * from never connected socket. - */ - len = -ENOTCONN; - break; - } - len = 0; - break; - } - - if (!timeo) { - len = -EAGAIN; - break; - } - - if (signal_pending(current)) { - len = sock_intr_errno(timeo); - break; - } - - sk_wait_data(sk, &timeo, NULL); - continue; - found_ok_skb: - if (len > skb->len) - len = skb->len; - else if (len < skb->len) - msg->msg_flags |= MSG_TRUNC; - - if (skb_copy_datagram_msg(skb, 0, msg, len)) { - /* Exception. Bailout! */ - len = -EFAULT; - break; - } - if (flags & MSG_TRUNC) - len = skb->len; - found_fin_ok: - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); - break; - } while (1); -out: - release_sock(sk); - return len; -} - -EXPORT_SYMBOL_GPL(dccp_recvmsg); - -int inet_dccp_listen(struct socket *sock, int backlog) -{ - struct sock *sk = sock->sk; - unsigned char old_state; - int err; - - lock_sock(sk); - - err = -EINVAL; - if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP) - goto out; - - old_state = sk->sk_state; - if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) - goto out; - - WRITE_ONCE(sk->sk_max_ack_backlog, backlog); - /* Really, if the socket is already in listen state - * we can only allow the backlog to be adjusted. 
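inet_dccp_listen() here mirrors inet_listen(): on a socket already in LISTEN, a second listen() only rewrites sk_max_ack_backlog. A user-space sketch, assuming DCCP support is compiled into the running kernel (the constants are the standard Linux values):

    #include <stdio.h>
    #include <sys/socket.h>

    #ifndef SOCK_DCCP
    #define SOCK_DCCP 6
    #endif
    #ifndef IPPROTO_DCCP
    #define IPPROTO_DCCP 33
    #endif

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_DCCP, IPPROTO_DCCP);

        if (fd < 0) {
            perror("socket");   /* DCCP may be unavailable on this kernel */
            return 1;
        }
        /* The first call enters LISTEN; the second only bumps the backlog. */
        if (listen(fd, 8) == 0 && listen(fd, 128) == 0)
            puts("backlog raised from 8 to 128 without leaving LISTEN");
        return 0;
    }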
- */ - if (old_state != DCCP_LISTEN) { - struct dccp_sock *dp = dccp_sk(sk); - - dp->dccps_role = DCCP_ROLE_LISTEN; - - /* do not start to listen if feature negotiation setup fails */ - if (dccp_feat_finalise_settings(dp)) { - err = -EPROTO; - goto out; - } - - err = inet_csk_listen_start(sk); - if (err) - goto out; - } - err = 0; - -out: - release_sock(sk); - return err; -} - -EXPORT_SYMBOL_GPL(inet_dccp_listen); - -static void dccp_terminate_connection(struct sock *sk) -{ - u8 next_state = DCCP_CLOSED; - - switch (sk->sk_state) { - case DCCP_PASSIVE_CLOSE: - case DCCP_PASSIVE_CLOSEREQ: - dccp_finish_passive_close(sk); - break; - case DCCP_PARTOPEN: - dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk); - inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); - fallthrough; - case DCCP_OPEN: - dccp_send_close(sk, 1); - - if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER && - !dccp_sk(sk)->dccps_server_timewait) - next_state = DCCP_ACTIVE_CLOSEREQ; - else - next_state = DCCP_CLOSING; - fallthrough; - default: - dccp_set_state(sk, next_state); - } -} - -void dccp_close(struct sock *sk, long timeout) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb; - u32 data_was_unread = 0; - int state; - - lock_sock(sk); - - sk->sk_shutdown = SHUTDOWN_MASK; - - if (sk->sk_state == DCCP_LISTEN) { - dccp_set_state(sk, DCCP_CLOSED); - - /* Special case. */ - inet_csk_listen_stop(sk); - - goto adjudge_to_death; - } - - sk_stop_timer(sk, &dp->dccps_xmit_timer); - - /* - * We need to flush the recv. buffs. We do this only on the - * descriptor close, not protocol-sourced closes, because the - *reader process may not have drained the data yet! - */ - while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { - data_was_unread += skb->len; - __kfree_skb(skb); - } - - /* If socket has been already reset kill it. */ - if (sk->sk_state == DCCP_CLOSED) - goto adjudge_to_death; - - if (data_was_unread) { - /* Unread data was tossed, send an appropriate Reset Code */ - DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread); - dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); - dccp_set_state(sk, DCCP_CLOSED); - } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { - /* Check zero linger _after_ checking for unread data. */ - sk->sk_prot->disconnect(sk, 0); - } else if (sk->sk_state != DCCP_CLOSED) { - /* - * Normal connection termination. May need to wait if there are - * still packets in the TX queue that are delayed by the CCID. - */ - dccp_flush_write_queue(sk, &timeout); - dccp_terminate_connection(sk); - } - - /* - * Flush write queue. This may be necessary in several cases: - * - we have been closed by the peer but still have application data; - * - abortive termination (unread data or zero linger time), - * - normal termination but queue could not be flushed within time limit - */ - __skb_queue_purge(&sk->sk_write_queue); - - sk_stream_wait_close(sk, timeout); - -adjudge_to_death: - state = sk->sk_state; - sock_hold(sk); - sock_orphan(sk); - - /* - * It is the last release_sock in its life. It will remove backlog. - */ - release_sock(sk); - /* - * Now socket is owned by kernel and we acquire BH lock - * to finish close. No need to check for user refs. - */ - local_bh_disable(); - bh_lock_sock(sk); - WARN_ON(sock_owned_by_user(sk)); - - this_cpu_inc(dccp_orphan_count); - - /* Have we already been destroyed by a softirq or backlog? 
*/ - if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED) - goto out; - - if (sk->sk_state == DCCP_CLOSED) - inet_csk_destroy_sock(sk); - - /* Otherwise, socket is reprieved until protocol close. */ - -out: - bh_unlock_sock(sk); - local_bh_enable(); - sock_put(sk); -} - -EXPORT_SYMBOL_GPL(dccp_close); - -void dccp_shutdown(struct sock *sk, int how) -{ - dccp_pr_debug("called shutdown(%x)\n", how); -} - -EXPORT_SYMBOL_GPL(dccp_shutdown); - -static inline int __init dccp_mib_init(void) -{ - dccp_statistics = alloc_percpu(struct dccp_mib); - if (!dccp_statistics) - return -ENOMEM; - return 0; -} - -static inline void dccp_mib_exit(void) -{ - free_percpu(dccp_statistics); -} - -static int thash_entries; -module_param(thash_entries, int, 0444); -MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); - -#ifdef CONFIG_IP_DCCP_DEBUG -bool dccp_debug; -module_param(dccp_debug, bool, 0644); -MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); - -EXPORT_SYMBOL_GPL(dccp_debug); -#endif - -static int __init dccp_init(void) -{ - unsigned long goal; - unsigned long nr_pages = totalram_pages(); - int ehash_order, bhash_order, i; - int rc; - - BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > - sizeof_field(struct sk_buff, cb)); - rc = inet_hashinfo2_init_mod(&dccp_hashinfo); - if (rc) - goto out_fail; - rc = -ENOBUFS; - dccp_hashinfo.bind_bucket_cachep = - kmem_cache_create("dccp_bind_bucket", - sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); - if (!dccp_hashinfo.bind_bucket_cachep) - goto out_free_hashinfo2; - dccp_hashinfo.bind2_bucket_cachep = - kmem_cache_create("dccp_bind2_bucket", - sizeof(struct inet_bind2_bucket), 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); - if (!dccp_hashinfo.bind2_bucket_cachep) - goto out_free_bind_bucket_cachep; - - /* - * Size and allocate the main established and bind bucket - * hash tables. - * - * The methodology is similar to that of the buffer cache. 
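The sizing heuristic that follows in dccp_init() budgets a fraction of total memory for the established-hash table, converts that into a page order, and rounds the bucket count down to a power of two so lookups can use a mask. Reproduced as plain arithmetic, with an assumed 16-byte bucket for illustration:

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    int main(void)
    {
        unsigned long nr_pages = 4UL * 1024 * 1024;  /* pretend 16 GiB of RAM */
        unsigned long goal, hash_size;
        int order;

        /* 1/512th of memory on large machines, 1/2048th on small ones */
        if (nr_pages >= 128 * 1024)
            goal = nr_pages >> (21 - PAGE_SHIFT);
        else
            goal = nr_pages >> (23 - PAGE_SHIFT);

        for (order = 0; (1UL << order) < goal; order++)
            ;                                        /* smallest fitting order */

        hash_size = (1UL << order) * PAGE_SIZE / 16; /* buckets per allocation */
        while (hash_size & (hash_size - 1))
            hash_size--;                             /* power-of-two round-down */

        printf("order=%d buckets=%lu mask=%#lx\n",
               order, hash_size, hash_size - 1);
        return 0;
    }

The decrement loop looks costly, but when the bucket size is itself a power of two, as it typically is, hash_size starts out as one and the loop never iterates.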
- */ - if (nr_pages >= (128 * 1024)) - goal = nr_pages >> (21 - PAGE_SHIFT); - else - goal = nr_pages >> (23 - PAGE_SHIFT); - - if (thash_entries) - goal = (thash_entries * - sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT; - for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++) - ; - do { - unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE / - sizeof(struct inet_ehash_bucket); - - while (hash_size & (hash_size - 1)) - hash_size--; - dccp_hashinfo.ehash_mask = hash_size - 1; - dccp_hashinfo.ehash = (struct inet_ehash_bucket *) - __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order); - } while (!dccp_hashinfo.ehash && --ehash_order > 0); - - if (!dccp_hashinfo.ehash) { - DCCP_CRIT("Failed to allocate DCCP established hash table"); - goto out_free_bind2_bucket_cachep; - } - - for (i = 0; i <= dccp_hashinfo.ehash_mask; i++) - INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i); - - if (inet_ehash_locks_alloc(&dccp_hashinfo)) - goto out_free_dccp_ehash; - - bhash_order = ehash_order; - - do { - dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE / - sizeof(struct inet_bind_hashbucket); - if ((dccp_hashinfo.bhash_size > (64 * 1024)) && - bhash_order > 0) - continue; - dccp_hashinfo.bhash = (struct inet_bind_hashbucket *) - __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order); - } while (!dccp_hashinfo.bhash && --bhash_order >= 0); - - if (!dccp_hashinfo.bhash) { - DCCP_CRIT("Failed to allocate DCCP bind hash table"); - goto out_free_dccp_locks; - } - - dccp_hashinfo.bhash2 = (struct inet_bind_hashbucket *) - __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order); - - if (!dccp_hashinfo.bhash2) { - DCCP_CRIT("Failed to allocate DCCP bind2 hash table"); - goto out_free_dccp_bhash; - } - - for (i = 0; i < dccp_hashinfo.bhash_size; i++) { - spin_lock_init(&dccp_hashinfo.bhash[i].lock); - INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain); - spin_lock_init(&dccp_hashinfo.bhash2[i].lock); - INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain); - } - - dccp_hashinfo.pernet = false; - - rc = dccp_mib_init(); - if (rc) - goto out_free_dccp_bhash2; - - rc = dccp_ackvec_init(); - if (rc) - goto out_free_dccp_mib; - - rc = dccp_sysctl_init(); - if (rc) - goto out_ackvec_exit; - - rc = ccid_initialize_builtins(); - if (rc) - goto out_sysctl_exit; - - dccp_timestamping_init(); - - return 0; - -out_sysctl_exit: - dccp_sysctl_exit(); -out_ackvec_exit: - dccp_ackvec_exit(); -out_free_dccp_mib: - dccp_mib_exit(); -out_free_dccp_bhash2: - free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); -out_free_dccp_bhash: - free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); -out_free_dccp_locks: - inet_ehash_locks_free(&dccp_hashinfo); -out_free_dccp_ehash: - free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order); -out_free_bind2_bucket_cachep: - kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep); -out_free_bind_bucket_cachep: - kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); -out_free_hashinfo2: - inet_hashinfo2_free_mod(&dccp_hashinfo); -out_fail: - dccp_hashinfo.bhash = NULL; - dccp_hashinfo.bhash2 = NULL; - dccp_hashinfo.ehash = NULL; - dccp_hashinfo.bind_bucket_cachep = NULL; - dccp_hashinfo.bind2_bucket_cachep = NULL; - return rc; -} - -static void __exit dccp_fini(void) -{ - int bhash_order = get_order(dccp_hashinfo.bhash_size * - sizeof(struct inet_bind_hashbucket)); - - ccid_cleanup_builtins(); - dccp_mib_exit(); - free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order); - free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order); - 
free_pages((unsigned long)dccp_hashinfo.ehash, - get_order((dccp_hashinfo.ehash_mask + 1) * - sizeof(struct inet_ehash_bucket))); - inet_ehash_locks_free(&dccp_hashinfo); - kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); - dccp_ackvec_exit(); - dccp_sysctl_exit(); - inet_hashinfo2_free_mod(&dccp_hashinfo); -} - -module_init(dccp_init); -module_exit(dccp_fini); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>"); -MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c deleted file mode 100644 index 5ba204ec0aca..000000000000 --- a/net/dccp/qpolicy.c +++ /dev/null @@ -1,136 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/qpolicy.c - * - * Policy-based packet dequeueing interface for DCCP. - * - * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net> - */ -#include "dccp.h" - -/* - * Simple Dequeueing Policy: - * If tx_qlen is different from 0, enqueue up to tx_qlen elements. - */ -static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) -{ - skb_queue_tail(&sk->sk_write_queue, skb); -} - -static bool qpolicy_simple_full(struct sock *sk) -{ - return dccp_sk(sk)->dccps_tx_qlen && - sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; -} - -static struct sk_buff *qpolicy_simple_top(struct sock *sk) -{ - return skb_peek(&sk->sk_write_queue); -} - -/* - * Priority-based Dequeueing Policy: - * If tx_qlen is different from 0 and the queue has reached its upper bound - * of tx_qlen elements, replace older packets lowest-priority-first. - */ -static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) -{ - struct sk_buff *skb, *best = NULL; - - skb_queue_walk(&sk->sk_write_queue, skb) - if (best == NULL || skb->priority > best->priority) - best = skb; - return best; -} - -static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) -{ - struct sk_buff *skb, *worst = NULL; - - skb_queue_walk(&sk->sk_write_queue, skb) - if (worst == NULL || skb->priority < worst->priority) - worst = skb; - return worst; -} - -static bool qpolicy_prio_full(struct sock *sk) -{ - if (qpolicy_simple_full(sk)) - dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); - return false; -} - -/** - * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface - * @push: add a new @skb to the write queue - * @full: indicates that no more packets will be admitted - * @top: peeks at whatever the queueing policy defines as its `top' - * @params: parameter passed to policy operation - */ -struct dccp_qpolicy_operations { - void (*push) (struct sock *sk, struct sk_buff *skb); - bool (*full) (struct sock *sk); - struct sk_buff* (*top) (struct sock *sk); - __be32 params; -}; - -static struct dccp_qpolicy_operations qpol_table[DCCPQ_POLICY_MAX] = { - [DCCPQ_POLICY_SIMPLE] = { - .push = qpolicy_simple_push, - .full = qpolicy_simple_full, - .top = qpolicy_simple_top, - .params = 0, - }, - [DCCPQ_POLICY_PRIO] = { - .push = qpolicy_simple_push, - .full = qpolicy_prio_full, - .top = qpolicy_prio_best_skb, - .params = DCCP_SCM_PRIORITY, - }, -}; - -/* - * Externally visible interface - */ -void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) -{ - qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); -} - -bool dccp_qpolicy_full(struct sock *sk) -{ - return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); -} - -void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) -{ - if (skb != NULL) { - skb_unlink(skb, &sk->sk_write_queue); - kfree_skb(skb); - } -} - -struct 
sk_buff *dccp_qpolicy_top(struct sock *sk) -{ - return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); -} - -struct sk_buff *dccp_qpolicy_pop(struct sock *sk) -{ - struct sk_buff *skb = dccp_qpolicy_top(sk); - - if (skb != NULL) { - /* Clear any skb fields that we used internally */ - skb->priority = 0; - skb_unlink(skb, &sk->sk_write_queue); - } - return skb; -} - -bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param) -{ - /* check if exactly one bit is set */ - if (!param || (param & (param - 1))) - return false; - return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param; -} diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c deleted file mode 100644 index 3fc474d6e57d..000000000000 --- a/net/dccp/sysctl.c +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/dccp/sysctl.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@mandriva.com> - */ - -#include <linux/mm.h> -#include <linux/sysctl.h> -#include "dccp.h" -#include "feat.h" - -#ifndef CONFIG_SYSCTL -#error This file should not be compiled without CONFIG_SYSCTL defined -#endif - -/* Boundary values */ -static int u8_max = 0xFF; -static unsigned long seqw_min = DCCPF_SEQ_WMIN, - seqw_max = 0xFFFFFFFF; /* maximum on 32 bit */ - -static struct ctl_table dccp_default_table[] = { - { - .procname = "seq_window", - .data = &sysctl_dccp_sequence_window, - .maxlen = sizeof(sysctl_dccp_sequence_window), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ - .extra2 = &seqw_max, - }, - { - .procname = "rx_ccid", - .data = &sysctl_dccp_rx_ccid, - .maxlen = sizeof(sysctl_dccp_rx_ccid), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, /* RFC 4340, 10. */ - }, - { - .procname = "tx_ccid", - .data = &sysctl_dccp_tx_ccid, - .maxlen = sizeof(sysctl_dccp_tx_ccid), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, /* RFC 4340, 10. */ - }, - { - .procname = "request_retries", - .data = &sysctl_dccp_request_retries, - .maxlen = sizeof(sysctl_dccp_request_retries), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = &u8_max, - }, - { - .procname = "retries1", - .data = &sysctl_dccp_retries1, - .maxlen = sizeof(sysctl_dccp_retries1), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, - }, - { - .procname = "retries2", - .data = &sysctl_dccp_retries2, - .maxlen = sizeof(sysctl_dccp_retries2), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = &u8_max, - }, - { - .procname = "tx_qlen", - .data = &sysctl_dccp_tx_qlen, - .maxlen = sizeof(sysctl_dccp_tx_qlen), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - }, - { - .procname = "sync_ratelimit", - .data = &sysctl_dccp_sync_ratelimit, - .maxlen = sizeof(sysctl_dccp_sync_ratelimit), - .mode = 0644, - .proc_handler = proc_dointvec_ms_jiffies, - }, -}; - -static struct ctl_table_header *dccp_table_header; - -int __init dccp_sysctl_init(void) -{ - dccp_table_header = register_net_sysctl(&init_net, "net/dccp/default", - dccp_default_table); - - return dccp_table_header != NULL ? 
0 : -ENOMEM; -} - -void dccp_sysctl_exit(void) -{ - if (dccp_table_header != NULL) { - unregister_net_sysctl_table(dccp_table_header); - dccp_table_header = NULL; - } -} diff --git a/net/dccp/timer.c b/net/dccp/timer.c deleted file mode 100644 index a4cfb47b60e5..000000000000 --- a/net/dccp/timer.c +++ /dev/null @@ -1,272 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/dccp/timer.c - * - * An implementation of the DCCP protocol - * Arnaldo Carvalho de Melo <acme@conectiva.com.br> - */ - -#include <linux/dccp.h> -#include <linux/skbuff.h> -#include <linux/export.h> - -#include "dccp.h" - -/* sysctl variables governing numbers of retransmission attempts */ -int sysctl_dccp_request_retries __read_mostly = TCP_SYN_RETRIES; -int sysctl_dccp_retries1 __read_mostly = TCP_RETR1; -int sysctl_dccp_retries2 __read_mostly = TCP_RETR2; - -static void dccp_write_err(struct sock *sk) -{ - sk->sk_err = READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT; - sk_error_report(sk); - - dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); - dccp_done(sk); - __DCCP_INC_STATS(DCCP_MIB_ABORTONTIMEOUT); -} - -/* A write timeout has occurred. Process the after effects. */ -static int dccp_write_timeout(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - int retry_until; - - if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) { - if (icsk->icsk_retransmits != 0) - dst_negative_advice(sk); - retry_until = icsk->icsk_syn_retries ? - : sysctl_dccp_request_retries; - } else { - if (icsk->icsk_retransmits >= sysctl_dccp_retries1) { - /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu - black hole detection. :-( - - It is place to make it. It is not made. I do not want - to make it. It is disguisting. It does not work in any - case. Let me to cite the same draft, which requires for - us to implement this: - - "The one security concern raised by this memo is that ICMP black holes - are often caused by over-zealous security administrators who block - all ICMP messages. It is vitally important that those who design and - deploy security systems understand the impact of strict filtering on - upper-layer protocols. The safest web site in the world is worthless - if most TCP implementations cannot transfer data from it. It would - be far nicer to have all of the black holes fixed rather than fixing - all of the TCP implementations." - - Golden words :-). - */ - - dst_negative_advice(sk); - } - - retry_until = sysctl_dccp_retries2; - /* - * FIXME: see tcp_write_timout and tcp_out_of_resources - */ - } - - if (icsk->icsk_retransmits >= retry_until) { - /* Has it gone just too far? */ - dccp_write_err(sk); - return 1; - } - return 0; -} - -/* - * The DCCP retransmit timer. - */ -static void dccp_retransmit_timer(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - /* - * More than 4MSL (8 minutes) has passed, a RESET(aborted) was - * sent, no need to retransmit, this sock is dead. - */ - if (dccp_write_timeout(sk)) - return; - - /* - * We want to know the number of packets retransmitted, not the - * total number of retransmissions of clones of original packets. - */ - if (icsk->icsk_retransmits == 0) - __DCCP_INC_STATS(DCCP_MIB_TIMEOUTS); - - if (dccp_retransmit_skb(sk) != 0) { - /* - * Retransmission failed because of local congestion, - * do not backoff. 
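dccp_retransmit_timer() here distinguishes real timeouts from local congestion: a local send failure retries on a short probe interval without consuming a retransmit attempt, while a genuine timeout doubles the RTO, clamped at DCCP_RTO_MAX (64 seconds assumed here, matching TCP's convention). The backoff schedule in isolation:

    #include <stdio.h>

    #define RTO_MAX_MS 64000U          /* assumed DCCP_RTO_MAX */

    int main(void)
    {
        unsigned int rto = 1000;       /* hypothetical 1 s initial RTO */

        for (int attempt = 1; attempt <= 8; attempt++) {
            printf("retransmit %d after %u ms\n", attempt, rto);
            /* icsk_rto = min(icsk_rto << 1, DCCP_RTO_MAX) */
            rto = (rto << 1) < RTO_MAX_MS ? rto << 1 : RTO_MAX_MS;
        }
        return 0;
    }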
- */ - if (--icsk->icsk_retransmits == 0) - icsk->icsk_retransmits = 1; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - min(icsk->icsk_rto, - TCP_RESOURCE_PROBE_INTERVAL), - DCCP_RTO_MAX); - return; - } - - icsk->icsk_backoff++; - - icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, - DCCP_RTO_MAX); - if (icsk->icsk_retransmits > sysctl_dccp_retries1) - __sk_dst_reset(sk); -} - -static void dccp_write_timer(struct timer_list *t) -{ - struct inet_connection_sock *icsk = - from_timer(icsk, t, icsk_retransmit_timer); - struct sock *sk = &icsk->icsk_inet.sk; - int event = 0; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later */ - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, - jiffies + (HZ / 20)); - goto out; - } - - if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending) - goto out; - - if (time_after(icsk->icsk_timeout, jiffies)) { - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, - icsk->icsk_timeout); - goto out; - } - - event = icsk->icsk_pending; - icsk->icsk_pending = 0; - - switch (event) { - case ICSK_TIME_RETRANS: - dccp_retransmit_timer(sk); - break; - } -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -static void dccp_keepalive_timer(struct timer_list *t) -{ - struct sock *sk = from_timer(sk, t, sk_timer); - - pr_err("dccp should not use a keepalive timer !\n"); - sock_put(sk); -} - -/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */ -static void dccp_delack_timer(struct timer_list *t) -{ - struct inet_connection_sock *icsk = - from_timer(icsk, t, icsk_delack_timer); - struct sock *sk = &icsk->icsk_inet.sk; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) { - /* Try again later. */ - __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); - sk_reset_timer(sk, &icsk->icsk_delack_timer, - jiffies + TCP_DELACK_MIN); - goto out; - } - - if (sk->sk_state == DCCP_CLOSED || - !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) - goto out; - if (time_after(icsk->icsk_ack.timeout, jiffies)) { - sk_reset_timer(sk, &icsk->icsk_delack_timer, - icsk->icsk_ack.timeout); - goto out; - } - - icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; - - if (inet_csk_ack_scheduled(sk)) { - if (!inet_csk_in_pingpong_mode(sk)) { - /* Delayed ACK missed: inflate ATO. */ - icsk->icsk_ack.ato = min_t(u32, icsk->icsk_ack.ato << 1, - icsk->icsk_rto); - } else { - /* Delayed ACK missed: leave pingpong mode and - * deflate ATO. - */ - inet_csk_exit_pingpong_mode(sk); - icsk->icsk_ack.ato = TCP_ATO_MIN; - } - dccp_send_ack(sk); - __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS); - } -out: - bh_unlock_sock(sk); - sock_put(sk); -} - -/** - * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface - * @t: pointer to the tasklet associated with this handler - * - * See the comments above %ccid_dequeueing_decision for supported modes. 
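dccp_write_xmitlet(), shown next, implements the usual bottom-half courtesy: if a process context currently owns the socket, do not contend for it; re-arm a one-jiffy timer and try again. The same shape in portable user-space code, with a mutex standing in for the socket lock:

    #include <pthread.h>
    #include <stdio.h>

    struct conn {
        pthread_mutex_t lock;   /* stands in for the socket lock */
        int timer_armed;
    };

    static void xmit_bottom_half(struct conn *c)
    {
        if (pthread_mutex_trylock(&c->lock) == 0) {
            puts("lock free: transmitting");
            pthread_mutex_unlock(&c->lock);
        } else {
            c->timer_armed = 1; /* i.e. sk_reset_timer(..., jiffies + 1) */
        }
    }

    int main(void)
    {
        struct conn c = { PTHREAD_MUTEX_INITIALIZER, 0 };

        xmit_bottom_half(&c);             /* transmits */
        pthread_mutex_lock(&c.lock);
        xmit_bottom_half(&c);             /* owner present: defers */
        pthread_mutex_unlock(&c.lock);
        printf("timer_armed=%d\n", c.timer_armed);
        return 0;
    }

Compile with -pthread.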
- */ -static void dccp_write_xmitlet(struct tasklet_struct *t) -{ - struct dccp_sock *dp = from_tasklet(dp, t, dccps_xmitlet); - struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk; - - bh_lock_sock(sk); - if (sock_owned_by_user(sk)) - sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); - else - dccp_write_xmit(sk); - bh_unlock_sock(sk); - sock_put(sk); -} - -static void dccp_write_xmit_timer(struct timer_list *t) -{ - struct dccp_sock *dp = from_timer(dp, t, dccps_xmit_timer); - - dccp_write_xmitlet(&dp->dccps_xmitlet); -} - -void dccp_init_xmit_timers(struct sock *sk) -{ - struct dccp_sock *dp = dccp_sk(sk); - - tasklet_setup(&dp->dccps_xmitlet, dccp_write_xmitlet); - timer_setup(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 0); - inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, - &dccp_keepalive_timer); -} - -static ktime_t dccp_timestamp_seed; -/** - * dccp_timestamp - 10s of microseconds time source - * Returns the number of 10s of microseconds since loading DCCP. This is native - * DCCP time difference format (RFC 4340, sec. 13). - * Please note: This will wrap around about circa every 11.9 hours. - */ -u32 dccp_timestamp(void) -{ - u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); - - do_div(delta, 10); - return delta; -} -EXPORT_SYMBOL_GPL(dccp_timestamp); - -void __init dccp_timestamping_init(void) -{ - dccp_timestamp_seed = ktime_get_real(); -} diff --git a/net/dccp/trace.h b/net/dccp/trace.h deleted file mode 100644 index 5a43b3508c7f..000000000000 --- a/net/dccp/trace.h +++ /dev/null @@ -1,82 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM dccp - -#if !defined(_TRACE_DCCP_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_DCCP_H - -#include <net/sock.h> -#include "dccp.h" -#include "ccids/ccid3.h" -#include <linux/tracepoint.h> -#include <trace/events/net_probe_common.h> - -TRACE_EVENT(dccp_probe, - - TP_PROTO(struct sock *sk, size_t size), - - TP_ARGS(sk, size), - - TP_STRUCT__entry( - /* sockaddr_in6 is always bigger than sockaddr_in */ - __array(__u8, saddr, sizeof(struct sockaddr_in6)) - __array(__u8, daddr, sizeof(struct sockaddr_in6)) - __field(__u16, sport) - __field(__u16, dport) - __field(__u16, size) - __field(__u16, tx_s) - __field(__u32, tx_rtt) - __field(__u32, tx_p) - __field(__u32, tx_x_calc) - __field(__u64, tx_x_recv) - __field(__u64, tx_x) - __field(__u32, tx_t_ipi) - ), - - TP_fast_assign( - const struct inet_sock *inet = inet_sk(sk); - struct ccid3_hc_tx_sock *hc = NULL; - - if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3) - hc = ccid3_hc_tx_sk(sk); - - memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); - memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); - - TP_STORE_ADDR_PORTS(__entry, inet, sk); - - /* For filtering use */ - __entry->sport = ntohs(inet->inet_sport); - __entry->dport = ntohs(inet->inet_dport); - - __entry->size = size; - if (hc) { - __entry->tx_s = hc->tx_s; - __entry->tx_rtt = hc->tx_rtt; - __entry->tx_p = hc->tx_p; - __entry->tx_x_calc = hc->tx_x_calc; - __entry->tx_x_recv = hc->tx_x_recv >> 6; - __entry->tx_x = hc->tx_x >> 6; - __entry->tx_t_ipi = hc->tx_t_ipi; - } else { - __entry->tx_s = 0; - memset_startat(__entry, 0, tx_rtt); - } - ), - - TP_printk("src=%pISpc dest=%pISpc size=%d tx_s=%d tx_rtt=%d " - "tx_p=%d tx_x_calc=%u tx_x_recv=%llu tx_x=%llu tx_t_ipi=%d", - __entry->saddr, __entry->daddr, __entry->size, - __entry->tx_s, __entry->tx_rtt, __entry->tx_p, - __entry->tx_x_calc, __entry->tx_x_recv, 
__entry->tx_x, - __entry->tx_t_ipi) -); - -#endif /* _TRACE_TCP_H */ - -/* This part must be outside protection */ -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . -#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace -#include <trace/define_trace.h> diff --git a/net/devlink/core.c b/net/devlink/core.c index f49cd83f1955..7203c39532fc 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -117,7 +117,7 @@ static struct devlink_rel *devlink_rel_alloc(void) err = xa_alloc_cyclic(&devlink_rels, &rel->index, rel, xa_limit_32b, &next, GFP_KERNEL); - if (err) { + if (err < 0) { kfree(rel); return ERR_PTR(err); } diff --git a/net/devlink/dev.c b/net/devlink/dev.c index d6e3db300acb..02602704bdea 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -775,7 +775,7 @@ static int devlink_info_version_put(struct devlink_info_req *req, int attr, req->version_cb(version_name, version_type, req->version_cb_priv); - if (!req->msg) + if (!req->msg || !*version_value) return 0; nest = nla_nest_start_noflag(req->msg, attr); diff --git a/net/devlink/health.c b/net/devlink/health.c index b8d3084e6fe0..b3ce8ecbb7fb 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -735,7 +735,7 @@ static void devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name) return; } - item->nla_type = NLA_NUL_STRING; + item->nla_type = DEVLINK_VAR_ATTR_TYPE_NUL_STRING; item->len = strlen(name) + 1; item->attrtype = DEVLINK_ATTR_FMSG_OBJ_NAME; memcpy(&item->value, name, item->len); @@ -822,32 +822,37 @@ static void devlink_fmsg_put_value(struct devlink_fmsg *fmsg, static void devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value) { devlink_fmsg_err_if_binary(fmsg); - devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG); + devlink_fmsg_put_value(fmsg, &value, sizeof(value), + DEVLINK_VAR_ATTR_TYPE_FLAG); } static void devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value) { devlink_fmsg_err_if_binary(fmsg); - devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8); + devlink_fmsg_put_value(fmsg, &value, sizeof(value), + DEVLINK_VAR_ATTR_TYPE_U8); } void devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value) { devlink_fmsg_err_if_binary(fmsg); - devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32); + devlink_fmsg_put_value(fmsg, &value, sizeof(value), + DEVLINK_VAR_ATTR_TYPE_U32); } EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put); static void devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value) { devlink_fmsg_err_if_binary(fmsg); - devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64); + devlink_fmsg_put_value(fmsg, &value, sizeof(value), + DEVLINK_VAR_ATTR_TYPE_U64); } void devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) { devlink_fmsg_err_if_binary(fmsg); - devlink_fmsg_put_value(fmsg, value, strlen(value) + 1, NLA_NUL_STRING); + devlink_fmsg_put_value(fmsg, value, strlen(value) + 1, + DEVLINK_VAR_ATTR_TYPE_NUL_STRING); } EXPORT_SYMBOL_GPL(devlink_fmsg_string_put); @@ -857,7 +862,8 @@ void devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, if (!fmsg->err && !fmsg->putting_binary) fmsg->err = -EINVAL; - devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY); + devlink_fmsg_put_value(fmsg, value, value_len, + DEVLINK_VAR_ATTR_TYPE_BINARY); } EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put); @@ -928,43 +934,26 @@ void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put); static int -devlink_fmsg_item_fill_type(struct devlink_fmsg_item *msg, struct 
sk_buff *skb) -{ - switch (msg->nla_type) { - case NLA_FLAG: - case NLA_U8: - case NLA_U32: - case NLA_U64: - case NLA_NUL_STRING: - case NLA_BINARY: - return nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE, - msg->nla_type); - default: - return -EINVAL; - } -} - -static int devlink_fmsg_item_fill_data(struct devlink_fmsg_item *msg, struct sk_buff *skb) { int attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; u8 tmp; switch (msg->nla_type) { - case NLA_FLAG: + case DEVLINK_VAR_ATTR_TYPE_FLAG: /* Always provide flag data, regardless of its value */ tmp = *(bool *)msg->value; return nla_put_u8(skb, attrtype, tmp); - case NLA_U8: + case DEVLINK_VAR_ATTR_TYPE_U8: return nla_put_u8(skb, attrtype, *(u8 *)msg->value); - case NLA_U32: + case DEVLINK_VAR_ATTR_TYPE_U32: return nla_put_u32(skb, attrtype, *(u32 *)msg->value); - case NLA_U64: + case DEVLINK_VAR_ATTR_TYPE_U64: return devlink_nl_put_u64(skb, attrtype, *(u64 *)msg->value); - case NLA_NUL_STRING: + case DEVLINK_VAR_ATTR_TYPE_NUL_STRING: return nla_put_string(skb, attrtype, (char *)&msg->value); - case NLA_BINARY: + case DEVLINK_VAR_ATTR_TYPE_BINARY: return nla_put(skb, attrtype, msg->len, (void *)&msg->value); default: return -EINVAL; @@ -998,7 +987,8 @@ devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb, err = nla_put_flag(skb, item->attrtype); break; case DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA: - err = devlink_fmsg_item_fill_type(item, skb); + err = nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE, + item->nla_type); if (err) break; err = devlink_fmsg_item_fill_data(item, skb); @@ -1238,3 +1228,70 @@ int devlink_nl_health_reporter_test_doit(struct sk_buff *skb, return reporter->ops->test(reporter, info->extack); } + +/** + * devlink_fmsg_dump_skb - Dump sk_buffer structure + * @fmsg: devlink formatted message pointer + * @skb: pointer to skb + * + * Dump diagnostic information about sk_buff structure, like headroom, length, + * tailroom, MAC, etc. + */ +void devlink_fmsg_dump_skb(struct devlink_fmsg *fmsg, const struct sk_buff *skb) +{ + struct skb_shared_info *sh = skb_shinfo(skb); + struct sock *sk = skb->sk; + bool has_mac, has_trans; + + has_mac = skb_mac_header_was_set(skb); + has_trans = skb_transport_header_was_set(skb); + + devlink_fmsg_pair_nest_start(fmsg, "skb"); + devlink_fmsg_obj_nest_start(fmsg); + devlink_fmsg_put(fmsg, "actual len", skb->len); + devlink_fmsg_put(fmsg, "head len", skb_headlen(skb)); + devlink_fmsg_put(fmsg, "data len", skb->data_len); + devlink_fmsg_put(fmsg, "tail len", skb_tailroom(skb)); + devlink_fmsg_put(fmsg, "MAC", has_mac ? skb->mac_header : -1); + devlink_fmsg_put(fmsg, "MAC len", + has_mac ? skb_mac_header_len(skb) : -1); + devlink_fmsg_put(fmsg, "network hdr", skb->network_header); + devlink_fmsg_put(fmsg, "network hdr len", + has_trans ? skb_network_header_len(skb) : -1); + devlink_fmsg_put(fmsg, "transport hdr", + has_trans ? 
skb->transport_header : -1); + devlink_fmsg_put(fmsg, "csum", (__force u32)skb->csum); + devlink_fmsg_put(fmsg, "csum_ip_summed", (u8)skb->ip_summed); + devlink_fmsg_put(fmsg, "csum_complete_sw", !!skb->csum_complete_sw); + devlink_fmsg_put(fmsg, "csum_valid", !!skb->csum_valid); + devlink_fmsg_put(fmsg, "csum_level", (u8)skb->csum_level); + devlink_fmsg_put(fmsg, "sw_hash", !!skb->sw_hash); + devlink_fmsg_put(fmsg, "l4_hash", !!skb->l4_hash); + devlink_fmsg_put(fmsg, "proto", ntohs(skb->protocol)); + devlink_fmsg_put(fmsg, "pkt_type", (u8)skb->pkt_type); + devlink_fmsg_put(fmsg, "iif", skb->skb_iif); + + if (sk) { + devlink_fmsg_pair_nest_start(fmsg, "sk"); + devlink_fmsg_obj_nest_start(fmsg); + devlink_fmsg_put(fmsg, "family", sk->sk_family); + devlink_fmsg_put(fmsg, "type", sk->sk_type); + devlink_fmsg_put(fmsg, "proto", sk->sk_protocol); + devlink_fmsg_obj_nest_end(fmsg); + devlink_fmsg_pair_nest_end(fmsg); + } + + devlink_fmsg_obj_nest_end(fmsg); + devlink_fmsg_pair_nest_end(fmsg); + + devlink_fmsg_pair_nest_start(fmsg, "shinfo"); + devlink_fmsg_obj_nest_start(fmsg); + devlink_fmsg_put(fmsg, "tx_flags", sh->tx_flags); + devlink_fmsg_put(fmsg, "nr_frags", sh->nr_frags); + devlink_fmsg_put(fmsg, "gso_size", sh->gso_size); + devlink_fmsg_put(fmsg, "gso_type", sh->gso_type); + devlink_fmsg_put(fmsg, "gso_segs", sh->gso_segs); + devlink_fmsg_obj_nest_end(fmsg); + devlink_fmsg_pair_nest_end(fmsg); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_dump_skb); diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index f9786d51f68f..e340d955cf3b 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -10,6 +10,33 @@ #include <uapi/linux/devlink.h> +/* Sparse enums validation callbacks */ +static int +devlink_attr_param_type_validate(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + switch (nla_get_u8(attr)) { + case DEVLINK_VAR_ATTR_TYPE_U8: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_U16: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_U32: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_U64: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_STRING: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_FLAG: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_NUL_STRING: + fallthrough; + case DEVLINK_VAR_ATTR_TYPE_BINARY: + return 0; + } + NL_SET_ERR_MSG_ATTR(extack, attr, "invalid enum value"); + return -EINVAL; +} + /* Common nested types */ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1] = { [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY, }, @@ -273,7 +300,7 @@ static const struct nla_policy devlink_param_set_nl_policy[DEVLINK_ATTR_PARAM_VA [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING, }, - [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8, }, + [DEVLINK_ATTR_PARAM_TYPE] = NLA_POLICY_VALIDATE_FN(NLA_U8, &devlink_attr_param_type_validate), [DEVLINK_ATTR_PARAM_VALUE_CMODE] = NLA_POLICY_MAX(NLA_U8, 2), }; diff --git a/net/devlink/param.c b/net/devlink/param.c index dcf0d1ccebba..b29abf8d3ed4 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -167,25 +167,6 @@ static int devlink_param_set(struct devlink *devlink, } static int -devlink_param_type_to_nla_type(enum devlink_param_type param_type) -{ - switch (param_type) { - case DEVLINK_PARAM_TYPE_U8: - return NLA_U8; - case DEVLINK_PARAM_TYPE_U16: - return NLA_U16; - case DEVLINK_PARAM_TYPE_U32: - return NLA_U32; - case DEVLINK_PARAM_TYPE_STRING: - return NLA_STRING; - 
case DEVLINK_PARAM_TYPE_BOOL: - return NLA_FLAG; - default: - return -EINVAL; - } -} - -static int devlink_nl_param_value_fill_one(struct sk_buff *msg, enum devlink_param_type type, enum devlink_param_cmode cmode, @@ -247,7 +228,6 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, struct devlink_param_gset_ctx ctx; struct nlattr *param_values_list; struct nlattr *param_attr; - int nla_type; void *hdr; int err; int i; @@ -293,11 +273,7 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, goto param_nest_cancel; if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC)) goto param_nest_cancel; - - nla_type = devlink_param_type_to_nla_type(param->type); - if (nla_type < 0) - goto param_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type)) + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, param->type)) goto param_nest_cancel; param_values_list = nla_nest_start_noflag(msg, @@ -419,25 +395,7 @@ devlink_param_type_get_from_info(struct genl_info *info, if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_TYPE)) return -EINVAL; - switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) { - case NLA_U8: - *param_type = DEVLINK_PARAM_TYPE_U8; - break; - case NLA_U16: - *param_type = DEVLINK_PARAM_TYPE_U16; - break; - case NLA_U32: - *param_type = DEVLINK_PARAM_TYPE_U32; - break; - case NLA_STRING: - *param_type = DEVLINK_PARAM_TYPE_STRING; - break; - case NLA_FLAG: - *param_type = DEVLINK_PARAM_TYPE_BOOL; - break; - default: - return -EINVAL; - } + *param_type = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE]); return 0; } diff --git a/net/devlink/port.c b/net/devlink/port.c index be9158b4453c..939081a0e615 100644 --- a/net/devlink/port.c +++ b/net/devlink/port.c @@ -1376,7 +1376,7 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_set); * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance - * @pf: associated PF for the devlink port instance + * @pf: associated PCI function number for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller, @@ -1402,8 +1402,9 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set); * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance - * @pf: associated PF for the devlink port instance - * @vf: associated VF of a PF for the devlink port instance + * @pf: associated PCI function number for the devlink port instance + * @vf: associated PCI VF number of a PF for the devlink port instance; + * VF number starts from 0 for the first PCI virtual function * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller, @@ -1430,8 +1431,8 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); * * @devlink_port: devlink port * @controller: associated controller number for the devlink port instance - * @pf: associated PF for the devlink port instance - * @sf: associated SF of a PF for the devlink port instance + * @pf: associated PCI function number for the devlink port instance + * @sf: associated SF number of a PF for the devlink port instance * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, diff --git a/net/dsa/conduit.c b/net/dsa/conduit.c index 3dfdb3cb47dc..4ae255cfb23f 100644 --- 
a/net/dsa/conduit.c +++ b/net/dsa/conduit.c @@ -10,6 +10,7 @@ #include <linux/netdevice.h> #include <linux/netlink.h> #include <net/dsa.h> +#include <net/netdev_lock.h> #include "conduit.h" #include "dsa.h" @@ -26,7 +27,9 @@ static int dsa_conduit_get_regs_len(struct net_device *dev) int len; if (ops->get_regs_len) { + netdev_lock_ops(dev); len = ops->get_regs_len(dev); + netdev_unlock_ops(dev); if (len < 0) return len; ret += len; @@ -57,11 +60,15 @@ static void dsa_conduit_get_regs(struct net_device *dev, int len; if (ops->get_regs_len && ops->get_regs) { + netdev_lock_ops(dev); len = ops->get_regs_len(dev); - if (len < 0) + if (len < 0) { + netdev_unlock_ops(dev); return; + } regs->len = len; ops->get_regs(dev, regs, data); + netdev_unlock_ops(dev); data += regs->len; } @@ -91,8 +98,10 @@ static void dsa_conduit_get_ethtool_stats(struct net_device *dev, int count = 0; if (ops->get_sset_count && ops->get_ethtool_stats) { + netdev_lock_ops(dev); count = ops->get_sset_count(dev, ETH_SS_STATS); ops->get_ethtool_stats(dev, stats, data); + netdev_unlock_ops(dev); } if (ds->ops->get_ethtool_stats) @@ -114,8 +123,10 @@ static void dsa_conduit_get_ethtool_phy_stats(struct net_device *dev, if (count >= 0) phy_ethtool_get_stats(dev->phydev, stats, data); } else if (ops->get_sset_count && ops->get_ethtool_phy_stats) { + netdev_lock_ops(dev); count = ops->get_sset_count(dev, ETH_SS_PHY_STATS); ops->get_ethtool_phy_stats(dev, stats, data); + netdev_unlock_ops(dev); } if (count < 0) @@ -132,11 +143,13 @@ static int dsa_conduit_get_sset_count(struct net_device *dev, int sset) struct dsa_switch *ds = cpu_dp->ds; int count = 0; + netdev_lock_ops(dev); if (sset == ETH_SS_PHY_STATS && dev->phydev && !ops->get_ethtool_phy_stats) count = phy_ethtool_get_sset_count(dev->phydev); else if (ops->get_sset_count) count = ops->get_sset_count(dev, sset); + netdev_unlock_ops(dev); if (count < 0) count = 0; @@ -163,6 +176,7 @@ static void dsa_conduit_get_strings(struct net_device *dev, uint32_t stringset, /* We do not want to be NULL-terminated, since this is a prefix */ pfx[sizeof(pfx) - 1] = '_'; + netdev_lock_ops(dev); if (stringset == ETH_SS_PHY_STATS && dev->phydev && !ops->get_ethtool_phy_stats) { mcount = phy_ethtool_get_sset_count(dev->phydev); @@ -176,6 +190,7 @@ static void dsa_conduit_get_strings(struct net_device *dev, uint32_t stringset, mcount = 0; ops->get_strings(dev, stringset, data); } + netdev_unlock_ops(dev); if (ds->ops->get_strings) { ndata = data + mcount * len; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 5a7c0e565a89..436a7e1b412a 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -862,6 +862,16 @@ static void dsa_tree_teardown_lags(struct dsa_switch_tree *dst) kfree(dst->lags); } +static void dsa_tree_teardown_routing_table(struct dsa_switch_tree *dst) +{ + struct dsa_link *dl, *next; + + list_for_each_entry_safe(dl, next, &dst->rtable, list) { + list_del(&dl->list); + kfree(dl); + } +} + static int dsa_tree_setup(struct dsa_switch_tree *dst) { bool complete; @@ -879,7 +889,7 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst) err = dsa_tree_setup_cpu_ports(dst); if (err) - return err; + goto teardown_rtable; err = dsa_tree_setup_switches(dst); if (err) @@ -911,14 +921,14 @@ teardown_switches: dsa_tree_teardown_switches(dst); teardown_cpu_ports: dsa_tree_teardown_cpu_ports(dst); +teardown_rtable: + dsa_tree_teardown_routing_table(dst); return err; } static void dsa_tree_teardown(struct dsa_switch_tree *dst) { - struct dsa_link *dl, *next; - if (!dst->setup) return; @@ -932,10 +942,7 
@@ static void dsa_tree_teardown(struct dsa_switch_tree *dst) dsa_tree_teardown_cpu_ports(dst); - list_for_each_entry_safe(dl, next, &dst->rtable, list) { - list_del(&dl->list); - kfree(dl); - } + dsa_tree_teardown_routing_table(dst); pr_info("DSA: tree %d torn down\n", dst->index); @@ -1367,7 +1374,7 @@ static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn) return dsa_switch_parse_ports_of(ds, dn); } -static int dev_is_class(struct device *dev, void *class) +static int dev_is_class(struct device *dev, const void *class) { if (dev->class != NULL && !strcmp(dev->class->name, class)) return 1; @@ -1478,12 +1485,44 @@ static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) static void dsa_switch_release_ports(struct dsa_switch *ds) { + struct dsa_mac_addr *a, *tmp; struct dsa_port *dp, *next; + struct dsa_vlan *v, *n; dsa_switch_for_each_port_safe(dp, next, ds) { - WARN_ON(!list_empty(&dp->fdbs)); - WARN_ON(!list_empty(&dp->mdbs)); - WARN_ON(!list_empty(&dp->vlans)); + /* These are either entries that upper layers lost track of + * (probably due to bugs), or installed through interfaces + * where one does not necessarily have to remove them, like + * ndo_dflt_fdb_add(). + */ + list_for_each_entry_safe(a, tmp, &dp->fdbs, list) { + dev_info(ds->dev, + "Cleaning up unicast address %pM vid %u from port %d\n", + a->addr, a->vid, dp->index); + list_del(&a->list); + kfree(a); + } + + list_for_each_entry_safe(a, tmp, &dp->mdbs, list) { + dev_info(ds->dev, + "Cleaning up multicast address %pM vid %u from port %d\n", + a->addr, a->vid, dp->index); + list_del(&a->list); + kfree(a); + } + + /* These are entries that upper layers have lost track of, + * probably due to bugs, but also due to dsa_port_do_vlan_del() + * having failed and the VLAN entry still lingering on. + */ + list_for_each_entry_safe(v, n, &dp->vlans, list) { + dev_info(ds->dev, + "Cleaning up vid %u from port %d\n", + v->vid, dp->index); + list_del(&v->list); + kfree(v); + } + list_del(&dp->list); kfree(dp); } diff --git a/net/dsa/port.c b/net/dsa/port.c index ee0aaec4c8e0..082573ae6864 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -116,19 +116,15 @@ static bool dsa_port_can_configure_learning(struct dsa_port *dp) bool dsa_port_supports_hwtstamp(struct dsa_port *dp) { + struct kernel_hwtstamp_config config = {}; struct dsa_switch *ds = dp->ds; - struct ifreq ifr = {}; int err; if (!ds->ops->port_hwtstamp_get || !ds->ops->port_hwtstamp_set) return false; - /* "See through" shim implementations of the "get" method. - * Since we can't cook up a complete ioctl request structure, this will - * fail in copy_to_user() with -EFAULT, which hopefully is enough to - * detect a valid implementation. - */ - err = ds->ops->port_hwtstamp_get(ds, dp->index, &ifr); + /* "See through" shim implementations of the "get" method. */ + err = ds->ops->port_hwtstamp_get(ds, dp->index, &config); return err != -EOPNOTSUPP; } @@ -1575,6 +1571,22 @@ void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, cpu_dp->tag_ops = tag_ops; } +/* dsa_supports_eee - indicate that EEE is supported + * @ds: pointer to &struct dsa_switch + * @port: port index + * + * A default implementation for the .support_eee() DSA operations member, + * which drivers can use to indicate that they support EEE on all of their + * user ports. 
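+ * A driver opts in by pointing its dsa_switch_ops at this helper; a minimal
+ * sketch (the "foo" names are illustrative, not from this patch):
+ *
+ *	static const struct dsa_switch_ops foo_switch_ops = {
+ *		.get_tag_protocol	= foo_get_tag_protocol,
+ *		.support_eee		= dsa_supports_eee,
+ *	};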
+ * + * Returns: true + */ +bool dsa_supports_eee(struct dsa_switch *ds, int port) +{ + return true; +} +EXPORT_SYMBOL_GPL(dsa_supports_eee); + static void dsa_port_phylink_mac_config(struct phylink_config *config, unsigned int mode, const struct phylink_link_state *state) diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 3ee53e28ec2e..53e03fd8071b 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -197,7 +197,7 @@ static int dsa_port_do_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid) err = ds->ops->tag_8021q_vlan_del(ds, port, vid); if (err) { - refcount_inc(&v->refcount); + refcount_set(&v->refcount, 1); return err; } diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 8c3c068728e5..fe75821623a4 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -257,7 +257,7 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, int source_port; u8 *brcm_tag; - if (unlikely(!pskb_may_pull(skb, BRCM_LEG_PORT_ID))) + if (unlikely(!pskb_may_pull(skb, BRCM_LEG_TAG_LEN + VLAN_HLEN))) return NULL; brcm_tag = dsa_etype_header_pos_rx(skb); diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 281bbac5539d..0b7564b53790 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -66,7 +66,7 @@ static int ksz_connect(struct dsa_switch *ds) if (!priv) return -ENOMEM; - xmit_worker = kthread_create_worker(0, "dsa%d:%d_xmit", + xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit", ds->dst->index, ds->index); if (IS_ERR(xmit_worker)) { ret = PTR_ERR(xmit_worker); @@ -140,7 +140,12 @@ static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev) static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev) { - u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; + u8 *tag; + + if (skb_linearize(skb)) + return NULL; + + tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; return ksz_common_rcv(skb, dev, tag[0] & KSZ8795_TAIL_TAG_EG_PORT_M, KSZ_EGRESS_TAG_LEN); @@ -311,10 +316,16 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev) { - /* Tag decoding */ - u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; - unsigned int port = tag[0] & KSZ9477_TAIL_TAG_EG_PORT_M; unsigned int len = KSZ_EGRESS_TAG_LEN; + unsigned int port; + u8 *tag; + + if (skb_linearize(skb)) + return NULL; + + /* Tag decoding */ + tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; + port = tag[0] & KSZ9477_TAIL_TAG_EG_PORT_M; /* Extra 4-bytes PTP timestamp */ if (tag[0] & KSZ9477_PTP_TAG_INDICATION) { diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 11ea8cfd6266..3929584791e4 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -110,7 +110,7 @@ static int ocelot_connect(struct dsa_switch *ds) if (!priv) return -ENOMEM; - priv->xmit_worker = kthread_create_worker(0, "felix_xmit"); + priv->xmit_worker = kthread_run_worker(0, "felix_xmit"); if (IS_ERR(priv->xmit_worker)) { err = PTR_ERR(priv->xmit_worker); kfree(priv); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 3e902af7eea6..02adec693811 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -707,7 +707,7 @@ static int sja1105_connect(struct dsa_switch *ds) spin_lock_init(&priv->meta_lock); - xmit_worker = kthread_create_worker(0, "dsa%d:%d_xmit", + xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit", ds->dst->index, ds->index); if (IS_ERR(xmit_worker)) { err = PTR_ERR(xmit_worker); diff --git a/net/dsa/user.c b/net/dsa/user.c index 
06c30a9e29ff..e9334520c54a 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -515,12 +515,13 @@ dsa_user_port_fdb_do_dump(const unsigned char *addr, u16 vid, bool is_static, void *data) { struct dsa_user_dump_ctx *dump = data; + struct ndo_fdb_dump_context *ctx = (void *)dump->cb->ctx; u32 portid = NETLINK_CB(dump->cb->skb).portid; u32 seq = dump->cb->nlh->nlmsg_seq; struct nlmsghdr *nlh; struct ndmsg *ndm; - if (dump->idx < dump->cb->args[2]) + if (dump->idx < ctx->fdb_idx) goto skip; nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH, @@ -577,20 +578,6 @@ dsa_user_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, static int dsa_user_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_user_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->dp->ds; - int port = p->dp->index; - - /* Pass through to switch driver if it supports timestamping */ - switch (cmd) { - case SIOCGHWTSTAMP: - if (ds->ops->port_hwtstamp_get) - return ds->ops->port_hwtstamp_get(ds, port, ifr); - break; - case SIOCSHWTSTAMP: - if (ds->ops->port_hwtstamp_set) - return ds->ops->port_hwtstamp_set(ds, port, ifr); - break; - } return phylink_mii_ioctl(p->dp->pl, ifr, cmd); } @@ -896,7 +883,7 @@ static void dsa_skb_tx_timestamp(struct dsa_user_priv *p, { struct dsa_switch *ds = p->dp->ds; - if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) + if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NOBPF)) return; if (!ds->ops->port_txtstamp) @@ -1149,6 +1136,16 @@ dsa_user_get_rmon_stats(struct net_device *dev, ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges); } +static void dsa_user_get_ts_stats(struct net_device *dev, + struct ethtool_ts_stats *ts_stats) +{ + struct dsa_port *dp = dsa_user_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->get_ts_stats) + ds->ops->get_ts_stats(ds, dp->index, ts_stats); +} + static void dsa_user_net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) { @@ -1228,16 +1225,29 @@ static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e) struct dsa_switch *ds = dp->ds; int ret; - /* Port's PHY and MAC both need to be EEE capable */ - if (!dev->phydev || !dp->pl) - return -ENODEV; - - if (!ds->ops->set_mac_eee) + /* Check whether the switch supports EEE */ + if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index)) return -EOPNOTSUPP; - ret = ds->ops->set_mac_eee(ds, dp->index, e); - if (ret) - return ret; + /* If the port is using phylink managed EEE, then an unimplemented + * set_mac_eee() is permissible. 
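+ * (phylink_mac_implements_lpi() reports whether the driver's phylink_mac_ops
+ * provide the LPI operations, in which case phylink itself manages the MAC's
+ * EEE/LPI state and no switch-level hook is required).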
+ */ + if (!phylink_mac_implements_lpi(ds->phylink_mac_ops)) { + /* Port's PHY and MAC both need to be EEE capable */ + if (!dev->phydev) + return -ENODEV; + + if (!ds->ops->set_mac_eee) + return -EOPNOTSUPP; + + ret = ds->ops->set_mac_eee(ds, dp->index, e); + if (ret) + return ret; + } else if (ds->ops->set_mac_eee) { + ret = ds->ops->set_mac_eee(ds, dp->index, e); + if (ret) + return ret; + } return phylink_ethtool_set_eee(dp->pl, e); } @@ -1246,18 +1256,14 @@ static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; - int ret; - - /* Port's PHY and MAC both need to be EEE capable */ - if (!dev->phydev || !dp->pl) - return -ENODEV; - if (!ds->ops->get_mac_eee) + /* Check whether the switch supports EEE */ + if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index)) return -EOPNOTSUPP; - ret = ds->ops->get_mac_eee(ds, dp->index, e); - if (ret) - return ret; + /* Port's PHY and MAC both need to be EEE capable */ + if (!dev->phydev) + return -ENODEV; return phylink_ethtool_get_eee(dp->pl, e); } @@ -2500,6 +2506,7 @@ static const struct ethtool_ops dsa_user_ethtool_ops = { .get_eth_mac_stats = dsa_user_get_eth_mac_stats, .get_eth_ctrl_stats = dsa_user_get_eth_ctrl_stats, .get_rmon_stats = dsa_user_get_rmon_stats, + .get_ts_stats = dsa_user_get_ts_stats, .set_wol = dsa_user_set_wol, .get_wol = dsa_user_get_wol, .set_eee = dsa_user_set_eee, @@ -2553,6 +2560,31 @@ static int dsa_user_fill_forward_path(struct net_device_path_ctx *ctx, return 0; } +static int dsa_user_hwtstamp_get(struct net_device *dev, + struct kernel_hwtstamp_config *cfg) +{ + struct dsa_port *dp = dsa_user_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->port_hwtstamp_get) + return -EOPNOTSUPP; + + return ds->ops->port_hwtstamp_get(ds, dp->index, cfg); +} + +static int dsa_user_hwtstamp_set(struct net_device *dev, + struct kernel_hwtstamp_config *cfg, + struct netlink_ext_ack *extack) +{ + struct dsa_port *dp = dsa_user_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->port_hwtstamp_set) + return -EOPNOTSUPP; + + return ds->ops->port_hwtstamp_set(ds, dp->index, cfg, extack); +} + static const struct net_device_ops dsa_user_netdev_ops = { .ndo_open = dsa_user_open, .ndo_stop = dsa_user_close, @@ -2574,6 +2606,8 @@ static const struct net_device_ops dsa_user_netdev_ops = { .ndo_vlan_rx_kill_vid = dsa_user_vlan_rx_kill_vid, .ndo_change_mtu = dsa_user_change_mtu, .ndo_fill_forward_path = dsa_user_fill_forward_path, + .ndo_hwtstamp_get = dsa_user_hwtstamp_get, + .ndo_hwtstamp_set = dsa_user_hwtstamp_set, }; static const struct device_type dsa_type = { diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 9b540644ba31..a1490c4afe6b 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -9,4 +9,4 @@ ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ tunnels.o fec.o eeprom.o stats.o phc_vclocks.o mm.o \ module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o mm.o \ - phy.o + phy.o tsconfig.o diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c index f22051f33868..0364b8fb577b 100644 --- a/net/ethtool/cabletest.c +++ b/net/ethtool/cabletest.c @@ -2,6 +2,7 @@ #include <linux/phy.h> #include <linux/ethtool_netlink.h> +#include <net/netdev_lock.h> #include "netlink.h" #include "common.h" @@ -72,23 +73,24 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) dev = req_info.dev; 
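+	/* Cable test ops must run under both RTNL and the per-netdev
+	 * instance lock.
+	 */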
rtnl_lock(); - phydev = ethnl_req_get_phydev(&req_info, - tb[ETHTOOL_A_CABLE_TEST_HEADER], + netdev_lock_ops(dev); + phydev = ethnl_req_get_phydev(&req_info, tb, + ETHTOOL_A_CABLE_TEST_HEADER, info->extack); if (IS_ERR_OR_NULL(phydev)) { ret = -EOPNOTSUPP; - goto out_rtnl; + goto out_unlock; } ops = ethtool_phy_ops; if (!ops || !ops->start_cable_test) { ret = -EOPNOTSUPP; - goto out_rtnl; + goto out_unlock; } ret = ethnl_ops_begin(dev); if (ret < 0) - goto out_rtnl; + goto out_unlock; ret = ops->start_cable_test(phydev, info->extack); @@ -97,7 +99,8 @@ int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) if (!ret) ethnl_cable_test_started(phydev, ETHTOOL_MSG_CABLE_TEST_NTF); -out_rtnl: +out_unlock: + netdev_unlock_ops(dev); rtnl_unlock(); ethnl_parse_header_dev_put(&req_info); return ret; @@ -339,23 +342,24 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) goto out_dev_put; rtnl_lock(); - phydev = ethnl_req_get_phydev(&req_info, - tb[ETHTOOL_A_CABLE_TEST_TDR_HEADER], + netdev_lock_ops(dev); + phydev = ethnl_req_get_phydev(&req_info, tb, + ETHTOOL_A_CABLE_TEST_TDR_HEADER, info->extack); if (IS_ERR_OR_NULL(phydev)) { ret = -EOPNOTSUPP; - goto out_rtnl; + goto out_unlock; } ops = ethtool_phy_ops; if (!ops || !ops->start_cable_test_tdr) { ret = -EOPNOTSUPP; - goto out_rtnl; + goto out_unlock; } ret = ethnl_ops_begin(dev); if (ret < 0) - goto out_rtnl; + goto out_unlock; ret = ops->start_cable_test_tdr(phydev, info->extack, &cfg); @@ -365,7 +369,8 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) ethnl_cable_test_started(phydev, ETHTOOL_MSG_CABLE_TEST_TDR_NTF); -out_rtnl: +out_unlock: + netdev_unlock_ops(dev); rtnl_unlock(); out_dev_put: ethnl_parse_header_dev_put(&req_info); diff --git a/net/ethtool/cmis.h b/net/ethtool/cmis.h index 1e790413db0e..4a9a946cabf0 100644 --- a/net/ethtool/cmis.h +++ b/net/ethtool/cmis.h @@ -101,7 +101,6 @@ struct ethtool_cmis_cdb_rpl { }; u32 ethtool_cmis_get_max_lpl_size(u8 num_of_byte_octs); -u32 ethtool_cmis_get_max_epl_size(u8 num_of_byte_octs); void ethtool_cmis_cdb_compose_args(struct ethtool_cmis_cdb_cmd_args *args, enum ethtool_cmis_cdb_cmd_id cmd, u8 *lpl, diff --git a/net/ethtool/cmis_cdb.c b/net/ethtool/cmis_cdb.c index d159dc121bde..3057576bc81e 100644 --- a/net/ethtool/cmis_cdb.c +++ b/net/ethtool/cmis_cdb.c @@ -16,15 +16,6 @@ u32 ethtool_cmis_get_max_lpl_size(u8 num_of_byte_octs) return 8 * (1 + min_t(u8, num_of_byte_octs, 15)); } -/* For accessing the EPL field on page 9Fh, the allowable length extension is - * min(i, 255) byte octets where i specifies the allowable additional number of - * byte octets in a READ or a WRITE. 
- */ -u32 ethtool_cmis_get_max_epl_size(u8 num_of_byte_octs) -{ - return 8 * (1 + min_t(u8, num_of_byte_octs, 255)); -} - void ethtool_cmis_cdb_compose_args(struct ethtool_cmis_cdb_cmd_args *args, enum ethtool_cmis_cdb_cmd_id cmd, u8 *lpl, u8 lpl_len, u8 *epl, u16 epl_len, @@ -33,19 +24,16 @@ void ethtool_cmis_cdb_compose_args(struct ethtool_cmis_cdb_cmd_args *args, { args->req.id = cpu_to_be16(cmd); args->req.lpl_len = lpl_len; - if (lpl) { + if (lpl) memcpy(args->req.payload, lpl, args->req.lpl_len); - args->read_write_len_ext = - ethtool_cmis_get_max_lpl_size(read_write_len_ext); - } if (epl) { args->req.epl_len = cpu_to_be16(epl_len); args->req.epl = epl; - args->read_write_len_ext = - ethtool_cmis_get_max_epl_size(read_write_len_ext); } args->max_duration = max_duration; + args->read_write_len_ext = + ethtool_cmis_get_max_lpl_size(read_write_len_ext); args->msleep_pre_rpl = msleep_pre_rpl; args->rpl_exp_len = rpl_exp_len; args->flags = flags; @@ -363,7 +351,7 @@ ethtool_cmis_module_poll(struct net_device *dev, struct netlink_ext_ack extack = {}; int err; - ethtool_cmis_page_init(&page_data, 0, offset, sizeof(rpl)); + ethtool_cmis_page_init(&page_data, 0, offset, sizeof(*rpl)); page_data.data = (u8 *)rpl; err = ops->get_module_eeprom_by_page(dev, &page_data, &extack); diff --git a/net/ethtool/cmis_fw_update.c b/net/ethtool/cmis_fw_update.c index 48aef6220f00..df5f344209c4 100644 --- a/net/ethtool/cmis_fw_update.c +++ b/net/ethtool/cmis_fw_update.c @@ -2,6 +2,7 @@ #include <linux/ethtool.h> #include <linux/firmware.h> +#include <net/netdev_lock.h> #include "common.h" #include "module_fw.h" @@ -418,8 +419,13 @@ cmis_fw_update_commit_image(struct ethtool_cmis_cdb *cdb, static int cmis_fw_update_reset(struct net_device *dev) { __u32 reset_data = ETH_RESET_PHY; + int ret; - return dev->ethtool_ops->reset(dev, &reset_data); + netdev_lock_ops(dev); + ret = dev->ethtool_ops->reset(dev, &reset_data); + netdev_unlock_ops(dev); + + return ret; } void diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 05ce4f8080b3..eb253e0fd61b 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -5,9 +5,13 @@ #include <linux/phy.h> #include <linux/rtnetlink.h> #include <linux/ptp_clock_kernel.h> +#include <linux/phy_link_topology.h> +#include <net/netdev_queues.h> #include "netlink.h" #include "common.h" +#include "../core/dev.h" + const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_SG_BIT] = "tx-scatter-gather", @@ -32,6 +36,7 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust", [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation", + [NETIF_F_GSO_ACCECN_BIT] = "tx-tcp-accecn-segmentation", [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation", [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", @@ -210,6 +215,24 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(10, T1S, Half), __DEFINE_LINK_MODE_NAME(10, T1S_P2MP, Half), __DEFINE_LINK_MODE_NAME(10, T1BRR, Full), + __DEFINE_LINK_MODE_NAME(200000, CR, Full), + __DEFINE_LINK_MODE_NAME(200000, KR, Full), + __DEFINE_LINK_MODE_NAME(200000, DR, Full), + __DEFINE_LINK_MODE_NAME(200000, DR_2, Full), + __DEFINE_LINK_MODE_NAME(200000, SR, Full), + __DEFINE_LINK_MODE_NAME(200000, VR, Full), + __DEFINE_LINK_MODE_NAME(400000, CR2, Full), + __DEFINE_LINK_MODE_NAME(400000, KR2, Full), + __DEFINE_LINK_MODE_NAME(400000, DR2, Full), + 
__DEFINE_LINK_MODE_NAME(400000, DR2_2, Full), + __DEFINE_LINK_MODE_NAME(400000, SR2, Full), + __DEFINE_LINK_MODE_NAME(400000, VR2, Full), + __DEFINE_LINK_MODE_NAME(800000, CR4, Full), + __DEFINE_LINK_MODE_NAME(800000, KR4, Full), + __DEFINE_LINK_MODE_NAME(800000, DR4, Full), + __DEFINE_LINK_MODE_NAME(800000, DR4_2, Full), + __DEFINE_LINK_MODE_NAME(800000, SR4, Full), + __DEFINE_LINK_MODE_NAME(800000, VR4, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -218,8 +241,11 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); #define __LINK_MODE_LANES_CR4 4 #define __LINK_MODE_LANES_CR8 8 #define __LINK_MODE_LANES_DR 1 +#define __LINK_MODE_LANES_DR_2 1 #define __LINK_MODE_LANES_DR2 2 +#define __LINK_MODE_LANES_DR2_2 2 #define __LINK_MODE_LANES_DR4 4 +#define __LINK_MODE_LANES_DR4_2 4 #define __LINK_MODE_LANES_DR8 8 #define __LINK_MODE_LANES_KR 1 #define __LINK_MODE_LANES_KR2 2 @@ -248,6 +274,9 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); #define __LINK_MODE_LANES_T1L 1 #define __LINK_MODE_LANES_T1S 1 #define __LINK_MODE_LANES_T1S_P2MP 1 +#define __LINK_MODE_LANES_VR 1 +#define __LINK_MODE_LANES_VR2 2 +#define __LINK_MODE_LANES_VR4 4 #define __LINK_MODE_LANES_VR8 8 #define __LINK_MODE_LANES_DR8_2 8 #define __LINK_MODE_LANES_T1BRR 1 @@ -375,8 +404,27 @@ const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(10, T1S, Half), __DEFINE_LINK_MODE_PARAMS(10, T1S_P2MP, Half), __DEFINE_LINK_MODE_PARAMS(10, T1BRR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR_2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, VR, Full), + __DEFINE_LINK_MODE_PARAMS(400000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR2_2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, VR2, Full), + __DEFINE_LINK_MODE_PARAMS(800000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(800000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(800000, DR4, Full), + __DEFINE_LINK_MODE_PARAMS(800000, DR4_2, Full), + __DEFINE_LINK_MODE_PARAMS(800000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(800000, VR4, Full), }; static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS); +EXPORT_SYMBOL_GPL(link_mode_params); const char netif_msg_class_names[][ETH_GSTRING_LEN] = { [NETIF_MSG_DRV_BIT] = "drv", @@ -428,6 +476,7 @@ const char sof_timestamping_names[][ETH_GSTRING_LEN] = { [const_ilog2(SOF_TIMESTAMPING_BIND_PHC)] = "bind-phc", [const_ilog2(SOF_TIMESTAMPING_OPT_ID_TCP)] = "option-id-tcp", [const_ilog2(SOF_TIMESTAMPING_OPT_RX_FILTER)] = "option-rx-filter", + [const_ilog2(SOF_TIMESTAMPING_TX_COMPLETION)] = "tx-completion", }; static_assert(ARRAY_SIZE(sof_timestamping_names) == __SOF_TIMESTAMPING_CNT); @@ -459,6 +508,11 @@ const char ts_rx_filter_names[][ETH_GSTRING_LEN] = { }; static_assert(ARRAY_SIZE(ts_rx_filter_names) == __HWTSTAMP_FILTER_CNT); +const char ts_flags_names[][ETH_GSTRING_LEN] = { + [const_ilog2(HWTSTAMP_FLAG_BONDED_PHC_INDEX)] = "bonded-phc-index", +}; +static_assert(ARRAY_SIZE(ts_flags_names) == __HWTSTAMP_FLAG_CNT); + const char udp_tunnel_type_names[][ETH_GSTRING_LEN] = { [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN] = "vxlan", [ETHTOOL_UDP_TUNNEL_TYPE_GENEVE] = "geneve", @@ -763,20 +817,123 @@ int 
ethtool_check_ops(const struct ethtool_ops *ops) return 0; } -int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info) +void ethtool_ringparam_get_cfg(struct net_device *dev, + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kparam, + struct netlink_ext_ack *extack) { - const struct ethtool_ops *ops = dev->ethtool_ops; - struct phy_device *phydev = dev->phydev; - int err = 0; + memset(param, 0, sizeof(*param)); + memset(kparam, 0, sizeof(*kparam)); + param->cmd = ETHTOOL_GRINGPARAM; + dev->ethtool_ops->get_ringparam(dev, param, kparam, extack); + + /* Driver gives us current state, we want to return current config */ + kparam->tcp_data_split = dev->cfg->hds_config; + kparam->hds_thresh = dev->cfg->hds_thresh; +} + +static void ethtool_init_tsinfo(struct kernel_ethtool_ts_info *info) +{ memset(info, 0, sizeof(*info)); info->cmd = ETHTOOL_GET_TS_INFO; info->phc_index = -1; +} + +int ethtool_net_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + int err; + + if (!ops->get_ts_info) + return -ENODEV; + + /* Does the ptp clock come from the netdev? */ + ethtool_init_tsinfo(info); + info->phc_qualifier = hwprov_desc->qualifier; + err = ops->get_ts_info(dev, info); + if (err) + return err; + + if (info->phc_index == hwprov_desc->index && + net_support_hwtstamp_qualifier(dev, hwprov_desc->qualifier)) + return 0; + + return -ENODEV; +} + +struct phy_device * +ethtool_phy_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc) +{ + int err; + + /* Only the precise qualifier is supported in phydev */ + if (hwprov_desc->qualifier != HWTSTAMP_PROVIDER_QUALIFIER_PRECISE) + return ERR_PTR(-ENODEV); - if (phy_is_default_hwtstamp(phydev) && phy_has_tsinfo(phydev)) - err = phy_ts_info(phydev, info); - else if (ops->get_ts_info) - err = ops->get_ts_info(dev, info); + /* Look in the phy topology */ + if (dev->link_topo) { + struct phy_device_node *pdn; + unsigned long phy_index; + + xa_for_each(&dev->link_topo->phys, phy_index, pdn) { + if (!phy_has_tsinfo(pdn->phy)) + continue; + + ethtool_init_tsinfo(info); + err = phy_ts_info(pdn->phy, info); + if (err) + return ERR_PTR(err); + + if (info->phc_index == hwprov_desc->index) + return pdn->phy; + } + return ERR_PTR(-ENODEV); + } + + /* Look at dev->phydev */ + if (phy_has_tsinfo(dev->phydev)) { + ethtool_init_tsinfo(info); + err = phy_ts_info(dev->phydev, info); + if (err) + return ERR_PTR(err); + + if (info->phc_index == hwprov_desc->index) + return dev->phydev; + } + + return ERR_PTR(-ENODEV); +} + +int ethtool_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc) +{ + int err; + + err = ethtool_net_get_ts_info_by_phc(dev, info, hwprov_desc); + if (err == -ENODEV) { + struct phy_device *phy; + + phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc); + if (IS_ERR(phy)) + return PTR_ERR(phy); + + /* Report the phc source only if we have a real + * phc source with an index. 
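+ * (ethtool_init_tsinfo() starts phc_index at -1, so a provider that
+ * timestamps but exposes no PHC leaves it negative).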
+ */ + if (info->phc_index >= 0) { + info->phc_source = HWTSTAMP_SOURCE_PHYLIB; + info->phc_phyindex = phy->phyindex; + } + err = 0; + } else if (!err && info->phc_index >= 0) { + info->phc_source = HWTSTAMP_SOURCE_NETDEV; + } info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; @@ -784,6 +941,69 @@ int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info return err; } +int __ethtool_get_ts_info(struct net_device *dev, + struct kernel_ethtool_ts_info *info) +{ + struct hwtstamp_provider *hwprov; + int err = 0; + + rcu_read_lock(); + hwprov = rcu_dereference(dev->hwprov); + /* No provider specified, use default behavior */ + if (!hwprov) { + const struct ethtool_ops *ops = dev->ethtool_ops; + struct phy_device *phydev = dev->phydev; + + ethtool_init_tsinfo(info); + if (phy_is_default_hwtstamp(phydev) && + phy_has_tsinfo(phydev)) { + err = phy_ts_info(phydev, info); + /* Report the phc source only if we have a real + * phc source with an index. + */ + if (!err && info->phc_index >= 0) { + info->phc_source = HWTSTAMP_SOURCE_PHYLIB; + info->phc_phyindex = phydev->phyindex; + } + } else if (ops->get_ts_info) { + err = ops->get_ts_info(dev, info); + if (!err && info->phc_index >= 0) + info->phc_source = HWTSTAMP_SOURCE_NETDEV; + } + + info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; + + rcu_read_unlock(); + return err; + } + + err = ethtool_get_ts_info_by_phc(dev, info, &hwprov->desc); + rcu_read_unlock(); + return err; +} + +bool net_support_hwtstamp_qualifier(struct net_device *dev, + enum hwtstamp_provider_qualifier qualifier) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (!ops) + return false; + + /* Return true with precise qualifier and with NIC without + * qualifier description to not break the old behavior. 
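+ * (drivers that predate supported_hwtstamp_qualifiers could only expose a
+ * single, precise PHC, so precise is assumed for them).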
+ */ + if (!ops->supported_hwtstamp_qualifiers && + qualifier == HWTSTAMP_PROVIDER_QUALIFIER_PRECISE) + return true; + + if (ops->supported_hwtstamp_qualifiers & BIT(qualifier)) + return true; + + return false; +} + int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index) { struct kernel_ethtool_ts_info info = { }; diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 4a2de3ce7354..b4683d286a5a 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -13,14 +13,10 @@ ETHTOOL_LINK_MODE_ ## speed ## base ## type ## _ ## duplex ## _BIT #define __SOF_TIMESTAMPING_CNT (const_ilog2(SOF_TIMESTAMPING_LAST) + 1) - -struct link_mode_info { - int speed; - u8 lanes; - u8 duplex; -}; +#define __HWTSTAMP_FLAG_CNT (const_ilog2(HWTSTAMP_FLAG_LAST) + 1) struct genl_info; +struct hwtstamp_provider_desc; extern const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]; @@ -31,12 +27,12 @@ tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN]; extern const char phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN]; extern const char link_mode_names[][ETH_GSTRING_LEN]; -extern const struct link_mode_info link_mode_params[]; extern const char netif_msg_class_names[][ETH_GSTRING_LEN]; extern const char wol_mode_names[][ETH_GSTRING_LEN]; extern const char sof_timestamping_names[][ETH_GSTRING_LEN]; extern const char ts_tx_type_names[][ETH_GSTRING_LEN]; extern const char ts_rx_filter_names[][ETH_GSTRING_LEN]; +extern const char ts_flags_names[][ETH_GSTRING_LEN]; extern const char udp_tunnel_type_names[][ETH_GSTRING_LEN]; int __ethtool_get_link(struct net_device *dev); @@ -48,7 +44,25 @@ int ethtool_check_max_channel(struct net_device *dev, struct ethtool_channels channels, struct genl_info *info); int ethtool_check_rss_ctx_busy(struct net_device *dev, u32 rss_context); + +void ethtool_ringparam_get_cfg(struct net_device *dev, + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kparam, + struct netlink_ext_ack *extack); + int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info); +int ethtool_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc); +int ethtool_net_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc); +struct phy_device * +ethtool_phy_get_ts_info_by_phc(struct net_device *dev, + struct kernel_ethtool_ts_info *info, + struct hwtstamp_provider_desc *hwprov_desc); +bool net_support_hwtstamp_qualifier(struct net_device *dev, + enum hwtstamp_provider_qualifier qualifier); extern const struct ethtool_phy_ops *ethtool_phy_ops; extern const struct ethtool_pse_ops *ethtool_pse_ops; diff --git a/net/ethtool/features.c b/net/ethtool/features.c index b6cb101d7f19..f2217983be2b 100644 --- a/net/ethtool/features.c +++ b/net/ethtool/features.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <net/netdev_lock.h> + #include "netlink.h" #include "common.h" #include "bitset.h" @@ -234,9 +236,10 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info) dev = req_info.dev; rtnl_lock(); + netdev_lock_ops(dev); ret = ethnl_ops_begin(dev); if (ret < 0) - goto out_rtnl; + goto out_unlock; ethnl_features_to_bitmap(old_active, dev->features); ethnl_features_to_bitmap(old_wanted, dev->wanted_features); ret = ethnl_parse_bitset(req_wanted, req_mask, NETDEV_FEATURE_COUNT, @@ -286,7 +289,8 @@ int ethnl_set_features(struct sk_buff *skb, struct 
genl_info *info) out_ops: ethnl_ops_complete(dev); -out_rtnl: +out_unlock: + netdev_unlock_ops(dev); rtnl_unlock(); ethnl_parse_header_dev_put(&req_info); return ret; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 7bb94875a7ec..39ec920f5de7 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -31,6 +31,7 @@ #include <net/ipv6.h> #include <net/xdp_sock_drv.h> #include <net/flow_offload.h> +#include <net/netdev_lock.h> #include <linux/ethtool_netlink.h> #include "common.h" @@ -59,7 +60,7 @@ static struct devlink *netdev_to_devlink_get(struct net_device *dev) u32 ethtool_op_get_link(struct net_device *dev) { /* Synchronize carrier state with link watch, see also rtnl_getlink() */ - linkwatch_sync_dev(dev); + __linkwatch_sync_dev(dev); return netif_carrier_ok(dev) ? 1 : 0; } @@ -977,6 +978,88 @@ static int ethtool_rxnfc_copy_to_user(void __user *useraddr, return 0; } +static bool flow_type_hashable(u32 flow_type) +{ + switch (flow_type) { + case TCP_V4_FLOW: + case UDP_V4_FLOW: + case SCTP_V4_FLOW: + case AH_ESP_V4_FLOW: + case TCP_V6_FLOW: + case UDP_V6_FLOW: + case SCTP_V6_FLOW: + case AH_ESP_V6_FLOW: + case AH_V4_FLOW: + case ESP_V4_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case IPV4_FLOW: + case IPV6_FLOW: + case GTPU_V4_FLOW: + case GTPU_V6_FLOW: + case GTPC_V4_FLOW: + case GTPC_V6_FLOW: + case GTPC_TEID_V4_FLOW: + case GTPC_TEID_V6_FLOW: + case GTPU_EH_V4_FLOW: + case GTPU_EH_V6_FLOW: + case GTPU_UL_V4_FLOW: + case GTPU_UL_V6_FLOW: + case GTPU_DL_V4_FLOW: + case GTPU_DL_V6_FLOW: + return true; + } + + return false; +} + +/* When adding a new type, update the assert and, if it's hashable, add it to + * the flow_type_hashable switch case. + */ +static_assert(GTPU_DL_V6_FLOW + 1 == __FLOW_TYPE_COUNT); + +static int ethtool_check_xfrm_rxfh(u32 input_xfrm, u64 rxfh) +{ + /* Sanity check: if symmetric-xor/symmetric-or-xor is set, then: + * 1 - no other fields besides IP src/dst and/or L4 src/dst are set + * 2 - If src is set, dst must also be set + */ + if ((input_xfrm != RXH_XFRM_NO_CHANGE && + input_xfrm & (RXH_XFRM_SYM_XOR | RXH_XFRM_SYM_OR_XOR)) && + ((rxfh & ~(RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3)) || + (!!(rxfh & RXH_IP_SRC) ^ !!(rxfh & RXH_IP_DST)) || + (!!(rxfh & RXH_L4_B_0_1) ^ !!(rxfh & RXH_L4_B_2_3)))) + return -EINVAL; + + return 0; +} + +static int ethtool_check_flow_types(struct net_device *dev, u32 input_xfrm) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxnfc info = { + .cmd = ETHTOOL_GRXFH, + }; + int err; + u32 i; + + for (i = 0; i < __FLOW_TYPE_COUNT; i++) { + if (!flow_type_hashable(i)) + continue; + + info.flow_type = i; + err = ops->get_rxnfc(dev, &info, NULL); + if (err) + continue; + + err = ethtool_check_xfrm_rxfh(input_xfrm, info.data); + if (err) + return err; + } + + return 0; +} + static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { @@ -992,29 +1075,28 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, if (rc) return rc; - /* Nonzero ring with RSS only makes sense if NIC adds them together */ - if (cmd == ETHTOOL_SRXCLSRLINS && info.flow_type & FLOW_RSS && - !ops->cap_rss_rxnfc_adds && - ethtool_get_flow_spec_ring(info.fs.ring_cookie)) - return -EINVAL; + if (cmd == ETHTOOL_SRXCLSRLINS && info.fs.flow_type & FLOW_RSS) { + /* Nonzero ring with RSS only makes sense + * if NIC adds them together + */ + if (!ops->cap_rss_rxnfc_adds && + ethtool_get_flow_spec_ring(info.fs.ring_cookie)) + return -EINVAL; + + if 
(!xa_load(&dev->ethtool->rss_ctx, info.rss_context)) + return -EINVAL; + } - if (ops->get_rxfh) { + if (cmd == ETHTOOL_SRXFH && ops->get_rxfh) { struct ethtool_rxfh_param rxfh = {}; rc = ops->get_rxfh(dev, &rxfh); if (rc) return rc; - /* Sanity check: if symmetric-xor is set, then: - * 1 - no other fields besides IP src/dst and/or L4 src/dst - * 2 - If src is set, dst must also be set - */ - if ((rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && - ((info.data & ~(RXH_IP_SRC | RXH_IP_DST | - RXH_L4_B_0_1 | RXH_L4_B_2_3)) || - (!!(info.data & RXH_IP_SRC) ^ !!(info.data & RXH_IP_DST)) || - (!!(info.data & RXH_L4_B_0_1) ^ !!(info.data & RXH_L4_B_2_3)))) - return -EINVAL; + rc = ethtool_check_xfrm_rxfh(rxfh.input_xfrm, info.data); + if (rc) + return rc; } rc = ops->set_rxnfc(dev, &info); @@ -1382,11 +1464,11 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, return -EOPNOTSUPP; /* Check input data transformation capabilities */ if (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_SYM_XOR && + rxfh.input_xfrm != RXH_XFRM_SYM_OR_XOR && rxfh.input_xfrm != RXH_XFRM_NO_CHANGE) return -EINVAL; if (rxfh.input_xfrm != RXH_XFRM_NO_CHANGE && - (rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && - !ops->cap_rss_sym_xor_supported) + rxfh.input_xfrm & ~ops->supported_input_xfrm) return -EOPNOTSUPP; create = rxfh.rss_context == ETH_RXFH_CONTEXT_ALLOC; @@ -1406,6 +1488,10 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, rxfh.input_xfrm == RXH_XFRM_NO_CHANGE)) return -EINVAL; + ret = ethtool_check_flow_types(dev, rxfh.input_xfrm); + if (ret) + return ret; + indir_bytes = dev_indir_size * sizeof(rxfh_dev.indir[0]); /* Check settings which may be global rather than per RSS-context */ @@ -2059,8 +2145,8 @@ static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) { - struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; struct kernel_ethtool_ringparam kernel_ringparam; + struct ethtool_ringparam ringparam, max; int ret; if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam) @@ -2069,7 +2155,7 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) return -EFAULT; - dev->ethtool_ops->get_ringparam(dev, &max, &kernel_ringparam, NULL); + ethtool_ringparam_get_cfg(dev, &max, &kernel_ringparam, NULL); /* ensure new ring parameters are within the maximums */ if (ringparam.rx_pending > max.rx_max_pending || @@ -2311,6 +2397,7 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) */ busy = true; netdev_hold(dev, &dev_tracker, GFP_KERNEL); + netdev_unlock_ops(dev); rtnl_unlock(); if (rc == 0) { @@ -2325,8 +2412,10 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) do { rtnl_lock(); + netdev_lock_ops(dev); rc = ops->set_phys_id(dev, (i++ & 1) ? 
ETHTOOL_ID_OFF : ETHTOOL_ID_ON); + netdev_unlock_ops(dev); rtnl_unlock(); if (rc) break; @@ -2335,6 +2424,7 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) } rtnl_lock(); + netdev_lock_ops(dev); netdev_put(dev, &dev_tracker); busy = false; @@ -3134,6 +3224,7 @@ __dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr, return -EPERM; } + netdev_lock_ops(dev); if (dev->dev.parent) pm_runtime_get_sync(dev->dev.parent); @@ -3367,6 +3458,7 @@ __dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr, out: if (dev->dev.parent) pm_runtime_put(dev->dev.parent); + netdev_unlock_ops(dev); return rc; } diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index 34d76e87847d..05a5f72c99fa 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -3,6 +3,7 @@ #include "netlink.h" #include "common.h" #include <linux/phy.h> +#include <linux/phylib_stubs.h> struct linkstate_req_info { struct ethnl_req_info base; @@ -26,9 +27,8 @@ const struct nla_policy ethnl_linkstate_get_policy[] = { NLA_POLICY_NESTED(ethnl_header_policy_stats), }; -static int linkstate_get_sqi(struct net_device *dev) +static int linkstate_get_sqi(struct phy_device *phydev) { - struct phy_device *phydev = dev->phydev; int ret; if (!phydev) @@ -46,9 +46,8 @@ static int linkstate_get_sqi(struct net_device *dev) return ret; } -static int linkstate_get_sqi_max(struct net_device *dev) +static int linkstate_get_sqi_max(struct phy_device *phydev) { - struct phy_device *phydev = dev->phydev; int ret; if (!phydev) @@ -100,19 +99,28 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, { struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; + struct nlattr **tb = info->attrs; + struct phy_device *phydev; int ret; + phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_LINKSTATE_HEADER, + info->extack); + if (IS_ERR(phydev)) { + ret = PTR_ERR(phydev); + goto out; + } + ret = ethnl_ops_begin(dev); if (ret < 0) return ret; data->link = __ethtool_get_link(dev); - ret = linkstate_get_sqi(dev); + ret = linkstate_get_sqi(phydev); if (linkstate_sqi_critical_error(ret)) goto out; data->sqi = ret; - ret = linkstate_get_sqi_max(dev); + ret = linkstate_get_sqi_max(phydev); if (linkstate_sqi_critical_error(ret)) goto out; data->sqi_max = ret; @@ -127,9 +135,9 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, sizeof(data->link_stats) / 8); if (req_base->flags & ETHTOOL_FLAG_STATS) { - if (dev->phydev) - data->link_stats.link_down_events = - READ_ONCE(dev->phydev->link_down_events); + if (phydev) + phy_ethtool_get_link_ext_stats(phydev, + &data->link_stats); if (dev->ethtool_ops->get_link_ext_stats) dev->ethtool_ops->get_link_ext_stats(dev, diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c index 2816bb23c3ad..ad9b40034003 100644 --- a/net/ethtool/mm.c +++ b/net/ethtool/mm.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright 2022-2023 NXP + * Copyright 2022-2025 NXP + * Copyright 2024 Furong Xu <0x1207@gmail.com> */ #include "common.h" #include "netlink.h" @@ -282,3 +283,279 @@ bool ethtool_dev_mm_supported(struct net_device *dev) return supported; } EXPORT_SYMBOL_GPL(ethtool_dev_mm_supported); + +static void ethtool_mmsv_configure_tx(struct ethtool_mmsv *mmsv, + bool tx_active) +{ + if (mmsv->ops->configure_tx) + mmsv->ops->configure_tx(mmsv, tx_active); +} + +static void ethtool_mmsv_configure_pmac(struct ethtool_mmsv *mmsv, + bool pmac_enabled) +{ + if 
(mmsv->ops->configure_pmac) + mmsv->ops->configure_pmac(mmsv, pmac_enabled); +} + +static void ethtool_mmsv_send_mpacket(struct ethtool_mmsv *mmsv, + enum ethtool_mpacket mpacket) +{ + if (mmsv->ops->send_mpacket) + mmsv->ops->send_mpacket(mmsv, mpacket); +} + +/** + * ethtool_mmsv_verify_timer - Timer for MAC Merge verification + * @t: timer_list struct containing private info + * + * Verify the MAC Merge capability in the local TX direction, by + * transmitting Verify mPackets up to 3 times. Wait until link + * partner responds with a Response mPacket, otherwise fail. + */ +static void ethtool_mmsv_verify_timer(struct timer_list *t) +{ + struct ethtool_mmsv *mmsv = from_timer(mmsv, t, verify_timer); + unsigned long flags; + bool rearm = false; + + spin_lock_irqsave(&mmsv->lock, flags); + + switch (mmsv->status) { + case ETHTOOL_MM_VERIFY_STATUS_INITIAL: + case ETHTOOL_MM_VERIFY_STATUS_VERIFYING: + if (mmsv->verify_retries != 0) { + ethtool_mmsv_send_mpacket(mmsv, ETHTOOL_MPACKET_VERIFY); + rearm = true; + } else { + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_FAILED; + } + + mmsv->verify_retries--; + break; + + case ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED: + ethtool_mmsv_configure_tx(mmsv, true); + break; + + default: + break; + } + + if (rearm) { + mod_timer(&mmsv->verify_timer, + jiffies + msecs_to_jiffies(mmsv->verify_time)); + } + + spin_unlock_irqrestore(&mmsv->lock, flags); +} + +static void ethtool_mmsv_verify_timer_arm(struct ethtool_mmsv *mmsv) +{ + if (mmsv->pmac_enabled && mmsv->tx_enabled && mmsv->verify_enabled && + mmsv->status != ETHTOOL_MM_VERIFY_STATUS_FAILED && + mmsv->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED) { + timer_setup(&mmsv->verify_timer, ethtool_mmsv_verify_timer, 0); + mod_timer(&mmsv->verify_timer, jiffies); + } +} + +static void ethtool_mmsv_apply(struct ethtool_mmsv *mmsv) +{ + /* If verification is disabled, configure FPE right away. + * Otherwise let the timer code do it. + */ + if (!mmsv->verify_enabled) { + ethtool_mmsv_configure_pmac(mmsv, mmsv->pmac_enabled); + ethtool_mmsv_configure_tx(mmsv, mmsv->tx_enabled); + } else { + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL; + mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES; + + if (netif_running(mmsv->dev)) + ethtool_mmsv_verify_timer_arm(mmsv); + } +} + +/** + * ethtool_mmsv_stop() - Stop MAC Merge Software Verification + * @mmsv: MAC Merge Software Verification state + * + * Drivers should call this method in a state where the hardware is + * about to lose state, like ndo_stop() or suspend(), and turning off + * MAC Merge features would be superfluous. Otherwise, prefer + * ethtool_mmsv_link_state_handle() with up=false. + */ +void ethtool_mmsv_stop(struct ethtool_mmsv *mmsv) +{ + timer_shutdown_sync(&mmsv->verify_timer); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_stop); + +/** + * ethtool_mmsv_link_state_handle() - Inform MAC Merge Software Verification + * of link state changes + * @mmsv: MAC Merge Software Verification state + * @up: True if device carrier is up and able to pass verification packets + * + * Calling context is expected to be from a task, interrupts enabled. 
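+ *
+ * A driver would typically call this from its link change handler; a
+ * minimal sketch (the "foo" names are illustrative, not from this patch):
+ *
+ *	static void foo_link_changed(struct foo_priv *priv, bool link_up)
+ *	{
+ *		ethtool_mmsv_link_state_handle(&priv->mmsv, link_up);
+ *	}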
+ */ +void ethtool_mmsv_link_state_handle(struct ethtool_mmsv *mmsv, bool up) +{ + unsigned long flags; + + ethtool_mmsv_stop(mmsv); + + spin_lock_irqsave(&mmsv->lock, flags); + + if (up && mmsv->pmac_enabled) { + /* VERIFY process requires pMAC enabled when NIC comes up */ + ethtool_mmsv_configure_pmac(mmsv, true); + + /* New link => maybe new partner => new verification process */ + ethtool_mmsv_apply(mmsv); + } else { + /* Reset the reported verification state while the link is down */ + if (mmsv->verify_enabled) + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL; + + /* No link or pMAC not enabled */ + ethtool_mmsv_configure_pmac(mmsv, false); + ethtool_mmsv_configure_tx(mmsv, false); + } + + spin_unlock_irqrestore(&mmsv->lock, flags); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_link_state_handle); + +/** + * ethtool_mmsv_event_handle() - Inform MAC Merge Software Verification + * of interrupt-based events + * @mmsv: MAC Merge Software Verification state + * @event: Event which took place (packet transmission or reception) + * + * Calling context expects to have interrupts disabled. + */ +void ethtool_mmsv_event_handle(struct ethtool_mmsv *mmsv, + enum ethtool_mmsv_event event) +{ + /* This is interrupt context, just spin_lock() */ + spin_lock(&mmsv->lock); + + if (!mmsv->pmac_enabled) + goto unlock; + + switch (event) { + case ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET: + /* Link partner has sent verify mPacket */ + ethtool_mmsv_send_mpacket(mmsv, ETHTOOL_MPACKET_RESPONSE); + break; + case ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET: + /* Local device has sent verify mPacket */ + if (mmsv->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED) + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_VERIFYING; + break; + case ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET: + /* Link partner has sent response mPacket */ + if (mmsv->status == ETHTOOL_MM_VERIFY_STATUS_VERIFYING) + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED; + break; + } + +unlock: + spin_unlock(&mmsv->lock); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_event_handle); + +static bool ethtool_mmsv_is_tx_active(struct ethtool_mmsv *mmsv) +{ + /* TX is active if administratively enabled, and verification either + * succeeded, or was administratively disabled. + */ + return mmsv->tx_enabled && + (mmsv->status == ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED || + mmsv->status == ETHTOOL_MM_VERIFY_STATUS_DISABLED); +} + +/** + * ethtool_mmsv_get_mm() - get_mm() hook for MAC Merge Software Verification + * @mmsv: MAC Merge Software Verification state + * @state: see struct ethtool_mm_state + * + * Drivers are expected to call this from their ethtool_ops :: get_mm() + * method. + */ +void ethtool_mmsv_get_mm(struct ethtool_mmsv *mmsv, + struct ethtool_mm_state *state) +{ + unsigned long flags; + + spin_lock_irqsave(&mmsv->lock, flags); + + state->max_verify_time = ETHTOOL_MM_MAX_VERIFY_TIME_MS; + state->verify_enabled = mmsv->verify_enabled; + state->pmac_enabled = mmsv->pmac_enabled; + state->verify_time = mmsv->verify_time; + state->tx_enabled = mmsv->tx_enabled; + state->verify_status = mmsv->status; + state->tx_active = ethtool_mmsv_is_tx_active(mmsv); + + spin_unlock_irqrestore(&mmsv->lock, flags); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_get_mm); + +/** + * ethtool_mmsv_set_mm() - set_mm() hook for MAC Merge Software Verification + * @mmsv: MAC Merge Software Verification state + * @cfg: see struct ethtool_mm_cfg + * + * Drivers are expected to call this from their ethtool_ops :: set_mm() + * method. 
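+ * + * A minimal wrapper sketch (struct foo_priv is hypothetical; the + * ethtool_ops :: set_mm() signature is assumed from include/linux/ethtool.h): + * + * static int foo_set_mm(struct net_device *dev, struct ethtool_mm_cfg *cfg, + * struct netlink_ext_ack *extack) + * { + * struct foo_priv *priv = netdev_priv(dev); + * + * ethtool_mmsv_set_mm(&priv->mmsv, cfg); + * return 0; + * }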
+ */ +void ethtool_mmsv_set_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_cfg *cfg) +{ + unsigned long flags; + + /* Wait for the verification that's currently in progress to finish */ + ethtool_mmsv_stop(mmsv); + + spin_lock_irqsave(&mmsv->lock, flags); + + mmsv->verify_enabled = cfg->verify_enabled; + mmsv->pmac_enabled = cfg->pmac_enabled; + mmsv->verify_time = cfg->verify_time; + mmsv->tx_enabled = cfg->tx_enabled; + + if (!cfg->verify_enabled) + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED; + + ethtool_mmsv_apply(mmsv); + + spin_unlock_irqrestore(&mmsv->lock, flags); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_set_mm); + +/** + * ethtool_mmsv_init() - Initialize MAC Merge Software Verification state + * @mmsv: MAC Merge Software Verification state + * @dev: Pointer to network interface + * @ops: Methods for implementing the generic functionality + * + * The MAC Merge Software Verification is a timer- and event-based state + * machine intended for network interfaces which lack a hardware-based + * TX verification process (as per IEEE 802.3 clause 99.4.3). The timer + * is managed by the core code, whereas events are supplied by the + * driver explicitly calling one of the other API functions. + */ +void ethtool_mmsv_init(struct ethtool_mmsv *mmsv, struct net_device *dev, + const struct ethtool_mmsv_ops *ops) +{ + mmsv->ops = ops; + mmsv->dev = dev; + mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES; + mmsv->verify_time = ETHTOOL_MM_MAX_VERIFY_TIME_MS; + mmsv->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED; + timer_setup(&mmsv->verify_timer, ethtool_mmsv_verify_timer, 0); + spin_lock_init(&mmsv->lock); +} +EXPORT_SYMBOL_GPL(ethtool_mmsv_init); diff --git a/net/ethtool/module.c b/net/ethtool/module.c index 6988e07bdcd6..4d4e0a82579a 100644 --- a/net/ethtool/module.c +++ b/net/ethtool/module.c @@ -4,6 +4,7 @@ #include <linux/firmware.h> #include <linux/sfp.h> #include <net/devlink.h> +#include <net/netdev_lock.h> #include "netlink.h" #include "common.h" @@ -419,19 +420,21 @@ int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info) dev = req_info.dev; rtnl_lock(); + netdev_lock_ops(dev); ret = ethnl_ops_begin(dev); if (ret < 0) - goto out_rtnl; + goto out_unlock; ret = ethnl_module_fw_flash_validate(dev, info->extack); if (ret < 0) - goto out_rtnl; + goto out_unlock; ret = module_flash_fw(dev, tb, skb, info); ethnl_ops_complete(dev); -out_rtnl: +out_unlock: + netdev_unlock_ops(dev); rtnl_unlock(); ethnl_parse_header_dev_put(&req_info); return ret; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index e3f0ef6b851b..9de828df46cd 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <net/netdev_lock.h> +#include <net/netdev_queues.h> #include <net/sock.h> #include <linux/ethtool_netlink.h> #include <linux/phy_link_topology.h> @@ -89,8 +91,10 @@ int ethnl_ops_begin(struct net_device *dev) if (dev->dev.parent) pm_runtime_get_sync(dev->dev.parent); + netdev_ops_assert_locked(dev); + if (!netif_device_present(dev) || - dev->reg_state == NETREG_UNREGISTERING) { + dev->reg_state >= NETREG_UNREGISTERING) { ret = -ENODEV; goto err; } @@ -210,7 +214,7 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, } struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info, - const struct nlattr *header, + struct nlattr **tb, unsigned int header, struct netlink_ext_ack *extack) { struct phy_device *phydev; @@ -224,8 +228,8 @@ struct phy_device *ethnl_req_get_phydev(const struct 
ethnl_req_info *req_info, return req_info->dev->phydev; phydev = phy_link_topo_get_phy(req_info->dev, req_info->phy_index); - if (!phydev) { - NL_SET_ERR_MSG_ATTR(extack, header, + if (!phydev && tb) { + NL_SET_ERR_MSG_ATTR(extack, tb[header], "no phy matching phyindex"); return ERR_PTR(-ENODEV); } @@ -353,6 +357,18 @@ struct ethnl_dump_ctx { unsigned long pos_ifindex; }; +/** + * struct ethnl_perphy_dump_ctx - context for dumpit() PHY-aware callbacks + * @ethnl_ctx: generic ethnl context + * @ifindex: For Filtered DUMP requests, the ifindex of the targeted netdev + * @pos_phyindex: iterator position for multi-msg DUMP + */ +struct ethnl_perphy_dump_ctx { + struct ethnl_dump_ctx ethnl_ctx; + unsigned int ifindex; + unsigned long pos_phyindex; +}; + static const struct ethnl_request_ops * ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_STRSET_GET] = &ethnl_strset_request_ops, @@ -394,6 +410,9 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_PLCA_GET_STATUS] = &ethnl_plca_status_request_ops, [ETHTOOL_MSG_MM_GET] = &ethnl_mm_request_ops, [ETHTOOL_MSG_MM_SET] = &ethnl_mm_request_ops, + [ETHTOOL_MSG_TSCONFIG_GET] = &ethnl_tsconfig_request_ops, + [ETHTOOL_MSG_TSCONFIG_SET] = &ethnl_tsconfig_request_ops, + [ETHTOOL_MSG_PHY_GET] = &ethnl_phy_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -401,6 +420,12 @@ static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) return (struct ethnl_dump_ctx *)cb->ctx; } +static struct ethnl_perphy_dump_ctx * +ethnl_perphy_dump_context(struct netlink_callback *cb) +{ + return (struct ethnl_perphy_dump_ctx *)cb->ctx; +} + /** * ethnl_default_parse() - Parse request message * @req_info: pointer to structure to put data into @@ -487,10 +512,14 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) ethnl_init_reply_data(reply_data, ops, req_info->dev); rtnl_lock(); + if (req_info->dev) + netdev_lock_ops(req_info->dev); ret = ops->prepare_data(req_info, reply_data, info); + if (req_info->dev) + netdev_unlock_ops(req_info->dev); rtnl_unlock(); if (ret < 0) - goto err_cleanup; + goto err_dev; ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; @@ -545,10 +574,12 @@ static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, ethnl_init_reply_data(ctx->reply_data, ctx->ops, dev); rtnl_lock(); + netdev_lock_ops(dev); ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, info); + netdev_unlock_ops(dev); rtnl_unlock(); if (ret < 0) - goto out; + goto out_cancel; ret = ethnl_fill_reply_header(skb, dev, ctx->ops->hdr_attr); if (ret < 0) goto out; @@ -557,6 +588,7 @@ static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, out: if (ctx->ops->cleanup_data) ctx->ops->cleanup_data(ctx->reply_data); +out_cancel: ctx->reply_data->dev = NULL; if (ret < 0) genlmsg_cancel(skb, ehdr); @@ -571,18 +603,19 @@ static int ethnl_default_dumpit(struct sk_buff *skb, { struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); struct net *net = sock_net(skb->sk); + netdevice_tracker dev_tracker; struct net_device *dev; int ret = 0; rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->pos_ifindex) { - dev_hold(dev); + netdev_hold(dev, &dev_tracker, GFP_ATOMIC); rcu_read_unlock(); ret = ethnl_default_dump_one(skb, dev, ctx, genl_info_dump(cb)); rcu_read_lock(); - dev_put(dev); + netdev_put(dev, &dev_tracker); if (ret < 0 && ret != -EOPNOTSUPP) { if (likely(skb->len)) @@ -649,6 +682,173 @@ free_req_info: return ret; } +/* per-PHY
->start() handler for GET requests */ +static int ethnl_perphy_start(struct netlink_callback *cb) +{ + struct ethnl_perphy_dump_ctx *phy_ctx = ethnl_perphy_dump_context(cb); + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct ethnl_dump_ctx *ctx = &phy_ctx->ethnl_ctx; + struct ethnl_reply_data *reply_data; + const struct ethnl_request_ops *ops; + struct ethnl_req_info *req_info; + struct genlmsghdr *ghdr; + int ret; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + + ghdr = nlmsg_data(cb->nlh); + ops = ethnl_default_requests[ghdr->cmd]; + if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd)) + return -EOPNOTSUPP; + req_info = kzalloc(ops->req_info_size, GFP_KERNEL); + if (!req_info) + return -ENOMEM; + reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); + if (!reply_data) { + ret = -ENOMEM; + goto free_req_info; + } + + /* Unlike per-dev dump, don't ignore dev. The dump handler + * will notice it and dump PHYs from given dev. We only keep track of + * the dev's ifindex, .dumpit() will grab and release the netdev itself. + */ + ret = ethnl_default_parse(req_info, &info->info, ops, false); + if (req_info->dev) { + phy_ctx->ifindex = req_info->dev->ifindex; + netdev_put(req_info->dev, &req_info->dev_tracker); + req_info->dev = NULL; + } + if (ret < 0) + goto free_reply_data; + + ctx->ops = ops; + ctx->req_info = req_info; + ctx->reply_data = reply_data; + ctx->pos_ifindex = 0; + + return 0; + +free_reply_data: + kfree(reply_data); +free_req_info: + kfree(req_info); + + return ret; +} + +static int ethnl_perphy_dump_one_dev(struct sk_buff *skb, + struct ethnl_perphy_dump_ctx *ctx, + const struct genl_info *info) +{ + struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; + struct net_device *dev = ethnl_ctx->req_info->dev; + struct phy_device_node *pdn; + int ret; + + if (!dev->link_topo) + return 0; + + xa_for_each_start(&dev->link_topo->phys, ctx->pos_phyindex, pdn, + ctx->pos_phyindex) { + ethnl_ctx->req_info->phy_index = ctx->pos_phyindex; + + /* We can re-use the original dump_one as ->prepare_data in + * commands use ethnl_req_get_phydev(), which gets the PHY from + * the req_info->phy_index + */ + ret = ethnl_default_dump_one(skb, dev, ethnl_ctx, info); + if (ret) + return ret; + } + + ctx->pos_phyindex = 0; + + return 0; +} + +static int ethnl_perphy_dump_all_dev(struct sk_buff *skb, + struct ethnl_perphy_dump_ctx *ctx, + const struct genl_info *info) +{ + struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; + struct net *net = sock_net(skb->sk); + netdevice_tracker dev_tracker; + struct net_device *dev; + int ret = 0; + + rcu_read_lock(); + for_each_netdev_dump(net, dev, ethnl_ctx->pos_ifindex) { + netdev_hold(dev, &dev_tracker, GFP_ATOMIC); + rcu_read_unlock(); + + /* per-PHY commands use ethnl_req_get_phydev(), which needs the + * net_device in the req_info + */ + ethnl_ctx->req_info->dev = dev; + ret = ethnl_perphy_dump_one_dev(skb, ctx, info); + + rcu_read_lock(); + netdev_put(dev, &dev_tracker); + ethnl_ctx->req_info->dev = NULL; + + if (ret < 0 && ret != -EOPNOTSUPP) { + if (likely(skb->len)) + ret = skb->len; + break; + } + ret = 0; + } + rcu_read_unlock(); + + return ret; +} + +/* per-PHY ->dumpit() handler for GET requests. 
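+ * Filtered requests (an ifindex recorded by ->start()) walk only that + * netdev's PHY topology; unfiltered requests iterate every netdev in the + * namespace and emit one message per discovered PHY.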
*/ +static int ethnl_perphy_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct ethnl_perphy_dump_ctx *ctx = ethnl_perphy_dump_context(cb); + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; + int ret = 0; + + if (ctx->ifindex) { + netdevice_tracker dev_tracker; + struct net_device *dev; + + dev = netdev_get_by_index(genl_info_net(&info->info), + ctx->ifindex, &dev_tracker, + GFP_KERNEL); + if (!dev) + return -ENODEV; + + ethnl_ctx->req_info->dev = dev; + ret = ethnl_perphy_dump_one_dev(skb, ctx, genl_info_dump(cb)); + + if (ret < 0 && ret != -EOPNOTSUPP && likely(skb->len)) + ret = skb->len; + + netdev_put(dev, &dev_tracker); + } else { + ret = ethnl_perphy_dump_all_dev(skb, ctx, genl_info_dump(cb)); + } + + return ret; +} + +/* per-PHY ->done() handler for GET requests */ +static int ethnl_perphy_done(struct netlink_callback *cb) +{ + struct ethnl_perphy_dump_ctx *ctx = ethnl_perphy_dump_context(cb); + struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; + + kfree(ethnl_ctx->reply_data); + kfree(ethnl_ctx->req_info); + + return 0; +} + /* default ->done() handler for GET requests */ static int ethnl_default_done(struct netlink_callback *cb) { @@ -665,6 +865,7 @@ static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info) const struct ethnl_request_ops *ops; struct ethnl_req_info req_info = {}; const u8 cmd = info->genlhdr->cmd; + struct net_device *dev; int ret; ops = ethnl_default_requests[cmd]; @@ -686,20 +887,38 @@ static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info) goto out_dev; } + dev = req_info.dev; + rtnl_lock(); - ret = ethnl_ops_begin(req_info.dev); + netdev_lock_ops(dev); + dev->cfg_pending = kmemdup(dev->cfg, sizeof(*dev->cfg), + GFP_KERNEL_ACCOUNT); + if (!dev->cfg_pending) { + ret = -ENOMEM; + goto out_tie_cfg; + } + + ret = ethnl_ops_begin(dev); if (ret < 0) - goto out_rtnl; + goto out_free_cfg; ret = ops->set(&req_info, info); - if (ret <= 0) + if (ret < 0) + goto out_ops; + + swap(dev->cfg, dev->cfg_pending); + if (!ret) goto out_ops; - ethtool_notify(req_info.dev, ops->set_ntf_cmd, NULL); + ethtool_notify(dev, ops->set_ntf_cmd, NULL); ret = 0; out_ops: - ethnl_ops_complete(req_info.dev); -out_rtnl: + ethnl_ops_complete(dev); +out_free_cfg: + kfree(dev->cfg_pending); +out_tie_cfg: + dev->cfg_pending = dev->cfg; + netdev_unlock_ops(dev); rtnl_unlock(); out_dev: ethnl_parse_header_dev_put(&req_info); @@ -757,10 +976,12 @@ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, req_info->dev = dev; req_info->flags |= ETHTOOL_FLAG_COMPACT_BITSETS; + netdev_ops_assert_locked(dev); + ethnl_init_reply_data(reply_data, ops, dev); ret = ops->prepare_data(req_info, reply_data, &info); if (ret < 0) - goto err_cleanup; + goto err_rep; ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; @@ -795,6 +1016,7 @@ err_skb: err_cleanup: if (ops->cleanup_data) ops->cleanup_data(reply_data); +err_rep: kfree(reply_data); kfree(req_info); return; @@ -1074,9 +1296,9 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_TSINFO_GET, .doit = ethnl_default_doit, - .start = ethnl_default_start, - .dumpit = ethnl_default_dumpit, - .done = ethnl_default_done, + .start = ethnl_tsinfo_start, + .dumpit = ethnl_tsinfo_dumpit, + .done = ethnl_tsinfo_done, .policy = ethnl_tsinfo_get_policy, .maxattr = ARRAY_SIZE(ethnl_tsinfo_get_policy) - 1, }, @@ -1165,9 +1387,9 @@ static const struct genl_ops ethtool_genl_ops[] = { { 
.cmd = ETHTOOL_MSG_PSE_GET, .doit = ethnl_default_doit, - .start = ethnl_default_start, - .dumpit = ethnl_default_dumpit, - .done = ethnl_default_done, + .start = ethnl_perphy_start, + .dumpit = ethnl_perphy_dumpit, + .done = ethnl_perphy_done, .policy = ethnl_pse_get_policy, .maxattr = ARRAY_SIZE(ethnl_pse_get_policy) - 1, }, @@ -1189,9 +1411,9 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_PLCA_GET_CFG, .doit = ethnl_default_doit, - .start = ethnl_default_start, - .dumpit = ethnl_default_dumpit, - .done = ethnl_default_done, + .start = ethnl_perphy_start, + .dumpit = ethnl_perphy_dumpit, + .done = ethnl_perphy_done, .policy = ethnl_plca_get_cfg_policy, .maxattr = ARRAY_SIZE(ethnl_plca_get_cfg_policy) - 1, }, @@ -1205,9 +1427,9 @@ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_PLCA_GET_STATUS, .doit = ethnl_default_doit, - .start = ethnl_default_start, - .dumpit = ethnl_default_dumpit, - .done = ethnl_default_done, + .start = ethnl_perphy_start, + .dumpit = ethnl_perphy_dumpit, + .done = ethnl_perphy_done, .policy = ethnl_plca_get_status_policy, .maxattr = ARRAY_SIZE(ethnl_plca_get_status_policy) - 1, }, @@ -1236,13 +1458,29 @@ static const struct genl_ops ethtool_genl_ops[] = { }, { .cmd = ETHTOOL_MSG_PHY_GET, - .doit = ethnl_phy_doit, - .start = ethnl_phy_start, - .dumpit = ethnl_phy_dumpit, - .done = ethnl_phy_done, + .doit = ethnl_default_doit, + .start = ethnl_perphy_start, + .dumpit = ethnl_perphy_dumpit, + .done = ethnl_perphy_done, .policy = ethnl_phy_get_policy, .maxattr = ARRAY_SIZE(ethnl_phy_get_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_TSCONFIG_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_tsconfig_get_policy, + .maxattr = ARRAY_SIZE(ethnl_tsconfig_get_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_TSCONFIG_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_default_set_doit, + .policy = ethnl_tsconfig_set_policy, + .maxattr = ARRAY_SIZE(ethnl_tsconfig_set_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h index 203b08eb6c6f..91b953924af3 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -275,7 +275,8 @@ static inline void ethnl_parse_header_dev_put(struct ethnl_req_info *req_info) * ethnl_req_get_phydev() - Gets the phy_device targeted by this request, * if any. Must be called under rtnl_lock(). * @req_info: The ethnl request to get the phy from. - * @header: The netlink header, used for error reporting. + * @tb: The netlink attributes array, for error reporting. + * @header: The netlink header index, used for error reporting. * @extack: The netlink extended ACK, for error reporting. * * The caller must hold RTNL, until it's done interacting with the returned @@ -289,7 +290,7 @@ static inline void ethnl_parse_header_dev_put(struct ethnl_req_info *req_info) * is returned.
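 * * A typical call from a ->prepare_data() handler mirrors the plca.c callers * updated below (the header attribute index is per-command): * * phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PLCA_HEADER, * info->extack); * if (IS_ERR_OR_NULL(phydev)) * return -EOPNOTSUPP;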
*/ struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info, - const struct nlattr *header, + struct nlattr **tb, unsigned int header, struct netlink_ext_ack *extack); /** @@ -435,6 +436,7 @@ extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops; extern const struct ethnl_request_ops ethnl_plca_status_request_ops; extern const struct ethnl_request_ops ethnl_mm_request_ops; extern const struct ethnl_request_ops ethnl_phy_request_ops; +extern const struct ethnl_request_ops ethnl_tsconfig_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -455,7 +457,7 @@ extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANT extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1]; extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1]; extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1]; -extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX + 1]; +extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_HDS_THRESH_MAX + 1]; extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1]; extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; @@ -464,7 +466,7 @@ extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_STATS_SRC extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1]; extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1]; extern const struct nla_policy ethnl_eee_set_policy[ETHTOOL_A_EEE_TX_LPI_TIMER + 1]; -extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_HEADER + 1]; +extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1]; extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1]; extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1]; extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1]; @@ -485,6 +487,8 @@ extern const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1]; extern const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1]; extern const struct nla_policy ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD + 1]; extern const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1]; +extern const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1]; +extern const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1]; int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); @@ -495,15 +499,15 @@ int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info); int ethnl_rss_dump_start(struct netlink_callback *cb); int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb); -int ethnl_phy_start(struct netlink_callback *cb); -int ethnl_phy_doit(struct sk_buff *skb, struct genl_info *info); -int ethnl_phy_dumpit(struct sk_buff *skb, struct netlink_callback *cb); -int ethnl_phy_done(struct netlink_callback *cb); +int 
ethnl_tsinfo_start(struct netlink_callback *cb); +int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int ethnl_tsinfo_done(struct netlink_callback *cb); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN]; extern const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN]; +extern const char stats_phy_names[__ETHTOOL_A_STATS_PHY_CNT][ETH_GSTRING_LEN]; #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/ethtool/phy.c b/net/ethtool/phy.c index ed8f690f6bac..68372bef4b2f 100644 --- a/net/ethtool/phy.c +++ b/net/ethtool/phy.c @@ -9,298 +9,157 @@ #include <linux/phy.h> #include <linux/phy_link_topology.h> #include <linux/sfp.h> +#include <net/netdev_lock.h> struct phy_req_info { - struct ethnl_req_info base; - struct phy_device_node *pdn; + struct ethnl_req_info base; }; -#define PHY_REQINFO(__req_base) \ - container_of(__req_base, struct phy_req_info, base) +struct phy_reply_data { + struct ethnl_reply_data base; + u32 phyindex; + char *drvname; + char *name; + unsigned int upstream_type; + char *upstream_sfp_name; + unsigned int upstream_index; + char *downstream_sfp_name; +}; + +#define PHY_REPDATA(__reply_base) \ + container_of(__reply_base, struct phy_reply_data, base) const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1] = { [ETHTOOL_A_PHY_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; -/* Caller holds rtnl */ -static ssize_t -ethnl_phy_reply_size(const struct ethnl_req_info *req_base, - struct netlink_ext_ack *extack) +static int phy_reply_size(const struct ethnl_req_info *req_info, + const struct ethnl_reply_data *reply_data) { - struct phy_req_info *req_info = PHY_REQINFO(req_base); - struct phy_device_node *pdn = req_info->pdn; - struct phy_device *phydev = pdn->phy; + struct phy_reply_data *rep_data = PHY_REPDATA(reply_data); size_t size = 0; - ASSERT_RTNL(); - /* ETHTOOL_A_PHY_INDEX */ size += nla_total_size(sizeof(u32)); /* ETHTOOL_A_DRVNAME */ - if (phydev->drv) - size += nla_total_size(strlen(phydev->drv->name) + 1); + if (rep_data->drvname) + size += nla_total_size(strlen(rep_data->drvname) + 1); /* ETHTOOL_A_NAME */ - size += nla_total_size(strlen(dev_name(&phydev->mdio.dev)) + 1); + size += nla_total_size(strlen(rep_data->name) + 1); /* ETHTOOL_A_PHY_UPSTREAM_TYPE */ size += nla_total_size(sizeof(u32)); - if (phy_on_sfp(phydev)) { - const char *upstream_sfp_name = sfp_get_name(pdn->parent_sfp_bus); - - /* ETHTOOL_A_PHY_UPSTREAM_SFP_NAME */ - if (upstream_sfp_name) - size += nla_total_size(strlen(upstream_sfp_name) + 1); + /* ETHTOOL_A_PHY_UPSTREAM_SFP_NAME */ + if (rep_data->upstream_sfp_name) + size += nla_total_size(strlen(rep_data->upstream_sfp_name) + 1); - /* ETHTOOL_A_PHY_UPSTREAM_INDEX */ + /* ETHTOOL_A_PHY_UPSTREAM_INDEX */ + if (rep_data->upstream_index) size += nla_total_size(sizeof(u32)); - } /* ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME */ - if (phydev->sfp_bus) { - const char *sfp_name = sfp_get_name(phydev->sfp_bus); - - if (sfp_name) - size += nla_total_size(strlen(sfp_name) + 1); - } + if (rep_data->downstream_sfp_name) + size += nla_total_size(strlen(rep_data->downstream_sfp_name) + 1); return size; } -static int -ethnl_phy_fill_reply(const struct ethnl_req_info *req_base, struct sk_buff *skb) +static int phy_prepare_data(const 
struct ethnl_req_info *req_info, + struct ethnl_reply_data *reply_data, + const struct genl_info *info) { - struct phy_req_info *req_info = PHY_REQINFO(req_base); - struct phy_device_node *pdn = req_info->pdn; - struct phy_device *phydev = pdn->phy; - enum phy_upstream ptype; + struct phy_link_topology *topo = reply_data->dev->link_topo; + struct phy_reply_data *rep_data = PHY_REPDATA(reply_data); + struct nlattr **tb = info->attrs; + struct phy_device_node *pdn; + struct phy_device *phydev; - ptype = pdn->upstream_type; + /* RTNL is held by the caller */ + phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PHY_HEADER, + info->extack); + if (IS_ERR_OR_NULL(phydev)) + return -EOPNOTSUPP; - if (nla_put_u32(skb, ETHTOOL_A_PHY_INDEX, phydev->phyindex) || - nla_put_string(skb, ETHTOOL_A_PHY_NAME, dev_name(&phydev->mdio.dev)) || - nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_TYPE, ptype)) - return -EMSGSIZE; + pdn = xa_load(&topo->phys, phydev->phyindex); + if (!pdn) + return -EOPNOTSUPP; - if (phydev->drv && - nla_put_string(skb, ETHTOOL_A_PHY_DRVNAME, phydev->drv->name)) - return -EMSGSIZE; + rep_data->phyindex = phydev->phyindex; + rep_data->name = kstrdup(dev_name(&phydev->mdio.dev), GFP_KERNEL); + rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL); + rep_data->upstream_type = pdn->upstream_type; - if (ptype == PHY_UPSTREAM_PHY) { + if (pdn->upstream_type == PHY_UPSTREAM_PHY) { struct phy_device *upstream = pdn->upstream.phydev; - const char *sfp_upstream_name; - - /* Parent index */ - if (nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_INDEX, upstream->phyindex)) - return -EMSGSIZE; - - if (pdn->parent_sfp_bus) { - sfp_upstream_name = sfp_get_name(pdn->parent_sfp_bus); - if (sfp_upstream_name && - nla_put_string(skb, ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, - sfp_upstream_name)) - return -EMSGSIZE; - } - } - - if (phydev->sfp_bus) { - const char *sfp_name = sfp_get_name(phydev->sfp_bus); - - if (sfp_name && - nla_put_string(skb, ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, - sfp_name)) - return -EMSGSIZE; + rep_data->upstream_index = upstream->phyindex; } - return 0; -} - -static int ethnl_phy_parse_request(struct ethnl_req_info *req_base, - struct nlattr **tb, - struct netlink_ext_ack *extack) -{ - struct phy_link_topology *topo = req_base->dev->link_topo; - struct phy_req_info *req_info = PHY_REQINFO(req_base); - struct phy_device *phydev; + if (pdn->parent_sfp_bus) + rep_data->upstream_sfp_name = kstrdup(sfp_get_name(pdn->parent_sfp_bus), + GFP_KERNEL); - phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_PHY_HEADER], - extack); - if (!phydev) - return 0; - - if (IS_ERR(phydev)) - return PTR_ERR(phydev); - - if (!topo) - return 0; - - req_info->pdn = xa_load(&topo->phys, phydev->phyindex); + if (phydev->sfp_bus) + rep_data->downstream_sfp_name = kstrdup(sfp_get_name(phydev->sfp_bus), + GFP_KERNEL); return 0; } -int ethnl_phy_doit(struct sk_buff *skb, struct genl_info *info) +static int phy_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_info, + const struct ethnl_reply_data *reply_data) { - struct phy_req_info req_info = {}; - struct nlattr **tb = info->attrs; - struct sk_buff *rskb; - void *reply_payload; - int reply_len; - int ret; - - ret = ethnl_parse_header_dev_get(&req_info.base, - tb[ETHTOOL_A_PHY_HEADER], - genl_info_net(info), info->extack, - true); - if (ret < 0) - return ret; - - rtnl_lock(); - - ret = ethnl_phy_parse_request(&req_info.base, tb, info->extack); - if (ret < 0) - goto err_unlock_rtnl; - - /* No PHY, return early */ - if (!req_info.pdn) - goto err_unlock_rtnl; - 
- ret = ethnl_phy_reply_size(&req_info.base, info->extack); - if (ret < 0) - goto err_unlock_rtnl; - reply_len = ret + ethnl_reply_header_size(); - - rskb = ethnl_reply_init(reply_len, req_info.base.dev, - ETHTOOL_MSG_PHY_GET_REPLY, - ETHTOOL_A_PHY_HEADER, - info, &reply_payload); - if (!rskb) { - ret = -ENOMEM; - goto err_unlock_rtnl; - } - - ret = ethnl_phy_fill_reply(&req_info.base, rskb); - if (ret) - goto err_free_msg; + struct phy_reply_data *rep_data = PHY_REPDATA(reply_data); - rtnl_unlock(); - ethnl_parse_header_dev_put(&req_info.base); - genlmsg_end(rskb, reply_payload); - - return genlmsg_reply(rskb, info); - -err_free_msg: - nlmsg_free(rskb); -err_unlock_rtnl: - rtnl_unlock(); - ethnl_parse_header_dev_put(&req_info.base); - return ret; -} - -struct ethnl_phy_dump_ctx { - struct phy_req_info *phy_req_info; - unsigned long ifindex; - unsigned long phy_index; -}; - -int ethnl_phy_start(struct netlink_callback *cb) -{ - const struct genl_info *info = genl_info_dump(cb); - struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx; - int ret; - - BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); - - ctx->phy_req_info = kzalloc(sizeof(*ctx->phy_req_info), GFP_KERNEL); - if (!ctx->phy_req_info) - return -ENOMEM; - - ret = ethnl_parse_header_dev_get(&ctx->phy_req_info->base, - info->attrs[ETHTOOL_A_PHY_HEADER], - sock_net(cb->skb->sk), cb->extack, - false); - ctx->ifindex = 0; - ctx->phy_index = 0; - - if (ret) - kfree(ctx->phy_req_info); + if (nla_put_u32(skb, ETHTOOL_A_PHY_INDEX, rep_data->phyindex) || + nla_put_string(skb, ETHTOOL_A_PHY_NAME, rep_data->name) || + nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_TYPE, rep_data->upstream_type)) + return -EMSGSIZE; - return ret; -} + if (rep_data->drvname && + nla_put_string(skb, ETHTOOL_A_PHY_DRVNAME, rep_data->drvname)) + return -EMSGSIZE; -int ethnl_phy_done(struct netlink_callback *cb) -{ - struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx; + if (rep_data->upstream_index && + nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_INDEX, + rep_data->upstream_index)) + return -EMSGSIZE; - if (ctx->phy_req_info->base.dev) - ethnl_parse_header_dev_put(&ctx->phy_req_info->base); + if (rep_data->upstream_sfp_name && + nla_put_string(skb, ETHTOOL_A_PHY_UPSTREAM_SFP_NAME, + rep_data->upstream_sfp_name)) + return -EMSGSIZE; - kfree(ctx->phy_req_info); + if (rep_data->downstream_sfp_name && + nla_put_string(skb, ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME, + rep_data->downstream_sfp_name)) + return -EMSGSIZE; return 0; } -static int ethnl_phy_dump_one_dev(struct sk_buff *skb, struct net_device *dev, - struct netlink_callback *cb) +static void phy_cleanup_data(struct ethnl_reply_data *reply_data) { - struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx; - struct phy_req_info *pri = ctx->phy_req_info; - struct phy_device_node *pdn; - int ret = 0; - void *ehdr; - - if (!dev->link_topo) - return 0; - - xa_for_each_start(&dev->link_topo->phys, ctx->phy_index, pdn, ctx->phy_index) { - ehdr = ethnl_dump_put(skb, cb, ETHTOOL_MSG_PHY_GET_REPLY); - if (!ehdr) { - ret = -EMSGSIZE; - break; - } - - ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_PHY_HEADER); - if (ret < 0) { - genlmsg_cancel(skb, ehdr); - break; - } - - pri->pdn = pdn; - ret = ethnl_phy_fill_reply(&pri->base, skb); - if (ret < 0) { - genlmsg_cancel(skb, ehdr); - break; - } - - genlmsg_end(skb, ehdr); - } + struct phy_reply_data *rep_data = PHY_REPDATA(reply_data); - return ret; + kfree(rep_data->drvname); + kfree(rep_data->name); + kfree(rep_data->upstream_sfp_name); + kfree(rep_data->downstream_sfp_name); } -int ethnl_phy_dumpit(struct 
sk_buff *skb, struct netlink_callback *cb) -{ - struct ethnl_phy_dump_ctx *ctx = (void *)cb->ctx; - struct net *net = sock_net(skb->sk); - struct net_device *dev; - int ret = 0; - - rtnl_lock(); - - if (ctx->phy_req_info->base.dev) { - ret = ethnl_phy_dump_one_dev(skb, ctx->phy_req_info->base.dev, cb); - } else { - for_each_netdev_dump(net, dev, ctx->ifindex) { - ret = ethnl_phy_dump_one_dev(skb, dev, cb); - if (ret) - break; - - ctx->phy_index = 0; - } - } - rtnl_unlock(); - - return ret; -} +const struct ethnl_request_ops ethnl_phy_request_ops = { + .request_cmd = ETHTOOL_MSG_PHY_GET, + .reply_cmd = ETHTOOL_MSG_PHY_GET_REPLY, + .hdr_attr = ETHTOOL_A_PHY_HEADER, + .req_info_size = sizeof(struct phy_req_info), + .reply_data_size = sizeof(struct phy_reply_data), + + .prepare_data = phy_prepare_data, + .reply_size = phy_reply_size, + .fill_reply = phy_fill_reply, + .cleanup_data = phy_cleanup_data, +}; diff --git a/net/ethtool/plca.c b/net/ethtool/plca.c index d95d92f173a6..e1f7820a6158 100644 --- a/net/ethtool/plca.c +++ b/net/ethtool/plca.c @@ -62,7 +62,7 @@ static int plca_get_cfg_prepare_data(const struct ethnl_req_info *req_base, struct phy_device *phydev; int ret; - phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_PLCA_HEADER], + phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PLCA_HEADER, info->extack); // check that the PHY device is available and connected if (IS_ERR_OR_NULL(phydev)) { @@ -152,7 +152,7 @@ ethnl_set_plca(struct ethnl_req_info *req_info, struct genl_info *info) bool mod = false; int ret; - phydev = ethnl_req_get_phydev(req_info, tb[ETHTOOL_A_PLCA_HEADER], + phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PLCA_HEADER, info->extack); // check that the PHY device is available and connected if (IS_ERR_OR_NULL(phydev)) @@ -211,7 +211,7 @@ static int plca_get_status_prepare_data(const struct ethnl_req_info *req_base, struct phy_device *phydev; int ret; - phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_PLCA_HEADER], + phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PLCA_HEADER, info->extack); // check that the PHY device is available and connected if (IS_ERR_OR_NULL(phydev)) { diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c index a0705edca22a..4f6b99eab2a6 100644 --- a/net/ethtool/pse-pd.c +++ b/net/ethtool/pse-pd.c @@ -19,7 +19,7 @@ struct pse_req_info { struct pse_reply_data { struct ethnl_reply_data base; - struct pse_control_status status; + struct ethtool_pse_control_status status; }; #define PSE_REPDATA(__reply_base) \ @@ -64,7 +64,7 @@ static int pse_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; - phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_PSE_HEADER], + phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PSE_HEADER, info->extack); if (IS_ERR(phydev)) return -ENODEV; @@ -80,7 +80,7 @@ static int pse_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct pse_reply_data *data = PSE_REPDATA(reply_base); - const struct pse_control_status *st = &data->status; + const struct ethtool_pse_control_status *st = &data->status; int len = 0; if (st->podl_admin_state > 0) @@ -114,7 +114,7 @@ static int pse_reply_size(const struct ethnl_req_info *req_base, } static int pse_put_pw_limit_ranges(struct sk_buff *skb, - const struct pse_control_status *st) + const struct ethtool_pse_control_status *st) { const struct ethtool_c33_pse_pw_limit_range *pw_limit_ranges; int i; @@ -146,7 +146,7 @@ static int pse_fill_reply(struct sk_buff *skb, const struct 
ethnl_reply_data *reply_base) { const struct pse_reply_data *data = PSE_REPDATA(reply_base); - const struct pse_control_status *st = &data->status; + const struct ethtool_pse_control_status *st = &data->status; if (st->podl_admin_state > 0 && nla_put_u32(skb, ETHTOOL_A_PODL_PSE_ADMIN_STATE, @@ -261,7 +261,7 @@ ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info) struct phy_device *phydev; int ret; - phydev = ethnl_req_get_phydev(req_info, tb[ETHTOOL_A_PSE_HEADER], + phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PSE_HEADER, info->extack); ret = ethnl_set_pse_validate(phydev, info); if (ret) diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index b7865a14fdf8..aeedd5ec6b8c 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <net/netdev_queues.h> + #include "netlink.h" #include "common.h" @@ -37,6 +39,10 @@ static int rings_prepare_data(const struct ethnl_req_info *req_base, ret = ethnl_ops_begin(dev); if (ret < 0) return ret; + + data->kernel_ringparam.tcp_data_split = dev->cfg->hds_config; + data->kernel_ringparam.hds_thresh = dev->cfg->hds_thresh; + dev->ethtool_ops->get_ringparam(dev, &data->ringparam, &data->kernel_ringparam, info->extack); ethnl_ops_complete(dev); @@ -61,7 +67,9 @@ static int rings_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u8)) + /* _RINGS_TX_PUSH */ nla_total_size(sizeof(u8))) + /* _RINGS_RX_PUSH */ nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN */ - nla_total_size(sizeof(u32)); /* _RINGS_TX_PUSH_BUF_LEN_MAX */ + nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN_MAX */ + nla_total_size(sizeof(u32)) + /* _RINGS_HDS_THRESH */ + nla_total_size(sizeof(u32)); /* _RINGS_HDS_THRESH_MAX*/ } static int rings_fill_reply(struct sk_buff *skb, @@ -108,7 +116,12 @@ static int rings_fill_reply(struct sk_buff *skb, (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX, kr->tx_push_buf_max_len) || nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN, - kr->tx_push_buf_len)))) + kr->tx_push_buf_len))) || + ((supported_ring_params & ETHTOOL_RING_USE_HDS_THRS) && + (nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH, + kr->hds_thresh) || + nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH_MAX, + kr->hds_thresh_max)))) return -EMSGSIZE; return 0; @@ -130,6 +143,7 @@ const struct nla_policy ethnl_rings_set_policy[] = { [ETHTOOL_A_RINGS_TX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_RINGS_RX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1), [ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] = { .type = NLA_U32 }, + [ETHTOOL_A_RINGS_HDS_THRESH] = { .type = NLA_U32 }, }; static int @@ -155,6 +169,14 @@ ethnl_set_rings_validate(struct ethnl_req_info *req_info, return -EOPNOTSUPP; } + if (tb[ETHTOOL_A_RINGS_HDS_THRESH] && + !(ops->supported_ring_params & ETHTOOL_RING_USE_HDS_THRS)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_RINGS_HDS_THRESH], + "setting hds-thresh is not supported"); + return -EOPNOTSUPP; + } + if (tb[ETHTOOL_A_RINGS_CQE_SIZE] && !(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) { NL_SET_ERR_MSG_ATTR(info->extack, @@ -193,16 +215,16 @@ ethnl_set_rings_validate(struct ethnl_req_info *req_info, static int ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) { - struct kernel_ethtool_ringparam kernel_ringparam = {}; - struct ethtool_ringparam ringparam = {}; + struct kernel_ethtool_ringparam kernel_ringparam; struct net_device *dev = req_info->dev; + struct ethtool_ringparam ringparam; struct nlattr **tb = info->attrs; const struct nlattr 
*err_attr; bool mod = false; int ret; - dev->ethtool_ops->get_ringparam(dev, &ringparam, - &kernel_ringparam, info->extack); + ethtool_ringparam_get_cfg(dev, &ringparam, &kernel_ringparam, + info->extack); ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod); ethnl_update_u32(&ringparam.rx_mini_pending, @@ -222,9 +244,32 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) tb[ETHTOOL_A_RINGS_RX_PUSH], &mod); ethnl_update_u32(&kernel_ringparam.tx_push_buf_len, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], &mod); + ethnl_update_u32(&kernel_ringparam.hds_thresh, + tb[ETHTOOL_A_RINGS_HDS_THRESH], &mod); if (!mod) return 0; + if (kernel_ringparam.tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + dev_xdp_sb_prog_count(dev)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], + "tcp-data-split can not be enabled with single buffer XDP"); + return -EINVAL; + } + + if (dev_get_min_mp_channel_count(dev)) { + if (kernel_ringparam.tcp_data_split != + ETHTOOL_TCP_DATA_SPLIT_ENABLED) { + NL_SET_ERR_MSG(info->extack, + "can't disable tcp-data-split while device has memory provider enabled"); + return -EINVAL; + } else if (kernel_ringparam.hds_thresh) { + NL_SET_ERR_MSG(info->extack, + "can't set non-zero hds_thresh while device is memory provider enabled"); + return -EINVAL; + } + } + /* ensure new ring parameters are within limits */ if (ringparam.rx_pending > ringparam.rx_max_pending) err_attr = tb[ETHTOOL_A_RINGS_RX]; @@ -234,6 +279,8 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) err_attr = tb[ETHTOOL_A_RINGS_RX_JUMBO]; else if (ringparam.tx_pending > ringparam.tx_max_pending) err_attr = tb[ETHTOOL_A_RINGS_TX]; + else if (kernel_ringparam.hds_thresh > kernel_ringparam.hds_thresh_max) + err_attr = tb[ETHTOOL_A_RINGS_HDS_THRESH]; else err_attr = NULL; if (err_attr) { @@ -250,6 +297,9 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) return -EINVAL; } + dev->cfg_pending->hds_config = kernel_ringparam.tcp_data_split; + dev->cfg_pending->hds_thresh = kernel_ringparam.hds_thresh; + ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); return ret < 0 ? 
ret : 1; diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 7cb106b590ab..6d9b1769896b 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <net/netdev_lock.h> + #include "netlink.h" #include "common.h" @@ -107,6 +109,8 @@ rss_prepare_ctx(const struct rss_req_info *request, struct net_device *dev, u32 total_size, indir_bytes; u8 *rss_config; + data->no_key_fields = !dev->ethtool_ops->rxfh_per_ctx_key; + ctx = xa_load(&dev->ethtool->rss_ctx, request->rss_context); if (!ctx) return -ENOENT; @@ -153,7 +157,6 @@ rss_prepare_data(const struct ethnl_req_info *req_base, if (!ops->cap_rss_ctx_supported && !ops->create_rxfh_context) return -EOPNOTSUPP; - data->no_key_fields = !ops->rxfh_per_ctx_key; return rss_prepare_ctx(request, dev, data, info); } @@ -344,7 +347,9 @@ int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb) if (ctx->match_ifindex && ctx->match_ifindex != ctx->ifindex) break; + netdev_lock_ops(dev); ret = rss_dump_one_dev(skb, cb, dev); + netdev_unlock_ops(dev); if (ret) break; } diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c index 912f0c4fff2f..3ca8eb2a3b31 100644 --- a/net/ethtool/stats.c +++ b/net/ethtool/stats.c @@ -1,5 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <linux/phy.h> +#include <linux/phylib_stubs.h> + #include "netlink.h" #include "common.h" #include "bitset.h" @@ -20,6 +23,7 @@ struct stats_reply_data { struct ethtool_eth_mac_stats mac_stats; struct ethtool_eth_ctrl_stats ctrl_stats; struct ethtool_rmon_stats rmon_stats; + struct ethtool_phy_stats phydev_stats; ); const struct ethtool_rmon_hist_range *rmon_ranges; }; @@ -32,6 +36,7 @@ const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_STATS_ETH_MAC] = "eth-mac", [ETHTOOL_STATS_ETH_CTRL] = "eth-ctrl", [ETHTOOL_STATS_RMON] = "rmon", + [ETHTOOL_STATS_PHY] = "phydev", }; const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = { @@ -76,6 +81,15 @@ const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN] = { [ETHTOOL_A_STATS_RMON_JABBER] = "etherStatsJabbers", }; +const char stats_phy_names[__ETHTOOL_A_STATS_PHY_CNT][ETH_GSTRING_LEN] = { + [ETHTOOL_A_STATS_PHY_RX_PKTS] = "RxFrames", + [ETHTOOL_A_STATS_PHY_RX_BYTES] = "RxOctets", + [ETHTOOL_A_STATS_PHY_RX_ERRORS] = "RxErrors", + [ETHTOOL_A_STATS_PHY_TX_PKTS] = "TxFrames", + [ETHTOOL_A_STATS_PHY_TX_BYTES] = "TxOctets", + [ETHTOOL_A_STATS_PHY_TX_ERRORS] = "TxErrors", +}; + const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_SRC + 1] = { [ETHTOOL_A_STATS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), @@ -120,8 +134,15 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, struct stats_reply_data *data = STATS_REPDATA(reply_base); enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; + struct nlattr **tb = info->attrs; + struct phy_device *phydev; int ret; + phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_STATS_HEADER, + info->extack); + if (IS_ERR(phydev)) + return PTR_ERR(phydev); + ret = ethnl_ops_begin(dev); if (ret < 0) return ret; @@ -145,6 +166,14 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base, data->ctrl_stats.src = src; data->rmon_stats.src = src; + if ((test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask) || + test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) && + src == ETHTOOL_MAC_STATS_SRC_AGGREGATE) { + if (phydev) + phy_ethtool_get_phy_stats(phydev, &data->phy_stats, + &data->phydev_stats); 
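+ /* one phylib call filled both the IEEE 802.3 eth-phy counters + * (data->phy_stats) and the driver-independent phydev group + * (data->phydev_stats) + */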
+ } + if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) && dev->ethtool_ops->get_eth_phy_stats) dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats); @@ -194,6 +223,10 @@ static int stats_reply_size(const struct ethnl_req_info *req_base, nla_total_size(4)) * /* _A_STATS_GRP_HIST_BKT_HI */ ETHTOOL_RMON_HIST_MAX * 2; } + if (test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask)) { + n_stats += sizeof(struct ethtool_phy_stats) / sizeof(u64); + n_grps++; + } len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */ nla_total_size(4) + /* _A_STATS_GRP_ID */ @@ -247,6 +280,25 @@ static int stats_put_phy_stats(struct sk_buff *skb, return 0; } +static int stats_put_phydev_stats(struct sk_buff *skb, + const struct stats_reply_data *data) +{ + if (stat_put(skb, ETHTOOL_A_STATS_PHY_RX_PKTS, + data->phydev_stats.rx_packets) || + stat_put(skb, ETHTOOL_A_STATS_PHY_RX_BYTES, + data->phydev_stats.rx_bytes) || + stat_put(skb, ETHTOOL_A_STATS_PHY_RX_ERRORS, + data->phydev_stats.rx_errors) || + stat_put(skb, ETHTOOL_A_STATS_PHY_TX_PKTS, + data->phydev_stats.tx_packets) || + stat_put(skb, ETHTOOL_A_STATS_PHY_TX_BYTES, + data->phydev_stats.tx_bytes) || + stat_put(skb, ETHTOOL_A_STATS_PHY_TX_ERRORS, + data->phydev_stats.tx_errors)) + return -EMSGSIZE; + return 0; +} + static int stats_put_mac_stats(struct sk_buff *skb, const struct stats_reply_data *data) { @@ -423,6 +475,9 @@ static int stats_fill_reply(struct sk_buff *skb, if (!ret && test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask)) ret = stats_put_stats(skb, data, ETHTOOL_STATS_RMON, ETH_SS_STATS_RMON, stats_put_rmon_stats); + if (!ret && test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask)) + ret = stats_put_stats(skb, data, ETHTOOL_STATS_PHY, + ETH_SS_STATS_PHY, stats_put_phydev_stats); return ret; } diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index b3382b3cf325..f6a67109beda 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -75,6 +75,11 @@ static const struct strset_info info_template[] = { .count = __HWTSTAMP_FILTER_CNT, .strings = ts_rx_filter_names, }, + [ETH_SS_TS_FLAGS] = { + .per_dev = false, + .count = __HWTSTAMP_FLAG_CNT, + .strings = ts_flags_names, + }, [ETH_SS_UDP_TUNNEL_TYPES] = { .per_dev = false, .count = __ETHTOOL_UDP_TUNNEL_TYPE_CNT, @@ -105,6 +110,11 @@ static const struct strset_info info_template[] = { .count = __ETHTOOL_A_STATS_RMON_CNT, .strings = stats_rmon_names, }, + [ETH_SS_STATS_PHY] = { + .per_dev = false, + .count = __ETHTOOL_A_STATS_PHY_CNT, + .strings = stats_phy_names, + }, }; struct strset_req_info { @@ -299,7 +309,7 @@ static int strset_prepare_data(const struct ethnl_req_info *req_base, return 0; } - phydev = ethnl_req_get_phydev(req_base, tb[ETHTOOL_A_HEADER_FLAGS], + phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_HEADER_FLAGS, info->extack); /* phydev can be NULL, check for errors only */ diff --git a/net/ethtool/ts.h b/net/ethtool/ts.h new file mode 100644 index 000000000000..d901a879a671 --- /dev/null +++ b/net/ethtool/ts.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef _NET_ETHTOOL_TS_H +#define _NET_ETHTOOL_TS_H + +#include "netlink.h" + +static const struct nla_policy +ethnl_ts_hwtst_prov_policy[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_MAX + 1] = { + [ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX] = { .type = NLA_U32 }, + [ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER] = + NLA_POLICY_MAX(NLA_U32, HWTSTAMP_PROVIDER_QUALIFIER_CNT - 1) +}; + +int ts_parse_hwtst_provider(const struct nlattr *nest, + struct hwtstamp_provider_desc *hwprov_desc, + struct netlink_ext_ack *extack, + 
bool *mod); + +#endif /* _NET_ETHTOOL_TS_H */ diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c new file mode 100644 index 000000000000..2be356bdfe87 --- /dev/null +++ b/net/ethtool/tsconfig.c @@ -0,0 +1,457 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/net_tstamp.h> +#include <linux/ptp_clock_kernel.h> + +#include "netlink.h" +#include "common.h" +#include "bitset.h" +#include "../core/dev.h" +#include "ts.h" + +struct tsconfig_req_info { + struct ethnl_req_info base; +}; + +struct tsconfig_reply_data { + struct ethnl_reply_data base; + struct hwtstamp_provider_desc hwprov_desc; + struct { + u32 tx_type; + u32 rx_filter; + u32 flags; + } hwtst_config; +}; + +#define TSCONFIG_REPDATA(__reply_base) \ + container_of(__reply_base, struct tsconfig_reply_data, base) + +const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1] = { + [ETHTOOL_A_TSCONFIG_HEADER] = + NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static int tsconfig_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + const struct genl_info *info) +{ + struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base); + struct hwtstamp_provider *hwprov = NULL; + struct net_device *dev = reply_base->dev; + struct kernel_hwtstamp_config cfg = {}; + int ret; + + if (!dev->netdev_ops->ndo_hwtstamp_get) + return -EOPNOTSUPP; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + ret = dev_get_hwtstamp_phylib(dev, &cfg); + if (ret) + goto out; + + data->hwtst_config.tx_type = BIT(cfg.tx_type); + data->hwtst_config.rx_filter = BIT(cfg.rx_filter); + data->hwtst_config.flags = cfg.flags; + + data->hwprov_desc.index = -1; + hwprov = rtnl_dereference(dev->hwprov); + if (hwprov) { + data->hwprov_desc.index = hwprov->desc.index; + data->hwprov_desc.qualifier = hwprov->desc.qualifier; + } else { + struct kernel_ethtool_ts_info ts_info = {}; + + ts_info.phc_index = -1; + ret = __ethtool_get_ts_info(dev, &ts_info); + if (ret) + goto out; + + if (ts_info.phc_index == -1) + return -ENODEV; + + data->hwprov_desc.index = ts_info.phc_index; + data->hwprov_desc.qualifier = ts_info.phc_qualifier; + } + +out: + ethnl_ops_complete(dev); + return ret; +} + +static int tsconfig_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base); + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + int len = 0; + int ret; + + BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32); + BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32); + BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32); + + if (data->hwtst_config.flags) { + ret = ethnl_bitset32_size(&data->hwtst_config.flags, + NULL, __HWTSTAMP_FLAG_CNT, + ts_flags_names, compact); + if (ret < 0) + return ret; + len += ret; /* _TSCONFIG_HWTSTAMP_FLAGS */ + } + + if (data->hwtst_config.tx_type) { + ret = ethnl_bitset32_size(&data->hwtst_config.tx_type, + NULL, __HWTSTAMP_TX_CNT, + ts_tx_type_names, compact); + if (ret < 0) + return ret; + len += ret; /* _TSCONFIG_TX_TYPES */ + } + if (data->hwtst_config.rx_filter) { + ret = ethnl_bitset32_size(&data->hwtst_config.rx_filter, + NULL, __HWTSTAMP_FILTER_CNT, + ts_rx_filter_names, compact); + if (ret < 0) + return ret; + len += ret; /* _TSCONFIG_RX_FILTERS */ + } + + if (data->hwprov_desc.index >= 0) + /* _TSCONFIG_HWTSTAMP_PROVIDER */ + len += nla_total_size(0) + + 2 * nla_total_size(sizeof(u32)); + + return len; +} + +static int tsconfig_fill_reply(struct sk_buff *skb, + const struct 
ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base); + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + int ret; + + if (data->hwtst_config.flags) { + ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS, + &data->hwtst_config.flags, NULL, + __HWTSTAMP_FLAG_CNT, + ts_flags_names, compact); + if (ret < 0) + return ret; + } + + if (data->hwtst_config.tx_type) { + ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_TX_TYPES, + &data->hwtst_config.tx_type, NULL, + __HWTSTAMP_TX_CNT, + ts_tx_type_names, compact); + if (ret < 0) + return ret; + } + + if (data->hwtst_config.rx_filter) { + ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_RX_FILTERS, + &data->hwtst_config.rx_filter, + NULL, __HWTSTAMP_FILTER_CNT, + ts_rx_filter_names, compact); + if (ret < 0) + return ret; + } + + if (data->hwprov_desc.index >= 0) { + struct nlattr *nest; + + nest = nla_nest_start(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX, + data->hwprov_desc.index) || + nla_put_u32(skb, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER, + data->hwprov_desc.qualifier)) { + nla_nest_cancel(skb, nest); + return -EMSGSIZE; + } + + nla_nest_end(skb, nest); + } + return 0; +} + +/* TSCONFIG_SET */ +const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1] = { + [ETHTOOL_A_TSCONFIG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER] = + NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy), + [ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS] = { .type = NLA_NESTED }, + [ETHTOOL_A_TSCONFIG_RX_FILTERS] = { .type = NLA_NESTED }, + [ETHTOOL_A_TSCONFIG_TX_TYPES] = { .type = NLA_NESTED }, +}; + +static int tsconfig_send_reply(struct net_device *dev, struct genl_info *info) +{ + struct tsconfig_reply_data *reply_data; + struct tsconfig_req_info *req_info; + struct sk_buff *rskb; + void *reply_payload; + int reply_len = 0; + int ret; + + req_info = kzalloc(sizeof(*req_info), GFP_KERNEL); + if (!req_info) + return -ENOMEM; + reply_data = kmalloc(sizeof(*reply_data), GFP_KERNEL); + if (!reply_data) { + kfree(req_info); + return -ENOMEM; + } + + ASSERT_RTNL(); + reply_data->base.dev = dev; + ret = tsconfig_prepare_data(&req_info->base, &reply_data->base, info); + if (ret < 0) + goto err_cleanup; + + ret = tsconfig_reply_size(&req_info->base, &reply_data->base); + if (ret < 0) + goto err_cleanup; + + reply_len = ret + ethnl_reply_header_size(); + rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_TSCONFIG_SET_REPLY, + ETHTOOL_A_TSCONFIG_HEADER, info, &reply_payload); + if (!rskb) + goto err_cleanup; + + ret = tsconfig_fill_reply(rskb, &req_info->base, &reply_data->base); + if (ret < 0) + goto err_cleanup; + + genlmsg_end(rskb, reply_payload); + ret = genlmsg_reply(rskb, info); + +err_cleanup: + kfree(reply_data); + kfree(req_info); + return ret; +} + +static int ethnl_set_tsconfig_validate(struct ethnl_req_info *req_base, + struct genl_info *info) +{ + const struct net_device_ops *ops = req_base->dev->netdev_ops; + + if (!ops->ndo_hwtstamp_set || !ops->ndo_hwtstamp_get) + return -EOPNOTSUPP; + + return 1; +} + +static struct hwtstamp_provider * +tsconfig_set_hwprov_from_desc(struct net_device *dev, + struct genl_info *info, + struct hwtstamp_provider_desc *hwprov_desc) +{ + struct kernel_ethtool_ts_info ts_info; + struct hwtstamp_provider *hwprov; + struct nlattr **tb = info->attrs; + struct phy_device *phy 
= NULL; + enum hwtstamp_source source; + int ret; + + ret = ethtool_net_get_ts_info_by_phc(dev, &ts_info, hwprov_desc); + if (!ret) { + /* Found */ + source = HWTSTAMP_SOURCE_NETDEV; + } else { + phy = ethtool_phy_get_ts_info_by_phc(dev, &ts_info, hwprov_desc); + if (IS_ERR(phy)) { + if (PTR_ERR(phy) == -ENODEV) + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER], + "phc not in this net device topology"); + return ERR_CAST(phy); + } + + source = HWTSTAMP_SOURCE_PHYLIB; + } + + hwprov = kzalloc(sizeof(*hwprov), GFP_KERNEL); + if (!hwprov) + return ERR_PTR(-ENOMEM); + + hwprov->desc.index = hwprov_desc->index; + hwprov->desc.qualifier = hwprov_desc->qualifier; + hwprov->source = source; + hwprov->phydev = phy; + + return hwprov; +} + +static int ethnl_set_tsconfig(struct ethnl_req_info *req_base, + struct genl_info *info) +{ + struct kernel_hwtstamp_config hwtst_config = {0}; + bool hwprov_mod = false, config_mod = false; + struct hwtstamp_provider *hwprov = NULL; + struct net_device *dev = req_base->dev; + struct nlattr **tb = info->attrs; + int ret; + + BUILD_BUG_ON(__HWTSTAMP_TX_CNT >= 32); + BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT >= 32); + BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32); + + if (!netif_device_present(dev)) + return -ENODEV; + + if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER]) { + struct hwtstamp_provider_desc __hwprov_desc = {.index = -1}; + struct hwtstamp_provider *__hwprov; + + __hwprov = rtnl_dereference(dev->hwprov); + if (__hwprov) { + __hwprov_desc.index = __hwprov->desc.index; + __hwprov_desc.qualifier = __hwprov->desc.qualifier; + } + + ret = ts_parse_hwtst_provider(tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER], + &__hwprov_desc, info->extack, + &hwprov_mod); + if (ret < 0) + return ret; + + if (hwprov_mod) { + hwprov = tsconfig_set_hwprov_from_desc(dev, info, + &__hwprov_desc); + if (IS_ERR(hwprov)) + return PTR_ERR(hwprov); + } + } + + /* Get current hwtstamp config if we are not changing the + * hwtstamp source. It will be zeroed in the other case. 
+ */ + if (!hwprov_mod) { + ret = dev_get_hwtstamp_phylib(dev, &hwtst_config); + if (ret < 0 && ret != -EOPNOTSUPP) + goto err_free_hwprov; + } + + /* Get the hwtstamp config from netlink */ + if (tb[ETHTOOL_A_TSCONFIG_TX_TYPES]) { + u32 req_tx_type; + + req_tx_type = BIT(hwtst_config.tx_type); + ret = ethnl_update_bitset32(&req_tx_type, + __HWTSTAMP_TX_CNT, + tb[ETHTOOL_A_TSCONFIG_TX_TYPES], + ts_tx_type_names, info->extack, + &config_mod); + if (ret < 0) + goto err_free_hwprov; + + /* Select only one tx type at a time */ + if (ffs(req_tx_type) != fls(req_tx_type)) { + ret = -EINVAL; + goto err_free_hwprov; + } + + hwtst_config.tx_type = ffs(req_tx_type) - 1; + } + + if (tb[ETHTOOL_A_TSCONFIG_RX_FILTERS]) { + u32 req_rx_filter; + + req_rx_filter = BIT(hwtst_config.rx_filter); + ret = ethnl_update_bitset32(&req_rx_filter, + __HWTSTAMP_FILTER_CNT, + tb[ETHTOOL_A_TSCONFIG_RX_FILTERS], + ts_rx_filter_names, info->extack, + &config_mod); + if (ret < 0) + goto err_free_hwprov; + + /* Select only one rx filter at a time */ + if (ffs(req_rx_filter) != fls(req_rx_filter)) { + ret = -EINVAL; + goto err_free_hwprov; + } + + hwtst_config.rx_filter = ffs(req_rx_filter) - 1; + } + + if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS]) { + ret = ethnl_update_bitset32(&hwtst_config.flags, + __HWTSTAMP_FLAG_CNT, + tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS], + ts_flags_names, info->extack, + &config_mod); + if (ret < 0) + goto err_free_hwprov; + } + + ret = net_hwtstamp_validate(&hwtst_config); + if (ret) + goto err_free_hwprov; + + if (hwprov_mod) { + struct kernel_hwtstamp_config zero_config = {0}; + struct hwtstamp_provider *__hwprov; + + /* Disable current time stamping if we try to enable + * another one + */ + ret = dev_set_hwtstamp_phylib(dev, &zero_config, info->extack); + if (ret < 0) + goto err_free_hwprov; + + /* Change the selected hwtstamp source */ + __hwprov = rcu_replace_pointer_rtnl(dev->hwprov, hwprov); + if (__hwprov) + kfree_rcu(__hwprov, rcu_head); + } + + if (config_mod) { + ret = dev_set_hwtstamp_phylib(dev, &hwtst_config, + info->extack); + if (ret < 0) + return ret; + } + + if (hwprov_mod || config_mod) { + ret = tsconfig_send_reply(dev, info); + if (ret && ret != -EOPNOTSUPP) { + NL_SET_ERR_MSG(info->extack, + "error while reading the new configuration set"); + return ret; + } + } + + /* tsconfig has no notification */ + return 0; + +err_free_hwprov: + kfree(hwprov); + + return ret; +} + +const struct ethnl_request_ops ethnl_tsconfig_request_ops = { + .request_cmd = ETHTOOL_MSG_TSCONFIG_GET, + .reply_cmd = ETHTOOL_MSG_TSCONFIG_GET_REPLY, + .hdr_attr = ETHTOOL_A_TSCONFIG_HEADER, + .req_info_size = sizeof(struct tsconfig_req_info), + .reply_data_size = sizeof(struct tsconfig_reply_data), + + .prepare_data = tsconfig_prepare_data, + .reply_size = tsconfig_reply_size, + .fill_reply = tsconfig_fill_reply, + + .set_validate = ethnl_set_tsconfig_validate, + .set = ethnl_set_tsconfig, +}; diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c index 03d12d6f79ca..8c654caa6805 100644 --- a/net/ethtool/tsinfo.c +++ b/net/ethtool/tsinfo.c @@ -1,13 +1,19 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/net_tstamp.h> +#include <linux/phy.h> +#include <linux/phy_link_topology.h> +#include <linux/ptp_clock_kernel.h> +#include <net/netdev_lock.h> #include "netlink.h" #include "common.h" #include "bitset.h" +#include "ts.h" struct tsinfo_req_info { struct ethnl_req_info base; + struct hwtstamp_provider_desc hwprov_desc; }; struct tsinfo_reply_data { @@ -16,34 +22,96 @@ struct tsinfo_reply_data { 
struct ethtool_ts_stats stats; }; +#define TSINFO_REQINFO(__req_base) \ + container_of(__req_base, struct tsinfo_req_info, base) + #define TSINFO_REPDATA(__reply_base) \ container_of(__reply_base, struct tsinfo_reply_data, base) #define ETHTOOL_TS_STAT_CNT \ (__ETHTOOL_A_TS_STAT_CNT - (ETHTOOL_A_TS_STAT_UNSPEC + 1)) -const struct nla_policy ethnl_tsinfo_get_policy[] = { +const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1] = { [ETHTOOL_A_TSINFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), + [ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER] = + NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy), }; +int ts_parse_hwtst_provider(const struct nlattr *nest, + struct hwtstamp_provider_desc *hwprov_desc, + struct netlink_ext_ack *extack, + bool *mod) +{ + struct nlattr *tb[ARRAY_SIZE(ethnl_ts_hwtst_prov_policy)]; + int ret; + + ret = nla_parse_nested(tb, + ARRAY_SIZE(ethnl_ts_hwtst_prov_policy) - 1, + nest, + ethnl_ts_hwtst_prov_policy, extack); + if (ret < 0) + return ret; + + if (NL_REQ_ATTR_CHECK(extack, nest, tb, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX) || + NL_REQ_ATTR_CHECK(extack, nest, tb, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER)) + return -EINVAL; + + ethnl_update_u32(&hwprov_desc->index, + tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX], + mod); + ethnl_update_u32(&hwprov_desc->qualifier, + tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER], + mod); + + return 0; +} + +static int +tsinfo_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, + struct netlink_ext_ack *extack) +{ + struct tsinfo_req_info *req = TSINFO_REQINFO(req_base); + bool mod = false; + + req->hwprov_desc.index = -1; + + if (!tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER]) + return 0; + + return ts_parse_hwtst_provider(tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER], + &req->hwprov_desc, extack, &mod); +} + static int tsinfo_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base); + struct tsinfo_req_info *req = TSINFO_REQINFO(req_base); struct net_device *dev = reply_base->dev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; + + if (req->hwprov_desc.index != -1) { + ret = ethtool_get_ts_info_by_phc(dev, &data->ts_info, + &req->hwprov_desc); + ethnl_ops_complete(dev); + return ret; + } + if (req_base->flags & ETHTOOL_FLAG_STATS) { ethtool_stats_init((u64 *)&data->stats, sizeof(data->stats) / sizeof(u64)); if (dev->ethtool_ops->get_ts_stats) dev->ethtool_ops->get_ts_stats(dev, &data->stats); } + ret = __ethtool_get_ts_info(dev, &data->ts_info); ethnl_ops_complete(dev); @@ -87,8 +155,17 @@ static int tsinfo_reply_size(const struct ethnl_req_info *req_base, return ret; len += ret; /* _TSINFO_RX_FILTERS */ } - if (ts_info->phc_index >= 0) + if (ts_info->phc_index >= 0) { len += nla_total_size(sizeof(u32)); /* _TSINFO_PHC_INDEX */ + /* _TSINFO_HWTSTAMP_PROVIDER */ + len += nla_total_size(0) + 2 * nla_total_size(sizeof(u32)); + } + if (ts_info->phc_source) { + len += nla_total_size(sizeof(u32)); /* _TSINFO_HWTSTAMP_SOURCE */ + if (ts_info->phc_phyindex) + /* _TSINFO_HWTSTAMP_PHYINDEX */ + len += nla_total_size(sizeof(u32)); + } if (req_base->flags & ETHTOOL_FLAG_STATS) len += nla_total_size(0) + /* _TSINFO_STATS */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_TS_STAT_CNT; @@ -116,6 +193,8 @@ static int tsinfo_put_stats(struct sk_buff *skb, if (tsinfo_put_stat(skb, stats->tx_stats.pkts, ETHTOOL_A_TS_STAT_TX_PKTS) || + tsinfo_put_stat(skb, stats->tx_stats.onestep_pkts_unconfirmed, + 
ETHTOOL_A_TS_STAT_TX_ONESTEP_PKTS_UNCONFIRMED) || tsinfo_put_stat(skb, stats->tx_stats.lost, ETHTOOL_A_TS_STAT_TX_LOST) || tsinfo_put_stat(skb, stats->tx_stats.err, @@ -163,9 +242,39 @@ static int tsinfo_fill_reply(struct sk_buff *skb, if (ret < 0) return ret; } - if (ts_info->phc_index >= 0 && - nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX, ts_info->phc_index)) - return -EMSGSIZE; + if (ts_info->phc_index >= 0) { + struct nlattr *nest; + + ret = nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX, + ts_info->phc_index); + if (ret) + return -EMSGSIZE; + + nest = nla_nest_start(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX, + ts_info->phc_index) || + nla_put_u32(skb, + ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER, + ts_info->phc_qualifier)) { + nla_nest_cancel(skb, nest); + return -EMSGSIZE; + } + + nla_nest_end(skb, nest); + } + if (ts_info->phc_source) { + if (nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_SOURCE, + ts_info->phc_source)) + return -EMSGSIZE; + + if (ts_info->phc_phyindex && + nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PHYINDEX, + ts_info->phc_phyindex)) + return -EMSGSIZE; + } if (req_base->flags & ETHTOOL_FLAG_STATS && tsinfo_put_stats(skb, &data->stats)) return -EMSGSIZE; @@ -173,6 +282,274 @@ static int tsinfo_fill_reply(struct sk_buff *skb, return 0; } +struct ethnl_tsinfo_dump_ctx { + struct tsinfo_req_info *req_info; + struct tsinfo_reply_data *reply_data; + unsigned long pos_ifindex; + bool netdev_dump_done; + unsigned long pos_phyindex; + enum hwtstamp_provider_qualifier pos_phcqualifier; +}; + +static void *ethnl_tsinfo_prepare_dump(struct sk_buff *skb, + struct net_device *dev, + struct tsinfo_reply_data *reply_data, + struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + void *ehdr = NULL; + + ehdr = ethnl_dump_put(skb, cb, + ETHTOOL_MSG_TSINFO_GET_REPLY); + if (!ehdr) + return ERR_PTR(-EMSGSIZE); + + reply_data = ctx->reply_data; + memset(reply_data, 0, sizeof(*reply_data)); + reply_data->base.dev = dev; + reply_data->ts_info.cmd = ETHTOOL_GET_TS_INFO; + reply_data->ts_info.phc_index = -1; + + return ehdr; +} + +static int ethnl_tsinfo_end_dump(struct sk_buff *skb, + struct net_device *dev, + struct tsinfo_req_info *req_info, + struct tsinfo_reply_data *reply_data, + void *ehdr) +{ + int ret; + + reply_data->ts_info.so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | + SOF_TIMESTAMPING_SOFTWARE; + + ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TSINFO_HEADER); + if (ret < 0) + return ret; + + ret = tsinfo_fill_reply(skb, &req_info->base, &reply_data->base); + if (ret < 0) + return ret; + + reply_data->base.dev = NULL; + genlmsg_end(skb, ehdr); + + return ret; +} + +static int ethnl_tsinfo_dump_one_phydev(struct sk_buff *skb, + struct net_device *dev, + struct phy_device *phydev, + struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + struct tsinfo_reply_data *reply_data; + struct tsinfo_req_info *req_info; + void *ehdr = NULL; + int ret = 0; + + if (!phy_has_tsinfo(phydev)) + return -EOPNOTSUPP; + + reply_data = ctx->reply_data; + req_info = ctx->req_info; + ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb); + if (IS_ERR(ehdr)) + return PTR_ERR(ehdr); + + ret = phy_ts_info(phydev, &reply_data->ts_info); + if (ret < 0) + goto err; + + if (reply_data->ts_info.phc_index >= 0) { + reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_PHYLIB; + reply_data->ts_info.phc_phyindex = phydev->phyindex; + } + + ret = 
ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, ehdr); + if (ret < 0) + goto err; + + return ret; +err: + genlmsg_cancel(skb, ehdr); + return ret; +} + +static int ethnl_tsinfo_dump_one_netdev(struct sk_buff *skb, + struct net_device *dev, + struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + const struct ethtool_ops *ops = dev->ethtool_ops; + struct tsinfo_reply_data *reply_data; + struct tsinfo_req_info *req_info; + void *ehdr = NULL; + int ret = 0; + + if (!ops->get_ts_info) + return -EOPNOTSUPP; + + reply_data = ctx->reply_data; + req_info = ctx->req_info; + for (; ctx->pos_phcqualifier < HWTSTAMP_PROVIDER_QUALIFIER_CNT; + ctx->pos_phcqualifier++) { + if (!net_support_hwtstamp_qualifier(dev, + ctx->pos_phcqualifier)) + continue; + + ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb); + if (IS_ERR(ehdr)) { + ret = PTR_ERR(ehdr); + goto err; + } + + reply_data->ts_info.phc_qualifier = ctx->pos_phcqualifier; + ret = ops->get_ts_info(dev, &reply_data->ts_info); + if (ret < 0) + goto err; + + if (reply_data->ts_info.phc_index >= 0) + reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_NETDEV; + ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, + ehdr); + if (ret < 0) + goto err; + } + + return ret; + +err: + genlmsg_cancel(skb, ehdr); + return ret; +} + +static int ethnl_tsinfo_dump_one_net_topo(struct sk_buff *skb, + struct net_device *dev, + struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + struct phy_device_node *pdn; + int ret = 0; + + if (!ctx->netdev_dump_done) { + ret = ethnl_tsinfo_dump_one_netdev(skb, dev, cb); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + ctx->netdev_dump_done = true; + } + + if (!dev->link_topo) { + if (phy_has_tsinfo(dev->phydev)) { + ret = ethnl_tsinfo_dump_one_phydev(skb, dev, + dev->phydev, cb); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + } + + return 0; + } + + xa_for_each_start(&dev->link_topo->phys, ctx->pos_phyindex, pdn, + ctx->pos_phyindex) { + if (phy_has_tsinfo(pdn->phy)) { + ret = ethnl_tsinfo_dump_one_phydev(skb, dev, + pdn->phy, cb); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + } + } + + return ret; +} + +int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + struct net *net = sock_net(skb->sk); + struct net_device *dev; + int ret = 0; + + rtnl_lock(); + if (ctx->req_info->base.dev) { + dev = ctx->req_info->base.dev; + netdev_lock_ops(dev); + ret = ethnl_tsinfo_dump_one_net_topo(skb, dev, cb); + netdev_unlock_ops(dev); + } else { + for_each_netdev_dump(net, dev, ctx->pos_ifindex) { + netdev_lock_ops(dev); + ret = ethnl_tsinfo_dump_one_net_topo(skb, dev, cb); + netdev_unlock_ops(dev); + if (ret < 0 && ret != -EOPNOTSUPP) + break; + ctx->pos_phyindex = 0; + ctx->netdev_dump_done = false; + ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE; + } + } + rtnl_unlock(); + + return ret; +} + +int ethnl_tsinfo_start(struct netlink_callback *cb) +{ + const struct genl_dumpit_info *info = genl_dumpit_info(cb); + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + struct nlattr **tb = info->info.attrs; + struct tsinfo_reply_data *reply_data; + struct tsinfo_req_info *req_info; + int ret; + + BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); + + req_info = kzalloc(sizeof(*req_info), GFP_KERNEL); + if (!req_info) + return -ENOMEM; + reply_data = kzalloc(sizeof(*reply_data), GFP_KERNEL); + if (!reply_data) { + ret = -ENOMEM; + goto free_req_info; + } + + ret 
= ethnl_parse_header_dev_get(&req_info->base, + tb[ETHTOOL_A_TSINFO_HEADER], + sock_net(cb->skb->sk), cb->extack, + false); + if (ret < 0) + goto free_reply_data; + + ctx->req_info = req_info; + ctx->reply_data = reply_data; + ctx->pos_ifindex = 0; + ctx->pos_phyindex = 0; + ctx->netdev_dump_done = false; + ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE; + + return 0; + +free_reply_data: + kfree(reply_data); +free_req_info: + kfree(req_info); + + return ret; +} + +int ethnl_tsinfo_done(struct netlink_callback *cb) +{ + struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; + struct tsinfo_req_info *req_info = ctx->req_info; + + ethnl_parse_header_dev_put(&req_info->base); + kfree(ctx->reply_data); + kfree(ctx->req_info); + + return 0; +} + const struct ethnl_request_ops ethnl_tsinfo_request_ops = { .request_cmd = ETHTOOL_MSG_TSINFO_GET, .reply_cmd = ETHTOOL_MSG_TSINFO_GET_REPLY, @@ -180,6 +557,7 @@ const struct ethnl_request_ops ethnl_tsinfo_request_ops = { .req_info_size = sizeof(struct tsinfo_req_info), .reply_data_size = sizeof(struct tsinfo_reply_data), + .parse_request = tsinfo_parse_request, .prepare_data = tsinfo_prepare_data, .reply_size = tsinfo_reply_size, .fill_reply = tsinfo_fill_reply, diff --git a/net/hsr/Kconfig b/net/hsr/Kconfig index 1b048c17b6c8..fcacdf4f0ffc 100644 --- a/net/hsr/Kconfig +++ b/net/hsr/Kconfig @@ -38,3 +38,21 @@ config HSR relying on this code in a safety critical system! If unsure, say N. + +if HSR + +config PRP_DUP_DISCARD_KUNIT_TEST + tristate "PRP duplicate discard KUnit tests" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + Covers the PRP duplicate discard algorithm. + Only useful for kernel developers running the KUnit test harness; + it is not intended for inclusion in a production build. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N.
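
[Editor's note: the PRP duplicate discard algorithm this Kconfig entry covers is implemented in prp_register_frame_out() further down in net/hsr/hsr_framereg.c. As a reading aid, here is a minimal user-space sketch of the per-LAN drop-window bookkeeping, assuming a single remote node and omitting the locking and node-timeout handling of the real code; the names lan_window, seq_before and prp_duplicate are invented for illustration and are not kernel API.]

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PRP_DROP_WINDOW_LEN 32768

struct lan_window {
	uint16_t start;    /* first seq accepted here, unseen on the other LAN */
	uint16_t expected; /* last accepted sequence number + 1 */
};

static struct lan_window lan[2]; /* [0] = LAN A, [1] = LAN B */

/* Wraparound-safe "a before b", mirroring seq_nr_before() in hsr_framereg.c */
static bool seq_before(uint16_t a, uint16_t b)
{
	return (int16_t)(a - b) < 0;
}

/* Returns true when the frame is a duplicate and must be dropped */
static bool prp_duplicate(int rcv, uint16_t seq)
{
	int other = !rcv;

	if (lan[0].start == lan[0].expected &&
	    lan[1].start == lan[1].expected) {
		/* Both drop windows empty: nothing to compare against, accept */
		lan[rcv].start = seq;
	} else if (seq_before(seq, lan[other].expected) &&
		   !seq_before(seq, lan[other].start)) {
		/* Inside the other LAN's window: drop the frame, shrink that
		 * window past this sequence number and reset our own window
		 */
		lan[other].start = seq + 1;
		lan[rcv].expected = seq + 1;
		lan[rcv].start = lan[rcv].expected;
		return true;
	}

	/* Accept: clear the other LAN's window, extend ours (length-clamped) */
	lan[other].start = lan[other].expected;
	lan[rcv].expected = seq + 1;
	if ((uint16_t)(lan[rcv].expected - lan[rcv].start) > PRP_DROP_WINDOW_LEN)
		lan[rcv].start = lan[rcv].expected - PRP_DROP_WINDOW_LEN;
	return false;
}

int main(void)
{
	/* The first copy of a frame forwards; its twin on the other LAN drops */
	printf("A#2: %s\n", prp_duplicate(0, 2) ? "drop" : "forward"); /* forward */
	printf("B#2: %s\n", prp_duplicate(1, 2) ? "drop" : "forward"); /* drop */
	printf("B#3: %s\n", prp_duplicate(1, 3) ? "drop" : "forward"); /* forward */
	return 0;
}
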
+ +endif diff --git a/net/hsr/Makefile b/net/hsr/Makefile index 75df90d3b416..34e581db5c41 100644 --- a/net/hsr/Makefile +++ b/net/hsr/Makefile @@ -8,3 +8,5 @@ obj-$(CONFIG_HSR) += hsr.o hsr-y := hsr_main.o hsr_framereg.o hsr_device.o \ hsr_netlink.o hsr_slave.o hsr_forward.o hsr-$(CONFIG_DEBUG_FS) += hsr_debugfs.o + +obj-$(CONFIG_PRP_DUP_DISCARD_KUNIT_TEST) += prp_dup_discard_test.o diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index 1a195efc79cd..5b2cfac3b2ba 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -57,14 +57,11 @@ DEFINE_SHOW_ATTRIBUTE(hsr_node_table); void hsr_debugfs_rename(struct net_device *dev) { struct hsr_priv *priv = netdev_priv(dev); - struct dentry *d; + int err; - d = debugfs_rename(hsr_debugfs_root_dir, priv->node_tbl_root, - hsr_debugfs_root_dir, dev->name); - if (IS_ERR(d)) + err = debugfs_change_name(priv->node_tbl_root, "%s", dev->name); + if (err) netdev_warn(dev, "failed to rename\n"); - else - priv->node_tbl_root = d; } /* hsr_debugfs_init - create hsr node_table file for dumping diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 03eadd6c51fd..0d1e56965af0 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -33,14 +33,14 @@ static void hsr_set_operstate(struct hsr_port *master, bool has_carrier) struct net_device *dev = master->dev; if (!is_admin_up(dev)) { - netdev_set_operstate(dev, IF_OPER_DOWN); + netif_set_operstate(dev, IF_OPER_DOWN); return; } if (has_carrier) - netdev_set_operstate(dev, IF_OPER_UP); + netif_set_operstate(dev, IF_OPER_UP); else - netdev_set_operstate(dev, IF_OPER_LOWERLAYERDOWN); + netif_set_operstate(dev, IF_OPER_LOWERLAYERDOWN); } static bool hsr_check_carrier(struct hsr_port *master) @@ -616,6 +616,7 @@ static struct hsr_proto_ops hsr_ops = { .drop_frame = hsr_drop_frame, .fill_frame_info = hsr_fill_frame_info, .invalid_dan_ingress_frame = hsr_invalid_dan_ingress_frame, + .register_frame_out = hsr_register_frame_out, }; static struct hsr_proto_ops prp_ops = { @@ -626,6 +627,7 @@ static struct hsr_proto_ops prp_ops = { .fill_frame_info = prp_fill_frame_info, .handle_san_frame = prp_handle_san_frame, .update_san_info = prp_update_san_info, + .register_frame_out = prp_register_frame_out, }; void hsr_dev_setup(struct net_device *dev) @@ -643,7 +645,7 @@ void hsr_dev_setup(struct net_device *dev) /* Not sure about this. Taken from bridge code. netdevice.h says * it means "Does not change network namespaces". 
*/ - dev->netns_local = true; + dev->netns_immutable = true; dev->needs_free_netdev = true; @@ -663,6 +665,19 @@ bool is_hsr_master(struct net_device *dev) } EXPORT_SYMBOL(is_hsr_master); +struct net_device *hsr_get_port_ndev(struct net_device *ndev, + enum hsr_port_type pt) +{ + struct hsr_priv *hsr = netdev_priv(ndev); + struct hsr_port *port; + + hsr_for_each_port(hsr, port) + if (port->type == pt) + return port->dev; + return NULL; +} +EXPORT_SYMBOL(hsr_get_port_ndev); + /* Default multicast address for HSR Supervision frames */ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = { 0x01, 0x15, 0x4e, 0x00, 0x01, 0x00 @@ -746,6 +761,11 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], if (res) goto err_unregister; + if (protocol_version == PRP_V1) { + eth_hw_addr_set(slave[1], slave[0]->dev_addr); + call_netdevice_notifiers(NETDEV_CHANGEADDR, slave[1]); + } + if (interlink) { res = hsr_add_port(hsr, interlink, HSR_PT_INTERLINK, extack); if (res) diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 87bb3a91598e..c67c0d35921d 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -536,8 +536,8 @@ static void hsr_forward_do(struct hsr_frame_info *frame) * Also for SAN, this shouldn't be done. */ if (!frame->is_from_san && - hsr_register_frame_out(port, frame->node_src, - frame->sequence_nr)) + hsr->proto_ops->register_frame_out && + hsr->proto_ops->register_frame_out(port, frame)) continue; if (frame->is_supervision && port->type == HSR_PT_MASTER && @@ -700,9 +700,12 @@ static int fill_frame_info(struct hsr_frame_info *frame, frame->is_vlan = true; if (frame->is_vlan) { - if (skb->mac_len < offsetofend(struct hsr_vlan_ethhdr, vlanhdr)) + /* Note: skb->mac_len might be wrong here. 
*/ + if (!pskb_may_pull(skb, + skb_mac_offset(skb) + + offsetofend(struct hsr_vlan_ethhdr, vlanhdr))) return -EINVAL; - vlan_hdr = (struct hsr_vlan_ethhdr *)ethhdr; + vlan_hdr = (struct hsr_vlan_ethhdr *)skb_mac_header(skb); proto = vlan_hdr->vlanhdr.h_vlan_encapsulated_proto; } diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 73bc6f659812..4ce471a2f387 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -35,6 +35,7 @@ static bool seq_nr_after(u16 a, u16 b) #define seq_nr_before(a, b) seq_nr_after((b), (a)) #define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b))) +#define PRP_DROP_WINDOW_LEN 32768 bool hsr_addr_is_redbox(struct hsr_priv *hsr, unsigned char *addr) { @@ -176,8 +177,11 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, new_node->time_in[i] = now; new_node->time_out[i] = now; } - for (i = 0; i < HSR_PT_PORTS; i++) + for (i = 0; i < HSR_PT_PORTS; i++) { new_node->seq_out[i] = seq_out; + new_node->seq_expected[i] = seq_out + 1; + new_node->seq_start[i] = seq_out + 1; + } if (san && hsr->proto_ops->handle_san_frame) hsr->proto_ops->handle_san_frame(san, rx_port, new_node); @@ -482,9 +486,11 @@ void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, * 0 otherwise, or * negative error code on error */ -int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, - u16 sequence_nr) +int hsr_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame) { + struct hsr_node *node = frame->node_src; + u16 sequence_nr = frame->sequence_nr; + spin_lock_bh(&node->seq_out_lock); if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]) && time_is_after_jiffies(node->time_out[port->type] + @@ -499,6 +505,93 @@ int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, return 0; } +/* Adaptation of the PRP duplicate discard algorithm described in wireshark + * wiki (https://wiki.wireshark.org/PRP) + * + * A drop window is maintained for both LANs with start sequence set to the + * first sequence accepted on the LAN that has not been seen on the other LAN, + * and expected sequence set to the latest received sequence number plus one. + * + * When a frame is received on either LAN it is compared against the received + * frames on the other LAN. If it is outside the drop window of the other LAN + * the frame is accepted and the drop window is updated. + * The drop window for the other LAN is reset. + * + * 'port' is the outgoing interface + * 'frame' is the frame to be sent + * + * Return: + * 1 if frame can be shown to have been sent recently on this interface, + * 0 otherwise + */ +int prp_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame) +{ + enum hsr_port_type other_port; + enum hsr_port_type rcv_port; + struct hsr_node *node; + u16 sequence_diff; + u16 sequence_exp; + u16 sequence_nr; + + /* out-going frames are always in order + * and can be checked the same way as for HSR + */ + if (frame->port_rcv->type == HSR_PT_MASTER) + return hsr_register_frame_out(port, frame); + + /* for PRP we should only forward frames from the slave ports + * to the master port + */ + if (port->type != HSR_PT_MASTER) + return 1; + + node = frame->node_src; + sequence_nr = frame->sequence_nr; + sequence_exp = sequence_nr + 1; + rcv_port = frame->port_rcv->type; + other_port = rcv_port == HSR_PT_SLAVE_A ? 
HSR_PT_SLAVE_B : + HSR_PT_SLAVE_A; + + spin_lock_bh(&node->seq_out_lock); + if (time_is_before_jiffies(node->time_out[port->type] + + msecs_to_jiffies(HSR_ENTRY_FORGET_TIME)) || + (node->seq_start[rcv_port] == node->seq_expected[rcv_port] && + node->seq_start[other_port] == node->seq_expected[other_port])) { + /* the node hasn't been sending for a while + * or both drop windows are empty, forward the frame + */ + node->seq_start[rcv_port] = sequence_nr; + } else if (seq_nr_before(sequence_nr, node->seq_expected[other_port]) && + seq_nr_before_or_eq(node->seq_start[other_port], sequence_nr)) { + /* drop the frame, update the drop window for the other port + * and reset our drop window + */ + node->seq_start[other_port] = sequence_exp; + node->seq_expected[rcv_port] = sequence_exp; + node->seq_start[rcv_port] = node->seq_expected[rcv_port]; + spin_unlock_bh(&node->seq_out_lock); + return 1; + } + + /* update the drop window for the port where this frame was received + * and clear the drop window for the other port + */ + node->seq_start[other_port] = node->seq_expected[other_port]; + node->seq_expected[rcv_port] = sequence_exp; + sequence_diff = sequence_exp - node->seq_start[rcv_port]; + if (sequence_diff > PRP_DROP_WINDOW_LEN) + node->seq_start[rcv_port] = sequence_exp - PRP_DROP_WINDOW_LEN; + + node->time_out[port->type] = jiffies; + node->seq_out[port->type] = sequence_nr; + spin_unlock_bh(&node->seq_out_lock); + return 0; +} + +#if IS_MODULE(CONFIG_PRP_DUP_DISCARD_KUNIT_TEST) +EXPORT_SYMBOL(prp_register_frame_out); +#endif + static struct hsr_port *get_late_port(struct hsr_priv *hsr, struct hsr_node *node) { diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index 993fa950d814..b04948659d84 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -44,8 +44,7 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, u16 sequence_nr); -int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, - u16 sequence_nr); +int hsr_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame); void hsr_prune_nodes(struct timer_list *t); void hsr_prune_proxy_nodes(struct timer_list *t); @@ -73,6 +72,8 @@ void prp_update_san_info(struct hsr_node *node, bool is_sup); bool hsr_is_node_in_db(struct list_head *node_db, const unsigned char addr[ETH_ALEN]); +int prp_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame); + struct hsr_node { struct list_head mac_list; /* Protect R/W access to seq_out */ @@ -89,6 +90,9 @@ struct hsr_node { bool san_b; u16 seq_out[HSR_PT_PORTS]; bool removed; + /* PRP specific duplicate handling */ + u16 seq_expected[HSR_PT_PORTS]; + u16 seq_start[HSR_PT_PORTS]; struct rcu_head rcu_head; }; diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index d7ae32473c41..192893c3f2ec 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -78,6 +78,15 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, eth_hw_addr_set(master->dev, dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, master->dev); + + if (hsr->prot_version == PRP_V1) { + port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B); + if (port) { + eth_hw_addr_set(port->dev, dev->dev_addr); + call_netdevice_notifiers(NETDEV_CHANGEADDR, + port->dev); + } + } } /* Make sure we recognize frames from ourselves in hsr_rcv() */ diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index fcfeb79bb040..135ec5fce019 100644 --- a/net/hsr/hsr_main.h +++ 
b/net/hsr/hsr_main.h @@ -121,15 +121,6 @@ struct hsrv1_ethhdr_sp { struct hsr_sup_tag hsr_sup; } __packed; -enum hsr_port_type { - HSR_PT_NONE = 0, /* Must be 0, used by framereg */ - HSR_PT_SLAVE_A, - HSR_PT_SLAVE_B, - HSR_PT_INTERLINK, - HSR_PT_MASTER, - HSR_PT_PORTS, /* This must be the last item in the enum */ -}; - /* PRP Redundancy Control Trailer (RCT). * As defined in IEC-62439-4:2012, the PRP RCT is really { sequence Nr, * Lan identifier (LanId), LSDU_size and PRP_suffix = 0x88FB }. @@ -163,6 +154,8 @@ struct hsr_port { struct net_device *dev; struct hsr_priv *hsr; enum hsr_port_type type; + struct rcu_head rcu; + unsigned char original_macaddress[ETH_ALEN]; }; struct hsr_frame_info; @@ -183,6 +176,8 @@ struct hsr_proto_ops { struct hsr_frame_info *frame); bool (*invalid_dan_ingress_frame)(__be16 protocol); void (*update_san_info)(struct hsr_node *node, bool is_sup); + int (*register_frame_out)(struct hsr_port *port, + struct hsr_frame_info *frame); }; struct hsr_self_node { diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index b68f2f71d0e1..b120470246cc 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -29,10 +29,12 @@ static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = { /* Here, it seems a netdevice has already been allocated for us, and the * hsr_dev_setup routine has been executed. Nice! */ -static int hsr_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int hsr_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); + struct nlattr **data = params->data; enum hsr_version proto_version; unsigned char multicast_spec; u8 proto = HSR_PROTOCOL_HSR; @@ -46,7 +48,7 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, NL_SET_ERR_MSG_MOD(extack, "Slave1 device not specified"); return -EINVAL; } - link[0] = __dev_get_by_index(src_net, + link[0] = __dev_get_by_index(link_net, nla_get_u32(data[IFLA_HSR_SLAVE1])); if (!link[0]) { NL_SET_ERR_MSG_MOD(extack, "Slave1 does not exist"); @@ -56,7 +58,7 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, NL_SET_ERR_MSG_MOD(extack, "Slave2 device not specified"); return -EINVAL; } - link[1] = __dev_get_by_index(src_net, + link[1] = __dev_get_by_index(link_net, nla_get_u32(data[IFLA_HSR_SLAVE2])); if (!link[1]) { NL_SET_ERR_MSG_MOD(extack, "Slave2 does not exist"); @@ -69,7 +71,7 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, } if (data[IFLA_HSR_INTERLINK]) - interlink = __dev_get_by_index(src_net, + interlink = __dev_get_by_index(link_net, nla_get_u32(data[IFLA_HSR_INTERLINK])); if (interlink && interlink == link[0]) { diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 464f683e016d..b87b6a6fe070 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -196,6 +196,7 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, port->hsr = hsr; port->dev = dev; port->type = type; + ether_addr_copy(port->original_macaddress, dev->dev_addr); if (type != HSR_PT_MASTER) { res = hsr_portdev_setup(hsr, dev, port, extack); @@ -204,7 +205,6 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, } list_add_tail_rcu(&port->port_list, &hsr->ports); - synchronize_rcu(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); netdev_update_features(master->dev); @@ -233,9 +233,8 @@ void hsr_del_port(struct hsr_port *port) if (!port->hsr->fwd_offloaded) dev_set_promiscuity(port->dev,
-1); netdev_upper_dev_unlink(port->dev, master->dev); + eth_hw_addr_set(port->dev, port->original_macaddress); } - synchronize_rcu(); - - kfree(port); + kfree_rcu(port, rcu); } diff --git a/net/hsr/prp_dup_discard_test.c b/net/hsr/prp_dup_discard_test.c new file mode 100644 index 000000000000..e86b7b633ae8 --- /dev/null +++ b/net/hsr/prp_dup_discard_test.c @@ -0,0 +1,212 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <kunit/test.h> + +#include "hsr_main.h" +#include "hsr_framereg.h" + +struct prp_test_data { + struct hsr_port port; + struct hsr_port port_rcv; + struct hsr_frame_info frame; + struct hsr_node node; +}; + +static struct prp_test_data *build_prp_test_data(struct kunit *test) +{ + struct prp_test_data *data = kunit_kzalloc(test, + sizeof(struct prp_test_data), GFP_USER); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, data); + + data->frame.node_src = &data->node; + data->frame.port_rcv = &data->port_rcv; + data->port_rcv.type = HSR_PT_SLAVE_A; + data->node.seq_start[HSR_PT_SLAVE_A] = 1; + data->node.seq_expected[HSR_PT_SLAVE_A] = 1; + data->node.seq_start[HSR_PT_SLAVE_B] = 1; + data->node.seq_expected[HSR_PT_SLAVE_B] = 1; + data->node.seq_out[HSR_PT_MASTER] = 0; + data->node.time_out[HSR_PT_MASTER] = jiffies; + data->port.type = HSR_PT_MASTER; + + return data; +} + +static void check_prp_counters(struct kunit *test, + struct prp_test_data *data, + u16 seq_start_a, u16 seq_expected_a, + u16 seq_start_b, u16 seq_expected_b) +{ + KUNIT_EXPECT_EQ(test, data->node.seq_start[HSR_PT_SLAVE_A], + seq_start_a); + KUNIT_EXPECT_EQ(test, data->node.seq_start[HSR_PT_SLAVE_B], + seq_start_b); + KUNIT_EXPECT_EQ(test, data->node.seq_expected[HSR_PT_SLAVE_A], + seq_expected_a); + KUNIT_EXPECT_EQ(test, data->node.seq_expected[HSR_PT_SLAVE_B], + seq_expected_b); +} + +static void prp_dup_discard_forward(struct kunit *test) +{ + /* Normal situation, both LANs in sync. Next frame is forwarded */ + struct prp_test_data *data = build_prp_test_data(test); + + data->frame.sequence_nr = 2; + KUNIT_EXPECT_EQ(test, 0, + prp_register_frame_out(&data->port, &data->frame)); + KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, + data->node.seq_out[HSR_PT_MASTER]); + KUNIT_EXPECT_EQ(test, jiffies, data->node.time_out[HSR_PT_MASTER]); + check_prp_counters(test, data, data->frame.sequence_nr, + data->frame.sequence_nr + 1, 1, 1); +} + +static void prp_dup_discard_inside_dropwindow(struct kunit *test) +{ + /* Normal situation, other LAN ahead by one. 
Frame is dropped */ + struct prp_test_data *data = build_prp_test_data(test); + unsigned long time = jiffies - 10; + + data->frame.sequence_nr = 1; + data->node.seq_expected[HSR_PT_SLAVE_B] = 3; + data->node.seq_out[HSR_PT_MASTER] = 2; + data->node.time_out[HSR_PT_MASTER] = time; + + KUNIT_EXPECT_EQ(test, 1, + prp_register_frame_out(&data->port, &data->frame)); + KUNIT_EXPECT_EQ(test, 2, data->node.seq_out[HSR_PT_MASTER]); + KUNIT_EXPECT_EQ(test, time, data->node.time_out[HSR_PT_MASTER]); + check_prp_counters(test, data, 2, 2, 2, 3); +} + +static void prp_dup_discard_node_timeout(struct kunit *test) +{ + /* Timeout situation, node hasn't sent anything for a while */ + struct prp_test_data *data = build_prp_test_data(test); + + data->frame.sequence_nr = 7; + data->node.seq_start[HSR_PT_SLAVE_A] = 1234; + data->node.seq_expected[HSR_PT_SLAVE_A] = 1235; + data->node.seq_start[HSR_PT_SLAVE_B] = 1234; + data->node.seq_expected[HSR_PT_SLAVE_B] = 1234; + data->node.seq_out[HSR_PT_MASTER] = 1234; + data->node.time_out[HSR_PT_MASTER] = + jiffies - msecs_to_jiffies(HSR_ENTRY_FORGET_TIME) - 1; + + KUNIT_EXPECT_EQ(test, 0, + prp_register_frame_out(&data->port, &data->frame)); + KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, + data->node.seq_out[HSR_PT_MASTER]); + KUNIT_EXPECT_EQ(test, jiffies, data->node.time_out[HSR_PT_MASTER]); + check_prp_counters(test, data, data->frame.sequence_nr, + data->frame.sequence_nr + 1, 1234, 1234); +} + +static void prp_dup_discard_out_of_sequence(struct kunit *test) +{ + /* One frame is received out of sequence on both LANs */ + struct prp_test_data *data = build_prp_test_data(test); + + data->node.seq_start[HSR_PT_SLAVE_A] = 10; + data->node.seq_expected[HSR_PT_SLAVE_A] = 10; + data->node.seq_start[HSR_PT_SLAVE_B] = 10; + data->node.seq_expected[HSR_PT_SLAVE_B] = 10; + data->node.seq_out[HSR_PT_MASTER] = 9; + + /* 1st old frame, should be accepted */ + data->frame.sequence_nr = 8; + KUNIT_EXPECT_EQ(test, 0, + prp_register_frame_out(&data->port, &data->frame)); + KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, + data->node.seq_out[HSR_PT_MASTER]); + check_prp_counters(test, data, data->frame.sequence_nr, + data->frame.sequence_nr + 1, 10, 10); + + /* 2nd frame should be dropped */ + data->frame.sequence_nr = 8; + data->port_rcv.type = HSR_PT_SLAVE_B; + KUNIT_EXPECT_EQ(test, 1, + prp_register_frame_out(&data->port, &data->frame)); + check_prp_counters(test, data, data->frame.sequence_nr + 1, + data->frame.sequence_nr + 1, + data->frame.sequence_nr + 1, + data->frame.sequence_nr + 1); + + /* Next frame, this is forwarded */ + data->frame.sequence_nr = 10; + data->port_rcv.type = HSR_PT_SLAVE_A; + KUNIT_EXPECT_EQ(test, 0, + prp_register_frame_out(&data->port, &data->frame)); + KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, + data->node.seq_out[HSR_PT_MASTER]); + check_prp_counters(test, data, data->frame.sequence_nr, + data->frame.sequence_nr + 1, 9, 9); + + /* and next one is dropped */ + data->frame.sequence_nr = 10; + data->port_rcv.type = HSR_PT_SLAVE_B; + KUNIT_EXPECT_EQ(test, 1, + prp_register_frame_out(&data->port, &data->frame)); + check_prp_counters(test, data, data->frame.sequence_nr + 1, + data->frame.sequence_nr + 1, + data->frame.sequence_nr + 1, + data->frame.sequence_nr + 1); +} + +static void prp_dup_discard_lan_b_late(struct kunit *test) +{ + /* LAN B is behind */ + struct prp_test_data *data = build_prp_test_data(test); + + data->node.seq_start[HSR_PT_SLAVE_A] = 9; + data->node.seq_expected[HSR_PT_SLAVE_A] = 9; + data->node.seq_start[HSR_PT_SLAVE_B] = 9; 
+ data->node.seq_expected[HSR_PT_SLAVE_B] = 9; + data->node.seq_out[HSR_PT_MASTER] = 8; + + data->frame.sequence_nr = 9; + KUNIT_EXPECT_EQ(test, 0, + prp_register_frame_out(&data->port, &data->frame)); + KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, + data->node.seq_out[HSR_PT_MASTER]); + check_prp_counters(test, data, 9, 10, 9, 9); + + data->frame.sequence_nr = 10; + KUNIT_EXPECT_EQ(test, 0, + prp_register_frame_out(&data->port, &data->frame)); + KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, + data->node.seq_out[HSR_PT_MASTER]); + check_prp_counters(test, data, 9, 11, 9, 9); + + data->frame.sequence_nr = 9; + data->port_rcv.type = HSR_PT_SLAVE_B; + KUNIT_EXPECT_EQ(test, 1, + prp_register_frame_out(&data->port, &data->frame)); + check_prp_counters(test, data, 10, 11, 10, 10); + + data->frame.sequence_nr = 10; + data->port_rcv.type = HSR_PT_SLAVE_B; + KUNIT_EXPECT_EQ(test, 1, + prp_register_frame_out(&data->port, &data->frame)); + check_prp_counters(test, data, 11, 11, 11, 11); +} + +static struct kunit_case prp_dup_discard_test_cases[] = { + KUNIT_CASE(prp_dup_discard_forward), + KUNIT_CASE(prp_dup_discard_inside_dropwindow), + KUNIT_CASE(prp_dup_discard_node_timeout), + KUNIT_CASE(prp_dup_discard_out_of_sequence), + KUNIT_CASE(prp_dup_discard_lan_b_late), + {} +}; + +static struct kunit_suite prp_dup_discard_suite = { + .name = "prp_duplicate_discard", + .test_cases = prp_dup_discard_test_cases, +}; + +kunit_test_suite(prp_dup_discard_suite); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("KUnit tests for PRP duplicate discard"); +MODULE_AUTHOR("Jaakko Karrenpalo <jkarrenpalo@gmail.com>"); diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 175efd860f7b..018929563c6b 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -50,6 +50,7 @@ #include <linux/if_arp.h> #include <net/ipv6.h> +#include <net/netdev_lock.h> #include "6lowpan_i.h" @@ -116,7 +117,7 @@ static void lowpan_setup(struct net_device *ldev) ldev->netdev_ops = &lowpan_netdev_ops; ldev->header_ops = &lowpan_header_ops; ldev->needs_free_netdev = true; - ldev->netns_local = true; + ldev->netns_immutable = true; } static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[], @@ -129,10 +130,11 @@ static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[], return 0; } -static int lowpan_newlink(struct net *src_net, struct net_device *ldev, - struct nlattr *tb[], struct nlattr *data[], +static int lowpan_newlink(struct net_device *ldev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **tb = params->tb; struct net_device *wdev; int ret; @@ -142,6 +144,8 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev, if (!tb[IFLA_LINK]) return -EINVAL; + if (params->link_net && !net_eq(params->link_net, dev_net(ldev))) + return -EINVAL; /* find and hold wpan device */ wdev = dev_get_by_index(dev_net(ldev), nla_get_u32(tb[IFLA_LINK])); if (!wdev) diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 867d637d86f0..d4b983d17038 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -31,7 +31,8 @@ static const char lowpan_frags_cache_name[] = "lowpan-frags"; static struct inet_frags lowpan_frags; static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev, struct net_device *ldev); + struct sk_buff *prev, struct net_device *ldev, + int *refs); static void lowpan_frag_init(struct inet_frag_queue 
*q, const void *a) { @@ -45,6 +46,7 @@ static void lowpan_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); struct frag_queue *fq; + int refs = 1; fq = container_of(frag, struct frag_queue, q); @@ -53,10 +55,10 @@ static void lowpan_frag_expire(struct timer_list *t) if (fq->q.flags & INET_FRAG_COMPLETE) goto out; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, &refs); out: spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q); + inet_frag_putn(&fq->q, refs); } static inline struct lowpan_frag_queue * @@ -82,7 +84,8 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb, } static int lowpan_frag_queue(struct lowpan_frag_queue *fq, - struct sk_buff *skb, u8 frag_type) + struct sk_buff *skb, u8 frag_type, + int *refs) { struct sk_buff *prev_tail; struct net_device *ldev; @@ -143,7 +146,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq, unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - res = lowpan_frag_reasm(fq, skb, prev_tail, ldev); + res = lowpan_frag_reasm(fq, skb, prev_tail, ldev, refs); skb->_skb_refdst = orefdst; return res; } @@ -162,11 +165,12 @@ err: * the last and the first frames arrived and all the bits are here. */ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *ldev) + struct sk_buff *prev_tail, struct net_device *ldev, + int *refs) { void *reasm_data; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); if (!reasm_data) @@ -300,17 +304,20 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type) goto err; } + rcu_read_lock(); fq = fq_find(net, cb, &hdr.source, &hdr.dest); if (fq != NULL) { - int ret; + int ret, refs = 0; spin_lock(&fq->q.lock); - ret = lowpan_frag_queue(fq, skb, frag_type); + ret = lowpan_frag_queue(fq, skb, frag_type, &refs); spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q); + rcu_read_unlock(); + inet_frag_putn(&fq->q, refs); return ret; } + rcu_read_unlock(); err: kfree_skb(skb); diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index 88adb04e4072..89b671b12600 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -226,11 +226,11 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) { if (!wpan_dev->netdev) continue; - wpan_dev->netdev->netns_local = false; + wpan_dev->netdev->netns_immutable = false; err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); if (err) break; - wpan_dev->netdev->netns_local = true; + wpan_dev->netdev->netns_immutable = true; } if (err) { @@ -242,11 +242,11 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, list) { if (!wpan_dev->netdev) continue; - wpan_dev->netdev->netns_local = false; + wpan_dev->netdev->netns_immutable = false; err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); WARN_ON(err); - wpan_dev->netdev->netns_local = true; + wpan_dev->netdev->netns_immutable = true; } return err; @@ -291,7 +291,7 @@ static int cfg802154_netdev_notifier_call(struct notifier_block *nb, switch (state) { /* TODO NETDEV_DEVTYPE */ case NETDEV_REGISTER: - dev->netns_local = true; + dev->netns_immutable = true; wpan_dev->identifier = ++rdev->wpan_dev_id; list_add_rcu(&wpan_dev->list, &rdev->wpan_dev_list); rdev->devlist_generation++; diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index 359249ab77bf..4c07a475c567 100644 --- a/net/ieee802154/nl-phy.c +++ 
b/net/ieee802154/nl-phy.c @@ -224,10 +224,10 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info) dev_hold(dev); if (info->attrs[IEEE802154_ATTR_HW_ADDR]) { - struct sockaddr addr; + struct sockaddr_storage addr; - addr.sa_family = ARPHRD_IEEE802154; - nla_memcpy(&addr.sa_data, info->attrs[IEEE802154_ATTR_HW_ADDR], + addr.ss_family = ARPHRD_IEEE802154; + nla_memcpy(&addr.__data, info->attrs[IEEE802154_ATTR_HW_ADDR], IEEE802154_ADDR_LEN); /* strangely enough, some callbacks (inetdev_event) from diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 6d2c97f8e9ef..12850a277251 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -425,7 +425,7 @@ config INET_DIAG tristate "INET: socket monitoring interface" default y help - Support for INET (TCP, DCCP, etc) socket monitoring interface used by + Support for INET (TCP, UDP, etc) socket monitoring interface used by native Linux tools such as ss. ss is included in iproute2, currently downloadable at: diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8095e82de808..76e38092cd8a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -153,7 +153,7 @@ void inet_sock_destruct(struct sock *sk) WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc)); WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); WARN_ON_ONCE(sk->sk_wmem_queued); - WARN_ON_ONCE(sk_forward_alloc_get(sk)); + WARN_ON_ONCE(sk->sk_forward_alloc); kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); @@ -1309,8 +1309,6 @@ int inet_sk_rebuild_header(struct sock *sk) { struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0)); struct inet_sock *inet = inet_sk(sk); - __be32 daddr; - struct ip_options_rcu *inet_opt; struct flowi4 *fl4; int err; @@ -1319,17 +1317,9 @@ int inet_sk_rebuild_header(struct sock *sk) return 0; /* Reroute. */ - rcu_read_lock(); - inet_opt = rcu_dereference(inet->inet_opt); - daddr = inet->inet_daddr; - if (inet_opt && inet_opt->opt.srr) - daddr = inet_opt->opt.faddr; - rcu_read_unlock(); fl4 = &inet->cork.fl.u.ip4; - rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, - inet->inet_dport, inet->inet_sport, - sk->sk_protocol, ip_sock_rt_tos(sk), - sk->sk_bound_dev_if); + inet_sk_init_flowi4(inet, fl4); + rt = ip_route_output_flow(sock_net(sk), fl4, sk); if (!IS_ERR(rt)) { err = 0; sk_setup_caps(sk, &rt->dst); @@ -1338,10 +1328,7 @@ int inet_sk_rebuild_header(struct sock *sk) /* Routing failed... */ sk->sk_route_caps = 0; - /* - * Other protocols have to map its equivalent state to TCP_SYN_SENT. - * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme - */ + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) || sk->sk_state != TCP_SYN_SENT || (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index cb9a7ed8abd3..a648fff71ea7 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -659,10 +659,12 @@ static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb */ void arp_xmit(struct sk_buff *skb) { + rcu_read_lock(); /* Send it off, maybe filter it using firewalling first. 
*/ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, - dev_net(skb->dev), NULL, skb, NULL, skb->dev, + dev_net_rcu(skb->dev), NULL, skb, NULL, skb->dev, arp_xmit_finish); + rcu_read_unlock(); } EXPORT_SYMBOL(arp_xmit); @@ -1062,8 +1064,8 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; return 0; } - if (__in_dev_get_rtnl(dev)) { - IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on); + if (__in_dev_get_rtnl_net(dev)) { + IN_DEV_CONF_SET(__in_dev_get_rtnl_net(dev), PROXY_ARP, on); return 0; } return -ENXIO; @@ -1075,7 +1077,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (!dev && (r->arp_flags & ATF_COM)) { - dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family, + dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; @@ -1293,14 +1295,14 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) switch (cmd) { case SIOCDARP: - rtnl_lock(); + rtnl_net_lock(net); err = arp_req_delete(net, &r); - rtnl_unlock(); + rtnl_net_unlock(net); break; case SIOCSARP: - rtnl_lock(); + rtnl_net_lock(net); err = arp_req_set(net, &r); - rtnl_unlock(); + rtnl_net_unlock(net); break; case SIOCGARP: rcu_read_lock(); diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 554804774628..e01492234b0b 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -121,7 +121,7 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt) { /* bpf_tcp_ca prog cannot have NULL tp */ - __tcp_send_ack((struct sock *)tp, rcv_nxt); + __tcp_send_ack((struct sock *)tp, rcv_nxt, 0); return 0; } diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 4aca1f05edd3..4b5bc6eb52e7 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -102,8 +102,6 @@ EXPORT_SYMBOL(ip4_datagram_connect); void ip4_datagram_release_cb(struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); - const struct ip_options_rcu *inet_opt; - __be32 daddr = inet->inet_daddr; struct dst_entry *dst; struct flowi4 fl4; struct rtable *rt; @@ -115,14 +113,9 @@ void ip4_datagram_release_cb(struct sock *sk) rcu_read_unlock(); return; } - inet_opt = rcu_dereference(inet->inet_opt); - if (inet_opt && inet_opt->opt.srr) - daddr = inet_opt->opt.faddr; - rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr, - inet->inet_saddr, inet->inet_dport, - inet->inet_sport, sk->sk_protocol, - ip_sock_rt_tos(sk), sk->sk_bound_dev_if); + inet_sk_init_flowi4(inet, &fl4); + rt = ip_route_output_flow(sock_net(sk), &fl4, sk); dst = !IS_ERR(rt) ? 
&rt->dst : NULL; sk_dst_set(sk, dst); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c8b3cf5fba4c..c47d3828d4f6 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -46,6 +46,7 @@ #include <linux/notifier.h> #include <linux/inetdevice.h> #include <linux/igmp.h> +#include "igmp_internal.h" #include <linux/slab.h> #include <linux/hash.h> #ifdef CONFIG_SYSCTL @@ -107,15 +108,6 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_PROTO] = { .type = NLA_U8 }, }; -struct inet_fill_args { - u32 portid; - u32 seq; - int event; - unsigned int flags; - int netnsid; - int ifindex; -}; - #define IN4_ADDR_HSIZE_SHIFT 8 #define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) @@ -289,7 +281,7 @@ static struct in_device *inetdev_init(struct net_device *dev) if (!in_dev->arp_parms) goto out_kfree; if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) - dev_disable_lro(dev); + netif_disable_lro(dev); /* Reference in_dev->dev */ netdev_hold(dev, &in_dev->dev_tracker, GFP_KERNEL); /* Account for reference dev->ip_ptr (below) */ @@ -1371,10 +1363,11 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) __be32 addr = 0; unsigned char localnet_scope = RT_SCOPE_HOST; struct in_device *in_dev; - struct net *net = dev_net(dev); + struct net *net; int master_idx; rcu_read_lock(); + net = dev_net_rcu(dev); in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto no_in_dev; @@ -1799,12 +1792,12 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct ifaddrmsg *ifm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request"); return -EINVAL; @@ -1846,9 +1839,38 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, return 0; } -static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, - struct netlink_callback *cb, int *s_ip_idx, - struct inet_fill_args *fillargs) +static int in_dev_dump_ifmcaddr(struct in_device *in_dev, struct sk_buff *skb, + struct netlink_callback *cb, int *s_ip_idx, + struct inet_fill_args *fillargs) +{ + struct ip_mc_list *im; + int ip_idx = 0; + int err; + + for (im = rcu_dereference(in_dev->mc_list); + im; + im = rcu_dereference(im->next_rcu)) { + if (ip_idx < *s_ip_idx) { + ip_idx++; + continue; + } + err = inet_fill_ifmcaddr(skb, in_dev->dev, im, fillargs); + if (err < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + ip_idx++; + } + err = 0; + ip_idx = 0; +done: + *s_ip_idx = ip_idx; + return err; +} + +static int in_dev_dump_ifaddr(struct in_device *in_dev, struct sk_buff *skb, + struct netlink_callback *cb, int *s_ip_idx, + struct inet_fill_args *fillargs) { struct in_ifaddr *ifa; int ip_idx = 0; @@ -1874,6 +1896,21 @@ done: return err; } +static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, + struct netlink_callback *cb, int *s_ip_idx, + struct inet_fill_args *fillargs) +{ + switch (fillargs->event) { + case RTM_NEWADDR: + return in_dev_dump_ifaddr(in_dev, skb, cb, s_ip_idx, fillargs); + case RTM_GETMULTICAST: + return in_dev_dump_ifmcaddr(in_dev, skb, cb, s_ip_idx, + fillargs); + default: + return -EINVAL; + } +} + /* Combine dev_addr_genid and dev_base_seq to detect changes. 
*/ static u32 inet_base_seq(const struct net *net) @@ -1889,13 +1926,14 @@ static u32 inet_base_seq(const struct net *net) return res; } -static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +static int inet_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, + int event) { const struct nlmsghdr *nlh = cb->nlh; struct inet_fill_args fillargs = { .portid = NETLINK_CB(cb->skb).portid, .seq = nlh->nlmsg_seq, - .event = RTM_NEWADDR, + .event = event, .flags = NLM_F_MULTI, .netnsid = -1, }; @@ -1949,6 +1987,16 @@ done: return err; } +static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + return inet_dump_addr(skb, cb, RTM_NEWADDR); +} + +static int inet_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + return inet_dump_addr(skb, cb, RTM_GETMULTICAST); +} + static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid) { @@ -2845,6 +2893,8 @@ static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_GETNETCONF, .doit = inet_netconf_get_devconf, .dumpit = inet_netconf_dump_devconf, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET, .msgtype = RTM_GETMULTICAST, + .dumpit = inet_dump_ifmcaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED}, }; void __init devinet_init(void) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index f3281312eb5e..f14a41ee4aa1 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -120,47 +120,16 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) } #ifdef CONFIG_INET_ESPINTCP -struct esp_tcp_sk { - struct sock *sk; - struct rcu_head rcu; -}; - -static void esp_free_tcp_sk(struct rcu_head *head) -{ - struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); - - sock_put(esk->sk); - kfree(esk); -} - static struct sock *esp_find_tcp_sk(struct xfrm_state *x) { struct xfrm_encap_tmpl *encap = x->encap; struct net *net = xs_net(x); - struct esp_tcp_sk *esk; __be16 sport, dport; - struct sock *nsk; struct sock *sk; - sk = rcu_dereference(x->encap_sk); - if (sk && sk->sk_state == TCP_ESTABLISHED) - return sk; - spin_lock_bh(&x->lock); sport = encap->encap_sport; dport = encap->encap_dport; - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (sk && sk == nsk) { - esk = kmalloc(sizeof(*esk), GFP_ATOMIC); - if (!esk) { - spin_unlock_bh(&x->lock); - return ERR_PTR(-ENOMEM); - } - RCU_INIT_POINTER(x->encap_sk, NULL); - esk->sk = sk; - call_rcu(&esk->rcu, esp_free_tcp_sk); - } spin_unlock_bh(&x->lock); sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4, @@ -173,20 +142,6 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x) return ERR_PTR(-EINVAL); } - spin_lock_bh(&x->lock); - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (encap->encap_sport != sport || - encap->encap_dport != dport) { - sock_put(sk); - sk = nsk ?: ERR_PTR(-EREMCHG); - } else if (sk == nsk) { - sock_put(sk); - } else { - rcu_assign_pointer(x->encap_sk, sk); - } - spin_unlock_bh(&x->lock); - return sk; } @@ -199,8 +154,10 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) sk = esp_find_tcp_sk(x); err = PTR_ERR_OR_ZERO(sk); - if (err) + if (err) { + kfree_skb(skb); goto out; + } bh_lock_sock(sk); if (sock_owned_by_user(sk)) @@ -209,6 +166,8 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) err = espintcp_push_skb(sk, 
skb); bh_unlock_sock(sk); + sock_put(sk); + out: rcu_read_unlock(); return err; @@ -279,7 +238,7 @@ static void esp_output_done(void *data, int err) x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) esp_output_tail_tcp(x, skb); else - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb_to_full_sk(skb), skb, err); } } @@ -392,6 +351,8 @@ static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x, if (IS_ERR(sk)) return ERR_CAST(sk); + sock_put(sk); + *lenp = htons(len); esph = (struct ip_esp_hdr *)(lenp + 1); @@ -816,7 +777,8 @@ int esp_input_done2(struct sk_buff *skb, int err) } skb_pull_rcsum(skb, hlen); - if (x->props.mode == XFRM_MODE_TUNNEL) + if (x->props.mode == XFRM_MODE_TUNNEL || + x->props.mode == XFRM_MODE_IPTFS) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -ihl); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 272e42d81323..fd1e1507a224 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -553,18 +553,16 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, const struct in_ifaddr *ifa; struct in_device *in_dev; - in_dev = __in_dev_get_rtnl(dev); + in_dev = __in_dev_get_rtnl_net(dev); if (!in_dev) return -ENODEV; *colon = ':'; - rcu_read_lock(); - in_dev_for_each_ifa_rcu(ifa, in_dev) { + in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) { if (strcmp(ifa->ifa_label, devname) == 0) break; } - rcu_read_unlock(); if (!ifa) return -ENODEV; @@ -635,7 +633,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; - rtnl_lock(); + rtnl_net_lock(net); err = rtentry_to_fib_config(net, cmd, rt, &cfg); if (err == 0) { struct fib_table *tb; @@ -659,7 +657,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) /* allocated by rtentry_to_fib_config() */ kfree(cfg.fc_mx); } - rtnl_unlock(); + rtnl_net_unlock(net); return err; } return -EINVAL; @@ -837,19 +835,33 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, } } + if (cfg->fc_dst_len > 32) { + NL_SET_ERR_MSG(extack, "Invalid prefix length"); + err = -EINVAL; + goto errout; + } + + if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) { + NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length"); + err = -EINVAL; + goto errout; + } + if (cfg->fc_nh_id) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_encap || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); - return -EINVAL; + err = -EINVAL; + goto errout; } } if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); - return -EINVAL; + err = -EINVAL; + goto errout; } if (!cfg->fc_table) @@ -872,20 +884,24 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + rtnl_net_lock(net); + if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); err = -EINVAL; - goto errout; + goto unlock; } tb = fib_get_table(net, cfg.fc_table); if (!tb) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); err = -ESRCH; - goto errout; + goto unlock; } err = fib_table_delete(net, tb, &cfg, extack); +unlock: + rtnl_net_unlock(net); errout: return err; } @@ -902,15 +918,20 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + rtnl_net_lock(net); + tb = fib_new_table(net, cfg.fc_table); if (!tb) { err = -ENOBUFS; 
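
An aside on the validation that rtm_to_fib_config() gains above: it is the old fib_valid_key_len() test from fib_trie.c, hoisted so that both RTM_NEWROUTE and RTM_DELROUTE reject inconsistent prefixes before any table lookup (the trie-side copy is deleted further down in this series). Here is a minimal userspace sketch of the same rule; prefix_is_valid() is an illustrative name, not a kernel helper:

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

/* A dst/len pair is valid only when len <= 32 and no host bits are set
 * below the prefix. Left-shifting the host-order address by len
 * isolates exactly the host bits, so any nonzero result is an error. */
static int prefix_is_valid(uint32_t dst_be, uint8_t len)
{
	if (len > 32)
		return 0;
	if (len == 32)
		return 1;
	return (ntohl(dst_be) << len) == 0;
}

int main(void)
{
	printf("%d\n", prefix_is_valid(htonl(0xc0a80100), 24)); /* 192.168.1.0/24 -> 1 */
	printf("%d\n", prefix_is_valid(htonl(0xc0a80101), 24)); /* 192.168.1.1/24 -> 0 */
	return 0;
}

Handling len == 32 separately matters because shifting a 32-bit value by 32 is undefined behaviour in C, which is exactly why the kernel check is gated on fc_dst_len < 32.
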
- goto errout; + goto unlock; } err = fib_table_insert(net, tb, &cfg, extack); if (!err && cfg.fc_type == RTN_LOCAL) net->ipv4.fib_has_custom_local_routes = true; + +unlock: + rtnl_net_unlock(net); errout: return err; } @@ -927,12 +948,12 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, if (filter->rtnl_held) ASSERT_RTNL(); - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request"); return -EINVAL; } - rtm = nlmsg_data(nlh); if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_scope) { NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request"); @@ -1450,7 +1471,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); - rt_cache_flush(dev_net(dev)); + rt_cache_flush(net); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); @@ -1461,7 +1482,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, */ fib_disable_ip(dev, event, true); } else { - rt_cache_flush(dev_net(dev)); + rt_cache_flush(net); } break; } @@ -1575,7 +1596,7 @@ static void ip_fib_net_exit(struct net *net) { int i; - ASSERT_RTNL(); + ASSERT_RTNL_NET(net); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); @@ -1615,9 +1636,15 @@ static int __net_init fib_net_init(struct net *net) error = ip_fib_net_init(net); if (error < 0) goto out; + + error = fib4_semantics_init(net); + if (error) + goto out_semantics; + error = nl_fib_lookup_init(net); if (error < 0) goto out_nlfl; + error = fib_proc_init(net); if (error < 0) goto out_proc; @@ -1627,9 +1654,11 @@ out: out_proc: nl_fib_lookup_exit(net); out_nlfl: - rtnl_lock(); + fib4_semantics_exit(net); +out_semantics: + rtnl_net_lock(net); ip_fib_net_exit(net); - rtnl_unlock(); + rtnl_net_unlock(net); goto out; } @@ -1644,10 +1673,15 @@ static void __net_exit fib_net_exit_batch(struct list_head *net_list) struct net *net; rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) + list_for_each_entry(net, net_list, exit_list) { + __rtnl_net_lock(net); ip_fib_net_exit(net); - + __rtnl_net_unlock(net); + } rtnl_unlock(); + + list_for_each_entry(net, net_list, exit_list) + fib4_semantics_exit(net); } static struct pernet_operations fib_net_ops = { @@ -1658,9 +1692,9 @@ static struct pernet_operations fib_net_ops = { static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_NEWROUTE, - .doit = inet_rtm_newroute}, + .doit = inet_rtm_newroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_DELROUTE, - .doit = inet_rtm_delroute}, + .doit = inet_rtm_delroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, }; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 8325224ef072..fa58d6620ed6 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -37,6 +37,7 @@ struct fib4_rule { u8 dst_len; u8 src_len; dscp_t dscp; + dscp_t dscp_mask; u8 dscp_full:1; /* DSCP or TOS selector */ __be32 src; __be32 srcmask; @@ -192,7 +193,8 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, * to mask the upper three DSCP bits prior to matching to maintain * legacy behavior. 
*/ - if (r->dscp_full && r->dscp != inet_dsfield_to_dscp(fl4->flowi4_tos)) + if (r->dscp_full && + (r->dscp ^ inet_dsfield_to_dscp(fl4->flowi4_tos)) & r->dscp_mask) return 0; else if (!r->dscp_full && r->dscp && !fib_dscp_masked_match(r->dscp, fl4)) @@ -201,12 +203,12 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto)) return 0; - if (fib_rule_port_range_set(&rule->sport_range) && - !fib_rule_port_inrange(&rule->sport_range, fl4->fl4_sport)) + if (!fib_rule_port_match(&rule->sport_range, rule->sport_mask, + fl4->fl4_sport)) return 0; - if (fib_rule_port_range_set(&rule->dport_range) && - !fib_rule_port_inrange(&rule->dport_range, fl4->fl4_dport)) + if (!fib_rule_port_match(&rule->dport_range, rule->dport_mask, + fl4->fl4_dport)) return 0; return 1; @@ -235,19 +237,49 @@ static int fib4_nl2rule_dscp(const struct nlattr *nla, struct fib4_rule *rule4, } rule4->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); + rule4->dscp_mask = inet_dsfield_to_dscp(INET_DSCP_MASK); rule4->dscp_full = true; return 0; } +static int fib4_nl2rule_dscp_mask(const struct nlattr *nla, + struct fib4_rule *rule4, + struct netlink_ext_ack *extack) +{ + dscp_t dscp_mask; + + if (!rule4->dscp_full) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "Cannot specify DSCP mask without DSCP value"); + return -EINVAL; + } + + dscp_mask = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); + if (rule4->dscp & ~dscp_mask) { + NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid DSCP mask"); + return -EINVAL; + } + + rule4->dscp_mask = dscp_mask; + + return 0; +} + static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { - struct net *net = sock_net(skb->sk); + struct fib4_rule *rule4 = (struct fib4_rule *)rule; + struct net *net = rule->fr_net; int err = -EINVAL; - struct fib4_rule *rule4 = (struct fib4_rule *) rule; + + if (tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) { + NL_SET_ERR_MSG(extack, + "Flow label cannot be specified for IPv4 FIB rules"); + goto errout; + } if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, @@ -265,6 +297,10 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, fib4_nl2rule_dscp(tb[FRA_DSCP], rule4, extack) < 0) goto errout; + if (tb[FRA_DSCP_MASK] && + fib4_nl2rule_dscp_mask(tb[FRA_DSCP_MASK], rule4, extack) < 0) + goto errout; + /* split local/main if they are not already split */ err = fib_unmerge(net); if (err) @@ -360,6 +396,14 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, return 0; } + if (tb[FRA_DSCP_MASK]) { + dscp_t dscp_mask; + + dscp_mask = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP_MASK]) << 2); + if (!rule4->dscp_full || rule4->dscp_mask != dscp_mask) + return 0; + } + #ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) return 0; @@ -385,7 +429,9 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, if (rule4->dscp_full) { frh->tos = 0; if (nla_put_u8(skb, FRA_DSCP, - inet_dscp_to_dsfield(rule4->dscp) >> 2)) + inet_dscp_to_dsfield(rule4->dscp) >> 2) || + nla_put_u8(skb, FRA_DSCP_MASK, + inet_dscp_to_dsfield(rule4->dscp_mask) >> 2)) goto nla_put_failure; } else { frh->tos = inet_dscp_to_dsfield(rule4->dscp); @@ -412,7 +458,8 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) return nla_total_size(4) /* dst */ + nla_total_size(4) /* src */ + nla_total_size(4) /* flow */ - + nla_total_size(1); /* 
dscp */ + + nla_total_size(1) /* dscp */ + + nla_total_size(1); /* dscp mask */ } static void fib4_rule_flush_cache(struct fib_rules_ops *ops) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d2cee5c314f5..d643bd1a0d9d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -50,12 +50,6 @@ #include "fib_lookup.h" -static struct hlist_head *fib_info_hash; -static struct hlist_head *fib_info_laddrhash; -static unsigned int fib_info_hash_size; -static unsigned int fib_info_hash_bits; -static unsigned int fib_info_cnt; - /* for_nexthops and change_nexthops only used when nexthop object * is not set in a fib_info. The logic within can reference fib_nh. */ @@ -258,8 +252,7 @@ void fib_release_info(struct fib_info *fi) ASSERT_RTNL(); if (fi && refcount_dec_and_test(&fi->fib_treeref)) { hlist_del(&fi->fib_hash); - - fib_info_cnt--; + fi->fib_net->ipv4.fib_info_cnt--; if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); @@ -335,11 +328,12 @@ static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, static unsigned int fib_info_hashfn_result(const struct net *net, unsigned int val) { - return hash_32(val ^ net_hash_mix(net), fib_info_hash_bits); + return hash_32(val ^ net_hash_mix(net), net->ipv4.fib_info_hash_bits); } -static inline unsigned int fib_info_hashfn(struct fib_info *fi) +static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi) { + struct net *net = fi->fib_net; unsigned int val; val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol, @@ -354,7 +348,70 @@ static inline unsigned int fib_info_hashfn(struct fib_info *fi) } endfor_nexthops(fi) } - return fib_info_hashfn_result(fi->fib_net, val); + return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)]; +} + +static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net, + __be32 val) +{ + unsigned int hash_bits = net->ipv4.fib_info_hash_bits; + u32 slot; + + slot = hash_32(net_hash_mix(net) ^ (__force u32)val, hash_bits); + + return &net->ipv4.fib_info_hash[(1 << hash_bits) + slot]; +} + +static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits) +{ + /* The second half is used for prefsrc */ + return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head), + GFP_KERNEL); +} + +static void fib_info_hash_free(struct hlist_head *head) +{ + kvfree(head); +} + +static void fib_info_hash_grow(struct net *net) +{ + unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits; + struct hlist_head *new_info_hash, *old_info_hash; + unsigned int i; + + if (net->ipv4.fib_info_cnt < old_size) + return; + + new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1); + if (!new_info_hash) + return; + + old_info_hash = net->ipv4.fib_info_hash; + net->ipv4.fib_info_hash = new_info_hash; + net->ipv4.fib_info_hash_bits += 1; + + for (i = 0; i < old_size; i++) { + struct hlist_head *head = &old_info_hash[i]; + struct hlist_node *n; + struct fib_info *fi; + + hlist_for_each_entry_safe(fi, n, head, fib_hash) + hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); + } + + for (i = 0; i < old_size; i++) { + struct hlist_head *lhead = &old_info_hash[old_size + i]; + struct hlist_node *n; + struct fib_info *fi; + + hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) + hlist_add_head(&fi->fib_lhash, + fib_info_laddrhash_bucket(fi->fib_net, + fi->fib_prefsrc)); + } + + fib_info_hash_free(old_info_hash); } /* no metrics, only nexthop id */ @@ -370,13 +427,12 @@ static struct fib_info *fib_find_info_nh(struct net *net, (__force u32)cfg->fc_prefsrc, cfg->fc_priority); 
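
The fib_semantics.c conversion above replaces the global fib_info_hash/fib_info_laddrhash pair with one per-netns array whose upper half holds the prefsrc (local address) buckets, so a resize rehashes both tables in a single pass. A rough standalone model of that split-in-half layout follows; table_alloc(), main_bucket() and laddr_bucket() are invented names for illustration:

#include <stdlib.h>

struct hlist_node { struct hlist_node *next, **pprev; };
struct hlist_head { struct hlist_node *first; };

/* One allocation, two logical tables: buckets [0, 2^bits) index whole
 * fib_info entries, buckets [2^bits, 2 * 2^bits) index them again by
 * preferred source address. */
static struct hlist_head *table_alloc(unsigned int bits)
{
	return calloc((1UL << bits) * 2, sizeof(struct hlist_head));
}

static struct hlist_head *main_bucket(struct hlist_head *tbl,
				      unsigned int bits, unsigned int hash)
{
	return &tbl[hash & ((1U << bits) - 1)];
}

static struct hlist_head *laddr_bucket(struct hlist_head *tbl,
				       unsigned int bits, unsigned int hash)
{
	return &tbl[(1U << bits) + (hash & ((1U << bits) - 1))];
}

int main(void)
{
	unsigned int bits = 4;
	struct hlist_head *tbl = table_alloc(bits);

	(void)main_bucket(tbl, bits, 3);  /* fib_info region */
	(void)laddr_bucket(tbl, bits, 3); /* prefsrc region */
	free(tbl);
	return 0;
}

Keeping both regions in one allocation is what lets fib_info_hash_grow() move every entry and then free exactly one old table.
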
hash = fib_info_hashfn_result(net, hash); - head = &fib_info_hash[hash]; + head = &net->ipv4.fib_info_hash[hash]; hlist_for_each_entry(fi, head, fib_hash) { - if (!net_eq(fi->fib_net, net)) - continue; if (!fi->nh || fi->nh->id != cfg->fc_nh_id) continue; + if (cfg->fc_protocol == fi->fib_protocol && cfg->fc_scope == fi->fib_scope && cfg->fc_prefsrc == fi->fib_prefsrc && @@ -392,18 +448,13 @@ static struct fib_info *fib_find_info_nh(struct net *net, static struct fib_info *fib_find_info(struct fib_info *nfi) { - struct hlist_head *head; + struct hlist_head *head = fib_info_hash_bucket(nfi); struct fib_info *fi; - unsigned int hash; - - hash = fib_info_hashfn(nfi); - head = &fib_info_hash[hash]; hlist_for_each_entry(fi, head, fib_hash) { - if (!net_eq(fi->fib_net, nfi->fib_net)) - continue; if (fi->fib_nhs != nfi->fib_nhs) continue; + if (nfi->fib_protocol == fi->fib_protocol && nfi->fib_scope == fi->fib_scope && nfi->fib_prefsrc == fi->fib_prefsrc && @@ -1239,64 +1290,6 @@ int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, return err; } -static struct hlist_head * -fib_info_laddrhash_bucket(const struct net *net, __be32 val) -{ - u32 slot = hash_32(net_hash_mix(net) ^ (__force u32)val, - fib_info_hash_bits); - - return &fib_info_laddrhash[slot]; -} - -static void fib_info_hash_move(struct hlist_head *new_info_hash, - struct hlist_head *new_laddrhash, - unsigned int new_size) -{ - struct hlist_head *old_info_hash, *old_laddrhash; - unsigned int old_size = fib_info_hash_size; - unsigned int i; - - ASSERT_RTNL(); - old_info_hash = fib_info_hash; - old_laddrhash = fib_info_laddrhash; - fib_info_hash_size = new_size; - fib_info_hash_bits = ilog2(new_size); - - for (i = 0; i < old_size; i++) { - struct hlist_head *head = &fib_info_hash[i]; - struct hlist_node *n; - struct fib_info *fi; - - hlist_for_each_entry_safe(fi, n, head, fib_hash) { - struct hlist_head *dest; - unsigned int new_hash; - - new_hash = fib_info_hashfn(fi); - dest = &new_info_hash[new_hash]; - hlist_add_head(&fi->fib_hash, dest); - } - } - fib_info_hash = new_info_hash; - - fib_info_laddrhash = new_laddrhash; - for (i = 0; i < old_size; i++) { - struct hlist_head *lhead = &old_laddrhash[i]; - struct hlist_node *n; - struct fib_info *fi; - - hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) { - struct hlist_head *ldest; - - ldest = fib_info_laddrhash_bucket(fi->fib_net, - fi->fib_prefsrc); - hlist_add_head(&fi->fib_lhash, ldest); - } - } - - kvfree(old_info_hash); - kvfree(old_laddrhash); -} - __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, unsigned char scope) { @@ -1409,32 +1402,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg, } #endif - err = -ENOBUFS; - - if (fib_info_cnt >= fib_info_hash_size) { - unsigned int new_size = fib_info_hash_size << 1; - struct hlist_head *new_info_hash; - struct hlist_head *new_laddrhash; - size_t bytes; - - if (!new_size) - new_size = 16; - bytes = (size_t)new_size * sizeof(struct hlist_head *); - new_info_hash = kvzalloc(bytes, GFP_KERNEL); - new_laddrhash = kvzalloc(bytes, GFP_KERNEL); - if (!new_info_hash || !new_laddrhash) { - kvfree(new_info_hash); - kvfree(new_laddrhash); - } else { - fib_info_hash_move(new_info_hash, new_laddrhash, new_size); - } - if (!fib_info_hash_size) - goto failure; - } + fib_info_hash_grow(net); fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); - if (!fi) + if (!fi) { + err = -ENOBUFS; goto failure; + } + fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if 
(IS_ERR(fi->fib_metrics)) { err = PTR_ERR(fi->fib_metrics); @@ -1571,9 +1546,9 @@ link_it: refcount_set(&fi->fib_treeref, 1); refcount_set(&fi->fib_clntref, 1); - fib_info_cnt++; - hlist_add_head(&fi->fib_hash, - &fib_info_hash[fib_info_hashfn(fi)]); + net->ipv4.fib_info_cnt++; + hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); + if (fi->fib_prefsrc) { struct hlist_head *head; @@ -1855,7 +1830,7 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local) struct fib_info *fi; int ret = 0; - if (!fib_info_laddrhash || local == 0) + if (!local) return 0; head = fib_info_laddrhash_bucket(net, local); @@ -2193,34 +2168,52 @@ static bool fib_good_nh(const struct fib_nh *nh) return !!(state & NUD_VALID); } -void fib_select_multipath(struct fib_result *res, int hash) +void fib_select_multipath(struct fib_result *res, int hash, + const struct flowi4 *fl4) { struct fib_info *fi = res->fi; struct net *net = fi->fib_net; - bool first = false; + bool found = false; + bool use_neigh; + __be32 saddr; if (unlikely(res->fi->nh)) { nexthop_path_fib_result(res, hash); return; } + use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh); + saddr = fl4 ? fl4->saddr : 0; + change_nexthops(fi) { - if (READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh)) { - if (!fib_good_nh(nexthop_nh)) - continue; - if (!first) { - res->nh_sel = nhsel; - res->nhc = &nexthop_nh->nh_common; - first = true; - } + int nh_upper_bound; + + /* Nexthops without a carrier are assigned an upper bound of + * minus one when "ignore_routes_with_linkdown" is set. + */ + nh_upper_bound = atomic_read(&nexthop_nh->fib_nh_upper_bound); + if (nh_upper_bound == -1 || + (use_neigh && !fib_good_nh(nexthop_nh))) + continue; + + if (!found) { + res->nh_sel = nhsel; + res->nhc = &nexthop_nh->nh_common; + found = !saddr || nexthop_nh->nh_saddr == saddr; } - if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound)) + if (hash > nh_upper_bound) continue; - res->nh_sel = nhsel; - res->nhc = &nexthop_nh->nh_common; - return; + if (!saddr || nexthop_nh->nh_saddr == saddr) { + res->nh_sel = nhsel; + res->nhc = &nexthop_nh->nh_common; + return; + } + + if (found) + return; + } endfor_nexthops(fi); } #endif @@ -2235,7 +2228,7 @@ void fib_select_path(struct net *net, struct fib_result *res, if (fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(net, fl4, skb, NULL); - fib_select_multipath(res, h); + fib_select_multipath(res, h, fl4); } else #endif @@ -2257,3 +2250,22 @@ check_saddr: fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK); } } + +int __net_init fib4_semantics_init(struct net *net) +{ + unsigned int hash_bits = 4; + + net->ipv4.fib_info_hash = fib_info_hash_alloc(hash_bits); + if (!net->ipv4.fib_info_hash) + return -ENOMEM; + + net->ipv4.fib_info_hash_bits = hash_bits; + net->ipv4.fib_info_cnt = 0; + + return 0; +} + +void __net_exit fib4_semantics_exit(struct net *net) +{ + fib_info_hash_free(net->ipv4.fib_info_hash); +} diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 161f5526b86c..59a6f0a9638f 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1187,22 +1187,6 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, return 0; } -static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) -{ - if (plen > KEYLENGTH) { - NL_SET_ERR_MSG(extack, "Invalid prefix length"); - return false; - } - - if ((plen < KEYLENGTH) && (key << plen)) { - NL_SET_ERR_MSG(extack, - "Invalid prefix for given prefix length"); - return false; - } - - return true; -} - static void 
fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old); @@ -1223,9 +1207,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); fi = fib_create_info(cfg, extack); @@ -1717,9 +1698,6 @@ int fib_table_delete(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; @@ -2999,7 +2977,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%s\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u", + "%u\t%08X\t%d\t%u\t%u", nhc->nhc_dev ? nhc->nhc_dev->name : "*", prefix, gw, flags, 0, 0, fi->fib_priority, @@ -3011,7 +2989,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) } else { seq_printf(seq, "*\t%08X\t%08X\t%04X\t%d\t%u\t" - "%d\t%08X\t%d\t%u\t%u", + "%u\t%08X\t%d\t%u\t%u", prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0); } diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 6701a98d9a9f..dafd68f3436a 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -199,7 +199,7 @@ static const struct net_protocol net_gre_protocol = { static int __init gre_init(void) { - pr_info("GRE over IPv4 demultiplexor driver\n"); + pr_info("GRE over IPv4 demultiplexer driver\n"); if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { pr_err("can't add protocol\n"); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 963a89ae9c26..717cb7d3607a 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -312,7 +312,6 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, struct dst_entry *dst = &rt->dst; struct inet_peer *peer; bool rc = true; - int vif; if (!apply_ratelimit) return true; @@ -321,12 +320,12 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) goto out; - vif = l3mdev_master_ifindex(dst->dev); - peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); + rcu_read_lock(); + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, + l3mdev_master_ifindex_rcu(dst->dev)); rc = inet_peer_xrlim_allow(peer, READ_ONCE(net->ipv4.sysctl_icmp_ratelimit)); - if (peer) - inet_putpeer(peer); + rcu_read_unlock(); out: if (!rc) __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); @@ -400,13 +399,12 @@ static void icmp_push_reply(struct sock *sk, static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { - struct ipcm_cookie ipc; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->dst.dev); + struct net *net = dev_net_rcu(rt->dst.dev); bool apply_ratelimit = false; + struct ipcm_cookie ipc; struct flowi4 fl4; struct sock *sk; - struct inet_sock *inet; __be32 daddr, saddr; u32 mark = IP4_REPLY_MARK(net, skb->mark); int type = icmp_param->data.icmph.type; @@ -425,12 +423,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) sk = icmp_xmit_lock(net); if (!sk) goto out_bh_enable; - inet = inet_sk(sk); icmp_param->data.icmph.checksum = 0; ipcm_init(&ipc); - inet->tos = ip_hdr(skb)->tos; + ipc.tos = ip_hdr(skb)->tos; ipc.sockc.mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); @@ -609,12 +606,14 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, struct sock *sk; if (!rt) - goto out; + return; + + rcu_read_lock(); if (rt->dst.dev) - net = 
dev_net(rt->dst.dev); + net = dev_net_rcu(rt->dst.dev); else if (skb_in->dev) - net = dev_net(skb_in->dev); + net = dev_net_rcu(skb_in->dev); else goto out; @@ -736,8 +735,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, icmp_param.data.icmph.checksum = 0; icmp_param.skb = skb_in; icmp_param.offset = skb_network_offset(skb_in); - inet_sk(sk)->tos = tos; ipcm_init(&ipc); + ipc.tos = tos; ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts.opt; ipc.sockc.mark = mark; @@ -786,7 +785,8 @@ out_unlock: icmp_xmit_unlock(sk); out_bh_enable: local_bh_enable(); -out:; +out: + rcu_read_unlock(); } EXPORT_SYMBOL(__icmp_send); @@ -835,7 +835,7 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) * avoid additional coding at protocol handlers. */ if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) { - __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); return; } @@ -869,7 +869,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb) struct net *net; u32 info = 0; - net = dev_net(skb_dst(skb)->dev); + net = dev_net_rcu(skb_dst(skb)->dev); /* * Incomplete header ? @@ -980,7 +980,7 @@ out_err: static enum skb_drop_reason icmp_redirect(struct sk_buff *skb) { if (skb->len < sizeof(struct iphdr)) { - __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); return SKB_DROP_REASON_PKT_TOO_SMALL; } @@ -1012,7 +1012,7 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) struct icmp_bxm icmp_param; struct net *net; - net = dev_net(skb_dst(skb)->dev); + net = dev_net_rcu(skb_dst(skb)->dev); /* should there be an ICMP stat for ignored echos? */ if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all)) return SKB_NOT_DROPPED_YET; @@ -1041,9 +1041,9 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) { + struct net *net = dev_net_rcu(skb->dev); struct icmp_ext_hdr *ext_hdr, _ext_hdr; struct icmp_ext_echo_iio *iio, _iio; - struct net *net = dev_net(skb->dev); struct inet6_dev *in6_dev; struct in_device *in_dev; struct net_device *dev; @@ -1182,7 +1182,7 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb) return SKB_NOT_DROPPED_YET; out_err: - __ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net_rcu(skb_dst(skb)->dev), ICMP_MIB_INERRORS); return SKB_DROP_REASON_PKT_TOO_SMALL; } @@ -1199,7 +1199,7 @@ int icmp_rcv(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->dst.dev); + struct net *net = dev_net_rcu(rt->dst.dev); struct icmphdr *icmph; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { @@ -1248,22 +1248,6 @@ int icmp_rcv(struct sk_buff *skb) goto reason_check; } - if (icmph->type == ICMP_EXT_ECHOREPLY) { - reason = ping_rcv(skb); - goto reason_check; - } - - /* - * 18 is the highest 'known' ICMP type. Anything else is a mystery - * - * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently - * discarded. - */ - if (icmph->type > NR_ICMP_TYPES) { - reason = SKB_DROP_REASON_UNHANDLED_PROTO; - goto error; - } - /* * Parse the ICMP message */ @@ -1290,6 +1274,23 @@ int icmp_rcv(struct sk_buff *skb) } } + if (icmph->type == ICMP_EXT_ECHOREPLY || + icmph->type == ICMP_ECHOREPLY) { + reason = ping_rcv(skb); + return reason ? NET_RX_DROP : NET_RX_SUCCESS; + } + + /* + * 18 is the highest 'known' ICMP type. 
Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. + */ + if (icmph->type > NR_ICMP_TYPES) { + reason = SKB_DROP_REASON_UNHANDLED_PROTO; + goto error; + } + reason = icmp_pointers[icmph->type].handler(skb); reason_check: if (!reason) { @@ -1372,9 +1373,9 @@ int icmp_err(struct sk_buff *skb, u32 info) struct iphdr *iph = (struct iphdr *)skb->data; int offset = iph->ihl<<2; struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset); + struct net *net = dev_net_rcu(skb->dev); int type = icmp_hdr(skb)->type; int code = icmp_hdr(skb)->code; - struct net *net = dev_net(skb->dev); /* * Use ping_err to handle all icmp errors except those diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 6a238398acc9..ca7d539b3846 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -81,6 +81,7 @@ #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> +#include "igmp_internal.h" #include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/times.h> @@ -88,6 +89,8 @@ #include <linux/byteorder/generic.h> #include <net/net_namespace.h> +#include <net/netlink.h> +#include <net/addrconf.h> #include <net/arp.h> #include <net/ip.h> #include <net/protocol.h> @@ -202,7 +205,7 @@ static void ip_sf_list_clear_all(struct ip_sf_list *psf) static void igmp_stop_timer(struct ip_mc_list *im) { spin_lock_bh(&im->lock); - if (del_timer(&im->timer)) + if (timer_delete(&im->timer)) refcount_dec(&im->refcnt); im->tm_running = 0; im->reporter = 0; @@ -248,7 +251,7 @@ static void igmp_mod_timer(struct ip_mc_list *im, int max_delay) { spin_lock_bh(&im->lock); im->unsolicit_count = 0; - if (del_timer(&im->timer)) { + if (timer_delete(&im->timer)) { if ((long)(im->timer.expires-jiffies) < max_delay) { add_timer(&im->timer); im->tm_running = 1; @@ -971,7 +974,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, } /* cancel the interface change timer */ WRITE_ONCE(in_dev->mr_ifc_count, 0); - if (del_timer(&in_dev->mr_ifc_timer)) + if (timer_delete(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); /* clear deleted report items */ igmpv3_clear_delrec(in_dev); @@ -1430,6 +1433,70 @@ static void ip_mc_hash_remove(struct in_device *in_dev, *mc_hash = im->next_hash; } +int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev, + const struct ip_mc_list *im, + struct inet_fill_args *args) +{ + struct ifa_cacheinfo ci; + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, args->portid, args->seq, args->event, + sizeof(struct ifaddrmsg), args->flags); + if (!nlh) + return -EMSGSIZE; + + ifm = nlmsg_data(nlh); + ifm->ifa_family = AF_INET; + ifm->ifa_prefixlen = 32; + ifm->ifa_flags = IFA_F_PERMANENT; + ifm->ifa_scope = RT_SCOPE_UNIVERSE; + ifm->ifa_index = dev->ifindex; + + ci.cstamp = (READ_ONCE(im->mca_cstamp) - INITIAL_JIFFIES) * 100UL / HZ; + ci.tstamp = ci.cstamp; + ci.ifa_prefered = INFINITY_LIFE_TIME; + ci.ifa_valid = INFINITY_LIFE_TIME; + + if (nla_put_in_addr(skb, IFA_MULTICAST, im->multiaddr) < 0 || + nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci) < 0) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + nlmsg_end(skb, nlh); + return 0; +} + +static void inet_ifmcaddr_notify(struct net_device *dev, + const struct ip_mc_list *im, int event) +{ + struct inet_fill_args fillargs = { + .event = event, + }; + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOMEM; + + skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(sizeof(__be32)) + + 
nla_total_size(sizeof(struct ifa_cacheinfo)), + GFP_KERNEL); + if (!skb) + goto error; + + err = inet_fill_ifmcaddr(skb, dev, im, &fillargs); + if (err < 0) { + WARN_ON_ONCE(err == -EMSGSIZE); + nlmsg_free(skb); + goto error; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MCADDR, NULL, GFP_KERNEL); + return; +error: + rtnl_set_sk_err(net, RTNLGRP_IPV4_MCADDR, err); +} /* * A socket has joined a multicast group on device dev. @@ -1473,6 +1540,8 @@ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr, im->interface = in_dev; in_dev_hold(in_dev); im->multiaddr = addr; + im->mca_cstamp = jiffies; + im->mca_tstamp = im->mca_cstamp; /* initial mode is (EX, empty) */ im->sfmode = mode; im->sfcount[mode] = 1; @@ -1492,6 +1561,7 @@ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr, igmpv3_del_delrec(in_dev, im); #endif igmp_group_added(im); + inet_ifmcaddr_notify(in_dev->dev, im, RTM_NEWMULTICAST); if (!in_dev->dead) ip_rt_multicast_event(in_dev); out: @@ -1705,6 +1775,8 @@ void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp) *ip = i->next_rcu; in_dev->mc_count--; __igmp_group_dropped(i, gfp); + inet_ifmcaddr_notify(in_dev->dev, i, + RTM_DELMULTICAST); ip_mc_clear_src(i); if (!in_dev->dead) @@ -1758,10 +1830,10 @@ void ip_mc_down(struct in_device *in_dev) #ifdef CONFIG_IP_MULTICAST WRITE_ONCE(in_dev->mr_ifc_count, 0); - if (del_timer(&in_dev->mr_ifc_timer)) + if (timer_delete(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); in_dev->mr_gq_running = 0; - if (del_timer(&in_dev->mr_gq_timer)) + if (timer_delete(&in_dev->mr_gq_timer)) __in_dev_put(in_dev); #endif diff --git a/net/ipv4/igmp_internal.h b/net/ipv4/igmp_internal.h new file mode 100644 index 000000000000..0a1bcc8ec8e1 --- /dev/null +++ b/net/ipv4/igmp_internal.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_IGMP_INTERNAL_H +#define _LINUX_IGMP_INTERNAL_H + +struct inet_fill_args { + u32 portid; + u32 seq; + int event; + unsigned int flags; + int netnsid; + int ifindex; +}; + +int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev, + const struct ip_mc_list *im, + struct inet_fill_args *args); +#endif diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 6872b5aff73e..20915895bdaa 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -157,12 +157,10 @@ static bool inet_use_bhash2_on_bind(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { - int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); - - if (addr_type == IPV6_ADDR_ANY) + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) return false; - if (addr_type != IPV6_ADDR_MAPPED) + if (!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) return true; } #endif @@ -332,7 +330,7 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret, struct inet_bind2_bucket **tb2_ret, struct inet_bind_hashbucket **head2_ret, int *port_ret) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); int i, low, high, attempt_half, port, l3mdev; struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); @@ -514,10 +512,10 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, */ int inet_csk_get_port(struct sock *sk, unsigned short snum) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; bool found_port = false, check_bind_conflict = true; bool bhash_created = 
false, bhash2_created = false; + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); int ret = -EADDRINUSE, port = snum, l3mdev; struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2 = NULL; @@ -600,7 +598,7 @@ fail_unlock: if (bhash2_created) inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2); if (bhash_created) - inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); } if (head2_lock_acquired) spin_unlock(&head2->lock); @@ -769,7 +767,6 @@ void inet_csk_init_xmit_timers(struct sock *sk, timer_setup(&sk->sk_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } -EXPORT_SYMBOL(inet_csk_init_xmit_timers); void inet_csk_clear_xmit_timers(struct sock *sk) { @@ -782,7 +779,6 @@ void inet_csk_clear_xmit_timers(struct sock *sk) sk_stop_timer(sk, &icsk->icsk_delack_timer); sk_stop_timer(sk, &sk->sk_timer); } -EXPORT_SYMBOL(inet_csk_clear_xmit_timers); void inet_csk_clear_xmit_timers_sync(struct sock *sk) { @@ -799,18 +795,6 @@ void inet_csk_clear_xmit_timers_sync(struct sock *sk) sk_stop_timer_sync(sk, &sk->sk_timer); } -void inet_csk_delete_keepalive_timer(struct sock *sk) -{ - sk_stop_timer(sk, &sk->sk_timer); -} -EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); - -void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) -{ - sk_reset_timer(sk, &sk->sk_timer, jiffies + len); -} -EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); - struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4, const struct request_sock *req) @@ -845,7 +829,6 @@ no_route: __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } -EXPORT_SYMBOL_GPL(inet_csk_route_req); struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, struct sock *newsk, @@ -912,7 +895,6 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) req->num_retrans++; return err; } -EXPORT_SYMBOL(inet_rtx_syn_ack); static struct request_sock * reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener, @@ -1040,9 +1022,10 @@ static bool reqsk_queue_unlink(struct request_sock *req) bool found = false; if (sk_hashed(sk)) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); - spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); + spinlock_t *lock; + lock = inet_ehash_lockp(hashinfo, req->rsk_hash); spin_lock(lock); found = __sk_nulls_del_node_init_rcu(sk); spin_unlock(lock); @@ -1072,14 +1055,13 @@ bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req) { return __inet_csk_reqsk_queue_drop(sk, req, false); } -EXPORT_SYMBOL(inet_csk_reqsk_queue_drop); void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req) { inet_csk_reqsk_queue_drop(sk, req); reqsk_put(req); } -EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put); +EXPORT_IPV6_MOD(inet_csk_reqsk_queue_drop_and_put); static void reqsk_timer_handler(struct timer_list *t) { @@ -1223,7 +1205,6 @@ bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, inet_csk_reqsk_queue_added(sk); return true; } -EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk, const gfp_t priority) @@ -1249,42 +1230,61 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, const gfp_t priority) { struct sock *newsk = sk_clone_lock(sk, priority); + struct inet_connection_sock *newicsk; + struct inet_request_sock *ireq; + struct inet_sock *newinet; 
- if (newsk) { - struct inet_connection_sock *newicsk = inet_csk(newsk); + if (!newsk) + return NULL; - inet_sk_set_state(newsk, TCP_SYN_RECV); - newicsk->icsk_bind_hash = NULL; - newicsk->icsk_bind2_hash = NULL; + newicsk = inet_csk(newsk); + newinet = inet_sk(newsk); + ireq = inet_rsk(req); - inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; - inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num; - inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); + newicsk->icsk_bind_hash = NULL; + newicsk->icsk_bind2_hash = NULL; - /* listeners have SOCK_RCU_FREE, not the children */ - sock_reset_flag(newsk, SOCK_RCU_FREE); + newinet->inet_dport = ireq->ir_rmt_port; + newinet->inet_num = ireq->ir_num; + newinet->inet_sport = htons(ireq->ir_num); - inet_sk(newsk)->mc_list = NULL; + newsk->sk_bound_dev_if = ireq->ir_iif; - newsk->sk_mark = inet_rsk(req)->ir_mark; - atomic64_set(&newsk->sk_cookie, - atomic64_read(&inet_rsk(req)->ir_cookie)); + newsk->sk_daddr = ireq->ir_rmt_addr; + newsk->sk_rcv_saddr = ireq->ir_loc_addr; + newinet->inet_saddr = ireq->ir_loc_addr; - newicsk->icsk_retransmits = 0; - newicsk->icsk_backoff = 0; - newicsk->icsk_probes_out = 0; - newicsk->icsk_probes_tstamp = 0; +#if IS_ENABLED(CONFIG_IPV6) + newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; + newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; +#endif - /* Deinitialize accept_queue to trap illegal accesses. */ - memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); + /* listeners have SOCK_RCU_FREE, not the children */ + sock_reset_flag(newsk, SOCK_RCU_FREE); - inet_clone_ulp(req, newsk, priority); + inet_sk(newsk)->mc_list = NULL; + + newsk->sk_mark = inet_rsk(req)->ir_mark; + atomic64_set(&newsk->sk_cookie, + atomic64_read(&inet_rsk(req)->ir_cookie)); + + newicsk->icsk_retransmits = 0; + newicsk->icsk_backoff = 0; + newicsk->icsk_probes_out = 0; + newicsk->icsk_probes_tstamp = 0; + + /* Deinitialize accept_queue to trap illegal accesses. */ + memset(&newicsk->icsk_accept_queue, 0, + sizeof(newicsk->icsk_accept_queue)); + + inet_sk_set_state(newsk, TCP_SYN_RECV); + + inet_clone_ulp(req, newsk, priority); + + security_inet_csk_clone(newsk, req); - security_inet_csk_clone(newsk, req); - } return newsk; } -EXPORT_SYMBOL_GPL(inet_csk_clone_lock); /* * At this point, there should be no process reference to this @@ -1316,7 +1316,7 @@ void inet_csk_destroy_sock(struct sock *sk) EXPORT_SYMBOL(inet_csk_destroy_sock); /* This function allows to force a closure of a socket after the call to - * tcp/dccp_create_openreq_child(). + * tcp_create_openreq_child(). 
*/ void inet_csk_prepare_forced_close(struct sock *sk) __releases(&sk->sk_lock.slock) @@ -1374,7 +1374,6 @@ int inet_csk_listen_start(struct sock *sk) inet_sk_set_state(sk, TCP_CLOSE); return err; } -EXPORT_SYMBOL_GPL(inet_csk_listen_start); static void inet_child_forget(struct sock *sk, struct request_sock *req, struct sock *child) @@ -1469,7 +1468,6 @@ child_put: sock_put(child); return NULL; } -EXPORT_SYMBOL(inet_csk_complete_hashdance); /* * This routine closes sockets which have been at least partially @@ -1547,34 +1545,16 @@ skip_child_forget: } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); -void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) -{ - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; - const struct inet_sock *inet = inet_sk(sk); - - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = inet->inet_daddr; - sin->sin_port = inet->inet_dport; -} -EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); - static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl) { const struct inet_sock *inet = inet_sk(sk); - const struct ip_options_rcu *inet_opt; - __be32 daddr = inet->inet_daddr; struct flowi4 *fl4; struct rtable *rt; rcu_read_lock(); - inet_opt = rcu_dereference(inet->inet_opt); - if (inet_opt && inet_opt->opt.srr) - daddr = inet_opt->opt.faddr; fl4 = &fl->u.ip4; - rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, - inet->inet_saddr, inet->inet_dport, - inet->inet_sport, sk->sk_protocol, - ip_sock_rt_tos(sk), sk->sk_bound_dev_if); + inet_sk_init_flowi4(inet, fl4); + rt = ip_route_output_flow(sock_net(sk), fl4, sk); if (IS_ERR(rt)) rt = NULL; if (rt) @@ -1602,4 +1582,3 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu) out: return dst; } -EXPORT_SYMBOL_GPL(inet_csk_update_pmtu); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 321acc8abf17..1d1d6ad53f4c 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -160,7 +160,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, ext & (1 << (INET_DIAG_TCLASS - 1))) { u32 classid = 0; -#ifdef CONFIG_SOCK_CGROUP_DATA +#ifdef CONFIG_CGROUP_NET_CLASSID classid = sock_cgroup_classid(&sk->sk_cgrp_data); #endif /* Fallback to socket priority if class id isn't set. 
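
The inet_csk_rebuild_route() hunk just above folds the hand-rolled flow setup, including the IP source-route (SRR) override of the destination, into the shared inet_sk_init_flowi4() helper introduced elsewhere in this series. Based only on the fields the deleted code touched, the helper plausibly looks something like the sketch below; treat it as an approximation, not the actual body:

static inline void inet_sk_init_flowi4(const struct inet_sock *inet,
				       struct flowi4 *fl4)
{
	const struct ip_options_rcu *opt;
	const struct sock *sk = &inet->sk;
	__be32 daddr = inet->inet_daddr;

	/* Source routing overrides the nexthop destination, exactly as
	 * the deleted inet_csk_rebuild_route() code did by hand. */
	rcu_read_lock();
	opt = rcu_dereference(inet->inet_opt);
	if (opt && opt->opt.srr)
		daddr = opt->opt.faddr;
	rcu_read_unlock();

	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   ip_sock_rt_tos(sk), ip_sock_rt_scope(sk),
			   sk->sk_protocol, inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, inet->inet_dport,
			   inet->inet_sport, sk->sk_uid);
}

Centralizing this removes several near-identical copies of the SRR special case scattered across the IPv4 socket code.
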
@@ -282,7 +282,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct inet_diag_meminfo minfo = { .idiag_rmem = sk_rmem_alloc_get(sk), .idiag_wmem = READ_ONCE(sk->sk_wmem_queued), - .idiag_fmem = sk_forward_alloc_get(sk), + .idiag_fmem = READ_ONCE(sk->sk_forward_alloc), .idiag_tmem = sk_wmem_alloc_get(sk), }; @@ -315,12 +315,12 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; r->idiag_expires = - jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); + jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); } else if (icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; r->idiag_retrans = icsk->icsk_probes_out; r->idiag_expires = - jiffies_delta_to_msecs(icsk->icsk_timeout - jiffies); + jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); } else if (timer_pending(&sk->sk_timer)) { r->idiag_timer = 2; r->idiag_retrans = icsk->icsk_probes_out; @@ -1369,8 +1369,6 @@ static int inet_diag_type2proto(int type) switch (type) { case TCPDIAG_GETSOCK: return IPPROTO_TCP; - case DCCPDIAG_GETSOCK: - return IPPROTO_DCCP; default: return 0; } diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index d179a2c84222..470ab17ceb51 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -133,7 +133,7 @@ static void inet_frags_free_cb(void *ptr, void *arg) struct inet_frag_queue *fq = ptr; int count; - count = del_timer_sync(&fq->timer) ? 1 : 0; + count = timer_delete_sync(&fq->timer) ? 1 : 0; spin_lock_bh(&fq->lock); fq->flags |= INET_FRAG_DROP; @@ -145,8 +145,7 @@ static void inet_frags_free_cb(void *ptr, void *arg) } spin_unlock_bh(&fq->lock); - if (refcount_sub_and_test(count, &fq->refcnt)) - inet_frag_destroy(fq); + inet_frag_putn(fq, count); } static LLIST_HEAD(fqdir_free_list); @@ -226,10 +225,10 @@ void fqdir_exit(struct fqdir *fqdir) } EXPORT_SYMBOL(fqdir_exit); -void inet_frag_kill(struct inet_frag_queue *fq) +void inet_frag_kill(struct inet_frag_queue *fq, int *refs) { - if (del_timer(&fq->timer)) - refcount_dec(&fq->refcnt); + if (timer_delete(&fq->timer)) + (*refs)++; if (!(fq->flags & INET_FRAG_COMPLETE)) { struct fqdir *fqdir = fq->fqdir; @@ -244,7 +243,7 @@ void inet_frag_kill(struct inet_frag_queue *fq) if (!READ_ONCE(fqdir->dead)) { rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, fqdir->f->rhash_params); - refcount_dec(&fq->refcnt); + (*refs)++; } else { fq->flags |= INET_FRAG_HASH_DEAD; } @@ -298,7 +297,7 @@ void inet_frag_destroy(struct inet_frag_queue *q) reason = (q->flags & INET_FRAG_DROP) ? SKB_DROP_REASON_FRAG_REASM_TIMEOUT : SKB_CONSUMED; - WARN_ON(del_timer(&q->timer) != 0); + WARN_ON(timer_delete(&q->timer) != 0); /* Release all fragment data. */ fqdir = q->fqdir; @@ -328,7 +327,8 @@ static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); - refcount_set(&q->refcnt, 3); + /* One reference for the timer, one for the hash table. */ + refcount_set(&q->refcnt, 2); return q; } @@ -350,15 +350,20 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, &q->node, f->rhash_params); if (*prev) { + /* We could not insert in the hash table, + * we need to cancel what inet_frag_alloc() + * anticipated. 
+ */ + int refs = 1; + q->flags |= INET_FRAG_COMPLETE; - inet_frag_kill(q); - inet_frag_destroy(q); + inet_frag_kill(q, &refs); + inet_frag_putn(q, refs); return NULL; } return q; } -/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) { /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */ @@ -368,17 +373,11 @@ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) if (!high_thresh || frag_mem_limit(fqdir) > high_thresh) return NULL; - rcu_read_lock(); - prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params); if (!prev) fq = inet_frag_create(fqdir, key, &prev); - if (!IS_ERR_OR_NULL(prev)) { + if (!IS_ERR_OR_NULL(prev)) fq = prev; - if (!refcount_inc_not_zero(&fq->refcnt)) - fq = NULL; - } - rcu_read_unlock(); return fq; } EXPORT_SYMBOL(inet_frag_find); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 9bfcfd016e18..77a0b52b2eab 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -23,11 +23,12 @@ #if IS_ENABLED(CONFIG_IPV6) #include <net/inet6_hashtables.h> #endif -#include <net/secure_seq.h> #include <net/hotdata.h> #include <net/ip.h> -#include <net/tcp.h> +#include <net/rps.h> +#include <net/secure_seq.h> #include <net/sock_reuseport.h> +#include <net/tcp.h> u32 inet_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, @@ -35,8 +36,8 @@ u32 inet_ehashfn(const struct net *net, const __be32 laddr, { net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret)); - return __inet_ehashfn(laddr, lport, faddr, fport, - inet_ehash_secret + net_hash_mix(net)); + return lport + __inet_ehashfn(laddr, 0, faddr, fport, + inet_ehash_secret + net_hash_mix(net)); } EXPORT_SYMBOL_GPL(inet_ehashfn); @@ -76,7 +77,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, tb->fastreuse = 0; tb->fastreuseport = 0; INIT_HLIST_HEAD(&tb->bhash2); - hlist_add_head(&tb->node, &head->chain); + hlist_add_head_rcu(&tb->node, &head->chain); } return tb; } @@ -84,11 +85,11 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, /* * Caller must hold hashbucket lock for this tb with local BH disabled */ -void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) +void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) { if (hlist_empty(&tb->bhash2)) { - __hlist_del(&tb->node); - kmem_cache_free(cachep, tb); + hlist_del_rcu(&tb->node); + kfree_rcu(tb, rcu); } } @@ -176,7 +177,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, */ static void __inet_put_port(struct sock *sk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); struct inet_bind_bucket *tb; @@ -201,7 +202,7 @@ static void __inet_put_port(struct sock *sk) } spin_unlock(&head2->lock); - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); spin_unlock(&head->lock); } @@ -215,7 +216,7 @@ EXPORT_SYMBOL(inet_put_port); int __inet_inherit_port(const struct sock *sk, struct sock *child) { - struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *table = tcp_get_hashinfo(sk); unsigned short port = inet_sk(child)->inet_num; struct inet_bind_hashbucket *head, *head2; bool created_inet_bind_bucket = false; @@ -285,7 +286,7 @@ bhash2_find: 
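
Returning to the inet_fragment.c changes at the start of this run of hunks: inet_frag_kill() now takes an int *refs accumulator and inet_frag_putn() releases every shed reference in one operation, replacing the scattered refcount_dec() calls (queues are also born with refcount 2, one for the timer and one for the hash table). A toy model of that batching, with illustrative names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct frag_queue {
	atomic_int refcnt; /* 2 at birth: timer + hash table */
};

/* Each teardown step that used to drop the refcount directly now only
 * bumps a local counter owned by the caller. */
static void queue_kill(struct frag_queue *q, int *refs)
{
	(void)q;
	(*refs)++; /* timer cancelled */
	(*refs)++; /* removed from the hash table */
}

/* One atomic subtraction instead of several; returns true when the
 * last reference went away and the queue can be destroyed. */
static bool queue_putn(struct frag_queue *q, int refs)
{
	return refs && atomic_fetch_sub(&q->refcnt, refs) == refs;
}

int main(void)
{
	struct frag_queue q = { .refcnt = 2 };
	int refs = 0;

	queue_kill(&q, &refs);
	printf("freed: %d\n", queue_putn(&q, refs)); /* freed: 1 */
	return 0;
}

Batching matters here because the drops happen on hot reassembly paths, where every contended atomic costs.
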
error: if (created_inet_bind_bucket) - inet_bind_bucket_destroy(table->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOMEM; @@ -537,7 +538,9 @@ EXPORT_SYMBOL_GPL(__inet_lookup_established); /* called with local bh disabled */ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, __u16 lport, - struct inet_timewait_sock **twp) + struct inet_timewait_sock **twp, + bool rcu_lookup, + u32 hash) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); @@ -548,14 +551,25 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, int sdif = l3mdev_master_ifindex_by_index(net, dif); INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); - unsigned int hash = inet_ehashfn(net, daddr, lport, - saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); - spinlock_t *lock = inet_ehash_lockp(hinfo, hash); - struct sock *sk2; - const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; + const struct hlist_nulls_node *node; + struct sock *sk2; + spinlock_t *lock; + + if (rcu_lookup) { + sk_nulls_for_each(sk2, node, &head->chain) { + if (sk2->sk_hash != hash || + !inet_match(net, sk2, acookie, ports, dif, sdif)) + continue; + if (sk2->sk_state == TCP_TIME_WAIT) + break; + return -EADDRNOTAVAIL; + } + return 0; + } + lock = inet_ehash_lockp(hinfo, hash); spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { @@ -655,7 +669,7 @@ static bool inet_ehash_lookup_by_sk(struct sock *sk, */ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_ehash_bucket *head; struct hlist_nulls_head *list; spinlock_t *lock; @@ -700,7 +714,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) } return ok; } -EXPORT_SYMBOL_GPL(inet_ehash_nolisten); +EXPORT_IPV6_MOD(inet_ehash_nolisten); static int inet_reuseport_add_sock(struct sock *sk, struct inet_listen_hashbucket *ilb) @@ -727,7 +741,7 @@ static int inet_reuseport_add_sock(struct sock *sk, int __inet_hash(struct sock *sk, struct sock *osk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; int err = 0; @@ -758,7 +772,7 @@ unlock: return err; } -EXPORT_SYMBOL(__inet_hash); +EXPORT_IPV6_MOD(__inet_hash); int inet_hash(struct sock *sk) { @@ -769,15 +783,15 @@ int inet_hash(struct sock *sk) return err; } -EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { - struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); if (sk_unhashed(sk)) return; + sock_rps_delete_flow(sk); if (sk->sk_state == TCP_LISTEN) { struct inet_listen_hashbucket *ilb2; @@ -810,7 +824,7 @@ void inet_unhash(struct sock *sk) spin_unlock_bh(lock); } } -EXPORT_SYMBOL_GPL(inet_unhash); +EXPORT_IPV6_MOD(inet_unhash); static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, const struct net *net, unsigned short port, @@ -861,7 +875,7 @@ inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) { - struct 
inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); u32 hash; #if IS_ENABLED(CONFIG_IPV6) @@ -889,7 +903,7 @@ static void inet_update_saddr(struct sock *sk, void *saddr, int family) static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) { - struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2, *new_tb2; int l3mdev = inet_sk_bound_l3mdev(sk); @@ -969,14 +983,14 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) { return __inet_bhash2_update_saddr(sk, saddr, family, false); } -EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); +EXPORT_IPV6_MOD(inet_bhash2_update_saddr); void inet_bhash2_reset_saddr(struct sock *sk) { if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) __inet_bhash2_update_saddr(sk, NULL, 0, true); } -EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr); +EXPORT_IPV6_MOD(inet_bhash2_reset_saddr); /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') @@ -993,8 +1007,10 @@ static u32 *table_perturb; int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, + u32 hash_port0, int (*check_established)(struct inet_timewait_death_row *, - struct sock *, __u16, struct inet_timewait_sock **)) + struct sock *, __u16, struct inet_timewait_sock **, + bool rcu_lookup, u32 hash)) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_bind_hashbucket *head, *head2; @@ -1012,7 +1028,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, if (port) { local_bh_disable(); - ret = check_established(death_row, sk, port, NULL); + ret = check_established(death_row, sk, port, NULL, false, + hash_port0 + port); local_bh_enable(); return ret; } @@ -1048,6 +1065,22 @@ other_parity_scan: continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; + rcu_read_lock(); + hlist_for_each_entry_rcu(tb, &head->chain, node) { + if (!inet_bind_bucket_match(tb, net, port, l3mdev)) + continue; + if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) { + rcu_read_unlock(); + goto next_port; + } + if (!check_established(death_row, sk, port, &tw, true, + hash_port0 + port)) + break; + rcu_read_unlock(); + goto next_port; + } + rcu_read_unlock(); + spin_lock_bh(&head->lock); /* Does not bother with rcv_saddr checks, because @@ -1057,12 +1090,13 @@ other_parity_scan: if (inet_bind_bucket_match(tb, net, port, l3mdev)) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) - goto next_port; + goto next_port_unlock; WARN_ON(hlist_empty(&tb->bhash2)); if (!check_established(death_row, sk, - port, &tw)) + port, &tw, false, + hash_port0 + port)) goto ok; - goto next_port; + goto next_port_unlock; } } @@ -1076,8 +1110,9 @@ other_parity_scan: tb->fastreuse = -1; tb->fastreuseport = -1; goto ok; -next_port: +next_port_unlock: spin_unlock_bh(&head->lock); +next_port: cond_resched(); } @@ -1149,7 +1184,7 @@ error: spin_unlock(&head2->lock); if (tb_created) - inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); spin_unlock(&head->lock); if (tw) @@ -1166,14 +1201,20 @@ error: int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { + const struct inet_sock *inet = inet_sk(sk); + const struct net *net = sock_net(sk); u64 port_offset = 0; + u32 hash_port0; if (!inet_sk(sk)->inet_num) port_offset = 
inet_sk_port_offset(sk); - return __inet_hash_connect(death_row, sk, port_offset, + + hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0, + inet->inet_daddr, inet->inet_dport); + + return __inet_hash_connect(death_row, sk, port_offset, hash_port0, __inet_check_established); } -EXPORT_SYMBOL_GPL(inet_hash_connect); static void init_hashinfo_lhash2(struct inet_hashinfo *h) { @@ -1224,32 +1265,45 @@ int inet_hashinfo2_init_mod(struct inet_hashinfo *h) init_hashinfo_lhash2(h); return 0; } -EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); unsigned int i, nblocks = 1; + spinlock_t *ptr = NULL; - if (locksz != 0) { - /* allocate 2 cache lines or at least one spinlock per cpu */ - nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U); - nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); + if (locksz == 0) + goto set_mask; - /* no more locks than number of hash buckets */ - nblocks = min(nblocks, hashinfo->ehash_mask + 1); + /* Allocate 2 cache lines or at least one spinlock per cpu. */ + nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus(); - hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); - if (!hashinfo->ehash_locks) - return -ENOMEM; + /* At least one page per NUMA node. */ + nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz); + + nblocks = roundup_pow_of_two(nblocks); - for (i = 0; i < nblocks; i++) - spin_lock_init(&hashinfo->ehash_locks[i]); + /* No more locks than number of hash buckets. */ + nblocks = min(nblocks, hashinfo->ehash_mask + 1); + + if (num_online_nodes() > 1) { + /* Use vmalloc() to allow NUMA policy to spread pages + * on all available nodes if desired. + */ + ptr = vmalloc_array(nblocks, locksz); + } + if (!ptr) { + ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL); + if (!ptr) + return -ENOMEM; } + for (i = 0; i < nblocks; i++) + spin_lock_init(&ptr[i]); + hashinfo->ehash_locks = ptr; +set_mask: hashinfo->ehash_locks_mask = nblocks - 1; return 0; } -EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, unsigned int ehash_entries) @@ -1285,7 +1339,6 @@ free_hashinfo: err: return NULL; } -EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc); void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) { @@ -1296,4 +1349,3 @@ void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) vfree(hashinfo->ehash); kfree(hashinfo); } -EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 337390ba85b4..67efe9501581 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -39,7 +39,7 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, tw->tw_tb = NULL; tw->tw_tb2 = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); __sock_put((struct sock *)tw); } @@ -166,7 +166,6 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, spin_unlock(lock); local_bh_enable(); } -EXPORT_SYMBOL_GPL(inet_twsk_hashdance_schedule); static void tw_timer_handler(struct timer_list *t) { @@ -223,7 +222,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, return tw; } -EXPORT_SYMBOL_GPL(inet_twsk_alloc); /* These are always called from BH context. See callers in * tcp_input.c to verify this. 
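/*
 * A minimal userspace model of the lock-array sizing that the
 * inet_ehash_locks_alloc() hunk above introduces: scale the spinlock
 * count with the number of CPUs, guarantee at least one page worth of
 * locks per NUMA node, round up to a power of two, then clamp to the
 * number of hash buckets. This is a sketch, not kernel API: every
 * constant below (cache line size, page size, CPU and node counts,
 * ehash mask) is an illustrative stand-in for the kernel macro it names.
 */
#include <stdio.h>

static unsigned int roundup_pow_of_two(unsigned int n)
{
	unsigned int r = 1;

	while (r < n)
		r <<= 1;
	return r;
}

int main(void)
{
	const unsigned int locksz = 4;        /* sizeof(spinlock_t), config dependent */
	const unsigned int l1_cache = 64;     /* L1_CACHE_BYTES */
	const unsigned int page_size = 4096;  /* PAGE_SIZE */
	const unsigned int cpus = 8;          /* num_possible_cpus() */
	const unsigned int nodes = 2;         /* num_online_nodes() */
	const unsigned int ehash_mask = (1u << 16) - 1;
	unsigned int nblocks;

	/* Allocate 2 cache lines of locks, or at least one lock, per CPU. */
	nblocks = (2 * l1_cache / locksz ? 2 * l1_cache / locksz : 1) * cpus;
	/* At least one page of locks per NUMA node. */
	if (nblocks < nodes * page_size / locksz)
		nblocks = nodes * page_size / locksz;
	nblocks = roundup_pow_of_two(nblocks);
	/* Never more locks than hash buckets. */
	if (nblocks > ehash_mask + 1)
		nblocks = ehash_mask + 1;

	printf("nblocks = %u\n", nblocks);    /* prints 2048 with these numbers */
	return 0;
}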
@@ -306,7 +304,6 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) mod_timer_pending(&tw->tw_timer, jiffies + timeo); } } -EXPORT_SYMBOL_GPL(__inet_twsk_schedule); /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ void inet_twsk_purge(struct inet_hashinfo *hashinfo) @@ -365,4 +362,3 @@ restart: rcu_read_unlock(); } } -EXPORT_SYMBOL_GPL(inet_twsk_purge); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 5ab56f4cb529..7b1e0a2d6906 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -60,7 +60,7 @@ void inet_peer_base_init(struct inet_peer_base *bp) seqlock_init(&bp->lock); bp->total = 0; } -EXPORT_SYMBOL_GPL(inet_peer_base_init); +EXPORT_IPV6_MOD_GPL(inet_peer_base_init); #define PEER_MAX_GC 32 @@ -95,6 +95,7 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, { struct rb_node **pp, *parent, *next; struct inet_peer *p; + u32 now; pp = &base->rb_root.rb_node; parent = NULL; @@ -108,8 +109,9 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, p = rb_entry(parent, struct inet_peer, rb_node); cmp = inetpeer_addr_cmp(daddr, &p->daddr); if (cmp == 0) { - if (!refcount_inc_not_zero(&p->refcnt)) - break; + now = jiffies; + if (READ_ONCE(p->dtime) != now) + WRITE_ONCE(p->dtime, now); return p; } if (gc_stack) { @@ -150,9 +152,6 @@ static void inet_peer_gc(struct inet_peer_base *base, for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; - /* The READ_ONCE() pairs with the WRITE_ONCE() - * in inet_putpeer() - */ delta = (__u32)jiffies - READ_ONCE(p->dtime); if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) @@ -168,31 +167,23 @@ static void inet_peer_gc(struct inet_peer_base *base, } } +/* Must be called under RCU : No refcount change is done here. */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, - const struct inetpeer_addr *daddr, - int create) + const struct inetpeer_addr *daddr) { struct inet_peer *p, *gc_stack[PEER_MAX_GC]; struct rb_node **pp, *parent; unsigned int gc_cnt, seq; - int invalidated; /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ - rcu_read_lock(); seq = read_seqbegin(&base->lock); p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); - invalidated = read_seqretry(&base->lock, seq); - rcu_read_unlock(); if (p) return p; - /* If no writer did a change during our lookup, we can return early. */ - if (!create && !invalidated) - return NULL; - /* retry an exact lookup, taking the lock before. * At least, nodes should be hot in our cache. */ @@ -201,12 +192,12 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, gc_cnt = 0; p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp); - if (!p && create) { + if (!p) { p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); if (p) { p->daddr = *daddr; p->dtime = (__u32)jiffies; - refcount_set(&p->refcnt, 2); + refcount_set(&p->refcnt, 1); atomic_set(&p->rid, 0); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; @@ -227,19 +218,13 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, return p; } -EXPORT_SYMBOL_GPL(inet_getpeer); +EXPORT_IPV6_MOD_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { - /* The WRITE_ONCE() pairs with itself (we run lockless) - * and the READ_ONCE() in inet_peer_gc() - */ - WRITE_ONCE(p->dtime, (__u32)jiffies); - if (refcount_dec_and_test(&p->refcnt)) kfree_rcu(p, rcu); } -EXPORT_SYMBOL_GPL(inet_putpeer); /* * Check transmit rate limitation for given message. 
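/*
 * The inet_peer_xrlim_allow() rework in the following hunk keeps the
 * same token-bucket behaviour but makes it safe to run locklessly:
 * tokens accrue with elapsed jiffies, are capped at
 * XRLIM_BURST_FACTOR * timeout, and one allowed message costs
 * `timeout` tokens. Below is a self-contained userspace model of that
 * scheme under the assumption that `now` stands in for jiffies; the
 * struct and function names are illustrative, not kernel API, and the
 * READ_ONCE()/WRITE_ONCE() annotations are elided.
 */
#include <stdbool.h>
#include <stdio.h>

#define XRLIM_BURST_FACTOR 6

struct peer_model {
	unsigned long rate_tokens;
	unsigned long rate_last;
};

static bool xrlim_allow_model(struct peer_model *p, unsigned long now,
			      unsigned long timeout)
{
	unsigned long token = p->rate_tokens;
	unsigned long delta = now - p->rate_last;
	bool ok = false;

	if (delta) {			/* only touch rate_last when time moved */
		p->rate_last = now;
		token += delta;
		if (token > XRLIM_BURST_FACTOR * timeout)
			token = XRLIM_BURST_FACTOR * timeout;
	}
	if (token >= timeout) {		/* enough credit for one message */
		token -= timeout;
		ok = true;
	}
	p->rate_tokens = token;		/* the kernel additionally skips the
					 * store when the value is unchanged */
	return ok;
}

int main(void)
{
	struct peer_model p = { .rate_tokens = 0, .rate_last = 0 };

	/* With timeout 1000 and 300 "jiffies" between attempts, roughly
	 * every third message is allowed once credit builds up. */
	for (unsigned long t = 0; t < 10; t++)
		printf("t=%lu allow=%d\n", t,
		       xrlim_allow_model(&p, t * 300, 1000));
	return 0;
}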
@@ -261,26 +246,30 @@ EXPORT_SYMBOL_GPL(inet_putpeer); #define XRLIM_BURST_FACTOR 6 bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) { - unsigned long now, token; + unsigned long now, token, otoken, delta; bool rc = false; if (!peer) return true; - token = peer->rate_tokens; + token = otoken = READ_ONCE(peer->rate_tokens); now = jiffies; - token += now - peer->rate_last; - peer->rate_last = now; - if (token > XRLIM_BURST_FACTOR * timeout) - token = XRLIM_BURST_FACTOR * timeout; + delta = now - READ_ONCE(peer->rate_last); + if (delta) { + WRITE_ONCE(peer->rate_last, now); + token += delta; + if (token > XRLIM_BURST_FACTOR * timeout) + token = XRLIM_BURST_FACTOR * timeout; + } if (token >= timeout) { token -= timeout; rc = true; } - peer->rate_tokens = token; + if (token != otoken) + WRITE_ONCE(peer->rate_tokens, token); return rc; } -EXPORT_SYMBOL(inet_peer_xrlim_allow); +EXPORT_IPV6_MOD(inet_peer_xrlim_allow); void inetpeer_invalidate_tree(struct inet_peer_base *base) { @@ -297,4 +286,4 @@ void inetpeer_invalidate_tree(struct inet_peer_base *base) base->total = 0; } -EXPORT_SYMBOL(inetpeer_invalidate_tree); +EXPORT_IPV6_MOD(inetpeer_invalidate_tree); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 07036a2943c1..77f395b28ec7 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -76,21 +76,27 @@ static u8 ip4_frag_ecn(u8 tos) static struct inet_frags ip4_frags; static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev); + struct sk_buff *prev_tail, struct net_device *dev, + int *refs); static void ip4_frag_init(struct inet_frag_queue *q, const void *a) { struct ipq *qp = container_of(q, struct ipq, q); - struct net *net = q->fqdir->net; - const struct frag_v4_compare_key *key = a; + struct net *net = q->fqdir->net; + struct inet_peer *p = NULL; q->key.v4 = *key; qp->ecn = 0; - qp->peer = q->fqdir->max_dist ? - inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : - NULL; + if (q->fqdir->max_dist) { + rcu_read_lock(); + p = inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif); + if (p && !refcount_inc_not_zero(&p->refcnt)) + p = NULL; + rcu_read_unlock(); + } + qp->peer = p; } static void ip4_frag_free(struct inet_frag_queue *q) @@ -102,22 +108,6 @@ static void ip4_frag_free(struct inet_frag_queue *q) inet_putpeer(qp->peer); } - -/* Destruction primitives. */ - -static void ipq_put(struct ipq *ipq) -{ - inet_frag_put(&ipq->q); -} - -/* Kill ipq entry. It is not destroyed immediately, - * because caller (and someone more) holds reference count. - */ -static void ipq_kill(struct ipq *ipq) -{ - inet_frag_kill(&ipq->q); -} - static bool frag_expire_skip_icmp(u32 user) { return user == IP_DEFRAG_AF_PACKET || @@ -138,6 +128,7 @@ static void ip_expire(struct timer_list *t) struct sk_buff *head = NULL; struct net *net; struct ipq *qp; + int refs = 1; qp = container_of(frag, struct ipq, q); net = qp->q.fqdir->net; @@ -154,7 +145,7 @@ static void ip_expire(struct timer_list *t) goto out; qp->q.flags |= INET_FRAG_DROP; - ipq_kill(qp); + inet_frag_kill(&qp->q, &refs); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); @@ -197,7 +188,7 @@ out: out_rcu_unlock: rcu_read_unlock(); kfree_skb_reason(head, reason); - ipq_put(qp); + inet_frag_putn(&qp->q, refs); } /* Find the correct entry in the "incomplete datagrams" queue for @@ -273,7 +264,7 @@ static int ip_frag_reinit(struct ipq *qp) } /* Add new segment to existing queue. 
*/ -static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) +static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb, int *refs) { struct net *net = qp->q.fqdir->net; int ihl, end, flags, offset; @@ -293,7 +284,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && unlikely(ip_frag_too_far(qp)) && unlikely(err = ip_frag_reinit(qp))) { - ipq_kill(qp); + inet_frag_kill(&qp->q, refs); goto err; } @@ -377,10 +368,10 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = ip_frag_reasm(qp, skb, prev_tail, dev); + err = ip_frag_reasm(qp, skb, prev_tail, dev, refs); skb->_skb_refdst = orefdst; if (err) - inet_frag_kill(&qp->q); + inet_frag_kill(&qp->q, refs); return err; } @@ -397,7 +388,7 @@ insert_error: err = -EINVAL; __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); discard_qp: - inet_frag_kill(&qp->q); + inet_frag_kill(&qp->q, refs); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); err: kfree_skb_reason(skb, reason); @@ -411,7 +402,8 @@ static bool ip_frag_coalesce_ok(const struct ipq *qp) /* Build a new IP datagram from all its fragments. */ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev) + struct sk_buff *prev_tail, struct net_device *dev, + int *refs) { struct net *net = qp->q.fqdir->net; struct iphdr *iph; @@ -419,7 +411,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, int len, err; u8 ecn; - ipq_kill(qp); + inet_frag_kill(&qp->q, refs); ecn = ip_frag_ecn_table[qp->ecn]; if (unlikely(ecn == 0xff)) { @@ -491,18 +483,21 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); /* Lookup (or create) queue header */ + rcu_read_lock(); qp = ip_find(net, ip_hdr(skb), user, vif); if (qp) { - int ret; + int ret, refs = 0; spin_lock(&qp->q.lock); - ret = ip_frag_queue(qp, skb); + ret = ip_frag_queue(qp, skb, &refs); spin_unlock(&qp->q.lock); - ipq_put(qp); + rcu_read_unlock(); + inet_frag_putn(&qp->q, refs); return ret; } + rcu_read_unlock(); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); kfree_skb(skb); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f1f31ebfc793..f5b9004d6938 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -141,7 +141,6 @@ static int ipgre_err(struct sk_buff *skb, u32 info, const struct iphdr *iph; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; - unsigned int data_len = 0; struct ip_tunnel *t; if (tpi->proto == htons(ETH_P_TEB)) @@ -182,7 +181,6 @@ static int ipgre_err(struct sk_buff *skb, u32 info, case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return 0; - data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; case ICMP_REDIRECT: @@ -190,10 +188,16 @@ static int ipgre_err(struct sk_buff *skb, u32 info, } #if IS_ENABLED(CONFIG_IPV6) - if (tpi->proto == htons(ETH_P_IPV6) && - !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, - type, data_len)) - return 0; + if (tpi->proto == htons(ETH_P_IPV6)) { + unsigned int data_len = 0; + + if (type == ICMP_TIME_EXCEEDED) + data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ + + if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, + type, data_len)) + return 0; + } #endif if (t->parms.iph.daddr == 0 || @@ -924,15 +928,18 @@ static int ipgre_open(struct net_device *dev) struct ip_tunnel *t = netdev_priv(dev); if (ipv4_is_multicast(t->parms.iph.daddr)) { - 
struct flowi4 fl4; + struct flowi4 fl4 = { + .flowi4_oif = t->parms.link, + .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(&t->parms.iph)), + .flowi4_scope = RT_SCOPE_UNIVERSE, + .flowi4_proto = IPPROTO_GRE, + .saddr = t->parms.iph.saddr, + .daddr = t->parms.iph.daddr, + .fl4_gre_key = t->parms.o_key, + }; struct rtable *rt; - rt = ip_route_output_gre(t->net, &fl4, - t->parms.iph.daddr, - t->parms.iph.saddr, - t->parms.o_key, - t->parms.iph.tos & INET_DSCP_MASK, - t->parms.link); + rt = ip_route_output_key(t->net, &fl4); if (IS_ERR(rt)) return -EADDRNOTAVAIL; dev = rt->dst.dev; @@ -1059,16 +1066,15 @@ static int __net_init ipgre_init_net(struct net *net) return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } -static void __net_exit ipgre_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit ipgre_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, ipgre_net_id, &ipgre_link_ops, dev_to_kill); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, - .exit_batch_rtnl = ipgre_exit_batch_rtnl, + .exit_rtnl = ipgre_exit_rtnl, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1389,10 +1395,12 @@ ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[]) return 0; } -static int ipgre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipgre_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; @@ -1404,13 +1412,16 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; - return ip_tunnel_newlink(dev, tb, &p, fwmark); + return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p, + fwmark); } -static int erspan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int erspan_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; @@ -1422,7 +1433,8 @@ static int erspan_newlink(struct net *src_net, struct net_device *dev, err = erspan_netlink_parms(dev, data, tb, &p, &fwmark); if (err) return err; - return ip_tunnel_newlink(dev, tb, &p, fwmark); + return ip_tunnel_newlink(params->link_net ? 
: dev_net(dev), dev, tb, &p, + fwmark); } static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], @@ -1690,6 +1702,7 @@ static struct rtnl_link_ops erspan_link_ops __read_mostly = { struct net_device *gretap_fb_dev_create(struct net *net, const char *name, u8 name_assign_type) { + struct rtnl_newlink_params params = { .src_net = net }; struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; LIST_HEAD(list_kill); @@ -1697,6 +1710,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, int err; memset(&tb, 0, sizeof(tb)); + params.tb = tb; dev = rtnl_create_link(net, name, name_assign_type, &ipgre_tap_ops, tb, NULL); @@ -1707,7 +1721,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, t = netdev_priv(dev); t->collect_md = true; - err = ipgre_newlink(net, dev, tb, NULL, NULL); + err = ipgre_newlink(dev, ¶ms, NULL); if (err < 0) { free_netdev(dev); return ERR_PTR(err); @@ -1737,16 +1751,15 @@ static int __net_init ipgre_tap_init_net(struct net *net) return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } -static void __net_exit ipgre_tap_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit ipgre_tap_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops, - dev_to_kill); + ip_tunnel_delete_net(net, gre_tap_net_id, &ipgre_tap_ops, dev_to_kill); } static struct pernet_operations ipgre_tap_net_ops = { .init = ipgre_tap_init_net, - .exit_batch_rtnl = ipgre_tap_exit_batch_rtnl, + .exit_rtnl = ipgre_tap_exit_rtnl, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1757,16 +1770,15 @@ static int __net_init erspan_init_net(struct net *net) &erspan_link_ops, "erspan0"); } -static void __net_exit erspan_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) +static void __net_exit erspan_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, erspan_net_id, &erspan_link_ops, dev_to_kill); } static struct pernet_operations erspan_net_ops = { .init = erspan_init_net, - .exit_batch_rtnl = erspan_exit_batch_rtnl, + .exit_rtnl = erspan_exit_rtnl, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index f0a4dda246ab..30a5e9460d00 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -314,7 +314,7 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, int tcp_v4_early_demux(struct sk_buff *skb); int udp_v4_early_demux(struct sk_buff *skb); -static int ip_rcv_finish_core(struct net *net, struct sock *sk, +static int ip_rcv_finish_core(struct net *net, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) { @@ -442,7 +442,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) if (!skb) return NET_RX_SUCCESS; - ret = ip_rcv_finish_core(net, sk, skb, dev, NULL); + ret = ip_rcv_finish_core(net, skb, dev, NULL); if (ret != NET_RX_DROP) ret = dst_input(skb); return ret; @@ -589,8 +589,7 @@ static struct sk_buff *ip_extract_route_hint(const struct net *net, return skb; } -static void ip_list_rcv_finish(struct net *net, struct sock *sk, - struct list_head *head) +static void ip_list_rcv_finish(struct net *net, struct list_head *head) { struct sk_buff *skb, *next, *hint = NULL; struct dst_entry *curr_dst = NULL; 
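/*
 * Toy userspace model of the route-hint reuse in the surrounding
 * ip_list_rcv_finish()/ip_rcv_finish_core() hunks: when consecutive
 * packets in a receive batch share a destination, the previous
 * packet's route lookup result is reused instead of repeating the FIB
 * lookup. This is only a sketch of the idea; the names, the fake
 * lookup, and the integer "route id" are illustrative, not the
 * kernel's actual data structures.
 */
#include <stdio.h>

static int lookups;

static int fib_lookup_model(unsigned int daddr)
{
	lookups++;			/* count "expensive" lookups */
	return (int)(daddr % 7);	/* pretend route id */
}

int main(void)
{
	const unsigned int daddrs[] = { 1, 1, 1, 2, 2, 1, 1 };
	const int n = sizeof(daddrs) / sizeof(daddrs[0]);
	unsigned int hint = 0;
	int route = -1;

	for (int i = 0; i < n; i++) {
		if (route < 0 || daddrs[i] != hint) {
			hint = daddrs[i];
			route = fib_lookup_model(hint);
		}
		printf("pkt %d -> route %d\n", i, route);
	}
	printf("%d lookups for %d packets\n", lookups, n);	/* 3 for 7 */
	return 0;
}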
@@ -607,7 +606,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk, skb = l3mdev_ip_rcv(skb); if (!skb) continue; - if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP) + if (ip_rcv_finish_core(net, skb, dev, hint) == NET_RX_DROP) continue; dst = skb_dst(skb); @@ -633,7 +632,7 @@ static void ip_sublist_rcv(struct list_head *head, struct net_device *dev, { NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, head, dev, NULL, ip_rcv_finish); - ip_list_rcv_finish(net, NULL, head); + ip_list_rcv_finish(net, head); } /* Receive a list of IP packets */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 0065b1996c94..a2705d454fd6 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -75,7 +75,6 @@ #include <net/checksum.h> #include <net/gso.h> #include <net/inetpeer.h> -#include <net/inet_ecn.h> #include <net/lwtunnel.h> #include <net/inet_dscp.h> #include <linux/bpf-cgroup.h> @@ -478,24 +477,16 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, /* Make sure we can route this packet. */ rt = dst_rtable(__sk_dst_check(sk, 0)); if (!rt) { - __be32 daddr; + inet_sk_init_flowi4(inet, fl4); - /* Use correct destination address if we have options. */ - daddr = inet->inet_daddr; - if (inet_opt && inet_opt->opt.srr) - daddr = inet_opt->opt.faddr; + /* sctp_v4_xmit() uses its own DSCP value */ + fl4->flowi4_tos = tos & INET_DSCP_MASK; /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. */ - rt = ip_route_output_ports(net, fl4, sk, - daddr, inet->inet_saddr, - inet->inet_dport, - inet->inet_sport, - sk->sk_protocol, - tos & INET_DSCP_MASK, - sk->sk_bound_dev_if); + rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; sk_setup_caps(sk, &rt->dst); @@ -1023,7 +1014,8 @@ static int __ip_append_data(struct sock *sk, uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), + false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ @@ -1169,7 +1161,10 @@ alloc_new_skb: /* [!] NOTE: copy will be negative if pagedlen>0 * because then the equation reduces to -fraggap. 
*/ - if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { + if (copy > 0 && + INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, data + transhdrlen, offset, + copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; @@ -1213,8 +1208,9 @@ alloc_new_skb: unsigned int off; off = skb->len; - if (getfrag(from, skb_put(skb, copy), - offset, copy, off, skb) < 0) { + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, skb_put(skb, copy), + offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; goto error; @@ -1252,7 +1248,8 @@ alloc_new_skb: get_page(pfrag->page); } copy = min_t(int, copy, pfrag->size - pfrag->offset); - if (getfrag(from, + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, page_address(pfrag->page) + pfrag->offset, offset, copy, skb->len, skb) < 0) goto error_efault; @@ -1328,7 +1325,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, cork->ttl = ipc->ttl; cork->tos = ipc->tos; cork->mark = ipc->sockc.mark; - cork->priority = ipc->priority; + cork->priority = ipc->sockc.priority; cork->transmit_time = ipc->sockc.transmit_time; cork->tx_flags = 0; sock_tx_timestamp(sk, &ipc->sockc, &cork->tx_flags); @@ -1465,7 +1462,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, ip_options_build(skb, opt, cork->addr, rt); } - skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority); + skb->priority = cork->priority; skb->mark = cork->mark; if (sk_is_tcp(sk)) skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC); @@ -1643,7 +1640,7 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, if (IS_ERR(rt)) return; - inet_sk(sk)->tos = arg->tos & ~INET_ECN_MASK; + inet_sk(sk)->tos = arg->tos; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index cf377377b52d..6d9c5c20b1c4 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -128,20 +128,20 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb, static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) { - char *secdata; - u32 seclen, secid; + struct lsm_context ctx; + u32 secid; int err; err = security_socket_getpeersec_dgram(NULL, skb, &secid); if (err) return; - err = security_secid_to_secctx(secid, &secdata, &seclen); - if (err) + err = security_secid_to_secctx(secid, &ctx); + if (err < 0) return; - put_cmsg(msg, SOL_IP, SCM_SECURITY, seclen, secdata); - security_release_secctx(secdata, seclen); + put_cmsg(msg, SOL_IP, SCM_SECURITY, ctx.len, ctx.context); + security_release_secctx(&ctx); } static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) @@ -315,7 +315,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, if (val < 0 || val > 255) return -EINVAL; ipc->tos = val; - ipc->priority = rt_tos2priority(ipc->tos); + ipc->sockc.priority = rt_tos2priority(ipc->tos); break; case IP_PROTOCOL: if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 09b73acf037a..678b8f96e3e9 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -40,6 +40,7 @@ #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netdev_lock.h> #include <net/rtnetlink.h> #include <net/udp.h> #include <net/dst_metadata.h> @@ -242,11 +243,11 @@ static struct net_device *__ip_tunnel_create(struct net *net, if (parms->name[0]) { if 
(!dev_valid_name(parms->name)) goto failed; - strscpy(name, parms->name, IFNAMSIZ); + strscpy(name, parms->name); } else { if (strlen(ops->kind) > (IFNAMSIZ - 3)) goto failed; - strcpy(name, ops->kind); + strscpy(name, ops->kind); strcat(name, "%d"); } @@ -1162,7 +1163,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, * Allowing to move it to another netns is clearly unsafe. */ if (!IS_ERR(itn->fb_tunnel_dev)) { - itn->fb_tunnel_dev->netns_local = true; + itn->fb_tunnel_dev->netns_immutable = true; itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); itn->type = itn->fb_tunnel_dev->type; @@ -1173,13 +1174,16 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, } EXPORT_SYMBOL_GPL(ip_tunnel_init_net); -static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, - struct list_head *head, - struct rtnl_link_ops *ops) +void ip_tunnel_delete_net(struct net *net, unsigned int id, + struct rtnl_link_ops *ops, + struct list_head *head) { + struct ip_tunnel_net *itn = net_generic(net, id); struct net_device *dev, *aux; int h; + ASSERT_RTNL_NET(net); + for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == ops) unregister_netdevice_queue(dev, head); @@ -1197,27 +1201,13 @@ static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, unregister_netdevice_queue(t->dev, head); } } +EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); -void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, - struct rtnl_link_ops *ops, - struct list_head *dev_to_kill) -{ - struct ip_tunnel_net *itn; - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) { - itn = net_generic(net, id); - ip_tunnel_destroy(net, itn, dev_to_kill, ops); - } -} -EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); - -int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm_kern *p, __u32 fwmark) +int ip_tunnel_newlink(struct net *net, struct net_device *dev, + struct nlattr *tb[], struct ip_tunnel_parm_kern *p, + __u32 fwmark) { struct ip_tunnel *nt; - struct net *net = dev_net(dev); struct ip_tunnel_net *itn; int mtu; int err; @@ -1326,7 +1316,6 @@ int ip_tunnel_init(struct net_device *dev) } tunnel->dev = dev; - tunnel->net = dev_net(dev); strscpy(tunnel->parms.name, dev->name); iph->version = 4; iph->ihl = 5; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index a3676155be78..f65d2f727381 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -416,7 +416,7 @@ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, skb_dst_update_pmtu_no_confirm(skb, mtu); - if (!reply || skb->pkt_type == PACKET_HOST) + if (!reply) return 0; if (skb->protocol == htons(ETH_P_IP)) @@ -451,7 +451,7 @@ static const struct nla_policy geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = { [LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, [LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, - [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, + [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 }, }; static const struct nla_policy diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index f0b4419cef34..686e4f3d83aa 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -523,16 +523,15 @@ static int __net_init vti_init_net(struct net *net) return 0; } -static void __net_exit vti_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit 
vti_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, vti_net_id, &vti_link_ops, dev_to_kill); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, - .exit_batch_rtnl = vti_exit_batch_rtnl, + .exit_rtnl = vti_exit_rtnl, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -575,15 +574,18 @@ static void vti_netlink_parms(struct nlattr *data[], *fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } -static int vti_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int vti_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; struct ip_tunnel_parm_kern parms; + struct nlattr **tb = params->tb; __u32 fwmark = 0; vti_netlink_parms(data, &parms, &fwmark); - return ip_tunnel_newlink(dev, tb, &parms, fwmark); + return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, + &parms, fwmark); } static int vti_changelink(struct net_device *dev, struct nlattr *tb[], diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index dc0db5895e0e..3e03af073a1c 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -436,11 +436,13 @@ static void ipip_netlink_parms(struct nlattr *data[], *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } -static int ipip_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipip_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; @@ -453,7 +455,8 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev, } ipip_netlink_parms(data, &p, &t->collect_md, &fwmark); - return ip_tunnel_newlink(dev, tb, &p, fwmark); + return ip_tunnel_newlink(params->link_net ? 
: dev_net(dev), dev, tb, &p, + fwmark); } static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], @@ -601,16 +604,15 @@ static int __net_init ipip_init_net(struct net *net) return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } -static void __net_exit ipip_exit_batch_rtnl(struct list_head *list_net, - struct list_head *dev_to_kill) +static void __net_exit ipip_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops, - dev_to_kill); + ip_tunnel_delete_net(net, ipip_net_id, &ipip_link_ops, dev_to_kill); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, - .exit_batch_rtnl = ipip_exit_batch_rtnl, + .exit_rtnl = ipip_exit_rtnl, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 99d8faa508e5..2ff2f79c7351 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -120,11 +120,6 @@ static void ipmr_expire_process(struct timer_list *t); lockdep_rtnl_is_held() || \ list_empty(&net->ipv4.mr_tables)) -static bool ipmr_can_free_table(struct net *net) -{ - return !check_net(net) || !net_initialized(net); -} - static struct mr_table *ipmr_mr_table_iter(struct net *net, struct mr_table *mrt) { @@ -317,11 +312,6 @@ EXPORT_SYMBOL(ipmr_rule_default); #define ipmr_for_each_table(mrt, net) \ for (mrt = net->ipv4.mrt; mrt; mrt = NULL) -static bool ipmr_can_free_table(struct net *net) -{ - return !check_net(net); -} - static struct mr_table *ipmr_mr_table_iter(struct net *net, struct mr_table *mrt) { @@ -437,7 +427,7 @@ static void ipmr_free_table(struct mr_table *mrt) { struct net *net = read_pnet(&mrt->net); - WARN_ON_ONCE(!ipmr_can_free_table(net)); + WARN_ON_ONCE(!mr_can_free_table(net)); timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | @@ -563,7 +553,7 @@ static void reg_vif_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->netdev_ops = ®_vif_netdev_ops; dev->needs_free_netdev = true; - dev->netns_local = true; + dev->netns_immutable = true; } static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) @@ -831,7 +821,7 @@ static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, cache->mfc_un.res.maxvif = vifi + 1; } } - cache->mfc_un.res.lastuse = jiffies; + WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies); } static int vif_add(struct net *net, struct mr_table *mrt, @@ -1289,7 +1279,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, } } if (list_empty(&mrt->mfc_unres_queue)) - del_timer(&mrt->ipmr_expire_timer); + timer_delete(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); if (found) { @@ -1681,9 +1671,9 @@ int ipmr_ioctl(struct sock *sk, int cmd, void *arg) rcu_read_lock(); c = ipmr_cache_find(mrt, sr->src.s_addr, sr->grp.s_addr); if (c) { - sr->pktcnt = c->_c.mfc_un.res.pkt; - sr->bytecnt = c->_c.mfc_un.res.bytes; - sr->wrong_if = c->_c.mfc_un.res.wrong_if; + sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); + sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); + sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); return 0; } @@ -1753,9 +1743,9 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) rcu_read_lock(); c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); if (c) { - sr.pktcnt = c->_c.mfc_un.res.pkt; - sr.bytecnt = c->_c.mfc_un.res.bytes; - sr.wrong_if = c->_c.mfc_un.res.wrong_if; + sr.pktcnt = 
atomic_long_read(&c->_c.mfc_un.res.pkt); + sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); + sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) @@ -1988,9 +1978,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, int vif, ct; vif = c->_c.mfc_parent; - c->_c.mfc_un.res.pkt++; - c->_c.mfc_un.res.bytes += skb->len; - c->_c.mfc_un.res.lastuse = jiffies; + atomic_long_inc(&c->_c.mfc_un.res.pkt); + atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); + WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) { struct mfc_cache *cache_proxy; @@ -2021,7 +2011,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt, goto dont_forward; } - c->_c.mfc_un.res.wrong_if++; + atomic_long_inc(&c->_c.mfc_un.res.wrong_if); if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, @@ -2511,7 +2501,8 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for multicast route get request"); return -EINVAL; } @@ -2520,7 +2511,6 @@ static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || @@ -2836,7 +2826,8 @@ static int ipmr_valid_dumplink(const struct nlmsghdr *nlh, { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump"); return -EINVAL; } @@ -2846,7 +2837,6 @@ static int ipmr_valid_dumplink(const struct nlmsghdr *nlh, return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request"); @@ -3029,9 +3019,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", - mfc->_c.mfc_un.res.pkt, - mfc->_c.mfc_un.res.bytes, - mfc->_c.mfc_un.res.wrong_if); + atomic_long_read(&mfc->_c.mfc_un.res.pkt), + atomic_long_read(&mfc->_c.mfc_un.res.bytes), + atomic_long_read(&mfc->_c.mfc_un.res.wrong_if)); for (n = mfc->_c.mfc_un.res.minvif; n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index f0af12a2f70b..28d77d454d44 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -263,9 +263,9 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, lastuse = READ_ONCE(c->mfc_un.res.lastuse); lastuse = time_after_eq(jiffies, lastuse) ? 
jiffies - lastuse : 0; - mfcs.mfcs_packets = c->mfc_un.res.pkt; - mfcs.mfcs_bytes = c->mfc_un.res.bytes; - mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; + mfcs.mfcs_packets = atomic_long_read(&c->mfc_un.res.pkt); + mfcs.mfcs_bytes = atomic_long_read(&c->mfc_un.res.bytes); + mfcs.mfcs_wrong_if = atomic_long_read(&c->mfc_un.res.wrong_if); if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), RTA_PAD)) @@ -330,9 +330,6 @@ next_entry: list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { if (e < s_e) goto next_entry2; - if (filter->dev && - !mr_mfc_uses_dev(mrt, mfc, filter->dev)) - goto next_entry2; err = fill(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 3d101613f27f..23c8deff8095 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -270,7 +270,7 @@ ipt_do_table(void *priv, * but it is no problem since absolute verdict is issued by these. */ if (static_key_false(&xt_tee_enabled)) - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + jumpstack += private->stacksize * current->in_nf_duplicate; e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index 25e1e8eb18dd..ed08fb78cfa8 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -54,7 +54,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, struct iphdr *iph; local_bh_disable(); - if (this_cpu_read(nf_skb_duplicated)) + if (current->in_nf_duplicate) goto out; /* * Copy the skb, and route the copy. Will later return %XT_CONTINUE for @@ -86,9 +86,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, --iph->ttl; if (nf_dup_ipv4_route(net, skb, gw, oif)) { - __this_cpu_write(nf_skb_duplicated, true); + current->in_nf_duplicate = true; ip_local_out(net, skb->sk, skb); - __this_cpu_write(nf_skb_duplicated, false); + current->in_nf_duplicate = false; } else { kfree_skb(skb); } diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 625adbc42037..7e7c49535e3f 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -50,7 +50,12 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs, else addr = iph->saddr; - *dst = inet_dev_addr_type(nft_net(pkt), dev, addr); + if (priv->flags & (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)) { + *dst = inet_dev_addr_type(nft_net(pkt), dev, addr); + return; + } + + *dst = inet_addr_type_dev_table(nft_net(pkt), pkt->skb->dev, addr); } EXPORT_SYMBOL_GPL(nft_fib4_eval_type); @@ -65,12 +70,17 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, struct flowi4 fl4 = { .flowi4_scope = RT_SCOPE_UNIVERSE, .flowi4_iif = LOOPBACK_IFINDEX, + .flowi4_proto = pkt->tprot, .flowi4_uid = sock_net_uid(nft_net(pkt), NULL), - .flowi4_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)), }; const struct net_device *oif; const struct net_device *found; + if (nft_fib_can_skip(pkt)) { + nft_fib_store_result(dest, priv, nft_in(pkt)); + return; + } + /* * Do not set flowi4_oif, it restricts results (for example, asking * for oif 3 will get RTN_UNICAST result even if the daddr exits @@ -85,11 +95,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, else oif = NULL; - if (nft_hook(pkt) == NF_INET_PRE_ROUTING 
&& - nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { - nft_fib_store_result(dest, priv, nft_in(pkt)); - return; - } + fl4.flowi4_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, oif); iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph); if (!iph) { diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 09a3d73b45ba..4397e89d3123 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -541,6 +541,7 @@ static struct nexthop *nexthop_alloc(void) INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); INIT_LIST_HEAD(&nh->fdb_list); + spin_lock_init(&nh->lock); } return nh; } @@ -1272,10 +1273,8 @@ static int nh_check_attr_group(struct net *net, u16 nh_grp_type, struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); - u8 nh_family = AF_UNSPEC; struct nexthop_grp *nhg; unsigned int i, j; - u8 nhg_fdb = 0; if (!len || len & (sizeof(struct nexthop_grp) - 1)) { NL_SET_ERR_MSG(extack, @@ -1307,10 +1306,41 @@ static int nh_check_attr_group(struct net *net, } } - if (tb[NHA_FDB]) - nhg_fdb = 1; nhg = nla_data(tb[NHA_GROUP]); - for (i = 0; i < len; ++i) { + for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { + if (!tb[i]) + continue; + switch (i) { + case NHA_HW_STATS_ENABLE: + case NHA_FDB: + continue; + case NHA_RES_GROUP: + if (nh_grp_type == NEXTHOP_GRP_TYPE_RES) + continue; + break; + } + NL_SET_ERR_MSG(extack, + "No other attributes can be set in nexthop groups"); + return -EINVAL; + } + + return 0; +} + +static int nh_check_attr_group_rtnl(struct net *net, struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + u8 nh_family = AF_UNSPEC; + struct nexthop_grp *nhg; + unsigned int len; + unsigned int i; + u8 nhg_fdb; + + len = nla_len(tb[NHA_GROUP]) / sizeof(*nhg); + nhg = nla_data(tb[NHA_GROUP]); + nhg_fdb = !!tb[NHA_FDB]; + + for (i = 0; i < len; i++) { struct nexthop *nh; bool is_fdb_nh; @@ -1330,22 +1360,6 @@ static int nh_check_attr_group(struct net *net, return -EINVAL; } } - for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { - if (!tb[i]) - continue; - switch (i) { - case NHA_HW_STATS_ENABLE: - case NHA_FDB: - continue; - case NHA_RES_GROUP: - if (nh_grp_type == NEXTHOP_GRP_TYPE_RES) - continue; - break; - } - NL_SET_ERR_MSG(extack, - "No other attributes can be set in nexthop groups"); - return -EINVAL; - } return 0; } @@ -1542,12 +1556,12 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, if (nh->is_group) { struct nh_group *nhg; - nhg = rtnl_dereference(nh->nh_grp); + nhg = rcu_dereference_rtnl(nh->nh_grp); if (nhg->has_v4) goto no_v4_nh; is_fdb_nh = nhg->fdb_nh; } else { - nhi = rtnl_dereference(nh->nh_info); + nhi = rcu_dereference_rtnl(nh->nh_info); if (nhi->family == AF_INET) goto no_v4_nh; is_fdb_nh = nhi->fdb_nh; @@ -2105,7 +2119,7 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) /* not called for nexthop replace */ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) { - struct fib6_info *f6i, *tmp; + struct fib6_info *f6i; bool do_flush = false; struct fib_info *fi; @@ -2116,13 +2130,24 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) if (do_flush) fib_flush(net); - /* ip6_del_rt removes the entry from this list hence the _safe */ - list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { + spin_lock_bh(&nh->lock); + + nh->dead = true; + + while (!list_empty(&nh->f6i_list)) { + f6i = list_first_entry(&nh->f6i_list, typeof(*f6i), nh_list); + /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); + + spin_unlock_bh(&nh->lock); 
ipv6_stub->ip6_del_rt(net, f6i, !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)); + + spin_lock_bh(&nh->lock); } + + spin_unlock_bh(&nh->lock); } static void __remove_nexthop(struct net *net, struct nexthop *nh, @@ -2679,9 +2704,6 @@ static struct nexthop *nexthop_create_group(struct net *net, int err; int i; - if (WARN_ON(!num_nh)) - return ERR_PTR(-EINVAL); - nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM); @@ -2915,11 +2937,6 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, struct nexthop *nh; int err; - if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) { - NL_SET_ERR_MSG(extack, "Replace requires nexthop id"); - return ERR_PTR(-EINVAL); - } - if (!cfg->nh_id) { cfg->nh_id = nh_find_unused_id(net); if (!cfg->nh_id) { @@ -3016,19 +3033,13 @@ static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg, } static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, - struct nlmsghdr *nlh, struct nh_config *cfg, + struct nlmsghdr *nlh, struct nlattr **tb, + struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); - struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)]; int err; - err = nlmsg_parse(nlh, sizeof(*nhm), tb, - ARRAY_SIZE(rtm_nh_policy_new) - 1, - rtm_nh_policy_new, extack); - if (err < 0) - return err; - err = -EINVAL; if (nhm->resvd || nhm->nh_scope) { NL_SET_ERR_MSG(extack, "Invalid values in ancillary header"); @@ -3093,7 +3104,8 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, NL_SET_ERR_MSG(extack, "Invalid group type"); goto out; } - err = nh_check_attr_group(net, tb, ARRAY_SIZE(tb), + + err = nh_check_attr_group(net, tb, ARRAY_SIZE(rtm_nh_policy_new), cfg->nh_grp_type, extack); if (err) goto out; @@ -3126,25 +3138,6 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, goto out; } - if (!cfg->nh_fdb && tb[NHA_OIF]) { - cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); - if (cfg->nh_ifindex) - cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); - - if (!cfg->dev) { - NL_SET_ERR_MSG(extack, "Invalid device index"); - goto out; - } else if (!(cfg->dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, "Nexthop device is not up"); - err = -ENETDOWN; - goto out; - } else if (!netif_carrier_ok(cfg->dev)) { - NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); - err = -ENETDOWN; - goto out; - } - } - err = -EINVAL; if (tb[NHA_GATEWAY]) { struct nlattr *gwa = tb[NHA_GATEWAY]; @@ -3206,22 +3199,76 @@ out: return err; } +static int rtm_to_nh_config_rtnl(struct net *net, struct nlattr **tb, + struct nh_config *cfg, + struct netlink_ext_ack *extack) +{ + if (tb[NHA_GROUP]) + return nh_check_attr_group_rtnl(net, tb, extack); + + if (tb[NHA_OIF]) { + cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); + if (cfg->nh_ifindex) + cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); + + if (!cfg->dev) { + NL_SET_ERR_MSG(extack, "Invalid device index"); + return -EINVAL; + } + + if (!(cfg->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + return -ENETDOWN; + } + + if (!netif_carrier_ok(cfg->dev)) { + NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); + return -ENETDOWN; + } + } + + return 0; +} + /* rtnl */ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)]; struct net *net = sock_net(skb->sk); struct nh_config cfg; struct nexthop *nh; int err; - err = rtm_to_nh_config(net, skb, nlh, &cfg, extack); - if (!err) { - nh = 
nexthop_add(net, &cfg, extack); - if (IS_ERR(nh)) - err = PTR_ERR(nh); + err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, + ARRAY_SIZE(rtm_nh_policy_new) - 1, + rtm_nh_policy_new, extack); + if (err < 0) + goto out; + + err = rtm_to_nh_config(net, skb, nlh, tb, &cfg, extack); + if (err) + goto out; + + if (cfg.nlflags & NLM_F_REPLACE && !cfg.nh_id) { + NL_SET_ERR_MSG(extack, "Replace requires nexthop id"); + err = -EINVAL; + goto out; } + rtnl_net_lock(net); + + err = rtm_to_nh_config_rtnl(net, tb, &cfg, extack); + if (err) + goto unlock; + + nh = nexthop_add(net, &cfg, extack); + if (IS_ERR(nh)) + err = PTR_ERR(nh); + +unlock: + rtnl_net_unlock(net); +out: return err; } @@ -3278,13 +3325,17 @@ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; + rtnl_net_lock(net); + nh = nexthop_find_by_id(net, id); - if (!nh) - return -ENOENT; + if (nh) + remove_nexthop(net, nh, &nlinfo); + else + err = -ENOENT; - remove_nexthop(net, nh, &nlinfo); + rtnl_net_unlock(net); - return 0; + return err; } /* rtnl */ @@ -4000,14 +4051,11 @@ out: } EXPORT_SYMBOL(nexthop_res_grp_activity_update); -static void __net_exit nexthop_net_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) +static void __net_exit nexthop_net_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - flush_all_nexthops(net); + ASSERT_RTNL_NET(net); + flush_all_nexthops(net); } static void __net_exit nexthop_net_exit(struct net *net) @@ -4032,22 +4080,24 @@ static int __net_init nexthop_net_init(struct net *net) static struct pernet_operations nexthop_net_ops = { .init = nexthop_net_init, .exit = nexthop_net_exit, - .exit_batch_rtnl = nexthop_net_exit_batch_rtnl, + .exit_rtnl = nexthop_net_exit_rtnl, }; static const struct rtnl_msg_handler nexthop_rtnl_msg_handlers[] __initconst = { - {.msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop}, - {.msgtype = RTM_DELNEXTHOP, .doit = rtm_del_nexthop}, + {.msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop, + .flags = RTNL_FLAG_DOIT_PERNET}, + {.msgtype = RTM_DELNEXTHOP, .doit = rtm_del_nexthop, + .flags = RTNL_FLAG_DOIT_PERNET}, {.msgtype = RTM_GETNEXTHOP, .doit = rtm_get_nexthop, .dumpit = rtm_dump_nexthop}, {.msgtype = RTM_GETNEXTHOPBUCKET, .doit = rtm_get_nexthop_bucket, .dumpit = rtm_dump_nexthop_bucket}, {.protocol = PF_INET, .msgtype = RTM_NEWNEXTHOP, - .doit = rtm_new_nexthop}, + .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETNEXTHOP, .dumpit = rtm_dump_nexthop}, {.protocol = PF_INET6, .msgtype = RTM_NEWNEXTHOP, - .doit = rtm_new_nexthop}, + .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET6, .msgtype = RTM_GETNEXTHOP, .dumpit = rtm_dump_nexthop}, }; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 619ddc087957..c14baa6589c7 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -705,7 +705,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ip_options_data opt_copy; int free = 0; __be32 saddr, daddr, faddr; - u8 tos, scope; + u8 scope; int err; pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); @@ -768,7 +768,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } faddr = ipc.opt->opt.faddr; } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); if (ipv4_is_multicast(daddr)) { @@ -779,7 +778,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr 
*msg, size_t len) } else if (!ipc.oif) ipc.oif = READ_ONCE(inet->uc_index); - flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, sk->sk_uid); @@ -966,10 +966,9 @@ EXPORT_SYMBOL_GPL(ping_queue_rcv_skb); enum skb_drop_reason ping_rcv(struct sk_buff *skb) { - enum skb_drop_reason reason = SKB_DROP_REASON_NO_SOCKET; - struct sock *sk; struct net *net = dev_net(skb->dev); struct icmphdr *icmph = icmp_hdr(skb); + struct sock *sk; /* We assume the packet has already been checked by icmp_rcv */ @@ -980,20 +979,11 @@ enum skb_drop_reason ping_rcv(struct sk_buff *skb) skb_push(skb, skb->data - (u8 *)icmph); sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); - if (sk) { - struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); - - pr_debug("rcv on socket %p\n", sk); - if (skb2) - reason = __ping_queue_rcv_skb(sk, skb2); - else - reason = SKB_DROP_REASON_NOMEM; - } - - if (reason) - pr_debug("no socket, dropping\n"); + if (sk) + return __ping_queue_rcv_skb(sk, skb); - return reason; + kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET); + return SKB_DROP_REASON_NO_SOCKET; } EXPORT_SYMBOL_GPL(ping_rcv); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 40053a02bae1..ea2f01584379 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -189,6 +189,9 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), + SNMP_MIB_ITEM("TSEcrRejected", LINUX_MIB_TSECRREJECTED), + SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK), + SNMP_MIB_ITEM("PAWSTimewait", LINUX_MIB_PAWS_TW_REJECTED), SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED), SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 0e9e01967ec9..6aace4d55733 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -358,7 +358,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, skb_reserve(skb, hlen); skb->protocol = htons(ETH_P_IP); - skb->priority = READ_ONCE(sk->sk_priority); + skb->priority = sockc->priority; skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); skb_dst_set(skb, &rt->dst); @@ -486,7 +486,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ipcm_cookie ipc; struct rtable *rt = NULL; struct flowi4 fl4; - u8 tos, scope; + u8 scope; int free = 0; __be32 daddr; __be32 saddr; @@ -581,7 +581,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = ipc.opt->opt.faddr; } } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); uc_index = READ_ONCE(inet->uc_index); @@ -606,7 +605,8 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } } - flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ? 
FLOWI_FLAG_KNOWN_NH : 0), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e1564b95fab0..fccb05fb3a79 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -189,7 +189,11 @@ const __u8 ip_tos2prio[16] = { EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); +#ifndef CONFIG_PREEMPT_RT #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) +#else +#define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field) +#endif #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) @@ -390,7 +394,13 @@ static inline int ip_rt_proc_init(void) static inline bool rt_is_expired(const struct rtable *rth) { - return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev)); + bool res; + + rcu_read_lock(); + res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev)); + rcu_read_unlock(); + + return res; } void rt_cache_flush(struct net *net) @@ -870,11 +880,11 @@ void ip_rt_send_redirect(struct sk_buff *skb) } log_martians = IN_DEV_LOG_MARTIANS(in_dev); vif = l3mdev_master_ifindex_rcu(rt->dst.dev); - rcu_read_unlock(); net = dev_net(rt->dst.dev); - peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif); if (!peer) { + rcu_read_unlock(); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt_nexthop(rt, ip_hdr(skb)->daddr)); return; @@ -893,7 +903,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) */ if (peer->n_redirects >= ip_rt_redirect_number) { peer->rate_last = jiffies; - goto out_put_peer; + goto out_unlock; } /* Check for load limit; set rate_last to the latest sent @@ -914,8 +924,8 @@ void ip_rt_send_redirect(struct sk_buff *skb) &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); } -out_put_peer: - inet_putpeer(peer); +out_unlock: + rcu_read_unlock(); } static int ip_error(struct sk_buff *skb) @@ -975,9 +985,9 @@ static int ip_error(struct sk_buff *skb) break; } + rcu_read_lock(); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, - l3mdev_master_ifindex(skb->dev), 1); - + l3mdev_master_ifindex_rcu(skb->dev)); send = true; if (peer) { now = jiffies; @@ -989,8 +999,9 @@ static int ip_error(struct sk_buff *skb) peer->rate_tokens -= ip_rt_error_cost; else send = false; - inet_putpeer(peer); } + rcu_read_unlock(); + if (send) icmp_send(skb, ICMP_DEST_UNREACH, code, 0); @@ -1001,9 +1012,9 @@ out: kfree_skb_reason(skb, reason); static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; - struct net *net = dev_net(dst->dev); struct fib_result res; bool lock = false; + struct net *net; u32 old_mtu; if (ip_mtu_locked(dst)) @@ -1013,6 +1024,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (old_mtu < mtu) return; + rcu_read_lock(); + net = dev_net_rcu(dst->dev); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); @@ -1020,9 +1033,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (rt->rt_pmtu == mtu && !lock && time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2)) - return; + goto out; - rcu_read_lock(); if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; @@ -1036,14 +1048,14 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } - rcu_read_unlock(); - return; + goto out; } #endif /* 
@@ -1306,10 +1318,15 @@ static void set_class_tag(struct rtable *rt, u32 tag)
 
 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
 {
-	struct net *net = dev_net(dst->dev);
 	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
-	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
-				    net->ipv4.ip_rt_min_advmss);
+	unsigned int advmss;
+	struct net *net;
+
+	rcu_read_lock();
+	net = dev_net_rcu(dst->dev);
+	advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
+		       net->ipv4.ip_rt_min_advmss);
+	rcu_read_unlock();
 
 	return min(advmss, IPV4_MAX_PMTU - header_size);
 }
@@ -2024,8 +2041,12 @@ static u32 fib_multipath_custom_hash_fl4(const struct net *net,
 		hash_keys.addrs.v4addrs.dst = fl4->daddr;
 	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
 		hash_keys.basic.ip_proto = fl4->flowi4_proto;
-	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
-		hash_keys.ports.src = fl4->fl4_sport;
+	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) {
+		if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT)
+			hash_keys.ports.src = (__force __be16)get_random_u16();
+		else
+			hash_keys.ports.src = fl4->fl4_sport;
+	}
 	if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
 		hash_keys.ports.dst = fl4->fl4_dport;
 
@@ -2080,7 +2101,10 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
 			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
 			hash_keys.addrs.v4addrs.src = fl4->saddr;
 			hash_keys.addrs.v4addrs.dst = fl4->daddr;
-			hash_keys.ports.src = fl4->fl4_sport;
+			if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT)
+				hash_keys.ports.src = (__force __be16)get_random_u16();
+			else
+				hash_keys.ports.src = fl4->fl4_sport;
 			hash_keys.ports.dst = fl4->fl4_dport;
 			hash_keys.basic.ip_proto = fl4->flowi4_proto;
 		}
@@ -2141,7 +2165,7 @@ ip_mkroute_input(struct sk_buff *skb, struct fib_result *res,
 	if (res->fi && fib_info_num_path(res->fi) > 1) {
 		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
 
-		fib_select_multipath(res, h);
+		fib_select_multipath(res, h, NULL);
 		IPCB(skb)->flags |= IPSKB_MULTIPATH;
 	}
 #endif
@@ -2686,8 +2710,7 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
 
 	if (fl4->saddr) {
 		if (ipv4_is_multicast(fl4->saddr) ||
-		    ipv4_is_lbcast(fl4->saddr) ||
-		    ipv4_is_zeronet(fl4->saddr)) {
+		    ipv4_is_lbcast(fl4->saddr)) {
 			rth = ERR_PTR(-EINVAL);
 			goto out;
 		}
@@ -3193,7 +3216,8 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
 	struct rtmsg *rtm;
 	int i, err;
 
-	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
+	rtm = nlmsg_payload(nlh, sizeof(*rtm));
+	if (!rtm) {
 		NL_SET_ERR_MSG(extack,
 			       "ipv4: Invalid header for route get request");
 		return -EINVAL;
@@ -3203,7 +3227,6 @@ static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
 		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
 					      rtm_ipv4_policy, extack);
 
-	rtm = nlmsg_data(nlh);
 	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
 	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
 	    rtm->rtm_table || rtm->rtm_protocol ||
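
The FLOWI_FLAG_ANY_SPORT hunks above substitute a random 16-bit value for the source port in the multipath hash, so lookups for flows that have not yet committed to a port can spread across nexthops. A toy model of the effect (the mix function below is illustrative, not the kernel's flow dissector hash):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static uint32_t mix(uint32_t h, uint32_t v)
{
	h ^= v;
	h *= 0x9e3779b1;	/* Fibonacci hashing constant */
	return h ^ (h >> 16);
}

static int pick_path(uint32_t saddr, uint32_t daddr, uint16_t sport,
		     uint16_t dport, int npaths)
{
	return mix(mix(mix(saddr, daddr), sport), dport) % npaths;
}

int main(void)
{
	for (int i = 0; i < 4; i++) {
		uint16_t random_sport = (uint16_t)rand();

		printf("fixed sport -> path %d, random sport -> path %d\n",
		       pick_path(0x0a000001, 0x0a000002, 40000, 443, 2),
		       pick_path(0x0a000001, 0x0a000002, random_sport, 443, 2));
	}
	return 0;
}
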
@@ -3269,6 +3292,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	struct flowi4 fl4 = {};
 	__be32 dst = 0;
 	__be32 src = 0;
+	dscp_t dscp;
 	kuid_t uid;
 	u32 iif;
 	int err;
@@ -3283,6 +3307,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	dst = nla_get_in_addr_default(tb[RTA_DST], 0);
 	iif = nla_get_u32_default(tb[RTA_IIF], 0);
 	mark = nla_get_u32_default(tb[RTA_MARK], 0);
+	dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
 	if (tb[RTA_UID])
 		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
 	else
@@ -3307,7 +3332,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	fl4.daddr = dst;
 	fl4.saddr = src;
-	fl4.flowi4_tos = rtm->rtm_tos & INET_DSCP_MASK;
+	fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
 	fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0);
 	fl4.flowi4_mark = mark;
 	fl4.flowi4_uid = uid;
@@ -3331,9 +3356,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 		fl4.flowi4_iif = iif; /* for rt_fill_info */
 		skb->dev	= dev;
 		skb->mark	= mark;
-		err = ip_route_input_rcu(skb, dst, src,
-					 inet_dsfield_to_dscp(rtm->rtm_tos),
-					 dev, &res) ? -EINVAL : 0;
+		err = ip_route_input_rcu(skb, dst, src, dscp, dev,
+					 &res) ? -EINVAL : 0;
 
 		rt = skb_rtable(skb);
 		if (err == 0 && rt->dst.error)
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 1948d15f1f28..5459a78b9809 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -222,7 +222,7 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 
 	return NULL;
 }
-EXPORT_SYMBOL(tcp_get_cookie_sock);
+EXPORT_IPV6_MOD(tcp_get_cookie_sock);
 
 /*
  * when syncookies are in effect and tcp timestamps are enabled we stored
@@ -259,7 +259,7 @@ bool cookie_timestamp_decode(const struct net *net,
 
 	return READ_ONCE(net->ipv4.sysctl_tcp_window_scaling) != 0;
 }
-EXPORT_SYMBOL(cookie_timestamp_decode);
+EXPORT_IPV6_MOD(cookie_timestamp_decode);
 
 static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb,
 				 struct request_sock *req)
@@ -279,6 +279,7 @@ static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb,
 	ireq->smc_ok = 0;
 
 	treq->snt_synack = 0;
+	treq->snt_tsval_first = 0;
 	treq->tfo_listener = false;
 	treq->txhash = net_tx_rndhash();
 	treq->rcv_isn = ntohl(th->seq) - 1;
@@ -310,7 +311,7 @@ struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb)
 
 	return req;
 }
-EXPORT_SYMBOL_GPL(cookie_bpf_check);
+EXPORT_IPV6_MOD_GPL(cookie_bpf_check);
 #endif
 
 struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
@@ -351,7 +352,7 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
 
 	return req;
 }
-EXPORT_SYMBOL_GPL(cookie_tcp_reqsk_alloc);
+EXPORT_IPV6_MOD_GPL(cookie_tcp_reqsk_alloc);
 
 static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk,
 					     struct sk_buff *skb)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a79b2a52ce01..3a43010d726f 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -28,6 +28,7 @@ static int tcp_adv_win_scale_max = 31;
 static int tcp_app_win_max = 31;
 static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS;
 static int tcp_min_snd_mss_max = 65535;
+static int tcp_rto_max_max = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
 static int ip_privileged_port_min;
 static int ip_privileged_port_max = 65535;
 static int ip_ttl_min = 1;
@@ -45,6 +46,7 @@ static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
 static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
 static int tcp_plb_max_rounds = 31;
 static int tcp_plb_max_cong_thresh = 256;
+static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
 
 /* obsolete */
 static int sysctl_tcp_low_latency __read_mostly;
@@ -1066,6 +1068,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.extra2		= SYSCTL_TWO,
 	},
 	{
+		.procname	= "tcp_tw_reuse_delay",
+		.data		= &init_net.ipv4.sysctl_tcp_tw_reuse_delay,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= &tcp_tw_reuse_delay_max,
+	},
+	{
 		.procname	= "tcp_max_syn_backlog",
 		.data		= &init_net.ipv4.sysctl_max_syn_backlog,
 		.maxlen		= sizeof(int),
@@ -1573,6 +1584,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ONE,
 	},
+	{
+		.procname	= "tcp_rto_max_ms",
+		.data		= &init_net.ipv4.sysctl_tcp_rto_max_ms,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE_THOUSAND,
+		.extra2		= &tcp_rto_max_max,
+	},
 };
 
 static __net_init int ipv4_sysctl_init_net(struct net *net)
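
The two table entries above add per-netns knobs: tcp_tw_reuse_delay (milliseconds, 1 up to TCP_PAWS_MSL seconds) and tcp_rto_max_ms (at least 1000 ms). A minimal userspace sketch that sets both, assuming a kernel carrying this series and root privileges:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_sysctl(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, val, strlen(val)) < 0) {
		perror(path);
		close(fd);
		return -1;
	}
	close(fd);
	return 0;
}

int main(void)
{
	write_sysctl("/proc/sys/net/ipv4/tcp_tw_reuse_delay", "1000");	/* 1s */
	write_sysctl("/proc/sys/net/ipv4/tcp_rto_max_ms", "30000");	/* 30s cap */
	return 0;
}
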
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0d704bda6c41..f64f8276a73c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -300,10 +300,10 @@ DEFINE_PER_CPU(u32, tcp_tw_isn);
 EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn);
 
 long sysctl_tcp_mem[3] __read_mostly;
-EXPORT_SYMBOL(sysctl_tcp_mem);
+EXPORT_IPV6_MOD(sysctl_tcp_mem);
 
 atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp;	/* Current allocated memory. */
-EXPORT_SYMBOL(tcp_memory_allocated);
+EXPORT_IPV6_MOD(tcp_memory_allocated);
 DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
 EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);
 
@@ -316,7 +316,7 @@ EXPORT_SYMBOL(tcp_have_smc);
  * Current number of TCP sockets.
  */
 struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
-EXPORT_SYMBOL(tcp_sockets_allocated);
+EXPORT_IPV6_MOD(tcp_sockets_allocated);
 
 /*
  * TCP splice context
@@ -349,7 +349,7 @@ void tcp_enter_memory_pressure(struct sock *sk)
 	if (!cmpxchg(&tcp_memory_pressure, 0, val))
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
 }
-EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);
+EXPORT_IPV6_MOD_GPL(tcp_enter_memory_pressure);
 
 void tcp_leave_memory_pressure(struct sock *sk)
 {
@@ -362,7 +362,7 @@ void tcp_leave_memory_pressure(struct sock *sk)
 		NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
 			      jiffies_to_msecs(jiffies - val));
 }
-EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);
+EXPORT_IPV6_MOD_GPL(tcp_leave_memory_pressure);
 
 /* Convert seconds to retransmits based on initial and max timeout */
 static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
@@ -423,7 +423,7 @@ void tcp_init_sock(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
-	int rto_min_us;
+	int rto_min_us, rto_max_ms;
 
 	tp->out_of_order_queue = RB_ROOT;
 	sk->tcp_rtx_queue = RB_ROOT;
@@ -432,6 +432,10 @@ void tcp_init_sock(struct sock *sk)
 	INIT_LIST_HEAD(&tp->tsorted_sent_queue);
 
 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
+
+	rto_max_ms = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_max_ms);
+	icsk->icsk_rto_max = msecs_to_jiffies(rto_max_ms);
+
 	rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us);
 	icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us);
 	icsk->icsk_delack_max = TCP_DELACK_MAX;
@@ -475,7 +479,7 @@ void tcp_init_sock(struct sock *sk)
 	sk_sockets_allocated_inc(sk);
 	xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1);
 }
-EXPORT_SYMBOL(tcp_init_sock);
+EXPORT_IPV6_MOD(tcp_init_sock);
 
 static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
 {
@@ -488,10 +492,14 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
 
 		sock_tx_timestamp(sk, sockc, &shinfo->tx_flags);
 		if (tsflags & SOF_TIMESTAMPING_TX_ACK)
-			tcb->txstamp_ack = 1;
+			tcb->txstamp_ack |= TSTAMP_ACK_SK;
 		if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
 			shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
 	}
+
+	if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) &&
+	    SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING) && skb)
+		bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB);
 }
 
 static bool tcp_stream_is_readable(struct sock *sk, int target)
@@ -660,7 +668,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg)
 	*karg = answ;
 	return 0;
 }
-EXPORT_SYMBOL(tcp_ioctl);
+EXPORT_IPV6_MOD(tcp_ioctl);
 
 void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
 {
@@ -876,7 +884,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 
 	return ret;
 }
-EXPORT_SYMBOL(tcp_splice_read);
+EXPORT_IPV6_MOD(tcp_splice_read);
 
 struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
 				     bool force_schedule)
@@ -1051,6 +1059,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
 
 int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 {
+	struct net_devmem_dmabuf_binding *binding = NULL;
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct ubuf_info *uarg = NULL;
 	struct sk_buff *skb;
@@ -1058,11 +1067,20 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 	int flags, err, copied = 0;
 	int mss_now = 0, size_goal, copied_syn = 0;
 	int process_backlog = 0;
+	int sockc_err = 0;
 	int zc = 0;
 	long timeo;
 
 	flags = msg->msg_flags;
 
+	sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags) };
+	if (msg->msg_controllen) {
+		sockc_err = sock_cmsg_send(sk, msg, &sockc);
+		/* Don't return error until MSG_FASTOPEN has been processed;
+		 * that may succeed even if the cmsg is invalid.
+		 */
+	}
+
 	if ((flags & MSG_ZEROCOPY) && size) {
 		if (msg->msg_ubuf) {
 			uarg = msg->msg_ubuf;
@@ -1070,7 +1088,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			zc = MSG_ZEROCOPY;
 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
 			skb = tcp_write_queue_tail(sk);
-			uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
+			uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb),
+						    !sockc_err && sockc.dmabuf_id);
 			if (!uarg) {
 				err = -ENOBUFS;
 				goto out_err;
@@ -1079,12 +1098,27 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 				zc = MSG_ZEROCOPY;
 			else
 				uarg_to_msgzc(uarg)->zerocopy = 0;
+
+			if (!sockc_err && sockc.dmabuf_id) {
+				binding = net_devmem_get_binding(sk, sockc.dmabuf_id);
+				if (IS_ERR(binding)) {
+					err = PTR_ERR(binding);
+					binding = NULL;
+					goto out_err;
+				}
+			}
 		}
 	} else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
 		if (sk->sk_route_caps & NETIF_F_SG)
 			zc = MSG_SPLICE_PAGES;
 	}
 
+	if (!sockc_err && sockc.dmabuf_id &&
+	    (!(flags & MSG_ZEROCOPY) || !sock_flag(sk, SOCK_ZEROCOPY))) {
+		err = -EINVAL;
+		goto out_err;
+	}
+
 	if (unlikely(flags & MSG_FASTOPEN ||
 		     inet_test_bit(DEFER_CONNECT, sk)) &&
 	    !tp->repair) {
@@ -1123,13 +1157,9 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 		/* 'common' sending to sendq */
 	}
 
-	sockcm_init(&sockc, sk);
-	if (msg->msg_controllen) {
-		err = sock_cmsg_send(sk, msg, &sockc);
-		if (unlikely(err)) {
-			err = -EINVAL;
-			goto out_err;
-		}
+	if (sockc_err) {
+		err = sockc_err;
+		goto out_err;
 	}
 
 	/* This should be in poll */
@@ -1152,6 +1182,8 @@ restart:
 		if (skb)
 			copy = size_goal - skb->len;
 
+		trace_tcp_sendmsg_locked(sk, msg, skb, size_goal);
+
 		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
 			bool first_skb;
 
@@ -1248,7 +1280,8 @@ new_segment:
 					goto wait_for_space;
 			}
 
-			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
+			err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg,
+						       binding);
 			if (err == -EMSGSIZE || err == -EEXIST) {
 				tcp_mark_push(tp, skb);
 				goto new_segment;
 			}
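
tcp_sendmsg_locked() above now resolves a dmabuf id supplied through the socket cmsg (sockc.dmabuf_id) into a devmem binding for zerocopy sends. A hedged userspace sketch of that path; the SCM_DEVMEM_DMABUF constant and the cmsg payload layout follow the devmem TX series (see Documentation/networking/devmem.rst), and the dmabuf id value here is a placeholder:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

#ifndef SCM_DEVMEM_DMABUF
#define SCM_DEVMEM_DMABUF 79	/* assumption: SO_DEVMEM_DMABUF on most arches */
#endif

static ssize_t send_devmem(int fd, const void *buf, size_t len,
			   unsigned int dmabuf_id)
{
	char control[CMSG_SPACE(sizeof(unsigned int))] = {};
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = control, .msg_controllen = sizeof(control),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_DEVMEM_DMABUF;
	cm->cmsg_len = CMSG_LEN(sizeof(unsigned int));
	memcpy(CMSG_DATA(cm), &dmabuf_id, sizeof(unsigned int));

	/* requires SO_ZEROCOPY on fd; otherwise the hunk above returns -EINVAL */
	return sendmsg(fd, &msg, MSG_ZEROCOPY);
}
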
@@ -1329,6 +1362,8 @@ out_nopush:
 	/* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
 	if (uarg && !msg->msg_ubuf)
 		net_zcopy_put(uarg);
+	if (binding)
+		net_devmem_dmabuf_binding_put(binding);
 	return copied + copied_syn;
 
 do_error:
@@ -1346,6 +1381,9 @@ out_err:
 		sk->sk_write_space(sk);
 		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
 	}
+	if (binding)
+		net_devmem_dmabuf_binding_put(binding);
+
 	return err;
 }
 EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
@@ -1376,7 +1414,7 @@ void tcp_splice_eof(struct socket *sock)
 	tcp_push(sk, 0, mss_now, tp->nonagle, size_goal);
 	release_sock(sk);
 }
-EXPORT_SYMBOL_GPL(tcp_splice_eof);
+EXPORT_IPV6_MOD_GPL(tcp_splice_eof);
 
 /*
  *	Handle reading urgent data. BSD has very simple semantics for
@@ -1565,12 +1603,13 @@ EXPORT_SYMBOL(tcp_recv_skb);
  *	  or for 'peeking' the socket using this routine
  *	  (although both would be easy to implement).
  */
-int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
-		  sk_read_actor_t recv_actor)
+static int __tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+			   sk_read_actor_t recv_actor, bool noack,
+			   u32 *copied_seq)
 {
 	struct sk_buff *skb;
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 seq = tp->copied_seq;
+	u32 seq = *copied_seq;
 	u32 offset;
 	int copied = 0;
 
@@ -1624,9 +1663,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 			tcp_eat_recv_skb(sk, skb);
 		if (!desc->count)
 			break;
-		WRITE_ONCE(tp->copied_seq, seq);
+		WRITE_ONCE(*copied_seq, seq);
 	}
-	WRITE_ONCE(tp->copied_seq, seq);
+	WRITE_ONCE(*copied_seq, seq);
+
+	if (noack)
+		goto out;
 
 	tcp_rcv_space_adjust(sk);
 
@@ -1635,10 +1677,25 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 		tcp_recv_skb(sk, seq, &offset);
 		tcp_cleanup_rbuf(sk, copied);
 	}
+out:
 	return copied;
 }
+
+int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+		  sk_read_actor_t recv_actor)
+{
+	return __tcp_read_sock(sk, desc, recv_actor, false,
+			       &tcp_sk(sk)->copied_seq);
+}
 EXPORT_SYMBOL(tcp_read_sock);
 
+int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc,
+			sk_read_actor_t recv_actor, bool noack,
+			u32 *copied_seq)
+{
+	return __tcp_read_sock(sk, desc, recv_actor, noack, copied_seq);
+}
+
 int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 {
 	struct sk_buff *skb;
@@ -1667,7 +1724,7 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 	}
 	return copied;
 }
-EXPORT_SYMBOL(tcp_read_skb);
+EXPORT_IPV6_MOD(tcp_read_skb);
 
 void tcp_read_done(struct sock *sk, size_t len)
 {
@@ -1712,7 +1769,7 @@ int tcp_peek_len(struct socket *sock)
 {
 	return tcp_inq(sock->sk);
 }
-EXPORT_SYMBOL(tcp_peek_len);
+EXPORT_IPV6_MOD(tcp_peek_len);
 
 /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
 int tcp_set_rcvlowat(struct sock *sk, int val)
@@ -1739,7 +1796,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
 	}
 	return 0;
 }
-EXPORT_SYMBOL(tcp_set_rcvlowat);
+EXPORT_IPV6_MOD(tcp_set_rcvlowat);
 
 void tcp_update_recv_tstamps(struct sk_buff *skb,
 			     struct scm_timestamping_internal *tss)
@@ -1772,7 +1829,7 @@ int tcp_mmap(struct file *file, struct socket *sock,
 	vma->vm_ops = &tcp_vm_ops;
 	return 0;
 }
-EXPORT_SYMBOL(tcp_mmap);
+EXPORT_IPV6_MOD(tcp_mmap);
 
 static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
 				       u32 *offset_frag)
@@ -2438,14 +2495,12 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 			 */
 			memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg));
 			dmabuf_cmsg.frag_size = copy;
-			err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_LINEAR,
-				       sizeof(dmabuf_cmsg), &dmabuf_cmsg);
-			if (err || msg->msg_flags & MSG_CTRUNC) {
-				msg->msg_flags &= ~MSG_CTRUNC;
-				if (!err)
-					err = -ETOOSMALL;
+			err = put_cmsg_notrunc(msg, SOL_SOCKET,
+					       SO_DEVMEM_LINEAR,
+					       sizeof(dmabuf_cmsg),
+					       &dmabuf_cmsg);
+			if (err)
 				goto out;
-			}
 
 			sent += copy;
 
@@ -2476,6 +2531,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 			}
 
 			niov = skb_frag_net_iov(frag);
+			if (!net_is_devmem_iov(niov)) {
+				err = -ENODEV;
+				goto out;
+			}
+
 			end = start + skb_frag_size(frag);
 			copy = end - offset;
 
@@ -2494,21 +2554,17 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
 
 				/* Will perform the exchange later */
 				dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx];
-				dmabuf_cmsg.dmabuf_id = net_iov_binding_id(niov);
+				dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov);
 
 				offset += copy;
 				remaining_len -= copy;
 
-				err = put_cmsg(msg, SOL_SOCKET,
-					       SO_DEVMEM_DMABUF,
-					       sizeof(dmabuf_cmsg),
-					       &dmabuf_cmsg);
-				if (err || msg->msg_flags & MSG_CTRUNC) {
-					msg->msg_flags &= ~MSG_CTRUNC;
-					if (!err)
-						err = -ETOOSMALL;
+				err = put_cmsg_notrunc(msg, SOL_SOCKET,
+						       SO_DEVMEM_DMABUF,
+						       sizeof(dmabuf_cmsg),
+						       &dmabuf_cmsg);
+				if (err)
 					goto out;
-				}
 
 				atomic_long_inc(&niov->pp_ref_count);
 				tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag);
@@ -2864,7 +2920,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
 	}
 	return ret;
 }
-EXPORT_SYMBOL(tcp_recvmsg);
+EXPORT_IPV6_MOD(tcp_recvmsg);
 
 void tcp_set_state(struct sock *sk, int state)
 {
@@ -2994,7 +3050,7 @@ void tcp_shutdown(struct sock *sk, int how)
 			tcp_send_fin(sk);
 	}
 }
-EXPORT_SYMBOL(tcp_shutdown);
+EXPORT_IPV6_MOD(tcp_shutdown);
 
 int tcp_orphan_count_sum(void)
 {
@@ -3174,7 +3230,7 @@ adjudge_to_death:
 		const int tmo = tcp_fin_time(sk);
 
 		if (tmo > TCP_TIMEWAIT_LEN) {
-			inet_csk_reset_keepalive_timer(sk,
+			tcp_reset_keepalive_timer(sk,
 					tmo - TCP_TIMEWAIT_LEN);
 		} else {
 			tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
@@ -3326,8 +3382,8 @@ int tcp_disconnect(struct sock *sk, int flags)
 	icsk->icsk_probes_out = 0;
 	icsk->icsk_probes_tstamp = 0;
 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
-	icsk->icsk_rto_min = TCP_RTO_MIN;
-	icsk->icsk_delack_max = TCP_DELACK_MAX;
+	WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN);
+	WRITE_ONCE(icsk->icsk_delack_max, TCP_DELACK_MAX);
 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
 	tp->snd_cwnd_cnt = 0;
@@ -3381,6 +3437,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->rack.reo_wnd_persist = 0;
 	tp->rack.dsack_seen = 0;
 	tp->syn_data_acked = 0;
+	tp->syn_fastopen_child = 0;
 	tp->rx_opt.saw_tstamp = 0;
 	tp->rx_opt.dsack = 0;
 	tp->rx_opt.num_sacks = 0;
@@ -3493,7 +3550,7 @@ static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
 }
 
 DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
-EXPORT_SYMBOL(tcp_tx_delay_enabled);
+EXPORT_IPV6_MOD(tcp_tx_delay_enabled);
 
 static void tcp_enable_tx_delay(void)
 {
@@ -3627,7 +3684,7 @@ int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
 			elapsed = tp->keepalive_time - elapsed;
 		else
 			elapsed = 0;
-		inet_csk_reset_keepalive_timer(sk, elapsed);
+		tcp_reset_keepalive_timer(sk, elapsed);
 	}
 
 	return 0;
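
On the receive side, tcp_recvmsg_dmabuf() (switched above to put_cmsg_notrunc() so a truncated control buffer is reported as an error instead of silently clearing MSG_CTRUNC) delivers payload descriptors as SO_DEVMEM_LINEAR/SO_DEVMEM_DMABUF cmsgs. A sketch of a consumer, assuming the uapi values from asm-generic/socket.h (78/79) and struct dmabuf_cmsg from linux/uio.h; see Documentation/networking/devmem.rst:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/uio.h>		/* struct dmabuf_cmsg */

#ifndef SO_DEVMEM_LINEAR
#define SO_DEVMEM_LINEAR 78	/* assumption: generic socket.h value */
#define SO_DEVMEM_DMABUF 79
#endif
#ifndef MSG_SOCK_DEVMEM
#define MSG_SOCK_DEVMEM 0x2000000	/* assumption: uapi flag value */
#endif

static void read_devmem(int fd)
{
	char data[8192], control[256];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = control, .msg_controllen = sizeof(control),
	};
	struct cmsghdr *cm;

	if (recvmsg(fd, &msg, MSG_SOCK_DEVMEM) < 0)
		return;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		struct dmabuf_cmsg *dc = (struct dmabuf_cmsg *)CMSG_DATA(cm);

		if (cm->cmsg_type == SO_DEVMEM_DMABUF)
			printf("devmem frag: token %u, binding id %u\n",
			       dc->frag_token, dc->dmabuf_id);
		else if (cm->cmsg_type == SO_DEVMEM_LINEAR)
			printf("linear chunk: %u bytes\n", dc->frag_size);
	}
}
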
@@ -3667,32 +3724,32 @@ EXPORT_SYMBOL(tcp_sock_set_keepcnt);
 
 int tcp_set_window_clamp(struct sock *sk, int val)
 {
+	u32 old_window_clamp, new_window_clamp, new_rcv_ssthresh;
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (!val) {
 		if (sk->sk_state != TCP_CLOSE)
 			return -EINVAL;
 		WRITE_ONCE(tp->window_clamp, 0);
-	} else {
-		u32 new_rcv_ssthresh, old_window_clamp = tp->window_clamp;
-		u32 new_window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
-				       SOCK_MIN_RCVBUF / 2 : val;
+		return 0;
+	}
 
-		if (new_window_clamp == old_window_clamp)
-			return 0;
+	old_window_clamp = tp->window_clamp;
+	new_window_clamp = max_t(int, SOCK_MIN_RCVBUF / 2, val);
 
-		WRITE_ONCE(tp->window_clamp, new_window_clamp);
-		if (new_window_clamp < old_window_clamp) {
-			/* need to apply the reserved mem provisioning only
-			 * when shrinking the window clamp
-			 */
-			__tcp_adjust_rcv_ssthresh(sk, tp->window_clamp);
+	if (new_window_clamp == old_window_clamp)
+		return 0;
 
-		} else {
-			new_rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp);
-			tp->rcv_ssthresh = max(new_rcv_ssthresh,
-					       tp->rcv_ssthresh);
-		}
+	WRITE_ONCE(tp->window_clamp, new_window_clamp);
+
+	/* Need to apply the reserved mem provisioning only
+	 * when shrinking the window clamp.
+	 */
+	if (new_window_clamp < old_window_clamp) {
+		__tcp_adjust_rcv_ssthresh(sk, new_window_clamp);
+	} else {
+		new_rcv_ssthresh = min(tp->rcv_wnd, new_window_clamp);
+		tp->rcv_ssthresh = max(new_rcv_ssthresh, tp->rcv_ssthresh);
 	}
 	return 0;
 }
@@ -3802,6 +3859,27 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
 			   secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
 					   TCP_RTO_MAX / HZ));
 		return 0;
+	case TCP_RTO_MAX_MS:
+		if (val < MSEC_PER_SEC || val > TCP_RTO_MAX_SEC * MSEC_PER_SEC)
+			return -EINVAL;
+		WRITE_ONCE(inet_csk(sk)->icsk_rto_max, msecs_to_jiffies(val));
+		return 0;
+	case TCP_RTO_MIN_US: {
+		int rto_min = usecs_to_jiffies(val);
+
+		if (rto_min > TCP_RTO_MIN || rto_min < TCP_TIMEOUT_MIN)
+			return -EINVAL;
+		WRITE_ONCE(inet_csk(sk)->icsk_rto_min, rto_min);
+		return 0;
+	}
+	case TCP_DELACK_MAX_US: {
+		int delack_max = usecs_to_jiffies(val);
+
+		if (delack_max > TCP_DELACK_MAX || delack_max < TCP_TIMEOUT_MIN)
+			return -EINVAL;
+		WRITE_ONCE(inet_csk(sk)->icsk_delack_max, delack_max);
+		return 0;
+	}
 	}
 
 	sockopt_lock_sock(sk);
@@ -4031,7 +4109,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
 				     optval, optlen);
 	return do_tcp_setsockopt(sk, level, optname, optval, optlen);
 }
-EXPORT_SYMBOL(tcp_setsockopt);
+EXPORT_IPV6_MOD(tcp_setsockopt);
 
 static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
 				      struct tcp_info *info)
@@ -4107,7 +4185,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
 	}
 
-	if (tp->ecn_flags & TCP_ECN_OK)
+	if (tcp_ecn_mode_any(tp))
 		info->tcpi_options |= TCPI_OPT_ECN;
 	if (tp->ecn_flags & TCP_ECN_SEEN)
 		info->tcpi_options |= TCPI_OPT_ECN_SEEN;
@@ -4115,6 +4193,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 		info->tcpi_options |= TCPI_OPT_SYN_DATA;
 	if (tp->tcp_usec_ts)
 		info->tcpi_options |= TCPI_OPT_USEC_TS;
+	if (tp->syn_fastopen_child)
+		info->tcpi_options |= TCPI_OPT_TFO_CHILD;
 
 	info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
 	info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato,
@@ -4638,6 +4718,15 @@ zerocopy_rcv_out:
 	case TCP_IS_MPTCP:
 		val = 0;
 		break;
+	case TCP_RTO_MAX_MS:
+		val = jiffies_to_msecs(tcp_rto_max(sk));
+		break;
+	case TCP_RTO_MIN_US:
+		val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_rto_min));
+		break;
+	case TCP_DELACK_MAX_US:
+		val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_delack_max));
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -4659,7 +4748,7 @@ bool tcp_bpf_bypass_getsockopt(int level, int optname)
 
 	return false;
 }
-EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt);
+EXPORT_IPV6_MOD(tcp_bpf_bypass_getsockopt);
 
 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		   int __user *optlen)
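
The three new socket options above give a single socket its own retransmit and delayed-ACK timer bounds. A userspace sketch; the numeric fallback for TCP_RTO_MAX_MS is an assumption based on this series' uapi ordering and should come from <netinet/tcp.h> on a kernel that defines it:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>

#ifndef TCP_RTO_MAX_MS
#define TCP_RTO_MAX_MS 44	/* assumption: value used by this series */
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int rto_max_ms = 10 * 1000;	/* cap retransmit backoff at 10s */

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (setsockopt(fd, IPPROTO_TCP, TCP_RTO_MAX_MS,
		       &rto_max_ms, sizeof(rto_max_ms)))
		perror("TCP_RTO_MAX_MS");	/* EINVAL below 1000 ms */
	return 0;
}
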
@@ -4673,11 +4762,11 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 
 	return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval),
 				 USER_SOCKPTR(optlen));
 }
-EXPORT_SYMBOL(tcp_getsockopt);
+EXPORT_IPV6_MOD(tcp_getsockopt);
 
 #ifdef CONFIG_TCP_MD5SIG
 int tcp_md5_sigpool_id = -1;
-EXPORT_SYMBOL_GPL(tcp_md5_sigpool_id);
+EXPORT_IPV6_MOD_GPL(tcp_md5_sigpool_id);
 
 int tcp_md5_alloc_sigpool(void)
 {
@@ -4723,7 +4812,7 @@ int tcp_md5_hash_key(struct tcp_sigpool *hp,
 	 */
 	return data_race(crypto_ahash_update(hp->req));
 }
-EXPORT_SYMBOL(tcp_md5_hash_key);
+EXPORT_IPV6_MOD(tcp_md5_hash_key);
 
 /* Called with rcu_read_lock() */
 static enum skb_drop_reason
@@ -4843,7 +4932,7 @@ tcp_inbound_hash(struct sock *sk, const struct request_sock *req,
 	return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family, l3index,
 				    md5_location);
 }
-EXPORT_SYMBOL_GPL(tcp_inbound_hash);
+EXPORT_IPV6_MOD_GPL(tcp_inbound_hash);
 
 void tcp_done(struct sock *sk)
 {
@@ -4992,7 +5081,12 @@ static void __init tcp_struct_check(void)
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue);
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh);
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tcp_clean_acked);
+	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 77);
+#else
 	CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 69);
+#endif
 
 	/* TX read-write hotpath cache lines */
 	CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out);
@@ -5133,7 +5227,7 @@ void __init tcp_init(void)
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
 	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
 	max_wshare = min(4UL*1024*1024, limit);
-	max_rshare = min(6UL*1024*1024, limit);
+	max_rshare = min(32UL*1024*1024, limit);
 
 	init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
 	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 47f65b1b70ca..ba581785adb4 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -646,6 +646,42 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops)
 	       ops->sendmsg  == tcp_sendmsg ? 0 : -ENOTSUPP;
 }
 
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc,
+			   sk_read_actor_t recv_actor)
+{
+	struct sock *sk = strp->sk;
+	struct sk_psock *psock;
+	struct tcp_sock *tp;
+	int copied = 0;
+
+	tp = tcp_sk(sk);
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (WARN_ON_ONCE(!psock)) {
+		desc->error = -EINVAL;
+		goto out;
+	}
+
+	psock->ingress_bytes = 0;
+	copied = tcp_read_sock_noack(sk, desc, recv_actor, true,
+				     &psock->copied_seq);
+	if (copied < 0)
+		goto out;
+	/* recv_actor may redirect skb to another socket (SK_REDIRECT) or
+	 * just put skb into ingress queue of current socket (SK_PASS).
+	 * For SK_REDIRECT, we need to ack the frame immediately but for
+	 * SK_PASS, we want to delay the ack until tcp_bpf_recvmsg_parser().
+	 */
+	tp->copied_seq = psock->copied_seq - psock->ingress_bytes;
+	tcp_rcv_space_adjust(sk);
+	__tcp_cleanup_rbuf(sk, copied - psock->ingress_bytes);
+out:
+	rcu_read_unlock();
+	return copied;
+}
+#endif /* CONFIG_BPF_STREAM_PARSER */
+
 int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
 {
 	int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 5dbed91c6178..76c23675ae50 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -392,6 +392,10 @@ static void hystart_update(struct sock *sk, u32 delay)
 	if (after(tp->snd_una, ca->end_seq))
 		bictcp_hystart_reset(sk);
 
+	/* hystart triggers when cwnd is larger than some threshold */
+	if (tcp_snd_cwnd(tp) < hystart_low_window)
+		return;
+
 	if (hystart_detect & HYSTART_ACK_TRAIN) {
 		u32 now = bictcp_clock_us(sk);
 
@@ -467,9 +471,7 @@ __bpf_kfunc static void cubictcp_acked(struct sock *sk, const struct ack_sample
 	if (ca->delay_min == 0 || ca->delay_min > delay)
 		ca->delay_min = delay;
 
-	/* hystart triggers when cwnd is larger than some threshold */
-	if (!ca->found && tcp_in_slow_start(tp) && hystart &&
-	    tcp_snd_cwnd(tp) >= hystart_low_window)
+	if (!ca->found && tcp_in_slow_start(tp) && hystart)
 		hystart_update(sk, delay);
 }
 
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 8a45a4aea933..03abe0848420 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -90,7 +90,7 @@ __bpf_kfunc static void dctcp_init(struct sock *sk)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 
-	if ((tp->ecn_flags & TCP_ECN_OK) ||
+	if (tcp_ecn_mode_any(tp) ||
 	    (sk->sk_state == TCP_LISTEN ||
 	     sk->sk_state == TCP_CLOSE)) {
 		struct dctcp *ca = inet_csk_ca(sk);
diff --git a/net/ipv4/tcp_dctcp.h b/net/ipv4/tcp_dctcp.h
index d69a77cbd0c7..4b0259111d81 100644
--- a/net/ipv4/tcp_dctcp.h
+++ b/net/ipv4/tcp_dctcp.h
@@ -28,7 +28,7 @@ static inline void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
 		 */
 		if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
 			dctcp_ece_ack_cwr(sk, *ce_state);
-			__tcp_send_ack(sk, *prior_rcv_nxt);
+			__tcp_send_ack(sk, *prior_rcv_nxt, 0);
 		}
 		inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
 	}
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index f428ecf9120f..45e174b8cd22 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -83,7 +83,7 @@ static int tcp_diag_put_md5sig(struct sk_buff *skb,
 #endif
 
 static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk,
-			    const struct tcp_ulp_ops *ulp_ops)
+			    const struct tcp_ulp_ops *ulp_ops, bool net_admin)
 {
 	struct nlattr *nest;
 	int err;
@@ -97,7 +97,7 @@ static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk,
 		goto nla_failure;
 
 	if (ulp_ops->get_info)
-		err = ulp_ops->get_info(sk, skb);
+		err = ulp_ops->get_info(sk, skb, net_admin);
 	if (err)
 		goto nla_failure;
 
@@ -113,6 +113,7 @@ static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
 			    struct sk_buff *skb)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	const struct tcp_ulp_ops *ulp_ops;
 	int err = 0;
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -129,15 +130,13 @@ static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
 	}
 #endif
 
-	if (net_admin) {
-		const struct tcp_ulp_ops *ulp_ops;
-
-		ulp_ops = icsk->icsk_ulp_ops;
-		if (ulp_ops)
-			err = tcp_diag_put_ulp(skb, sk, ulp_ops);
-		if (err)
+	ulp_ops = icsk->icsk_ulp_ops;
+	if (ulp_ops) {
+		err = tcp_diag_put_ulp(skb, sk, ulp_ops, net_admin);
+		if (err < 0)
 			return err;
 	}
+
 	return 0;
 }
 
@@ -164,7 +163,7 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
 	}
 #endif
 
-	if (net_admin && sk_fullsock(sk)) {
+	if (sk_fullsock(sk)) {
 		const struct tcp_ulp_ops *ulp_ops;
 
 		ulp_ops = icsk->icsk_ulp_ops;
@@ -172,7 +171,7 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
 			size += nla_total_size(0) + nla_total_size(TCP_ULP_NAME_MAX);
 			if (ulp_ops->get_info_size)
-				size += ulp_ops->get_info_size(sk);
+				size += ulp_ops->get_info_size(sk, net_admin);
 		}
 	}
 	return size;
 }
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 0f523cbfe329..9b83d639b5ac 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -178,7 +178,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
 	if (!skb)
 		return;
 
-	skb_dst_drop(skb);
+	tcp_cleanup_skb(skb);
 	/* segs_in has been initialized to 1 in tcp_create_openreq_child().
 	 * Hence, reset segs_in to 0 before calling tcp_segs_in()
 	 * to avoid double counting. Also, tcp_segs_in() expects
@@ -195,7 +195,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
 	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
 
 	tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
-	__skb_queue_tail(&sk->sk_receive_queue, skb);
+	tcp_add_receive_queue(sk, skb);
 	tp->syn_data_acked = 1;
 
 	/* u64_stats_update_begin(&tp->syncp) not needed here,
@@ -274,8 +274,8 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
 	 * because it's been added to the accept queue directly.
	 */
 	req->timeout = tcp_timeout_init(child);
-	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
-				  req->timeout, TCP_RTO_MAX);
+	tcp_reset_xmit_timer(child, ICSK_TIME_RETRANS,
+			     req->timeout, false);
 
 	refcount_set(&req->rsk_refcnt, 2);
 
@@ -401,6 +401,7 @@ fastopen:
 		}
 
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
+		tcp_sk(child)->syn_fastopen_child = 1;
 		return child;
 	}
 	NET_INC_STATS(sock_net(sk),
@@ -468,7 +469,7 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
 	}
 	return false;
 }
-EXPORT_SYMBOL(tcp_fastopen_defer_connect);
+EXPORT_IPV6_MOD(tcp_fastopen_defer_connect);
 
 /*
  * The following code block is to deal with middle box issues with TFO:
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4811727b8a02..8ec92dec321a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -102,6 +102,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 #define FLAG_NO_CHALLENGE_ACK	0x8000 /* do not call tcp_send_challenge_ack()	*/
 #define FLAG_ACK_MAYBE_DELAYED	0x10000 /* Likely a delayed ACK */
 #define FLAG_DSACK_TLP		0x20000 /* DSACK for tail loss probe */
+#define FLAG_TS_PROGRESS	0x40000 /* Positive timestamp delta */
 
 #define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
 #define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
@@ -118,18 +119,18 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
 #if IS_ENABLED(CONFIG_TLS_DEVICE)
 static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);
 
-void clean_acked_data_enable(struct inet_connection_sock *icsk,
+void clean_acked_data_enable(struct tcp_sock *tp,
 			     void (*cad)(struct sock *sk, u32 ack_seq))
 {
-	icsk->icsk_clean_acked = cad;
+	tp->tcp_clean_acked = cad;
 	static_branch_deferred_inc(&clean_acked_data_enabled);
 }
 EXPORT_SYMBOL_GPL(clean_acked_data_enable);
 
-void clean_acked_data_disable(struct inet_connection_sock *icsk)
+void clean_acked_data_disable(struct tcp_sock *tp)
 {
 	static_branch_slow_dec_deferred(&clean_acked_data_enabled);
-	icsk->icsk_clean_acked = NULL;
+	tp->tcp_clean_acked = NULL;
 }
 EXPORT_SYMBOL_GPL(clean_acked_data_disable);
 
@@ -169,6 +170,7 @@ static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
 		memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
 		sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
 		sock_ops.is_fullsock = 1;
+		sock_ops.is_locked_tcp_sock = 1;
 		sock_ops.sk = sk;
 		bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
 
@@ -185,6 +187,7 @@ static void bpf_skops_established(struct sock *sk, int bpf_op,
 	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
 	sock_ops.op = bpf_op;
 	sock_ops.is_fullsock = 1;
+	sock_ops.is_locked_tcp_sock = 1;
 	sock_ops.sk = sk;
 	/* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
 	if (skb)
@@ -243,9 +246,15 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
 		do_div(val, skb->truesize);
 		tcp_sk(sk)->scaling_ratio = val ? val : 1;
 
-		if (old_ratio != tcp_sk(sk)->scaling_ratio)
-			WRITE_ONCE(tcp_sk(sk)->window_clamp,
-				   tcp_win_from_space(sk, sk->sk_rcvbuf));
+		if (old_ratio != tcp_sk(sk)->scaling_ratio) {
+			struct tcp_sock *tp = tcp_sk(sk);
+
+			val = tcp_win_from_space(sk, sk->sk_rcvbuf);
+			tcp_set_window_clamp(sk, val);
+
+			if (tp->window_clamp < tp->rcvq_space.space)
+				tp->rcvq_space.space = tp->window_clamp;
+		}
 	}
 	icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
 				       tcp_sk(sk)->advmss);
@@ -325,15 +334,14 @@ static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
 static bool tcp_in_quickack_mode(struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	const struct dst_entry *dst = __sk_dst_get(sk);
 
-	return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
+	return icsk->icsk_ack.dst_quick_ack ||
 		(icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
 }
 
 static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
 {
-	if (tp->ecn_flags & TCP_ECN_OK)
+	if (tcp_ecn_mode_rfc3168(tp))
 		tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
 }
 
@@ -356,10 +364,13 @@ static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
 	tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
 }
 
-static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
+static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
+	if (tcp_ecn_disabled(tp))
+		return;
+
 	switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
 	case INET_ECN_NOT_ECT:
 		/* Funny extension: if ECT is not set on a segment,
@@ -388,31 +399,39 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
 	}
 }
 
-static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
-{
-	if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
-		__tcp_ecn_check_ce(sk, skb);
-}
-
 static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
 {
-	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
-		tp->ecn_flags &= ~TCP_ECN_OK;
+	if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr))
+		tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
 }
 
 static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
 {
-	if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
-		tp->ecn_flags &= ~TCP_ECN_OK;
+	if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr))
+		tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
 }
 
 static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
 {
-	if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
+	if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp))
 		return true;
 	return false;
 }
 
+static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count)
+{
+	tp->delivered_ce += ecn_count;
+}
+
+/* Updates the delivered and delivered_ce counts */
+static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
+				bool ece_ack)
+{
+	tp->delivered += delivered;
+	if (ece_ack)
+		tcp_count_delivered_ce(tp, delivered);
+}
+
 /* Buffer size and advertised window tuning.
  *
  * 1. Tuning sk->sk_sndbuf, when connection enters established state.
@@ -630,7 +649,7 @@ void tcp_initialize_rcv_mss(struct sock *sk)
 
 	inet_csk(sk)->icsk_ack.rcv_mss = hint;
 }
-EXPORT_SYMBOL(tcp_initialize_rcv_mss);
+EXPORT_IPV6_MOD(tcp_initialize_rcv_mss);
 
 /* Receiver "autotuning" code.
  *
@@ -645,10 +664,12 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss);
  */
 static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
 {
-	u32 new_sample = tp->rcv_rtt_est.rtt_us;
-	long m = sample;
+	u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us;
+	long m = sample << 3;
 
-	if (new_sample != 0) {
+	if (old_sample == 0 || m < old_sample) {
+		new_sample = m;
+	} else {
 		/* If we sample in larger samples in the non-timestamp
 		 * case, we could grossly overestimate the RTT especially
 		 * with chatty applications or bulk transfer apps which
@@ -659,17 +680,12 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
 		 * else with timestamps disabled convergence takes too
 		 * long.
 		 */
-		if (!win_dep) {
-			m -= (new_sample >> 3);
-			new_sample += m;
-		} else {
-			m <<= 3;
-			if (m < new_sample)
-				new_sample = m;
-		}
-	} else {
-		/* No previous measure. */
-		new_sample = m << 3;
+		if (win_dep)
+			return;
+		/* Do not use this sample if receive queue is not empty. */
+		if (tp->rcv_nxt != tp->copied_seq)
+			return;
+		new_sample = old_sample - (old_sample >> 3) + sample;
 	}
 
 	tp->rcv_rtt_est.rtt_us = new_sample;
@@ -693,7 +709,7 @@ new_measure:
 	tp->rcv_rtt_est.time = tp->tcp_mstamp;
 }
 
-static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
+static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta)
 {
 	u32 delta, delta_us;
 
@@ -703,7 +719,7 @@ static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp)
 
 	if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
 		if (!delta)
-			delta = 1;
+			delta = min_delta;
 		delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
 		return delta_us;
 	}
@@ -721,13 +737,39 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 	if (TCP_SKB_CB(skb)->end_seq -
 	    TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
-		s32 delta = tcp_rtt_tsopt_us(tp);
+		s32 delta = tcp_rtt_tsopt_us(tp, 0);
 
-		if (delta >= 0)
+		if (delta > 0)
 			tcp_rcv_rtt_update(tp, delta, 0);
 	}
 }
 
+static void tcp_rcvbuf_grow(struct sock *sk)
+{
+	const struct net *net = sock_net(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int rcvwin, rcvbuf, cap;
+
+	if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+		return;
+
+	/* slow start: allow the sender to double its rate. */
+	rcvwin = tp->rcvq_space.space << 1;
+
+	if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
+		rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt;
+
+	cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+
+	rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap);
+	if (rcvbuf > sk->sk_rcvbuf) {
+		WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+		/* Make the window clamp follow along. */
+		WRITE_ONCE(tp->window_clamp,
+			   tcp_win_from_space(sk, rcvbuf));
+	}
+}
+
 /*
  * This function should be called every time data is copied to user space.
  * It calculates the appropriate TCP receive buffer space.
 */
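
The reworked tcp_rcv_rtt_update() keeps the estimate left-shifted by 3 (8x): "new = old - old/8 + sample" on the scaled value is an EWMA with gain 1/8 on the real RTT, while a sample below the current estimate replaces it immediately. A worked example of just that arithmetic (illustration only):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t scaled = 100 << 3;	/* current estimate: 100 us, stored 8x */
	uint32_t samples[] = { 100, 180, 60 };

	for (int i = 0; i < 3; i++) {
		uint32_t m = samples[i] << 3;

		if (scaled == 0 || m < scaled)
			scaled = m;	/* adopt a lower RTT at once */
		else
			scaled = scaled - (scaled >> 3) + samples[i];
		printf("sample %3u us -> estimate %u us\n",
		       samples[i], scaled >> 3);	/* 100, 110, 60 */
	}
	return 0;
}
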
@@ -735,8 +777,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 void tcp_rcv_space_adjust(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 copied;
-	int time;
+	int time, inq, copied;
 
 	trace_tcp_rcv_space_adjust(sk);
 
@@ -747,45 +788,18 @@ void tcp_rcv_space_adjust(struct sock *sk)
 
 	/* Number of bytes copied to user in last RTT */
 	copied = tp->copied_seq - tp->rcvq_space.seq;
+	/* Number of bytes in receive queue. */
+	inq = tp->rcv_nxt - tp->copied_seq;
+	copied -= inq;
 	if (copied <= tp->rcvq_space.space)
 		goto new_measure;
 
-	/* A bit of theory :
-	 * copied = bytes received in previous RTT, our base window
-	 * To cope with packet losses, we need a 2x factor
-	 * To cope with slow start, and sender growing its cwin by 100 %
-	 * every RTT, we need a 4x factor, because the ACK we are sending
-	 * now is for the next RTT, not the current one :
-	 * <prev RTT . ><current RTT .. ><next RTT .... >
-	 */
-
-	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
-	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-		u64 rcvwin, grow;
-		int rcvbuf;
+	trace_tcp_rcvbuf_grow(sk, time);
 
-		/* minimal window to cope with packet losses, assuming
-		 * steady state. Add some cushion because of small variations.
-		 */
-		rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
-
-		/* Accommodate for sender rate increase (eg. slow start) */
-		grow = rcvwin * (copied - tp->rcvq_space.space);
-		do_div(grow, tp->rcvq_space.space);
-		rcvwin += (grow << 1);
-
-		rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
-			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
-		if (rcvbuf > sk->sk_rcvbuf) {
-			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
-
-			/* Make the window clamp follow along. */
-			WRITE_ONCE(tp->window_clamp,
-				   tcp_win_from_space(sk, rcvbuf));
-		}
-	}
 	tp->rcvq_space.space = copied;
+	tcp_rcvbuf_grow(sk);
+
 new_measure:
 	tp->rcvq_space.seq = tp->copied_seq;
 	tp->rcvq_space.time = tp->tcp_mstamp;
@@ -851,7 +865,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
 	icsk->icsk_ack.lrcvtime = now;
 	tcp_save_lrcv_flowlabel(sk, skb);
 
-	tcp_ecn_check_ce(sk, skb);
+	tcp_data_ecn_check(sk, skb);
 
 	if (skb->len >= 128)
 		tcp_grow_window(sk, skb, true);
@@ -1148,15 +1162,6 @@ void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
 	}
 }
 
-/* Updates the delivered and delivered_ce counts */
-static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
-				bool ece_ack)
-{
-	tp->delivered += delivered;
-	if (ece_ack)
-		tp->delivered_ce += delivered;
-}
-
 /* This procedure tags the retransmission queue when SACKs arrive.
  *
  * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
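
The rewritten tcp_rcv_space_adjust() counts only what the application actually consumed during the last RTT: bytes still sitting in the receive queue (inq) are subtracted before comparing against rcvq_space, and tcp_rcvbuf_grow() then simply doubles the window target. A toy model of that measurement (illustration only):

#include <stdio.h>

int main(void)
{
	unsigned int copied_seq_delta = 256 * 1024;	/* copied to user this RTT */
	unsigned int inq = 64 * 1024;			/* queued but not consumed */
	unsigned int space = 128 * 1024;		/* previous measurement */
	unsigned int copied = copied_seq_delta - inq;

	if (copied > space) {
		space = copied;
		printf("new space %u, next rcvwin target %u (doubled)\n",
		       space, space << 1);
	}
	return 0;
}
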
@@ -2252,8 +2257,7 @@ static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag)
 		unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
 					  msecs_to_jiffies(10));
 
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-					  delay, TCP_RTO_MAX);
+		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, false);
 		*ack_flag &= ~FLAG_SET_XMIT_TIMER;
 		return true;
 	}
@@ -2710,6 +2714,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost,
 	if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
 		return;
 
+	trace_tcp_cwnd_reduction_tp(sk, newly_acked_sacked, newly_lost, flag);
+
 	tp->prr_delivered += newly_acked_sacked;
 	if (delta < 0) {
 		u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
@@ -2892,7 +2898,7 @@ void tcp_simple_retransmit(struct sock *sk)
 	 */
 	tcp_non_congestion_loss_retransmit(sk);
 }
-EXPORT_SYMBOL(tcp_simple_retransmit);
+EXPORT_IPV6_MOD(tcp_simple_retransmit);
 
 void tcp_enter_recovery(struct sock *sk, bool ece_ack)
 {
@@ -3215,7 +3221,7 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 	 */
 	if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp &&
 	    tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED)
-		seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp);
+		seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp, 1);
 
 	rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
 	if (seq_rtt_us < 0)
@@ -3282,8 +3288,7 @@ void tcp_rearm_rto(struct sock *sk)
 			 */
 			rto = usecs_to_jiffies(max_t(int, delta_us, 1));
 		}
-		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
-				     TCP_RTO_MAX);
+		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, true);
 	}
 }
@@ -3560,10 +3565,10 @@ static void tcp_ack_probe(struct sock *sk)
 		 * This function is not for random using!
 		 */
 	} else {
-		unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
+		unsigned long when = tcp_probe0_when(sk, tcp_rto_max(sk));
 
 		when = tcp_clamp_probe0_to_user_timeout(sk, when);
-		tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX);
+		tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, true);
 	}
 }
 
@@ -3808,8 +3813,16 @@ static void tcp_store_ts_recent(struct tcp_sock *tp)
 	tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
 }
 
-static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
+static int __tcp_replace_ts_recent(struct tcp_sock *tp, s32 tstamp_delta)
 {
+	tcp_store_ts_recent(tp);
+	return tstamp_delta > 0 ? FLAG_TS_PROGRESS : 0;
+}
+
+static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
+{
+	s32 delta;
+
 	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
 		/* PAWS bug workaround wrt. ACK frames, the PAWS discard
 		 * extra check below makes sure this can only happen
@@ -3818,9 +3831,13 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
 		 * Not only, also it occurs for expired timestamps.
 		 */
-		if (tcp_paws_check(&tp->rx_opt, 0))
-			tcp_store_ts_recent(tp);
+		if (tcp_paws_check(&tp->rx_opt, 0)) {
+			delta = tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent;
+			return __tcp_replace_ts_recent(tp, delta);
+		}
 	}
+
+	return 0;
 }
 
 /* This routine deals with acks during a TLP episode and ends an episode by
@@ -3856,12 +3873,23 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
 	}
 }
 
-static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
+static void tcp_in_ack_event(struct sock *sk, int flag)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 
-	if (icsk->icsk_ca_ops->in_ack_event)
-		icsk->icsk_ca_ops->in_ack_event(sk, flags);
+	if (icsk->icsk_ca_ops->in_ack_event) {
+		u32 ack_ev_flags = 0;
+
+		if (flag & FLAG_WIN_UPDATE)
+			ack_ev_flags |= CA_ACK_WIN_UPDATE;
+		if (flag & FLAG_SLOWPATH) {
+			ack_ev_flags |= CA_ACK_SLOWPATH;
+			if (flag & FLAG_ECE)
+				ack_ev_flags |= CA_ACK_ECE;
+		}
+
+		icsk->icsk_ca_ops->in_ack_event(sk, ack_ev_flags);
+	}
 }
 
 /* Congestion control has updated the cwnd already. So if we're in
@@ -3954,8 +3982,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 #if IS_ENABLED(CONFIG_TLS_DEVICE)
 		if (static_branch_unlikely(&clean_acked_data_enabled.key))
-			if (icsk->icsk_clean_acked)
-				icsk->icsk_clean_acked(sk, ack);
+			if (tp->tcp_clean_acked)
+				tp->tcp_clean_acked(sk, ack);
 #endif
 	}
 
@@ -3966,7 +3994,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	 * is in window.
	 */
 	if (flag & FLAG_UPDATE_TS_RECENT)
-		tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+		flag |= tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
 
 	if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
 	    FLAG_SND_UNA_ADVANCED) {
@@ -3978,12 +4006,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		tcp_snd_una_update(tp, ack);
 		flag |= FLAG_WIN_UPDATE;
 
-		tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
-
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
 	} else {
-		u32 ack_ev_flags = CA_ACK_SLOWPATH;
-
 		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
 			flag |= FLAG_DATA;
 		else
@@ -3995,19 +4019,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
 							&sack_state);
 
-		if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
+		if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb)))
 			flag |= FLAG_ECE;
-			ack_ev_flags |= CA_ACK_ECE;
-		}
 
 		if (sack_state.sack_delivered)
 			tcp_count_delivered(tp, sack_state.sack_delivered,
 					    flag & FLAG_ECE);
-
-		if (flag & FLAG_WIN_UPDATE)
-			ack_ev_flags |= CA_ACK_WIN_UPDATE;
-
-		tcp_in_ack_event(sk, ack_ev_flags);
 	}
 
 	/* This is a deviation from RFC3168 since it states that:
@@ -4034,6 +4051,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 
 	tcp_rack_update_reo_wnd(sk, &rs);
 
+	tcp_in_ack_event(sk, flag);
+
 	if (tp->tlp_high_seq)
 		tcp_process_tlp_ack(sk, ack, flag);
 
@@ -4065,6 +4084,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 	return 1;
 
 no_queue:
+	tcp_in_ack_event(sk, flag);
 	/* If data was DSACKed, see if we can undo a cwnd reduction. */
 	if (flag & FLAG_DSACKING_ACK) {
 		tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
@@ -4174,7 +4194,6 @@ u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
 	}
 	return mss;
 }
-EXPORT_SYMBOL_GPL(tcp_parse_mss_option);
 
 /* Look for tcp options. Normally only called on SYN and SYNACK packets.
 * But, this can also be called on packets in the established flow when
@@ -4450,34 +4469,40 @@ static u32 tcp_tsval_replay(const struct sock *sk)
 	return inet_csk(sk)->icsk_rto * 1200 / HZ;
 }
 
-static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
+static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk,
+						     const struct sk_buff *skb)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct tcphdr *th = tcp_hdr(skb);
-	u32 seq = TCP_SKB_CB(skb)->seq;
+	SKB_DR_INIT(reason, TCP_RFC7323_PAWS);
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
+	u32 seq = TCP_SKB_CB(skb)->seq;
 
-	return	/* 1. Pure ACK with correct sequence number. */
-		(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
+	/* 1. Is this not a pure ACK ? */
+	if (!th->ack || seq != TCP_SKB_CB(skb)->end_seq)
+		return reason;
 
-		/* 2. ... and duplicate ACK. */
-		ack == tp->snd_una &&
+	/* 2. Is its sequence not the expected one ? */
+	if (seq != tp->rcv_nxt)
+		return before(seq, tp->rcv_nxt) ?
+			SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK :
+			reason;
 
-		/* 3. ... and does not update window. */
-		!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
+	/* 3. Is this not a duplicate ACK ? */
+	if (ack != tp->snd_una)
+		return reason;
 
-		/* 4. ... and sits in replay window. */
-		(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <=
-		tcp_tsval_replay(sk);
-}
+	/* 4. Is this updating the window ? */
+	if (tcp_may_update_window(tp, ack, seq, ntohs(th->window) <<
+				  tp->rx_opt.snd_wscale))
+		return reason;
 
-static inline bool tcp_paws_discard(const struct sock *sk,
-				    const struct sk_buff *skb)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
+	/* 5. Is this not in the replay window ? */
+	if ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) >
+	    tcp_tsval_replay(sk))
+		return reason;
 
-	return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
-	       !tcp_disordered_ack(sk, skb);
+	return 0;
 }
 
 /* Check segment sequence number for validity.
@@ -4518,7 +4543,7 @@ void tcp_done_with_error(struct sock *sk, int err)
 	if (!sock_flag(sk, SOCK_DEAD))
 		sk_error_report(sk);
 }
-EXPORT_SYMBOL(tcp_done_with_error);
+EXPORT_IPV6_MOD(tcp_done_with_error);
 
 /* When we get a reset we do this. */
 void tcp_reset(struct sock *sk, struct sk_buff *skb)
@@ -4964,7 +4989,7 @@ static void tcp_ofo_queue(struct sock *sk)
 		tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
 		fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
 		if (!eaten)
-			__skb_queue_tail(&sk->sk_receive_queue, skb);
+			tcp_add_receive_queue(sk, skb);
 		else
 			kfree_skb_partial(skb, fragstolen);
 
@@ -5007,7 +5032,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	bool fragstolen;
 
 	tcp_save_lrcv_flowlabel(sk, skb);
-	tcp_ecn_check_ce(sk, skb);
+	tcp_data_ecn_check(sk, skb);
 
 	if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
@@ -5143,6 +5168,7 @@ end:
 		skb_condense(skb);
 		skb_set_owner_r(skb, sk);
 	}
+	tcp_rcvbuf_grow(sk);
 }
 
 static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
@@ -5156,7 +5182,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
 				 skb, fragstolen)) ? 1 : 0;
 	tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
 	if (!eaten) {
-		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		tcp_add_receive_queue(sk, skb);
 		skb_set_owner_r(skb, sk);
 	}
 	return eaten;
@@ -5239,7 +5265,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 			__kfree_skb(skb);
 			return;
 		}
-		skb_dst_drop(skb);
+		tcp_cleanup_skb(skb);
 		__skb_pull(skb, tcp_hdr(skb)->doff * 4);
 
 	reason = SKB_DROP_REASON_NOT_SPECIFIED;
@@ -5949,23 +5975,35 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 	SKB_DR(reason);
 
 	/* RFC1323: H1. Apply PAWS check first. */
-	if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
-	    tp->rx_opt.saw_tstamp &&
-	    tcp_paws_discard(sk, skb)) {
-		if (!th->rst) {
-			if (unlikely(th->syn))
-				goto syn_challenge;
-			NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
-			if (!tcp_oow_rate_limited(sock_net(sk), skb,
-						  LINUX_MIB_TCPACKSKIPPEDPAWS,
-						  &tp->last_oow_ack_time))
-				tcp_send_dupack(sk, skb);
-			SKB_DR_SET(reason, TCP_RFC7323_PAWS);
-			goto discard;
-		}
-		/* Reset is accepted even if it did not pass PAWS. */
+	if (!tcp_fast_parse_options(sock_net(sk), skb, th, tp) ||
+	    !tp->rx_opt.saw_tstamp ||
+	    tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW))
+		goto step1;
+
+	reason = tcp_disordered_ack_check(sk, skb);
+	if (!reason)
+		goto step1;
+	/* Reset is accepted even if it did not pass PAWS. */
+	if (th->rst)
+		goto step1;
+	if (unlikely(th->syn))
+		goto syn_challenge;
+
+	/* Old ACK are common, increment PAWS_OLD_ACK
+	 * and do not send a dupack.
+	 */
+	if (reason == SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWS_OLD_ACK);
+		goto discard;
 	}
+	NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+	if (!tcp_oow_rate_limited(sock_net(sk), skb,
+				  LINUX_MIB_TCPACKSKIPPEDPAWS,
+				  &tp->last_oow_ack_time))
+		tcp_send_dupack(sk, skb);
+	goto discard;
+
+step1:
 	/* Step 1: check sequence number */
 	reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
 	if (reason) {
@@ -6132,6 +6170,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
 	    TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
 	    !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
 		int tcp_header_len = tp->tcp_header_len;
+		s32 delta = 0;
+		int flag = 0;
 
 		/* Timestamp header prediction: tcp_header_len
 		 * is automatically equal to th->doff*4 due to pred_flags
@@ -6144,8 +6184,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
 			if (!tcp_parse_aligned_timestamp(tp, th))
 				goto slow_path;
 
+			delta = tp->rx_opt.rcv_tsval -
+				tp->rx_opt.ts_recent;
 			/* If PAWS failed, check it more carefully in slow path */
-			if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
+			if (delta < 0)
 				goto slow_path;
 
 			/* DO NOT update ts_recent here, if checksum fails
@@ -6165,12 +6207,13 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
 			if (tcp_header_len ==
 			    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
 			    tp->rcv_nxt == tp->rcv_wup)
-				tcp_store_ts_recent(tp);
+				flag |= __tcp_replace_ts_recent(tp,
								delta);
 
 			/* We know that such packets are checksummed
 			 * on entry.
 			 */
-			tcp_ack(sk, skb, 0);
+			tcp_ack(sk, skb, flag);
 			__kfree_skb(skb);
 			tcp_data_snd_check(sk);
 			/* When receiving pure ack in fast path, update
@@ -6201,14 +6244,15 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
 			if (tcp_header_len ==
 			    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
 			    tp->rcv_nxt == tp->rcv_wup)
-				tcp_store_ts_recent(tp);
+				flag |= __tcp_replace_ts_recent(tp,
								delta);
 
 			tcp_rcv_rtt_measure_ts(sk, skb);
 
 			NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
 
 			/* Bulk data transfer: receiver */
-			skb_dst_drop(skb);
+			tcp_cleanup_skb(skb);
 			__skb_pull(skb, tcp_header_len);
 			eaten = tcp_queue_rcv(sk, skb, &fragstolen);
 
@@ -6216,7 +6260,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
 
 			if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
 				/* Well, only one small jumplet in fast path... */
-				tcp_ack(sk, skb, FLAG_DATA);
+				tcp_ack(sk, skb, flag | FLAG_DATA);
 				tcp_data_snd_check(sk);
 				if (!inet_csk_ack_scheduled(sk))
 					goto no_ack;
@@ -6276,7 +6320,7 @@ csum_error:
 discard:
 	tcp_drop_reason(sk, skb, reason);
 }
-EXPORT_SYMBOL(tcp_rcv_established);
+EXPORT_IPV6_MOD(tcp_rcv_established);
 
 void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
 {
@@ -6329,7 +6373,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
 	tp->lsndtime = tcp_jiffies32;
 
 	if (sock_flag(sk, SOCK_KEEPOPEN))
-		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+		tcp_reset_keepalive_timer(sk, keepalive_time_when(tp));
 
 	if (!tp->rx_opt.snd_wscale)
 		__tcp_fast_path_on(tp, tp->snd_wnd);
@@ -6452,9 +6496,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		    after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
 			/* Previous FIN/ACK or RST/ACK might be ignored. */
 			if (icsk->icsk_retransmits == 0)
-				inet_csk_reset_xmit_timer(sk,
-						ICSK_TIME_RETRANS,
-						TCP_TIMEOUT_MIN, TCP_RTO_MAX);
+				tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						     TCP_TIMEOUT_MIN, false);
 			SKB_DR_SET(reason, TCP_INVALID_ACK_SEQUENCE);
 			goto reset_and_undo;
 		}
@@ -6569,8 +6612,8 @@ consume:
		 */
 		inet_csk_schedule_ack(sk);
 		tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
-					  TCP_DELACK_MAX, TCP_RTO_MAX);
+		tcp_reset_xmit_timer(sk, ICSK_TIME_DACK,
+				     TCP_DELACK_MAX, false);
 		goto consume;
 	}
 	tcp_send_ack(sk);
@@ -6788,10 +6831,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
 			     sk->sk_state != TCP_FIN_WAIT1);
 
-		if (!tcp_check_req(sk, skb, req, true, &req_stolen)) {
-			SKB_DR_SET(reason, TCP_FASTOPEN);
+		SKB_DR_SET(reason, TCP_FASTOPEN);
+		if (!tcp_check_req(sk, skb, req, true, &req_stolen, &reason))
 			goto discard;
-		}
 	}
 
 	if (!th->ack && !th->rst && !th->syn) {
@@ -6827,6 +6869,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		if (!tp->srtt_us)
 			tcp_synack_rtt_meas(sk, req);
 
+		if (tp->rx_opt.tstamp_ok)
+			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+
 		if (req) {
 			tcp_rcv_synrecv_state_fastopen(sk);
 		} else {
@@ -6852,9 +6897,6 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 		tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
 		tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
 
-		if (tp->rx_opt.tstamp_ok)
-			tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
-
 		if (!inet_csk(sk)->icsk_ca_ops->cong_control)
 			tcp_update_pacing_rate(sk);
 
@@ -6904,7 +6946,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
 
 		tmo = tcp_fin_time(sk);
 		if (tmo > TCP_TIMEWAIT_LEN) {
-			inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+			tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
 		} else if (th->fin || sock_owned_by_user(sk)) {
|| sock_owned_by_user(sk)) { /* Bad case. We could lose such FIN otherwise. * It is not a big problem, but it looks confusing @@ -6912,7 +6954,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * if it spins in bh_lock_sock(), but it is really * marginal case. */ - inet_csk_reset_keepalive_timer(sk, tmo); + tcp_reset_keepalive_timer(sk, tmo); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto consume; @@ -6990,7 +7032,7 @@ consume: __kfree_skb(skb); return 0; } -EXPORT_SYMBOL(tcp_rcv_state_process); +EXPORT_IPV6_MOD(tcp_rcv_state_process); static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) { @@ -7057,6 +7099,7 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = 0; + tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; @@ -7172,7 +7215,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, return mss; } -EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss); +EXPORT_IPV6_MOD_GPL(tcp_get_syncookie_mss); int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, @@ -7353,4 +7396,4 @@ drop: tcp_listendrop(sk); return 0; } -EXPORT_SYMBOL(tcp_conn_request); +EXPORT_IPV6_MOD(tcp_conn_request); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c26f6c4b7bb4..6a14f9e6fef6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -66,6 +66,7 @@ #include <net/transp_v6.h> #include <net/ipv6.h> #include <net/inet_common.h> +#include <net/inet_ecn.h> #include <net/timewait_sock.h> #include <net/xfrm.h> #include <net/secure_seq.h> @@ -92,7 +93,6 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, #endif struct inet_hashinfo tcp_hashinfo; -EXPORT_SYMBOL(tcp_hashinfo); static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), @@ -120,6 +120,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); struct tcp_sock *tp = tcp_sk(sk); int ts_recent_stamp; + u32 reuse_thresh; if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) reuse = 0; @@ -162,9 +163,10 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) and use initial timestamp retrieved from peer table. */ ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); + reuse_thresh = READ_ONCE(tw->tw_entry_stamp) + + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay); if (ts_recent_stamp && - (!twp || (reuse && time_after32(ktime_get_seconds(), - ts_recent_stamp)))) { + (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) { /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk * and releasing the bucket lock. */ @@ -197,7 +199,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) return 0; } -EXPORT_SYMBOL_GPL(tcp_twsk_unique); +EXPORT_IPV6_MOD_GPL(tcp_twsk_unique); static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -357,7 +359,7 @@ failure: inet->inet_dport = 0; return err; } -EXPORT_SYMBOL(tcp_v4_connect); +EXPORT_IPV6_MOD(tcp_v4_connect); /* * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. 
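The tcp_twsk_unique() hunk above replaces the old one-second ts_recent_stamp comparison with a millisecond threshold: the time-wait port may be reused only once tw_entry_stamp plus the new tcp_tw_reuse_delay sysctl has elapsed. The following is a minimal userspace sketch of that gating logic, not the kernel code itself; tw_reuse_allowed() and its parameters are illustrative stand-ins for the fields read in the patch, and time_after32() is re-defined here with the kernel's wrap-safe semantics.

#include <stdint.h>
#include <stdbool.h>

/* Wrap-safe 32-bit "a is after b", matching the kernel's time_after32(). */
#define time_after32(a, b)	((int32_t)((uint32_t)(b) - (uint32_t)(a)) < 0)

/* Allow port reuse only once the socket has spent at least reuse_delay_ms
 * in TIME-WAIT; all arguments are u32 millisecond timestamps such as the
 * value returned by tcp_clock_ms() in the patch.
 */
static bool tw_reuse_allowed(uint32_t now_ms, uint32_t entry_stamp_ms,
			     uint32_t reuse_delay_ms)
{
	uint32_t reuse_thresh = entry_stamp_ms + reuse_delay_ms;

	return time_after32(now_ms, reuse_thresh);
}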
@@ -398,7 +400,7 @@ void tcp_v4_mtu_reduced(struct sock *sk) tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ } -EXPORT_SYMBOL(tcp_v4_mtu_reduced); +EXPORT_IPV6_MOD(tcp_v4_mtu_reduced); static void do_redirect(struct sk_buff *skb, struct sock *sk) { @@ -432,7 +434,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) } reqsk_put(req); } -EXPORT_SYMBOL(tcp_req_err); +EXPORT_IPV6_MOD(tcp_req_err); /* TCP-LD (RFC 6069) logic */ void tcp_ld_RTO_revert(struct sock *sk, u32 seq) @@ -456,15 +458,14 @@ void tcp_ld_RTO_revert(struct sock *sk, u32 seq) icsk->icsk_backoff--; icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; - icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk)); tcp_mstamp_refresh(tp); delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); if (remaining > 0) { - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - remaining, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false); } else { /* RTO revert clocked out retransmission. * Will retransmit now. @@ -472,7 +473,7 @@ void tcp_ld_RTO_revert(struct sock *sk, u32 seq) tcp_retransmit_timer(sk); } } -EXPORT_SYMBOL(tcp_ld_RTO_revert); +EXPORT_IPV6_MOD(tcp_ld_RTO_revert); /* * This routine is called by the ICMP module when it gets some @@ -494,14 +495,14 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); - struct tcp_sock *tp; + struct net *net = dev_net_rcu(skb->dev); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; - struct sock *sk; struct request_sock *fastopen; + struct tcp_sock *tp; u32 seq, snd_una; + struct sock *sk; int err; - struct net *net = dev_net(skb->dev); sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, iph->daddr, th->dest, iph->saddr, @@ -674,7 +675,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); } -EXPORT_SYMBOL(tcp_v4_send_check); +EXPORT_IPV6_MOD(tcp_v4_send_check); #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) @@ -786,7 +787,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); - net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); + net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) @@ -887,7 +888,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != offsetof(struct inet_timewait_sock, tw_bound_dev_if)); - arg.tos = ip_hdr(skb)->tos; + /* ECN bits of TW reset are cleared */ + arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK; arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? 
sk : NULL); local_bh_disable(); local_lock_nested_bh(&ipv4_tcp_sk.bh_lock); @@ -1033,11 +1035,21 @@ static void tcp_v4_send_ack(const struct sock *sk, local_bh_enable(); } -static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) +static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb, + enum tcp_tw_status tw_status) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); struct tcp_key key = {}; + u8 tos = tw->tw_tos; + + /* Clear only the ECN bits of TW ACKs for out-of-window or PAWS-rejected + * segments; keep the ECN bits of other TW ACKs so they are not placed in + * a different service queue (Classic rather than L4S). + */ + if (tw_status == TCP_TW_ACK_OOW) + tos &= ~INET_ECN_MASK; + #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; @@ -1081,7 +1093,7 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, &key, tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0, - tw->tw_tos, + tos, tw->tw_txhash); inet_twsk_put(tw); @@ -1151,14 +1163,15 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, key.type = TCP_KEY_MD5; } + /* Clear ECN bits of ACKs for out-of-window or PAWS-rejected segments */ tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), - READ_ONCE(req->ts_recent), + req->ts_recent, 0, &key, inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, - ip_hdr(skb)->tos, + ip_hdr(skb)->tos & ~INET_ECN_MASK, READ_ONCE(tcp_rsk(req)->txhash)); if (tcp_key_is_ao(&key)) kfree(key.traffic_key); @@ -1229,7 +1242,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) */ DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); -EXPORT_SYMBOL(tcp_md5_needed); +EXPORT_IPV6_MOD(tcp_md5_needed); static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) { @@ -1288,7 +1301,7 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, } return best_match; } -EXPORT_SYMBOL(__tcp_md5_do_lookup); +EXPORT_IPV6_MOD(__tcp_md5_do_lookup); static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, const union tcp_md5_addr *addr, @@ -1335,7 +1348,7 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); } -EXPORT_SYMBOL(tcp_v4_md5_lookup); +EXPORT_IPV6_MOD(tcp_v4_md5_lookup); static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) { @@ -1431,7 +1444,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, newkey, newkeylen, GFP_KERNEL); } -EXPORT_SYMBOL(tcp_md5_do_add); +EXPORT_IPV6_MOD(tcp_md5_do_add); int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, @@ -1463,7 +1476,7 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, key->flags, key->key, key->keylen, sk_gfp_mask(sk, GFP_ATOMIC)); } -EXPORT_SYMBOL(tcp_md5_key_copy); +EXPORT_IPV6_MOD(tcp_md5_key_copy); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags) @@ -1478,7 +1491,7 @@ int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, kfree_rcu(key, rcu); return 0; } -EXPORT_SYMBOL(tcp_md5_do_del); +EXPORT_IPV6_MOD(tcp_md5_do_del); void tcp_clear_md5_list(struct sock *sk) {
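Both tcp_ipv4.c hunks above mask the ECN codepoint out of the TOS byte before ACKing out-of-window or PAWS-rejected segments, so a reply to dubious input cannot be steered into an L4S queue by stale ECN marking. A small self-contained sketch of the masking, assuming INET_ECN_MASK is the low two ECN bits (0x3) as defined in include/net/inet_ecn.h; reply_tos() is an illustrative helper, not a kernel function.

#include <stdint.h>
#include <stdbool.h>

#define INET_ECN_MASK	3	/* low two bits of the TOS byte carry ECN */

/* Keep the DSCP part of the TOS byte but drop the ECN bits when the
 * segment being answered is out of window or failed PAWS.
 */
static uint8_t reply_tos(uint8_t tos, bool oow_or_paws_reject)
{
	if (oow_or_paws_reject)
		tos &= (uint8_t)~INET_ECN_MASK;
	return tos;
}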
@@ -1657,7 +1670,7 @@ clear_hash_nostart: memset(md5_hash, 0, 16); return 1; } -EXPORT_SYMBOL(tcp_v4_md5_hash_skb); +EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb); #endif @@ -1730,7 +1743,7 @@ drop: tcp_listendrop(sk); return 0; } -EXPORT_SYMBOL(tcp_v4_conn_request); +EXPORT_IPV6_MOD(tcp_v4_conn_request); /* @@ -1768,10 +1781,6 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, newtp = tcp_sk(newsk); newinet = inet_sk(newsk); ireq = inet_rsk(req); - sk_daddr_set(newsk, ireq->ir_rmt_addr); - sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); - newsk->sk_bound_dev_if = ireq->ir_iif; - newinet->inet_saddr = ireq->ir_loc_addr; inet_opt = rcu_dereference(ireq->ireq_opt); RCU_INIT_POINTER(newinet->inet_opt, inet_opt); newinet->mc_index = inet_iif(skb); @@ -1854,7 +1863,7 @@ put_and_exit: tcp_done(newsk); goto exit; } -EXPORT_SYMBOL(tcp_v4_syn_recv_sock); +EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock); static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) { @@ -1965,7 +1974,7 @@ EXPORT_SYMBOL(tcp_v4_do_rcv); int tcp_v4_early_demux(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; @@ -2025,7 +2034,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, */ skb_condense(skb); - skb_dst_drop(skb); + tcp_cleanup_skb(skb); if (unlikely(tcp_checksum_complete(skb))) { bh_unlock_sock(sk); @@ -2055,7 +2064,8 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, !((TCP_SKB_CB(tail)->tcp_flags & TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) || ((TCP_SKB_CB(tail)->tcp_flags ^ - TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) || + TCP_SKB_CB(skb)->tcp_flags) & + (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || !tcp_skb_can_collapse_rx(tail, skb) || thtail->doff != th->doff || memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) @@ -2133,7 +2143,7 @@ no_coalesce: } return false; } -EXPORT_SYMBOL(tcp_add_backlog); +EXPORT_IPV6_MOD(tcp_add_backlog); int tcp_filter(struct sock *sk, struct sk_buff *skb) { @@ -2141,7 +2151,7 @@ int tcp_filter(struct sock *sk, struct sk_buff *skb) return sk_filter_trim_cap(sk, skb, th->doff * 4); } -EXPORT_SYMBOL(tcp_filter); +EXPORT_IPV6_MOD(tcp_filter); static void tcp_v4_restore_cb(struct sk_buff *skb) { @@ -2163,7 +2173,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); + TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = @@ -2176,8 +2186,9 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, int tcp_v4_rcv(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; + enum tcp_tw_status tw_status; int sdif = inet_sdif(skb); int dif = inet_iif(skb); const struct iphdr *iph; @@ -2269,7 +2280,8 @@ lookup: th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); - nsk = tcp_check_req(sk, skb, req, false, &req_stolen); + nsk = tcp_check_req(sk, skb, req, false, &req_stolen, + &drop_reason); } else { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } @@ -2404,7 +2416,10 @@ do_time_wait: inet_twsk_put(inet_twsk(sk)); goto csum_error; } - switch 
(tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { + + tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, + &drop_reason); + switch (tw_status) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, @@ -2425,7 +2440,8 @@ do_time_wait: /* to ACK */ fallthrough; case TCP_TW_ACK: - tcp_v4_timewait_ack(sk, skb); + case TCP_TW_ACK_OOW: + tcp_v4_timewait_ack(sk, skb, tw_status); break; case TCP_TW_RST: tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); @@ -2450,7 +2466,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) sk->sk_rx_dst_ifindex = skb->skb_iif; } } -EXPORT_SYMBOL(inet_sk_rx_dst_set); +EXPORT_IPV6_MOD(inet_sk_rx_dst_set); const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, @@ -2462,11 +2478,9 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, - .addr2sockaddr = inet_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct sockaddr_in), .mtu_reduced = tcp_v4_mtu_reduced, }; -EXPORT_SYMBOL(ipv4_specific); +EXPORT_IPV6_MOD(ipv4_specific); #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { @@ -2576,7 +2590,7 @@ void tcp_v4_destroy_sock(struct sock *sk) sk_sockets_allocated_dec(sk); } -EXPORT_SYMBOL(tcp_v4_destroy_sock); +EXPORT_IPV6_MOD(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ @@ -2812,7 +2826,7 @@ out: st->last_pos = *pos; return rc; } -EXPORT_SYMBOL(tcp_seq_start); +EXPORT_IPV6_MOD(tcp_seq_start); void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { @@ -2843,7 +2857,7 @@ out: st->last_pos = *pos; return rc; } -EXPORT_SYMBOL(tcp_seq_next); +EXPORT_IPV6_MOD(tcp_seq_next); void tcp_seq_stop(struct seq_file *seq, void *v) { @@ -2861,7 +2875,7 @@ void tcp_seq_stop(struct seq_file *seq, void *v) break; } } -EXPORT_SYMBOL(tcp_seq_stop); +EXPORT_IPV6_MOD(tcp_seq_stop); static void get_openreq4(const struct request_sock *req, struct seq_file *f, int i) @@ -2910,10 +2924,10 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk_timeout(icsk); } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk_timeout(icsk); } else if (timer_pending(&sk->sk_timer)) { timer_active = 2; timer_expires = sk->sk_timer.expires; @@ -3457,6 +3471,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; net->ipv4.sysctl_tcp_tw_reuse = 2; + net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC; net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); @@ -3480,8 +3495,8 @@ static int __net_init tcp_sk_init(struct net *net) * which are too large can cause TCP streams to be bursty. */ net->ipv4.sysctl_tcp_tso_win_divisor = 3; - /* Default TSQ limit of 16 TSO segments */ - net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536; + /* Default TSQ limit of 4 MB */ + net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20; /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. 
*/ net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX; @@ -3530,6 +3545,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_pingpong_thresh = 1; net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); + net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; return 0; } diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 95669935494e..4251670e328c 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -170,7 +170,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, bool reclaim = false; spin_lock_bh(&tcp_metrics_lock); - net = dev_net(dst->dev); + net = dev_net_rcu(dst->dev); /* While waiting for the spin-lock the cache might have been populated * with this entry and so we have to check again. @@ -273,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, return NULL; } - net = dev_net(dst->dev); + net = dev_net_rcu(dst->dev); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); @@ -318,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, else return NULL; - net = dev_net(dst->dev); + net = dev_net_rcu(dst->dev); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7121d8573928..43d7852ce07e 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -44,7 +44,7 @@ tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw, /* Send ACK. Note, we do not put the bucket, * it will be released by caller. */ - return TCP_TW_ACK; + return TCP_TW_ACK_OOW; } /* We are rate-limiting, so just release the tw sock and drop skb. */ @@ -97,7 +97,8 @@ static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq, */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, - const struct tcphdr *th, u32 *tw_isn) + const struct tcphdr *th, u32 *tw_isn, + enum skb_drop_reason *drop_reason) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); @@ -157,8 +158,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, rcv_nxt); if (tmp_opt.saw_tstamp) { + u64 ts = tcp_clock_ms(); + + WRITE_ONCE(tw->tw_entry_stamp, ts); WRITE_ONCE(tcptw->tw_ts_recent_stamp, - ktime_get_seconds()); + div_u64(ts, MSEC_PER_SEC)); WRITE_ONCE(tcptw->tw_ts_recent, tmp_opt.rcv_tsval); } @@ -242,8 +246,10 @@ kill: return TCP_TW_SYN; } - if (paws_reject) - __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); + if (paws_reject) { + *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; + __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED); + } if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. 
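In the tcp_timewait_state_process() hunk above, a single tcp_clock_ms() read now refreshes both the millisecond tw_entry_stamp and the second-granularity tw_ts_recent_stamp. A sketch of that derivation, assuming a monotonic 64-bit millisecond clock; tw_refresh_stamps() is an illustrative helper, not the kernel API.

#include <stdint.h>

#define MSEC_PER_SEC	1000

/* One clock read feeds both stamps so they stay coherent: the entry stamp
 * keeps millisecond resolution for the tcp_tw_reuse_delay check, while
 * ts_recent_stamp keeps its historical one-second granularity.
 */
static void tw_refresh_stamps(uint64_t now_ms, uint64_t *entry_stamp_ms,
			      uint64_t *ts_recent_stamp_s)
{
	*entry_stamp_ms = now_ms;
	*ts_recent_stamp_s = now_ms / MSEC_PER_SEC;	/* div_u64() in the patch */
}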
@@ -261,7 +267,7 @@ kill: inet_twsk_put(tw); return TCP_TW_SUCCESS; } -EXPORT_SYMBOL(tcp_timewait_state_process); +EXPORT_IPV6_MOD(tcp_timewait_state_process); static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw) { @@ -316,6 +322,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_mark = sk->sk_mark; tw->tw_priority = READ_ONCE(sk->sk_priority); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; + /* refreshed when we enter true TIME-WAIT state */ + tw->tw_entry_stamp = tcp_time_stamp_ms(tp); tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; tcptw->tw_rcv_wnd = tcp_receive_window(tp); @@ -393,7 +401,7 @@ void tcp_twsk_destructor(struct sock *sk) #endif tcp_ao_destroy_sock(sk, true); } -EXPORT_SYMBOL_GPL(tcp_twsk_destructor); +EXPORT_IPV6_MOD_GPL(tcp_twsk_destructor); void tcp_twsk_purge(struct list_head *net_exit_list) { @@ -452,12 +460,13 @@ void tcp_openreq_init_rwin(struct request_sock *req, rcv_wnd); ireq->rcv_wscale = rcv_wscale; } -EXPORT_SYMBOL(tcp_openreq_init_rwin); static void tcp_ecn_openreq_child(struct tcp_sock *tp, const struct request_sock *req) { - tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; + tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? + TCP_ECN_MODE_RFC3168 : + TCP_ECN_DISABLED); } void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) @@ -487,7 +496,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) tcp_set_ca_state(sk, TCP_CA_Open); } -EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); +EXPORT_IPV6_MOD_GPL(tcp_ca_openreq_child); static void smc_check_reset_syn_req(const struct tcp_sock *oldtp, struct request_sock *req, @@ -561,8 +570,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); if (sock_flag(newsk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(newsk, - keepalive_time_when(newtp)); + tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; newtp->rx_opt.sack_ok = ireq->sack_ok; @@ -582,7 +590,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, if (newtp->rx_opt.tstamp_ok) { newtp->tcp_usec_ts = treq->req_usec_ts; - newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent); + newtp->rx_opt.ts_recent = req->ts_recent; newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { @@ -654,12 +662,14 @@ EXPORT_SYMBOL(tcp_create_openreq_child); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - bool fastopen, bool *req_stolen) + bool fastopen, bool *req_stolen, + enum skb_drop_reason *drop_reason) { struct tcp_options_received tmp_opt; struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); + bool tsecr_reject = false; bool paws_reject = false; bool own_req; @@ -668,9 +678,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { - tmp_opt.ts_recent = READ_ONCE(req->ts_recent); - if (tmp_opt.rcv_tsecr) + tmp_opt.ts_recent = req->ts_recent; + if (tmp_opt.rcv_tsecr) { + if (inet_rsk(req)->tstamp_ok && !fastopen) + tsecr_reject = !between(tmp_opt.rcv_tsecr, + tcp_rsk(req)->snt_tsval_first, + READ_ONCE(tcp_rsk(req)->snt_tsval_last)); tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; + } /* We do not store true stamp, but it is not required, * it can be 
estimated (approximately) * from another data. @@ -785,37 +800,34 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->snt_isn + 1)) return sk; - /* Also, it would be not so bad idea to check rcv_tsecr, which - * is essentially ACK extension and too early or too late values - * should cause reset in unsynchronized states. - */ - /* RFC793: "first check sequence number". */ - if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_nxt, - tcp_rsk(req)->rcv_nxt + - tcp_synack_window(req))) { + if (paws_reject || tsecr_reject || + !tcp_in_window(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + tcp_rsk(req)->rcv_nxt, + tcp_rsk(req)->rcv_nxt + + tcp_synack_window(req))) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST) && !tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); - if (paws_reject) + if (paws_reject) { + SKB_DR_SET(*drop_reason, TCP_RFC7323_PAWS); NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + } else if (tsecr_reject) { + SKB_DR_SET(*drop_reason, TCP_RFC7323_TSECR); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TSECRREJECTED); + } else { + SKB_DR_SET(*drop_reason, TCP_OVERWINDOW); + } return NULL; } /* In sequence, PAWS is OK. */ - /* TODO: We probably should defer ts_recent change once - * we take ownership of @req. - */ - if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) - WRITE_ONCE(req->ts_recent, tmp_opt.rcv_tsval); - if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { /* Truncate SYN, it is out of window starting at tcp_rsk(req)->rcv_isn + 1. */ @@ -864,6 +876,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!child) goto listen_overflow; + if (own_req && tmp_opt.saw_tstamp && + !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) + tcp_sk(child)->rx_opt.ts_recent = tmp_opt.rcv_tsval; + if (own_req && rsk_drop_req(req)) { reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); @@ -876,6 +892,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: + SKB_DR_SET(*drop_reason, TCP_LISTEN_OVERFLOW); if (sk != req->rsk_listener) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); @@ -905,7 +922,7 @@ embryonic_reset: } return NULL; } -EXPORT_SYMBOL(tcp_check_req); +EXPORT_IPV6_MOD(tcp_check_req); /* * Queue segment on the new socket if the new socket is active, @@ -947,4 +964,4 @@ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, sock_put(child); return reason; } -EXPORT_SYMBOL(tcp_child_process); +EXPORT_IPV6_MOD(tcp_child_process); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 2308665b51c5..d293087b426d 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -13,12 +13,15 @@ #include <net/tcp.h> #include <net/protocol.h> -static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq, +static void tcp_gso_tstamp(struct sk_buff *skb, struct sk_buff *gso_skb, unsigned int seq, unsigned int mss) { + u32 flags = skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP; + u32 ts_seq = skb_shinfo(gso_skb)->tskey; + while (skb) { if (before(ts_seq, seq + mss)) { - skb_shinfo(skb)->tx_flags |= SKBTX_SW_TSTAMP; + skb_shinfo(skb)->tx_flags |= flags; skb_shinfo(skb)->tskey = ts_seq; return; } @@ -139,6 +142,7 
@@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, struct sk_buff *gso_skb = skb; __sum16 newcheck; bool ooo_okay, copy_destructor; + bool ecn_cwr_mask; __wsum delta; th = tcp_hdr(skb); @@ -193,11 +197,13 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th = tcp_hdr(skb); seq = ntohl(th->seq); - if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP)) - tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss); + if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP)) + tcp_gso_tstamp(segs, gso_skb, seq, mss); newcheck = ~csum_fold(csum_add(csum_unfold(th->check), delta)); + ecn_cwr_mask = !!(skb_shinfo(gso_skb)->gso_type & SKB_GSO_TCP_ACCECN); + while (skb->next) { th->fin = th->psh = 0; th->check = newcheck; @@ -217,7 +223,8 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, th = tcp_hdr(skb); th->seq = htonl(seq); - th->cwr = 0; + + th->cwr &= ecn_cwr_mask; } /* Following permits TCP Small Queues to work well with GSO : @@ -325,7 +332,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, th2 = tcp_hdr(p); flush = (__force int)(flags & TCP_FLAG_CWR); flush |= (__force int)((flags ^ tcp_flag_word(th2)) & - ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH)); + ~(TCP_FLAG_FIN | TCP_FLAG_PSH)); flush |= (__force int)(th->ack_seq ^ th2->ack_seq); for (i = sizeof(*th); i < thlen; i += 4) flush |= *(u32 *)((u8 *)th + i) ^ @@ -401,7 +408,7 @@ void tcp_gro_complete(struct sk_buff *skb) shinfo->gso_segs = NAPI_GRO_CB(skb)->count; if (th->cwr) - shinfo->gso_type |= SKB_GSO_TCP_ECN; + shinfo->gso_type |= SKB_GSO_TCP_ACCECN; } EXPORT_SYMBOL(tcp_gro_complete); @@ -425,14 +432,14 @@ static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, inet_get_iif_sdif(skb, &iif, &sdif); iph = skb_gro_network_header(skb); - net = dev_net(skb->dev); + net = dev_net_rcu(skb->dev); sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, iph->saddr, th->source, iph->daddr, ntohs(th->dest), iif, sdif); NAPI_GRO_CB(skb)->is_flist = !sk; if (sk) - sock_put(sk); + sock_gen_put(sk); } INDIRECT_CALLABLE_SCOPE diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0e5b9a654254..3ac8d2d17e1f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -250,7 +250,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, WRITE_ONCE(*__window_clamp, min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp)); } -EXPORT_SYMBOL(tcp_select_initial_window); +EXPORT_IPV6_MOD(tcp_select_initial_window); /* Chose a new window to advertise, update state in tcp_sock for the * socket, and return result with RFC1323 scaling applied. The return @@ -265,11 +265,14 @@ static u16 tcp_select_window(struct sock *sk) u32 cur_win, new_win; /* Make the window 0 if we failed to queue the data because we - * are out of memory. The window is temporary, so we don't store - * it on the socket. + * are out of memory. 
*/ - if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) + if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) { + tp->pred_flags = 0; + tp->rcv_wnd = 0; + tp->rcv_wup = tp->rcv_nxt; return 0; + } cur_win = tcp_receive_window(tp); new_win = __tcp_select_window(sk); @@ -322,7 +325,7 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) const struct tcp_sock *tp = tcp_sk(sk); TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; - if (!(tp->ecn_flags & TCP_ECN_OK)) + if (tcp_ecn_disabled(tp)) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; else if (tcp_ca_needs_ecn(sk) || tcp_bpf_ca_needs_ecn(sk)) @@ -348,7 +351,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) if (use_ecn) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; - tp->ecn_flags = TCP_ECN_OK; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) INET_ECN_xmit(sk); } @@ -378,7 +381,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - if (tp->ecn_flags & TCP_ECN_OK) { + if (tcp_ecn_mode_rfc3168(tp)) { /* Not-retransmitted data segment: set ECT and inject CWR. */ if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { @@ -400,7 +403,7 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. */ -static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) +static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u16 flags) { skb->ip_summed = CHECKSUM_PARTIAL; @@ -522,6 +525,7 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, sock_owned_by_me(sk); sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; } @@ -567,6 +571,7 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, sock_owned_by_me(sk); sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; } @@ -938,7 +943,13 @@ static unsigned int tcp_synack_options(const struct sock *sk, opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) + tcp_rsk(req)->ts_off; - opts->tsecr = READ_ONCE(req->ts_recent); + if (!tcp_rsk(req)->snt_tsval_first) { + if (!opts->tsval) + opts->tsval = ~0U; + tcp_rsk(req)->snt_tsval_first = opts->tsval; + } + WRITE_ONCE(tcp_rsk(req)->snt_tsval_last, opts->tsval); + opts->tsecr = req->ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(ireq->sack_ok)) { @@ -1168,7 +1179,7 @@ void tcp_release_cb(struct sock *sk) if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk)) tcp_send_ack(sk); } -EXPORT_SYMBOL(tcp_release_cb); +EXPORT_IPV6_MOD(tcp_release_cb); void __init tcp_tasklet_init(void) { @@ -1384,7 +1395,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, th->seq = htonl(tcb->seq); th->ack_seq = htonl(rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | - tcb->tcp_flags); + (tcb->tcp_flags & TCPHDR_FLAGS_MASK)); th->check = 0; th->urg_ptr = 0; @@ -1605,8 +1616,8 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, struct sk_buff *buff; int old_factor; long limit; + u16 flags; int nlen; - u8 flags; if (WARN_ON(len > skb->len)) return -EINVAL; @@ -1780,7 +1791,7 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu) return __tcp_mtu_to_mss(sk, pmtu) - (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); } -EXPORT_SYMBOL(tcp_mtu_to_mss); +EXPORT_IPV6_MOD(tcp_mtu_to_mss); /* Inverse of 
above */ int tcp_mss_to_mtu(struct sock *sk, int mss) @@ -1810,7 +1821,6 @@ void tcp_mtup_init(struct sock *sk) if (icsk->icsk_mtup.enabled) icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } -EXPORT_SYMBOL(tcp_mtup_init); /* This function synchronize snd mss to current pmtu/exthdr set. @@ -1854,7 +1864,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) return mss_now; } -EXPORT_SYMBOL(tcp_sync_mss); +EXPORT_IPV6_MOD(tcp_sync_mss); /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. @@ -2161,7 +2171,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, { int nlen = skb->len - len; struct sk_buff *buff; - u8 flags; + u16 flags; /* All of a TSO frame must be composed of paged data. */ DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len); @@ -2609,9 +2619,8 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, limit = max_t(unsigned long, 2 * skb->truesize, READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift)); - if (sk->sk_pacing_status == SK_PACING_NONE) - limit = min_t(unsigned long, limit, - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes)); + limit = min_t(unsigned long, limit, + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes)); limit <<= factor; if (static_branch_unlikely(&tcp_tx_delay_enabled) && @@ -2908,7 +2917,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); - tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, true); return true; } @@ -3542,8 +3551,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) } if (rearm_timer) tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); + inet_csk(sk)->icsk_rto, true); } /* We allow to exceed memory limits for FIN packets to expedite @@ -3850,7 +3858,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, return skb; } -EXPORT_SYMBOL(tcp_make_synack); +EXPORT_IPV6_MOD(tcp_make_synack); static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst) { @@ -4160,8 +4168,8 @@ int tcp_connect(struct sock *sk) TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, false); return 0; } EXPORT_SYMBOL(tcp_connect); @@ -4170,7 +4178,7 @@ u32 tcp_delack_max(const struct sock *sk) { u32 delack_from_rto_min = max(tcp_rto_min(sk), 2) - 1; - return min(inet_csk(sk)->icsk_delack_max, delack_from_rto_min); + return min(READ_ONCE(inet_csk(sk)->icsk_delack_max), delack_from_rto_min); } /* Send out a delayed ack, the caller does the policy checking @@ -4216,22 +4224,21 @@ void tcp_send_delayed_ack(struct sock *sk) /* Use new timeout only if there wasn't a older one earlier. */ if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { /* If delack timer is about to expire, send ACK now. 
*/ - if (time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { + if (time_before_eq(icsk_delack_timeout(icsk), jiffies + (ato >> 2))) { tcp_send_ack(sk); return; } - if (!time_before(timeout, icsk->icsk_ack.timeout)) - timeout = icsk->icsk_ack.timeout; + if (!time_before(timeout, icsk_delack_timeout(icsk))) + timeout = icsk_delack_timeout(icsk); } smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); - icsk->icsk_ack.timeout = timeout; sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); } /* This routine sends an ack and also updates the window. */ -void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) +void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags) { struct sk_buff *buff; @@ -4250,17 +4257,17 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) unsigned long delay; delay = TCP_DELACK_MAX << icsk->icsk_ack.retry; - if (delay < TCP_RTO_MAX) + if (delay < tcp_rto_max(sk)) icsk->icsk_ack.retry++; inet_csk_schedule_ack(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, false); return; } /* Reserve space for headers and prepare control bits. */ skb_reserve(buff, MAX_TCP_HEADER); - tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK); + tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK | flags); /* We do not want pure acks influencing TCP Small Queues or fq/pacing * too much. @@ -4275,7 +4282,7 @@ EXPORT_SYMBOL_GPL(__tcp_send_ack); void tcp_send_ack(struct sock *sk) { - __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt); + __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt, 0); } /* This routine sends a packet with an out of date sequence @@ -4390,7 +4397,7 @@ void tcp_send_probe0(struct sock *sk) if (err <= 0) { if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2)) icsk->icsk_backoff++; - timeout = tcp_probe0_when(sk, TCP_RTO_MAX); + timeout = tcp_probe0_when(sk, tcp_rto_max(sk)); } else { /* If packet was not sent due to local congestion, * Let senders fight for local resources conservatively. @@ -4399,7 +4406,7 @@ void tcp_send_probe0(struct sock *sk) } timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout); - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, true); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) @@ -4427,4 +4434,4 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) } return res; } -EXPORT_SYMBOL(tcp_rtx_synack); +EXPORT_IPV6_MOD(tcp_rtx_synack); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b412ed88ccd9..e4c616bbd727 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -109,7 +109,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. */ - if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) + if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*tcp_rto_max(sk) || !do_reset) shift++; /* If some dubious ICMP arrived, penalize even more. 
*/ @@ -189,12 +189,12 @@ static unsigned int tcp_model_timeout(struct sock *sk, { unsigned int linear_backoff_thresh, timeout; - linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base); + linear_backoff_thresh = ilog2(tcp_rto_max(sk) / rto_base); if (boundary <= linear_backoff_thresh) timeout = ((2 << boundary) - 1) * rto_base; else timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + - (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + (boundary - linear_backoff_thresh) * tcp_rto_max(sk); return jiffies_to_msecs(timeout); } /** @@ -268,7 +268,7 @@ static int tcp_write_timeout(struct sock *sk) retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { - const bool alive = icsk->icsk_rto < TCP_RTO_MAX; + const bool alive = icsk->icsk_rto < tcp_rto_max(sk); retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || @@ -322,8 +322,9 @@ void tcp_delack_timer_handler(struct sock *sk) if (!(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) return; - if (time_after(icsk->icsk_ack.timeout, jiffies)) { - sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); + if (time_after(icsk_delack_timeout(icsk), jiffies)) { + sk_reset_timer(sk, &icsk->icsk_delack_timer, + icsk_delack_timeout(icsk)); return; } icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; @@ -416,7 +417,8 @@ static void tcp_probe_timer(struct sock *sk) } max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { - const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; + unsigned int rto_max = tcp_rto_max(sk); + const bool alive = inet_csk_rto_backoff(icsk, rto_max) < rto_max; max_probes = tcp_orphan_retries(sk, alive); if (!alive && icsk->icsk_backoff >= max_probes) @@ -481,8 +483,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) tcp_update_rto_stats(sk); if (!tp->retrans_stamp) tp->retrans_stamp = tcp_time_stamp_ts(tp); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - req->timeout << req->num_timeout, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + req->timeout << req->num_timeout, false); } static bool tcp_rtx_probe0_timed_out(const struct sock *sk, @@ -492,7 +494,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk, const struct inet_connection_sock *icsk = inet_csk(sk); u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); const struct tcp_sock *tp = tcp_sk(sk); - int timeout = TCP_RTO_MAX * 2; + int timeout = tcp_rto_max(sk) * 2; s32 rcv_delta; if (user_timeout) { @@ -508,7 +510,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk, * and tp->rcv_tstamp might very well have been written recently. * rcv_delta can thus be negative. */ - rcv_delta = icsk->icsk_timeout - tp->rcv_tstamp; + rcv_delta = icsk_timeout(icsk) - tp->rcv_tstamp; if (rcv_delta <= timeout) return false; @@ -626,9 +628,9 @@ void tcp_retransmit_timer(struct sock *sk) /* Retransmission failed because of local congestion, * Let senders fight for local resources conservatively. */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - TCP_RESOURCE_PROBE_INTERVAL, - TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + TCP_RESOURCE_PROBE_INTERVAL, + false); goto out; } @@ -665,7 +667,7 @@ out_reset_timer: icsk->icsk_backoff = 0; icsk->icsk_rto = clamp(__tcp_set_rto(tp), tcp_rto_min(sk), - TCP_RTO_MAX); + tcp_rto_max(sk)); } else if (sk->sk_state != TCP_SYN_SENT || tp->total_rto > READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) { @@ -673,10 +675,10 @@ out_reset_timer: * activated. 
*/ icsk->icsk_backoff++; - icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + icsk->icsk_rto = min(icsk->icsk_rto << 1, tcp_rto_max(sk)); } - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + tcp_clamp_rto_to_user_timeout(sk), false); if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0)) __sk_dst_reset(sk); @@ -684,7 +686,8 @@ out:; } /* Called with bottom-half processing disabled. - Called by tcp_write_timer() */ + * Called by tcp_write_timer() and tcp_release_cb(). + */ void tcp_write_timer_handler(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -694,11 +697,11 @@ void tcp_write_timer_handler(struct sock *sk) !icsk->icsk_pending) return; - if (time_after(icsk->icsk_timeout, jiffies)) { - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); + if (time_after(icsk_timeout(icsk), jiffies)) { + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, + icsk_timeout(icsk)); return; } - tcp_mstamp_refresh(tcp_sk(sk)); event = icsk->icsk_pending; @@ -749,7 +752,17 @@ void tcp_syn_ack_timeout(const struct request_sock *req) __NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS); } -EXPORT_SYMBOL(tcp_syn_ack_timeout); +EXPORT_IPV6_MOD(tcp_syn_ack_timeout); + +void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len) +{ + sk_reset_timer(sk, &sk->sk_timer, jiffies + len); +} + +static void tcp_delete_keepalive_timer(struct sock *sk) +{ + sk_stop_timer(sk, &sk->sk_timer); +} void tcp_set_keepalive(struct sock *sk, int val) { @@ -757,14 +770,13 @@ void tcp_set_keepalive(struct sock *sk, int val) return; if (val && !sock_flag(sk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); + tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); else if (!val) - inet_csk_delete_keepalive_timer(sk); + tcp_delete_keepalive_timer(sk); } -EXPORT_SYMBOL_GPL(tcp_set_keepalive); - +EXPORT_IPV6_MOD_GPL(tcp_set_keepalive); -static void tcp_keepalive_timer (struct timer_list *t) +static void tcp_keepalive_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); struct inet_connection_sock *icsk = inet_csk(sk); @@ -775,7 +787,7 @@ static void tcp_keepalive_timer (struct timer_list *t) bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. 
*/ - inet_csk_reset_keepalive_timer (sk, HZ/20); + tcp_reset_keepalive_timer(sk, HZ/20); goto out; } @@ -841,7 +853,7 @@ static void tcp_keepalive_timer (struct timer_list *t) } resched: - inet_csk_reset_keepalive_timer (sk, elapsed); + tcp_reset_keepalive_timer(sk, elapsed); goto out; death: @@ -884,11 +896,9 @@ void tcp_init_xmit_timers(struct sock *sk) { inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, &tcp_keepalive_timer); - hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED_SOFT); - tcp_sk(sk)->pacing_timer.function = tcp_pace_kick; + hrtimer_setup(&tcp_sk(sk)->pacing_timer, tcp_pace_kick, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_SOFT); - hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL_PINNED_SOFT); - tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick; + hrtimer_setup(&tcp_sk(sk)->compressed_ack_timer, tcp_compressed_ack_kick, CLOCK_MONOTONIC, + HRTIMER_MODE_REL_PINNED_SOFT); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 86d282618515..dde52b8050b8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -93,6 +93,7 @@ #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/slab.h> +#include <linux/sock_diag.h> #include <net/tcp_states.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> @@ -119,15 +120,15 @@ #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6_stubs.h> #endif +#include <net/rps.h> struct udp_table udp_table __read_mostly; -EXPORT_SYMBOL(udp_table); long sysctl_udp_mem[3] __read_mostly; -EXPORT_SYMBOL(sysctl_udp_mem); +EXPORT_IPV6_MOD(sysctl_udp_mem); atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp; -EXPORT_SYMBOL(udp_memory_allocated); +EXPORT_IPV6_MOD(udp_memory_allocated); DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc); @@ -352,7 +353,7 @@ fail_unlock: fail: return error; } -EXPORT_SYMBOL(udp_lib_get_port); +EXPORT_IPV6_MOD(udp_lib_get_port); int udp_v4_get_port(struct sock *sk, unsigned short snum) { @@ -418,7 +419,50 @@ u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, return __inet_ehashfn(laddr, lport, faddr, fport, udp_ehash_secret + net_hash_mix(net)); } -EXPORT_SYMBOL(udp_ehashfn); +EXPORT_IPV6_MOD(udp_ehashfn); + +/** + * udp4_lib_lookup1() - Simplified lookup using primary hash (destination port) + * @net: Network namespace + * @saddr: Source address, network order + * @sport: Source port, network order + * @daddr: Destination address, network order + * @hnum: Destination port, host order + * @dif: Destination interface index + * @sdif: Destination bridge port index, if relevant + * @udptable: Set of UDP hash tables + * + * Simplified lookup to be used as fallback if no sockets are found due to a + * potential race between (receive) address change, and lookup happening before + * the rehash operation. This function ignores SO_REUSEPORT groups while scoring + * result sockets, because if we have one, we don't need the fallback at all. + * + * Called under rcu_read_lock(). 
+ * + * Return: socket with highest matching score if any, NULL if none + */ +static struct sock *udp4_lib_lookup1(const struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, + int dif, int sdif, + const struct udp_table *udptable) +{ + unsigned int slot = udp_hashfn(net, hnum, udptable->mask); + struct udp_hslot *hslot = &udptable->hash[slot]; + struct sock *sk, *result = NULL; + int score, badness = 0; + + sk_for_each_rcu(sk, &hslot->head) { + score = compute_score(sk, net, + saddr, sport, daddr, hnum, dif, sdif); + if (score > badness) { + result = sk; + badness = score; + } + } + + return result; +} /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(const struct net *net, @@ -610,7 +654,7 @@ void udp_lib_hash4(struct sock *sk, u16 hash) spin_unlock_bh(&hslot->lock); } -EXPORT_SYMBOL(udp_lib_hash4); +EXPORT_IPV6_MOD(udp_lib_hash4); /* call with sock lock */ void udp4_hash4(struct sock *sk) @@ -626,7 +670,7 @@ void udp4_hash4(struct sock *sk) udp_lib_hash4(sk, hash); } -EXPORT_SYMBOL(udp4_hash4); +EXPORT_IPV6_MOD(udp4_hash4); #endif /* CONFIG_BASE_SMALL */ /* UDP is nearly always wildcards out the wazoo, it makes no sense to try @@ -681,6 +725,19 @@ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, result = udp4_lib_lookup2(net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif, hslot2, skb); + if (!IS_ERR_OR_NULL(result)) + goto done; + + /* Primary hash (destination port) lookup as fallback for this race: + * 1. __ip4_datagram_connect() sets sk_rcv_saddr + * 2. lookup (this function): new sk_rcv_saddr, hashes not updated yet + * 3. rehash operation updating _secondary and four-tuple_ hashes + * The primary hash doesn't need an update after 1., so, thanks to this + * further step, 1. and 3. don't need to be atomic against the lookup. 
+ */ + result = udp4_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif, + udptable); + done: if (IS_ERR(result)) return NULL; @@ -753,11 +810,11 @@ static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk, } DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); -EXPORT_SYMBOL(udp_encap_needed_key); +EXPORT_IPV6_MOD(udp_encap_needed_key); #if IS_ENABLED(CONFIG_IPV6) DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); -EXPORT_SYMBOL(udpv6_encap_needed_key); +EXPORT_IPV6_MOD(udpv6_encap_needed_key); #endif void udp_encap_enable(void) @@ -985,7 +1042,7 @@ void udp_flush_pending_frames(struct sock *sk) ip_flush_pending_frames(sk); } } -EXPORT_SYMBOL(udp_flush_pending_frames); +EXPORT_IPV6_MOD(udp_flush_pending_frames); /** * udp4_hwcsum - handle outgoing HW checksumming @@ -1085,9 +1142,9 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); - if (hlen + cork->gso_size > cork->fragsize) { + if (hlen + min(datalen, cork->gso_size) > cork->fragsize) { kfree_skb(skb); - return -EINVAL; + return -EMSGSIZE; } if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); @@ -1173,7 +1230,7 @@ out: WRITE_ONCE(up->pending, 0); return err; } -EXPORT_SYMBOL(udp_push_pending_frames); +EXPORT_IPV6_MOD(udp_push_pending_frames); static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size) { @@ -1210,7 +1267,7 @@ int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size) return need_ip; } -EXPORT_SYMBOL_GPL(udp_cmsg_send); +EXPORT_IPV6_MOD_GPL(udp_cmsg_send); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { @@ -1225,7 +1282,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int free = 0; int connected = 0; __be32 daddr, faddr, saddr; - u8 tos, scope; + u8 scope; __be16 dport; int err, is_udplite = IS_UDPLITE(sk); int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; @@ -1349,7 +1406,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) faddr = ipc.opt->opt.faddr; connected = 0; } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); if (scope == RT_SCOPE_LINK) connected = 0; @@ -1386,7 +1442,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &fl4_stack; - flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, flow_flags, faddr, saddr, dport, inet->inet_sport, sk->sk_uid); @@ -1505,7 +1562,7 @@ void udp_splice_eof(struct socket *sock) udp_push_pending_frames(sk); release_sock(sk); } -EXPORT_SYMBOL_GPL(udp_splice_eof); +EXPORT_IPV6_MOD_GPL(udp_splice_eof); #define UDP_SKB_IS_STATELESS 0x80000000 @@ -1570,12 +1627,12 @@ static bool udp_skb_has_head_state(struct sk_buff *skb) } /* fully reclaim rmem/fwd memory allocated for skb */ -static void udp_rmem_release(struct sock *sk, int size, int partial, - bool rx_queue_lock_held) +static void udp_rmem_release(struct sock *sk, unsigned int size, + int partial, bool rx_queue_lock_held) { struct udp_sock *up = udp_sk(sk); struct sk_buff_head *sk_queue; - int amt; + unsigned int amt; if (likely(partial)) { up->forward_deficit += size; @@ -1595,10 +1652,8 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (!rx_queue_lock_held) spin_lock(&sk_queue->lock); - - sk_forward_alloc_add(sk, size); - amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1); - sk_forward_alloc_add(sk, -amt); + amt = (size + 
sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1); + sk_forward_alloc_add(sk, size - amt); if (amt) __sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT); @@ -1622,7 +1677,7 @@ void udp_skb_destructor(struct sock *sk, struct sk_buff *skb) prefetch(&skb->data); udp_rmem_release(sk, udp_skb_truesize(skb), 1, false); } -EXPORT_SYMBOL(udp_skb_destructor); +EXPORT_IPV6_MOD(udp_skb_destructor); /* as above, but the caller held the rx queue lock, too */ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) @@ -1670,17 +1725,25 @@ static int udp_rmem_schedule(struct sock *sk, int size) int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; - int rmem, err = -ENOMEM; + unsigned int rmem, rcvbuf; spinlock_t *busy = NULL; - int size, rcvbuf; + int size, err = -ENOMEM; - /* Immediately drop when the receive queue is full. - * Always allow at least one packet. - */ rmem = atomic_read(&sk->sk_rmem_alloc); rcvbuf = READ_ONCE(sk->sk_rcvbuf); - if (rmem > rcvbuf) - goto drop; + size = skb->truesize; + + /* Immediately drop when the receive queue is full. + * Cast to unsigned int performs the boundary check for INT_MAX. + */ + if (rmem + size > rcvbuf) { + if (rcvbuf > INT_MAX >> 1) + goto drop; + + /* Always allow at least one packet for small buffer. */ + if (rmem > rcvbuf) + goto drop; + } /* Under mem pressure, it might be helpful to help udp_recvmsg() * having linear skbs : @@ -1690,10 +1753,10 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) */ if (rmem > (rcvbuf >> 1)) { skb_condense(skb); - + size = skb->truesize; busy = busylock_acquire(sk); } - size = skb->truesize; + udp_set_dev_scratch(skb); atomic_add(size, &sk->sk_rmem_alloc); @@ -1729,7 +1792,7 @@ drop: busylock_release(busy); return err; } -EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb); +EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb); void udp_destruct_common(struct sock *sk) { @@ -1745,7 +1808,7 @@ void udp_destruct_common(struct sock *sk) } udp_rmem_release(sk, total, 0, true); } -EXPORT_SYMBOL_GPL(udp_destruct_common); +EXPORT_IPV6_MOD_GPL(udp_destruct_common); static void udp_destruct_sock(struct sock *sk) { @@ -1776,11 +1839,11 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) skb_release_head_state(skb); __consume_stateless_skb(skb); } -EXPORT_SYMBOL_GPL(skb_consume_udp); +EXPORT_IPV6_MOD_GPL(skb_consume_udp); static struct sk_buff *__first_packet_length(struct sock *sk, struct sk_buff_head *rcvq, - int *total) + unsigned int *total) { struct sk_buff *skb; @@ -1793,7 +1856,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk, atomic_inc(&sk->sk_drops); __skb_unlink(skb, rcvq); *total += skb->truesize; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); } else { udp_skb_csum_unnecessary_set(skb); break; @@ -1813,8 +1876,8 @@ static int first_packet_length(struct sock *sk) { struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue; struct sk_buff_head *sk_queue = &sk->sk_receive_queue; + unsigned int total = 0; struct sk_buff *skb; - int total = 0; int res; spin_lock_bh(&rcvq->lock); @@ -1858,7 +1921,7 @@ int udp_ioctl(struct sock *sk, int cmd, int *karg) return 0; } -EXPORT_SYMBOL(udp_ioctl); +EXPORT_IPV6_MOD(udp_ioctl); struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int *off, int *err) @@ -1881,8 +1944,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, error = -EAGAIN; do { spin_lock_bh(&queue->lock); - skb = __skb_try_recv_from_queue(sk, queue, 
flags, off, - err, &last); + skb = __skb_try_recv_from_queue(queue, flags, off, err, + &last); if (skb) { if (!(flags & MSG_PEEK)) udp_skb_destructor(sk, skb); @@ -1903,8 +1966,8 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, queue); - skb = __skb_try_recv_from_queue(sk, queue, flags, off, - err, &last); + skb = __skb_try_recv_from_queue(queue, flags, off, err, + &last); if (skb && !(flags & MSG_PEEK)) udp_skb_dtor_locked(sk, skb); spin_unlock(&sk_queue->lock); @@ -1947,14 +2010,14 @@ try_again: __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); goto try_again; } WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); return recv_actor(sk, skb); } -EXPORT_SYMBOL(udp_read_skb); +EXPORT_IPV6_MOD(udp_read_skb); /* * This should be easy, if there is something there we @@ -2062,7 +2125,7 @@ csum_copy_err: UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); /* starting over for a new packet, but check if we need to yield */ cond_resched(); @@ -2081,7 +2144,7 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len); } -EXPORT_SYMBOL(udp_pre_connect); +EXPORT_IPV6_MOD(udp_pre_connect); static int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -2130,7 +2193,7 @@ int udp_disconnect(struct sock *sk, int flags) release_sock(sk); return 0; } -EXPORT_SYMBOL(udp_disconnect); +EXPORT_IPV6_MOD(udp_disconnect); void udp_lib_unhash(struct sock *sk) { @@ -2138,6 +2201,7 @@ void udp_lib_unhash(struct sock *sk) struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; + sock_rps_delete_flow(sk); hslot = udp_hashslot(udptable, sock_net(sk), udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); @@ -2160,7 +2224,7 @@ void udp_lib_unhash(struct sock *sk) spin_unlock_bh(&hslot->lock); } } -EXPORT_SYMBOL(udp_lib_unhash); +EXPORT_IPV6_MOD(udp_lib_unhash); /* * inet_rcv_saddr was changed, we must rehash secondary hash @@ -2224,7 +2288,7 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) } } } -EXPORT_SYMBOL(udp_lib_rehash); +EXPORT_IPV6_MOD(udp_lib_rehash); void udp_v4_rehash(struct sock *sk) { @@ -2429,7 +2493,7 @@ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) } return false; } -EXPORT_SYMBOL(udp_sk_rx_dst_set); +EXPORT_IPV6_MOD(udp_sk_rx_dst_set); /* * Multicasts and broadcasts go to each listener. 
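/*
 * Editor's note: several hunks above replace bare kfree_skb() with
 * kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM). A toy userspace
 * model of the pattern (names hypothetical): the reason rides along to
 * the free path so tracing can tell checksum drops apart from the
 * generic NOT_SPECIFIED case that plain kfree_skb() reports.
 */
#include <stdio.h>
#include <stdlib.h>

enum drop_reason { DROP_NOT_SPECIFIED, DROP_UDP_CSUM };

struct pkt { unsigned char *data; };

static void free_pkt_reason(struct pkt *p, enum drop_reason reason)
{
	fprintf(stderr, "drop reason=%d\n", reason); /* ~kfree_skb tracepoint */
	free(p->data);
	free(p);
}

int main(void)
{
	struct pkt *p = malloc(sizeof(*p));

	p->data = malloc(64);
	free_pkt_reason(p, DROP_UDP_CSUM);
	return 0;
}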
@@ -2836,20 +2900,40 @@ void udp_destroy_sock(struct sock *sk) if (encap_destroy) encap_destroy(sk); } - if (udp_test_bit(ENCAP_ENABLED, sk)) + if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udp_encap_needed_key); + udp_tunnel_cleanup_gro(sk); + } } } +typedef struct sk_buff *(*udp_gro_receive_t)(struct sock *sk, + struct list_head *head, + struct sk_buff *skb); + static void set_xfrm_gro_udp_encap_rcv(__u16 encap_type, unsigned short family, struct sock *sk) { #ifdef CONFIG_XFRM + udp_gro_receive_t new_gro_receive; + if (udp_test_bit(GRO_ENABLED, sk) && encap_type == UDP_ENCAP_ESPINUDP) { - if (family == AF_INET) - WRITE_ONCE(udp_sk(sk)->gro_receive, xfrm4_gro_udp_encap_rcv); - else if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) - WRITE_ONCE(udp_sk(sk)->gro_receive, ipv6_stub->xfrm6_gro_udp_encap_rcv); + if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) + new_gro_receive = ipv6_stub->xfrm6_gro_udp_encap_rcv; + else + new_gro_receive = xfrm4_gro_udp_encap_rcv; + + if (udp_sk(sk)->gro_receive != new_gro_receive) { + /* + * With IPV6_ADDRFORM the gro callback could change + * after being set, unregister the old one, if valid. + */ + if (udp_sk(sk)->gro_receive) + udp_tunnel_update_gro_rcv(sk, false); + + WRITE_ONCE(udp_sk(sk)->gro_receive, new_gro_receive); + udp_tunnel_update_gro_rcv(sk, true); + } } #endif } @@ -2899,6 +2983,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, break; case UDP_ENCAP: + sockopt_lock_sock(sk); switch (val) { case 0: #ifdef CONFIG_XFRM @@ -2922,6 +3007,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, err = -ENOPROTOOPT; break; } + sockopt_release_sock(sk); break; case UDP_NO_CHECK6_TX: @@ -2939,13 +3025,14 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, break; case UDP_GRO: - + sockopt_lock_sock(sk); /* when enabling GRO, accept the related GSO packet type */ if (valbool) udp_tunnel_encap_enable(sk); udp_assign_bit(GRO_ENABLED, sk, valbool); udp_assign_bit(ACCEPT_L4, sk, valbool); set_xfrm_gro_udp_encap_rcv(up->encap_type, sk->sk_family, sk); + sockopt_release_sock(sk); break; /* @@ -2985,7 +3072,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, return err; } -EXPORT_SYMBOL(udp_lib_setsockopt); +EXPORT_IPV6_MOD(udp_lib_setsockopt); int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) @@ -3056,7 +3143,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; return 0; } -EXPORT_SYMBOL(udp_lib_getsockopt); +EXPORT_IPV6_MOD(udp_lib_getsockopt); int udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) @@ -3098,7 +3185,7 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) return mask; } -EXPORT_SYMBOL(udp_poll); +EXPORT_IPV6_MOD(udp_poll); int udp_abort(struct sock *sk, int err) { @@ -3121,7 +3208,7 @@ out: return 0; } -EXPORT_SYMBOL_GPL(udp_abort); +EXPORT_IPV6_MOD_GPL(udp_abort); struct proto udp_prot = { .name = "UDP", @@ -3255,7 +3342,7 @@ void *udp_seq_start(struct seq_file *seq, loff_t *pos) return *pos ? 
udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } -EXPORT_SYMBOL(udp_seq_start); +EXPORT_IPV6_MOD(udp_seq_start); void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { @@ -3269,7 +3356,7 @@ void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; return sk; } -EXPORT_SYMBOL(udp_seq_next); +EXPORT_IPV6_MOD(udp_seq_next); void udp_seq_stop(struct seq_file *seq, void *v) { @@ -3281,7 +3368,7 @@ void udp_seq_stop(struct seq_file *seq, void *v) if (state->bucket <= udptable->mask) spin_unlock_bh(&udptable->hash[state->bucket].lock); } -EXPORT_SYMBOL(udp_seq_stop); +EXPORT_IPV6_MOD(udp_seq_stop); /* ------------------------------------------------------------------------ */ static void udp4_format_sock(struct sock *sp, struct seq_file *f, @@ -3329,34 +3416,55 @@ struct bpf_iter__udp { int bucket __aligned(8); }; +union bpf_udp_iter_batch_item { + struct sock *sk; + __u64 cookie; +}; + struct bpf_udp_iter_state { struct udp_iter_state state; unsigned int cur_sk; unsigned int end_sk; unsigned int max_sk; - int offset; - struct sock **batch; - bool st_bucket_done; + union bpf_udp_iter_batch_item *batch; }; static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, - unsigned int new_batch_sz); + unsigned int new_batch_sz, gfp_t flags); +static struct sock *bpf_iter_udp_resume(struct sock *first_sk, + union bpf_udp_iter_batch_item *cookies, + int n_cookies) +{ + struct sock *sk = NULL; + int i; + + for (i = 0; i < n_cookies; i++) { + sk = first_sk; + udp_portaddr_for_each_entry_from(sk) + if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) + goto done; + } +done: + return sk; +} + static struct sock *bpf_iter_udp_batch(struct seq_file *seq) { struct bpf_udp_iter_state *iter = seq->private; struct udp_iter_state *state = &iter->state; + unsigned int find_cookie, end_cookie; struct net *net = seq_file_net(seq); - int resume_bucket, resume_offset; struct udp_table *udptable; unsigned int batch_sks = 0; - bool resized = false; + int resume_bucket; + int resizes = 0; struct sock *sk; + int err = 0; resume_bucket = state->bucket; - resume_offset = iter->offset; /* The current batch is done, so advance the bucket. */ - if (iter->st_bucket_done) + if (iter->cur_sk == iter->end_sk) state->bucket++; udptable = udp_get_table_seq(seq, net); @@ -3369,62 +3477,89 @@ again: * before releasing the bucket lock. This allows BPF programs that are * called in seq_show to acquire the bucket lock if needed. */ + find_cookie = iter->cur_sk; + end_cookie = iter->end_sk; iter->cur_sk = 0; iter->end_sk = 0; - iter->st_bucket_done = false; batch_sks = 0; for (; state->bucket <= udptable->mask; state->bucket++) { struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot; if (hlist_empty(&hslot2->head)) - continue; + goto next_bucket; - iter->offset = 0; spin_lock_bh(&hslot2->lock); - udp_portaddr_for_each_entry(sk, &hslot2->head) { + sk = hlist_entry_safe(hslot2->head.first, struct sock, + __sk_common.skc_portaddr_node); + /* Resume from the first (in iteration order) unseen socket from + * the last batch that still exists in resume_bucket. Most of + * the time this will just be where the last iteration left off + * in resume_bucket unless that socket disappeared between + * reads. 
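/*
 * Editor's sketch of bpf_iter_udp_resume() just above: instead of a
 * positional offset, the iterator remembers socket cookies and, on
 * restart, walks the bucket once per remembered cookie until one still
 * exists. Userspace model with a plain singly linked list (iteration
 * details differ in the kernel); returns NULL when none survived.
 */
#include <stdint.h>
#include <stdio.h>

struct snode { uint64_t cookie; struct snode *next; };

static struct snode *resume_from(struct snode *first,
				 const uint64_t *cookies, int n)
{
	struct snode *s = NULL;
	int i;

	for (i = 0; i < n; i++)
		for (s = first; s; s = s->next)
			if (s->cookie == cookies[i])
				return s;
	return s;
}

int main(void)
{
	struct snode c = { 7, NULL }, b = { 5, &c };
	uint64_t remembered[] = { 42, 5 };

	/* pretend socket 42 closed: the walk starts at b and lands on 5 */
	printf("%llu\n",
	       (unsigned long long)resume_from(&b, remembered, 2)->cookie);
	return 0;
}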
+ */ + if (state->bucket == resume_bucket) + sk = bpf_iter_udp_resume(sk, &iter->batch[find_cookie], + end_cookie - find_cookie); +fill_batch: + udp_portaddr_for_each_entry_from(sk) { if (seq_sk_match(seq, sk)) { - /* Resume from the last iterated socket at the - * offset in the bucket before iterator was stopped. - */ - if (state->bucket == resume_bucket && - iter->offset < resume_offset) { - ++iter->offset; - continue; - } if (iter->end_sk < iter->max_sk) { sock_hold(sk); - iter->batch[iter->end_sk++] = sk; + iter->batch[iter->end_sk++].sk = sk; } batch_sks++; } } + + /* Allocate a larger batch and try again. */ + if (unlikely(resizes <= 1 && iter->end_sk && + iter->end_sk != batch_sks)) { + resizes++; + + /* First, try with GFP_USER to maximize the chances of + * grabbing more memory. + */ + if (resizes == 1) { + spin_unlock_bh(&hslot2->lock); + err = bpf_iter_udp_realloc_batch(iter, + batch_sks * 3 / 2, + GFP_USER); + if (err) + return ERR_PTR(err); + /* Start over. */ + goto again; + } + + /* Next, hold onto the lock, so the bucket doesn't + * change while we get the rest of the sockets. + */ + err = bpf_iter_udp_realloc_batch(iter, batch_sks, + GFP_NOWAIT); + if (err) { + spin_unlock_bh(&hslot2->lock); + return ERR_PTR(err); + } + + /* Pick up where we left off. */ + sk = iter->batch[iter->end_sk - 1].sk; + sk = hlist_entry_safe(sk->__sk_common.skc_portaddr_node.next, + struct sock, + __sk_common.skc_portaddr_node); + batch_sks = iter->end_sk; + goto fill_batch; + } + spin_unlock_bh(&hslot2->lock); if (iter->end_sk) break; +next_bucket: + resizes = 0; } - /* All done: no batch made. */ - if (!iter->end_sk) - return NULL; - - if (iter->end_sk == batch_sks) { - /* Batching is done for the current bucket; return the first - * socket to be iterated from the batch. - */ - iter->st_bucket_done = true; - goto done; - } - if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) { - resized = true; - /* After allocating a larger batch, retry one more time to grab - * the whole bucket. - */ - goto again; - } -done: - return iter->batch[0]; + WARN_ON_ONCE(iter->end_sk != batch_sks); + return iter->end_sk ? iter->batch[0].sk : NULL; } static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -3435,16 +3570,14 @@ static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) /* Whenever seq_next() is called, the iter->cur_sk is * done with seq_show(), so unref the iter->cur_sk. */ - if (iter->cur_sk < iter->end_sk) { - sock_put(iter->batch[iter->cur_sk++]); - ++iter->offset; - } + if (iter->cur_sk < iter->end_sk) + sock_put(iter->batch[iter->cur_sk++].sk); /* After updating iter->cur_sk, check if there are more sockets * available in the current bucket batch. */ if (iter->cur_sk < iter->end_sk) - sk = iter->batch[iter->cur_sk]; + sk = iter->batch[iter->cur_sk].sk; else /* Prepare a new batch. */ sk = bpf_iter_udp_batch(seq); @@ -3508,8 +3641,19 @@ unlock: static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter) { - while (iter->cur_sk < iter->end_sk) - sock_put(iter->batch[iter->cur_sk++]); + union bpf_udp_iter_batch_item *item; + unsigned int cur_sk = iter->cur_sk; + __u64 cookie; + + /* Remember the cookies of the sockets we haven't seen yet, so we can + * pick up where we left off next time around. 
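/*
 * Editor's note on the two resize passes above: the first shortfall is
 * retried with a blocking allocation, which is only legal once the
 * bucket lock has been dropped (hence the "start over"); the second
 * pass keeps the lock so the bucket cannot change, and must therefore
 * allocate without sleeping. A one-line userspace model of that rule
 * (the GFP defines stand in for the real kernel flags):
 */
#define GFP_USER   0x1	/* may sleep: only with the bucket lock dropped */
#define GFP_NOWAIT 0x2	/* atomic: safe while the bucket lock is held */

static unsigned int batch_grow_flags(int bucket_lock_held)
{
	return bucket_lock_held ? GFP_NOWAIT : GFP_USER;
}

int main(void)
{
	return batch_grow_flags(1) == GFP_NOWAIT ? 0 : 1;
}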
+ */ + while (cur_sk < iter->end_sk) { + item = &iter->batch[cur_sk++]; + cookie = sock_gen_cookie(item->sk); + sock_put(item->sk); + item->cookie = cookie; + } } static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v) @@ -3525,10 +3669,8 @@ static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v) (void)udp_prog_seq_show(prog, &meta, v, 0, 0); } - if (iter->cur_sk < iter->end_sk) { + if (iter->cur_sk < iter->end_sk) bpf_iter_udp_put_batch(iter); - iter->st_bucket_done = false; - } } static const struct seq_operations bpf_iter_udp_seq_ops = { @@ -3560,7 +3702,7 @@ const struct seq_operations udp_seq_ops = { .stop = udp_seq_stop, .show = udp4_seq_show, }; -EXPORT_SYMBOL(udp_seq_ops); +EXPORT_IPV6_MOD(udp_seq_ops); static struct udp_seq_afinfo udp4_seq_afinfo = { .family = AF_INET, @@ -3749,6 +3891,15 @@ fallback: static int __net_init udp_pernet_init(struct net *net) { +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + int i; + + /* No tunnel is configured */ + for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) { + INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list); + RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL); + } +#endif udp_sysctl_init(net); udp_set_table(net); @@ -3770,16 +3921,19 @@ DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta, struct udp_sock *udp_sk, uid_t uid, int bucket) static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, - unsigned int new_batch_sz) + unsigned int new_batch_sz, gfp_t flags) { - struct sock **new_batch; + union bpf_udp_iter_batch_item *new_batch; new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch), - GFP_USER | __GFP_NOWARN); + flags | __GFP_NOWARN); if (!new_batch) return -ENOMEM; - bpf_iter_udp_put_batch(iter); + if (flags != GFP_NOWAIT) + bpf_iter_udp_put_batch(iter); + + memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); kvfree(iter->batch); iter->batch = new_batch; iter->max_sk = new_batch_sz; @@ -3798,10 +3952,12 @@ static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) if (ret) return ret; - ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ); + ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); if (ret) bpf_iter_fini_seq_net(priv_data); + iter->state.bucket = -1; + return ret; } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index a5be6e4ed326..85b5aa82d7d7 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -12,6 +12,169 @@ #include <net/udp.h> #include <net/protocol.h> #include <net/inet_common.h> +#include <net/udp_tunnel.h> + +#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) + +/* + * Dummy GRO tunnel callback, exists mainly to avoid dangling/NULL + * values for the udp tunnel static call. 
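/*
 * Editor's note: the loop above is why the batch element became a
 * union — the same slot holds a referenced socket pointer during a
 * batch and is overwritten with the socket's cookie once the final
 * reference is dropped, so the resume logic can reuse the array as-is.
 * Sketch of the shape:
 */
#include <stdint.h>

union batch_item {
	void	*sk;	 /* valid while the batch holds a reference */
	uint64_t cookie; /* valid after the final put, for resume */
};

int main(void)
{
	union batch_item it;

	it.cookie = 42;	/* overwrites the stale pointer slot */
	return it.cookie == 42 ? 0 : 1;
}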
+ */ +static struct sk_buff *dummy_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + NAPI_GRO_CB(skb)->flush = 1; + return NULL; +} + +typedef struct sk_buff *(*udp_tunnel_gro_rcv_t)(struct sock *sk, + struct list_head *head, + struct sk_buff *skb); + +struct udp_tunnel_type_entry { + udp_tunnel_gro_rcv_t gro_receive; + refcount_t count; +}; + +#define UDP_MAX_TUNNEL_TYPES (IS_ENABLED(CONFIG_GENEVE) + \ + IS_ENABLED(CONFIG_VXLAN) * 2 + \ + IS_ENABLED(CONFIG_NET_FOU) * 2 + \ + IS_ENABLED(CONFIG_XFRM) * 2) + +DEFINE_STATIC_CALL(udp_tunnel_gro_rcv, dummy_gro_rcv); +static DEFINE_STATIC_KEY_FALSE(udp_tunnel_static_call); +static struct mutex udp_tunnel_gro_type_lock; +static struct udp_tunnel_type_entry udp_tunnel_gro_types[UDP_MAX_TUNNEL_TYPES]; +static unsigned int udp_tunnel_gro_type_nr; +static DEFINE_SPINLOCK(udp_tunnel_gro_lock); + +void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add) +{ + bool is_ipv6 = sk->sk_family == AF_INET6; + struct udp_sock *tup, *up = udp_sk(sk); + struct udp_tunnel_gro *udp_tunnel_gro; + + spin_lock(&udp_tunnel_gro_lock); + udp_tunnel_gro = &net->ipv4.udp_tunnel_gro[is_ipv6]; + if (add) + hlist_add_head(&up->tunnel_list, &udp_tunnel_gro->list); + else if (up->tunnel_list.pprev) + hlist_del_init(&up->tunnel_list); + + if (udp_tunnel_gro->list.first && + !udp_tunnel_gro->list.first->next) { + tup = hlist_entry(udp_tunnel_gro->list.first, struct udp_sock, + tunnel_list); + + rcu_assign_pointer(udp_tunnel_gro->sk, (struct sock *)tup); + } else { + RCU_INIT_POINTER(udp_tunnel_gro->sk, NULL); + } + + spin_unlock(&udp_tunnel_gro_lock); +} +EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup); + +void udp_tunnel_update_gro_rcv(struct sock *sk, bool add) +{ + struct udp_tunnel_type_entry *cur = NULL; + struct udp_sock *up = udp_sk(sk); + int i, old_gro_type_nr; + + if (!UDP_MAX_TUNNEL_TYPES || !up->gro_receive) + return; + + mutex_lock(&udp_tunnel_gro_type_lock); + + /* Check if the static call is permanently disabled. */ + if (udp_tunnel_gro_type_nr > UDP_MAX_TUNNEL_TYPES) + goto out; + + for (i = 0; i < udp_tunnel_gro_type_nr; i++) + if (udp_tunnel_gro_types[i].gro_receive == up->gro_receive) + cur = &udp_tunnel_gro_types[i]; + + old_gro_type_nr = udp_tunnel_gro_type_nr; + if (add) { + /* + * Update the matching entry, if found, or add a new one + * if needed + */ + if (cur) { + refcount_inc(&cur->count); + goto out; + } + + if (unlikely(udp_tunnel_gro_type_nr == UDP_MAX_TUNNEL_TYPES)) { + pr_err_once("Too many UDP tunnel types, please increase UDP_MAX_TUNNEL_TYPES\n"); + /* Ensure static call will never be enabled */ + udp_tunnel_gro_type_nr = UDP_MAX_TUNNEL_TYPES + 1; + } else { + cur = &udp_tunnel_gro_types[udp_tunnel_gro_type_nr++]; + refcount_set(&cur->count, 1); + cur->gro_receive = up->gro_receive; + } + } else { + /* + * The stack cleanups only successfully added tunnel, the + * lookup on removal should never fail. 
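/*
 * Editor's sketch: a userspace model of the static_call/static_branch
 * pair set up above, using a plain function pointer plus a flag. The
 * real mechanism patches the call site itself, so when exactly one
 * tunnel type is registered the fast path pays no indirect branch;
 * with zero or several types it falls back to the per-socket pointer.
 */
#include <stdio.h>

struct pkt { int id; };
typedef struct pkt *(*gro_rcv_t)(struct pkt *);

static struct pkt *dummy_rcv(struct pkt *p) { (void)p; return NULL; }

static gro_rcv_t single_type_rcv = dummy_rcv;	/* ~DEFINE_STATIC_CALL */
static int single_type_enabled;			/* ~static key, off */

static struct pkt *tunnel_rcv(struct pkt *p, gro_rcv_t per_sock_cb)
{
	if (single_type_enabled)		/* ~static_branch_likely() */
		return single_type_rcv(p);	/* ~static_call() */
	return per_sock_cb(p);			/* slow path */
}

static struct pkt *vxlan_like_rcv(struct pkt *p) { return p; }

int main(void)
{
	struct pkt p = { 1 };

	single_type_rcv = vxlan_like_rcv;	/* ~static_call_update() */
	single_type_enabled = 1;		/* ~static_branch_enable() */
	printf("%d\n", tunnel_rcv(&p, dummy_rcv)->id);
	return 0;
}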
+ */ + if (WARN_ON_ONCE(!cur)) + goto out; + + if (!refcount_dec_and_test(&cur->count)) + goto out; + + /* Avoid gaps, so that the enable tunnel has always id 0 */ + *cur = udp_tunnel_gro_types[--udp_tunnel_gro_type_nr]; + } + + if (udp_tunnel_gro_type_nr == 1) { + static_call_update(udp_tunnel_gro_rcv, + udp_tunnel_gro_types[0].gro_receive); + static_branch_enable(&udp_tunnel_static_call); + } else if (old_gro_type_nr == 1) { + static_branch_disable(&udp_tunnel_static_call); + static_call_update(udp_tunnel_gro_rcv, dummy_gro_rcv); + } + +out: + mutex_unlock(&udp_tunnel_gro_type_lock); +} +EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_rcv); + +static void udp_tunnel_gro_init(void) +{ + mutex_init(&udp_tunnel_gro_type_lock); +} + +static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + if (static_branch_likely(&udp_tunnel_static_call)) { + if (unlikely(gro_recursion_inc_test(skb))) { + NAPI_GRO_CB(skb)->flush |= 1; + return NULL; + } + return static_call(udp_tunnel_gro_rcv)(sk, head, skb); + } + return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); +} + +#else + +static void udp_tunnel_gro_init(void) {} + +static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk, + struct list_head *head, + struct sk_buff *skb) +{ + return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); +} + +#endif static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, @@ -247,6 +410,62 @@ static struct sk_buff *__udpv4_gso_segment_list_csum(struct sk_buff *segs) return segs; } +static void __udpv6_gso_segment_csum(struct sk_buff *seg, + struct in6_addr *oldip, + const struct in6_addr *newip, + __be16 *oldport, __be16 newport) +{ + struct udphdr *uh = udp_hdr(seg); + + if (ipv6_addr_equal(oldip, newip) && *oldport == newport) + return; + + if (uh->check) { + inet_proto_csum_replace16(&uh->check, seg, oldip->s6_addr32, + newip->s6_addr32, true); + + inet_proto_csum_replace2(&uh->check, seg, *oldport, newport, + false); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + + *oldip = *newip; + *oldport = newport; +} + +static struct sk_buff *__udpv6_gso_segment_list_csum(struct sk_buff *segs) +{ + const struct ipv6hdr *iph; + const struct udphdr *uh; + struct ipv6hdr *iph2; + struct sk_buff *seg; + struct udphdr *uh2; + + seg = segs; + uh = udp_hdr(seg); + iph = ipv6_hdr(seg); + uh2 = udp_hdr(seg->next); + iph2 = ipv6_hdr(seg->next); + + if (!(*(const u32 *)&uh->source ^ *(const u32 *)&uh2->source) && + ipv6_addr_equal(&iph->saddr, &iph2->saddr) && + ipv6_addr_equal(&iph->daddr, &iph2->daddr)) + return segs; + + while ((seg = seg->next)) { + uh2 = udp_hdr(seg); + iph2 = ipv6_hdr(seg); + + __udpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr, + &uh2->source, uh->source); + __udpv6_gso_segment_csum(seg, &iph2->daddr, &iph->daddr, + &uh2->dest, uh->dest); + } + + return segs; +} + static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb, netdev_features_t features, bool is_ipv6) @@ -259,7 +478,10 @@ static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb, udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss); - return is_ipv6 ? 
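/*
 * Editor's sketch of the bookkeeping in udp_tunnel_update_gro_rcv():
 * a small fixed array of refcounted entries where removal swaps the
 * last slot in, so live entries stay contiguous and slot 0 is always a
 * valid static-call target whenever exactly one type remains.
 * Userspace model, hypothetical names.
 */
#include <stdio.h>

#define MAX_TYPES 4

struct type_entry { const char *cb; int count; };

static struct type_entry types[MAX_TYPES];
static int nr_types;

static void type_get(const char *cb)
{
	int i;

	for (i = 0; i < nr_types; i++)
		if (types[i].cb == cb) {
			types[i].count++;
			return;
		}
	if (nr_types < MAX_TYPES) {
		types[nr_types].cb = cb;
		types[nr_types++].count = 1;
	}
}

static void type_put(const char *cb)
{
	int i;

	for (i = 0; i < nr_types; i++) {
		if (types[i].cb != cb)
			continue;
		if (--types[i].count == 0)
			types[i] = types[--nr_types]; /* swap-remove, no gaps */
		return;
	}
}

int main(void)
{
	type_get("vxlan");
	type_get("geneve");
	type_put("vxlan");
	printf("%d %s\n", nr_types, types[0].cb); /* 1 geneve */
	return 0;
}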
skb : __udpv4_gso_segment_list_csum(skb); + if (is_ipv6) + return __udpv6_gso_segment_list_csum(skb); + else + return __udpv4_gso_segment_list_csum(skb); } struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, @@ -273,6 +495,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, bool copy_dtor; __sum16 check; __be16 newlen; + int ret = 0; mss = skb_shinfo(gso_skb)->gso_size; if (gso_skb->len <= sizeof(*uh) + mss) @@ -301,6 +524,10 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, if (skb_pagelen(gso_skb) - sizeof(*uh) == skb_shinfo(gso_skb)->gso_size) return __udp_gso_segment_list(gso_skb, features, is_ipv6); + ret = __skb_linearize(gso_skb); + if (ret) + return ERR_PTR(ret); + /* Setup csum, as fraglist skips this in udp4_gro_receive. */ gso_skb->csum_start = skb_transport_header(gso_skb) - gso_skb->head; gso_skb->csum_offset = offsetof(struct udphdr, check); @@ -321,13 +548,17 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, /* clear destructor to avoid skb_segment assigning it to tail */ copy_dtor = gso_skb->destructor == sock_wfree; - if (copy_dtor) + if (copy_dtor) { gso_skb->destructor = NULL; + gso_skb->sk = NULL; + } segs = skb_segment(gso_skb, features); if (IS_ERR_OR_NULL(segs)) { - if (copy_dtor) + if (copy_dtor) { gso_skb->destructor = sock_wfree; + gso_skb->sk = sk; + } return segs; } @@ -618,7 +849,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */ skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); - pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); + pp = udp_tunnel_gro_rcv(sk, head, skb); out: skb_gro_flush_final(skb, pp, flush); @@ -630,9 +861,14 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport) { const struct iphdr *iph = skb_gro_network_header(skb); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; int iif, sdif; + sk = udp_tunnel_sk(net, false); + if (sk && dport == htons(sk->sk_num)) + return sk; + inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, @@ -763,5 +999,7 @@ int __init udpv4_offload_init(void) .gro_complete = udp4_gro_complete, }, }; + + udp_tunnel_gro_init(); return inet_add_offload(&net_hotdata.udpv4_offload, IPPROTO_UDP); } diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 619a53eb672d..2326548997d3 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -58,6 +58,15 @@ error: } EXPORT_SYMBOL(udp_sock_create4); +static bool sk_saddr_any(struct sock *sk) +{ +#if IS_ENABLED(CONFIG_IPV6) + return ipv6_addr_any(&sk->sk_v6_rcv_saddr); +#else + return !sk->sk_rcv_saddr; +#endif +} + void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *cfg) { @@ -80,6 +89,12 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->gro_complete = cfg->gro_complete; udp_tunnel_encap_enable(sk); + + udp_tunnel_update_gro_rcv(sk, true); + + if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sk) && + sk->sk_kern_sock) + udp_tunnel_update_gro_lookup(net, sk, true); } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index b5b06323cfd9..0d31a8c108d4 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -182,11 +182,15 @@ struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, int offset = 
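/*
 * Editor's note: the new IPv6 fraglist-GSO helpers above rewrite
 * addresses and ports with inet_proto_csum_replace16()/replace2(),
 * i.e. RFC 1624 incremental checksum updates (HC' = ~(~HC + ~m + m')).
 * The fold routine below is the editor's illustration of that
 * arithmetic, not kernel code. The "uh->check ? ... : CSUM_MANGLED_0"
 * dance exists because a result of 0 would mean "no checksum" for UDP.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t csum_update16(uint16_t check, uint16_t old16, uint16_t new16)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old16;
	sum += new16;
	sum = (sum & 0xffff) + (sum >> 16);	/* fold carries */
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* rewrite a port from 0x1234 to 0xabcd without touching payload */
	printf("0x%04x\n", csum_update16(0xbeef, 0x1234, 0xabcd));
	return 0;
}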
skb_gro_offset(skb); const struct net_offload *ops; struct sk_buff *pp = NULL; - int ret; - - offset = offset - sizeof(struct udphdr); + int len, dlen; + __u8 *udpdata; + __be32 *udpdata32; - if (!pskb_pull(skb, offset)) + len = skb->len - offset; + dlen = offset + min(len, 8); + udpdata = skb_gro_header(skb, dlen, offset); + udpdata32 = (__be32 *)udpdata; + if (unlikely(!udpdata)) return NULL; rcu_read_lock(); @@ -194,11 +198,10 @@ struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, if (!ops || !ops->callbacks.gro_receive) goto out; - ret = __xfrm4_udp_encap_rcv(sk, skb, false); - if (ret) + /* check if it is a keepalive or IKE packet */ + if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0) goto out; - skb_push(skb, offset); NAPI_GRO_CB(skb)->proto = IPPROTO_UDP; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); @@ -208,7 +211,6 @@ struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, out: rcu_read_unlock(); - skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 1; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 0e765466d7f7..43b19adfbf88 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -80,6 +80,7 @@ #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/l3mdev.h> +#include <net/netdev_lock.h> #include <linux/if_tunnel.h> #include <linux/rtnetlink.h> #include <linux/netconf.h> @@ -312,7 +313,7 @@ static inline bool addrconf_link_ready(const struct net_device *dev) static void addrconf_del_rs_timer(struct inet6_dev *idev) { - if (del_timer(&idev->rs_timer)) + if (timer_delete(&idev->rs_timer)) __in6_dev_put(idev); } @@ -377,6 +378,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) int err = -ENOMEM; ASSERT_RTNL(); + netdev_ops_assert_locked(dev); if (dev->mtu < IPV6_MIN_MTU && dev != blackhole_netdev) return ERR_PTR(-EINVAL); @@ -402,7 +404,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) return ERR_PTR(err); } if (ndev->cnf.forwarding) - dev_disable_lro(dev); + netif_disable_lro(dev); /* We refer to the device */ netdev_hold(dev, &ndev->dev_tracker, GFP_KERNEL); @@ -852,7 +854,7 @@ static void addrconf_forward_change(struct net *net, __s32 newf) struct inet6_dev *idev; for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev) { int changed = (!idev->cnf.forwarding) ^ (!newf); @@ -865,13 +867,12 @@ static void addrconf_forward_change(struct net *net, __s32 newf) static int addrconf_fixup_forwarding(const struct ctl_table *table, int *p, int newf) { - struct net *net; + struct net *net = (struct net *)table->extra2; int old; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); - net = (struct net *)table->extra2; old = *p; WRITE_ONCE(*p, newf); @@ -881,7 +882,7 @@ static int addrconf_fixup_forwarding(const struct ctl_table *table, int *p, int NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt); - rtnl_unlock(); + rtnl_net_unlock(net); return 0; } @@ -903,7 +904,7 @@ static int addrconf_fixup_forwarding(const struct ctl_table *table, int *p, int net->ipv6.devconf_all); } else if ((!newf) ^ (!old)) dev_forward_change((struct inet6_dev *)table->extra1); - rtnl_unlock(); + rtnl_net_unlock(net); if (newf) rt6_purge_dflt_routers(net); @@ -916,7 +917,7 @@ static void addrconf_linkdown_change(struct net *net, __s32 newf) struct inet6_dev *idev; for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = 
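/*
 * Editor's sketch of the demux rule the reworked
 * xfrm4_gro_udp_encap_rcv() applies before handing packets to ESP GRO:
 * RFC 3948 NAT-T keepalives are a single 0xFF byte (shorter than an
 * ESP header), and IKE shares the port by placing a four-zero-byte
 * non-ESP marker where the SPI would sit — a real SPI is never zero.
 * Hypothetical names.
 */
#include <stdint.h>
#include <string.h>

enum encap_kind { ENCAP_SHORT_OR_KEEPALIVE, ENCAP_IKE, ENCAP_ESP };

static enum encap_kind classify(const uint8_t *payload, size_t len)
{
	const size_t esp_hdr_len = 8;	/* spi + seq, ~sizeof(ip_esp_hdr) */
	uint32_t spi;

	if (len <= esp_hdr_len)
		return ENCAP_SHORT_OR_KEEPALIVE;
	memcpy(&spi, payload, sizeof(spi));
	return spi == 0 ? ENCAP_IKE : ENCAP_ESP;
}

int main(void)
{
	const uint8_t keepalive[1] = { 0xff };
	const uint8_t ike[28] = { 0 };	/* starts with the non-ESP marker */

	return classify(keepalive, 1) == ENCAP_SHORT_OR_KEEPALIVE &&
	       classify(ike, sizeof(ike)) == ENCAP_IKE ? 0 : 1;
}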
__in6_dev_get_rtnl_net(dev); if (idev) { int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf); @@ -933,13 +934,12 @@ static void addrconf_linkdown_change(struct net *net, __s32 newf) static int addrconf_fixup_linkdown(const struct ctl_table *table, int *p, int newf) { - struct net *net; + struct net *net = (struct net *)table->extra2; int old; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); - net = (struct net *)table->extra2; old = *p; WRITE_ONCE(*p, newf); @@ -950,7 +950,7 @@ static int addrconf_fixup_linkdown(const struct ctl_table *table, int *p, int ne NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt); - rtnl_unlock(); + rtnl_net_unlock(net); return 0; } @@ -964,7 +964,8 @@ static int addrconf_fixup_linkdown(const struct ctl_table *table, int *p, int ne NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); } - rtnl_unlock(); + + rtnl_net_unlock(net); return 1; } @@ -2980,11 +2981,11 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg) if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) return -EFAULT; - rtnl_lock(); + rtnl_net_lock(net); dev = __dev_get_by_index(net, ireq.ifr6_ifindex); if (dev && dev->type == ARPHRD_SIT) err = addrconf_set_sit_dstaddr(net, dev, &ireq); - rtnl_unlock(); + rtnl_net_unlock(net); return err; } @@ -3008,39 +3009,25 @@ static int ipv6_mc_config(struct sock *sk, bool join, /* * Manual configuration of address on an interface */ -static int inet6_addr_add(struct net *net, int ifindex, - struct ifa6_config *cfg, +static int inet6_addr_add(struct net *net, struct net_device *dev, + struct ifa6_config *cfg, clock_t expires, u32 flags, struct netlink_ext_ack *extack) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; - struct net_device *dev; - unsigned long timeout; - clock_t expires; - u32 flags; - ASSERT_RTNL(); + ASSERT_RTNL_NET(net); if (cfg->plen > 128) { NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length"); return -EINVAL; } - /* check the lifetime */ - if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) { - NL_SET_ERR_MSG_MOD(extack, "address lifetime invalid"); - return -EINVAL; - } - if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64) { NL_SET_ERR_MSG_MOD(extack, "address with \"mngtmpaddr\" flag must have a prefix length of 64"); return -EINVAL; } - dev = __dev_get_by_index(net, ifindex); - if (!dev) - return -ENODEV; - idev = addrconf_add_dev(dev); if (IS_ERR(idev)) { NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); @@ -3049,7 +3036,7 @@ static int inet6_addr_add(struct net *net, int ifindex, if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) { int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk, - true, cfg->pfx, ifindex); + true, cfg->pfx, dev->ifindex); if (ret < 0) { NL_SET_ERR_MSG_MOD(extack, "Multicast auto join failed"); @@ -3059,24 +3046,6 @@ static int inet6_addr_add(struct net *net, int ifindex, cfg->scope = ipv6_addr_scope(cfg->pfx); - timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ); - if (addrconf_finite_timeout(timeout)) { - expires = jiffies_to_clock_t(timeout * HZ); - cfg->valid_lft = timeout; - flags = RTF_EXPIRES; - } else { - expires = 0; - flags = 0; - cfg->ifa_flags |= IFA_F_PERMANENT; - } - - timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ); - if (addrconf_finite_timeout(timeout)) { - if (timeout == 0) - cfg->ifa_flags |= IFA_F_DEPRECATED; - cfg->preferred_lft = timeout; - } - ifp = ipv6_add_addr(idev, cfg, true, extack); if (!IS_ERR(ifp)) { if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) { @@ -3104,7 +3073,7 @@ 
static int inet6_addr_add(struct net *net, int ifindex, return 0; } else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) { ipv6_mc_config(net->ipv6.mc_autojoin_sk, false, - cfg->pfx, ifindex); + cfg->pfx, dev->ifindex); } return PTR_ERR(ifp); @@ -3129,7 +3098,7 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, return -ENODEV; } - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (!idev) { NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); return -ENXIO; @@ -3170,6 +3139,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) .preferred_lft = INFINITY_LIFE_TIME, .valid_lft = INFINITY_LIFE_TIME, }; + struct net_device *dev; struct in6_ifreq ireq; int err; @@ -3182,9 +3152,16 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) cfg.pfx = &ireq.ifr6_addr; cfg.plen = ireq.ifr6_prefixlen; - rtnl_lock(); - err = inet6_addr_add(net, ireq.ifr6_ifindex, &cfg, NULL); - rtnl_unlock(); + rtnl_net_lock(net); + dev = __dev_get_by_index(net, ireq.ifr6_ifindex); + if (dev) { + netdev_lock_ops(dev); + err = inet6_addr_add(net, dev, &cfg, 0, 0, NULL); + netdev_unlock_ops(dev); + } else { + err = -ENODEV; + } + rtnl_net_unlock(net); return err; } @@ -3199,10 +3176,10 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg) if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) return -EFAULT; - rtnl_lock(); + rtnl_net_lock(net); err = inet6_addr_del(net, ireq.ifr6_ifindex, 0, &ireq.ifr6_addr, ireq.ifr6_prefixlen, NULL); - rtnl_unlock(); + rtnl_net_unlock(net); return err; } @@ -3237,16 +3214,13 @@ static void add_v4_addrs(struct inet6_dev *idev) struct in6_addr addr; struct net_device *dev; struct net *net = dev_net(idev->dev); - int scope, plen, offset = 0; + int scope, plen; u32 pflags = 0; ASSERT_RTNL(); memset(&addr, 0, sizeof(struct in6_addr)); - /* in case of IP6GRE the dev_addr is an IPv6 and therefore we use only the last 4 bytes */ - if (idev->dev->addr_len == sizeof(struct in6_addr)) - offset = sizeof(struct in6_addr) - 4; - memcpy(&addr.s6_addr32[3], idev->dev->dev_addr + offset, 4); + memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); if (!(idev->dev->flags & IFF_POINTOPOINT) && idev->dev->type == ARPHRD_SIT) { scope = IPV6_ADDR_COMPATv4; @@ -3557,7 +3531,13 @@ static void addrconf_gre_config(struct net_device *dev) return; } - if (dev->type == ARPHRD_ETHER) { + /* Generate the IPv6 link-local address using addrconf_addr_gen(), + * unless we have an IPv4 GRE device not bound to an IP address and + * which is in EUI64 mode (as __ipv6_isatap_ifid() would fail in this + * case). Such devices fall back to add_v4_addrs() instead. 
+ */ + if (!(dev->type == ARPHRD_IPGRE && *(__be32 *)dev->dev_addr == 0 && + idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)) { addrconf_addr_gen(idev, true); return; } @@ -4205,6 +4185,7 @@ static void addrconf_dad_work(struct work_struct *w) struct inet6_dev *idev = ifp->idev; bool bump_id, disable_ipv6 = false; struct in6_addr mcaddr; + struct net *net; enum { DAD_PROCESS, @@ -4212,7 +4193,9 @@ static void addrconf_dad_work(struct work_struct *w) DAD_ABORT, } action = DAD_PROCESS; - rtnl_lock(); + net = dev_net(idev->dev); + + rtnl_net_lock(net); spin_lock_bh(&ifp->lock); if (ifp->state == INET6_IFADDR_STATE_PREDAD) { @@ -4222,7 +4205,7 @@ static void addrconf_dad_work(struct work_struct *w) action = DAD_ABORT; ifp->state = INET6_IFADDR_STATE_POSTDAD; - if ((READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->accept_dad) > 1 || + if ((READ_ONCE(net->ipv6.devconf_all->accept_dad) > 1 || READ_ONCE(idev->cnf.accept_dad) > 1) && !idev->cnf.disable_ipv6 && !(ifp->flags & IFA_F_STABLE_PRIVACY)) { @@ -4304,7 +4287,7 @@ static void addrconf_dad_work(struct work_struct *w) ifp->dad_nonce); out: in6_ifa_put(ifp); - rtnl_unlock(); + rtnl_net_unlock(net); } /* ifp->idev must be at least read locked */ @@ -4752,9 +4735,9 @@ static void addrconf_verify_work(struct work_struct *w) struct net *net = container_of(to_delayed_work(w), struct net, ipv6.addr_chk_work); - rtnl_lock(); + rtnl_net_lock(net); addrconf_verify_rtnl(net); - rtnl_unlock(); + rtnl_net_unlock(net); } static void addrconf_verify(struct net *net) @@ -4817,8 +4800,12 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, /* We ignore other flags so far. */ ifa_flags &= IFA_F_MANAGETEMPADDR; - return inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx, - ifm->ifa_prefixlen, extack); + rtnl_net_lock(net); + err = inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx, + ifm->ifa_prefixlen, extack); + rtnl_net_unlock(net); + + return err; } static int modify_prefix_route(struct net *net, struct inet6_ifaddr *ifp, @@ -4867,19 +4854,14 @@ static int modify_prefix_route(struct net *net, struct inet6_ifaddr *ifp, } static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, - struct ifa6_config *cfg) + struct ifa6_config *cfg, clock_t expires, + u32 flags) { - u32 flags; - clock_t expires; - unsigned long timeout; bool was_managetempaddr; - bool had_prefixroute; bool new_peer = false; + bool had_prefixroute; - ASSERT_RTNL(); - - if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) - return -EINVAL; + ASSERT_RTNL_NET(net); if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64)) @@ -4888,24 +4870,6 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED) cfg->ifa_flags &= ~IFA_F_OPTIMISTIC; - timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ); - if (addrconf_finite_timeout(timeout)) { - expires = jiffies_to_clock_t(timeout * HZ); - cfg->valid_lft = timeout; - flags = RTF_EXPIRES; - } else { - expires = 0; - flags = 0; - cfg->ifa_flags |= IFA_F_PERMANENT; - } - - timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ); - if (addrconf_finite_timeout(timeout)) { - if (timeout == 0) - cfg->ifa_flags |= IFA_F_DEPRECATED; - cfg->preferred_lft = timeout; - } - if (cfg->peer_pfx && memcmp(&ifp->peer_addr, cfg->peer_pfx, sizeof(struct in6_addr))) { if (!ipv6_addr_any(&ifp->peer_addr)) @@ -4990,13 +4954,16 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack 
*extack) { struct net *net = sock_net(skb->sk); - struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; struct in6_addr *peer_pfx; struct inet6_ifaddr *ifa; struct net_device *dev; struct inet6_dev *idev; struct ifa6_config cfg; + struct ifaddrmsg *ifm; + unsigned long timeout; + clock_t expires; + u32 flags; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, @@ -5019,8 +4986,18 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[IFA_PROTO]) cfg.ifa_proto = nla_get_u8(tb[IFA_PROTO]); + cfg.ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); + + /* We ignore other flags so far. */ + cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | + IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE | + IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; + + cfg.ifa_flags |= IFA_F_PERMANENT; cfg.valid_lft = INFINITY_LIFE_TIME; cfg.preferred_lft = INFINITY_LIFE_TIME; + expires = 0; + flags = 0; if (tb[IFA_CACHEINFO]) { struct ifa_cacheinfo *ci; @@ -5028,24 +5005,44 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, ci = nla_data(tb[IFA_CACHEINFO]); cfg.valid_lft = ci->ifa_valid; cfg.preferred_lft = ci->ifa_prefered; + + if (!cfg.valid_lft || cfg.preferred_lft > cfg.valid_lft) { + NL_SET_ERR_MSG_MOD(extack, "address lifetime invalid"); + return -EINVAL; + } + + timeout = addrconf_timeout_fixup(cfg.valid_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + cfg.ifa_flags &= ~IFA_F_PERMANENT; + cfg.valid_lft = timeout; + expires = jiffies_to_clock_t(timeout * HZ); + flags = RTF_EXPIRES; + } + + timeout = addrconf_timeout_fixup(cfg.preferred_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + if (timeout == 0) + cfg.ifa_flags |= IFA_F_DEPRECATED; + + cfg.preferred_lft = timeout; + } } + rtnl_net_lock(net); + dev = __dev_get_by_index(net, ifm->ifa_index); if (!dev) { NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface"); - return -ENODEV; + err = -ENODEV; + goto unlock_rtnl; } - cfg.ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); - - /* We ignore other flags so far. */ - cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | - IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE | - IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; - + netdev_lock_ops(dev); idev = ipv6_find_idev(dev); - if (IS_ERR(idev)) - return PTR_ERR(idev); + if (IS_ERR(idev)) { + err = PTR_ERR(idev); + goto unlock; + } if (!ipv6_allow_optimistic_dad(net, idev)) cfg.ifa_flags &= ~IFA_F_OPTIMISTIC; @@ -5053,7 +5050,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (cfg.ifa_flags & IFA_F_NODAD && cfg.ifa_flags & IFA_F_OPTIMISTIC) { NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive"); - return -EINVAL; + err = -EINVAL; + goto unlock; } ifa = ipv6_get_ifaddr(net, cfg.pfx, dev, 1); @@ -5062,7 +5060,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, * It would be best to check for !NLM_F_CREATE here but * userspace already relies on not having to provide this. 
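/*
 * Editor's sketch of the lifetime fixup that inet6_rtm_newaddr() now
 * performs once for both the add and modify paths (previously
 * duplicated in inet6_addr_add() and inet6_addr_modify()). Heavily
 * simplified userspace model: jiffies conversion elided, flag values
 * hypothetical.
 */
#include <stdint.h>

#define INFINITY_LIFE_TIME 0xffffffffU
#define F_PERMANENT  0x1	/* ~IFA_F_PERMANENT */
#define F_DEPRECATED 0x2	/* ~IFA_F_DEPRECATED */

struct lft_cfg {
	uint32_t valid, preferred;
	unsigned int ifa_flags;
	int expires;
};

static int fixup_lifetimes(struct lft_cfg *c)
{
	if (!c->valid || c->preferred > c->valid)
		return -1;	/* "address lifetime invalid" */

	c->ifa_flags |= F_PERMANENT;
	if (c->valid != INFINITY_LIFE_TIME) {
		c->ifa_flags &= ~F_PERMANENT;
		c->expires = 1;	/* ~RTF_EXPIRES with a real expiry time */
	}
	if (!c->preferred)
		c->ifa_flags |= F_DEPRECATED;
	return 0;
}

int main(void)
{
	struct lft_cfg c = { .valid = 3600, .preferred = 1800 };

	return fixup_lifetimes(&c); /* 0: finite lifetimes, expiry armed */
}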
*/ - return inet6_addr_add(net, ifm->ifa_index, &cfg, extack); + err = inet6_addr_add(net, dev, &cfg, expires, flags, extack); + goto unlock; } if (nlh->nlmsg_flags & NLM_F_EXCL || @@ -5070,10 +5069,14 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, NL_SET_ERR_MSG_MOD(extack, "address already assigned"); err = -EEXIST; } else { - err = inet6_addr_modify(net, ifa, &cfg); + err = inet6_addr_modify(net, ifa, &cfg, expires, flags); } in6_ifa_put(ifa); +unlock: + netdev_unlock_ops(dev); +unlock_rtnl: + rtnl_net_unlock(net); return err; } @@ -5127,22 +5130,6 @@ static inline int inet6_ifaddr_msgsize(void) + nla_total_size(4) /* IFA_RT_PRIORITY */; } -enum addr_type_t { - UNICAST_ADDR, - MULTICAST_ADDR, - ANYCAST_ADDR, -}; - -struct inet6_fill_args { - u32 portid; - u32 seq; - int event; - unsigned int flags; - int netnsid; - int ifindex; - enum addr_type_t type; -}; - static int inet6_fill_ifaddr(struct sk_buff *skb, const struct inet6_ifaddr *ifa, struct inet6_fill_args *args) @@ -5221,15 +5208,16 @@ error: return -EMSGSIZE; } -static int inet6_fill_ifmcaddr(struct sk_buff *skb, - const struct ifmcaddr6 *ifmca, - struct inet6_fill_args *args) +int inet6_fill_ifmcaddr(struct sk_buff *skb, + const struct ifmcaddr6 *ifmca, + struct inet6_fill_args *args) { int ifindex = ifmca->idev->dev->ifindex; u8 scope = RT_SCOPE_UNIVERSE; struct nlmsghdr *nlh; - if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) + if (!args->force_rt_scope_universe && + ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE) scope = RT_SCOPE_SITE; nlh = nlmsg_put(skb, args->portid, args->seq, args->event, @@ -5255,9 +5243,9 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, return 0; } -static int inet6_fill_ifacaddr(struct sk_buff *skb, - const struct ifacaddr6 *ifaca, - struct inet6_fill_args *args) +int inet6_fill_ifacaddr(struct sk_buff *skb, + const struct ifacaddr6 *ifaca, + struct inet6_fill_args *args) { struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt); int ifindex = dev ? 
dev->ifindex : 1; @@ -5361,12 +5349,12 @@ static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, struct ifaddrmsg *ifm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for address dump request"); return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address dump request"); return -EINVAL; @@ -5418,6 +5406,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, .flags = NLM_F_MULTI, .netnsid = -1, .type = type, + .force_rt_scope_universe = false, }; struct { unsigned long ifindex; @@ -5498,7 +5487,8 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, struct ifaddrmsg *ifm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get address request"); return -EINVAL; } @@ -5507,7 +5497,6 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, extack); - ifm = nlmsg_data(nlh); if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get address request"); return -EINVAL; @@ -5546,6 +5535,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, .event = RTM_NEWADDR, .flags = 0, .netnsid = -1, + .force_rt_scope_universe = false, }; struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; @@ -5617,6 +5607,7 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) .event = event, .flags = 0, .netnsid = -1, + .force_rt_scope_universe = false, }; int err = -ENOBUFS; @@ -5804,6 +5795,27 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, } } +static int inet6_fill_ifla6_stats_attrs(struct sk_buff *skb, + struct inet6_dev *idev) +{ + struct nlattr *nla; + + nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); + if (!nla) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); + + nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); + if (!nla) + goto nla_put_failure; + snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, u32 ext_filter_mask) { @@ -5826,18 +5838,10 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, /* XXX - MC not implemented */ - if (ext_filter_mask & RTEXT_FILTER_SKIP_STATS) - return 0; - - nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64)); - if (!nla) - goto nla_put_failure; - snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla)); - - nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64)); - if (!nla) - goto nla_put_failure; - snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla)); + if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS)) { + if (inet6_fill_ifla6_stats_attrs(skb, idev) < 0) + goto nla_put_failure; + } nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr)); if (!nla) @@ -6111,7 +6115,8 @@ static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh, { struct ifinfomsg *ifm; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) 
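/*
 * Editor's note: the validation hunks above and below replace the
 * open-coded "nlmsg_len < nlmsg_msg_size(...)" check plus a separate
 * nlmsg_data() call with nlmsg_payload(), which hands back the payload
 * only when the header is long enough. A sketch of the semantics
 * (alignment simplified), not the kernel implementation:
 */
#include <stddef.h>
#include <stdint.h>

struct nlmsghdr_s { uint32_t nlmsg_len; uint16_t type, flags; uint32_t seq, pid; };

static void *nlmsg_payload_sketch(struct nlmsghdr_s *nlh, size_t payload_len)
{
	const size_t hdrlen = (sizeof(*nlh) + 3) & ~(size_t)3; /* ~NLMSG_HDRLEN */

	if (nlh->nlmsg_len < hdrlen + payload_len)
		return NULL;	/* caller replies "Invalid header ..." */
	return (char *)nlh + hdrlen;	/* ~nlmsg_data() */
}

int main(void)
{
	struct nlmsghdr_s h = { .nlmsg_len = sizeof(h) };

	return nlmsg_payload_sketch(&h, 8) == NULL ? 0 : 1; /* too short */
}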
{ + ifm = nlmsg_payload(nlh, sizeof(*ifm)); + if (!ifm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for link dump request"); return -EINVAL; } @@ -6121,7 +6126,6 @@ static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh, return -EINVAL; } - ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for dump request"); @@ -6382,7 +6386,7 @@ static void addrconf_disable_change(struct net *net, __s32 newf) struct inet6_dev *idev; for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev) { int changed = (!idev->cnf.disable_ipv6) ^ (!newf); @@ -6403,7 +6407,7 @@ static int addrconf_disable_ipv6(const struct ctl_table *table, int *p, int newf return 0; } - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); old = *p; @@ -6412,10 +6416,11 @@ static int addrconf_disable_ipv6(const struct ctl_table *table, int *p, int newf if (p == &net->ipv6.devconf_all->disable_ipv6) { WRITE_ONCE(net->ipv6.devconf_dflt->disable_ipv6, newf); addrconf_disable_change(net, newf); - } else if ((!newf) ^ (!old)) + } else if ((!newf) ^ (!old)) { dev_disable_change((struct inet6_dev *)table->extra1); + } - rtnl_unlock(); + rtnl_net_unlock(net); return 0; } @@ -6458,20 +6463,20 @@ static int addrconf_sysctl_proxy_ndp(const struct ctl_table *ctl, int write, if (write && old != new) { struct net *net = ctl->extra2; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); - if (valp == &net->ipv6.devconf_dflt->proxy_ndp) + if (valp == &net->ipv6.devconf_dflt->proxy_ndp) { inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_PROXY_NEIGH, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt); - else if (valp == &net->ipv6.devconf_all->proxy_ndp) + } else if (valp == &net->ipv6.devconf_all->proxy_ndp) { inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_PROXY_NEIGH, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); - else { + } else { struct inet6_dev *idev = ctl->extra1; inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, @@ -6479,7 +6484,7 @@ static int addrconf_sysctl_proxy_ndp(const struct ctl_table *ctl, int write, idev->dev->ifindex, &idev->cnf); } - rtnl_unlock(); + rtnl_net_unlock(net); } return ret; @@ -6499,7 +6504,7 @@ static int addrconf_sysctl_addr_gen_mode(const struct ctl_table *ctl, int write, .mode = ctl->mode, }; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); new_val = *((u32 *)ctl->data); @@ -6522,19 +6527,23 @@ static int addrconf_sysctl_addr_gen_mode(const struct ctl_table *ctl, int write, if (idev->cnf.addr_gen_mode != new_val) { WRITE_ONCE(idev->cnf.addr_gen_mode, new_val); + netdev_lock_ops(idev->dev); addrconf_init_auto_addrs(idev->dev); + netdev_unlock_ops(idev->dev); } } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) { struct net_device *dev; WRITE_ONCE(net->ipv6.devconf_dflt->addr_gen_mode, new_val); for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev && idev->cnf.addr_gen_mode != new_val) { WRITE_ONCE(idev->cnf.addr_gen_mode, new_val); + netdev_lock_ops(idev->dev); addrconf_init_auto_addrs(idev->dev); + netdev_unlock_ops(idev->dev); } } } @@ -6543,7 +6552,7 @@ static int addrconf_sysctl_addr_gen_mode(const struct ctl_table *ctl, int write, } out: - rtnl_unlock(); + rtnl_net_unlock(net); return ret; } @@ -6565,7 +6574,7 @@ static int addrconf_sysctl_stable_secret(const struct ctl_table *ctl, 
int write, lctl.maxlen = IPV6_MAX_STRLEN; lctl.data = str; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); if (!write && !secret->initialized) { @@ -6595,7 +6604,7 @@ static int addrconf_sysctl_stable_secret(const struct ctl_table *ctl, int write, struct net_device *dev; for_each_netdev(net, dev) { - struct inet6_dev *idev = __in6_dev_get(dev); + struct inet6_dev *idev = __in6_dev_get_rtnl_net(dev); if (idev) { WRITE_ONCE(idev->cnf.addr_gen_mode, @@ -6610,7 +6619,7 @@ static int addrconf_sysctl_stable_secret(const struct ctl_table *ctl, int write, } out: - rtnl_unlock(); + rtnl_net_unlock(net); return err; } @@ -6694,7 +6703,7 @@ int addrconf_disable_policy(const struct ctl_table *ctl, int *valp, int val) return 0; } - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); WRITE_ONCE(*valp, val); @@ -6703,7 +6712,7 @@ int addrconf_disable_policy(const struct ctl_table *ctl, int *valp, int val) struct net_device *dev; for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev) addrconf_disable_policy_idev(idev, val); } @@ -6712,7 +6721,7 @@ int addrconf_disable_policy(const struct ctl_table *ctl, int *valp, int val) addrconf_disable_policy_idev(idev, val); } - rtnl_unlock(); + rtnl_net_unlock(net); return 0; } @@ -7425,9 +7434,9 @@ static const struct rtnl_msg_handler addrconf_rtnl_msg_handlers[] __initconst_or {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETLINK, .dumpit = inet6_dump_ifinfo, .flags = RTNL_FLAG_DUMP_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWADDR, - .doit = inet6_rtm_newaddr}, + .doit = inet6_rtm_newaddr, .flags = RTNL_FLAG_DOIT_PERNET}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELADDR, - .doit = inet6_rtm_deladdr}, + .doit = inet6_rtm_deladdr, .flags = RTNL_FLAG_DOIT_PERNET}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETADDR, .doit = inet6_rtm_getaddr, .dumpit = inet6_dump_ifaddr, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, @@ -7469,9 +7478,9 @@ int __init addrconf_init(void) goto out_nowq; } - rtnl_lock(); + rtnl_net_lock(&init_net); idev = ipv6_add_dev(blackhole_netdev); - rtnl_unlock(); + rtnl_net_unlock(&init_net); if (IS_ERR(idev)) { err = PTR_ERR(idev); goto errlo; @@ -7521,17 +7530,17 @@ void addrconf_cleanup(void) rtnl_af_unregister(&inet6_ops); - rtnl_lock(); + rtnl_net_lock(&init_net); /* clean dev list */ for_each_netdev(&init_net, dev) { - if (__in6_dev_get(dev) == NULL) + if (!__in6_dev_get_rtnl_net(dev)) continue; addrconf_ifdown(dev, true); } addrconf_ifdown(init_net.loopback_dev, true); - rtnl_unlock(); + rtnl_net_unlock(&init_net); destroy_workqueue(addrconf_wq); } diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index ab054f329e12..fb63ffbcfc64 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -473,12 +473,12 @@ static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh, { struct ifaddrlblmsg *ifal; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) { + ifal = nlmsg_payload(nlh, sizeof(*ifal)); + if (!ifal) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for address label dump request"); return -EINVAL; } - ifal = nlmsg_data(nlh); if (ifal->__ifal_reserved || ifal->ifal_prefixlen || ifal->ifal_flags || ifal->ifal_index || ifal->ifal_seq) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address label dump request"); @@ -543,7 +543,8 @@ static int ip6addrlbl_valid_get_req(struct sk_buff *skb, struct ifaddrlblmsg *ifal; int i, 
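/*
 * Editor's note: the sysctl handlers above keep the long-standing
 * trylock-or-restart idiom across the per-netns RTNL conversion — on
 * contention the handler backs out entirely and lets the syscall be
 * restarted, instead of sleeping on RTNL with the sysctl side held.
 * Skeleton with userspace stubs (the kernel returns restart_syscall()):
 */
#include <errno.h>

struct net_ctx { int locked; };

static int trylock_net(struct net_ctx *n)
{
	if (n->locked)
		return 0;
	n->locked = 1;
	return 1;
}

static void unlock_net(struct net_ctx *n) { n->locked = 0; }

static int sysctl_handler_skeleton(struct net_ctx *net)
{
	if (!trylock_net(net))
		return -EINTR;	/* stand-in for restart_syscall() */
	/* ... update state, emit RTM_NEWNETCONF notifications ... */
	unlock_net(net);
	return 0;
}

int main(void)
{
	struct net_ctx net = { 0 };

	return sysctl_handler_skeleton(&net); /* 0 on the uncontended path */
}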
err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifal))) { + ifal = nlmsg_payload(nlh, sizeof(*ifal)); + if (!ifal) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for addrlabel get request"); return -EINVAL; } @@ -552,7 +553,6 @@ static int ip6addrlbl_valid_get_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy, extack); - ifal = nlmsg_data(nlh); if (ifal->__ifal_reserved || ifal->ifal_flags || ifal->ifal_seq) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for addrlabel get request"); return -EINVAL; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index f60ec8b0f8ea..acaff1296783 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -715,6 +715,7 @@ const struct proto_ops inet6_stream_ops = { #endif .set_rcvlowat = tcp_set_rcvlowat, }; +EXPORT_SYMBOL_GPL(inet6_stream_ops); const struct proto_ops inet6_dgram_ops = { .family = PF_INET6, @@ -881,7 +882,6 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, } return false; } -EXPORT_SYMBOL_GPL(ipv6_opt_accepted); static struct packet_type ipv6_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IPV6), diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index 562cace50ca9..21e01695b48c 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -278,6 +278,37 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i, return aca; } +static void inet6_ifacaddr_notify(struct net_device *dev, + const struct ifacaddr6 *ifaca, int event) +{ + struct inet6_fill_args fillargs = { + .event = event, + .netnsid = -1, + }; + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOMEM; + + skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(sizeof(struct in6_addr)) + + nla_total_size(sizeof(struct ifa_cacheinfo)), + GFP_KERNEL); + if (!skb) + goto error; + + err = inet6_fill_ifacaddr(skb, ifaca, &fillargs); + if (err < 0) { + pr_err("Failed to fill in anycast addresses (err %d)\n", err); + nlmsg_free(skb); + goto error; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ACADDR, NULL, GFP_KERNEL); + return; +error: + rtnl_set_sk_err(net, RTNLGRP_IPV6_ACADDR, err); +} + /* * device anycast group inc (add if not found) */ @@ -333,6 +364,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr) addrconf_join_solict(idev->dev, &aca->aca_addr); + inet6_ifacaddr_notify(idev->dev, aca, RTM_NEWANYCAST); + aca_put(aca); return 0; out: @@ -375,6 +408,8 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); + inet6_ifacaddr_notify(idev->dev, aca, RTM_DELANYCAST); + aca_put(aca); return 0; } diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index dbcea9fee626..62618a058b8f 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -1072,8 +1072,13 @@ static int calipso_sock_getattr(struct sock *sk, struct ipv6_opt_hdr *hop; int opt_len, len, ret_val = -ENOMSG, offset; unsigned char *opt; - struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + struct ipv6_pinfo *pinfo = inet6_sk(sk); + struct ipv6_txoptions *txopts; + + if (!pinfo) + return -EAFNOSUPPORT; + txopts = txopt_get(pinfo); if (!txopts || !txopts->hopopt) goto done; @@ -1125,8 +1130,13 @@ static int calipso_sock_setattr(struct sock *sk, { int ret_val; struct ipv6_opt_hdr *old, *new; - struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + struct ipv6_pinfo *pinfo = inet6_sk(sk); + struct ipv6_txoptions *txopts; + + if (!pinfo) + return -EAFNOSUPPORT; + txopts = 
txopt_get(pinfo); old = NULL; if (txopts) old = txopts->hopopt; @@ -1153,8 +1163,13 @@ static int calipso_sock_setattr(struct sock *sk, static void calipso_sock_delattr(struct sock *sk) { struct ipv6_opt_hdr *new_hop; - struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); + struct ipv6_pinfo *pinfo = inet6_sk(sk); + struct ipv6_txoptions *txopts; + + if (!pinfo) + return; + txopts = txopt_get(pinfo); if (!txopts || !txopts->hopopt) goto done; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index b2400c226a32..72adfc107b55 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -137,47 +137,16 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb) } #ifdef CONFIG_INET6_ESPINTCP -struct esp_tcp_sk { - struct sock *sk; - struct rcu_head rcu; -}; - -static void esp_free_tcp_sk(struct rcu_head *head) -{ - struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); - - sock_put(esk->sk); - kfree(esk); -} - static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) { struct xfrm_encap_tmpl *encap = x->encap; struct net *net = xs_net(x); - struct esp_tcp_sk *esk; __be16 sport, dport; - struct sock *nsk; struct sock *sk; - sk = rcu_dereference(x->encap_sk); - if (sk && sk->sk_state == TCP_ESTABLISHED) - return sk; - spin_lock_bh(&x->lock); sport = encap->encap_sport; dport = encap->encap_dport; - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (sk && sk == nsk) { - esk = kmalloc(sizeof(*esk), GFP_ATOMIC); - if (!esk) { - spin_unlock_bh(&x->lock); - return ERR_PTR(-ENOMEM); - } - RCU_INIT_POINTER(x->encap_sk, NULL); - esk->sk = sk; - call_rcu(&esk->rcu, esp_free_tcp_sk); - } spin_unlock_bh(&x->lock); sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &x->id.daddr.in6, @@ -190,20 +159,6 @@ static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) return ERR_PTR(-EINVAL); } - spin_lock_bh(&x->lock); - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (encap->encap_sport != sport || - encap->encap_dport != dport) { - sock_put(sk); - sk = nsk ?: ERR_PTR(-EREMCHG); - } else if (sk == nsk) { - sock_put(sk); - } else { - rcu_assign_pointer(x->encap_sk, sk); - } - spin_unlock_bh(&x->lock); - return sk; } @@ -216,8 +171,10 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) sk = esp6_find_tcp_sk(x); err = PTR_ERR_OR_ZERO(sk); - if (err) + if (err) { + kfree_skb(skb); goto out; + } bh_lock_sock(sk); if (sock_owned_by_user(sk)) @@ -226,6 +183,8 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) err = espintcp_push_skb(sk, skb); bh_unlock_sock(sk); + sock_put(sk); + out: rcu_read_unlock(); return err; @@ -315,7 +274,7 @@ static void esp_output_done(void *data, int err) x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) esp_output_tail_tcp(x, skb); else - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb_to_full_sk(skb), skb, err); } } @@ -422,6 +381,8 @@ static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x, if (IS_ERR(sk)) return ERR_CAST(sk); + sock_put(sk); + *lenp = htons(len); esph = (struct ip_esp_hdr *)(lenp + 1); @@ -859,7 +820,8 @@ int esp6_input_done2(struct sk_buff *skb, int err) skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull_rcsum(skb, hlen); - if (x->props.mode == XFRM_MODE_TUNNEL) + if (x->props.mode == XFRM_MODE_TUNNEL || + x->props.mode == XFRM_MODE_IPTFS) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -hdr_len); diff --git 
a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 6789623b2b0d..457de0745a33 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -1204,10 +1204,9 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) { struct ipv6_txoptions *opt2; - opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC); + opt2 = sock_kmemdup(sk, opt, opt->tot_len, GFP_ATOMIC); if (opt2) { long dif = (char *)opt2 - (char *)opt; - memcpy(opt2, opt, opt->tot_len); if (opt2->hopopt) *((char **)&opt2->hopopt) += dif; if (opt2->dst0opt) diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index c85c1627cb16..fd5f7112a51f 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -26,7 +26,10 @@ struct fib6_rule { struct fib_rule common; struct rt6key src; struct rt6key dst; + __be32 flowlabel; + __be32 flowlabel_mask; dscp_t dscp; + dscp_t dscp_mask; u8 dscp_full:1; /* DSCP or TOS selector */ }; @@ -34,7 +37,7 @@ static bool fib6_rule_matchall(const struct fib_rule *rule) { struct fib6_rule *r = container_of(rule, struct fib6_rule, common); - if (r->dst.plen || r->src.plen || r->dscp) + if (r->dst.plen || r->src.plen || r->dscp || r->flowlabel_mask) return false; return fib_rule_matchall(rule); } @@ -329,18 +332,21 @@ INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule, return 0; } - if (r->dscp && r->dscp != ip6_dscp(fl6->flowlabel)) + if ((r->dscp ^ ip6_dscp(fl6->flowlabel)) & r->dscp_mask) + return 0; + + if ((r->flowlabel ^ flowi6_get_flowlabel(fl6)) & r->flowlabel_mask) return 0; if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto)) return 0; - if (fib_rule_port_range_set(&rule->sport_range) && - !fib_rule_port_inrange(&rule->sport_range, fl6->fl6_sport)) + if (!fib_rule_port_match(&rule->sport_range, rule->sport_mask, + fl6->fl6_sport)) return 0; - if (fib_rule_port_range_set(&rule->dport_range) && - !fib_rule_port_inrange(&rule->dport_range, fl6->fl6_dport)) + if (!fib_rule_port_match(&rule->dport_range, rule->dport_mask, + fl6->fl6_dport)) return 0; return 1; @@ -355,19 +361,72 @@ static int fib6_nl2rule_dscp(const struct nlattr *nla, struct fib6_rule *rule6, } rule6->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); + rule6->dscp_mask = inet_dsfield_to_dscp(INET_DSCP_MASK); rule6->dscp_full = true; return 0; } +static int fib6_nl2rule_dscp_mask(const struct nlattr *nla, + struct fib6_rule *rule6, + struct netlink_ext_ack *extack) +{ + dscp_t dscp_mask; + + if (!rule6->dscp_full) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "Cannot specify DSCP mask without DSCP value"); + return -EINVAL; + } + + dscp_mask = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); + if (rule6->dscp & ~dscp_mask) { + NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid DSCP mask"); + return -EINVAL; + } + + rule6->dscp_mask = dscp_mask; + + return 0; +} + +static int fib6_nl2rule_flowlabel(struct nlattr **tb, struct fib6_rule *rule6, + struct netlink_ext_ack *extack) +{ + __be32 flowlabel, flowlabel_mask; + + if (NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL) || + NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL_MASK)) + return -EINVAL; + + flowlabel = nla_get_be32(tb[FRA_FLOWLABEL]); + flowlabel_mask = nla_get_be32(tb[FRA_FLOWLABEL_MASK]); + + if (flowlabel_mask & ~IPV6_FLOWLABEL_MASK) { + NL_SET_ERR_MSG_ATTR(extack, tb[FRA_FLOWLABEL_MASK], + "Invalid flow label mask"); + return -EINVAL; + } + + if (flowlabel & ~flowlabel_mask) { + NL_SET_ERR_MSG(extack, "Flow label and mask do not match"); + return -EINVAL; + } + + rule6->flowlabel = flowlabel; + rule6->flowlabel_mask = flowlabel_mask; + + return 0; +} + static 
int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { + struct fib6_rule *rule6 = (struct fib6_rule *)rule; + struct net *net = rule->fr_net; int err = -EINVAL; - struct net *net = sock_net(skb->sk); - struct fib6_rule *rule6 = (struct fib6_rule *) rule; if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, @@ -375,10 +434,19 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, goto errout; } rule6->dscp = inet_dsfield_to_dscp(frh->tos); + rule6->dscp_mask = frh->tos ? inet_dsfield_to_dscp(INET_DSCP_MASK) : 0; if (tb[FRA_DSCP] && fib6_nl2rule_dscp(tb[FRA_DSCP], rule6, extack) < 0) goto errout; + if (tb[FRA_DSCP_MASK] && + fib6_nl2rule_dscp_mask(tb[FRA_DSCP_MASK], rule6, extack) < 0) + goto errout; + + if ((tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) && + fib6_nl2rule_flowlabel(tb, rule6, extack) < 0) + goto errout; + if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) { if (rule->table == RT6_TABLE_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid table"); @@ -444,6 +512,22 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, return 0; } + if (tb[FRA_DSCP_MASK]) { + dscp_t dscp_mask; + + dscp_mask = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP_MASK]) << 2); + if (!rule6->dscp_full || rule6->dscp_mask != dscp_mask) + return 0; + } + + if (tb[FRA_FLOWLABEL] && + nla_get_be32(tb[FRA_FLOWLABEL]) != rule6->flowlabel) + return 0; + + if (tb[FRA_FLOWLABEL_MASK] && + nla_get_be32(tb[FRA_FLOWLABEL_MASK]) != rule6->flowlabel_mask) + return 0; + if (frh->src_len && nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr))) return 0; @@ -466,12 +550,19 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb, if (rule6->dscp_full) { frh->tos = 0; if (nla_put_u8(skb, FRA_DSCP, - inet_dscp_to_dsfield(rule6->dscp) >> 2)) + inet_dscp_to_dsfield(rule6->dscp) >> 2) || + nla_put_u8(skb, FRA_DSCP_MASK, + inet_dscp_to_dsfield(rule6->dscp_mask) >> 2)) goto nla_put_failure; } else { frh->tos = inet_dscp_to_dsfield(rule6->dscp); } + if (rule6->flowlabel_mask && + (nla_put_be32(skb, FRA_FLOWLABEL, rule6->flowlabel) || + nla_put_be32(skb, FRA_FLOWLABEL_MASK, rule6->flowlabel_mask))) + goto nla_put_failure; + if ((rule6->dst.plen && nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) || (rule6->src.plen && @@ -487,7 +578,10 @@ static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) { return nla_total_size(16) /* dst */ + nla_total_size(16) /* src */ - + nla_total_size(1); /* dscp */ + + nla_total_size(1) /* dscp */ + + nla_total_size(1) /* dscp mask */ + + nla_total_size(4) /* flowlabel */ + + nla_total_size(4); /* flowlabel mask */ } static void fib6_rule_flush_cache(struct fib_rules_ops *ops) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 071b0bc1179d..3fd19a84b358 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -76,7 +76,7 @@ static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); if (type == ICMPV6_PKT_TOOBIG) ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); @@ -222,10 +222,10 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1); + 
rcu_read_lock(); + peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr); res = inet_peer_xrlim_allow(peer, tmo); - if (peer) - inet_putpeer(peer); + rcu_read_unlock(); } if (!res) __ICMP6_INC_STATS(net, ip6_dst_idev(dst), @@ -473,7 +473,10 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (!skb->dev) return; - net = dev_net(skb->dev); + + rcu_read_lock(); + + net = dev_net_rcu(skb->dev); mark = IP6_REPLY_MARK(net, skb->mark); /* * Make sure we respect the rules @@ -496,7 +499,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, !(type == ICMPV6_PARAMPROB && code == ICMPV6_UNK_OPTION && (opt_unrec(skb, info)))) - return; + goto out; saddr = NULL; } @@ -526,7 +529,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); - return; + goto out; } /* @@ -535,7 +538,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (is_ineligible(skb)) { net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); - return; + goto out; } /* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */ @@ -582,7 +585,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, np = inet6_sk(sk); if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit)) - goto out; + goto out_unlock; tmp_hdr.icmp6_type = type; tmp_hdr.icmp6_code = code; @@ -600,7 +603,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, dst = icmpv6_route_lookup(net, skb, sk, &fl6); if (IS_ERR(dst)) - goto out; + goto out_unlock; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); @@ -616,7 +619,6 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, goto out_dst_release; } - rcu_read_lock(); idev = __in6_dev_get(skb->dev); if (ip6_append_data(sk, icmpv6_getfrag, &msg, @@ -630,13 +632,15 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, len + sizeof(struct icmp6hdr)); } - rcu_read_unlock(); + out_dst_release: dst_release(dst); -out: +out_unlock: icmpv6_xmit_unlock(sk); out_bh_enable: local_bh_enable(); +out: + rcu_read_unlock(); } EXPORT_SYMBOL(icmp6_send); @@ -679,8 +683,8 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, skb_pull(skb2, nhs); skb_reset_network_header(skb2); - rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, - skb, 0); + rt = rt6_lookup(dev_net_rcu(skb->dev), &ipv6_hdr(skb2)->saddr, + NULL, 0, skb, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; @@ -717,7 +721,7 @@ EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach); static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); struct sock *sk; struct inet6_dev *idev; struct ipv6_pinfo *np; @@ -832,7 +836,7 @@ enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) { struct inet6_skb_parm *opt = IP6CB(skb); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); const struct inet6_protocol *ipprot; enum skb_drop_reason reason; int inner_offset; @@ -889,7 +893,7 @@ out: static int icmpv6_rcv(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); struct net_device *dev = icmp6_dev(skb); struct 
inet6_dev *idev = __in6_dev_get(dev); const struct in6_addr *saddr, *daddr; @@ -921,7 +925,7 @@ static int icmpv6_rcv(struct sk_buff *skb) skb_set_network_header(skb, nh); } - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INMSGS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INMSGS); saddr = &ipv6_hdr(skb)->saddr; daddr = &ipv6_hdr(skb)->daddr; @@ -939,7 +943,7 @@ static int icmpv6_rcv(struct sk_buff *skb) type = hdr->icmp6_type; - ICMP6MSGIN_INC_STATS(dev_net(dev), idev, type); + ICMP6MSGIN_INC_STATS(dev_net_rcu(dev), idev, type); switch (type) { case ICMPV6_ECHO_REQUEST: @@ -953,12 +957,9 @@ static int icmpv6_rcv(struct sk_buff *skb) break; case ICMPV6_ECHO_REPLY: - reason = ping_rcv(skb); - break; - case ICMPV6_EXT_ECHO_REPLY: - reason = ping_rcv(skb); - break; + ping_rcv(skb); + return 0; case ICMPV6_PKT_TOOBIG: /* BUGGG_FUTURE: if packet contains rthdr, we cannot update @@ -1034,9 +1035,9 @@ static int icmpv6_rcv(struct sk_buff *skb) csum_error: reason = SKB_DROP_REASON_ICMP_CSUM; - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_CSUMERRORS); discard_it: - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INERRORS); drop_no_count: kfree_skb_reason(skb, reason); return 0; diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c index 95e9146918cc..b8d43ed4689d 100644 --- a/net/ipv6/ila/ila_common.c +++ b/net/ipv6/ila/ila_common.c @@ -86,7 +86,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&th->check, skb, - diff, true); + diff, true, true); } break; case NEXTHDR_UDP: @@ -97,7 +97,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&uh->check, skb, - diff, true); + diff, true, true); if (!uh->check) uh->check = CSUM_MANGLED_0; } @@ -111,7 +111,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb, diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb, - diff, true); + diff, true, true); } break; } diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index ff7e734e335b..7d574f5132e2 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -88,13 +88,15 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } - if (ilwt->connected) { + /* cache only if we don't create a dst reference loop */ + if (ilwt->connected && orig_dst->lwtstate != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&ilwt->dst_cache, dst, &fl6.saddr); local_bh_enable(); } } + skb_dst_drop(skb); skb_dst_set(skb, dst); return dst_output(net, sk, skb); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 80043e46117c..8f500eaf33cf 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -54,21 +54,6 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, return dst; } -EXPORT_SYMBOL(inet6_csk_route_req); - -void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) -{ - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; - - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = sk->sk_v6_daddr; - sin6->sin6_port = inet_sk(sk)->inet_dport; - /* We do not store received flowlabel for TCP */ - sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, - 
sk->sk_bound_dev_if); -} -EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); static inline struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) @@ -151,4 +136,3 @@ struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) dst = inet6_csk_route_socket(sk, &fl6); return IS_ERR(dst) ? NULL : dst; } -EXPORT_SYMBOL_GPL(inet6_csk_update_pmtu); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 9ec05e354baa..76ee521189eb 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -35,8 +35,8 @@ u32 inet6_ehashfn(const struct net *net, lhash = (__force u32)laddr->s6_addr32[3]; fhash = __ipv6_addr_jhash(faddr, tcp_ipv6_hash_secret); - return __inet6_ehashfn(lhash, lport, fhash, fport, - inet6_ehash_secret + net_hash_mix(net)); + return lport + __inet6_ehashfn(lhash, 0, fhash, fport, + inet6_ehash_secret + net_hash_mix(net)); } EXPORT_SYMBOL_GPL(inet6_ehashfn); @@ -263,7 +263,9 @@ EXPORT_SYMBOL_GPL(inet6_lookup); static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) + struct inet_timewait_sock **twp, + bool rcu_lookup, + u32 hash) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); @@ -273,14 +275,26 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct net *net = sock_net(sk); const int sdif = l3mdev_master_ifindex_by_index(net, dif); const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); - const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr, - inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); - spinlock_t *lock = inet_ehash_lockp(hinfo, hash); - struct sock *sk2; - const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; + const struct hlist_nulls_node *node; + struct sock *sk2; + spinlock_t *lock; + + if (rcu_lookup) { + sk_nulls_for_each(sk2, node, &head->chain) { + if (sk2->sk_hash != hash || + !inet6_match(net, sk2, saddr, daddr, + ports, dif, sdif)) + continue; + if (sk2->sk_state == TCP_TIME_WAIT) + break; + return -EADDRNOTAVAIL; + } + return 0; + } + lock = inet_ehash_lockp(hinfo, hash); spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { @@ -339,11 +353,19 @@ static u64 inet6_sk_port_offset(const struct sock *sk) int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk) { + const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr; + const struct in6_addr *saddr = &sk->sk_v6_daddr; + const struct inet_sock *inet = inet_sk(sk); + const struct net *net = sock_net(sk); u64 port_offset = 0; + u32 hash_port0; if (!inet_sk(sk)->inet_num) port_offset = inet6_sk_port_offset(sk); - return __inet_hash_connect(death_row, sk, port_offset, + + hash_port0 = inet6_ehashfn(net, daddr, 0, saddr, inet->inet_dport); + + return __inet_hash_connect(death_row, sk, port_offset, hash_port0, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 9d8422e350f8..40df8bdfaacd 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -38,6 +38,7 @@ struct ioam6_lwt_freq { }; struct ioam6_lwt { + struct dst_entry null_dst; struct dst_cache cache; struct ioam6_lwt_freq freq; atomic_t pkt_cnt; @@ -177,6 +178,14 @@ static int ioam6_build_state(struct net *net, struct nlattr *nla, if (err) goto free_lwt; + /* This "fake" dst_entry will be stored in a dst_cache, which will call + * 
dst_hold() and dst_release() on it. We must ensure that dst_destroy() + * will never be called. For that, its initial refcount is 1 and +1 when + * it is stored in the cache. Then, +1/-1 each time we read the cache + * and release it. Long story short, we're fine. + */ + dst_init(&ilwt->null_dst, NULL, NULL, DST_OBSOLETE_NONE, DST_NOCOUNT); + atomic_set(&ilwt->pkt_cnt, 0); ilwt->freq.k = freq_k; ilwt->freq.n = freq_n; @@ -253,14 +262,15 @@ static int ioam6_do_fill(struct net *net, struct sk_buff *skb) } static int ioam6_do_inline(struct net *net, struct sk_buff *skb, - struct ioam6_lwt_encap *tuninfo) + struct ioam6_lwt_encap *tuninfo, + struct dst_entry *cache_dst) { struct ipv6hdr *oldhdr, *hdr; int hdrlen, err; hdrlen = (tuninfo->eh.hdrlen + 1) << 3; - err = skb_cow_head(skb, hdrlen + skb->mac_len); + err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -291,7 +301,8 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, struct ioam6_lwt_encap *tuninfo, bool has_tunsrc, struct in6_addr *tunsrc, - struct in6_addr *tundst) + struct in6_addr *tundst, + struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr, *inner_hdr; @@ -300,7 +311,7 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, hdrlen = (tuninfo->eh.hdrlen + 1) << 3; len = sizeof(*hdr) + hdrlen; - err = skb_cow_head(skb, len + skb->mac_len); + err = skb_cow_head(skb, len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -334,8 +345,8 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb); - struct in6_addr orig_daddr; + struct dst_entry *orig_dst = skb_dst(skb); + struct dst_entry *dst = NULL; struct ioam6_lwt *ilwt; int err = -EINVAL; u32 pkt_cnt; @@ -343,14 +354,27 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - ilwt = ioam6_lwt_state(dst->lwtstate); + ilwt = ioam6_lwt_state(orig_dst->lwtstate); /* Check for insertion frequency (i.e., "k over n" insertions) */ pkt_cnt = atomic_fetch_inc(&ilwt->pkt_cnt); if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k) goto out; - orig_daddr = ipv6_hdr(skb)->daddr; + local_bh_disable(); + dst = dst_cache_get(&ilwt->cache); + local_bh_enable(); + + /* This is how we notify that the destination does not change after + * transformation and that we need to use orig_dst instead of the cache + */ + if (dst == &ilwt->null_dst) { + dst_release(dst); + + dst = orig_dst; + /* keep refcount balance: dst_release() is called at the end */ + dst_hold(dst); + } switch (ilwt->mode) { case IOAM6_IPTUNNEL_MODE_INLINE: @@ -359,7 +383,7 @@ do_inline: if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) goto out; - err = ioam6_do_inline(net, skb, &ilwt->tuninfo); + err = ioam6_do_inline(net, skb, &ilwt->tuninfo, dst); if (unlikely(err)) goto drop; @@ -369,7 +393,7 @@ do_encap: /* Encapsulation (ip6ip6) */ err = ioam6_do_encap(net, skb, &ilwt->tuninfo, ilwt->has_tunsrc, &ilwt->tunsrc, - &ilwt->tundst); + &ilwt->tundst, dst); if (unlikely(err)) goto drop; @@ -387,52 +411,65 @@ do_encap: goto drop; } - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; + if (unlikely(!dst)) { + struct ipv6hdr *hdr = ipv6_hdr(skb); + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.daddr = hdr->daddr; + fl6.saddr = hdr->saddr; + fl6.flowlabel = ip6_flowinfo(hdr); 
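The ioam6 hunks above hinge on a sentinel trick spelled out in the patch's own comment: ilwt->null_dst is dst_init()'d with DST_NOCOUNT and a base refcount of 1, so dst_destroy() can never run on it, and finding that entry in the dst_cache tells ioam6_output() "the destination did not change; reuse orig_dst" without creating a reference loop. Below is a minimal userspace model of that pattern — stand-in types and hypothetical names only, not the kernel dst API — showing why the refcounts stay balanced:

#include <assert.h>
#include <stdio.h>

struct dst {
	int refcnt;
	int nocount;		/* models DST_NOCOUNT: never destroyed */
};

static void dst_hold(struct dst *d)
{
	d->refcnt++;
}

static void dst_release(struct dst *d)
{
	if (--d->refcnt == 0 && !d->nocount)
		printf("dst destroyed\n");
}

struct cache {
	struct dst *slot;
};

static void cache_set(struct cache *c, struct dst *d)
{
	dst_hold(d);			/* the cache owns one reference */
	c->slot = d;
}

static struct dst *cache_get(struct cache *c)
{
	if (c->slot)
		dst_hold(c->slot);	/* caller gets its own reference */
	return c->slot;
}

int main(void)
{
	struct dst null_dst = { .refcnt = 1, .nocount = 1 };	/* sentinel */
	struct dst orig = { .refcnt = 1 };
	struct cache c = { 0 };

	cache_set(&c, &null_dst);	/* means: destination unchanged */

	struct dst *d = cache_get(&c);
	if (d == &null_dst) {		/* sentinel found: reuse orig */
		dst_release(d);
		d = &orig;
		dst_hold(d);		/* keep the refcounts balanced */
	}
	dst_release(d);			/* the usual end-of-path release */

	assert(null_dst.refcnt == 2);	/* initial 1 + the cache's 1 */
	assert(orig.refcnt == 1);	/* back to its original count */
	return 0;
}

Because the sentinel's count can only oscillate between 2 and 3, the per-cpu cache entries never need to be purged before dst_cache_destroy(), which is exactly what the ioam6_destroy_state() comment relies on.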
+ fl6.flowi6_mark = skb->mark; + fl6.flowi6_proto = hdr->nexthdr; - if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) { + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + err = dst->error; + goto drop; + } + + /* If the destination is the same after transformation (which is + * a valid use case for IOAM), then we don't want to add it to + * the cache in order to avoid a reference loop. Instead, we add + * our fake dst_entry to the cache as a way to detect this case. + * Otherwise, we add the resolved destination to the cache. + */ local_bh_disable(); - dst = dst_cache_get(&ilwt->cache); + if (orig_dst->lwtstate == dst->lwtstate) + dst_cache_set_ip6(&ilwt->cache, + &ilwt->null_dst, &fl6.saddr); + else + dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); local_bh_enable(); - if (unlikely(!dst)) { - struct ipv6hdr *hdr = ipv6_hdr(skb); - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - fl6.daddr = hdr->daddr; - fl6.saddr = hdr->saddr; - fl6.flowlabel = ip6_flowinfo(hdr); - fl6.flowi6_mark = skb->mark; - fl6.flowi6_proto = hdr->nexthdr; - - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) { - err = dst->error; - dst_release(dst); - goto drop; - } - - local_bh_disable(); - dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); - local_bh_enable(); - } + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; + } + /* avoid lwtunnel_output() reentry loop when destination is the same + * after transformation (e.g., with the inline mode) + */ + if (orig_dst->lwtstate != dst->lwtstate) { skb_dst_drop(skb); skb_dst_set(skb, dst); - return dst_output(net, sk, skb); } out: - return dst->lwtstate->orig_output(net, sk, skb); + dst_release(dst); + return orig_dst->lwtstate->orig_output(net, sk, skb); drop: + dst_release(dst); kfree_skb(skb); return err; } static void ioam6_destroy_state(struct lwtunnel_state *lwt) { + /* Since the refcount of per-cpu dst_entry caches will never be 0 (see + * why above) when our "fake" dst_entry is used, it is not necessary to + * remove them before calling dst_cache_destroy() + */ dst_cache_destroy(&ioam6_lwt_state(lwt)->cache); } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c134ba202c4c..7094d7708686 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -249,40 +249,52 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) struct fib6_table *fib6_new_table(struct net *net, u32 id) { - struct fib6_table *tb; + struct fib6_table *tb, *new_tb; if (id == 0) id = RT6_TABLE_MAIN; + tb = fib6_get_table(net, id); if (tb) return tb; - tb = fib6_alloc_table(net, id); - if (tb) - fib6_link_table(net, tb); + new_tb = fib6_alloc_table(net, id); + if (!new_tb) + return NULL; + + spin_lock_bh(&net->ipv6.fib_table_hash_lock); + + tb = fib6_get_table(net, id); + if (unlikely(tb)) { + spin_unlock_bh(&net->ipv6.fib_table_hash_lock); + kfree(new_tb); + return tb; + } + + fib6_link_table(net, new_tb); + + spin_unlock_bh(&net->ipv6.fib_table_hash_lock); - return tb; + return new_tb; } EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { - struct fib6_table *tb; struct hlist_head *head; - unsigned int h; + struct fib6_table *tb; - if (id == 0) + if (!id) id = RT6_TABLE_MAIN; - h = id & (FIB6_TABLE_HASHSZ - 1); - rcu_read_lock(); - head = &net->ipv6.fib_table_hash[h]; - hlist_for_each_entry_rcu(tb, head, tb6_hlist) { - if (tb->tb6_id == id) { - rcu_read_unlock(); + + head = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)]; + + /* See comment in 
fib6_link_table(). RCU is not required, + * but rcu_dereference_raw() is used to avoid data-race. + */ + hlist_for_each_entry_rcu(tb, head, tb6_hlist, true) + if (tb->tb6_id == id) return tb; - } - } - rcu_read_unlock(); return NULL; } @@ -1015,8 +1027,9 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, .table = table }; - nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, - &arg); + rcu_read_lock(); + nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, &arg); + rcu_read_unlock(); } else { struct fib6_nh *fib6_nh; @@ -1034,8 +1047,14 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt, table); - if (rt->nh && !list_empty(&rt->nh_list)) - list_del_init(&rt->nh_list); + if (rt->nh) { + spin_lock(&rt->nh->lock); + + if (!list_empty(&rt->nh_list)) + list_del_init(&rt->nh_list); + + spin_unlock(&rt->nh->lock); + } if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split @@ -1069,8 +1088,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, - struct nl_info *info, - struct netlink_ext_ack *extack) + struct nl_info *info, struct netlink_ext_ack *extack, + struct list_head *purge_list) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); @@ -1203,7 +1222,9 @@ next_iter: fib6_nsiblings++; } BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); + rcu_read_lock(); rt6_multipath_rebalance(temp_sibling); + rcu_read_unlock(); } /* @@ -1246,7 +1267,9 @@ add: sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_rcu(&rt->fib6_siblings); + rcu_read_lock(); rt6_multipath_rebalance(next_sibling); + rcu_read_unlock(); return err; } } @@ -1294,10 +1317,9 @@ add: } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); + list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ @@ -1310,10 +1332,9 @@ add: if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; - fib6_purge_rt(iter, fn, info->nl_net); + list_add(&iter->purge_link, purge_list); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; - fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { @@ -1329,6 +1350,28 @@ add: return 0; } +static int fib6_add_rt2node_nh(struct fib6_node *fn, struct fib6_info *rt, + struct nl_info *info, struct netlink_ext_ack *extack, + struct list_head *purge_list) +{ + int err; + + spin_lock(&rt->nh->lock); + + if (rt->nh->dead) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + err = -EINVAL; + } else { + err = fib6_add_rt2node(fn, rt, info, extack, purge_list); + if (!err) + list_add(&rt->nh_list, &rt->nh->f6i_list); + } + + spin_unlock(&rt->nh->lock); + + return err; +} + static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && @@ -1383,6 +1426,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; + LIST_HEAD(purge_list); struct fib6_node *fn; #ifdef CONFIG_IPV6_SUBTREES struct fib6_node *pn = NULL; @@ -1485,10 +1529,19 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, } #endif - err = 
fib6_add_rt2node(fn, rt, info, extack); + if (rt->nh) + err = fib6_add_rt2node_nh(fn, rt, info, extack, &purge_list); + else + err = fib6_add_rt2node(fn, rt, info, extack, &purge_list); if (!err) { - if (rt->nh) - list_add(&rt->nh_list, &rt->nh->f6i_list); + struct fib6_info *iter, *next; + + list_for_each_entry_safe(iter, next, &purge_list, purge_link) { + list_del(&iter->purge_link); + fib6_purge_rt(iter, fn, info->nl_net); + fib6_info_release(iter); + } + __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); if (rt->fib6_flags & RTF_EXPIRES) @@ -2383,7 +2436,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) round_jiffies(now + net->ipv6.sysctl.ip6_rt_gc_interval)); else - del_timer(&net->ipv6.ip6_fib_timer); + timer_delete(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&net->ipv6.fib6_gc_lock); } @@ -2423,6 +2476,8 @@ static int __net_init fib6_net_init(struct net *net) if (!net->ipv6.fib_table_hash) goto out_rt6_stats; + spin_lock_init(&net->ipv6.fib_table_hash_lock); + net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) @@ -2470,7 +2525,7 @@ static void fib6_net_exit(struct net *net) { unsigned int i; - del_timer_sync(&net->ipv6.ip6_fib_timer); + timer_delete_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index eca07e10e21f..a3ff575798dd 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -907,6 +907,6 @@ int ip6_flowlabel_init(void) void ip6_flowlabel_cleanup(void) { static_key_deferred_flush(&ipv6_flowlabel_exclusive); - del_timer(&ip6_fl_gc_timer); + timer_delete(&ip6_fl_gc_timer); unregister_pernet_subsys(&ip6_flowlabel_net_ops); } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 235808cfec70..2dc9dcffe2ca 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -43,6 +43,7 @@ #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netdev_lock.h> #include <net/rtnetlink.h> #include <net/ipv6.h> @@ -1498,7 +1499,6 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; - tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); @@ -1570,7 +1570,7 @@ static struct inet6_protocol ip6gre_protocol __read_mostly = { .flags = INET6_PROTO_FINAL, }; -static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) +static void __net_exit ip6gre_exit_rtnl_net(struct net *net, struct list_head *head) { struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); struct net_device *dev, *aux; @@ -1587,16 +1587,16 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) for (h = 0; h < IP6_GRE_HASH_SIZE; h++) { struct ip6_tnl *t; - t = rtnl_dereference(ign->tunnels[prio][h]); + t = rtnl_net_dereference(net, ign->tunnels[prio][h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, - head); - t = rtnl_dereference(t->next); + unregister_netdevice_queue(t->dev, head); + + t = rtnl_net_dereference(net, t->next); } } } @@ -1621,7 +1621,7 @@ static int __net_init ip6gre_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. 
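ip6gre's exit_batch_rtnl callback is folded into a per-namespace ->exit_rtnl() hook here, and the same conversion is applied to ip6_tnl and vti6 further down in this diff: the pernet core now walks the namespace list itself, with the RTNL held, and invokes the driver once per net, so each driver drops its own ASSERT_RTNL()/list_for_each_entry() boilerplate. A rough userspace model of the shape of that change — all types and names below are invented stand-ins, not the kernel's pernet API:

#include <stdio.h>

struct net { int id; };

/* new-style hook: called once per namespace, lock already held */
static void exit_rtnl_net(struct net *net)
{
	printf("tearing down tunnels in netns %d\n", net->id);
}

/* what the core now does on cleanup (RTNL held around the loop) */
static void core_cleanup(struct net *nets, int n,
			 void (*exit_rtnl)(struct net *))
{
	for (int i = 0; i < n; i++)
		exit_rtnl(&nets[i]);	/* driver no longer loops itself */
}

int main(void)
{
	struct net nets[] = { { .id = 1 }, { .id = 2 } };

	core_cleanup(nets, 2, exit_rtnl_net);
	return 0;
}

Centralizing the loop also lets the core pair the iteration with per-net locking primitives such as rtnl_net_dereference(), which is why the tunnel-walk code in these hunks switches away from plain rtnl_dereference().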
*/ - ign->fb_tunnel_dev->netns_local = true; + ign->fb_tunnel_dev->netns_immutable = true; ip6gre_fb_tunnel_init(ign->fb_tunnel_dev); ign->fb_tunnel_dev->rtnl_link_ops = &ip6gre_link_ops; @@ -1640,19 +1640,9 @@ err_alloc_dev: return err; } -static void __net_exit ip6gre_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - ip6gre_destroy_tunnels(net, dev_to_kill); -} - static struct pernet_operations ip6gre_net_ops = { .init = ip6gre_init_net, - .exit_batch_rtnl = ip6gre_exit_batch_rtnl, + .exit_rtnl = ip6gre_exit_rtnl_net, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), }; @@ -1882,7 +1872,6 @@ static int ip6erspan_tap_init(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; - tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); @@ -1971,7 +1960,7 @@ static bool ip6gre_netlink_encap_parms(struct nlattr *data[], return ret; } -static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev, +static int ip6gre_newlink_common(struct net *link_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { @@ -1992,7 +1981,7 @@ static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev, eth_hw_addr_random(dev); nt->dev = dev; - nt->net = dev_net(dev); + nt->net = link_net; err = register_netdevice(dev); if (err) @@ -2005,12 +1994,14 @@ out: return err; } -static int ip6gre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ip6gre_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *net = params->link_net ? : dev_net(dev); struct ip6_tnl *nt = netdev_priv(dev); - struct net *net = dev_net(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip6gre_net *ign; int err; @@ -2025,7 +2016,7 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, return -EEXIST; } - err = ip6gre_newlink_common(src_net, dev, tb, data, extack); + err = ip6gre_newlink_common(net, dev, tb, data, extack); if (!err) { ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); ip6gre_tunnel_link_md(ign, nt); @@ -2241,12 +2232,14 @@ static void ip6erspan_tap_setup(struct net_device *dev) netif_keep_dst(dev); } -static int ip6erspan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ip6erspan_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *net = params->link_net ? 
: dev_net(dev); struct ip6_tnl *nt = netdev_priv(dev); - struct net *net = dev_net(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip6gre_net *ign; int err; @@ -2262,7 +2255,7 @@ static int ip6erspan_newlink(struct net *src_net, struct net_device *dev, return -EEXIST; } - err = ip6gre_newlink_common(src_net, dev, tb, data, extack); + err = ip6gre_newlink_common(net, dev, tb, data, extack); if (!err) { ip6erspan_tnl_link_config(nt, !tb[IFLA_MTU]); ip6erspan_tunnel_link_md(ign, nt); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 70c0e16c0ae6..39da6a7ce5f1 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -477,9 +477,7 @@ discard: static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_clear_delivery_time(skb); - rcu_read_lock(); ip6_protocol_deliver_rcu(net, skb, 0, false); - rcu_read_unlock(); return 0; } @@ -487,9 +485,15 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk int ip6_input(struct sk_buff *skb) { - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, - dev_net(skb->dev), NULL, skb, skb->dev, NULL, - ip6_input_finish); + int res; + + rcu_read_lock(); + res = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, + dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL, + ip6_input_finish); + rcu_read_unlock(); + + return res; } EXPORT_SYMBOL_GPL(ip6_input); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f7b4608bb316..7bd29a9ff0db 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -259,7 +259,7 @@ bool ip6_autoflowlabel(struct net *net, const struct sock *sk) } /* - * xmit an sk_buff (used by TCP, SCTP and DCCP) + * xmit an sk_buff (used by TCP and SCTP) * Note : socket lock is not held for SYNACK packets, but might be modified * by calls to skb_set_owner_w() and ipv6_local_error(), * which are using proper atomic operations or spinlocks. @@ -613,15 +613,15 @@ int ip6_forward(struct sk_buff *skb) else target = &hdr->daddr; - peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1); + rcu_read_lock(); + peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr); /* Limit redirects both by destination (here) and by source (inside ndisc_send_redirect) */ if (inet_peer_xrlim_allow(peer, 1*HZ)) ndisc_send_redirect(skb, target); - if (peer) - inet_putpeer(peer); + rcu_read_unlock(); } else { int addrtype = ipv6_addr_type(&hdr->saddr); @@ -1386,6 +1386,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, } v6_cork->hop_limit = ipc6->hlimit; v6_cork->tclass = ipc6->tclass; + v6_cork->dontfrag = ipc6->dontfrag; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? 
READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); @@ -1401,6 +1402,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, cork->base.gso_size = ipc6->gso_size; cork->base.tx_flags = 0; cork->base.mark = ipc6->sockc.mark; + cork->base.priority = ipc6->sockc.priority; sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags); if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { cork->base.flags |= IPCORK_TS_OPT_ID; @@ -1420,7 +1422,7 @@ static int __ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, - unsigned int flags, struct ipcm6_cookie *ipc6) + unsigned int flags) { struct sk_buff *skb, *skb_prev = NULL; struct inet_cork *cork = &cork_full->base; @@ -1474,7 +1476,7 @@ static int __ip6_append_data(struct sock *sk, if (headersize + transhdrlen > mtu) goto emsgsize; - if (cork->length + length > mtu - headersize && ipc6->dontfrag && + if (cork->length + length > mtu - headersize && v6_cork->dontfrag && (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_ICMPV6 || sk->sk_protocol == IPPROTO_RAW)) { @@ -1522,7 +1524,8 @@ emsgsize: uarg = msg->msg_ubuf; } } else if (sock_flag(sk, SOCK_ZEROCOPY)) { - uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); + uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), + false); if (!uarg) return -ENOBUFS; extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ @@ -1697,8 +1700,9 @@ alloc_new_skb: pskb_trim_unique(skb_prev, maxfraglen); } if (copy > 0 && - getfrag(from, data + transhdrlen, offset, - copy, fraggap, skb) < 0) { + INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, data + transhdrlen, offset, + copy, fraggap, skb) < 0) { err = -EFAULT; kfree_skb(skb); goto error; @@ -1742,8 +1746,9 @@ alloc_new_skb: unsigned int off; off = skb->len; - if (getfrag(from, skb_put(skb, copy), - offset, copy, off, skb) < 0) { + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, skb_put(skb, copy), + offset, copy, off, skb) < 0) { __skb_trim(skb, off); err = -EFAULT; goto error; @@ -1781,7 +1786,8 @@ alloc_new_skb: get_page(pfrag->page); } copy = min_t(int, copy, pfrag->size - pfrag->offset); - if (getfrag(from, + if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, + from, page_address(pfrag->page) + pfrag->offset, offset, copy, skb->len, skb) < 0) goto error_efault; @@ -1851,7 +1857,7 @@ int ip6_append_data(struct sock *sk, return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, &np->cork, sk_page_frag(sk), getfrag, - from, length, transhdrlen, flags, ipc6); + from, length, transhdrlen, flags); } EXPORT_SYMBOL_GPL(ip6_append_data); @@ -1939,7 +1945,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, hdr->saddr = fl6->saddr; hdr->daddr = *final_dst; - skb->priority = READ_ONCE(sk->sk_priority); + skb->priority = cork->base.priority; skb->mark = cork->base.mark; if (sk_is_tcp(sk)) skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC); @@ -2050,13 +2056,11 @@ struct sk_buff *ip6_make_skb(struct sock *sk, ip6_cork_release(cork, &v6_cork); return ERR_PTR(err); } - if (ipc6->dontfrag < 0) - ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk); err = __ip6_append_data(sk, &queue, cork, &v6_cork, ¤t->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, - flags, ipc6); + flags); if (err) { __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); return ERR_PTR(err); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 48fd53b98972..894d3158a6f0 100644 --- 
a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -52,6 +52,7 @@ #include <net/inet_ecn.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netdev_lock.h> #include <net/dst_metadata.h> #include <net/inet_dscp.h> @@ -253,8 +254,7 @@ static void ip6_dev_free(struct net_device *dev) static int ip6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + struct ip6_tnl_net *ip6n = net_generic(t->net, ip6_tnl_net_id); int err; dev->rtnl_link_ops = &ip6_link_ops; @@ -1878,7 +1878,6 @@ ip6_tnl_dev_init_gen(struct net_device *dev) int t_hlen; t->dev = dev; - t->net = dev_net(dev); ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) @@ -1940,6 +1939,7 @@ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + t->net = net; t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); @@ -2002,17 +2002,22 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[], parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } -static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ip6_tnl_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { - struct net *net = dev_net(dev); - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; + struct ip6_tnl_net *ip6n; struct ip6_tnl *nt, *t; + struct net *net; int err; + net = params->link_net ? : dev_net(dev); + ip6n = net_generic(net, ip6_tnl_net_id); nt = netdev_priv(dev); + nt->net = net; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip6_tnl_encap_setup(nt, &ipencap); @@ -2205,7 +2210,7 @@ static struct xfrm6_tunnel mplsip6_handler __read_mostly = { .priority = 1, }; -static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) +static void __net_exit ip6_tnl_exit_rtnl_net(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); struct net_device *dev, *aux; @@ -2217,25 +2222,27 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head unregister_netdevice_queue(dev, list); for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { - t = rtnl_dereference(ip6n->tnls_r_l[h]); + t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + + t = rtnl_net_dereference(net, t->next); } } - t = rtnl_dereference(ip6n->tnls_wc[0]); + t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + + t = rtnl_net_dereference(net, t->next); } } @@ -2261,7 +2268,7 @@ static int __net_init ip6_tnl_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. 
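The same ->newlink() signature change recurs in ip6gre_newlink(), ip6erspan_newlink(), ip6_tnl_newlink() and vti6_newlink(): the old (src_net, dev, tb, data, extack) argument list becomes a single struct rtnl_newlink_params, the owning namespace is resolved as params->link_net when the user supplied one and dev_net(dev) otherwise, and the tunnel records it in t->net at newlink time instead of in the generic dev-init path. A simplified, non-kernel sketch of that resolution order (all structs below are illustrative stand-ins):

#include <stddef.h>
#include <stdio.h>

struct net { int id; };
struct net_device { struct net *net; char name[16]; };

struct rtnl_newlink_params {
	struct net *link_net;	/* NULL: no explicit target netns */
	const char **tb;	/* generic attributes (stand-in) */
	const char **data;	/* type-specific attributes (stand-in) */
};

static struct net *dev_net(struct net_device *dev)
{
	return dev->net;
}

static int tnl_newlink(struct net_device *dev,
		       struct rtnl_newlink_params *params)
{
	/* link_net if the user asked for one, else the device's netns */
	struct net *net = params->link_net ? params->link_net
					   : dev_net(dev);

	printf("%s registered in netns %d\n", dev->name, net->id);
	return 0;
}

int main(void)
{
	struct net ns0 = { .id = 0 }, ns1 = { .id = 1 };
	struct net_device dev = { .net = &ns0, .name = "ip6tnl0" };
	struct rtnl_newlink_params p = { .link_net = &ns1 };

	tnl_newlink(&dev, &p);		/* lands in netns 1 */
	p.link_net = NULL;
	tnl_newlink(&dev, &p);		/* falls back to netns 0 */
	return 0;
}

Bundling the arguments means future additions (like link_net here) do not ripple through every driver's function signature.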
*/ - ip6n->fb_tnl_dev->netns_local = true; + ip6n->fb_tnl_dev->netns_immutable = true; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) @@ -2282,19 +2289,9 @@ err_alloc_dev: return err; } -static void __net_exit ip6_tnl_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - ip6_tnl_destroy_tunnels(net, dev_to_kill); -} - static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, - .exit_batch_rtnl = ip6_tnl_exit_batch_rtnl, + .exit_rtnl = ip6_tnl_exit_rtnl_net, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 590737c27537..40464a88bca6 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -45,6 +45,7 @@ #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netdev_lock.h> #include <linux/etherdevice.h> #define IP6_VTI_HASH_SIZE_SHIFT 5 @@ -177,8 +178,7 @@ vti6_tnl_unlink(struct vti6_net *ip6n, struct ip6_tnl *t) static int vti6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct vti6_net *ip6n = net_generic(t->net, vti6_net_id); int err; dev->rtnl_link_ops = &vti6_link_ops; @@ -925,7 +925,6 @@ static inline int vti6_dev_init_gen(struct net_device *dev) struct ip6_tnl *t = netdev_priv(dev); t->dev = dev; - t->net = dev_net(dev); netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; @@ -958,6 +957,7 @@ static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev) struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); + t->net = net; t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); @@ -997,17 +997,20 @@ static void vti6_netlink_parms(struct nlattr *data[], parms->fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } -static int vti6_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int vti6_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { - struct net *net = dev_net(dev); + struct nlattr **data = params->data; struct ip6_tnl *nt; + struct net *net; + net = params->link_net ? 
: dev_net(dev); nt = netdev_priv(dev); vti6_netlink_parms(data, &nt->parms); nt->parms.proto = IPPROTO_IPV6; + nt->net = net; if (vti6_locate(net, &nt->parms, 0)) return -EEXIST; @@ -1109,21 +1112,21 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = { .get_link_net = ip6_tnl_get_link_net, }; -static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n, - struct list_head *list) +static void __net_exit vti6_exit_rtnl_net(struct net *net, struct list_head *list) { - int h; + struct vti6_net *ip6n = net_generic(net, vti6_net_id); struct ip6_tnl *t; + int h; for (h = 0; h < IP6_VTI_HASH_SIZE; h++) { - t = rtnl_dereference(ip6n->tnls_r_l[h]); + t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]); while (t) { unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); + t = rtnl_net_dereference(net, t->next); } } - t = rtnl_dereference(ip6n->tnls_wc[0]); + t = rtnl_net_dereference(net, ip6n->tnls_wc[0]); if (t) unregister_netdevice_queue(t->dev, list); } @@ -1167,22 +1170,9 @@ err_alloc_dev: return err; } -static void __net_exit vti6_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct vti6_net *ip6n; - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) { - ip6n = net_generic(net, vti6_net_id); - vti6_destroy_tunnels(ip6n, dev_to_kill); - } -} - static struct pernet_operations vti6_net_ops = { .init = vti6_init_net, - .exit_batch_rtnl = vti6_exit_batch_rtnl, + .exit_rtnl = vti6_exit_rtnl_net, .id = &vti6_net_id, .size = sizeof(struct vti6_net), }; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 578ff1336afe..3276cde5ebd7 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -108,11 +108,6 @@ static void ipmr_expire_process(struct timer_list *t); lockdep_rtnl_is_held() || \ list_empty(&net->ipv6.mr6_tables)) -static bool ip6mr_can_free_table(struct net *net) -{ - return !check_net(net) || !net_initialized(net); -} - static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { @@ -306,11 +301,6 @@ EXPORT_SYMBOL(ip6mr_rule_default); #define ip6mr_for_each_table(mrt, net) \ for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) -static bool ip6mr_can_free_table(struct net *net) -{ - return !check_net(net); -} - static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { @@ -416,7 +406,7 @@ static void ip6mr_free_table(struct mr_table *mrt) { struct net *net = read_pnet(&mrt->net); - WARN_ON_ONCE(!ip6mr_can_free_table(net)); + WARN_ON_ONCE(!mr_can_free_table(net)); timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | @@ -520,9 +510,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", - mfc->_c.mfc_un.res.pkt, - mfc->_c.mfc_un.res.bytes, - mfc->_c.mfc_un.res.wrong_if); + atomic_long_read(&mfc->_c.mfc_un.res.pkt), + atomic_long_read(&mfc->_c.mfc_un.res.bytes), + atomic_long_read(&mfc->_c.mfc_un.res.wrong_if)); for (n = mfc->_c.mfc_un.res.minvif; n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && @@ -668,7 +658,7 @@ static void reg_vif_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->netdev_ops = ®_vif_netdev_ops; dev->needs_free_netdev = true; - dev->netns_local = true; + dev->netns_immutable = true; } static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) @@ -884,7 +874,7 @@ static void ip6mr_update_thresholds(struct mr_table *mrt, cache->mfc_un.res.maxvif = vifi + 
1; } } - cache->mfc_un.res.lastuse = jiffies; + WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies); } static int mif6_add(struct net *net, struct mr_table *mrt, @@ -1526,7 +1516,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, } } if (list_empty(&mrt->mfc_unres_queue)) - del_timer(&mrt->ipmr_expire_timer); + timer_delete(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); if (found) { @@ -1945,9 +1935,9 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void *arg) c = ip6mr_cache_find(mrt, &sr->src.sin6_addr, &sr->grp.sin6_addr); if (c) { - sr->pktcnt = c->_c.mfc_un.res.pkt; - sr->bytecnt = c->_c.mfc_un.res.bytes; - sr->wrong_if = c->_c.mfc_un.res.wrong_if; + sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); + sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); + sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); return 0; } @@ -2017,9 +2007,9 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { - sr.pktcnt = c->_c.mfc_un.res.pkt; - sr.bytecnt = c->_c.mfc_un.res.bytes; - sr.wrong_if = c->_c.mfc_un.res.wrong_if; + sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt); + sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes); + sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if); rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) @@ -2142,9 +2132,9 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, int true_vifi = ip6mr_find_vif(mrt, dev); vif = c->_c.mfc_parent; - c->_c.mfc_un.res.pkt++; - c->_c.mfc_un.res.bytes += skb->len; - c->_c.mfc_un.res.lastuse = jiffies; + atomic_long_inc(&c->_c.mfc_un.res.pkt); + atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); + WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) { struct mfc6_cache *cache_proxy; @@ -2162,7 +2152,7 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, * Wrong interface: drop packet and (maybe) send PIM assert. 
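The ip6mr counter conversion above — pkt/bytes/wrong_if moving to atomic_long_inc()/atomic_long_add()/atomic_long_read(), and lastuse wrapped in WRITE_ONCE()/READ_ONCE() — exists so the procfs seq_show and SIOCGETSGCNT ioctl readers can run concurrently with the forwarding path without sharing its locks. A userspace analogue using C11 atomics; the relaxed single-word store/load is the moral equivalent of the kernel's WRITE_ONCE/READ_ONCE, and all names here are illustrative:

#include <stdatomic.h>
#include <stdio.h>

struct mfc_stats {
	atomic_long pkt;
	atomic_long bytes;
	atomic_long wrong_if;
	atomic_long lastuse;	/* stands in for the WRITE_ONCE'd field */
};

/* forwarding path: runs concurrently with the reader below */
static void account_packet(struct mfc_stats *s, long len, long now)
{
	atomic_fetch_add_explicit(&s->pkt, 1, memory_order_relaxed);
	atomic_fetch_add_explicit(&s->bytes, len, memory_order_relaxed);
	/* relaxed store: analogous to WRITE_ONCE(lastuse, jiffies) */
	atomic_store_explicit(&s->lastuse, now, memory_order_relaxed);
}

/* reader side: never sees a torn or half-updated value */
static void dump_stats(struct mfc_stats *s)
{
	printf("pkt=%ld bytes=%ld wrong_if=%ld lastuse=%ld\n",
	       atomic_load_explicit(&s->pkt, memory_order_relaxed),
	       atomic_load_explicit(&s->bytes, memory_order_relaxed),
	       atomic_load_explicit(&s->wrong_if, memory_order_relaxed),
	       atomic_load_explicit(&s->lastuse, memory_order_relaxed));
}

int main(void)
{
	struct mfc_stats s = { 0 };

	account_packet(&s, 1280, 1);
	account_packet(&s, 40, 2);
	dump_stats(&s);
	return 0;
}

The same READ_ONCE/WRITE_ONCE pairing shows up in the mcast.c hunks below for sf_count and mca_sfcount, where ipv6_chk_mcast_addr() reads the counters under only rcu_read_lock().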
*/ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) { - c->_c.mfc_un.res.wrong_if++; + atomic_long_inc(&c->_c.mfc_un.res.wrong_if); if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index b244dbf61d5f..65831b4fee1f 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -33,8 +33,10 @@ #include <linux/in.h> #include <linux/in6.h> #include <linux/netdevice.h> +#include <linux/if_addr.h> #include <linux/if_arp.h> #include <linux/route.h> +#include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -47,6 +49,7 @@ #include <linux/netfilter_ipv6.h> #include <net/net_namespace.h> +#include <net/netlink.h> #include <net/sock.h> #include <net/snmp.h> @@ -901,6 +904,41 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev, return mc; } +static void inet6_ifmcaddr_notify(struct net_device *dev, + const struct ifmcaddr6 *ifmca, int event) +{ + struct inet6_fill_args fillargs = { + .portid = 0, + .seq = 0, + .event = event, + .flags = 0, + .netnsid = -1, + .force_rt_scope_universe = true, + }; + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOMEM; + + skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) + + nla_total_size(sizeof(struct in6_addr)) + + nla_total_size(sizeof(struct ifa_cacheinfo)), + GFP_KERNEL); + if (!skb) + goto error; + + err = inet6_fill_ifmcaddr(skb, ifmca, &fillargs); + if (err < 0) { + WARN_ON_ONCE(err == -EMSGSIZE); + nlmsg_free(skb); + goto error; + } + + rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MCADDR, NULL, GFP_KERNEL); + return; +error: + rtnl_set_sk_err(net, RTNLGRP_IPV6_MCADDR, err); +} + /* * device multicast group inc (add if not found) */ @@ -948,6 +986,7 @@ static int __ipv6_dev_mc_inc(struct net_device *dev, mld_del_delrec(idev, mc); igmp6_group_added(mc); + inet6_ifmcaddr_notify(dev, mc, RTM_NEWMULTICAST); mutex_unlock(&idev->mc_lock); ma_put(mc); return 0; @@ -977,6 +1016,8 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr) *map = ma->next; igmp6_group_dropped(ma); + inet6_ifmcaddr_notify(idev->dev, ma, + RTM_DELMULTICAST); ip6_mc_clear_src(ma); mutex_unlock(&idev->mc_lock); @@ -1021,29 +1062,31 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, rcu_read_lock(); idev = __in6_dev_get(dev); - if (idev) { - for_each_mc_rcu(idev, mc) { - if (ipv6_addr_equal(&mc->mca_addr, group)) - break; - } - if (mc) { - if (src_addr && !ipv6_addr_any(src_addr)) { - struct ip6_sf_list *psf; + if (!idev) + goto unlock; + for_each_mc_rcu(idev, mc) { + if (ipv6_addr_equal(&mc->mca_addr, group)) + break; + } + if (!mc) + goto unlock; + if (src_addr && !ipv6_addr_any(src_addr)) { + struct ip6_sf_list *psf; - for_each_psf_rcu(mc, psf) { - if (ipv6_addr_equal(&psf->sf_addr, src_addr)) - break; - } - if (psf) - rv = psf->sf_count[MCAST_INCLUDE] || - psf->sf_count[MCAST_EXCLUDE] != - mc->mca_sfcount[MCAST_EXCLUDE]; - else - rv = mc->mca_sfcount[MCAST_EXCLUDE] != 0; - } else - rv = true; /* don't filter unspecified source */ + for_each_psf_rcu(mc, psf) { + if (ipv6_addr_equal(&psf->sf_addr, src_addr)) + break; } + if (psf) + rv = READ_ONCE(psf->sf_count[MCAST_INCLUDE]) || + READ_ONCE(psf->sf_count[MCAST_EXCLUDE]) != + READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]); + else + rv = READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]) != 0; + } else { + rv = true; /* don't filter unspecified source */ } +unlock: rcu_read_unlock(); return rv; } @@ -1730,21 +1773,19 @@ static 
struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) struct net_device *dev = idev->dev; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - struct net *net = dev_net(dev); const struct in6_addr *saddr; struct in6_addr addr_buf; struct mld2_report *pmr; struct sk_buff *skb; unsigned int size; struct sock *sk; - int err; + struct net *net; - sk = net->ipv6.igmp_sk; /* we assume size > sizeof(ra) here * Also try to not allocate high-order pages for big MTU */ size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen; - skb = sock_alloc_send_skb(sk, size, 1, &err); + skb = alloc_skb(size, GFP_KERNEL); if (!skb) return NULL; @@ -1752,6 +1793,12 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); + rcu_read_lock(); + + net = dev_net_rcu(dev); + sk = net->ipv6.igmp_sk; + skb_set_owner_w(skb, sk); + if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address @@ -1763,6 +1810,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); + rcu_read_unlock(); + skb_put_data(skb, ra, sizeof(ra)); skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data); @@ -2122,21 +2171,21 @@ static void mld_send_cr(struct inet6_dev *idev) static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) { - struct net *net = dev_net(dev); - struct sock *sk = net->ipv6.igmp_sk; + const struct in6_addr *snd_addr, *saddr; + int err, len, payload_len, full_len; + struct in6_addr addr_buf; struct inet6_dev *idev; struct sk_buff *skb; struct mld_msg *hdr; - const struct in6_addr *snd_addr, *saddr; - struct in6_addr addr_buf; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - int err, len, payload_len, full_len; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; - struct flowi6 fl6; struct dst_entry *dst; + struct flowi6 fl6; + struct net *net; + struct sock *sk; if (type == ICMPV6_MGM_REDUCTION) snd_addr = &in6addr_linklocal_allrouters; @@ -2147,19 +2196,21 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) payload_len = len + sizeof(ra); full_len = sizeof(struct ipv6hdr) + payload_len; - rcu_read_lock(); - IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_OUTREQUESTS); - rcu_read_unlock(); + skb = alloc_skb(hlen + tlen + full_len, GFP_KERNEL); - skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err); + rcu_read_lock(); + net = dev_net_rcu(dev); + idev = __in6_dev_get(dev); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); if (!skb) { - rcu_read_lock(); - IP6_INC_STATS(net, __in6_dev_get(dev), - IPSTATS_MIB_OUTDISCARDS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return; } + sk = net->ipv6.igmp_sk; + skb_set_owner_w(skb, sk); + skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, hlen); @@ -2184,9 +2235,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) IPPROTO_ICMPV6, csum_partial(hdr, len, 0)); - rcu_read_lock(); - idev = __in6_dev_get(skb->dev); - icmpv6_flow_init(sk, &fl6, type, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); @@ -2285,7 +2333,7 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, /* source filter not found, or count wrong => bug */ return -ESRCH; } - psf->sf_count[sfmode]--; + WRITE_ONCE(psf->sf_count[sfmode], 
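
/*
 * Pattern used by the mld_newpack()/igmp6_send() hunks above: allocate the
 * skb up front with plain alloc_skb(), then enter an RCU section to resolve
 * dev_net_rcu() and charge the buffer to the per-netns control socket with
 * skb_set_owner_w(). A condensed sketch of that shape only; the helper name
 * is hypothetical and error handling/header construction are trimmed:
 */
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>

static struct sk_buff *ctl_sock_alloc_skb(struct net_device *dev,
					  unsigned int size)
{
	struct sk_buff *skb;
	struct net *net;

	skb = alloc_skb(size, GFP_KERNEL);	/* no socket needed yet */
	if (!skb)
		return NULL;

	rcu_read_lock();
	net = dev_net_rcu(dev);			/* dev->nd_net, RCU-protected */
	skb_set_owner_w(skb, net->ipv6.igmp_sk); /* charge wmem manually */
	rcu_read_unlock();

	return skb;
}
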
psf->sf_count[sfmode] - 1); if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { struct inet6_dev *idev = pmc->idev; @@ -2391,7 +2439,7 @@ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, rcu_assign_pointer(pmc->mca_sources, psf); } } - psf->sf_count[sfmode]++; + WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] + 1); return 0; } @@ -2503,7 +2551,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, sf_markstate(pmc); isexclude = pmc->mca_sfmode == MCAST_EXCLUDE; if (!delta) - pmc->mca_sfcount[sfmode]++; + WRITE_ONCE(pmc->mca_sfcount[sfmode], + pmc->mca_sfcount[sfmode] + 1); err = 0; for (i = 0; i < sfcount; i++) { err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]); @@ -2514,7 +2563,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca, int j; if (!delta) - pmc->mca_sfcount[sfmode]--; + WRITE_ONCE(pmc->mca_sfcount[sfmode], + pmc->mca_sfcount[sfmode] - 1); for (j = 0; j < i; j++) ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]); } else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) { @@ -2559,7 +2609,8 @@ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) RCU_INIT_POINTER(pmc->mca_sources, NULL); pmc->mca_sfmode = MCAST_EXCLUDE; pmc->mca_sfcount[MCAST_INCLUDE] = 0; - pmc->mca_sfcount[MCAST_EXCLUDE] = 1; + /* Paired with the READ_ONCE() from ipv6_chk_mcast_addr() */ + WRITE_ONCE(pmc->mca_sfcount[MCAST_EXCLUDE], 1); } /* called with mc_lock */ @@ -3074,8 +3125,8 @@ static int igmp6_mcf_seq_show(struct seq_file *seq, void *v) state->dev->ifindex, state->dev->name, &state->im->mca_addr, &psf->sf_addr, - psf->sf_count[MCAST_INCLUDE], - psf->sf_count[MCAST_EXCLUDE]); + READ_ONCE(psf->sf_count[MCAST_INCLUDE]), + READ_ONCE(psf->sf_count[MCAST_EXCLUDE])); } return 0; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index aba94a348673..ecb5c4b8518f 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -418,15 +418,11 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, { int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); - if (!skb) { - ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", - __func__); + if (!skb) return NULL; - } skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; @@ -437,7 +433,9 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, /* Manually assign socket ownership as we avoid calling * sock_alloc_send_pskb() to bypass wmem buffer limits */ - skb_set_owner_w(skb, sk); + rcu_read_lock(); + skb_set_owner_w(skb, dev_net_rcu(dev)->ipv6.ndisc_sk); + rcu_read_unlock(); return skb; } @@ -473,16 +471,20 @@ static void ip6_nd_hdr(struct sk_buff *skb, void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, const struct in6_addr *saddr) { + struct icmp6hdr *icmp6h = icmp6_hdr(skb); struct dst_entry *dst = skb_dst(skb); - struct net *net = dev_net(skb->dev); - struct sock *sk = net->ipv6.ndisc_sk; struct inet6_dev *idev; + struct net *net; + struct sock *sk; int err; - struct icmp6hdr *icmp6h = icmp6_hdr(skb); u8 type; type = icmp6h->icmp6_type; + rcu_read_lock(); + + net = dev_net_rcu(skb->dev); + sk = net->ipv6.ndisc_sk; if (!dst) { struct flowi6 fl6; int oif = skb->dev->ifindex; @@ -490,6 +492,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { + 
rcu_read_unlock(); kfree_skb(skb); return; } @@ -504,7 +507,6 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len); - rcu_read_lock(); idev = __in6_dev_get(dst->dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); @@ -1678,7 +1680,7 @@ static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb, void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) { struct net_device *dev = skb->dev; - struct net *net = dev_net(dev); + struct net *net = dev_net_rcu(dev); struct sock *sk = net->ipv6.ndisc_sk; int optlen = 0; struct inet_peer *peer; @@ -1693,8 +1695,8 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; bool ret; - if (netif_is_l3_master(skb->dev)) { - dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); + if (netif_is_l3_master(dev)) { + dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); if (!dev) return; } @@ -1731,10 +1733,10 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) "Redirect: destination is not a neighbour\n"); goto release; } - peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1); + + peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr); ret = inet_peer_xrlim_allow(peer, 1*HZ); - if (peer) - inet_putpeer(peer); + if (!ret) goto release; diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 581ce055bf52..4541836ee3da 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -164,20 +164,20 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, struct ip6_fraglist_iter iter; struct sk_buff *frag2; - if (first_len - hlen > mtu || - skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) + if (first_len - hlen > mtu) goto blackhole; - if (skb_cloned(skb)) + if (skb_cloned(skb) || + skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) goto slow_path; skb_walk_frags(skb, frag2) { - if (frag2->len > mtu || - skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr))) + if (frag2->len > mtu) goto blackhole; /* Partially cloned skb? */ - if (skb_shared(frag2)) + if (skb_shared(frag2) || + skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr))) goto slow_path; } diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 7d5602950ae7..d585ac3c1113 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -292,7 +292,7 @@ ip6t_do_table(void *priv, struct sk_buff *skb, * but it is no problem since absolute verdict is issued by these. 
*/ if (static_key_false(&xt_tee_enabled)) - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + jumpstack += private->stacksize * current->in_nf_duplicate; e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 4120e67a8ce6..d6bd8f7079bb 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -123,7 +123,8 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) #endif static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev); + struct sk_buff *prev_tail, struct net_device *dev, + int *refs); static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { @@ -167,7 +168,8 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, - const struct frag_hdr *fhdr, int nhoff) + const struct frag_hdr *fhdr, int nhoff, + int *refs) { unsigned int payload_len; struct net_device *dev; @@ -221,7 +223,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, * this case. -DaveM */ pr_debug("end of fragment not rounded to 8 bytes.\n"); - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); return -EPROTO; } if (end > fq->q.len) { @@ -287,7 +289,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = nf_ct_frag6_reasm(fq, skb, prev, dev); + err = nf_ct_frag6_reasm(fq, skb, prev, dev, refs); skb->_skb_refdst = orefdst; /* After queue has assumed skb ownership, only 0 or @@ -301,7 +303,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, return -EINPROGRESS; insert_error: - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); err: skb_dst_drop(skb); return -EINVAL; @@ -315,13 +317,14 @@ err: * the last and the first frames arrived and all the bits are here. */ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev) + struct sk_buff *prev_tail, struct net_device *dev, + int *refs) { void *reasm_data; int payload_len; u8 ecn; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) @@ -372,7 +375,7 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, return 0; err: - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); return -EINVAL; } @@ -447,6 +450,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) struct frag_hdr *fhdr; struct frag_queue *fq; struct ipv6hdr *hdr; + int refs = 0; u8 prevhdr; /* Jumbo payload inhibits frag. header */ @@ -473,23 +477,26 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); + rcu_read_lock(); fq = fq_find(net, fhdr->identification, user, hdr, skb->dev ? 
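
/*
 * The nf_conntrack_reasm.c hunks above thread an "int *refs" accumulator
 * through the frag-queue helpers: inet_frag_kill(&fq->q, refs) only counts
 * references to drop, and the caller releases them in one batch with
 * inet_frag_putn() after the queue spinlock and the RCU section are gone.
 * A runnable userspace C11 analogue of the same batching idea (all names
 * invented; compile with -pthread):
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct fragq {
	pthread_mutex_t lock;
	atomic_int refcnt;
};

/* called with q->lock held: defer the drop, just count it */
static void fragq_kill(struct fragq *q, int *refs)
{
	(void)q;
	(*refs)++;
}

/* called with no locks held: apply all deferred drops at once */
static void fragq_putn(struct fragq *q, int refs)
{
	if (refs && atomic_fetch_sub(&q->refcnt, refs) == refs)
		free(q);	/* last reference gone, safe to free here */
}

static void process(struct fragq *q)
{
	int refs = 0;

	pthread_mutex_lock(&q->lock);
	fragq_kill(q, &refs);		/* cheap while locked */
	pthread_mutex_unlock(&q->lock);

	fragq_putn(q, refs);		/* possibly-freeing put, unlocked */
}

int main(void)
{
	struct fragq *q = malloc(sizeof(*q));

	pthread_mutex_init(&q->lock, NULL);
	atomic_init(&q->refcnt, 1);
	process(q);			/* drops the only reference */
	puts("freed in one batched put");
	return 0;
}
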
skb->dev->ifindex : 0); if (fq == NULL) { + rcu_read_unlock(); pr_debug("Can't find and can't create new queue\n"); return -ENOMEM; } spin_lock_bh(&fq->q.lock); - ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff); + ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff, &refs); if (ret == -EPROTO) { skb->transport_header = savethdr; ret = 0; } spin_unlock_bh(&fq->q.lock); - inet_frag_put(&fq->q); + rcu_read_unlock(); + inet_frag_putn(&fq->q, refs); return ret; } EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index 0c39c77fe8a8..b903c62c00c9 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -48,7 +48,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, const struct in6_addr *gw, int oif) { local_bh_disable(); - if (this_cpu_read(nf_skb_duplicated)) + if (current->in_nf_duplicate) goto out; skb = pskb_copy(skb, GFP_ATOMIC); if (skb == NULL) @@ -64,9 +64,9 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, --iph->hop_limit; } if (nf_dup_ipv6_route(net, skb, gw, oif)) { - __this_cpu_write(nf_skb_duplicated, true); + current->in_nf_duplicate = true; ip6_local_out(net, skb->sk, skb); - __this_cpu_write(nf_skb_duplicated, false); + current->in_nf_duplicate = false; } else { kfree_skb(skb); } diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index a7690ec62325..9ea5ef56cb27 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -103,6 +103,10 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, struct sk_buff *data_skb = NULL; int doff = 0; int thoff = 0, tproto; +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + enum ip_conntrack_info ctinfo; + struct nf_conn const *ct; +#endif tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL); if (tproto < 0) { @@ -136,6 +140,25 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, return NULL; } +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + /* Do the lookup with the original socket address in + * case this is a reply packet of an established + * SNAT-ted connection. + */ + ct = nf_ct_get(skb, &ctinfo); + if (ct && + ((tproto != IPPROTO_ICMPV6 && + ctinfo == IP_CT_ESTABLISHED_REPLY) || + (tproto == IPPROTO_ICMPV6 && + ctinfo == IP_CT_RELATED_REPLY)) && + (ct->status & IPS_SRC_NAT_DONE)) { + daddr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6; + dport = (tproto == IPPROTO_TCP) ? 
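
/*
 * nf_dup_ipv6() above trades the per-CPU nf_skb_duplicated flag for a
 * per-task bit (current->in_nf_duplicate), so the recursion guard follows
 * the task across preemption instead of being pinned to one CPU. A thread-
 * local userspace analogue of that guard (illustrative names only):
 */
#include <stdbool.h>
#include <stdio.h>

static _Thread_local bool in_duplicate;

static void transmit(const char *pkt);

static void duplicate_and_send(const char *pkt)
{
	if (in_duplicate)	/* already inside a duplication: bail out */
		return;

	in_duplicate = true;
	transmit(pkt);		/* may re-enter duplicate_and_send() */
	in_duplicate = false;
}

static void transmit(const char *pkt)
{
	printf("tx: %s\n", pkt);
	duplicate_and_send(pkt);	/* recursion stops at the flag */
}

int main(void)
{
	duplicate_and_send("probe");	/* prints "tx: probe" exactly once */
	return 0;
}
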
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port : + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; + } +#endif + return nf_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr, sport, dport, indev); } diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index c9f1634b3838..421036a3605b 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -50,6 +50,7 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, fl6->flowi6_mark = pkt->skb->mark; fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK; + fl6->flowi6_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, dev); return lookup_flags; } @@ -73,8 +74,6 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, else if (priv->flags & NFTA_FIB_F_OIF) dev = nft_out(pkt); - fl6.flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev); - nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph); if (dev && nf_ipv6_chk_addr(nft_net(pkt), &fl6.daddr, dev, true)) @@ -158,6 +157,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, { const struct nft_fib *priv = nft_expr_priv(expr); int noff = skb_network_offset(pkt->skb); + const struct net_device *found = NULL; const struct net_device *oif = NULL; u32 *dest = ®s->data[priv->dreg]; struct ipv6hdr *iph, _iph; @@ -165,11 +165,15 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_proto = pkt->tprot, .flowi6_uid = sock_net_uid(nft_net(pkt), NULL), - .flowi6_l3mdev = l3mdev_master_ifindex_rcu(nft_in(pkt)), }; struct rt6_info *rt; int lookup_flags; + if (nft_fib_can_skip(pkt)) { + nft_fib_store_result(dest, priv, nft_in(pkt)); + return; + } + if (priv->flags & NFTA_FIB_F_IIF) oif = nft_in(pkt); else if (priv->flags & NFTA_FIB_F_OIF) @@ -181,17 +185,13 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, return; } - lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph); - - if (nft_hook(pkt) == NF_INET_PRE_ROUTING || - nft_hook(pkt) == NF_INET_INGRESS) { - if (nft_fib_is_loopback(pkt->skb, nft_in(pkt)) || - nft_fib_v6_skip_icmpv6(pkt->skb, pkt->tprot, iph)) { - nft_fib_store_result(dest, priv, nft_in(pkt)); - return; - } + if (nft_fib_v6_skip_icmpv6(pkt->skb, pkt->tprot, iph)) { + nft_fib_store_result(dest, priv, nft_in(pkt)); + return; } + lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph); + *dest = 0; rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, pkt->skb, lookup_flags); @@ -202,11 +202,15 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL)) goto put_rt_err; - if (oif && oif != rt->rt6i_idev->dev && - l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) != oif->ifindex) - goto put_rt_err; + if (!oif) { + found = rt->rt6i_idev->dev; + } else { + if (oif == rt->rt6i_idev->dev || + l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == oif->ifindex) + found = oif; + } - nft_fib_store_result(dest, priv, rt->rt6i_idev->dev); + nft_fib_store_result(dest, priv, found); put_rt_err: ip6_rt_put(rt); } diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 88b3fcacd4f9..84d90dd8b3f0 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -119,8 +119,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return -EINVAL; ipcm6_init_sk(&ipc6, sk); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = READ_ONCE(sk->sk_mark); fl6.flowi6_oif = oif; diff --git a/net/ipv6/raw.c 
b/net/ipv6/raw.c index 8476a3944a88..fda640ebd53f 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -619,7 +619,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb_reserve(skb, hlen); skb->protocol = htons(ETH_P_IPV6); - skb->priority = READ_ONCE(sk->sk_priority); + skb->priority = sockc->priority; skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); @@ -769,18 +769,16 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) hdrincl = inet_test_bit(HDRINCL, sk); + ipcm6_init_sk(&ipc6, sk); + /* * Get and verify the address. */ memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_mark = READ_ONCE(sk->sk_mark); + fl6.flowi6_mark = ipc6.sockc.mark; fl6.flowi6_uid = sk->sk_uid; - ipcm6_init(&ipc6); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = fl6.flowi6_mark; - if (sin6) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; @@ -890,9 +888,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); @@ -903,9 +898,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.dontfrag < 0) - ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); - if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index a48be617a8ab..49740898bc13 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -68,7 +68,8 @@ static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev); + struct sk_buff *prev_tail, struct net_device *dev, + int *refs); static void ip6_frag_expire(struct timer_list *t) { @@ -105,7 +106,7 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, struct frag_hdr *fhdr, int nhoff, - u32 *prob_offset) + u32 *prob_offset, int *refs) { struct net *net = dev_net(skb_dst(skb)->dev); int offset, end, fragsize; @@ -220,7 +221,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - err = ip6_frag_reasm(fq, skb, prev_tail, dev); + err = ip6_frag_reasm(fq, skb, prev_tail, dev, refs); skb->_skb_refdst = orefdst; return err; } @@ -238,7 +239,7 @@ insert_error: __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASM_OVERLAPS); discard_fq: - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); err: @@ -254,7 +255,8 @@ err: * the last and the first frames arrived and all the bits are here. 
*/ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, - struct sk_buff *prev_tail, struct net_device *dev) + struct sk_buff *prev_tail, struct net_device *dev, + int *refs) { struct net *net = fq->q.fqdir->net; unsigned int nhoff; @@ -262,7 +264,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, int payload_len; u8 ecn; - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) @@ -303,9 +305,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, skb_postpush_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); - rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMOKS); - rcu_read_unlock(); fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; fq->q.last_run_head = NULL; @@ -317,10 +317,8 @@ out_oversize: out_oom: net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n"); out_fail: - rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMFAILS); - rcu_read_unlock(); - inet_frag_kill(&fq->q); + inet_frag_kill(&fq->q, refs); return -1; } @@ -377,19 +375,21 @@ static int ipv6_frag_rcv(struct sk_buff *skb) } iif = skb->dev ? skb->dev->ifindex : 0; + rcu_read_lock(); fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { u32 prob_offset = 0; - int ret; + int ret, refs = 0; spin_lock(&fq->q.lock); fq->iif = iif; ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff, - &prob_offset); + &prob_offset, &refs); spin_unlock(&fq->q.lock); - inet_frag_put(&fq->q); + rcu_read_unlock(); + inet_frag_putn(&fq->q, refs); if (prob_offset) { __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INHDRERRORS); @@ -398,6 +398,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) } return ret; } + rcu_read_unlock(); __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); kfree_skb(skb); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 67ff16c04718..0143262094b0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -412,12 +412,37 @@ static bool rt6_check_expired(const struct rt6_info *rt) return false; } +static struct fib6_info * +rt6_multipath_first_sibling_rcu(const struct fib6_info *rt) +{ + struct fib6_info *iter; + struct fib6_node *fn; + + fn = rcu_dereference(rt->fib6_node); + if (!fn) + goto out; + iter = rcu_dereference(fn->leaf); + if (!iter) + goto out; + + while (iter) { + if (iter->fib6_metric == rt->fib6_metric && + rt6_qualify_for_ecmp(iter)) + return iter; + iter = rcu_dereference(iter->fib6_next); + } + +out: + return NULL; +} + void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict) { - struct fib6_info *match = res->f6i; + struct fib6_info *first, *match = res->f6i; struct fib6_info *sibling; + int hash; if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) goto out; @@ -440,16 +465,25 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, return; } - if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) + first = rt6_multipath_first_sibling_rcu(match); + if (!first) goto out; - list_for_each_entry_rcu(sibling, &match->fib6_siblings, + hash = fl6->mp_hash; + if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) { + if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif, + strict) >= 0) + match = first; + goto out; + } + + list_for_each_entry_rcu(sibling, &first->fib6_siblings, fib6_siblings) 
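
/*
 * rt6_multipath_first_sibling_rcu() above rewalks the fib6_next chain from
 * the node leaf so hash-bound comparisons always start from the first route
 * of the ECMP group. Its core is a plain RCU pointer walk; the generic
 * shape, with hypothetical types standing in for fib6_info:
 */
#include <linux/rcupdate.h>
#include <linux/types.h>

struct ecmp_route {
	u32 metric;
	struct ecmp_route __rcu *next;
};

/* caller holds rcu_read_lock() */
static struct ecmp_route *first_sibling_rcu(struct ecmp_route __rcu *head,
					    u32 metric)
{
	struct ecmp_route *iter;

	for (iter = rcu_dereference(head); iter;
	     iter = rcu_dereference(iter->next))
		if (iter->metric == metric)	/* first qualifying entry */
			return iter;

	return NULL;
}
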
{ const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound); - if (fl6->mp_hash > nh_upper_bound) + if (hash > nh_upper_bound) continue; if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0) break; @@ -1737,6 +1771,7 @@ out: if (!err) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum(net, f6i); + fib6_add_gc_list(f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); fib6_force_start_gc(net); } @@ -1785,11 +1820,13 @@ static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg) void rt6_flush_exceptions(struct fib6_info *f6i) { - if (f6i->nh) - nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, - f6i); - else + if (f6i->nh) { + rcu_read_lock(); + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, f6i); + rcu_read_unlock(); + } else { fib6_nh_flush_exceptions(f6i->fib6_nh, f6i); + } } /* Find cached rt in the hash table inside passed in rt @@ -2457,8 +2494,12 @@ static u32 rt6_multipath_custom_hash_fl6(const struct net *net, hash_keys.basic.ip_proto = fl6->flowi6_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); - if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) - hash_keys.ports.src = fl6->fl6_sport; + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl6->fl6_sport; + } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl6->fl6_dport; @@ -2512,7 +2553,10 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = fl6->saddr; hash_keys.addrs.v6addrs.dst = fl6->daddr; - hash_keys.ports.src = fl6->fl6_sport; + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) + hash_keys.ports.src = (__force __be16)get_random_u16(); + else + hash_keys.ports.src = fl6->fl6_sport; hash_keys.ports.dst = fl6->fl6_dport; hash_keys.basic.ip_proto = fl6->flowi6_proto; } @@ -3196,13 +3240,18 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) { struct net_device *dev = dst->dev; unsigned int mtu = dst_mtu(dst); - struct net *net = dev_net(dev); + struct net *net; mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + rcu_read_lock(); + + net = dev_net_rcu(dev); if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) mtu = net->ipv6.sysctl.ip6_rt_min_advmss; + rcu_read_unlock(); + /* * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
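
/*
 * The rt6_multipath_hash() hunks above substitute a random 16-bit value for
 * the source port when the flow carries FLOWI_FLAG_ANY_SPORT, so sockets
 * that have not yet bound a port still spread across ECMP nexthops instead
 * of all hashing to the same one. A runnable userspace sketch of that
 * key-filling decision (names and the rand() stand-in for get_random_u16()
 * are illustrative, not the kernel's flow dissector):
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define FLOW_FLAG_ANY_SPORT 0x1

struct hash_keys {
	uint16_t sport, dport;
};

static void fill_ports(struct hash_keys *keys, unsigned int flags,
		       uint16_t sport, uint16_t dport)
{
	if (flags & FLOW_FLAG_ANY_SPORT)
		keys->sport = (uint16_t)rand();	/* random per-packet sport */
	else
		keys->sport = sport;		/* the flow's real sport */
	keys->dport = dport;
}

int main(void)
{
	struct hash_keys k;

	fill_ports(&k, FLOW_FLAG_ANY_SPORT, 0, 443);
	printf("hash over sport=%u dport=%u\n", k.sport, k.dport);
	return 0;
}
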
@@ -3639,7 +3688,8 @@ out: in6_dev_put(idev); if (err) { - lwtstate_put(fib6_nh->fib_nh_lws); + fib_nh_common_release(&fib6_nh->nh_common); + fib6_nh->nh_common.nhc_pcpu_rth_output = NULL; fib6_nh->fib_nh_lws = NULL; netdev_put(dev, dev_tracker); } @@ -3688,61 +3738,14 @@ void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) } static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, - gfp_t gfp_flags, - struct netlink_ext_ack *extack) + gfp_t gfp_flags, + struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; - struct fib6_info *rt = NULL; - struct nexthop *nh = NULL; struct fib6_table *table; - struct fib6_nh *fib6_nh; - int err = -EINVAL; - int addr_type; - - /* RTF_PCPU is an internal flag; can not be set by userspace */ - if (cfg->fc_flags & RTF_PCPU) { - NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); - goto out; - } - - /* RTF_CACHE is an internal flag; can not be set by userspace */ - if (cfg->fc_flags & RTF_CACHE) { - NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); - goto out; - } - - if (cfg->fc_type > RTN_MAX) { - NL_SET_ERR_MSG(extack, "Invalid route type"); - goto out; - } - - if (cfg->fc_dst_len > 128) { - NL_SET_ERR_MSG(extack, "Invalid prefix length"); - goto out; - } - if (cfg->fc_src_len > 128) { - NL_SET_ERR_MSG(extack, "Invalid source address length"); - goto out; - } -#ifndef CONFIG_IPV6_SUBTREES - if (cfg->fc_src_len) { - NL_SET_ERR_MSG(extack, - "Specifying source address requires IPV6_SUBTREES to be enabled"); - goto out; - } -#endif - if (cfg->fc_nh_id) { - nh = nexthop_find_by_id(net, cfg->fc_nh_id); - if (!nh) { - NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); - goto out; - } - err = fib6_check_nexthop(nh, cfg, extack); - if (err) - goto out; - } + struct fib6_info *rt; + int err; - err = -ENOBUFS; if (cfg->fc_nlinfo.nlh && !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) { table = fib6_get_table(net, cfg->fc_table); @@ -3753,22 +3756,22 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, } else { table = fib6_new_table(net, cfg->fc_table); } + if (!table) { + err = -ENOBUFS; + goto err; + } - if (!table) - goto out; - - err = -ENOMEM; - rt = fib6_info_alloc(gfp_flags, !nh); - if (!rt) - goto out; + rt = fib6_info_alloc(gfp_flags, !cfg->fc_nh_id); + if (!rt) { + err = -ENOMEM; + goto err; + } rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(rt->fib6_metrics)) { err = PTR_ERR(rt->fib6_metrics); - /* Do not leave garbage there. */ - rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics; - goto out_free; + goto free; } if (cfg->fc_flags & RTF_ADDRCONF) @@ -3776,12 +3779,12 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (cfg->fc_flags & RTF_EXPIRES) fib6_set_expires(rt, jiffies + - clock_t_to_jiffies(cfg->fc_expires)); + clock_t_to_jiffies(cfg->fc_expires)); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; - rt->fib6_protocol = cfg->fc_protocol; + rt->fib6_protocol = cfg->fc_protocol; rt->fib6_table = table; rt->fib6_metric = cfg->fc_metric; rt->fib6_type = cfg->fc_type ? 
: RTN_UNICAST; @@ -3794,21 +3797,54 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len); rt->fib6_src.plen = cfg->fc_src_len; #endif - if (nh) { - if (rt->fib6_src.plen) { - NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + return rt; +free: + kfree(rt); +err: + return ERR_PTR(err); +} + +static int ip6_route_info_create_nh(struct fib6_info *rt, + struct fib6_config *cfg, + gfp_t gfp_flags, + struct netlink_ext_ack *extack) +{ + struct net *net = cfg->fc_nlinfo.nl_net; + struct fib6_nh *fib6_nh; + int err; + + if (cfg->fc_nh_id) { + struct nexthop *nh; + + rcu_read_lock(); + + nh = nexthop_find_by_id(net, cfg->fc_nh_id); + if (!nh) { + err = -EINVAL; + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); goto out_free; } + + err = fib6_check_nexthop(nh, cfg, extack); + if (err) + goto out_free; + if (!nexthop_get(nh)) { NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + err = -ENOENT; goto out_free; } + rt->nh = nh; fib6_nh = nexthop_fib6_nh(rt->nh); + + rcu_read_unlock(); } else { + int addr_type; + err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); if (err) - goto out; + goto out_release; fib6_nh = rt->fib6_nh; @@ -3827,21 +3863,21 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) { NL_SET_ERR_MSG(extack, "Invalid source address"); err = -EINVAL; - goto out; + goto out_release; } rt->fib6_prefsrc.addr = cfg->fc_prefsrc; rt->fib6_prefsrc.plen = 128; - } else - rt->fib6_prefsrc.plen = 0; + } - return rt; -out: + return 0; +out_release: fib6_info_release(rt); - return ERR_PTR(err); + return err; out_free: + rcu_read_unlock(); ip_fib_metrics_put(rt->fib6_metrics); kfree(rt); - return ERR_PTR(err); + return err; } int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, @@ -3854,6 +3890,10 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, if (IS_ERR(rt)) return PTR_ERR(rt); + err = ip6_route_info_create_nh(rt, cfg, gfp_flags, extack); + if (err) + return err; + err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); fib6_info_release(rt); @@ -4082,9 +4122,9 @@ static int ip6_route_del(struct fib6_config *cfg, if (rt->nh) { if (!fib6_info_hold_safe(rt)) continue; - rcu_read_unlock(); - return __ip6_del_rt(rt, &cfg->fc_nlinfo); + err = __ip6_del_rt(rt, &cfg->fc_nlinfo); + break; } if (cfg->fc_nh_id) continue; @@ -4099,13 +4139,13 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (!fib6_info_hold_safe(rt)) continue; - rcu_read_unlock(); /* if gateway was specified only delete the one hop */ if (cfg->fc_flags & RTF_GATEWAY) - return __ip6_del_rt(rt, &cfg->fc_nlinfo); - - return __ip6_del_rt_siblings(rt, cfg); + err = __ip6_del_rt(rt, &cfg->fc_nlinfo); + else + err = __ip6_del_rt_siblings(rt, cfg); + break; } } rcu_read_unlock(); @@ -4439,6 +4479,53 @@ void rt6_purge_dflt_routers(struct net *net) rcu_read_unlock(); } +static int fib6_config_validate(struct fib6_config *cfg, + struct netlink_ext_ack *extack) +{ + /* RTF_PCPU is an internal flag; can not be set by userspace */ + if (cfg->fc_flags & RTF_PCPU) { + NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU"); + goto errout; + } + + /* RTF_CACHE is an internal flag; can not be set by userspace */ + if (cfg->fc_flags & RTF_CACHE) { + NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE"); + goto errout; + } + + if (cfg->fc_type > RTN_MAX) { + NL_SET_ERR_MSG(extack, "Invalid route type"); + goto errout; + } + + if 
(cfg->fc_dst_len > 128) { + NL_SET_ERR_MSG(extack, "Invalid prefix length"); + goto errout; + } + +#ifdef CONFIG_IPV6_SUBTREES + if (cfg->fc_src_len > 128) { + NL_SET_ERR_MSG(extack, "Invalid source address length"); + goto errout; + } + + if (cfg->fc_nh_id && cfg->fc_src_len) { + NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing"); + goto errout; + } +#else + if (cfg->fc_src_len) { + NL_SET_ERR_MSG(extack, + "Specifying source address requires IPV6_SUBTREES to be enabled"); + goto errout; + } +#endif + return 0; +errout: + return -EINVAL; +} + static void rtmsg_to_fib6_config(struct net *net, struct in6_rtmsg *rtmsg, struct fib6_config *cfg) @@ -4474,9 +4561,12 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) rtmsg_to_fib6_config(net, rtmsg, &cfg); - rtnl_lock(); switch (cmd) { case SIOCADDRT: + err = fib6_config_validate(&cfg, NULL); + if (err) + break; + /* Only do the default setting of fc_metric in route adding */ if (cfg.fc_metric == 0) cfg.fc_metric = IP6_RT_PRIO_USER; @@ -4486,7 +4576,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) err = ip6_route_del(&cfg, NULL); break; } - rtnl_unlock(); + return err; } @@ -4576,6 +4666,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, .fc_ignore_dev_down = true, }; struct fib6_info *f6i; + int err; if (anycast) { cfg.fc_type = RTN_ANYCAST; @@ -4586,14 +4677,19 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, } f6i = ip6_route_info_create(&cfg, gfp_flags, extack); - if (!IS_ERR(f6i)) { - f6i->dst_nocount = true; + if (IS_ERR(f6i)) + return f6i; - if (!anycast && - (READ_ONCE(net->ipv6.devconf_all->disable_policy) || - READ_ONCE(idev->cnf.disable_policy))) - f6i->dst_nopolicy = true; - } + err = ip6_route_info_create_nh(f6i, &cfg, gfp_flags, extack); + if (err) + return ERR_PTR(err); + + f6i->dst_nocount = true; + + if (!anycast && + (READ_ONCE(net->ipv6.devconf_all->disable_policy) || + READ_ONCE(idev->cnf.disable_policy))) + f6i->dst_nopolicy = true; return f6i; } @@ -5005,14 +5101,63 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_SPORT] = { .type = NLA_U16 }, [RTA_DPORT] = { .type = NLA_U16 }, [RTA_NH_ID] = { .type = NLA_U32 }, + [RTA_FLOWLABEL] = { .type = NLA_BE32 }, }; +static int rtm_to_fib6_multipath_config(struct fib6_config *cfg, + struct netlink_ext_ack *extack, + bool newroute) +{ + struct rtnexthop *rtnh; + int remaining; + + remaining = cfg->fc_mp_len; + rtnh = (struct rtnexthop *)cfg->fc_mp; + + if (!rtnh_ok(rtnh, remaining)) { + NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - no valid nexthops"); + return -EINVAL; + } + + do { + bool has_gateway = cfg->fc_flags & RTF_GATEWAY; + int attrlen = rtnh_attrlen(rtnh); + + if (attrlen > 0) { + struct nlattr *nla, *attrs; + + attrs = rtnh_attrs(rtnh); + nla = nla_find(attrs, attrlen, RTA_GATEWAY); + if (nla) { + if (nla_len(nla) < sizeof(cfg->fc_gateway)) { + NL_SET_ERR_MSG(extack, + "Invalid IPv6 address in RTA_GATEWAY"); + return -EINVAL; + } + + has_gateway = true; + } + } + + if (newroute && (cfg->fc_nh_id || !has_gateway)) { + NL_SET_ERR_MSG(extack, + "Device only routes can not be added for IPv6 using the multipath API."); + return -EINVAL; + } + + rtnh = rtnh_next(rtnh, &remaining); + } while (rtnh_ok(rtnh, remaining)); + + return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, extack); +} + static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { - struct rtmsg 
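
/*
 * rtm_to_fib6_multipath_config() above centralises the RTA_MULTIPATH sanity
 * checks: each rtnexthop record is walked once, its RTA_GATEWAY length is
 * validated, and device-only hops are rejected before any route object is
 * allocated. The canonical rtnh_ok()/rtnh_next() walk it builds on, in a
 * condensed sketch (error paths trimmed, helper name invented):
 */
#include <linux/errno.h>
#include <linux/in6.h>
#include <linux/rtnetlink.h>
#include <net/netlink.h>
#include <net/nexthop.h>

static int walk_multipath(void *mp, int remaining,
			  struct netlink_ext_ack *extack)
{
	struct rtnexthop *rtnh = mp;

	if (!rtnh_ok(rtnh, remaining))		/* empty or truncated list */
		return -EINVAL;

	do {
		int attrlen = rtnh_attrlen(rtnh);

		if (attrlen > 0) {
			struct nlattr *nla;

			nla = nla_find(rtnh_attrs(rtnh), attrlen, RTA_GATEWAY);
			if (nla && nla_len(nla) < sizeof(struct in6_addr)) {
				NL_SET_ERR_MSG(extack, "bad RTA_GATEWAY len");
				return -EINVAL;
			}
		}
		rtnh = rtnh_next(rtnh, &remaining);
	} while (rtnh_ok(rtnh, remaining));

	return 0;
}
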
*rtm; + bool newroute = nlh->nlmsg_type == RTM_NEWROUTE; struct nlattr *tb[RTA_MAX+1]; + struct rtmsg *rtm; unsigned int pref; int err; @@ -5030,6 +5175,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } + if (tb[RTA_FLOWLABEL]) { + NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], + "Flow label cannot be specified for this operation"); + goto errout; + } + *cfg = (struct fib6_config){ .fc_table = rtm->rtm_table, .fc_dst_len = rtm->rtm_dst_len, @@ -5115,8 +5266,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]); cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]); - err = lwtunnel_valid_encap_type_attr(cfg->fc_mp, - cfg->fc_mp_len, extack); + err = rtm_to_fib6_multipath_config(cfg, extack, newroute); if (err < 0) goto errout; } @@ -5157,29 +5307,28 @@ errout: struct rt6_nh { struct fib6_info *fib6_info; struct fib6_config r_cfg; - struct list_head next; + struct list_head list; }; -static int ip6_route_info_append(struct net *net, - struct list_head *rt6_nh_list, +static int ip6_route_info_append(struct list_head *rt6_nh_list, struct fib6_info *rt, struct fib6_config *r_cfg) { struct rt6_nh *nh; - int err = -EEXIST; - list_for_each_entry(nh, rt6_nh_list, next) { + list_for_each_entry(nh, rt6_nh_list, list) { /* check if fib6_info already exists */ if (rt6_duplicate_nexthop(nh->fib6_info, rt)) - return err; + return -EEXIST; } nh = kzalloc(sizeof(*nh), GFP_KERNEL); if (!nh) return -ENOMEM; + nh->fib6_info = rt; memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); - list_add_tail(&nh->next, rt6_nh_list); + list_add_tail(&nh->list, rt6_nh_list); return 0; } @@ -5235,37 +5384,26 @@ out: return should_notify; } -static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla, - struct netlink_ext_ack *extack) -{ - if (nla_len(nla) < sizeof(*gw)) { - NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY"); - return -EINVAL; - } - - *gw = nla_get_in6_addr(nla); - - return 0; -} - static int ip6_route_multipath_add(struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; + struct rt6_nh *nh, *nh_safe; struct fib6_config r_cfg; struct rtnexthop *rtnh; - struct fib6_info *rt; + LIST_HEAD(rt6_nh_list); struct rt6_nh *err_nh; - struct rt6_nh *nh, *nh_safe; + struct fib6_info *rt; __u16 nlflags; int remaining; int attrlen; - int err = 1; + int replace; int nhn = 0; - int replace = (cfg->fc_nlinfo.nlh && - (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); - LIST_HEAD(rt6_nh_list); + int err; + + replace = (cfg->fc_nlinfo.nlh && + (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE)); nlflags = replace ? 
NLM_F_REPLACE : NLM_F_CREATE; if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) @@ -5288,18 +5426,11 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { - err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, - extack); - if (err) - goto cleanup; - + r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } - r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); - /* RTA_ENCAP_TYPE length checked in - * lwtunnel_valid_encap_type_attr - */ + r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla) r_cfg.fc_encap_type = nla_get_u16(nla); @@ -5312,18 +5443,16 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rt = NULL; goto cleanup; } - if (!rt6_qualify_for_ecmp(rt)) { - err = -EINVAL; - NL_SET_ERR_MSG(extack, - "Device only routes can not be added for IPv6 using the multipath API."); - fib6_info_release(rt); + + err = ip6_route_info_create_nh(rt, &r_cfg, GFP_KERNEL, extack); + if (err) { + rt = NULL; goto cleanup; } rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; - err = ip6_route_info_append(info->nl_net, &rt6_nh_list, - rt, &r_cfg); + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); if (err) { fib6_info_release(rt); goto cleanup; @@ -5332,12 +5461,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, rtnh = rtnh_next(rtnh, &remaining); } - if (list_empty(&rt6_nh_list)) { - NL_SET_ERR_MSG(extack, - "Invalid nexthop configuration - no valid nexthops"); - return -EINVAL; - } - /* for add and replace send one notification with all nexthops. * Skip the notification in fib6_add_rt2node and send one with * the full route when done @@ -5350,7 +5473,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, info->skip_notify_kernel = 1; err_nh = NULL; - list_for_each_entry(nh, &rt6_nh_list, next) { + list_for_each_entry(nh, &rt6_nh_list, list) { err = __ip6_ins_rt(nh->fib6_info, info, extack); if (err) { @@ -5418,16 +5541,16 @@ add_errout: ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags); /* Delete routes that were already added */ - list_for_each_entry(nh, &rt6_nh_list, next) { + list_for_each_entry(nh, &rt6_nh_list, list) { if (err_nh == nh) break; ip6_route_del(&nh->r_cfg, extack); } cleanup: - list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) { + list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, list) { fib6_info_release(nh->fib6_info); - list_del(&nh->next); + list_del(&nh->list); kfree(nh); } @@ -5459,21 +5582,15 @@ static int ip6_route_multipath_del(struct fib6_config *cfg, nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla) { - err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla, - extack); - if (err) { - last_err = err; - goto next_rtnh; - } - + r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } } + err = ip6_route_del(&r_cfg, extack); if (err) last_err = err; -next_rtnh: rtnh = rtnh_next(rtnh, &remaining); } @@ -5490,15 +5607,20 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; - if (cfg.fc_nh_id && - !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) { - NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); - return -EINVAL; + if (cfg.fc_nh_id) { + rcu_read_lock(); + err = !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id); + rcu_read_unlock(); + + if (err) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + return -EINVAL; + } } - if (cfg.fc_mp) + if (cfg.fc_mp) { return ip6_route_multipath_del(&cfg, extack); - 
else { + } else { cfg.fc_delete_all_nh = 1; return ip6_route_del(&cfg, extack); } @@ -5514,6 +5636,10 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) return err; + err = fib6_config_validate(&cfg, extack); + if (err) + return err; + if (cfg.fc_metric == 0) cfg.fc_metric = IP6_RT_PRIO_USER; @@ -5978,7 +6104,8 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get route request"); return -EINVAL; @@ -5988,7 +6115,6 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || @@ -6013,6 +6139,13 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, return -EINVAL; } + if (tb[RTA_FLOWLABEL] && + (nla_get_be32(tb[RTA_FLOWLABEL]) & ~IPV6_FLOWLABEL_MASK)) { + NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL], + "Invalid flow label"); + return -EINVAL; + } + for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; @@ -6027,6 +6160,7 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb, case RTA_SPORT: case RTA_DPORT: case RTA_IP_PROTO: + case RTA_FLOWLABEL: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request"); @@ -6049,6 +6183,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct sk_buff *skb; struct rtmsg *rtm; struct flowi6 fl6 = {}; + __be32 flowlabel; bool fibmatch; err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack); @@ -6057,7 +6192,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, err = -EINVAL; rtm = nlmsg_data(nlh); - fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0); fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH); if (tb[RTA_SRC]) { @@ -6103,6 +6237,9 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, goto errout; } + flowlabel = nla_get_be32_default(tb[RTA_FLOWLABEL], 0); + fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, flowlabel); + if (iif) { struct net_device *dev; int flags = 0; @@ -6192,6 +6329,8 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, err = -ENOBUFS; seq = info->nlh ? 
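
/*
 * The inet6_rtm_valid_getroute_req() hunk above swaps an open-coded
 * nlmsg_len check plus nlmsg_data() for nlmsg_payload(), which hands back
 * the payload pointer only when the message really carries at least the
 * requested number of bytes. Its behaviour is roughly the following sketch
 * (the "_sketch" suffix marks this as an approximation, not the kernel
 * helper itself):
 */
#include <net/netlink.h>

static inline void *nlmsg_payload_sketch(const struct nlmsghdr *nlh,
					 size_t len)
{
	if (nlh->nlmsg_len < nlmsg_msg_size(len))
		return NULL;	/* header claims less payload than needed */

	return nlmsg_data(nlh);
}
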
info->nlh->nlmsg_seq : 0; + rcu_read_lock(); + skb = nlmsg_new(rt6_nlmsg_size(rt), GFP_ATOMIC); if (!skb) goto errout; @@ -6204,10 +6343,14 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, kfree_skb(skb); goto errout; } + + rcu_read_unlock(); + rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, GFP_ATOMIC); return; errout: + rcu_read_unlock(); rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } @@ -6697,9 +6840,9 @@ static void bpf_iter_unregister(void) static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_or_module = { {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE, - .doit = inet6_rtm_newroute}, + .doit = inet6_rtm_newroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE, - .doit = inet6_rtm_delroute}, + .doit = inet6_rtm_delroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE, .doit = inet6_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index db3c19a42e1c..7c05ac846646 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -125,7 +125,8 @@ static void rpl_destroy_state(struct lwtunnel_state *lwt) } static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, - const struct ipv6_rpl_sr_hdr *srh) + const struct ipv6_rpl_sr_hdr *srh, + struct dst_entry *cache_dst) { struct ipv6_rpl_sr_hdr *isrh, *csrh; const struct ipv6hdr *oldhdr; @@ -153,7 +154,7 @@ static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, hdrlen = ((csrh->hdrlen + 1) << 3); - err = skb_cow_head(skb, hdrlen + skb->mac_len); + err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) { kfree(buf); return err; @@ -186,7 +187,8 @@ static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt, return 0; } -static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt) +static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt, + struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct rpl_iptunnel_encap *tinfo; @@ -196,7 +198,7 @@ static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt) tinfo = rpl_encap_lwtunnel(dst->lwtstate); - return rpl_do_srh_inline(skb, rlwt, tinfo->srh); + return rpl_do_srh_inline(skb, rlwt, tinfo->srh, cache_dst); } static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -208,14 +210,14 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); - err = rpl_do_srh(skb, rlwt); - if (unlikely(err)) - goto drop; - local_bh_disable(); dst = dst_cache_get(&rlwt->cache); local_bh_enable(); + err = rpl_do_srh(skb, rlwt, dst); + if (unlikely(err)) + goto drop; + if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; @@ -230,25 +232,28 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; - dst_release(dst); goto drop; } - local_bh_disable(); - dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); - local_bh_enable(); + /* cache only if we don't create a dst reference loop */ + if (orig_dst->lwtstate != dst->lwtstate) { + local_bh_disable(); + dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); + local_bh_enable(); + } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if 
(unlikely(err)) + goto drop; } skb_dst_drop(skb); skb_dst_set(skb, dst); - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; - return dst_output(net, sk, skb); drop: + dst_release(dst); kfree_skb(skb); return err; } @@ -257,34 +262,48 @@ static int rpl_input(struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; + struct lwtunnel_state *lwtst; struct rpl_lwt *rlwt; int err; - rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); + /* We cannot dereference "orig_dst" once ip6_route_input() or + * skb_dst_drop() is called. However, in order to detect a dst loop, we + * need the address of its lwtstate. So, save the address of lwtstate + * now and use it later as a comparison. + */ + lwtst = orig_dst->lwtstate; - err = rpl_do_srh(skb, rlwt); - if (unlikely(err)) - goto drop; + rlwt = rpl_lwt_lwtunnel(lwtst); local_bh_disable(); dst = dst_cache_get(&rlwt->cache); + local_bh_enable(); + + err = rpl_do_srh(skb, rlwt, dst); + if (unlikely(err)) { + dst_release(dst); + goto drop; + } if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); - if (!dst->error) { + + /* cache only if we don't create a dst reference loop */ + if (!dst->error && lwtst != dst->lwtstate) { + local_bh_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &ipv6_hdr(skb)->saddr); + local_bh_enable(); } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } - local_bh_enable(); - - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; return dst_input(skb); diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index bbf5b84a70fc..f78ecb6ad838 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -40,7 +40,14 @@ #include <net/seg6_hmac.h> #include <linux/random.h> -static DEFINE_PER_CPU(char [SEG6_HMAC_RING_SIZE], hmac_ring); +struct hmac_storage { + local_lock_t bh_lock; + char hmac_ring[SEG6_HMAC_RING_SIZE]; +}; + +static DEFINE_PER_CPU(struct hmac_storage, hmac_storage) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) { @@ -187,7 +194,8 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, */ local_bh_disable(); - ring = this_cpu_ptr(hmac_ring); + local_lock_nested_bh(&hmac_storage.bh_lock); + ring = this_cpu_ptr(hmac_storage.hmac_ring); off = ring; /* source address */ @@ -212,6 +220,7 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, dgsize = __do_hmac(hinfo, ring, plen, tmp_out, SEG6_HMAC_MAX_DIGESTSIZE); + local_unlock_nested_bh(&hmac_storage.bh_lock); local_bh_enable(); if (dgsize < 0) diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 098632adc9b5..51583461ae29 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -124,8 +124,8 @@ static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb, return flowlabel; } -/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ -int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) +static int __seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, + int proto, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct net *net = dev_net(dst->dev); @@ -137,7 +137,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) hdrlen = (osrh->hdrlen + 1) << 3; tot_len = hdrlen + sizeof(*hdr); - err = 
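
/*
 * The rpl_input()/seg6 hunks above save the original lwtstate pointer
 * before the route lookup and only populate the dst_cache when the
 * looked-up dst carries a *different* lwtstate: caching a dst that points
 * back at the same tunnel state would create a reference loop that keeps
 * both objects alive forever. The guard in isolation (helper name is
 * hypothetical):
 */
#include <linux/skbuff.h>
#include <net/dst_cache.h>
#include <net/ipv6.h>

static void cache_dst_if_safe(struct dst_cache *cache, struct sk_buff *skb,
			      struct lwtunnel_state *orig_lwtst)
{
	struct dst_entry *dst = skb_dst(skb);

	/* cache only if we don't create a dst reference loop */
	if (!dst->error && orig_lwtst != dst->lwtstate) {
		local_bh_disable();
		dst_cache_set_ip6(cache, dst, &ipv6_hdr(skb)->saddr);
		local_bh_enable();
	}
}
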
skb_cow_head(skb, tot_len + skb->mac_len); + err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -197,11 +197,18 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) return 0; } + +/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */ +int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto) +{ + return __seg6_do_srh_encap(skb, osrh, proto, NULL); +} EXPORT_SYMBOL_GPL(seg6_do_srh_encap); /* encapsulate an IPv6 packet within an outer IPv6 header with reduced SRH */ static int seg6_do_srh_encap_red(struct sk_buff *skb, - struct ipv6_sr_hdr *osrh, int proto) + struct ipv6_sr_hdr *osrh, int proto, + struct dst_entry *cache_dst) { __u8 first_seg = osrh->first_segment; struct dst_entry *dst = skb_dst(skb); @@ -230,7 +237,7 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb, tot_len = red_hdrlen + sizeof(struct ipv6hdr); - err = skb_cow_head(skb, tot_len + skb->mac_len); + err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -317,8 +324,8 @@ out: return 0; } -/* insert an SRH within an IPv6 packet, just after the IPv6 header */ -int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) +static int __seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, + struct dst_entry *cache_dst) { struct ipv6hdr *hdr, *oldhdr; struct ipv6_sr_hdr *isrh; @@ -326,7 +333,7 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) hdrlen = (osrh->hdrlen + 1) << 3; - err = skb_cow_head(skb, hdrlen + skb->mac_len); + err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb)); if (unlikely(err)) return err; @@ -369,9 +376,8 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) return 0; } -EXPORT_SYMBOL_GPL(seg6_do_srh_inline); -static int seg6_do_srh(struct sk_buff *skb) +static int seg6_do_srh(struct sk_buff *skb, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); struct seg6_iptunnel_encap *tinfo; @@ -384,7 +390,7 @@ static int seg6_do_srh(struct sk_buff *skb) if (skb->protocol != htons(ETH_P_IPV6)) return -EINVAL; - err = seg6_do_srh_inline(skb, tinfo->srh); + err = __seg6_do_srh_inline(skb, tinfo->srh, cache_dst); if (err) return err; break; @@ -402,9 +408,11 @@ static int seg6_do_srh(struct sk_buff *skb) return -EINVAL; if (tinfo->mode == SEG6_IPTUN_MODE_ENCAP) - err = seg6_do_srh_encap(skb, tinfo->srh, proto); + err = __seg6_do_srh_encap(skb, tinfo->srh, + proto, cache_dst); else - err = seg6_do_srh_encap_red(skb, tinfo->srh, proto); + err = seg6_do_srh_encap_red(skb, tinfo->srh, + proto, cache_dst); if (err) return err; @@ -425,11 +433,13 @@ static int seg6_do_srh(struct sk_buff *skb) skb_push(skb, skb->mac_len); if (tinfo->mode == SEG6_IPTUN_MODE_L2ENCAP) - err = seg6_do_srh_encap(skb, tinfo->srh, - IPPROTO_ETHERNET); + err = __seg6_do_srh_encap(skb, tinfo->srh, + IPPROTO_ETHERNET, + cache_dst); else err = seg6_do_srh_encap_red(skb, tinfo->srh, - IPPROTO_ETHERNET); + IPPROTO_ETHERNET, + cache_dst); if (err) return err; @@ -444,6 +454,13 @@ static int seg6_do_srh(struct sk_buff *skb) return 0; } +/* insert an SRH within an IPv6 packet, just after the IPv6 header */ +int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) +{ + return __seg6_do_srh_inline(skb, osrh, NULL); +} +EXPORT_SYMBOL_GPL(seg6_do_srh_inline); + static int seg6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -455,34 +472,48 @@ static int 
seg6_input_core(struct net *net, struct sock *sk, { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; + struct lwtunnel_state *lwtst; struct seg6_lwt *slwt; int err; - err = seg6_do_srh(skb); - if (unlikely(err)) - goto drop; + /* We cannot dereference "orig_dst" once ip6_route_input() or + * skb_dst_drop() is called. However, in order to detect a dst loop, we + * need the address of its lwtstate. So, save the address of lwtstate + * now and use it later as a comparison. + */ + lwtst = orig_dst->lwtstate; - slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); + slwt = seg6_lwt_lwtunnel(lwtst); local_bh_disable(); dst = dst_cache_get(&slwt->cache); + local_bh_enable(); + + err = seg6_do_srh(skb, dst); + if (unlikely(err)) { + dst_release(dst); + goto drop; + } if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); - if (!dst->error) { + + /* cache only if we don't create a dst reference loop */ + if (!dst->error && lwtst != dst->lwtstate) { + local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &ipv6_hdr(skb)->saddr); + local_bh_enable(); } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; } else { skb_dst_drop(skb); skb_dst_set(skb, dst); } - local_bh_enable(); - - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, @@ -528,16 +559,16 @@ static int seg6_output_core(struct net *net, struct sock *sk, struct seg6_lwt *slwt; int err; - err = seg6_do_srh(skb); - if (unlikely(err)) - goto drop; - slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); local_bh_disable(); dst = dst_cache_get(&slwt->cache); local_bh_enable(); + err = seg6_do_srh(skb, dst); + if (unlikely(err)) + goto drop; + if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); struct flowi6 fl6; @@ -552,28 +583,31 @@ static int seg6_output_core(struct net *net, struct sock *sk, dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; - dst_release(dst); goto drop; } - local_bh_disable(); - dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); - local_bh_enable(); + /* cache only if we don't create a dst reference loop */ + if (orig_dst->lwtstate != dst->lwtstate) { + local_bh_disable(); + dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); + local_bh_enable(); + } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; } skb_dst_drop(skb); skb_dst_set(skb, dst); - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); - if (unlikely(err)) - goto drop; - if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, skb_dst(skb)->dev, dst_output); return dst_output(net, sk, skb); drop: + dst_release(dst); kfree_skb(skb); return err; } diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index ac1dbd492c22..a11a02b4ba95 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -1644,10 +1644,8 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { [SEG6_LOCAL_SRH] = { .type = NLA_BINARY }, [SEG6_LOCAL_TABLE] = { .type = NLA_U32 }, [SEG6_LOCAL_VRFTABLE] = { .type = NLA_U32 }, - [SEG6_LOCAL_NH4] = { .type = NLA_BINARY, - .len = sizeof(struct in_addr) }, - [SEG6_LOCAL_NH6] = { .type = NLA_BINARY, - .len = sizeof(struct in6_addr) }, + [SEG6_LOCAL_NH4] = NLA_POLICY_EXACT_LEN(sizeof(struct in_addr)), + [SEG6_LOCAL_NH6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [SEG6_LOCAL_IIF] = { .type = NLA_U32 
}, [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, [SEG6_LOCAL_BPF] = { .type = NLA_NESTED }, diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 39bd8951bfca..a72dbca9e8fc 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -51,6 +51,7 @@ #include <net/dsfield.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netdev_lock.h> #include <net/inet_dscp.h> /* @@ -201,8 +202,7 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) static int ipip6_tunnel_create(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct sit_net *sitn = net_generic(net, sit_net_id); + struct sit_net *sitn = net_generic(t->net, sit_net_id); int err; __dev_addr_set(dev, &t->parms.iph.saddr, 4); @@ -269,6 +269,7 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, nt = netdev_priv(dev); + nt->net = net; nt->parms = *parms; if (ipip6_tunnel_create(dev) < 0) goto failed_free; @@ -1449,7 +1450,6 @@ static int ipip6_tunnel_init(struct net_device *dev) int err; tunnel->dev = dev; - tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); ipip6_tunnel_bind_dev(dev); @@ -1550,19 +1550,23 @@ static bool ipip6_netlink_6rd_parms(struct nlattr *data[], } #endif -static int ipip6_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipip6_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { - struct net *net = dev_net(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel *nt; struct ip_tunnel_encap ipencap; #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; #endif + struct net *net; int err; + net = params->link_net ? : dev_net(dev); nt = netdev_priv(dev); + nt->net = net; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip_tunnel_encap_setup(nt, &ipencap); @@ -1800,8 +1804,7 @@ static struct xfrm_tunnel mplsip_handler __read_mostly = { }; #endif -static void __net_exit sit_destroy_tunnels(struct net *net, - struct list_head *head) +static void __net_exit sit_exit_rtnl_net(struct net *net, struct list_head *head) { struct sit_net *sitn = net_generic(net, sit_net_id); struct net_device *dev, *aux; @@ -1816,15 +1819,15 @@ static void __net_exit sit_destroy_tunnels(struct net *net, for (h = 0; h < (prio ? IP6_SIT_HASH_SIZE : 1); h++) { struct ip_tunnel *t; - t = rtnl_dereference(sitn->tunnels[prio][h]); + t = rtnl_net_dereference(net, sitn->tunnels[prio][h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, - head); - t = rtnl_dereference(t->next); + unregister_netdevice_queue(t->dev, head); + + t = rtnl_net_dereference(net, t->next); } } } @@ -1856,7 +1859,10 @@ static int __net_init sit_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. 
*/ - sitn->fb_tunnel_dev->netns_local = true; + sitn->fb_tunnel_dev->netns_immutable = true; + + t = netdev_priv(sitn->fb_tunnel_dev); + t->net = net; err = register_netdev(sitn->fb_tunnel_dev); if (err) @@ -1865,8 +1871,6 @@ static int __net_init sit_init_net(struct net *net) ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn); ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); - t = netdev_priv(sitn->fb_tunnel_dev); - strcpy(t->parms.name, sitn->fb_tunnel_dev->name); return 0; @@ -1876,19 +1880,9 @@ err_alloc_dev: return err; } -static void __net_exit sit_exit_batch_rtnl(struct list_head *net_list, - struct list_head *dev_to_kill) -{ - struct net *net; - - ASSERT_RTNL(); - list_for_each_entry(net, net_list, exit_list) - sit_destroy_tunnels(net, dev_to_kill); -} - static struct pernet_operations sit_net_ops = { .init = sit_init_net, - .exit_batch_rtnl = sit_exit_batch_rtnl, + .exit_rtnl = sit_exit_rtnl_net, .id = &sit_net_id, .size = sizeof(struct sit_net), }; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2debdf085a3b..e8e68a142649 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -267,6 +267,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport) + fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT; fl6.flowi6_uid = sk->sk_uid; opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); @@ -376,7 +378,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); struct request_sock *fastopen; struct ipv6_pinfo *np; struct tcp_sock *tp; @@ -798,6 +800,8 @@ static void tcp_v6_init_req(struct request_sock *req, ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + ireq->ir_rmt_addr = LOOPBACK4_IPV6; + ireq->ir_loc_addr = LOOPBACK4_IPV6; /* So that link locals have meaning */ if ((!sk_listener->sk_bound_dev_if || l3_slave) && @@ -864,16 +868,16 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 int oif, int rst, u8 tclass, __be32 label, u32 priority, u32 txhash, struct tcp_key *key) { - const struct tcphdr *th = tcp_hdr(skb); - struct tcphdr *t1; - struct sk_buff *buff; - struct flowi6 fl6; - struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); - struct sock *ctl_sk = net->ipv6.tcp_sk; + struct net *net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); unsigned int tot_len = sizeof(struct tcphdr); + struct sock *ctl_sk = net->ipv6.tcp_sk; + const struct tcphdr *th = tcp_hdr(skb); __be32 mrst = 0, *topt; struct dst_entry *dst; - __u32 mark = 0; + struct sk_buff *buff; + struct tcphdr *t1; + struct flowi6 fl6; + u32 mark = 0; if (tsecr) tot_len += TCPOLEN_TSTAMP_ALIGNED; @@ -997,7 +1001,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, - tclass & ~INET_ECN_MASK, priority); + tclass, priority); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); @@ -1039,7 +1043,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, if (!sk && !ipv6_unicast_destination(skb)) return; - net = sk ? 
sock_net(sk) : dev_net(skb_dst(skb)->dev); + net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(th, &md5_hash_location, &aoh)) return; @@ -1133,7 +1137,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, trace_tcp_send_reset(sk, skb, reason); tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1, - ipv6_get_dsfield(ipv6h), label, priority, txhash, + ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK, + label, priority, txhash, &key); #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) @@ -1153,11 +1158,16 @@ static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, tclass, label, priority, txhash, key); } -static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) +static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb, + enum tcp_tw_status tw_status) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); + u8 tclass = tw->tw_tclass; struct tcp_key key = {}; + + if (tw_status == TCP_TW_ACK_OOW) + tclass &= ~INET_ECN_MASK; #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; @@ -1201,7 +1211,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if, - &key, tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), + &key, tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority, tw->tw_txhash); #ifdef CONFIG_TCP_AO @@ -1277,8 +1287,9 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->rcv_nxt, tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), - READ_ONCE(req->ts_recent), sk->sk_bound_dev_if, - &key, ipv6_get_dsfield(ipv6_hdr(skb)), 0, + req->ts_recent, sk->sk_bound_dev_if, + &key, ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK, + 0, READ_ONCE(sk->sk_priority), READ_ONCE(tcp_rsk(req)->txhash)); if (tcp_key_is_ao(&key)) @@ -1451,10 +1462,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * ip6_dst_store(newsk, dst, NULL, NULL); - newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; newnp->saddr = ireq->ir_v6_loc_addr; - newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; - newsk->sk_bound_dev_if = ireq->ir_iif; /* Now IPv6 options... 
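The rpl_input() and seg6_input_core() hunks above share a subtle pattern that is easy to miss: the lwtstate pointer of the incoming dst is saved before ip6_route_input() or skb_dst_drop() can invalidate it, and the freshly resolved route is cached only when its lwtstate differs from the saved one, because caching a route whose lwtstate is our own would create a dst reference loop. A minimal sketch of the pattern, with stub types and stand-in helpers replacing the kernel's (the real code is in the hunks above):

/* Forward declaration is enough: the pattern only compares addresses. */
struct lwtunnel_state;

struct dst_entry {
        struct lwtunnel_state *lwtstate;
        int error;
};

extern struct dst_entry *route_input(void);     /* ip6_route_input() + skb_dst() */
extern void cache_store(struct dst_entry *dst); /* dst_cache_set_ip6() */

static struct dst_entry *input_and_maybe_cache(struct dst_entry *orig_dst)
{
        /* Save only the address; orig_dst must not be dereferenced again
         * once the route lookup may have dropped it.
         */
        struct lwtunnel_state *lwtst = orig_dst->lwtstate;
        struct dst_entry *dst = route_input();

        /* cache only if we don't create a dst reference loop */
        if (!dst->error && lwtst != dst->lwtstate)
                cache_store(dst);

        return dst;
}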
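The reworked seg6 encap paths above also trade skb_cow_head(skb, tot_len + skb->mac_len) for skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb)), so that on a dst-cache hit a single headroom check covers both the encapsulation bytes and the output device's link-layer header, instead of a second reallocation being needed before transmit. dst_dev_overhead() itself is not part of this diff; judging only from its call sites here, it presumably reduces to something like the following (an assumption, not a quote):

/* Assumed shape of the helper, reconstructed from the call sites above;
 * not quoted from this diff. Kernel context (<net/dst.h>, <linux/skbuff.h>)
 * is taken for granted.
 */
static inline unsigned int dst_dev_overhead(struct dst_entry *cache_dst,
                                            struct sk_buff *skb)
{
        if (cache_dst)
                return LL_RESERVED_SPACE(cache_dst->dev);

        return skb->mac_len;
}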
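The seg6_hmac.c hunk above shows another recurring conversion: a bare per-CPU scratch buffer that relied on local_bh_disable() alone is wrapped in a struct together with a local_lock_t, turning the "this CPU owns the buffer" assumption into an explicit, lockdep-visible lock (on PREEMPT_RT, disabling bottom halves alone no longer guarantees exclusive use of per-CPU data). The idiom reduced to its essentials, with illustrative names rather than the diff's:

#include <linux/local_lock.h>
#include <linux/percpu.h>

struct scratch_storage {
        local_lock_t bh_lock;
        char buf[128];
};

static DEFINE_PER_CPU(struct scratch_storage, scratch_storage) = {
        .bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

static void scratch_use(void)
{
        char *ring;

        local_bh_disable();
        local_lock_nested_bh(&scratch_storage.bh_lock);
        ring = this_cpu_ptr(scratch_storage.buf);
        /* ... fill and consume ring; exclusive on this CPU ... */
        local_unlock_nested_bh(&scratch_storage.bh_lock);
        local_bh_enable();
}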
@@ -1507,9 +1515,6 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * tcp_initialize_rcv_mss(newsk); - newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; - newinet->inet_rcv_saddr = LOOPBACK4_IPV6; - #ifdef CONFIG_TCP_MD5SIG l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); @@ -1735,7 +1740,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); - TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); + TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th); TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = @@ -1744,7 +1749,9 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { + struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; + enum tcp_tw_status tw_status; int sdif = inet6_sdif(skb); int dif = inet6_iif(skb); const struct tcphdr *th; @@ -1753,7 +1760,6 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) bool refcounted; int ret; u32 isn; - struct net *net = dev_net(skb->dev); drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) @@ -1832,7 +1838,8 @@ lookup: th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); - nsk = tcp_check_req(sk, skb, req, false, &req_stolen); + nsk = tcp_check_req(sk, skb, req, false, &req_stolen, + &drop_reason); } else { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } @@ -1965,7 +1972,9 @@ do_time_wait: goto csum_error; } - switch (tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn)) { + tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn, + &drop_reason); + switch (tw_status) { case TCP_TW_SYN: { struct sock *sk2; @@ -1990,7 +1999,8 @@ do_time_wait: /* to ACK */ fallthrough; case TCP_TW_ACK: - tcp_v6_timewait_ack(sk, skb); + case TCP_TW_ACK_OOW: + tcp_v6_timewait_ack(sk, skb, tw_status); break; case TCP_TW_RST: tcp_v6_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET); @@ -2004,7 +2014,7 @@ do_time_wait: void tcp_v6_early_demux(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); const struct ipv6hdr *hdr; const struct tcphdr *th; struct sock *sk; @@ -2061,8 +2071,6 @@ const struct inet_connection_sock_af_ops ipv6_specific = { .net_header_len = sizeof(struct ipv6hdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = inet6_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct sockaddr_in6), .mtu_reduced = tcp_v6_mtu_reduced, }; @@ -2095,8 +2103,6 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = { .net_header_len = sizeof(struct iphdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = inet6_csk_addr2sockaddr, - .sockaddr_len = sizeof(struct sockaddr_in6), .mtu_reduced = tcp_v4_mtu_reduced, }; @@ -2192,10 +2198,10 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk_timeout(icsk); } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk->icsk_timeout; + timer_expires = icsk_timeout(icsk); } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; 
timer_expires = sp->sk_timer.expires; diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index a45bf17cb2a1..a8a04f441e78 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -35,14 +35,14 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, inet6_get_iif_sdif(skb, &iif, &sdif); hdr = skb_gro_network_header(skb); - net = dev_net(skb->dev); + net = dev_net_rcu(skb->dev); sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), iif, sdif); NAPI_GRO_CB(skb)->is_flist = !sk; if (sk) - sock_put(sk); + sock_gen_put(sk); #endif /* IS_ENABLED(CONFIG_IPV6) */ } @@ -94,14 +94,23 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) } static void __tcpv6_gso_segment_csum(struct sk_buff *seg, + struct in6_addr *oldip, + const struct in6_addr *newip, __be16 *oldport, __be16 newport) { - struct tcphdr *th; + struct tcphdr *th = tcp_hdr(seg); + + if (!ipv6_addr_equal(oldip, newip)) { + inet_proto_csum_replace16(&th->check, seg, + oldip->s6_addr32, + newip->s6_addr32, + true); + *oldip = *newip; + } if (*oldport == newport) return; - th = tcp_hdr(seg); inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false); *oldport = newport; } @@ -129,10 +138,10 @@ static struct sk_buff *__tcpv6_gso_segment_list_csum(struct sk_buff *segs) th2 = tcp_hdr(seg); iph2 = ipv6_hdr(seg); - iph2->saddr = iph->saddr; - iph2->daddr = iph->daddr; - __tcpv6_gso_segment_csum(seg, &th2->source, th->source); - __tcpv6_gso_segment_csum(seg, &th2->dest, th->dest); + __tcpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr, + &th2->source, th->source); + __tcpv6_gso_segment_csum(seg, &iph2->daddr, &iph->daddr, + &th2->dest, th->dest); } return segs; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index d766fd798ecf..7317f8e053f1 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -46,6 +46,7 @@ #include <net/tcp_states.h> #include <net/ip6_checksum.h> #include <net/ip6_tunnel.h> +#include <net/udp_tunnel.h> #include <net/xfrm.h> #include <net/inet_hashtables.h> #include <net/inet6_hashtables.h> @@ -170,6 +171,49 @@ static int compute_score(struct sock *sk, const struct net *net, return score; } +/** + * udp6_lib_lookup1() - Simplified lookup using primary hash (destination port) + * @net: Network namespace + * @saddr: Source address, network order + * @sport: Source port, network order + * @daddr: Destination address, network order + * @hnum: Destination port, host order + * @dif: Destination interface index + * @sdif: Destination bridge port index, if relevant + * @udptable: Set of UDP hash tables + * + * Simplified lookup to be used as fallback if no sockets are found due to a + * potential race between (receive) address change, and lookup happening before + * the rehash operation. This function ignores SO_REUSEPORT groups while scoring + * result sockets, because if we have one, we don't need the fallback at all. + * + * Called under rcu_read_lock(). 
+ * + * Return: socket with highest matching score if any, NULL if none + */ +static struct sock *udp6_lib_lookup1(const struct net *net, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, + unsigned int hnum, int dif, int sdif, + const struct udp_table *udptable) +{ + unsigned int slot = udp_hashfn(net, hnum, udptable->mask); + struct udp_hslot *hslot = &udptable->hash[slot]; + struct sock *sk, *result = NULL; + int score, badness = 0; + + sk_for_each_rcu(sk, &hslot->head) { + score = compute_score(sk, net, + saddr, sport, daddr, hnum, dif, sdif); + if (score > badness) { + result = sk; + badness = score; + } + } + + return result; +} + /* called with rcu_read_lock() */ static struct sock *udp6_lib_lookup2(const struct net *net, const struct in6_addr *saddr, __be16 sport, @@ -347,6 +391,13 @@ struct sock *__udp6_lib_lookup(const struct net *net, result = udp6_lib_lookup2(net, saddr, sport, &in6addr_any, hnum, dif, sdif, hslot2, skb); + if (!IS_ERR_OR_NULL(result)) + goto done; + + /* Cover address change/lookup/rehash race: see __udp4_lib_lookup() */ + result = udp6_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif, + udptable); + done: if (IS_ERR(result)) return NULL; @@ -536,7 +587,7 @@ csum_copy_err: SNMP_INC_STATS(mib, UDP_MIB_CSUMERRORS); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); /* starting over for a new packet, but check if we need to yield */ cond_resched(); @@ -1339,9 +1390,9 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); - if (hlen + cork->gso_size > cork->fragsize) { + if (hlen + min(datalen, cork->gso_size) > cork->fragsize) { kfree_skb(skb); - return -EINVAL; + return -EMSGSIZE; } if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); @@ -1444,10 +1495,8 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); - ipcm6_init(&ipc6); + ipcm6_init_sk(&ipc6, sk); ipc6.gso_size = READ_ONCE(up->gso_size); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = READ_ONCE(sk->sk_mark); /* destination address check */ if (sin6) { @@ -1653,9 +1702,6 @@ do_udp_sendmsg: security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6->flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6->flowlabel); dst = ip6_sk_dst_lookup_flow(sk, fl6, final_p, connected); @@ -1701,8 +1747,6 @@ back_from_confirm: WRITE_ONCE(up->pending, AF_INET6); do_append_data: - if (ipc6.dontfrag < 0) - ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); up->len += ulen; err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, fl6, dst_rt6_info(dst), @@ -1782,6 +1826,7 @@ void udpv6_destroy_sock(struct sock *sk) if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udpv6_encap_needed_key); udp_encap_disable(); + udp_tunnel_cleanup_gro(sk); } } } diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index b41152dd4246..d8445ac1b2e4 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -117,9 +117,14 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport) { const struct ipv6hdr *iph = skb_gro_network_header(skb); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); + struct sock *sk; int iif, sdif; + sk = udp_tunnel_sk(net, true); + if (sk && dport 
== htons(sk->sk_num)) + return sk; + inet6_get_iif_sdif(skb, &iif, &sdif); return __udp6_lib_lookup(net, &iph->saddr, sport, diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 4abc5e9d6322..841c81abaaf4 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -179,14 +179,18 @@ struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, int offset = skb_gro_offset(skb); const struct net_offload *ops; struct sk_buff *pp = NULL; - int ret; + int len, dlen; + __u8 *udpdata; + __be32 *udpdata32; if (skb->protocol == htons(ETH_P_IP)) return xfrm4_gro_udp_encap_rcv(sk, head, skb); - offset = offset - sizeof(struct udphdr); - - if (!pskb_pull(skb, offset)) + len = skb->len - offset; + dlen = offset + min(len, 8); + udpdata = skb_gro_header(skb, dlen, offset); + udpdata32 = (__be32 *)udpdata; + if (unlikely(!udpdata)) return NULL; rcu_read_lock(); @@ -194,11 +198,10 @@ struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, if (!ops || !ops->callbacks.gro_receive) goto out; - ret = __xfrm6_udp_encap_rcv(sk, skb, false); - if (ret) + /* check if it is a keepalive or IKE packet */ + if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0) goto out; - skb_push(skb, offset); NAPI_GRO_CB(skb)->proto = IPPROTO_UDP; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); @@ -208,7 +211,6 @@ struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, out: rcu_read_unlock(); - skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 1; diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 5f7b1fdbffe6..b3d5d1f266ee 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -82,14 +82,14 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) toobig = skb->len > mtu && !skb_is_gso(skb); - if (toobig && xfrm6_local_dontfrag(skb->sk)) { + if (toobig && xfrm6_local_dontfrag(sk)) { xfrm6_local_rxpmtu(skb, mtu); kfree_skb(skb); return -EMSGSIZE; } else if (toobig && xfrm6_noneed_fragment(skb)) { skb->ignore_df = 1; goto skip_frag; - } else if (!skb->ignore_df && toobig && skb->sk) { + } else if (!skb->ignore_df && toobig && sk) { xfrm_local_error(skb, mtu); kfree_skb(skb); return -EMSGSIZE; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 7929df08d4e0..cc2b3c44bc05 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -28,6 +28,7 @@ #include <linux/poll.h> #include <linux/security.h> #include <net/sock.h> +#include <asm/machine.h> #include <asm/ebcdic.h> #include <asm/cpcmd.h> #include <linux/kmod.h> @@ -2272,7 +2273,7 @@ static int __init afiucv_init(void) { int err; - if (MACHINE_IS_VM && IS_ENABLED(CONFIG_IUCV)) { + if (machine_is_vm() && IS_ENABLED(CONFIG_IUCV)) { cpcmd("QUERY USERID", iucv_userid, sizeof(iucv_userid), &err); if (unlikely(err)) { WARN_ON(err); diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index d3e9efab7f4b..83070a2e4485 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -39,6 +39,7 @@ #include <linux/reboot.h> #include <net/iucv/iucv.h> #include <linux/atomic.h> +#include <asm/machine.h> #include <asm/ebcdic.h> #include <asm/io.h> #include <asm/irq.h> @@ -1865,7 +1866,7 @@ static int __init iucv_init(void) { int rc; - if (!MACHINE_IS_VM) { + if (!machine_is_vm()) { rc = -EPROTONOSUPPORT; goto out; } diff --git a/net/key/af_key.c b/net/key/af_key.c index c56bb4f451e6..efc2a91f4c48 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2630,7 +2630,7 @@ static int pfkey_migrate(struct sock 
*sk, struct sk_buff *skb, } return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i, - kma ? &k : NULL, net, NULL, 0, NULL); + kma ? &k : NULL, net, NULL, 0, NULL, NULL); out: return err; diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index d692b902e120..cf0b66f4fb29 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -25,6 +25,7 @@ #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/netdev_lock.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/udp.h> @@ -73,9 +74,9 @@ static netdev_tx_t l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev int ret = l2tp_xmit_skb(session, skb); if (likely(ret == NET_XMIT_SUCCESS)) - dev_sw_netstats_tx_add(dev, 1, len); + dev_dstats_tx_add(dev, len); else - DEV_STATS_INC(dev, tx_dropped); + dev_dstats_tx_dropped(dev); return NETDEV_TX_OK; } @@ -84,7 +85,6 @@ static const struct net_device_ops l2tp_eth_netdev_ops = { .ndo_init = l2tp_eth_dev_init, .ndo_uninit = l2tp_eth_dev_uninit, .ndo_start_xmit = l2tp_eth_dev_xmit, - .ndo_get_stats64 = dev_get_tstats64, .ndo_set_mac_address = eth_mac_addr, }; @@ -100,7 +100,7 @@ static void l2tp_eth_dev_setup(struct net_device *dev) dev->lltx = true; dev->netdev_ops = &l2tp_eth_netdev_ops; dev->needs_free_netdev = true; - dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; + dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len) @@ -128,7 +128,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, goto error_rcu; if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) - dev_sw_netstats_rx_add(dev, data_len); + dev_dstats_rx_add(dev, data_len); else DEV_STATS_INC(dev, rx_errors); diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 4bc24fddfd52..29795d2839e8 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -425,7 +425,6 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int rc; struct inet_sock *inet = inet_sk(sk); struct rtable *rt = NULL; - struct flowi4 *fl4; int connected = 0; __be32 daddr; @@ -455,7 +454,6 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (sk->sk_state != TCP_ESTABLISHED) goto out; - daddr = inet->inet_daddr; connected = 1; } @@ -482,29 +480,24 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) goto error; } - fl4 = &inet->cork.fl.u.ip4; if (connected) rt = dst_rtable(__sk_dst_check(sk, 0)); rcu_read_lock(); if (!rt) { - const struct ip_options_rcu *inet_opt; + struct flowi4 *fl4 = &inet->cork.fl.u.ip4; - inet_opt = rcu_dereference(inet->inet_opt); + inet_sk_init_flowi4(inet, fl4); - /* Use correct destination address if we have options. */ - if (inet_opt && inet_opt->opt.srr) - daddr = inet_opt->opt.faddr; + /* Overwrite ->daddr if msg->msg_name was provided */ + if (!connected) + fl4->daddr = daddr; /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. 
*/ - rt = ip_route_output_ports(sock_net(sk), fl4, sk, - daddr, inet->inet_saddr, - inet->inet_dport, inet->inet_sport, - sk->sk_protocol, ip_sock_rt_tos(sk), - sk->sk_bound_dev_if); + rt = ip_route_output_flow(sock_net(sk), fl4, sk); if (IS_ERR(rt)) goto no_route; if (connected) { diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index f4c1da070826..b98d13584c81 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -547,7 +547,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = READ_ONCE(sk->sk_mark); fl6.flowi6_uid = sk->sk_uid; - ipcm6_init(&ipc6); + ipcm6_init_sk(&ipc6, sk); if (lsa) { if (addr_len < SIN6_LEN_RFC2133) @@ -634,9 +634,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); @@ -648,9 +645,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.dontfrag < 0) - ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); - if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index 53baf2dd5d5d..fc5c2fd8f34c 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -806,6 +806,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, po->chan.private = sk; po->chan.ops = &pppol2tp_chan_ops; po->chan.mtu = pppol2tp_tunnel_mtu(tunnel); + po->chan.direct_xmit = true; error = ppp_register_net_channel(sock_net(sk), &po->chan); if (error) { diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c index ca10916340b0..5432a5f2dfc8 100644 --- a/net/l3mdev/l3mdev.c +++ b/net/l3mdev/l3mdev.c @@ -277,8 +277,10 @@ void l3mdev_update_flow(struct net *net, struct flowi *fl) if (fl->flowi_oif) { dev = dev_get_by_index_rcu(net, fl->flowi_oif); if (dev) { - if (!fl->flowi_l3mdev) + if (!fl->flowi_l3mdev) { fl->flowi_l3mdev = l3mdev_master_ifindex_rcu(dev); + fl->flowi_flags |= FLOWI_FLAG_L3MDEV_OIF; + } /* oif set to L3mdev directs lookup to its table; * reset to avoid oif match in fib_lookup diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c index 0971ca48ba15..a0596e1f91da 100644 --- a/net/lapb/lapb_iface.c +++ b/net/lapb/lapb_iface.c @@ -194,8 +194,8 @@ int lapb_unregister(struct net_device *dev) spin_unlock_bh(&lapb->lock); /* Wait for running timers to stop */ - del_timer_sync(&lapb->t1timer); - del_timer_sync(&lapb->t2timer); + timer_delete_sync(&lapb->t1timer); + timer_delete_sync(&lapb->t2timer); __lapb_remove_cb(lapb); diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c index 5be68869064d..5b3f3b444d19 100644 --- a/net/lapb/lapb_timer.c +++ b/net/lapb/lapb_timer.c @@ -35,7 +35,7 @@ static void lapb_t2timer_expiry(struct timer_list *); void lapb_start_t1timer(struct lapb_cb *lapb) { - del_timer(&lapb->t1timer); + timer_delete(&lapb->t1timer); lapb->t1timer.function = lapb_t1timer_expiry; lapb->t1timer.expires = jiffies + lapb->t1; @@ -46,7 +46,7 @@ void lapb_start_t1timer(struct lapb_cb *lapb) void lapb_start_t2timer(struct lapb_cb *lapb) { - del_timer(&lapb->t2timer); + timer_delete(&lapb->t2timer); lapb->t2timer.function = lapb_t2timer_expiry; lapb->t2timer.expires = jiffies + lapb->t2; @@ -58,13 +58,13 @@ void lapb_start_t2timer(struct lapb_cb *lapb) void lapb_stop_t1timer(struct 
lapb_cb *lapb) { lapb->t1timer_running = false; - del_timer(&lapb->t1timer); + timer_delete(&lapb->t1timer); } void lapb_stop_t2timer(struct lapb_cb *lapb) { lapb->t2timer_running = false; - del_timer(&lapb->t2timer); + timer_delete(&lapb->t2timer); } int lapb_t1timer_running(struct lapb_cb *lapb) diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 0259cde394ba..cc77ec5769d8 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -887,15 +887,15 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (sk->sk_type != SOCK_STREAM) goto copy_uaddr; + /* Partial read */ + if (used + offset < skb_len) + continue; + if (!(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); kfree_skb(skb); *seq = 0; } - - /* Partial read */ - if (used + offset < skb_len) - continue; } while (len > 0); out: diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index 40ca3c1e42a2..7e8fc710c590 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -51,7 +51,7 @@ int llc_conn_ac_clear_remote_busy(struct sock *sk, struct sk_buff *skb) struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); llc->remote_busy_flag = 0; - del_timer(&llc->busy_state_timer.timer); + timer_delete(&llc->busy_state_timer.timer); nr = LLC_I_GET_NR(pdu); llc_conn_resend_i_pdu_as_cmd(sk, nr, 0); } @@ -191,7 +191,7 @@ int llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2(struct sock *sk, struct llc_sock *llc = llc_sk(sk); if (llc->data_flag == 2) - del_timer(&llc->rej_sent_timer.timer); + timer_delete(&llc->rej_sent_timer.timer); return 0; } @@ -1111,9 +1111,9 @@ int llc_conn_ac_stop_other_timers(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); - del_timer(&llc->rej_sent_timer.timer); - del_timer(&llc->pf_cycle_timer.timer); - del_timer(&llc->busy_state_timer.timer); + timer_delete(&llc->rej_sent_timer.timer); + timer_delete(&llc->pf_cycle_timer.timer); + timer_delete(&llc->busy_state_timer.timer); llc->ack_must_be_send = 0; llc->ack_pf = 0; return 0; @@ -1149,7 +1149,7 @@ int llc_conn_ac_start_ack_tmr_if_not_running(struct sock *sk, int llc_conn_ac_stop_ack_timer(struct sock *sk, struct sk_buff *skb) { - del_timer(&llc_sk(sk)->ack_timer.timer); + timer_delete(&llc_sk(sk)->ack_timer.timer); return 0; } @@ -1157,14 +1157,14 @@ int llc_conn_ac_stop_p_timer(struct sock *sk, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(sk); - del_timer(&llc->pf_cycle_timer.timer); + timer_delete(&llc->pf_cycle_timer.timer); llc_conn_set_p_flag(sk, 0); return 0; } int llc_conn_ac_stop_rej_timer(struct sock *sk, struct sk_buff *skb) { - del_timer(&llc_sk(sk)->rej_sent_timer.timer); + timer_delete(&llc_sk(sk)->rej_sent_timer.timer); return 0; } @@ -1180,7 +1180,7 @@ int llc_conn_ac_upd_nr_received(struct sock *sk, struct sk_buff *skb) /* On loopback we don't queue I frames in unack_pdu_q queue. */ if (acked > 0 || (llc->dev->flags & IFF_LOOPBACK)) { llc->retry_count = 0; - del_timer(&llc->ack_timer.timer); + timer_delete(&llc->ack_timer.timer); if (llc->failed_data_req) { /* already, we did not accept data from upper layer * (tx_window full or unacceptable state). 
Now, we diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index afc6974eafda..5c0ac243b248 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -949,15 +949,15 @@ void llc_sk_stop_all_timers(struct sock *sk, bool sync) struct llc_sock *llc = llc_sk(sk); if (sync) { - del_timer_sync(&llc->pf_cycle_timer.timer); - del_timer_sync(&llc->ack_timer.timer); - del_timer_sync(&llc->rej_sent_timer.timer); - del_timer_sync(&llc->busy_state_timer.timer); + timer_delete_sync(&llc->pf_cycle_timer.timer); + timer_delete_sync(&llc->ack_timer.timer); + timer_delete_sync(&llc->rej_sent_timer.timer); + timer_delete_sync(&llc->busy_state_timer.timer); } else { - del_timer(&llc->pf_cycle_timer.timer); - del_timer(&llc->ack_timer.timer); - del_timer(&llc->rej_sent_timer.timer); - del_timer(&llc->busy_state_timer.timer); + timer_delete(&llc->pf_cycle_timer.timer); + timer_delete(&llc->ack_timer.timer); + timer_delete(&llc->rej_sent_timer.timer); + timer_delete(&llc->busy_state_timer.timer); } llc->ack_must_be_send = 0; diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c index 06fb8e6944b0..7a0cae9a8111 100644 --- a/net/llc/llc_s_ac.c +++ b/net/llc/llc_s_ac.c @@ -24,7 +24,7 @@ #include <net/llc_s_ac.h> #include <net/llc_s_ev.h> #include <net/llc_sap.h> - +#include <net/sock.h> /** * llc_sap_action_unitdata_ind - forward UI PDU to network layer @@ -40,6 +40,26 @@ int llc_sap_action_unitdata_ind(struct llc_sap *sap, struct sk_buff *skb) return 0; } +static int llc_prepare_and_xmit(struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + struct sk_buff *nskb; + int rc; + + rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); + if (rc) + return rc; + + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return -ENOMEM; + + if (skb->sk) + skb_set_owner_w(nskb, skb->sk); + + return dev_queue_xmit(nskb); +} + /** * llc_sap_action_send_ui - sends UI PDU resp to UNITDATA REQ to MAC layer * @sap: SAP @@ -52,17 +72,12 @@ int llc_sap_action_unitdata_ind(struct llc_sap *sap, struct sk_buff *skb) int llc_sap_action_send_ui(struct llc_sap *sap, struct sk_buff *skb) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); - int rc; llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap, ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_ui_cmd(skb); - rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); - if (likely(!rc)) { - skb_get(skb); - rc = dev_queue_xmit(skb); - } - return rc; + + return llc_prepare_and_xmit(skb); } /** @@ -77,17 +92,12 @@ int llc_sap_action_send_ui(struct llc_sap *sap, struct sk_buff *skb) int llc_sap_action_send_xid_c(struct llc_sap *sap, struct sk_buff *skb) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); - int rc; llc_pdu_header_init(skb, LLC_PDU_TYPE_U_XID, ev->saddr.lsap, ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_xid_cmd(skb, LLC_XID_NULL_CLASS_2, 0); - rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); - if (likely(!rc)) { - skb_get(skb); - rc = dev_queue_xmit(skb); - } - return rc; + + return llc_prepare_and_xmit(skb); } /** @@ -133,17 +143,12 @@ out: int llc_sap_action_send_test_c(struct llc_sap *sap, struct sk_buff *skb) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); - int rc; llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap, ev->daddr.lsap, LLC_PDU_CMD); llc_pdu_init_as_test_cmd(skb); - rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); - if (likely(!rc)) { - skb_get(skb); - rc = dev_queue_xmit(skb); - } - return rc; + + return llc_prepare_and_xmit(skb); } int llc_sap_action_send_test_r(struct llc_sap *sap, struct sk_buff *skb) diff --git 
a/net/llc/sysctl_net_llc.c b/net/llc/sysctl_net_llc.c index 72e101135f8c..c8d88e2508fc 100644 --- a/net/llc/sysctl_net_llc.c +++ b/net/llc/sysctl_net_llc.c @@ -11,10 +11,6 @@ #include <net/net_namespace.h> #include <net/llc.h> -#ifndef CONFIG_SYSCTL -#error This file should not be compiled without CONFIG_SYSCTL defined -#endif - static struct ctl_table llc2_timeout_table[] = { { .procname = "ack", diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index f3fbe5a4395e..85612234742a 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation */ /** @@ -103,13 +103,13 @@ void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, if (!tid_rx) return; - del_timer_sync(&tid_rx->session_timer); + timer_delete_sync(&tid_rx->session_timer); /* make sure ieee80211_sta_reorder_release() doesn't re-arm the timer */ spin_lock_bh(&tid_rx->reorder_lock); tid_rx->removed = true; spin_unlock_bh(&tid_rx->reorder_lock); - del_timer_sync(&tid_rx->reorder_timer); + timer_delete_sync(&tid_rx->reorder_timer); call_rcu(&tid_rx->rcu_head, ieee80211_free_tid_rx); } @@ -206,17 +206,19 @@ u8 ieee80211_retrieve_addba_ext_data(struct sta_info *sta, elems = ieee802_11_parse_elems(elem_data, elem_len, true, NULL); - if (elems && !elems->parse_error && elems->addba_ext_ie) { - data = elems->addba_ext_ie->data; + if (!elems || elems->parse_error || !elems->addba_ext_ie) + goto free; - if (!sta->sta.deflink.eht_cap.has_eht || !buf_size) - goto free; + data = elems->addba_ext_ie->data; + if (buf_size && + (sta->sta.valid_links || sta->sta.deflink.eht_cap.has_eht)) { buf_size_1k = u8_get_bits(elems->addba_ext_ie->data, IEEE80211_ADDBA_EXT_BUF_SIZE_MASK); *buf_size |= (u16)buf_size_1k << IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT; } + free: kfree(elems); @@ -258,7 +260,7 @@ static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid, mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout); mgmt->u.action.u.addba_resp.status = cpu_to_le16(status); - if (sta->sta.deflink.he_cap.has_he) + if (sta->sta.valid_links || sta->sta.deflink.he_cap.has_he) ieee80211_add_addbaext(skb, req_addba_ext_data, buf_size); ieee80211_tx_skb(sdata, skb); @@ -293,7 +295,8 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, goto end; } - if (!sta->sta.deflink.ht_cap.ht_supported && + if (!sta->sta.valid_links && + !sta->sta.deflink.ht_cap.ht_supported && !sta->sta.deflink.he_cap.has_he) { ht_dbg(sta->sdata, "STA %pM erroneously requests BA session on tid %d w/o HT\n", @@ -309,7 +312,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, goto end; } - if (sta->sta.deflink.eht_cap.has_eht) + if (sta->sta.valid_links || sta->sta.deflink.eht_cap.has_eht) max_buf_size = IEEE80211_MAX_AMPDU_BUF_EHT; else if (sta->sta.deflink.he_cap.has_he) max_buf_size = IEEE80211_MAX_AMPDU_BUF_HE; @@ -321,7 +324,8 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, * and if buffer size does not exceeds max value */ /* XXX: check own ht delayed BA capability?? 
*/ if (((ba_policy != 1) && - (!(sta->sta.deflink.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) || + (sta->sta.valid_links || + !(sta->sta.deflink.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) || (buf_size > max_buf_size)) { status = WLAN_STATUS_INVALID_QOS_PARAM; ht_dbg_ratelimited(sta->sdata, diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 61f2cac37728..8dc8c3c96b96 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2023 Intel Corporation + * Copyright (C) 2018 - 2024 Intel Corporation */ #include <linux/ieee80211.h> @@ -362,8 +362,8 @@ int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, ht_dbg(sta->sdata, "Tx BA session stop requested for %pM tid %u\n", sta->sta.addr, tid); - del_timer_sync(&tid_tx->addba_resp_timer); - del_timer_sync(&tid_tx->session_timer); + timer_delete_sync(&tid_tx->addba_resp_timer); + timer_delete_sync(&tid_tx->session_timer); /* * After this packets are no longer handed right through @@ -464,7 +464,9 @@ static void ieee80211_send_addba_with_timeout(struct sta_info *sta, sta->ampdu_mlme.addba_req_num[tid]++; spin_unlock_bh(&sta->lock); - if (sta->sta.deflink.eht_cap.has_eht) { + if (sta->sta.valid_links || + sta->sta.deflink.eht_cap.has_eht || + ieee80211_hw_check(&local->hw, STRICT)) { buf_size = local->hw.max_tx_aggregation_subframes; } else if (sta->sta.deflink.he_cap.has_he) { buf_size = min_t(u16, local->hw.max_tx_aggregation_subframes, @@ -608,7 +610,8 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, "Requested to start BA session on reserved tid=%d", tid)) return -EINVAL; - if (!pubsta->deflink.ht_cap.ht_supported && + if (!pubsta->valid_links && + !pubsta->deflink.ht_cap.ht_supported && !pubsta->deflink.vht_cap.vht_supported && !pubsta->deflink.he_cap.has_he && !pubsta->deflink.eht_cap.has_eht) @@ -999,7 +1002,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, return; } - del_timer_sync(&tid_tx->addba_resp_timer); + timer_delete_sync(&tid_tx->addba_resp_timer); ht_dbg(sta->sdata, "switched off addBA timer for %pM tid %d\n", sta->sta.addr, tid); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index d3fc158ccaf6..d9d88f2f2831 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ #include <linux/ieee80211.h> @@ -89,15 +89,14 @@ static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, /* check flags first */ if (params->flags && ieee80211_sdata_running(sdata)) { - u32 mask = MONITOR_FLAG_COOK_FRAMES | MONITOR_FLAG_ACTIVE; + u32 mask = MONITOR_FLAG_ACTIVE; /* - * Prohibit MONITOR_FLAG_COOK_FRAMES and - * MONITOR_FLAG_ACTIVE to be changed while the - * interface is up. + * Prohibit MONITOR_FLAG_ACTIVE to be changed + * while the interface is up. 
* Else we would need to add a lot of cruft * to update everything: - * cooked_mntrs, monitor and all fif_* counters + * monitor and all fif_* counters * reconfigure hardware */ if ((params->flags & mask) != (sdata->u.mntr.flags & mask)) @@ -147,8 +146,8 @@ static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *link_conf) { struct ieee80211_sub_if_data *tx_sdata; + struct ieee80211_bss_conf *old; - sdata->vif.mbssid_tx_vif = NULL; link_conf->bssid_index = 0; link_conf->nontransmitted = false; link_conf->ema_ap = false; @@ -157,14 +156,26 @@ static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata, if (sdata->vif.type != NL80211_IFTYPE_AP || !params->tx_wdev) return -EINVAL; + old = sdata_dereference(link_conf->tx_bss_conf, sdata); + if (old) + return -EALREADY; + tx_sdata = IEEE80211_WDEV_TO_SUB_IF(params->tx_wdev); if (!tx_sdata) return -EINVAL; if (tx_sdata == sdata) { - sdata->vif.mbssid_tx_vif = &sdata->vif; + rcu_assign_pointer(link_conf->tx_bss_conf, link_conf); } else { - sdata->vif.mbssid_tx_vif = &tx_sdata->vif; + struct ieee80211_bss_conf *tx_bss_conf; + + tx_bss_conf = sdata_dereference(tx_sdata->vif.link_conf[params->tx_link_id], + sdata); + if (rcu_access_pointer(tx_bss_conf->tx_bss_conf) != tx_bss_conf) + return -EINVAL; + + rcu_assign_pointer(link_conf->tx_bss_conf, tx_bss_conf); + link_conf->nontransmitted = true; link_conf->bssid_index = params->index; } @@ -503,6 +514,9 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, if (IS_ERR(link)) return PTR_ERR(link); + if (WARN_ON(pairwise && link_id >= 0)) + return -EINVAL; + if (pairwise && params->mode == NL80211_KEY_SET_TX) return ieee80211_set_tx(sdata, mac_addr, key_idx); @@ -525,10 +539,12 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, if (IS_ERR(key)) return PTR_ERR(key); - key->conf.link_id = link_id; - - if (pairwise) + if (pairwise) { key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE; + key->conf.link_id = -1; + } else { + key->conf.link_id = link->link_id; + } if (params->mode == NL80211_KEY_NO_TX) key->conf.flags |= IEEE80211_KEY_FLAG_NO_AUTO_TX; @@ -915,7 +931,7 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy, sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (!ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { if (cfg80211_chandef_identical(&local->monitor_chanreq.oper, - &chanreq.oper)) + &chanreq.oper)) return 0; sdata = wiphy_dereference(wiphy, local->monitor_sdata); @@ -924,7 +940,7 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy, } if (rcu_access_pointer(sdata->deflink.conf->chanctx_conf) && - cfg80211_chandef_identical(&sdata->vif.bss_conf.chanreq.oper, + cfg80211_chandef_identical(&sdata->vif.bss_conf.chanreq.oper, &chanreq.oper)) return 0; @@ -1274,9 +1290,9 @@ static u8 ieee80211_num_beaconing_links(struct ieee80211_sub_if_data *sdata) sdata->vif.type != NL80211_IFTYPE_P2P_GO) return num; - if (!sdata->vif.valid_links) - return num; - + /* non-MLO mode of operation also uses link_id 0 in sdata so it is + * safe to directly proceed with the below loop + */ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { link = sdata_dereference(sdata->link[link_id], sdata); if (!link) @@ -1405,6 +1421,9 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, (IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ | IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ); + 
link_conf->eht_disable_mcs15 = + u8_get_bits(params->eht_oper->params, + IEEE80211_EHT_OPER_MCS15_DISABLE); } else { link_conf->eht_su_beamformer = false; link_conf->eht_su_beamformee = false; @@ -1665,13 +1684,12 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, kfree(link_conf->ftmr_params); link_conf->ftmr_params = NULL; - sdata->vif.mbssid_tx_vif = NULL; link_conf->bssid_index = 0; link_conf->nontransmitted = false; link_conf->ema_ap = false; link_conf->bssid_indicator = 0; - __sta_info_flush(sdata, true, link_id); + __sta_info_flush(sdata, true, link_id, NULL); ieee80211_remove_link_keys(link, &keys); if (!list_empty(&keys)) { @@ -1679,6 +1697,9 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, ieee80211_free_key_list(local, &keys); } + ieee80211_stop_mbssid(sdata); + RCU_INIT_POINTER(link_conf->tx_bss_conf, NULL); + link_conf->enable_beacon = false; sdata->beacon_rate_set = false; sdata->vif.cfg.ssid_len = 0; @@ -1903,12 +1924,12 @@ static int sta_link_apply_parameters(struct ieee80211_local *local, } if (params->supported_rates && - params->supported_rates_len) { - ieee80211_parse_bitrates(link->conf->chanreq.oper.width, - sband, params->supported_rates, - params->supported_rates_len, - &link_sta->pub->supp_rates[sband->band]); - } + params->supported_rates_len && + !ieee80211_parse_bitrates(link->conf->chanreq.oper.width, + sband, params->supported_rates, + params->supported_rates_len, + &link_sta->pub->supp_rates[sband->band])) + return -EINVAL; if (params->ht_capa) ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, @@ -2062,6 +2083,9 @@ static int sta_apply_parameters(struct ieee80211_local *local, if (params->listen_interval >= 0) sta->listen_interval = params->listen_interval; + if (params->eml_cap_present) + sta->sta.eml_cap = params->eml_cap; + ret = sta_link_apply_parameters(local, sta, STA_LINK_MODE_STA_MODIFY, &params->link_sta_params); if (ret) @@ -2900,7 +2924,7 @@ static int ieee80211_scan(struct wiphy *wiphy, * the frames sent while scanning on other channel will be * lost) */ - if (sdata->deflink.u.ap.beacon && + if (ieee80211_num_beaconing_links(sdata) && (!(wiphy->features & NL80211_FEATURE_AP_SCAN) || !(req->flags & NL80211_SCAN_FLAG_AP))) return -EOPNOTSUPP; @@ -3190,19 +3214,27 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, static int ieee80211_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id, int *dbm) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + struct ieee80211_link_data *link_data; if (local->ops->get_txpower && (sdata->flags & IEEE80211_SDATA_IN_DRIVER)) - return drv_get_txpower(local, sdata, dbm); + return drv_get_txpower(local, sdata, link_id, dbm); - if (local->emulate_chanctx) + if (local->emulate_chanctx) { *dbm = local->hw.conf.power_level; - else - *dbm = sdata->vif.bss_conf.txpower; + } else { + link_data = wiphy_dereference(wiphy, sdata->link[link_id]); + + if (link_data) + *dbm = link_data->conf->txpower; + else + return -ENOLINK; + } /* INT_MIN indicates no power level was set yet */ if (*dbm == INT_MIN) @@ -3688,6 +3720,7 @@ void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; + struct ieee80211_bss_conf *tx_bss_conf; struct ieee80211_link_data *link_data; if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS)) @@ -3701,25 +3734,24 @@ void 
ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id) return; } - /* TODO: MBSSID with MLO changes */ - if (vif->mbssid_tx_vif == vif) { + tx_bss_conf = rcu_dereference(link_data->conf->tx_bss_conf); + if (tx_bss_conf == link_data->conf) { /* Trigger ieee80211_csa_finish() on the non-transmitting * interfaces when channel switch is received on * transmitting interface */ - struct ieee80211_sub_if_data *iter; - - list_for_each_entry_rcu(iter, &local->interfaces, list) { - if (!ieee80211_sdata_running(iter)) - continue; + struct ieee80211_link_data *iter; - if (iter == sdata || iter->vif.mbssid_tx_vif != vif) + for_each_sdata_link(local, iter) { + if (iter->sdata == sdata || + rcu_access_pointer(iter->conf->tx_bss_conf) != tx_bss_conf) continue; - wiphy_work_queue(iter->local->hw.wiphy, - &iter->deflink.csa.finalize_work); + wiphy_work_queue(iter->sdata->local->hw.wiphy, + &iter->csa.finalize_work); } } + wiphy_work_queue(local->hw.wiphy, &link_data->csa.finalize_work); rcu_read_unlock(); @@ -4358,9 +4390,8 @@ static int ieee80211_cfg_get_channel(struct wiphy *wiphy, if (chanctx_conf) { *chandef = link->conf->chanreq.oper; ret = 0; - } else if (!ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR) && - local->open_count > 0 && - local->open_count == local->monitors && + } else if (local->open_count > 0 && + local->open_count == local->virt_monitors && sdata->vif.type == NL80211_IFTYPE_MONITOR) { *chandef = local->monitor_chanreq.oper; ret = 0; @@ -4822,17 +4853,19 @@ ieee80211_color_change_bss_config_notify(struct ieee80211_link_data *link, ieee80211_link_info_change_notify(sdata, link, changed); - if (!sdata->vif.bss_conf.nontransmitted && sdata->vif.mbssid_tx_vif) { - struct ieee80211_sub_if_data *child; + if (!link->conf->nontransmitted && + rcu_access_pointer(link->conf->tx_bss_conf)) { + struct ieee80211_link_data *tmp; - list_for_each_entry(child, &sdata->local->interfaces, list) { - if (child != sdata && child->vif.mbssid_tx_vif == &sdata->vif) { - child->vif.bss_conf.he_bss_color.color = color; - child->vif.bss_conf.he_bss_color.enabled = enable; - ieee80211_link_info_change_notify(child, - &child->deflink, - BSS_CHANGED_HE_BSS_COLOR); - } + for_each_sdata_link(sdata->local, tmp) { + if (tmp->sdata == sdata || + rcu_access_pointer(tmp->conf->tx_bss_conf) != link->conf) + continue; + + tmp->conf->he_bss_color.color = color; + tmp->conf->he_bss_color.enabled = enable; + ieee80211_link_info_change_notify(tmp->sdata, tmp, + BSS_CHANGED_HE_BSS_COLOR); } } } @@ -5172,6 +5205,25 @@ ieee80211_set_ttlm(struct wiphy *wiphy, struct net_device *dev, return ieee80211_req_neg_ttlm(sdata, params); } +static int +ieee80211_assoc_ml_reconf(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_ml_reconf_req *req) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + lockdep_assert_wiphy(sdata->local->hw.wiphy); + + return ieee80211_mgd_assoc_ml_reconf(sdata, req); +} + +static int +ieee80211_set_epcs(struct wiphy *wiphy, struct net_device *dev, bool enable) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + return ieee80211_mgd_set_epcs(sdata, enable); +} + const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -5286,4 +5338,6 @@ const struct cfg80211_ops mac80211_config_ops = { .set_hw_timestamp = ieee80211_set_hw_timestamp, .set_ttlm = ieee80211_set_ttlm, .get_radio_mask = ieee80211_get_radio_mask, + .assoc_ml_reconf = ieee80211_assoc_ml_reconf, + 
.set_epcs = ieee80211_set_epcs, }; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index a442cb667520..3aaf5abf1acc 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * mac80211 - channel management - * Copyright 2020 - 2024 Intel Corporation + * Copyright 2020 - 2025 Intel Corporation */ #include <linux/nl80211.h> @@ -247,6 +247,13 @@ static enum nl80211_chan_width ieee80211_get_sta_bw(struct sta_info *sta, if (!link_sta) return NL80211_CHAN_WIDTH_20_NOHT; + /* + * We assume that TX/RX might be asymmetric (so e.g. VHT operating + * mode notification changes what a STA wants to receive, but not + * necessarily what it will transmit to us), and therefore use the + * capabilities here. Calling it RX bandwidth capability is a bit + * wrong though, since capabilities are in fact symmetric. + */ width = ieee80211_sta_cap_rx_bw(link_sta); switch (width) { @@ -2124,6 +2131,9 @@ void ieee80211_link_release_channel(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + return; + lockdep_assert_wiphy(sdata->local->hw.wiphy); if (rcu_access_pointer(link->conf->chanctx_conf)) @@ -2171,3 +2181,21 @@ void ieee80211_iter_chan_contexts_atomic( rcu_read_unlock(); } EXPORT_SYMBOL_GPL(ieee80211_iter_chan_contexts_atomic); + +void ieee80211_iter_chan_contexts_mtx( + struct ieee80211_hw *hw, + void (*iter)(struct ieee80211_hw *hw, + struct ieee80211_chanctx_conf *chanctx_conf, + void *data), + void *iter_data) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_chanctx *ctx; + + lockdep_assert_wiphy(hw->wiphy); + + list_for_each_entry(ctx, &local->chanctx_list, list) + if (ctx->driver_present) + iter(hw, &ctx->conf, iter_data); +} +EXPORT_SYMBOL_GPL(ieee80211_iter_chan_contexts_mtx); diff --git a/net/mac80211/debug.h b/net/mac80211/debug.h index 35a8ba25fa57..5b81998cb0c9 100644 --- a/net/mac80211/debug.h +++ b/net/mac80211/debug.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions - * Copyright (C) 2022 - 2023 Intel Corporation + * Copyright (C) 2022 - 2024 Intel Corporation */ #ifndef __MAC80211_DEBUG_H #define __MAC80211_DEBUG_H @@ -152,6 +152,14 @@ do { \ else \ _sdata_err((link)->sdata, fmt, ##__VA_ARGS__); \ } while (0) +#define link_id_info(sdata, link_id, fmt, ...) \ + do { \ + if (ieee80211_vif_is_mld(&sdata->vif)) \ + _sdata_info(sdata, "[link %d] " fmt, link_id, \ + ##__VA_ARGS__); \ + else \ + _sdata_info(sdata, fmt, ##__VA_ARGS__); \ + } while (0) #define _link_id_dbg(print, sdata, link_id, fmt, ...) 
\ do { \ if (ieee80211_vif_is_mld(&(sdata)->vif)) \ diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index be2e486907f9..69e03630f64c 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -284,7 +284,8 @@ static ssize_t aql_txq_limit_write(struct file *file, q_limit_low_old = local->aql_txq_limit_low[ac]; q_limit_high_old = local->aql_txq_limit_high[ac]; - wiphy_lock(local->hw.wiphy); + guard(wiphy)(local->hw.wiphy); + local->aql_txq_limit_low[ac] = q_limit_low; local->aql_txq_limit_high[ac] = q_limit_high; @@ -296,7 +297,6 @@ static ssize_t aql_txq_limit_write(struct file *file, sta->airtime[ac].aql_limit_high = q_limit_high; } } - wiphy_unlock(local->hw.wiphy); return count; } @@ -492,6 +492,7 @@ static const char *hw_flag_names[] = { FLAG(DISALLOW_PUNCTURING), FLAG(DISALLOW_PUNCTURING_5GHZ), FLAG(HANDLES_QUIET_CSA), + FLAG(STRICT), #undef FLAG }; @@ -524,6 +525,46 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf, return rv; } +static ssize_t hwflags_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct ieee80211_local *local = file->private_data; + char buf[100]; + int val; + + if (count >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(buf, user_buf, count)) + return -EFAULT; + + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; + + if (sscanf(buf, "strict=%d", &val) == 1) { + switch (val) { + case 0: + __clear_bit(IEEE80211_HW_STRICT, local->hw.flags); + return count; + case 1: + ieee80211_hw_set(&local->hw, STRICT); + return count; + default: + return -EINVAL; + } + } + + return -EINVAL; +} + +static const struct file_operations hwflags_ops = { + .open = simple_open, + .read = hwflags_read, + .write = hwflags_write, +}; + static ssize_t misc_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { @@ -574,7 +615,6 @@ static ssize_t queues_read(struct file *file, char __user *user_buf, return simple_read_from_buffer(user_buf, count, ppos, buf, res); } -DEBUGFS_READONLY_FILE_OPS(hwflags); DEBUGFS_READONLY_FILE_OPS(queues); DEBUGFS_READONLY_FILE_OPS(misc); @@ -651,7 +691,7 @@ void debugfs_hw_add(struct ieee80211_local *local) #ifdef CONFIG_PM DEBUGFS_ADD_MODE(reset, 0200); #endif - DEBUGFS_ADD(hwflags); + DEBUGFS_ADD_MODE(hwflags, 0600); DEBUGFS_ADD(user_power); DEBUGFS_ADD(power); DEBUGFS_ADD(hw_conf); diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index b3a64edea0f2..117f58af5ff9 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -402,25 +402,6 @@ void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata) } } -void ieee80211_debugfs_key_add_mgmt_default(struct ieee80211_sub_if_data *sdata) -{ - char buf[50]; - struct ieee80211_key *key; - - if (!sdata->vif.debugfs_dir) - return; - - key = wiphy_dereference(sdata->local->hw.wiphy, - sdata->deflink.default_mgmt_key); - if (key) { - sprintf(buf, "../keys/%d", key->debugfs.cnt); - sdata->debugfs.default_mgmt_key = - debugfs_create_symlink("default_mgmt_key", - sdata->vif.debugfs_dir, buf); - } else - ieee80211_debugfs_key_remove_mgmt_default(sdata); -} - void ieee80211_debugfs_key_remove_mgmt_default(struct ieee80211_sub_if_data *sdata) { if (!sdata) @@ -431,27 +412,6 @@ void ieee80211_debugfs_key_remove_mgmt_default(struct ieee80211_sub_if_data *sda } void -ieee80211_debugfs_key_add_beacon_default(struct ieee80211_sub_if_data *sdata) -{ - char buf[50]; - struct ieee80211_key *key; - - if (!sdata->vif.debugfs_dir) -
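/*
 * Editor's note (illustrative, not part of the patch): hwflags is now
 * writable (mode 0600), so strict mode can be toggled at runtime,
 * typically with something like
 *   echo strict=1 > /sys/kernel/debug/ieee80211/phy0/hwflags
 * where "strict=1" sets the STRICT hardware flag and "strict=0" clears
 * it again. The flag is then read via ieee80211_hw_check(), as in the
 * mlme.c hunk later in this patch:
 *
 *	if (ieee80211_hw_check(&sdata->local->hw, STRICT))
 *		ignore_ht_channel_mismatch = false;
 */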
return; - - key = wiphy_dereference(sdata->local->hw.wiphy, - sdata->deflink.default_beacon_key); - if (key) { - sprintf(buf, "../keys/%d", key->debugfs.cnt); - sdata->debugfs.default_beacon_key = - debugfs_create_symlink("default_beacon_key", - sdata->vif.debugfs_dir, buf); - } else { - ieee80211_debugfs_key_remove_beacon_default(sdata); - } -} - -void ieee80211_debugfs_key_remove_beacon_default(struct ieee80211_sub_if_data *sdata) { if (!sdata) @@ -460,10 +420,3 @@ ieee80211_debugfs_key_remove_beacon_default(struct ieee80211_sub_if_data *sdata) debugfs_remove(sdata->debugfs.default_beacon_key); sdata->debugfs.default_beacon_key = NULL; } - -void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key, - struct sta_info *sta) -{ - debugfs_remove(key->debugfs.stalink); - key->debugfs.stalink = NULL; -} diff --git a/net/mac80211/debugfs_key.h b/net/mac80211/debugfs_key.h index af7cf495f8d1..e17a48d5c6cc 100644 --- a/net/mac80211/debugfs_key.h +++ b/net/mac80211/debugfs_key.h @@ -6,16 +6,10 @@ void ieee80211_debugfs_key_add(struct ieee80211_key *key); void ieee80211_debugfs_key_remove(struct ieee80211_key *key); void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata); -void ieee80211_debugfs_key_add_mgmt_default( - struct ieee80211_sub_if_data *sdata); void ieee80211_debugfs_key_remove_mgmt_default( struct ieee80211_sub_if_data *sdata); -void ieee80211_debugfs_key_add_beacon_default( - struct ieee80211_sub_if_data *sdata); void ieee80211_debugfs_key_remove_beacon_default( struct ieee80211_sub_if_data *sdata); -void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key, - struct sta_info *sta); #else static inline void ieee80211_debugfs_key_add(struct ieee80211_key *key) {} @@ -24,21 +18,12 @@ static inline void ieee80211_debugfs_key_remove(struct ieee80211_key *key) static inline void ieee80211_debugfs_key_update_default( struct ieee80211_sub_if_data *sdata) {} -static inline void ieee80211_debugfs_key_add_mgmt_default( - struct ieee80211_sub_if_data *sdata) -{} static inline void ieee80211_debugfs_key_remove_mgmt_default( struct ieee80211_sub_if_data *sdata) {} -static inline void ieee80211_debugfs_key_add_beacon_default( - struct ieee80211_sub_if_data *sdata) -{} static inline void ieee80211_debugfs_key_remove_beacon_default( struct ieee80211_sub_if_data *sdata) {} -static inline void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key, - struct sta_info *sta) -{} #endif #endif /* __MAC80211_DEBUGFS_KEY_H */ diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index a9bc2fd59f55..54c479910d05 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -727,7 +727,7 @@ static ssize_t ieee80211_if_parse_active_links(struct ieee80211_sub_if_data *sda { u16 active_links; - if (kstrtou16(buf, 0, &active_links)) + if (kstrtou16(buf, 0, &active_links) || !active_links) return -EINVAL; return ieee80211_set_active_links(&sdata->vif, active_links) ?: buflen; @@ -1025,16 +1025,7 @@ void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata) void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata) { - struct dentry *dir; - char buf[10 + IFNAMSIZ]; - - dir = sdata->vif.debugfs_dir; - - if (IS_ERR_OR_NULL(dir)) - return; - - sprintf(buf, "netdev:%s", sdata->name); - debugfs_rename(dir->d_parent, dir, dir->d_parent, buf); + debugfs_change_name(sdata->vif.debugfs_dir, "netdev:%s", sdata->name); } void ieee80211_debugfs_recreate_netdev(struct ieee80211_sub_if_data *sdata, diff --git 
a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index a67a9d316008..49061bd4151b 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -152,12 +152,6 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, p += scnprintf(p, bufsz + buf - p, - "target %uus interval %uus ecn %s\n", - codel_time_to_us(sta->cparams.target), - codel_time_to_us(sta->cparams.interval), - sta->cparams.ecn ? "yes" : "no"); - p += scnprintf(p, - bufsz + buf - p, "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets flags\n"); for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { @@ -457,11 +451,12 @@ static ssize_t link_sta_addr_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct link_sta_info *link_sta = file->private_data; - u8 mac[3 * ETH_ALEN + 1]; + u8 mac[MAC_ADDR_STR_LEN + 2]; snprintf(mac, sizeof(mac), "%pM\n", link_sta->pub->addr); - return simple_read_from_buffer(userbuf, count, ppos, mac, 3 * ETH_ALEN); + return simple_read_from_buffer(userbuf, count, ppos, mac, + MAC_ADDR_STR_LEN + 1); } LINK_STA_OPS(addr); @@ -1240,7 +1235,7 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; struct dentry *stations_dir = sta->sdata->debugfs.subdir_stations; - u8 mac[3*ETH_ALEN]; + u8 mac[MAC_ADDR_STR_LEN + 1]; if (!stations_dir) return; diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index 299d38e9e863..35349a7f16cb 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -116,8 +116,14 @@ void drv_remove_interface(struct ieee80211_local *local, sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER; - /* Remove driver debugfs entries */ - ieee80211_debugfs_recreate_netdev(sdata, sdata->vif.valid_links); + /* + * Remove driver debugfs entries. + * The virtual monitor interface doesn't get a debugfs + * entry, so it's exempt here. + */ + if (sdata != rcu_access_pointer(local->monitor_sdata)) + ieee80211_debugfs_recreate_netdev(sdata, + sdata->vif.valid_links); trace_drv_remove_interface(local, sdata); local->ops->remove_interface(&local->hw, &sdata->vif); diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index edd1e4d4ad9d..307587c8a003 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016 Intel Deutschland GmbH -* Copyright (C) 2018-2019, 2021-2024 Intel Corporation +* Copyright (C) 2018-2019, 2021-2025 Intel Corporation */ #ifndef __MAC80211_DRIVER_OPS @@ -724,6 +724,9 @@ static inline void drv_flush_sta(struct ieee80211_local *local, if (sdata && !check_sdata_in_driver(sdata)) return; + if (!sta->uploaded) + return; + trace_drv_flush_sta(local, sdata, &sta->sta); if (local->ops->flush_sta) local->ops->flush_sta(&local->hw, &sdata->vif, &sta->sta); @@ -952,6 +955,7 @@ static inline void drv_mgd_complete_tx(struct ieee80211_local *local, return; WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION); + info->link_id = info->link_id < 0 ? 
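/*
 * Editor's worked example (not part of the patch): "%pM" formats a MAC
 * address as "xx:xx:xx:xx:xx:xx", i.e. MAC_ADDR_STR_LEN == 17 ==
 * 3 * ETH_ALEN - 1 characters. link_sta_addr_read() therefore needs
 * MAC_ADDR_STR_LEN + 2 buffer bytes (the '\n' plus the NUL written by
 * snprintf()) while handing MAC_ADDR_STR_LEN + 1 bytes (address plus
 * newline) to simple_read_from_buffer(), replacing the old 3 * ETH_ALEN
 * magic values.
 */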
0 : info->link_id; trace_drv_mgd_complete_tx(local, sdata, info->duration, info->subtype, info->success); if (local->ops->mgd_complete_tx) @@ -1273,7 +1277,8 @@ static inline u32 drv_get_expected_throughput(struct ieee80211_local *local, } static inline int drv_get_txpower(struct ieee80211_local *local, - struct ieee80211_sub_if_data *sdata, int *dbm) + struct ieee80211_sub_if_data *sdata, + unsigned int link_id, int *dbm) { int ret; @@ -1283,8 +1288,8 @@ static inline int drv_get_txpower(struct ieee80211_local *local, if (!local->ops->get_txpower) return -EOPNOTSUPP; - ret = local->ops->get_txpower(&local->hw, &sdata->vif, dbm); - trace_drv_get_txpower(local, sdata, *dbm, ret); + ret = local->ops->get_txpower(&local->hw, &sdata->vif, link_id, dbm); + trace_drv_get_txpower(local, sdata, link_id, *dbm, ret); return ret; } diff --git a/net/mac80211/drop.h b/net/mac80211/drop.h index 59e3ec4dc960..eb9ab310f91c 100644 --- a/net/mac80211/drop.h +++ b/net/mac80211/drop.h @@ -11,12 +11,6 @@ typedef unsigned int __bitwise ieee80211_rx_result; -#define MAC80211_DROP_REASONS_MONITOR(R) \ - R(RX_DROP_M_UNEXPECTED_4ADDR_FRAME) \ - R(RX_DROP_M_BAD_BCN_KEYIDX) \ - R(RX_DROP_M_BAD_MGMT_KEYIDX) \ -/* this line for the trailing \ - add before this */ - #define MAC80211_DROP_REASONS_UNUSABLE(R) \ /* 0x00 == ___RX_DROP_UNUSABLE */ \ R(RX_DROP_U_MIC_FAIL) \ @@ -66,6 +60,10 @@ typedef unsigned int __bitwise ieee80211_rx_result; R(RX_DROP_U_UNEXPECTED_STA_4ADDR) \ R(RX_DROP_U_UNEXPECTED_VLAN_MCAST) \ R(RX_DROP_U_NOT_PORT_CONTROL) \ + R(RX_DROP_U_UNEXPECTED_4ADDR_FRAME) \ + R(RX_DROP_U_BAD_BCN_KEYIDX) \ + /* 0x30 */ \ + R(RX_DROP_U_BAD_MGMT_KEYIDX) \ R(RX_DROP_U_UNKNOWN_ACTION_REJECTED) \ /* this line for the trailing \ - add before this */ @@ -78,10 +76,6 @@ enum ___mac80211_drop_reason { ___RX_QUEUED = SKB_NOT_DROPPED_YET, #define ENUM(x) ___ ## x, - ___RX_DROP_MONITOR = SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR << - SKB_DROP_REASON_SUBSYS_SHIFT, - MAC80211_DROP_REASONS_MONITOR(ENUM) - ___RX_DROP_UNUSABLE = SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE << SKB_DROP_REASON_SUBSYS_SHIFT, MAC80211_DROP_REASONS_UNUSABLE(ENUM) @@ -89,11 +83,10 @@ enum ___mac80211_drop_reason { }; enum mac80211_drop_reason { - RX_CONTINUE = (__force ieee80211_rx_result)___RX_CONTINUE, - RX_QUEUED = (__force ieee80211_rx_result)___RX_QUEUED, - RX_DROP_MONITOR = (__force ieee80211_rx_result)___RX_DROP_MONITOR, + RX_CONTINUE = (__force ieee80211_rx_result)___RX_CONTINUE, + RX_QUEUED = (__force ieee80211_rx_result)___RX_QUEUED, + RX_DROP = (__force ieee80211_rx_result)___RX_DROP_UNUSABLE, #define DEF(x) x = (__force ieee80211_rx_result)___ ## x, - MAC80211_DROP_REASONS_MONITOR(DEF) MAC80211_DROP_REASONS_UNUSABLE(DEF) #undef DEF }; diff --git a/net/mac80211/eht.c b/net/mac80211/eht.c index 7a3116c36df9..fd41046e3b68 100644 --- a/net/mac80211/eht.c +++ b/net/mac80211/eht.c @@ -2,7 +2,7 @@ /* * EHT handling * - * Copyright(c) 2021-2024 Intel Corporation + * Copyright(c) 2021-2025 Intel Corporation */ #include "ieee80211_i.h" @@ -76,6 +76,13 @@ ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata, link_sta->cur_max_bandwidth = ieee80211_sta_cap_rx_bw(link_sta); link_sta->pub->bandwidth = ieee80211_sta_cur_vht_bw(link_sta); + /* + * The MPDU length bits are reserved on all but 2.4 GHz and get set via + * VHT (5 GHz) or HE (6 GHz) capabilities. 
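/*
 * Editor's note (illustrative): mac80211 drop reasons encode their
 * subsystem in the high bits, so the enumerators above count up from
 * SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE << SKB_DROP_REASON_SUBSYS_SHIFT,
 * e.g. RX_DROP_U_MIC_FAIL is that base plus 1. The "0x30" comment
 * appears to mark the offset of RX_DROP_U_BAD_MGMT_KEYIDX within the
 * subsystem now that the former monitor-only reasons are folded into
 * the unusable list.
 */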
+ */ + if (sband->band != NL80211_BAND_2GHZ) + return; + switch (u8_get_bits(eht_cap->eht_cap_elem.mac_cap_info[0], IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_MASK)) { case IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_11454: diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c index 99f6174a9d69..0397755a3bd1 100644 --- a/net/mac80211/ethtool.c +++ b/net/mac80211/ethtool.c @@ -19,16 +19,13 @@ static int ieee80211_set_ringparam(struct net_device *dev, struct netlink_ext_ack *extack) { struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy); - int ret; if (rp->rx_mini_pending != 0 || rp->rx_jumbo_pending != 0) return -EINVAL; - wiphy_lock(local->hw.wiphy); - ret = drv_set_ringparam(local, rp->tx_pending, rp->rx_pending); - wiphy_unlock(local->hw.wiphy); + guard(wiphy)(local->hw.wiphy); - return ret; + return drv_set_ringparam(local, rp->tx_pending, rp->rx_pending); } static void ieee80211_get_ringparam(struct net_device *dev, @@ -40,10 +37,10 @@ static void ieee80211_get_ringparam(struct net_device *dev, memset(rp, 0, sizeof(*rp)); - wiphy_lock(local->hw.wiphy); + guard(wiphy)(local->hw.wiphy); + drv_get_ringparam(local, &rp->tx_pending, &rp->tx_max_pending, &rp->rx_pending, &rp->rx_max_pending); - wiphy_unlock(local->hw.wiphy); } static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = { @@ -109,7 +106,7 @@ static void ieee80211_get_stats(struct net_device *dev, * network device. */ - wiphy_lock(local->hw.wiphy); + guard(wiphy)(local->hw.wiphy); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sta = sta_info_get_bss(sdata, sdata->deflink.u.mgd.bssid); @@ -160,6 +157,10 @@ do_survey: chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (chanctx_conf) channel = chanctx_conf->def.chan; + else if (local->open_count > 0 && + local->open_count == local->virt_monitors && + sdata->vif.type == NL80211_IFTYPE_MONITOR) + channel = local->monitor_chanreq.oper.chan; else channel = NULL; rcu_read_unlock(); @@ -205,13 +206,10 @@ do_survey: else data[i++] = -1LL; - if (WARN_ON(i != STA_STATS_LEN)) { - wiphy_unlock(local->hw.wiphy); + if (WARN_ON(i != STA_STATS_LEN)) return; - } drv_get_et_stats(sdata, stats, &(data[STA_STATS_LEN])); - wiphy_unlock(local->hw.wiphy); } static void ieee80211_get_strings(struct net_device *dev, u32 sset, u8 *data) diff --git a/net/mac80211/he.c b/net/mac80211/he.c index ecbb042dd043..5792ef77e986 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -3,10 +3,11 @@ * HE handling * * Copyright(c) 2017 Intel Deutschland GmbH - * Copyright(c) 2019 - 2023 Intel Corporation + * Copyright(c) 2019 - 2024 Intel Corporation */ #include "ieee80211_i.h" +#include "rate.h" static void ieee80211_update_from_he_6ghz_capa(const struct ieee80211_he_6ghz_capa *he_6ghz_capa, @@ -248,3 +249,119 @@ ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif, he_obss_pd->enable = true; } } + +static void ieee80211_link_sta_rc_update_omi(struct ieee80211_link_data *link, + struct link_sta_info *link_sta) +{ + struct ieee80211_sub_if_data *sdata = link->sdata; + struct ieee80211_supported_band *sband; + enum ieee80211_sta_rx_bandwidth new_bw; + enum nl80211_band band; + + band = link->conf->chanreq.oper.chan->band; + sband = sdata->local->hw.wiphy->bands[band]; + + new_bw = ieee80211_sta_cur_vht_bw(link_sta); + if (link_sta->pub->bandwidth == new_bw) + return; + + link_sta->pub->bandwidth = new_bw; + rate_control_rate_update(sdata->local, sband, link_sta, + IEEE80211_RC_BW_CHANGED); +} + +bool ieee80211_prepare_rx_omi_bw(struct ieee80211_link_sta *pub_link_sta, + 
enum ieee80211_sta_rx_bandwidth bw) +{ + struct sta_info *sta = container_of(pub_link_sta->sta, + struct sta_info, sta); + struct ieee80211_local *local = sta->sdata->local; + struct link_sta_info *link_sta = + sdata_dereference(sta->link[pub_link_sta->link_id], sta->sdata); + struct ieee80211_link_data *link = + sdata_dereference(sta->sdata->link[pub_link_sta->link_id], + sta->sdata); + struct ieee80211_chanctx_conf *conf; + struct ieee80211_chanctx *chanctx; + bool ret; + + if (WARN_ON(!link || !link_sta || link_sta->pub != pub_link_sta)) + return false; + + conf = sdata_dereference(link->conf->chanctx_conf, sta->sdata); + if (WARN_ON(!conf)) + return false; + + trace_api_prepare_rx_omi_bw(local, sta->sdata, link_sta, bw); + + chanctx = container_of(conf, typeof(*chanctx), conf); + + if (link_sta->rx_omi_bw_staging == bw) { + ret = false; + goto trace; + } + + /* must call this API in pairs */ + if (WARN_ON(link_sta->rx_omi_bw_tx != link_sta->rx_omi_bw_staging || + link_sta->rx_omi_bw_rx != link_sta->rx_omi_bw_staging)) { + ret = false; + goto trace; + } + + if (bw < link_sta->rx_omi_bw_staging) { + link_sta->rx_omi_bw_tx = bw; + ieee80211_link_sta_rc_update_omi(link, link_sta); + } else { + link_sta->rx_omi_bw_rx = bw; + ieee80211_recalc_chanctx_min_def(local, chanctx, NULL, false); + } + + link_sta->rx_omi_bw_staging = bw; + ret = true; +trace: + trace_api_return_bool(local, ret); + return ret; +} +EXPORT_SYMBOL_GPL(ieee80211_prepare_rx_omi_bw); + +void ieee80211_finalize_rx_omi_bw(struct ieee80211_link_sta *pub_link_sta) +{ + struct sta_info *sta = container_of(pub_link_sta->sta, + struct sta_info, sta); + struct ieee80211_local *local = sta->sdata->local; + struct link_sta_info *link_sta = + sdata_dereference(sta->link[pub_link_sta->link_id], sta->sdata); + struct ieee80211_link_data *link = + sdata_dereference(sta->sdata->link[pub_link_sta->link_id], + sta->sdata); + struct ieee80211_chanctx_conf *conf; + struct ieee80211_chanctx *chanctx; + + if (WARN_ON(!link || !link_sta || link_sta->pub != pub_link_sta)) + return; + + conf = sdata_dereference(link->conf->chanctx_conf, sta->sdata); + if (WARN_ON(!conf)) + return; + + trace_api_finalize_rx_omi_bw(local, sta->sdata, link_sta); + + chanctx = container_of(conf, typeof(*chanctx), conf); + + if (link_sta->rx_omi_bw_tx != link_sta->rx_omi_bw_staging) { + /* rate control in finalize only when widening bandwidth */ + WARN_ON(link_sta->rx_omi_bw_tx > link_sta->rx_omi_bw_staging); + link_sta->rx_omi_bw_tx = link_sta->rx_omi_bw_staging; + ieee80211_link_sta_rc_update_omi(link, link_sta); + } + + if (link_sta->rx_omi_bw_rx != link_sta->rx_omi_bw_staging) { + /* channel context in finalize only when narrowing bandwidth */ + WARN_ON(link_sta->rx_omi_bw_rx < link_sta->rx_omi_bw_staging); + link_sta->rx_omi_bw_rx = link_sta->rx_omi_bw_staging; + ieee80211_recalc_chanctx_min_def(local, chanctx, NULL, false); + } + + trace_api_return_void(local); +} +EXPORT_SYMBOL_GPL(ieee80211_finalize_rx_omi_bw); diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index a1b4178deccf..a6e7b7ba6a01 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -48,7 +48,7 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt; u8 *pos; struct ieee80211_supported_band *sband; - u32 rate_flags, rates = 0, rates_added = 0; + u32 rates = 0, rates_added = 0; struct beacon_data *presp; int frame_len; @@ -90,14 +90,11 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, pos += ifibss->ssid_len; sband = 
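/*
 * Editor's sketch (assumption, not part of the patch): the OMI helpers
 * above are called as a pair around sending the HE operating-mode
 * notification to the peer. When narrowing, prepare updates rate
 * control (TX) immediately and finalize shrinks the chanctx (RX);
 * widening is the mirror image. The driver function below is
 * hypothetical and frame transmission is elided.
 */
static void drv_narrow_peer_rx_bw(struct ieee80211_link_sta *link_sta)
{
	if (!ieee80211_prepare_rx_omi_bw(link_sta, IEEE80211_STA_RX_BW_80))
		return;	/* nothing to do, bandwidth unchanged */

	/* ... transmit the OM control frame and wait for it ... */

	ieee80211_finalize_rx_omi_bw(link_sta);
}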
local->hw.wiphy->bands[chandef->chan->band]; - rate_flags = ieee80211_chandef_rate_flags(chandef); rates_n = 0; if (have_higher_than_11mbit) *have_higher_than_11mbit = false; for (i = 0; i < sband->n_bitrates; i++) { - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - continue; if (sband->bitrates[i].bitrate > 110 && have_higher_than_11mbit) *have_higher_than_11mbit = true; @@ -245,6 +242,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, sdata->vif.cfg.ibss_creator = false; sdata->vif.bss_conf.enable_beacon = false; netif_carrier_off(sdata->dev); + synchronize_net(); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IBSS | BSS_CHANGED_BEACON_ENABLED); @@ -394,7 +392,6 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, const struct cfg80211_bss_ies *ies; enum nl80211_channel_type chan_type; u64 tsf; - u32 rate_flags; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -428,7 +425,6 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, } sband = sdata->local->hw.wiphy->bands[cbss->channel->band]; - rate_flags = ieee80211_chandef_rate_flags(&sdata->u.ibss.chandef); basic_rates = 0; @@ -438,9 +434,6 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, for (j = 0; j < sband->n_bitrates; j++) { int brate; - if ((rate_flags & sband->bitrates[j].flags) - != rate_flags) - continue; brate = DIV_ROUND_UP(sband->bitrates[j].bitrate, 5); if (brate == rate) { @@ -1716,12 +1709,9 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, struct cfg80211_ibss_params *params) { u64 changed = 0; - u32 rate_flags; - struct ieee80211_supported_band *sband; enum ieee80211_chanctx_mode chanmode; struct ieee80211_local *local = sdata->local; int radar_detect_width = 0; - int i; int ret; lockdep_assert_wiphy(local->hw.wiphy); @@ -1764,12 +1754,6 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, sdata->u.ibss.last_scan_completed = jiffies; /* fix basic_rates if channel does not support these rates */ - rate_flags = ieee80211_chandef_rate_flags(¶ms->chandef); - sband = local->hw.wiphy->bands[params->chandef.chan->band]; - for (i = 0; i < sband->n_bitrates; i++) { - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - sdata->u.ibss.basic_rates &= ~BIT(i); - } memcpy(sdata->vif.bss_conf.mcast_rate, params->mcast_rate, sizeof(params->mcast_rate)); @@ -1826,8 +1810,8 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; - ieee80211_ibss_disconnect(sdata); ifibss->ssid_len = 0; + ieee80211_ibss_disconnect(sdata); eth_zero_addr(ifibss->bssid); /* remove beacon */ @@ -1843,7 +1827,7 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) skb_queue_purge(&sdata->skb_queue); - del_timer_sync(&sdata->u.ibss.timer); + timer_delete_sync(&sdata->u.ibss.timer); return 0; } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9f0db39b28ff..30809f0b35f7 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ #ifndef IEEE80211_I_H @@ -200,7 +200,6 @@ enum ieee80211_packet_rx_flags { /** * enum ieee80211_rx_flags - RX data flags * - * @IEEE80211_RX_CMNTR: received on cooked monitor already * 
@IEEE80211_RX_BEACON_REPORTED: This frame was already reported * to cfg80211_report_obss_beacon(). * @@ -208,8 +207,7 @@ enum ieee80211_packet_rx_flags { * for a single frame. */ enum ieee80211_rx_flags { - IEEE80211_RX_CMNTR = BIT(0), - IEEE80211_RX_BEACON_REPORTED = BIT(1), + IEEE80211_RX_BEACON_REPORTED = BIT(0), }; struct ieee80211_rx_data { @@ -404,6 +402,8 @@ struct ieee80211_mgd_auth_data { int tries; u16 algorithm, expected_transaction; + unsigned long userspace_selectors[BITS_TO_LONGS(128)]; + u8 key[WLAN_KEY_LEN_WEP104]; u8 key_len, key_idx; bool done, waiting; @@ -458,7 +458,9 @@ struct ieee80211_mgd_assoc_data { bool s1g; bool spp_amsdu; - unsigned int assoc_link_id; + s8 assoc_link_id; + + __le16 ext_mld_capa_ops; u8 fils_nonces[2 * FILS_NONCE_LEN]; u8 fils_kek[FILS_MAX_KEK_LEN]; @@ -520,6 +522,8 @@ struct ieee80211_if_managed { struct ieee80211_mgd_auth_data *auth_data; struct ieee80211_mgd_assoc_data *assoc_data; + unsigned long userspace_selectors[BITS_TO_LONGS(128)]; + bool powersave; /* powersave requested for this iface */ bool broken_ap; /* AP is broken -- turn off powersave */ @@ -602,6 +606,21 @@ struct ieee80211_if_managed { /* dialog token enumerator for neg TTLM request */ u8 dialog_token_alloc; struct wiphy_delayed_work neg_ttlm_timeout_work; + + /* Locally initiated multi-link reconfiguration */ + struct { + struct ieee80211_mgd_assoc_data *add_links_data; + struct wiphy_delayed_work wk; + u16 removed_links; + u16 added_links; + u8 dialog_token; + } reconf; + + /* Support for epcs */ + struct { + bool enabled; + u8 dialog_token; + } epcs; }; struct ieee80211_if_ibss { @@ -1204,9 +1223,18 @@ struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p) for (int ___link_id = 0; \ ___link_id < ARRAY_SIZE(___sdata->link); \ ___link_id++) \ - if ((_link = wiphy_dereference((local)->hw.wiphy, \ + if ((_link = wiphy_dereference((_local)->hw.wiphy, \ ___sdata->link[___link_id]))) +#define for_each_link_data(sdata, __link) \ + struct ieee80211_sub_if_data *__sdata = sdata; \ + for (int __link_id = 0; \ + __link_id < ARRAY_SIZE((__sdata)->link); __link_id++) \ + if ((!(__sdata)->vif.valid_links || \ + (__sdata)->vif.valid_links & BIT(__link_id)) && \ + ((__link) = sdata_dereference((__sdata)->link[__link_id], \ + (__sdata)))) + static inline int ieee80211_get_mbssid_beacon_len(struct cfg80211_mbssid_elems *elems, struct cfg80211_rnr_elems *rnr_elems, @@ -1367,7 +1395,7 @@ struct ieee80211_local { spinlock_t queue_stop_reason_lock; int open_count; - int monitors, cooked_mntrs, tx_mntrs; + int monitors, virt_monitors, tx_mntrs; /* number of interfaces with corresponding FIF_ flags */ int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll, fif_probe_req; @@ -1479,7 +1507,7 @@ struct ieee80211_local { /* see iface.c */ struct list_head interfaces; - struct list_head mon_list; /* only that are IFF_UP && !cooked */ + struct list_head mon_list; /* only that are IFF_UP */ struct mutex iflist_mtx; /* Scanning and BSS list */ @@ -1749,6 +1777,7 @@ struct ieee802_11_elems { const struct ieee80211_eht_operation *eht_operation; const struct ieee80211_multi_link_elem *ml_basic; const struct ieee80211_multi_link_elem *ml_reconf; + const struct ieee80211_multi_link_elem *ml_epcs; const struct ieee80211_bandwidth_indication *bandwidth_indication; const struct ieee80211_ttlm_elem *ttlm[IEEE80211_TTLM_MAX_CNT]; @@ -1779,6 +1808,7 @@ struct ieee802_11_elems { /* mult-link element can be de-fragmented and thus u8 is not sufficient */ size_t ml_basic_len; size_t 
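/*
 * Editor's note (illustrative): the one-character change above fixes
 * macro-argument capture in for_each_sdata_link(): the parameter is
 * _local, but the body referenced a plain 'local' and silently bound to
 * whatever variable of that name was in scope at the expansion site.
 * The new for_each_link_data() is used as in the AP_VLAN hunk of
 * iface.c later in this patch:
 *
 *	struct ieee80211_link_data *link;
 *
 *	for_each_link_data(sdata, link) {
 *		ieee80211_link_vlan_copy_chanctx(link);
 *	}
 */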
ml_reconf_len; + size_t ml_epcs_len; u8 ttlm_num; @@ -2057,6 +2087,9 @@ static inline void ieee80211_vif_clear_links(struct ieee80211_sub_if_data *sdata ieee80211_vif_set_links(sdata, 0, 0); } +void ieee80211_apvlan_link_setup(struct ieee80211_sub_if_data *sdata); +void ieee80211_apvlan_link_clear(struct ieee80211_sub_if_data *sdata); + /* tx handling */ void ieee80211_clear_tx_pending(struct ieee80211_local *local); void ieee80211_tx_pending(struct tasklet_struct *t); @@ -2075,8 +2108,7 @@ struct sk_buff * ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags); void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, - int retry_count, bool send_to_cooked, - struct ieee80211_tx_status *status); + int retry_count, struct ieee80211_tx_status *status); void ieee80211_check_fast_xmit(struct sta_info *sta); void ieee80211_check_fast_xmit_all(struct ieee80211_local *local); @@ -2109,8 +2141,6 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps, const u8 *da, const u8 *bssid, int link_id); -bool ieee80211_smps_is_restrictive(enum ieee80211_smps_mode smps_mode_old, - enum ieee80211_smps_mode smps_mode_new); void ieee80211_add_addbaext(struct sk_buff *skb, const u8 req_addba_ext_data, u16 buf_size); @@ -2595,7 +2625,7 @@ void ieee80211_add_aid_request_ie(struct ieee80211_sub_if_data *sdata, /* element building in SKBs */ int ieee80211_put_srates_elem(struct sk_buff *skb, const struct ieee80211_supported_band *sband, - u32 basic_rates, u32 rate_flags, u32 masked_rates, + u32 basic_rates, u32 masked_rates, u8 element_id); int ieee80211_put_he_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, @@ -2761,10 +2791,23 @@ void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); int ieee80211_req_neg_ttlm(struct ieee80211_sub_if_data *sdata, struct cfg80211_ttlm_params *params); +void ieee80211_process_ttlm_teardown(struct ieee80211_sub_if_data *sdata); void ieee80211_check_wbrf_support(struct ieee80211_local *local); void ieee80211_add_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef); void ieee80211_remove_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef); +int ieee80211_mgd_set_epcs(struct ieee80211_sub_if_data *sdata, bool enable); +void ieee80211_process_epcs_ena_resp(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len); +void ieee80211_process_epcs_teardown(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len); + +int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, + struct cfg80211_ml_reconf_req *req); + +void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len); +void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata); #if IS_ENABLED(CONFIG_MAC80211_KUNIT_TEST) #define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym) EXPORT_SYMBOL_IF_KUNIT(sym) @@ -2776,6 +2819,13 @@ int ieee80211_calc_chandef_subchan_offset(const struct cfg80211_chan_def *ap, void ieee80211_rearrange_tpe_psd(struct ieee80211_parsed_tpe_psd *psd, const struct cfg80211_chan_def *ap, const struct cfg80211_chan_def *used); +struct ieee802_11_elems * +ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata, + struct ieee80211_conn_settings *conn, + struct cfg80211_bss *cbss, int link_id, + struct 
ieee80211_chan_req *chanreq, + struct cfg80211_chan_def *ap_chandef, + unsigned long *userspace_selectors); #else #define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym) #define VISIBLE_IF_MAC80211_KUNIT static diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 806dffa48ef9..7c27f3cd841c 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -8,7 +8,7 @@ * Copyright 2008, Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (c) 2016 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ #include <linux/slab.h> #include <linux/kernel.h> @@ -295,7 +295,6 @@ static int ieee80211_change_mac(struct net_device *dev, void *addr) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; - int ret; /* * This happens during unregistration if there's a bond device @@ -305,11 +304,9 @@ static int ieee80211_change_mac(struct net_device *dev, void *addr) if (!dev->ieee80211_ptr->registered) return 0; - wiphy_lock(local->hw.wiphy); - ret = _ieee80211_change_mac(sdata, addr); - wiphy_unlock(local->hw.wiphy); + guard(wiphy)(local->hw.wiphy); - return ret; + return _ieee80211_change_mac(sdata, addr); } static inline int identical_mac_addr_allowed(int type1, int type2) @@ -445,16 +442,13 @@ static int ieee80211_open(struct net_device *dev) if (!is_valid_ether_addr(dev->dev_addr)) return -EADDRNOTAVAIL; - wiphy_lock(sdata->local->hw.wiphy); + guard(wiphy)(sdata->local->hw.wiphy); + err = ieee80211_check_concurrent_iface(sdata, sdata->vif.type); if (err) - goto out; - - err = ieee80211_do_open(&sdata->wdev, true); -out: - wiphy_unlock(sdata->local->hw.wiphy); + return err; - return err; + return ieee80211_do_open(&sdata->wdev, true); } static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_down) @@ -489,10 +483,11 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do ieee80211_ibss_stop(sdata); break; case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) - break; list_del_rcu(&sdata->u.mntr.list); break; + case NL80211_IFTYPE_AP_VLAN: + ieee80211_apvlan_link_clear(sdata); + break; default: break; } @@ -534,7 +529,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do netif_addr_unlock_bh(sdata->dev); } - del_timer_sync(&local->dynamic_ps_timer); + timer_delete_sync(&local->dynamic_ps_timer); wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work); WARN(ieee80211_vif_is_mld(&sdata->vif), @@ -590,18 +585,19 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do /* no need to tell driver */ break; case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) { - local->cooked_mntrs--; - break; - } - local->monitors--; - if (local->monitors == 0) { - local->hw.conf.flags &= ~IEEE80211_CONF_MONITOR; - hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; - } - ieee80211_adjust_monitor_flags(sdata, -1); + if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { + + local->virt_monitors--; + if (local->virt_monitors == 0) { + local->hw.conf.flags &= ~IEEE80211_CONF_MONITOR; + hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; + } + + ieee80211_adjust_monitor_flags(sdata, -1); + } break; case NL80211_IFTYPE_NAN: /* clean all the functions */ @@ -666,6 +662,9 @@ static void ieee80211_do_stop(struct 
ieee80211_sub_if_data *sdata, bool going_do if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) ieee80211_txq_remove_vlan(local, sdata); + if (sdata->vif.txq) + ieee80211_txq_purge(sdata->local, to_txq_info(sdata->vif.txq)); + sdata->bss = NULL; if (local->open_count == 0) @@ -692,7 +691,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do case NL80211_IFTYPE_AP_VLAN: break; case NL80211_IFTYPE_MONITOR: - if (local->monitors == 0) + if (local->virt_monitors == 0) ieee80211_del_virtual_monitor(local); ieee80211_recalc_idle(local); @@ -729,34 +728,63 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do ieee80211_configure_filter(local); ieee80211_hw_config(local, hw_reconf_flags); - if (local->monitors == local->open_count) + if (local->virt_monitors == local->open_count) ieee80211_add_virtual_monitor(local); } -static void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata) +void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata) { - struct ieee80211_sub_if_data *tx_sdata, *non_tx_sdata, *tmp_sdata; - struct ieee80211_vif *tx_vif = sdata->vif.mbssid_tx_vif; + struct ieee80211_sub_if_data *tx_sdata; + struct ieee80211_bss_conf *link_conf, *tx_bss_conf; + struct ieee80211_link_data *tx_link, *link; + unsigned int link_id; - if (!tx_vif) - return; + lockdep_assert_wiphy(sdata->local->hw.wiphy); + + /* Check if any of the links of the current sdata belongs to an MBSSID set. */ + for_each_vif_active_link(&sdata->vif, link_conf, link_id) { + tx_bss_conf = sdata_dereference(link_conf->tx_bss_conf, sdata); + if (!tx_bss_conf) + continue; - tx_sdata = vif_to_sdata(tx_vif); - sdata->vif.mbssid_tx_vif = NULL; + tx_sdata = vif_to_sdata(tx_bss_conf->vif); + RCU_INIT_POINTER(link_conf->tx_bss_conf, NULL); - list_for_each_entry_safe(non_tx_sdata, tmp_sdata, - &tx_sdata->local->interfaces, list) { - if (non_tx_sdata != sdata && non_tx_sdata != tx_sdata && - non_tx_sdata->vif.mbssid_tx_vif == tx_vif && - ieee80211_sdata_running(non_tx_sdata)) { - non_tx_sdata->vif.mbssid_tx_vif = NULL; - dev_close(non_tx_sdata->wdev.netdev); + /* If we are not the tx sdata, reset the tx sdata's tx_bss_conf to avoid + * recursion while closing the tx sdata at the end of the outer loop below. + */ + if (sdata != tx_sdata) { + tx_link = sdata_dereference(tx_sdata->link[tx_bss_conf->link_id], + tx_sdata); + if (!tx_link) + continue; + + RCU_INIT_POINTER(tx_link->conf->tx_bss_conf, NULL); } - } - if (sdata != tx_sdata && ieee80211_sdata_running(tx_sdata)) { - tx_sdata->vif.mbssid_tx_vif = NULL; - dev_close(tx_sdata->wdev.netdev); + /* Loop through sdatas to find whether any of their links + * belong to the same MBSSID set as the one getting deleted. + */ + for_each_sdata_link(tx_sdata->local, link) { + struct ieee80211_sub_if_data *link_sdata = link->sdata; + + if (link_sdata == sdata || link_sdata == tx_sdata || + rcu_access_pointer(link->conf->tx_bss_conf) != tx_bss_conf) + continue; + + RCU_INIT_POINTER(link->conf->tx_bss_conf, NULL); + + /* Remove all links of the matching MLD until dynamic link + * removal can be supported.
+ */ + cfg80211_stop_iface(link_sdata->wdev.wiphy, &link_sdata->wdev, + GFP_KERNEL); + } + + /* If we are not tx sdata, remove links of tx sdata and proceed */ + if (sdata != tx_sdata && ieee80211_sdata_running(tx_sdata)) + cfg80211_stop_iface(tx_sdata->wdev.wiphy, + &tx_sdata->wdev, GFP_KERNEL); } } @@ -764,22 +792,26 @@ static int ieee80211_stop(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - /* close dependent VLAN and MBSSID interfaces before locking wiphy */ + /* close dependent VLAN interfaces before locking wiphy */ if (sdata->vif.type == NL80211_IFTYPE_AP) { struct ieee80211_sub_if_data *vlan, *tmpsdata; list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans, u.vlan.list) dev_close(vlan->dev); - - ieee80211_stop_mbssid(sdata); } - wiphy_lock(sdata->local->hw.wiphy); + guard(wiphy)(sdata->local->hw.wiphy); + wiphy_work_cancel(sdata->local->hw.wiphy, &sdata->activate_links_work); + /* Close the dependent MBSSID interfaces with wiphy lock as we may be + * terminating its partner links too in case of MLD. + */ + if (sdata->vif.type == NL80211_IFTYPE_AP) + ieee80211_stop_mbssid(sdata); + ieee80211_do_stop(sdata, true); - wiphy_unlock(sdata->local->hw.wiphy); return 0; } @@ -813,6 +845,9 @@ static void ieee80211_set_multicast_list(struct net_device *dev) */ static void ieee80211_teardown_sdata(struct ieee80211_sub_if_data *sdata) { + if (WARN_ON(!list_empty(&sdata->work.entry))) + wiphy_work_cancel(sdata->local->hw.wiphy, &sdata->work); + /* free extra data */ ieee80211_free_keys(sdata, false); @@ -985,7 +1020,7 @@ static bool ieee80211_set_sdata_offload_flags(struct ieee80211_sub_if_data *sdat local->hw.wiphy->frag_threshold != (u32)-1) flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED; - if (local->monitors) + if (local->virt_monitors) flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED; } else { flags &= ~IEEE80211_OFFLOAD_ENCAP_ENABLED; @@ -995,7 +1030,7 @@ static bool ieee80211_set_sdata_offload_flags(struct ieee80211_sub_if_data *sdat ieee80211_iftype_supports_hdr_offload(sdata->vif.type)) { flags |= IEEE80211_OFFLOAD_DECAP_ENABLED; - if (local->monitors && + if (local->virt_monitors && !ieee80211_hw_check(&local->hw, SUPPORTS_CONC_MON_RX_DECAP)) flags &= ~IEEE80211_OFFLOAD_DECAP_ENABLED; } else { @@ -1212,16 +1247,17 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local) return; } - RCU_INIT_POINTER(local->monitor_sdata, NULL); - mutex_unlock(&local->iflist_mtx); - - synchronize_net(); - + clear_bit(SDATA_STATE_RUNNING, &sdata->state); ieee80211_link_release_channel(&sdata->deflink); if (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) drv_remove_interface(local, sdata); + RCU_INIT_POINTER(local->monitor_sdata, NULL); + mutex_unlock(&local->iflist_mtx); + + synchronize_net(); + kfree(sdata); } @@ -1268,6 +1304,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) sdata->crypto_tx_tailroom_needed_cnt += master->crypto_tx_tailroom_needed_cnt; + ieee80211_apvlan_link_setup(sdata); + break; } case NL80211_IFTYPE_AP: @@ -1324,7 +1362,12 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) case NL80211_IFTYPE_AP_VLAN: /* no need to tell driver, but set carrier and chanctx */ if (sdata->bss->active) { - ieee80211_link_vlan_copy_chanctx(&sdata->deflink); + struct ieee80211_link_data *link; + + for_each_link_data(sdata, link) { + ieee80211_link_vlan_copy_chanctx(link); + } + netif_carrier_on(dev); ieee80211_set_vif_encap_ops(sdata); } else { @@ -1332,28 +1375,27 @@ int ieee80211_do_open(struct wireless_dev 
*wdev, bool coming_up) } break; case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) { - local->cooked_mntrs++; - break; - } - if ((sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) || ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { res = drv_add_interface(local, sdata); if (res) goto err_stop; - } else if (local->monitors == 0 && local->open_count == 0) { - res = ieee80211_add_virtual_monitor(local); - if (res) - goto err_stop; + } else { + if (local->virt_monitors == 0 && local->open_count == 0) { + res = ieee80211_add_virtual_monitor(local); + if (res) + goto err_stop; + } + local->virt_monitors++; + + /* must be before the call to ieee80211_configure_filter */ + if (local->virt_monitors == 1) { + local->hw.conf.flags |= IEEE80211_CONF_MONITOR; + hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; + } } - /* must be before the call to ieee80211_configure_filter */ local->monitors++; - if (local->monitors == 1) { - local->hw.conf.flags |= IEEE80211_CONF_MONITOR; - hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; - } ieee80211_adjust_monitor_flags(sdata, 1); ieee80211_configure_filter(local); @@ -1429,8 +1471,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) rcu_assign_pointer(local->p2p_sdata, sdata); break; case NL80211_IFTYPE_MONITOR: - if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) - break; list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list); break; default: @@ -1566,6 +1606,21 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, ieee80211_process_neg_ttlm_res(sdata, mgmt, skb->len); break; + case WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN: + ieee80211_process_ttlm_teardown(sdata); + break; + case WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_RESP: + ieee80211_process_ml_reconf_resp(sdata, mgmt, + skb->len); + break; + case WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_RESP: + ieee80211_process_epcs_ena_resp(sdata, mgmt, + skb->len); + break; + case WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN: + ieee80211_process_epcs_teardown(sdata, mgmt, + skb->len); + break; default: break; } @@ -2279,7 +2334,7 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local) */ cfg80211_shutdown_all_interfaces(local->hw.wiphy); - wiphy_lock(local->hw.wiphy); + guard(wiphy)(local->hw.wiphy); WARN(local->open_count, "%s: open count remains %d\n", wiphy_name(local->hw.wiphy), local->open_count); @@ -2309,7 +2364,6 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local) if (!netdev) kfree(sdata); } - wiphy_unlock(local->hw.wiphy); } static int netdev_notify(struct notifier_block *nb, diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 67ecfea22982..dcf8643a0baa 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -1409,7 +1409,7 @@ ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, if (sdata->u.mgd.mfp != IEEE80211_MFP_DISABLED) key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT; - key->conf.link_id = link_id; + key->conf.link_id = link_data->link_id; err = ieee80211_key_link(key, link_data, NULL); if (err) diff --git a/net/mac80211/led.c b/net/mac80211/led.c index 2dc732147e85..885fa6aa3fc1 100644 --- a/net/mac80211/led.c +++ b/net/mac80211/led.c @@ -342,7 +342,7 @@ static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local) return; tpt_trig->running = false; - del_timer_sync(&tpt_trig->timer); + timer_delete_sync(&tpt_trig->timer); led_trigger_event(&local->tpt_led, LED_OFF); } diff --git a/net/mac80211/link.c b/net/mac80211/link.c index 58a76bcd6ae6..d40c2bd3b50b 100644 --- a/net/mac80211/link.c +++ 
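/*
 * Editor's note (illustrative): after this rework, 'monitors' counts
 * every running monitor interface, while the new 'virt_monitors' counts
 * only those relying on the shared virtual monitor vif, i.e. monitors
 * that are neither MONITOR_FLAG_ACTIVE nor served by a driver that
 * advertises NO_VIRTUAL_MONITOR. Only the virtual ones drive
 * IEEE80211_CONF_MONITOR and the creation/teardown of the virtual
 * monitor interface.
 */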
b/net/mac80211/link.c @@ -12,6 +12,71 @@ #include "key.h" #include "debugfs_netdev.h" +static void ieee80211_update_apvlan_links(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_sub_if_data *vlan; + struct ieee80211_link_data *link; + u16 ap_bss_links = sdata->vif.valid_links; + u16 new_links, vlan_links; + unsigned long add; + + list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { + int link_id; + + if (!vlan) + continue; + + /* No support for 4addr with MLO yet */ + if (vlan->wdev.use_4addr) + return; + + vlan_links = vlan->vif.valid_links; + + new_links = ap_bss_links; + + add = new_links & ~vlan_links; + if (!add) + continue; + + ieee80211_vif_set_links(vlan, add, 0); + + for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) { + link = sdata_dereference(vlan->link[link_id], vlan); + ieee80211_link_vlan_copy_chanctx(link); + } + } +} + +void ieee80211_apvlan_link_setup(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_sub_if_data *ap_bss = container_of(sdata->bss, + struct ieee80211_sub_if_data, u.ap); + u16 new_links = ap_bss->vif.valid_links; + unsigned long add; + int link_id; + + if (!ap_bss->vif.valid_links) + return; + + add = new_links; + for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) { + sdata->wdev.valid_links |= BIT(link_id); + ether_addr_copy(sdata->wdev.links[link_id].addr, + ap_bss->wdev.links[link_id].addr); + } + + ieee80211_vif_set_links(sdata, new_links, 0); +} + +void ieee80211_apvlan_link_clear(struct ieee80211_sub_if_data *sdata) +{ + if (!sdata->wdev.valid_links) + return; + + sdata->wdev.valid_links = 0; + ieee80211_vif_clear_links(sdata); +} + void ieee80211_link_setup(struct ieee80211_link_data *link) { if (link->sdata->vif.type == NL80211_IFTYPE_STATION) @@ -31,6 +96,17 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, rcu_assign_pointer(sdata->vif.link_conf[link_id], link_conf); rcu_assign_pointer(sdata->link[link_id], link); + if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { + struct ieee80211_sub_if_data *ap_bss; + struct ieee80211_bss_conf *ap_bss_conf; + + ap_bss = container_of(sdata->bss, + struct ieee80211_sub_if_data, u.ap); + ap_bss_conf = sdata_dereference(ap_bss->vif.link_conf[link_id], + ap_bss); + memcpy(link_conf, ap_bss_conf, sizeof(*link_conf)); + } + link->sdata = sdata; link->link_id = link_id; link->conf = link_conf; @@ -54,6 +130,7 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, if (!deflink) { switch (sdata->vif.type) { case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: ether_addr_copy(link_conf->addr, sdata->wdev.links[link_id].addr); link_conf->bssid = link_conf->addr; @@ -177,6 +254,7 @@ static void ieee80211_set_vif_links_bitmaps(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: /* in an AP all links are always active */ sdata->vif.active_links = valid_links; @@ -278,12 +356,16 @@ static int ieee80211_vif_update_links(struct ieee80211_sub_if_data *sdata, ieee80211_set_vif_links_bitmaps(sdata, new_links, dormant_links); /* tell the driver */ - ret = drv_change_vif_links(sdata->local, sdata, - old_links & old_active, - new_links & sdata->vif.active_links, - old); + if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN) + ret = drv_change_vif_links(sdata->local, sdata, + old_links & old_active, + new_links & sdata->vif.active_links, + old); if (!new_links) ieee80211_debugfs_recreate_netdev(sdata, false); + + if (sdata->vif.type == NL80211_IFTYPE_AP) + ieee80211_update_apvlan_links(sdata); } if 
(ret) { diff --git a/net/mac80211/main.c b/net/mac80211/main.c index ee1211a213d7..6b6de43d9420 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation */ #include <net/mac80211.h> @@ -726,8 +726,13 @@ ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = { }, [NL80211_IFTYPE_P2P_DEVICE] = { .tx = 0xffff, + /* + * To support P2P PASN pairing let user space register to rx + * also AUTH frames on P2P device interface. + */ .rx = BIT(IEEE80211_STYPE_ACTION >> 4) | - BIT(IEEE80211_STYPE_PROBE_REQ >> 4), + BIT(IEEE80211_STYPE_PROBE_REQ >> 4) | + BIT(IEEE80211_STYPE_AUTH >> 4), }, }; @@ -1305,6 +1310,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) sband->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 && !(iftd->he_cap.he_cap_elem.phy_cap_info[0] & he_40_mhz_cap)) return -EINVAL; + + /* no support for per-band vendor elems with MLO */ + if (WARN_ON(iftd->vendor_elems.len && + hw->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO)) + return -EINVAL; } /* HT, VHT, HE require QoS, thus >= 4 queues */ @@ -1344,10 +1354,12 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) hw->wiphy->software_iftypes |= BIT(NL80211_IFTYPE_MONITOR); - local->int_scan_req = kzalloc(sizeof(*local->int_scan_req) + - sizeof(void *) * channels, GFP_KERNEL); + local->int_scan_req = kzalloc(struct_size(local->int_scan_req, + channels, channels), + GFP_KERNEL); if (!local->int_scan_req) return -ENOMEM; + local->int_scan_req->n_channels = channels; eth_broadcast_addr(local->int_scan_req->bssid); @@ -1734,18 +1746,7 @@ void ieee80211_free_hw(struct ieee80211_hw *hw) wiphy_free(local->hw.wiphy); } EXPORT_SYMBOL(ieee80211_free_hw); - -static const char * const drop_reasons_monitor[] = { -#define V(x) #x, - [0] = "RX_DROP_MONITOR", - MAC80211_DROP_REASONS_MONITOR(V) -}; - -static struct drop_reason_list drop_reason_list_monitor = { - .reasons = drop_reasons_monitor, - .n_reasons = ARRAY_SIZE(drop_reasons_monitor), -}; - +#define V(x) #x, static const char * const drop_reasons_unusable[] = { [0] = "RX_DROP_UNUSABLE", MAC80211_DROP_REASONS_UNUSABLE(V) @@ -1774,8 +1775,6 @@ static int __init ieee80211_init(void) if (ret) goto err_netdev; - drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR, - &drop_reason_list_monitor); drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE, &drop_reason_list_unusable); @@ -1794,7 +1793,6 @@ static void __exit ieee80211_exit(void) ieee80211_iface_exit(); - drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_MONITOR); drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE); rcu_barrier(); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 974081324aa4..5cc56d578048 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -8,6 +8,7 @@ #include <linux/slab.h> #include <linux/unaligned.h> +#include <net/sock.h> #include "ieee80211_i.h" #include "mesh.h" #include "wme.h" @@ -706,7 +707,7 @@ void ieee80211_mesh_root_setup(struct ieee80211_if_mesh *ifmsh) else { clear_bit(MESH_WORK_ROOT, &ifmsh->wrkq_flags); /* stop running timer */ - del_timer_sync(&ifmsh->mesh_path_root_timer); + timer_delete_sync(&ifmsh->mesh_path_root_timer); } } @@ -776,7 +777,7 @@ bool ieee80211_mesh_xmit_fast(struct ieee80211_sub_if_data *sdata, if (ethertype < ETH_P_802_3_MIN) return false; - if (skb->sk && 
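/*
 * Editor's note (illustrative): struct_size() sizes an allocation with
 * a trailing flexible array member, with overflow checking. Since
 * struct cfg80211_scan_request ends in
 *	struct ieee80211_channel *channels[];
 * struct_size(local->int_scan_req, channels, channels) equals
 * sizeof(*local->int_scan_req) plus channels * sizeof(struct ieee80211_channel *),
 * saturating to SIZE_MAX on overflow so kzalloc() fails cleanly;
 * n_channels must then be set explicitly, as done above.
 */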
skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) + if (sk_requests_wifi_status(skb->sk)) return false; if (skb->ip_summed == CHECKSUM_PARTIAL) { @@ -956,13 +957,10 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) u8 *pos; struct ieee80211_sub_if_data *sdata; int hdr_len = offsetofend(struct ieee80211_mgmt, u.beacon); - u32 rate_flags; sdata = container_of(ifmsh, struct ieee80211_sub_if_data, u.mesh); sband = ieee80211_get_sband(sdata); - rate_flags = - ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); ie_len_he_cap = ieee80211_ie_len_he_cap(sdata); ie_len_eht_cap = ieee80211_ie_len_eht_cap(sdata); @@ -1091,7 +1089,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) if (ieee80211_put_srates_elem(skb, sband, sdata->vif.bss_conf.basic_rates, - rate_flags, 0, WLAN_EID_SUPP_RATES) || + 0, WLAN_EID_SUPP_RATES) || mesh_add_ds_params_ie(sdata, skb)) goto out_free; @@ -1104,7 +1102,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) if (ieee80211_put_srates_elem(skb, sband, sdata->vif.bss_conf.basic_rates, - rate_flags, 0, WLAN_EID_EXT_SUPP_RATES) || + 0, WLAN_EID_EXT_SUPP_RATES) || mesh_add_rsn_ie(sdata, skb) || mesh_add_ht_cap_ie(sdata, skb) || mesh_add_ht_oper_ie(sdata, skb) || @@ -1241,9 +1239,9 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) local->total_ps_buffered -= skb_queue_len(&ifmsh->ps.bc_buf); skb_queue_purge(&ifmsh->ps.bc_buf); - del_timer_sync(&sdata->u.mesh.housekeeping_timer); - del_timer_sync(&sdata->u.mesh.mesh_path_root_timer); - del_timer_sync(&sdata->u.mesh.mesh_path_timer); + timer_delete_sync(&sdata->u.mesh.housekeeping_timer); + timer_delete_sync(&sdata->u.mesh.mesh_path_root_timer); + timer_delete_sync(&sdata->u.mesh.mesh_path_timer); /* clear any mesh work (for next join) we may have accrued */ ifmsh->wrkq_flags = 0; diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 4e9546e998b6..91444301a84a 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -367,6 +367,12 @@ u32 airtime_link_metric_get(struct ieee80211_local *local, return (u32)result; } +/* Check that the first metric is at least 10% better than the second one */ +static bool is_metric_better(u32 x, u32 y) +{ + return (x < y) && (x < (y - x / 10)); +} + /** * hwmp_route_info_get - Update routing info to originator and transmitter * @@ -458,8 +464,8 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, (mpath->sn == orig_sn && (rcu_access_pointer(mpath->next_hop) != sta ? - mult_frac(new_metric, 10, 9) : - new_metric) >= mpath->metric)) { + !is_metric_better(new_metric, mpath->metric) : + new_metric >= mpath->metric))) { process = false; fresh_info = false; } @@ -533,8 +539,8 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, if ((mpath->flags & MESH_PATH_FIXED) || ((mpath->flags & MESH_PATH_ACTIVE) && ((rcu_access_pointer(mpath->next_hop) != sta ? 
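/*
 * Editor's worked example (not part of the patch): is_metric_better(x, y)
 * is true only if x < y && x < y - x / 10, i.e. the new metric must be
 * roughly 10% better. A new metric of 100 beats an old one of 111
 * (100 < 111 - 10) but not 109 (100 >= 109 - 10), which keeps nearly
 * equal paths from flapping and replaces the old mult_frac(x, 10, 9)
 * comparisons.
 */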
- mult_frac(last_hop_metric, 10, 9) : - last_hop_metric) > mpath->metric))) + !is_metric_better(last_hop_metric, mpath->metric) : + last_hop_metric > mpath->metric)))) fresh_info = false; } else { mpath = mesh_path_add(sdata, ta); @@ -630,7 +636,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, mesh_path_add_gate(mpath); } rcu_read_unlock(); - } else { + } else if (ifmsh->mshcfg.dot11MeshForwarding) { rcu_read_lock(); mpath = mesh_path_lookup(sdata, target_addr); if (mpath) { @@ -648,6 +654,8 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, } } rcu_read_unlock(); + } else { + forward = false; } if (reply) { @@ -665,7 +673,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, } } - if (forward && ifmsh->mshcfg.dot11MeshForwarding) { + if (forward) { u32 preq_id; u8 hopcount; diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index 9f9cb5af0a97..0319674be832 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -22,7 +22,7 @@ static void mesh_path_free_rcu(struct mesh_table *tbl, struct mesh_path *mpath); static u32 mesh_table_hash(const void *addr, u32 len, u32 seed) { /* Use last four bytes of hw addr as hash index */ - return jhash_1word(__get_unaligned_cpu32((u8 *)addr + 2), seed); + return jhash_1word(get_unaligned((u32 *)((u8 *)addr + 2)), seed); } static const struct rhashtable_params mesh_rht_params = { diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 6ea35c88dc48..9c6a2b342170 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -264,7 +264,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, if (action != WLAN_SP_MESH_PEERING_CLOSE) { struct ieee80211_supported_band *sband; - u32 rate_flags, basic_rates; + u32 basic_rates; sband = ieee80211_get_sband(sdata); if (!sband) { @@ -280,16 +280,12 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, put_unaligned_le16(sta->sta.aid, pos); } - rate_flags = - ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); basic_rates = sdata->vif.bss_conf.basic_rates; if (ieee80211_put_srates_elem(skb, sband, basic_rates, - rate_flags, 0, - WLAN_EID_SUPP_RATES) || + 0, WLAN_EID_SUPP_RATES) || ieee80211_put_srates_elem(skb, sband, basic_rates, - rate_flags, 0, - WLAN_EID_EXT_SUPP_RATES) || + 0, WLAN_EID_EXT_SUPP_RATES) || mesh_add_rsn_ie(sdata, skb) || mesh_add_meshid_ie(sdata, skb) || mesh_add_meshconf_ie(sdata, skb)) @@ -417,7 +413,7 @@ u64 mesh_plink_deactivate(struct sta_info *sta) } spin_unlock_bh(&sta->mesh->plink_lock); if (!sdata->u.mesh.user_mpm) - del_timer_sync(&sta->mesh->plink_timer); + timer_delete_sync(&sta->mesh->plink_timer); mesh_path_flush_by_nexthop(sta); /* make sure no readers can access nexthop sta from here on */ @@ -432,15 +428,14 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; - u32 rates, basic_rates = 0, changed = 0; + u32 rates, changed = 0; enum ieee80211_sta_rx_bandwidth bw = sta->sta.deflink.bandwidth; sband = ieee80211_get_sband(sdata); if (!sband) return; - rates = ieee80211_sta_get_rates(sdata, elems, sband->band, - &basic_rates); + rates = ieee80211_sta_get_rates(sdata, elems, sband->band, NULL); spin_lock_bh(&sta->mesh->plink_lock); sta->deflink.rx_stats.last_rx = jiffies; @@ -667,7 +662,7 @@ void mesh_plink_timer(struct timer_list *t) /* * This STA is valid because sta_info_destroy() will - * 
del_timer_sync() this timer after having made sure + * timer_delete_sync() this timer after having made sure * it cannot be re-added (by deleting the plink.) */ sta = mesh->plink_sta; @@ -690,7 +685,7 @@ void mesh_plink_timer(struct timer_list *t) return; } - /* del_timer() and handler may race when entering these states */ + /* timer_delete() and handler may race when entering these states */ if (sta->mesh->plink_state == NL80211_PLINK_LISTEN || sta->mesh->plink_state == NL80211_PLINK_ESTAB) { mpl_dbg(sta->sdata, @@ -736,7 +731,7 @@ void mesh_plink_timer(struct timer_list *t) break; case NL80211_PLINK_HOLDING: /* holding timer */ - del_timer(&sta->mesh->plink_timer); + timer_delete(&sta->mesh->plink_timer); mesh_plink_fsm_restart(sta); break; default: @@ -849,7 +844,7 @@ static u64 mesh_plink_establish(struct ieee80211_sub_if_data *sdata, struct mesh_config *mshcfg = &sdata->u.mesh.mshcfg; u64 changed = 0; - del_timer(&sta->mesh->plink_timer); + timer_delete(&sta->mesh->plink_timer); sta->mesh->plink_state = NL80211_PLINK_ESTAB; changed |= mesh_plink_inc_estab_count(sdata); changed |= mesh_set_ht_prot_mode(sdata); @@ -976,7 +971,7 @@ static u64 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, case NL80211_PLINK_HOLDING: switch (event) { case CLS_ACPT: - del_timer(&sta->mesh->plink_timer); + timer_delete(&sta->mesh->plink_timer); mesh_plink_fsm_restart(sta); break; case OPN_ACPT: diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 61c318f5239f..948909a242d6 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -8,7 +8,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2024 Intel Corporation + * Copyright (C) 2018 - 2025 Intel Corporation */ #include <linux/delay.h> @@ -168,6 +168,9 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, bool no_vht = false; u32 ht_cfreq; + if (ieee80211_hw_check(&sdata->local->hw, STRICT)) + ignore_ht_channel_mismatch = false; + *chandef = (struct cfg80211_chan_def) { .chan = channel, .width = NL80211_CHAN_WIDTH_20_NOHT, @@ -343,7 +346,117 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, } static bool +ieee80211_verify_sta_ht_mcs_support(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const struct ieee80211_ht_operation *ht_op) +{ + struct ieee80211_sta_ht_cap sta_ht_cap; + int i; + + if (sband->band == NL80211_BAND_6GHZ) + return true; + + if (!ht_op) + return false; + + memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); + ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); + + /* + * P802.11REVme/D7.0 - 6.5.4.2.4 + * ... + * If the MLME of an HT STA receives an MLME-JOIN.request primitive + * with the SelectedBSS parameter containing a Basic HT-MCS Set field + * in the HT Operation parameter that contains any unsupported MCSs, + * the MLME response in the resulting MLME-JOIN.confirm primitive shall + * contain a ResultCode parameter that is not set to the value SUCCESS. + * ... 
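
Concretely, the requirement quoted above reduces to a byte-wise subset test over the 10-octet HT MCS bitmaps (IEEE80211_HT_MCS_MASK_LEN). A minimal freestanding sketch with sample data, assuming the usual layout where octet 0 covers MCS 0-7; everything here is illustrative, not kernel API:

        #include <stdbool.h>
        #include <stddef.h>
        #include <stdint.h>
        #include <stdio.h>

        #define HT_MCS_MASK_LEN 10      /* 77 HT MCS bits, rounded up to octets */

        /* True iff every MCS the AP marks as basic is also in our RX mask. */
        static bool basic_set_supported(const uint8_t *basic_set,
                                        const uint8_t *rx_mask)
        {
                for (size_t i = 0; i < HT_MCS_MASK_LEN; i++)
                        if ((basic_set[i] & rx_mask[i]) != basic_set[i])
                                return false;   /* AP requires an MCS we lack */
                return true;
        }

        int main(void)
        {
                uint8_t basic[HT_MCS_MASK_LEN] = { 0xff };              /* AP: MCS 0-7 basic */
                uint8_t rx_one[HT_MCS_MASK_LEN] = { 0xff };             /* STA: one stream */
                uint8_t rx_two[HT_MCS_MASK_LEN] = { 0xff, 0xff };       /* STA: two streams */

                /* both checks pass, printing "1 1" */
                printf("%d %d\n", basic_set_supported(basic, rx_one),
                       basic_set_supported(basic, rx_two));
                return 0;
        }
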
+ */ + + /* Simply check that all basic rates are in the STA RX mask */ + for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) { + if ((ht_op->basic_set[i] & sta_ht_cap.mcs.rx_mask[i]) != + ht_op->basic_set[i]) + return false; + } + + return true; +} + +static bool +ieee80211_verify_sta_vht_mcs_support(struct ieee80211_sub_if_data *sdata, + int link_id, + struct ieee80211_supported_band *sband, + const struct ieee80211_vht_operation *vht_op) +{ + struct ieee80211_sta_vht_cap sta_vht_cap; + u16 ap_min_req_set, sta_rx_mcs_map, sta_tx_mcs_map; + int nss; + + if (sband->band != NL80211_BAND_5GHZ) + return true; + + if (!vht_op) + return false; + + memcpy(&sta_vht_cap, &sband->vht_cap, sizeof(sta_vht_cap)); + ieee80211_apply_vhtcap_overrides(sdata, &sta_vht_cap); + + ap_min_req_set = le16_to_cpu(vht_op->basic_mcs_set); + sta_rx_mcs_map = le16_to_cpu(sta_vht_cap.vht_mcs.rx_mcs_map); + sta_tx_mcs_map = le16_to_cpu(sta_vht_cap.vht_mcs.tx_mcs_map); + + /* + * Many APs are incorrectly advertising an all-zero value here, + * which really means MCS 0-7 are required for 1-8 streams, but + * they don't really mean it that way. + * Some other APs are incorrectly advertising 3 spatial streams + * with MCS 0-7 are required, but don't really mean it that way + * and we'll connect only with HT, rather than even HE. + * As a result, unfortunately the VHT basic MCS/NSS set cannot + * be used at all, so check it only in strict mode. + */ + if (!ieee80211_hw_check(&sdata->local->hw, STRICT)) + return true; + + /* + * P802.11REVme/D7.0 - 6.5.4.2.4 + * ... + * If the MLME of a VHT STA receives an MLME-JOIN.request primitive + * with a SelectedBSS parameter containing a Basic VHT-MCS And NSS Set + * field in the VHT Operation parameter that contains any unsupported + * <VHT-MCS, NSS> tuple, the MLME response in the resulting + * MLME-JOIN.confirm primitive shall contain a ResultCode parameter + * that is not set to the value SUCCESS. + * ... + */ + for (nss = 8; nss > 0; nss--) { + u8 ap_op_val = (ap_min_req_set >> (2 * (nss - 1))) & 3; + u8 sta_rx_val; + u8 sta_tx_val; + + if (ap_op_val == IEEE80211_HE_MCS_NOT_SUPPORTED) + continue; + + sta_rx_val = (sta_rx_mcs_map >> (2 * (nss - 1))) & 3; + sta_tx_val = (sta_tx_mcs_map >> (2 * (nss - 1))) & 3; + + if (sta_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || + sta_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || + sta_rx_val < ap_op_val || sta_tx_val < ap_op_val) { + link_id_info(sdata, link_id, + "Missing mandatory rates for %d Nss, rx %d, tx %d oper %d, disable VHT\n", + nss, sta_rx_val, sta_tx_val, ap_op_val); + return false; + } + } + + return true; +} + +static bool ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata, + int link_id, const struct ieee80211_he_cap_elem *he_cap, const struct ieee80211_he_operation *he_op) { @@ -371,9 +484,9 @@ ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata, */ if ((mcs_80_map_tx & 0x3) == IEEE80211_HE_MCS_NOT_SUPPORTED || (mcs_80_map_rx & 0x3) == IEEE80211_HE_MCS_NOT_SUPPORTED) { - sdata_info(sdata, - "Missing mandatory rates for 1 Nss, rx 0x%x, tx 0x%x, disable HE\n", - mcs_80_map_tx, mcs_80_map_rx); + link_id_info(sdata, link_id, + "Missing mandatory rates for 1 Nss, rx 0x%x, tx 0x%x, disable HE\n", + mcs_80_map_tx, mcs_80_map_rx); return false; } @@ -387,7 +500,7 @@ ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata, * zeroes, which is nonsense, and completely inconsistent with itself * (it doesn't have 8 streams). Accept the settings in this case anyway. 
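
The change just below (and its twin in the STA-side HE check) keeps this workaround enabled by default and bypasses it only when the driver declares strict spec compliance. A minimal sketch of that control flow, with a plain strict_mode flag standing in for ieee80211_hw_check(&hw, STRICT); names are assumptions for illustration:

        #include <stdbool.h>
        #include <stdint.h>

        /*
         * Many APs advertise an all-zero basic MCS/NSS set they do not mean;
         * tolerate that unless strict conformance was requested, in which
         * case the per-NSS validation at the call site still runs.
         */
        static bool skip_basic_mcs_validation(uint16_t ap_min_req_set,
                                              bool strict_mode)
        {
                return !strict_mode && !ap_min_req_set;
        }

        /* usage: if (skip_basic_mcs_validation(set, strict)) return true; */
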
*/ - if (!ap_min_req_set) + if (!ieee80211_hw_check(&sdata->local->hw, STRICT) && !ap_min_req_set) return true; /* make sure the AP is consistent with itself @@ -417,9 +530,9 @@ ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata, if (ap_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || ap_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED || ap_rx_val < ap_op_val || ap_tx_val < ap_op_val) { - sdata_info(sdata, - "Invalid rates for %d Nss, rx %d, tx %d oper %d, disable HE\n", - nss, ap_rx_val, ap_rx_val, ap_op_val); + link_id_info(sdata, link_id, + "Invalid rates for %d Nss, rx %d, tx %d oper %d, disable HE\n", + nss, ap_rx_val, ap_tx_val, ap_op_val); return false; } } @@ -447,7 +560,7 @@ ieee80211_verify_sta_he_mcs_support(struct ieee80211_sub_if_data *sdata, * zeroes, which is nonsense, and completely inconsistent with itself * (it doesn't have 8 streams). Accept the settings in this case anyway. */ - if (!ap_min_req_set) + if (!ieee80211_hw_check(&sdata->local->hw, STRICT) && !ap_min_req_set) return true; /* Need to go over for 80MHz, 160MHz and for 80+80 */ @@ -589,6 +702,68 @@ ieee80211_verify_sta_eht_mcs_support(struct ieee80211_sub_if_data *sdata, return true; } +static void ieee80211_get_rates(struct ieee80211_supported_band *sband, + const u8 *supp_rates, + unsigned int supp_rates_len, + const u8 *ext_supp_rates, + unsigned int ext_supp_rates_len, + u32 *rates, u32 *basic_rates, + unsigned long *unknown_rates_selectors, + bool *have_higher_than_11mbit, + int *min_rate, int *min_rate_index) +{ + int i, j; + + for (i = 0; i < supp_rates_len + ext_supp_rates_len; i++) { + u8 supp_rate = i < supp_rates_len ? + supp_rates[i] : + ext_supp_rates[i - supp_rates_len]; + int rate = supp_rate & 0x7f; + bool is_basic = !!(supp_rate & 0x80); + + if ((rate * 5) > 110 && have_higher_than_11mbit) + *have_higher_than_11mbit = true; + + /* + * Skip membership selectors since they're not rates. + * + * Note: Even though the membership selector and the basic + * rate flag share the same bit, they are not exactly + * the same. 
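
The loop continuing below first skips selector octets per this note, then matches real rates against the band's bitrate table. The unit handling deserves a gloss: Supported Rates octets count in 500 kb/s steps while mac80211 bitrate tables use 100 kb/s steps, hence rate * 5 on one side and DIV_ROUND_UP(bitrate, 5) on the other. A small worked sketch, illustrative only:

        #include <stdio.h>

        /* Supported Rates octet value -> kb/s (one unit = 500 kb/s). */
        static unsigned int rate_units_to_kbps(unsigned int rate)
        {
                return rate * 500;
        }

        int main(void)
        {
                unsigned int rate = 0x16 & 0x7f;    /* 0x96 or 0x16 both encode 11 Mb/s */

                /* rate * 5 yields 100 kb/s units: 22 * 5 = 110, i.e. 11 Mb/s,
                 * exactly the "(rate * 5) > 110" threshold used here. */
                printf("%u units = %u kb/s (x5 = %u)\n",
                       rate, rate_units_to_kbps(rate), rate * 5);
                return 0;
        }
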
+ */ + if (is_basic && rate >= BSS_MEMBERSHIP_SELECTOR_MIN) { + if (unknown_rates_selectors) + set_bit(rate, unknown_rates_selectors); + continue; + } + + for (j = 0; j < sband->n_bitrates; j++) { + struct ieee80211_rate *br; + int brate; + + br = &sband->bitrates[j]; + + brate = DIV_ROUND_UP(br->bitrate, 5); + if (brate == rate) { + if (rates) + *rates |= BIT(j); + if (is_basic && basic_rates) + *basic_rates |= BIT(j); + if (min_rate && (rate * 5) < *min_rate) { + *min_rate = rate * 5; + if (min_rate_index) + *min_rate_index = j; + } + break; + } + } + + /* Handle an unknown entry as if it is an unknown selector */ + if (is_basic && unknown_rates_selectors && j == sband->n_bitrates) + set_bit(rate, unknown_rates_selectors); + } +} + static bool ieee80211_chandef_usable(struct ieee80211_sub_if_data *sdata, const struct cfg80211_chan_def *chandef, u32 prohibited_flags) @@ -613,7 +788,7 @@ static int ieee80211_chandef_num_subchans(const struct cfg80211_chan_def *c) if (c->width == NL80211_CHAN_WIDTH_80P80) return 4 + 4; - return nl80211_chan_width_to_mhz(c->width) / 20; + return cfg80211_chandef_get_width(c) / 20; } static int ieee80211_chandef_num_widths(const struct cfg80211_chan_def *c) @@ -814,12 +989,13 @@ static void ieee80211_set_chanreq_ap(struct ieee80211_sub_if_data *sdata, chanreq->ap = *ap_chandef; } -static struct ieee802_11_elems * +VISIBLE_IF_MAC80211_KUNIT struct ieee802_11_elems * ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata, struct ieee80211_conn_settings *conn, struct cfg80211_bss *cbss, int link_id, struct ieee80211_chan_req *chanreq, - struct cfg80211_chan_def *ap_chandef) + struct cfg80211_chan_def *ap_chandef, + unsigned long *userspace_selectors) { const struct cfg80211_bss_ies *ies = rcu_dereference(cbss->ies); struct ieee80211_bss *bss = (void *)cbss->priv; @@ -833,6 +1009,8 @@ ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems; struct ieee80211_supported_band *sband; enum ieee80211_conn_mode ap_mode; + unsigned long unknown_rates_selectors[BITS_TO_LONGS(128)] = {}; + unsigned long sta_selectors[BITS_TO_LONGS(128)] = {}; int ret; again: @@ -861,6 +1039,11 @@ again: sband = sdata->local->hw.wiphy->bands[channel->band]; + ieee80211_get_rates(sband, elems->supp_rates, elems->supp_rates_len, + elems->ext_supp_rates, elems->ext_supp_rates_len, + NULL, NULL, unknown_rates_selectors, NULL, NULL, + NULL); + switch (channel->band) { case NL80211_BAND_S1GHZ: if (WARN_ON(ap_mode != IEEE80211_CONN_MODE_S1G)) { @@ -870,8 +1053,8 @@ again: return elems; case NL80211_BAND_6GHZ: if (ap_mode < IEEE80211_CONN_MODE_HE) { - sdata_info(sdata, - "Rejecting non-HE 6/7 GHz connection"); + link_id_info(sdata, link_id, + "Rejecting non-HE 6/7 GHz connection"); ret = -EINVAL; goto free; } @@ -911,6 +1094,29 @@ again: chanreq->oper = *ap_chandef; + bitmap_copy(sta_selectors, userspace_selectors, 128); + if (conn->mode >= IEEE80211_CONN_MODE_HT) + set_bit(BSS_MEMBERSHIP_SELECTOR_HT_PHY, sta_selectors); + if (conn->mode >= IEEE80211_CONN_MODE_VHT) + set_bit(BSS_MEMBERSHIP_SELECTOR_VHT_PHY, sta_selectors); + if (conn->mode >= IEEE80211_CONN_MODE_HE) + set_bit(BSS_MEMBERSHIP_SELECTOR_HE_PHY, sta_selectors); + if (conn->mode >= IEEE80211_CONN_MODE_EHT) + set_bit(BSS_MEMBERSHIP_SELECTOR_EHT_PHY, sta_selectors); + + /* + * We do not support EPD or GLK so never add them. + * SAE_H2E is handled through userspace_selectors. 
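
The acceptance test that follows is set inclusion: every selector the AP marked as required (collected into unknown_rates_selectors) must appear in sta_selectors. A freestanding sketch of the same check, with raw 64-bit words standing in for the kernel's bitmap_subset():

        #include <stdbool.h>
        #include <stdint.h>

        #define SELECTOR_WORDS 2        /* 128 selector values / 64 bits per word */

        static void selector_set(uint64_t map[SELECTOR_WORDS], unsigned int n)
        {
                map[n / 64] |= 1ULL << (n % 64);
        }

        /* True iff 'required' is a subset of 'supported'. */
        static bool selectors_subset(const uint64_t required[SELECTOR_WORDS],
                                     const uint64_t supported[SELECTOR_WORDS])
        {
                for (unsigned int i = 0; i < SELECTOR_WORDS; i++) {
                        if (required[i] & ~supported[i])
                                return false;   /* AP requires something we lack */
                }
                return true;
        }
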
+ */ + + /* Check if we support all required features */ + if (!bitmap_subset(unknown_rates_selectors, sta_selectors, 128)) { + link_id_info(sdata, link_id, + "required basic rate or BSS membership selectors not supported or disabled, rejecting connection\n"); + ret = -EINVAL; + goto free; + } + ieee80211_set_chanreq_ap(sdata, chanreq, conn, ap_chandef); while (!ieee80211_chandef_usable(sdata, &chanreq->oper, @@ -942,16 +1148,38 @@ again: } if (chanreq->oper.width != ap_chandef->width || ap_mode != conn->mode) - sdata_info(sdata, - "regulatory prevented using AP config, downgraded\n"); + link_id_info(sdata, link_id, + "regulatory prevented using AP config, downgraded\n"); + + if (conn->mode >= IEEE80211_CONN_MODE_HT && + !ieee80211_verify_sta_ht_mcs_support(sdata, sband, + elems->ht_operation)) { + conn->mode = IEEE80211_CONN_MODE_LEGACY; + conn->bw_limit = IEEE80211_CONN_BW_LIMIT_20; + link_id_info(sdata, link_id, + "required MCSes not supported, disabling HT\n"); + } + + if (conn->mode >= IEEE80211_CONN_MODE_VHT && + !ieee80211_verify_sta_vht_mcs_support(sdata, link_id, sband, + elems->vht_operation)) { + conn->mode = IEEE80211_CONN_MODE_HT; + conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, + conn->bw_limit, + IEEE80211_CONN_BW_LIMIT_40); + link_id_info(sdata, link_id, + "required MCSes not supported, disabling VHT\n"); + } if (conn->mode >= IEEE80211_CONN_MODE_HE && - (!ieee80211_verify_peer_he_mcs_support(sdata, (void *)elems->he_cap, + (!ieee80211_verify_peer_he_mcs_support(sdata, link_id, + (void *)elems->he_cap, elems->he_operation) || !ieee80211_verify_sta_he_mcs_support(sdata, sband, elems->he_operation))) { conn->mode = IEEE80211_CONN_MODE_VHT; - sdata_info(sdata, "required MCSes not supported, disabling HE\n"); + link_id_info(sdata, link_id, + "required MCSes not supported, disabling HE\n"); } if (conn->mode >= IEEE80211_CONN_MODE_EHT && @@ -961,7 +1189,8 @@ again: conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_160); - sdata_info(sdata, "required MCSes not supported, disabling EHT\n"); + link_id_info(sdata, link_id, + "required MCSes not supported, disabling EHT\n"); } /* the mode can only decrease, so this must terminate */ @@ -985,10 +1214,12 @@ free: kfree(elems); return ERR_PTR(ret); } +EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_determine_chan_mode); static int ieee80211_config_bw(struct ieee80211_link_data *link, struct ieee802_11_elems *elems, - bool update, u64 *changed) + bool update, u64 *changed, + const char *frame) { struct ieee80211_channel *channel = link->conf->chanreq.oper.chan; struct ieee80211_sub_if_data *sdata = link->sdata; @@ -1013,9 +1244,10 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link, if (ap_mode != link->u.mgd.conn.mode) { link_info(link, - "AP appears to change mode (expected %s, found %s), disconnect\n", + "AP %pM appears to change mode (expected %s, found %s) in %s, disconnect\n", + link->u.mgd.bssid, ieee80211_conn_mode_str(link->u.mgd.conn.mode), - ieee80211_conn_mode_str(ap_mode)); + ieee80211_conn_mode_str(ap_mode), frame); return -EINVAL; } @@ -1060,16 +1292,16 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link, return 0; link_info(link, - "AP %pM changed bandwidth, new used config is %d.%03d MHz, width %d (%d.%03d/%d MHz)\n", - link->u.mgd.bssid, chanreq.oper.chan->center_freq, + "AP %pM changed bandwidth in %s, new used config is %d.%03d MHz, width %d (%d.%03d/%d MHz)\n", + link->u.mgd.bssid, frame, chanreq.oper.chan->center_freq, 
chanreq.oper.chan->freq_offset, chanreq.oper.width, chanreq.oper.center_freq1, chanreq.oper.freq1_offset, chanreq.oper.center_freq2); if (!cfg80211_chandef_valid(&chanreq.oper)) { sdata_info(sdata, - "AP %pM changed caps/bw in a way we can't support - disconnect\n", - link->u.mgd.bssid); + "AP %pM changed caps/bw in %s in a way we can't support - disconnect\n", + link->u.mgd.bssid, frame); return -EINVAL; } @@ -1098,8 +1330,8 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link, ret = ieee80211_link_change_chanreq(link, &chanreq, changed); if (ret) { sdata_info(sdata, - "AP %pM changed bandwidth to incompatible one - disconnect\n", - link->u.mgd.bssid); + "AP %pM changed bandwidth in %s to incompatible one - disconnect\n", + link->u.mgd.bssid, frame); return ret; } @@ -1214,13 +1446,15 @@ static bool ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, * Some APs apparently get confused if our capabilities are better * than theirs, so restrict what we advertise in the assoc request. */ - if (!(ap_vht_cap->vht_cap_info & - cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE))) - cap &= ~(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE | - IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE); - else if (!(ap_vht_cap->vht_cap_info & - cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE))) - cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; + if (!ieee80211_hw_check(&local->hw, STRICT)) { + if (!(ap_vht_cap->vht_cap_info & + cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE))) + cap &= ~(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE | + IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE); + else if (!(ap_vht_cap->vht_cap_info & + cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE))) + cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; + } /* * If some other vif is using the MU-MIMO capability we cannot associate @@ -1262,14 +1496,16 @@ static bool ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, return mu_mimo_owner; } -static void ieee80211_assoc_add_rates(struct sk_buff *skb, +static void ieee80211_assoc_add_rates(struct ieee80211_local *local, + struct sk_buff *skb, enum nl80211_chan_width width, struct ieee80211_supported_band *sband, struct ieee80211_mgd_assoc_data *assoc_data) { u32 rates; - if (assoc_data->supp_rates_len) { + if (assoc_data->supp_rates_len && + !ieee80211_hw_check(&local->hw, STRICT)) { /* * Get all rates supported by the device and the AP as * some APs don't like getting a superset of their rates @@ -1289,9 +1525,9 @@ static void ieee80211_assoc_add_rates(struct sk_buff *skb, rates = ~0; } - ieee80211_put_srates_elem(skb, sband, 0, 0, ~rates, + ieee80211_put_srates_elem(skb, sband, 0, ~rates, WLAN_EID_SUPP_RATES); - ieee80211_put_srates_elem(skb, sband, 0, 0, ~rates, + ieee80211_put_srates_elem(skb, sband, 0, ~rates, WLAN_EID_EXT_SUPP_RATES); } @@ -1412,23 +1648,25 @@ static size_t ieee80211_add_before_he_elems(struct sk_buff *skb, #define PRESENT_ELEMS_MAX 8 #define PRESENT_ELEM_EXT_OFFS 0x100 -static void ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb, u16 capab, - const struct element *ext_capa, - const u16 *present_elems); - -static size_t ieee80211_assoc_link_elems(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb, u16 *capab, - const struct element *ext_capa, - const u8 *extra_elems, - size_t extra_elems_len, - unsigned int link_id, - struct ieee80211_link_data *link, - u16 *present_elems) +static void +ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, u16 capab, + const struct element 
*ext_capa, + const u16 *present_elems, + struct ieee80211_mgd_assoc_data *assoc_data); + +static size_t +ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, u16 *capab, + const struct element *ext_capa, + const u8 *extra_elems, + size_t extra_elems_len, + unsigned int link_id, + struct ieee80211_link_data *link, + u16 *present_elems, + struct ieee80211_mgd_assoc_data *assoc_data) { enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); - struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; struct ieee80211_channel *chan = cbss->channel; const struct ieee80211_sband_iftype_data *iftd; @@ -1483,7 +1721,7 @@ static size_t ieee80211_assoc_link_elems(struct ieee80211_sub_if_data *sdata, *capab |= WLAN_CAPABILITY_SPECTRUM_MGMT; if (sband->band != NL80211_BAND_S1GHZ) - ieee80211_assoc_add_rates(skb, width, sband, assoc_data); + ieee80211_assoc_add_rates(local, skb, width, sband, assoc_data); if (*capab & WLAN_CAPABILITY_SPECTRUM_MGMT || *capab & WLAN_CAPABILITY_RADIO_MEASURE) { @@ -1577,7 +1815,7 @@ static size_t ieee80211_assoc_link_elems(struct ieee80211_sub_if_data *sdata, if (link_id == assoc_data->assoc_link_id) ieee80211_assoc_add_ml_elem(sdata, skb, orig_capab, ext_capa, - present_elems); + present_elems, assoc_data); /* crash if somebody gets it wrong */ present_elems = NULL; @@ -1656,14 +1894,14 @@ static void ieee80211_add_non_inheritance_elem(struct sk_buff *skb, *len = skb->len - skb_len - 2; } -static void ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb, u16 capab, - const struct element *ext_capa, - const u16 *outer_present_elems) +static void +ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, u16 capab, + const struct element *ext_capa, + const u16 *outer_present_elems, + struct ieee80211_mgd_assoc_data *assoc_data) { struct ieee80211_local *local = sdata->local; - struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; struct ieee80211_multi_link_elem *ml_elem; struct ieee80211_mle_basic_common_info *common; const struct wiphy_iftype_ext_capab *ift_ext_capa; @@ -1705,6 +1943,21 @@ static void ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata, } skb_put_data(skb, &mld_capa_ops, sizeof(mld_capa_ops)); + /* Many APs have broken parsing of the extended MLD capa/ops field, + * dropping (re-)association request frames or replying with association + * response with a failure status if it's present. Without a clear + * indication as to whether the AP supports parsing this field or not do + * not include it in the common information unless strict mode is set. + */ + if (ieee80211_hw_check(&local->hw, STRICT) && + assoc_data->ext_mld_capa_ops) { + ml_elem->control |= + cpu_to_le16(IEEE80211_MLC_BASIC_PRES_EXT_MLD_CAPA_OP); + common->len += 2; + skb_put_data(skb, &assoc_data->ext_mld_capa_ops, + sizeof(assoc_data->ext_mld_capa_ops)); + } + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { u16 link_present_elems[PRESENT_ELEMS_MAX] = {}; const u8 *extra_elems; @@ -1736,16 +1989,17 @@ static void ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata, * (if applicable) are skipped. So we only have * the capability field (remember the position and fill * later), followed by the elements added below by - * calling ieee80211_assoc_link_elems(). 
+ * calling ieee80211_add_link_elems(). */ capab_pos = skb_put(skb, 2); - extra_used = ieee80211_assoc_link_elems(sdata, skb, &capab, - ext_capa, - extra_elems, - extra_elems_len, - link_id, NULL, - link_present_elems); + extra_used = ieee80211_add_link_elems(sdata, skb, &capab, + ext_capa, + extra_elems, + extra_elems_len, + link_id, NULL, + link_present_elems, + assoc_data); if (extra_elems) skb_put_data(skb, extra_elems + extra_used, extra_elems_len - extra_used); @@ -1762,6 +2016,55 @@ static void ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata, ieee80211_fragment_element(skb, ml_elem_len, WLAN_EID_FRAGMENT); } +static int +ieee80211_link_common_elems_size(struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype iftype, + struct cfg80211_bss *cbss, + size_t elems_len) +{ + struct ieee80211_local *local = sdata->local; + const struct ieee80211_sband_iftype_data *iftd; + struct ieee80211_supported_band *sband; + size_t size = 0; + + if (!cbss) + return size; + + sband = local->hw.wiphy->bands[cbss->channel->band]; + + /* add STA profile elements length */ + size += elems_len; + + /* and supported rates length */ + size += 4 + sband->n_bitrates; + + /* supported channels */ + size += 2 + 2 * sband->n_channels; + + iftd = ieee80211_get_sband_iftype_data(sband, iftype); + if (iftd) + size += iftd->vendor_elems.len; + + /* power capability */ + size += 4; + + /* HT, VHT, HE, EHT */ + size += 2 + sizeof(struct ieee80211_ht_cap); + size += 2 + sizeof(struct ieee80211_vht_cap); + size += 2 + 1 + sizeof(struct ieee80211_he_cap_elem) + + sizeof(struct ieee80211_he_mcs_nss_supp) + + IEEE80211_HE_PPE_THRES_MAX_LEN; + + if (sband->band == NL80211_BAND_6GHZ) + size += 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa); + + size += 2 + 1 + sizeof(struct ieee80211_eht_cap_elem) + + sizeof(struct ieee80211_eht_mcs_nss_supp) + + IEEE80211_EHT_PPE_THRES_MAX_LEN; + + return size; +} + static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; @@ -1800,42 +2103,15 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; - const struct ieee80211_sband_iftype_data *iftd; - struct ieee80211_supported_band *sband; + size_t elems_len = assoc_data->link[link_id].elems_len; if (!cbss) continue; - sband = local->hw.wiphy->bands[cbss->channel->band]; - n_links++; - /* add STA profile elements length */ - size += assoc_data->link[link_id].elems_len; - /* and supported rates length */ - size += 4 + sband->n_bitrates; - /* supported channels */ - size += 2 + 2 * sband->n_channels; - - iftd = ieee80211_get_sband_iftype_data(sband, iftype); - if (iftd) - size += iftd->vendor_elems.len; - - /* power capability */ - size += 4; - - /* HT, VHT, HE, EHT */ - size += 2 + sizeof(struct ieee80211_ht_cap); - size += 2 + sizeof(struct ieee80211_vht_cap); - size += 2 + 1 + sizeof(struct ieee80211_he_cap_elem) + - sizeof(struct ieee80211_he_mcs_nss_supp) + - IEEE80211_HE_PPE_THRES_MAX_LEN; - - if (sband->band == NL80211_BAND_6GHZ) - size += 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa); - - size += 2 + 1 + sizeof(struct ieee80211_eht_cap_elem) + - sizeof(struct ieee80211_eht_mcs_nss_supp) + - IEEE80211_EHT_PPE_THRES_MAX_LEN; + + size += ieee80211_link_common_elems_size(sdata, iftype, cbss, + elems_len); /* non-inheritance element */ size += 2 + 2 + PRESENT_ELEMS_MAX; @@ -1851,6 +2127,7 @@ static int 
ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) /* max common info field in basic multi-link element */ size += sizeof(struct ieee80211_mle_basic_common_info) + 2 + /* capa & op */ + 2 + /* ext capa & op */ 2; /* EML capa */ /* @@ -1927,17 +2204,18 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) * for some reason check it and want it to be set, set the bit for all * pre-EHT connections as we used to do. */ - if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_EHT) + if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_EHT && + !ieee80211_hw_check(&local->hw, STRICT)) capab |= WLAN_CAPABILITY_ESS; /* add the elements for the assoc (main) link */ link_capab = capab; - offset = ieee80211_assoc_link_elems(sdata, skb, &link_capab, - ext_capa, - assoc_data->ie, - assoc_data->ie_len, - assoc_data->assoc_link_id, link, - present_elems); + offset = ieee80211_add_link_elems(sdata, skb, &link_capab, + ext_capa, + assoc_data->ie, + assoc_data->ie_len, + assoc_data->assoc_link_id, link, + present_elems, assoc_data); put_unaligned_le16(link_capab, capab_pos); /* if present, add any custom non-vendor IEs */ @@ -2916,7 +3194,7 @@ static void ieee80211_change_ps(struct ieee80211_local *local) } else if (conf->flags & IEEE80211_CONF_PS) { conf->flags &= ~IEEE80211_CONF_PS; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); - del_timer_sync(&local->dynamic_ps_timer); + timer_delete_sync(&local->dynamic_ps_timer); wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work); } @@ -3251,10 +3529,10 @@ void ieee80211_mgd_set_link_qos_params(struct ieee80211_link_data *link) /* MLME */ static bool -ieee80211_sta_wmm_params(struct ieee80211_local *local, - struct ieee80211_link_data *link, - const u8 *wmm_param, size_t wmm_param_len, - const struct ieee80211_mu_edca_param_set *mu_edca) +_ieee80211_sta_wmm_params(struct ieee80211_local *local, + struct ieee80211_link_data *link, + const u8 *wmm_param, size_t wmm_param_len, + const struct ieee80211_mu_edca_param_set *mu_edca) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_tx_queue_params params[IEEE80211_NUM_ACS]; @@ -3383,6 +3661,19 @@ ieee80211_sta_wmm_params(struct ieee80211_local *local, for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) link->tx_conf[ac] = params[ac]; + return true; +} + +static bool +ieee80211_sta_wmm_params(struct ieee80211_local *local, + struct ieee80211_link_data *link, + const u8 *wmm_param, size_t wmm_param_len, + const struct ieee80211_mu_edca_param_set *mu_edca) +{ + if (!_ieee80211_sta_wmm_params(local, link, wmm_param, wmm_param_len, + mu_edca)) + return false; + ieee80211_mgd_set_link_qos_params(link); /* enable WMM or activate new settings */ @@ -3593,12 +3884,45 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, netif_carrier_on(sdata->dev); } +static void ieee80211_ml_reconf_reset(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_mgd_assoc_data *add_links_data = + sdata->u.mgd.reconf.add_links_data; + + if (!ieee80211_vif_is_mld(&sdata->vif) || + !(sdata->u.mgd.reconf.added_links | + sdata->u.mgd.reconf.removed_links)) + return; + + wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + &sdata->u.mgd.reconf.wk); + sdata->u.mgd.reconf.added_links = 0; + sdata->u.mgd.reconf.removed_links = 0; + sdata->u.mgd.reconf.dialog_token = 0; + + if (add_links_data) { + struct cfg80211_mlo_reconf_done_data done_data = {}; + u8 link_id; + + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; + link_id++) + done_data.links[link_id].bss = + 
add_links_data->link[link_id].bss; + + cfg80211_mlo_reconf_add_done(sdata->dev, &done_data); + + kfree(sdata->u.mgd.reconf.add_links_data); + sdata->u.mgd.reconf.add_links_data = NULL; + } +} + static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, u16 stype, u16 reason, bool tx, u8 *frame_buf) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; + struct sta_info *ap_sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); unsigned int link_id; u64 changed = 0; struct ieee80211_prep_tx_info info = { @@ -3609,6 +3933,9 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, lockdep_assert_wiphy(local->hw.wiphy); + if (WARN_ON(!ap_sta)) + return; + if (WARN_ON_ONCE(tx && !frame_buf)) return; @@ -3619,8 +3946,34 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, ifmgd->associated = false; + if (tx) { + bool tx_link_found = false; + + for (link_id = 0; + link_id < ARRAY_SIZE(sdata->link); + link_id++) { + struct ieee80211_link_data *link; + + if (!ieee80211_vif_link_active(&sdata->vif, link_id)) + continue; + + link = sdata_dereference(sdata->link[link_id], sdata); + if (WARN_ON_ONCE(!link)) + continue; + + if (link->u.mgd.csa.blocked_tx) + continue; + + tx_link_found = true; + break; + } + + tx = tx_link_found; + } + /* other links will be destroyed */ sdata->deflink.conf->bss = NULL; + sdata->deflink.conf->epcs_support = false; sdata->deflink.smps_mode = IEEE80211_SMPS_OFF; netif_carrier_off(sdata->dev); @@ -3648,23 +4001,24 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, * insist sending these frames which can take time and delay * the disconnection and possible the roaming. */ - if (tx) - ieee80211_flush_queues(local, sdata, true); + ieee80211_flush_queues(local, sdata, true); - /* deauthenticate/disassociate now */ - if (tx || frame_buf) { + if (tx) { drv_mgd_prepare_tx(sdata->local, sdata, &info); ieee80211_send_deauth_disassoc(sdata, sdata->vif.cfg.ap_addr, sdata->vif.cfg.ap_addr, stype, - reason, tx, frame_buf); - } + reason, true, frame_buf); - /* flush out frame - make sure the deauth was actually sent */ - if (tx) + /* flush out frame - make sure the deauth was actually sent */ ieee80211_flush_queues(local, sdata, false); - drv_mgd_complete_tx(sdata->local, sdata, &info); + drv_mgd_complete_tx(sdata->local, sdata, &info); + } else if (frame_buf) { + ieee80211_send_deauth_disassoc(sdata, sdata->vif.cfg.ap_addr, + sdata->vif.cfg.ap_addr, stype, + reason, false, frame_buf); + } /* clear AP addr only after building the needed mgmt frames */ eth_zero_addr(sdata->deflink.u.mgd.bssid); @@ -3672,8 +4026,16 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, sdata->vif.cfg.ssid_len = 0; - /* remove AP and TDLS peers */ - sta_info_flush(sdata, -1); + /* Remove TDLS peers */ + __sta_info_flush(sdata, false, -1, ap_sta); + + if (sdata->vif.driver_flags & IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC) { + /* Only move the AP state */ + sta_info_move_state(ap_sta, IEEE80211_STA_NONE); + } else { + /* Remove AP peer */ + sta_info_flush(sdata, -1); + } /* finally reset all BSS / config parameters */ if (!ieee80211_vif_is_mld(&sdata->vif)) @@ -3707,7 +4069,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, sdata->deflink.ap_power_level = IEEE80211_UNSET_POWER_LEVEL; - del_timer_sync(&local->dynamic_ps_timer); + timer_delete_sync(&local->dynamic_ps_timer); wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work); /* 
Disable ARP filtering */ @@ -3724,12 +4086,20 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, ieee80211_vif_cfg_change_notify(sdata, changed); } + if (sdata->vif.driver_flags & IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC) { + /* + * After notifying the driver about the disassoc, + * remove the ap sta. + */ + sta_info_flush(sdata, -1); + } + /* disassociated - set to defaults now */ ieee80211_set_wmm_default(&sdata->deflink, false, false); - del_timer_sync(&sdata->u.mgd.conn_mon_timer); - del_timer_sync(&sdata->u.mgd.bcn_mon_timer); - del_timer_sync(&sdata->u.mgd.timer); + timer_delete_sync(&sdata->u.mgd.conn_mon_timer); + timer_delete_sync(&sdata->u.mgd.bcn_mon_timer); + timer_delete_sync(&sdata->u.mgd.timer); sdata->vif.bss_conf.dtim_period = 0; sdata->vif.bss_conf.beacon_rate = NULL; @@ -3782,9 +4152,21 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, wiphy_work_cancel(sdata->local->hw.wiphy, &ifmgd->teardown_ttlm_work); + /* if disconnection happens in the middle of the ML reconfiguration + * flow, cfg80211 must be called to release the BSS references obtained + * when the flow started. + */ + ieee80211_ml_reconf_reset(sdata); + ieee80211_vif_set_links(sdata, 0, 0); ifmgd->mcast_seq_last = IEEE80211_SN_MODULO; + + ifmgd->epcs.enabled = false; + ifmgd->epcs.dialog_token = 0; + + memset(ifmgd->userspace_selectors, 0, + sizeof(ifmgd->userspace_selectors)); } static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata) @@ -4065,33 +4447,12 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; - bool tx = false; lockdep_assert_wiphy(local->hw.wiphy); if (!ifmgd->associated) return; - /* only transmit if we have a link that makes that worthwhile */ - for (unsigned int link_id = 0; - link_id < ARRAY_SIZE(sdata->link); - link_id++) { - struct ieee80211_link_data *link; - - if (!ieee80211_vif_link_active(&sdata->vif, link_id)) - continue; - - link = sdata_dereference(sdata->link[link_id], sdata); - if (WARN_ON_ONCE(!link)) - continue; - - if (link->u.mgd.csa.blocked_tx) - continue; - - tx = true; - break; - } - if (!ifmgd->driver_disconnect) { unsigned int link_id; @@ -4108,7 +4469,7 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) struct ieee80211_link_data *link; link = sdata_dereference(sdata->link[link_id], sdata); - if (!link) + if (!link || !link->conf->bss) continue; cfg80211_unlink_bss(local->hw.wiphy, link->conf->bss); link->conf->bss = NULL; @@ -4119,14 +4480,14 @@ static void __ieee80211_disconnect(struct ieee80211_sub_if_data *sdata) ifmgd->driver_disconnect ? 
WLAN_REASON_DEAUTH_LEAVING : WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, - tx, frame_buf); + true, frame_buf); /* the other links will be destroyed */ sdata->vif.bss_conf.csa_active = false; sdata->deflink.u.mgd.csa.waiting_bcn = false; sdata->deflink.u.mgd.csa.blocked_tx = false; ieee80211_vif_unblock_queues_csa(sdata); - ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), tx, + ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, ifmgd->reconnect); ifmgd->reconnect = false; @@ -4220,13 +4581,15 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata, lockdep_assert_wiphy(sdata->local->hw.wiphy); + sdata->u.mgd.auth_data = NULL; + if (!assoc) { /* * we are not authenticated yet, the only timer that could be * running is the timeout for the authentication response, which * is not relevant anymore. */ - del_timer_sync(&sdata->u.mgd.timer); + timer_delete_sync(&sdata->u.mgd.timer); sta_info_destroy_addr(sdata, auth_data->ap_addr); /* other links are destroyed */ @@ -4241,7 +4604,6 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata, cfg80211_put_bss(sdata->local->hw.wiphy, auth_data->bss); kfree(auth_data); - sdata->u.mgd.auth_data = NULL; } enum assoc_status { @@ -4258,13 +4620,15 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, lockdep_assert_wiphy(sdata->local->hw.wiphy); + sdata->u.mgd.assoc_data = NULL; + if (status != ASSOC_SUCCESS) { /* * we are not associated yet, the only timer that could be * running is the timeout for the association response, which * is not relevant anymore. */ - del_timer_sync(&sdata->u.mgd.timer); + timer_delete_sync(&sdata->u.mgd.timer); sta_info_destroy_addr(sdata, assoc_data->ap_addr); eth_zero_addr(sdata->deflink.u.mgd.bssid); @@ -4296,7 +4660,6 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, } kfree(assoc_data); - sdata->u.mgd.assoc_data = NULL; } static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, @@ -4386,6 +4749,8 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); status_code = le16_to_cpu(mgmt->u.auth.status_code); + info.link_id = ifmgd->auth_data->link_id; + if (auth_alg != ifmgd->auth_data->algorithm || (auth_alg != WLAN_AUTH_SAE && auth_transaction != ifmgd->auth_data->expected_transaction) || @@ -4599,57 +4964,6 @@ static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata, false); } -static void ieee80211_get_rates(struct ieee80211_supported_band *sband, - u8 *supp_rates, unsigned int supp_rates_len, - u32 *rates, u32 *basic_rates, - bool *have_higher_than_11mbit, - int *min_rate, int *min_rate_index) -{ - int i, j; - - for (i = 0; i < supp_rates_len; i++) { - int rate = supp_rates[i] & 0x7f; - bool is_basic = !!(supp_rates[i] & 0x80); - - if ((rate * 5) > 110) - *have_higher_than_11mbit = true; - - /* - * Skip HT, VHT, HE, EHT and SAE H2E only BSS membership - * selectors since they're not rates. - * - * Note: Even though the membership selector and the basic - * rate flag share the same bit, they are not exactly - * the same. 
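
The legacy ieee80211_get_rates() removed here matched five specific selector octets one by one (visible just below); its replacement earlier in this patch covers the whole selector range with a single comparison against BSS_MEMBERSHIP_SELECTOR_MIN. A side-by-side sketch of the two predicates, where the numeric selector values are assumptions for illustration:

        #include <stdbool.h>
        #include <stdint.h>

        /* Assumed 802.11 BSS membership selector values, illustration only. */
        enum {
                SEL_EHT_PHY = 121,
                SEL_HE_PHY  = 122,
                SEL_SAE_H2E = 123,
                SEL_VHT_PHY = 126,
                SEL_HT_PHY  = 127,
                SEL_MIN     = SEL_EHT_PHY,
        };

        /* Removed style: every known selector spelled out. */
        static bool is_selector_old(uint8_t octet)
        {
                return octet == (0x80 | SEL_HT_PHY) ||
                       octet == (0x80 | SEL_VHT_PHY) ||
                       octet == (0x80 | SEL_HE_PHY) ||
                       octet == (0x80 | SEL_EHT_PHY) ||
                       octet == (0x80 | SEL_SAE_H2E);
        }

        /* New style: top bit set and value in the selector range. */
        static bool is_selector_new(uint8_t octet)
        {
                return (octet & 0x80) && (octet & 0x7f) >= SEL_MIN;
        }
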
- */ - if (supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HT_PHY) || - supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_VHT_PHY) || - supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HE_PHY) || - supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_EHT_PHY) || - supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_SAE_H2E)) - continue; - - for (j = 0; j < sband->n_bitrates; j++) { - struct ieee80211_rate *br; - int brate; - - br = &sband->bitrates[j]; - - brate = DIV_ROUND_UP(br->bitrate, 5); - if (brate == rate) { - *rates |= BIT(j); - if (is_basic) - *basic_rates |= BIT(j); - if ((rate * 5) < *min_rate) { - *min_rate = rate * 5; - *min_rate_index = j; - } - break; - } - } - } -} - static bool ieee80211_twt_req_supported(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct link_sta_info *link_sta, @@ -4702,6 +5016,82 @@ static bool ieee80211_twt_bcast_support(struct ieee80211_sub_if_data *sdata, IEEE80211_HE_MAC_CAP2_BCAST_TWT); } +static void ieee80211_epcs_changed(struct ieee80211_sub_if_data *sdata, + bool enabled) +{ + /* in any case this is called, dialog token should be reset */ + sdata->u.mgd.epcs.dialog_token = 0; + + if (sdata->u.mgd.epcs.enabled == enabled) + return; + + sdata->u.mgd.epcs.enabled = enabled; + cfg80211_epcs_changed(sdata->dev, enabled); +} + +static void ieee80211_epcs_teardown(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + u8 link_id; + + if (!sdata->u.mgd.epcs.enabled) + return; + + lockdep_assert_wiphy(local->hw.wiphy); + + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + struct ieee802_11_elems *elems; + struct ieee80211_link_data *link; + const struct cfg80211_bss_ies *ies; + bool ret; + + rcu_read_lock(); + + link = sdata_dereference(sdata->link[link_id], sdata); + if (!link || !link->conf || !link->conf->bss) { + rcu_read_unlock(); + continue; + } + + if (link->u.mgd.disable_wmm_tracking) { + rcu_read_unlock(); + ieee80211_set_wmm_default(link, false, false); + continue; + } + + ies = rcu_dereference(link->conf->bss->beacon_ies); + if (!ies) { + rcu_read_unlock(); + ieee80211_set_wmm_default(link, false, false); + continue; + } + + elems = ieee802_11_parse_elems(ies->data, ies->len, false, + NULL); + if (!elems) { + rcu_read_unlock(); + ieee80211_set_wmm_default(link, false, false); + continue; + } + + ret = _ieee80211_sta_wmm_params(local, link, + elems->wmm_param, + elems->wmm_param_len, + elems->mu_edca_param_set); + + kfree(elems); + rcu_read_unlock(); + + if (!ret) { + ieee80211_set_wmm_default(link, false, false); + continue; + } + + ieee80211_mgd_set_link_qos_params(link); + ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_QOS); + } +} + static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, struct link_sta_info *link_sta, struct cfg80211_bss *cbss, @@ -4711,7 +5101,8 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, u64 *changed) { struct ieee80211_sub_if_data *sdata = link->sdata; - struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data; + struct ieee80211_mgd_assoc_data *assoc_data = + sdata->u.mgd.assoc_data ?: sdata->u.mgd.reconf.add_links_data; struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_local *local = sdata->local; unsigned int link_id = link->link_id; @@ -4802,7 +5193,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, * 2G/3G/4G wifi routers, reported models include the "Onda PN51T", * "Vodafone PocketWiFi 2", "ZTE MF60" and a 
similar T-Mobile device. */ - if (!is_6ghz && + if (!ieee80211_hw_check(&local->hw, STRICT) && !is_6ghz && ((assoc_data->wmm && !elems->wmm_param) || (link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT && (!elems->ht_cap_elem || !elems->ht_operation)) || @@ -4825,6 +5216,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, parse_params.start = bss_ies->data; parse_params.len = bss_ies->len; parse_params.bss = cbss; + parse_params.link_id = -1; bss_elems = ieee802_11_parse_elems_full(&parse_params); if (!bss_elems) { ret = false; @@ -4898,7 +5290,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, /* check/update if AP changed anything in assoc response vs. scan */ if (ieee80211_config_bw(link, elems, link_id == assoc_data->assoc_link_id, - changed)) { + changed, "assoc response")) { ret = false; goto out; } @@ -4937,6 +5329,15 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, bss_vht_cap = (const void *)elem->data; } + if (ieee80211_hw_check(&local->hw, STRICT) && + (!bss_vht_cap || memcmp(bss_vht_cap, elems->vht_cap_elem, + sizeof(*bss_vht_cap)))) { + rcu_read_unlock(); + ret = false; + link_info(link, "VHT capabilities mismatch\n"); + goto out; + } + ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband, elems->vht_cap_elem, bss_vht_cap, link_sta); @@ -4974,14 +5375,27 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, link_sta); bss_conf->eht_support = link_sta->pub->eht_cap.has_eht; + bss_conf->epcs_support = bss_conf->eht_support && + !!(elems->eht_cap->fixed.mac_cap_info[0] & + IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS); + + /* EPCS might be already enabled but a new added link + * does not support EPCS. This should not really happen + * in practice. 
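
The epcs_support flag set just above distills to one EHT MAC capability bit qualified by EHT support itself. A minimal sketch of that derivation; the 0x01 mask mirrors IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS but its value is an assumption here:

        #include <stdbool.h>
        #include <stdint.h>

        #define EHT_MAC_CAP0_EPCS_PRIO_ACCESS 0x01      /* assumed bit value */

        /* EPCS needs an EHT association plus the AP's EPCS priority-access bit. */
        static bool epcs_supported(bool eht_support, const uint8_t *mac_cap_info)
        {
                return eht_support &&
                       (mac_cap_info[0] & EHT_MAC_CAP0_EPCS_PRIO_ACCESS);
        }
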
+ */ + if (sdata->u.mgd.epcs.enabled && + !bss_conf->epcs_support) + ieee80211_epcs_teardown(sdata); } else { bss_conf->eht_support = false; + bss_conf->epcs_support = false; } } else { bss_conf->he_support = false; bss_conf->twt_requester = false; bss_conf->twt_protected = false; bss_conf->eht_support = false; + bss_conf->epcs_support = false; } bss_conf->twt_broadcast = @@ -5121,7 +5535,9 @@ static int ieee80211_mgd_setup_link_sta(struct ieee80211_link_data *link, sband = local->hw.wiphy->bands[cbss->channel->band]; ieee80211_get_rates(sband, bss->supp_rates, bss->supp_rates_len, - &rates, &basic_rates, &have_higher_than_11mbit, + NULL, 0, + &rates, &basic_rates, NULL, + &have_higher_than_11mbit, &min_rate, &min_rate_index); /* @@ -5514,7 +5930,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, int link_id, struct cfg80211_bss *cbss, bool mlo, - struct ieee80211_conn_settings *conn) + struct ieee80211_conn_settings *conn, + unsigned long *userspace_selectors) { struct ieee80211_local *local = sdata->local; bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; @@ -5527,7 +5944,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, rcu_read_lock(); elems = ieee80211_determine_chan_mode(sdata, conn, cbss, link_id, - &chanreq, &ap_chandef); + &chanreq, &ap_chandef, + userspace_selectors); if (IS_ERR(elems)) { rcu_read_unlock(); @@ -5721,7 +6139,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, link->u.mgd.conn = assoc_data->link[link_id].conn; err = ieee80211_prep_channel(sdata, link, link_id, cbss, - true, &link->u.mgd.conn); + true, &link->u.mgd.conn, + sdata->u.mgd.userspace_selectors); if (err) { link_info(link, "prep_channel failed\n"); goto out_err; @@ -5839,6 +6258,8 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, if (!assoc_data) return; + info.link_id = assoc_data->assoc_link_id; + parse_params.mode = assoc_data->link[assoc_data->assoc_link_id].conn.mode; @@ -6799,11 +7220,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, bssid = ieee80211_get_bssid(hdr, len, sdata->vif.type); if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { struct ieee80211_ext *ext = (void *) mgmt; - - if (ieee80211_is_s1g_short_beacon(ext->frame_control)) - variable = ext->u.s1g_short_beacon.variable; - else - variable = ext->u.s1g_beacon.variable; + variable = ext->u.s1g_beacon.variable + + ieee80211_s1g_optional_len(ext->frame_control); } baselen = (u8 *) variable - (u8 *) mgmt; @@ -7005,7 +7423,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, ieee80211_mgd_update_bss_param_ch_cnt(sdata, bss_conf, elems); - if (!link->u.mgd.disable_wmm_tracking && + if (!sdata->u.mgd.epcs.enabled && + !link->u.mgd.disable_wmm_tracking && ieee80211_sta_wmm_params(local, link, elems->wmm_param, elems->wmm_param_len, elems->mu_edca_param_set)) @@ -7056,7 +7475,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, changed |= ieee80211_recalc_twt_req(sdata, sband, link, link_sta, elems); - if (ieee80211_config_bw(link, elems, true, &changed)) { + if (ieee80211_config_bw(link, elems, true, &changed, "beacon")) { ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, WLAN_REASON_DEAUTH_LEAVING, true, deauth_buf); @@ -7253,6 +7672,7 @@ ieee80211_send_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, int hdr_len = offsetofend(struct ieee80211_mgmt, u.action.u.ttlm_res); int ttlm_max_len = 2 + 1 + sizeof(struct 
ieee80211_ttlm_elem) + 1 + 2 * 2 * IEEE80211_TTLM_NUM_TIDS; + u16 status_code; skb = dev_alloc_skb(local->tx_headroom + hdr_len + ttlm_max_len); if (!skb) @@ -7275,19 +7695,18 @@ ieee80211_send_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, WARN_ON(1); fallthrough; case NEG_TTLM_RES_REJECT: - mgmt->u.action.u.ttlm_res.status_code = - WLAN_STATUS_DENIED_TID_TO_LINK_MAPPING; + status_code = WLAN_STATUS_DENIED_TID_TO_LINK_MAPPING; break; case NEG_TTLM_RES_ACCEPT: - mgmt->u.action.u.ttlm_res.status_code = WLAN_STATUS_SUCCESS; + status_code = WLAN_STATUS_SUCCESS; break; case NEG_TTLM_RES_SUGGEST_PREFERRED: - mgmt->u.action.u.ttlm_res.status_code = - WLAN_STATUS_PREF_TID_TO_LINK_MAPPING_SUGGESTED; + status_code = WLAN_STATUS_PREF_TID_TO_LINK_MAPPING_SUGGESTED; ieee80211_neg_ttlm_add_suggested_map(skb, neg_ttlm); break; } + mgmt->u.action.u.ttlm_res.status_code = cpu_to_le16(status_code); ieee80211_tx_skb(sdata, skb); } @@ -7453,17 +7872,13 @@ void ieee80211_process_neg_ttlm_res(struct ieee80211_sub_if_data *sdata, * This can be better implemented in the future, to handle request * rejections. */ - if (mgmt->u.action.u.ttlm_res.status_code != WLAN_STATUS_SUCCESS) + if (le16_to_cpu(mgmt->u.action.u.ttlm_res.status_code) != WLAN_STATUS_SUCCESS) __ieee80211_disconnect(sdata); } -static void ieee80211_teardown_ttlm_work(struct wiphy *wiphy, - struct wiphy_work *work) +void ieee80211_process_ttlm_teardown(struct ieee80211_sub_if_data *sdata) { u16 new_dormant_links; - struct ieee80211_sub_if_data *sdata = - container_of(work, struct ieee80211_sub_if_data, - u.mgd.teardown_ttlm_work); if (!sdata->vif.neg_ttlm.valid) return; @@ -7478,6 +7893,16 @@ static void ieee80211_teardown_ttlm_work(struct wiphy *wiphy, BSS_CHANGED_MLD_VALID_LINKS); } +static void ieee80211_teardown_ttlm_work(struct wiphy *wiphy, + struct wiphy_work *work) +{ + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, + u.mgd.teardown_ttlm_work); + + ieee80211_process_ttlm_teardown(sdata); +} + void ieee80211_send_teardown_neg_ttlm(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); @@ -8135,6 +8560,25 @@ static void ieee80211_request_smps_mgd_work(struct wiphy *wiphy, link->u.mgd.driver_smps_mode); } +static void ieee80211_ml_sta_reconf_timeout(struct wiphy *wiphy, + struct wiphy_work *work) +{ + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, + u.mgd.reconf.wk.work); + + if (!sdata->u.mgd.reconf.added_links && + !sdata->u.mgd.reconf.removed_links) + return; + + sdata_info(sdata, + "mlo: reconf: timeout: added=0x%x, removed=0x%x\n", + sdata->u.mgd.reconf.added_links, + sdata->u.mgd.reconf.removed_links); + + __ieee80211_disconnect(sdata); +} + /* interface setup */ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) { @@ -8149,6 +8593,8 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) ieee80211_tdls_peer_del_work); wiphy_delayed_work_init(&ifmgd->ml_reconf_work, ieee80211_ml_reconf_work); + wiphy_delayed_work_init(&ifmgd->reconf.wk, + ieee80211_ml_sta_reconf_timeout); timer_setup(&ifmgd->timer, ieee80211_sta_timer, 0); timer_setup(&ifmgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer, 0); timer_setup(&ifmgd->conn_mon_timer, ieee80211_sta_conn_mon_timer, 0); @@ -8209,6 +8655,9 @@ void ieee80211_mgd_setup_link(struct ieee80211_link_data *link) if (sdata->u.mgd.assoc_data) ether_addr_copy(link->conf->addr, sdata->u.mgd.assoc_data->link[link_id].addr); + else if 
(sdata->u.mgd.reconf.add_links_data) + ether_addr_copy(link->conf->addr, + sdata->u.mgd.reconf.add_links_data->link[link_id].addr); else if (!is_valid_ether_addr(link->conf->addr)) eth_random_addr(link->conf->addr); } @@ -8231,7 +8680,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss, s8 link_id, const u8 *ap_mld_addr, bool assoc, struct ieee80211_conn_settings *conn, - bool override) + bool override, + unsigned long *userspace_selectors) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; @@ -8370,7 +8820,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, */ link->u.mgd.conn = *conn; err = ieee80211_prep_channel(sdata, link, link->link_id, cbss, - mlo, &link->u.mgd.conn); + mlo, &link->u.mgd.conn, + userspace_selectors); if (err) { if (new_sta) sta_info_free(local, new_sta); @@ -8486,6 +8937,22 @@ out: return ret; } +static void ieee80211_parse_cfg_selectors(unsigned long *userspace_selectors, + const u8 *supported_selectors, + u8 supported_selectors_len) +{ + if (supported_selectors) { + for (int i = 0; i < supported_selectors_len; i++) { + set_bit(supported_selectors[i], + userspace_selectors); + } + } else { + /* Assume SAE_H2E support for backward compatibility. */ + set_bit(BSS_MEMBERSHIP_SELECTOR_SAE_H2E, + userspace_selectors); + } +} + /* config hooks */ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, struct cfg80211_auth_request *req) @@ -8587,6 +9054,10 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, memcpy(auth_data->key, req->key, req->key_len); } + ieee80211_parse_cfg_selectors(auth_data->userspace_selectors, + req->supported_selectors, + req->supported_selectors_len); + auth_data->algorithm = auth_alg; /* try to authenticate/probe */ @@ -8640,7 +9111,8 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, err = ieee80211_prep_connection(sdata, req->bss, req->link_id, req->ap_mld_addr, cont_auth, - &conn, false); + &conn, false, + auth_data->userspace_selectors); if (err) goto err_clear; @@ -8911,6 +9383,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, else memcpy(assoc_data->ap_addr, cbss->bssid, ETH_ALEN); + assoc_data->ext_mld_capa_ops = cpu_to_le16(req->ext_mld_capa_ops); + if (ifmgd->associated) { u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; @@ -8927,6 +9401,12 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, false); } + memset(sdata->u.mgd.userspace_selectors, 0, + sizeof(sdata->u.mgd.userspace_selectors)); + ieee80211_parse_cfg_selectors(sdata->u.mgd.userspace_selectors, + req->supported_selectors, + req->supported_selectors_len); + memcpy(&ifmgd->ht_capa, &req->ht_capa, sizeof(ifmgd->ht_capa)); memcpy(&ifmgd->ht_capa_mask, &req->ht_capa_mask, sizeof(ifmgd->ht_capa_mask)); @@ -9173,7 +9653,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, /* only calculate the mode, hence link == NULL */ err = ieee80211_prep_channel(sdata, NULL, i, assoc_data->link[i].bss, true, - &assoc_data->link[i].conn); + &assoc_data->link[i].conn, + sdata->u.mgd.userspace_selectors); if (err) { req->links[i].error = err; goto err_clear; @@ -9189,7 +9670,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, err = ieee80211_prep_connection(sdata, cbss, req->link_id, req->ap_mld_addr, true, &assoc_data->link[assoc_link_id].conn, - override); + override, + sdata->u.mgd.userspace_selectors); if (err) goto err_clear; @@ -9295,7 +9777,6 @@ int ieee80211_mgd_deauth(struct 
ieee80211_sub_if_data *sdata, ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, req->reason_code, false); - drv_mgd_complete_tx(sdata->local, sdata, &info); return 0; } @@ -9368,7 +9849,7 @@ void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata) ifmgd->assoc_req_ies = NULL; ifmgd->assoc_req_ies_len = 0; spin_unlock_bh(&ifmgd->teardown_lock); - del_timer_sync(&ifmgd->timer); + timer_delete_sync(&ifmgd->timer); } void ieee80211_cqm_rssi_notify(struct ieee80211_vif *vif, @@ -9433,3 +9914,886 @@ void ieee80211_disable_rssi_reports(struct ieee80211_vif *vif) _ieee80211_enable_rssi_reports(sdata, 0, 0); } EXPORT_SYMBOL(ieee80211_disable_rssi_reports); + +void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + struct ieee80211_mgd_assoc_data *add_links_data = + ifmgd->reconf.add_links_data; + struct sta_info *sta; + struct cfg80211_mlo_reconf_done_data done_data = {}; + u16 sta_changed_links = sdata->u.mgd.reconf.added_links | + sdata->u.mgd.reconf.removed_links; + u16 link_mask, valid_links; + unsigned int link_id; + size_t orig_len = len; + u8 i, group_key_data_len; + u8 *pos; + + if (!ieee80211_vif_is_mld(&sdata->vif) || + len < offsetofend(typeof(*mgmt), u.action.u.ml_reconf_resp) || + mgmt->u.action.u.ml_reconf_resp.dialog_token != + sdata->u.mgd.reconf.dialog_token || + !sta_changed_links) + return; + + pos = mgmt->u.action.u.ml_reconf_resp.variable; + len -= offsetofend(typeof(*mgmt), u.action.u.ml_reconf_resp); + + /* each status duple is 3 octets */ + if (len < mgmt->u.action.u.ml_reconf_resp.count * 3) { + sdata_info(sdata, + "mlo: reconf: unexpected len=%zu, count=%u\n", + len, mgmt->u.action.u.ml_reconf_resp.count); + goto disconnect; + } + + link_mask = sta_changed_links; + for (i = 0; i < mgmt->u.action.u.ml_reconf_resp.count; i++) { + u16 status = get_unaligned_le16(pos + 1); + + link_id = *pos; + + if (!(link_mask & BIT(link_id))) { + sdata_info(sdata, + "mlo: reconf: unexpected link: %u, changed=0x%x\n", + link_id, sta_changed_links); + goto disconnect; + } + + /* clear the corresponding link, to detect the case that + * the same link was included more than one time + */ + link_mask &= ~BIT(link_id); + + /* Handle failure to remove links here. Failure to remove added + * links will be done later in the flow. + */ + if (status != WLAN_STATUS_SUCCESS) { + sdata_info(sdata, + "mlo: reconf: failed on link=%u, status=%u\n", + link_id, status); + + /* The AP MLD failed to remove a link that was already + * removed locally. As this is not expected behavior, + * disconnect + */ + if (sdata->u.mgd.reconf.removed_links & BIT(link_id)) + goto disconnect; + + /* The AP MLD failed to add a link. Remove it from the + * added links. + */ + sdata->u.mgd.reconf.added_links &= ~BIT(link_id); + } + + pos += 3; + len -= 3; + } + + if (link_mask) { + sdata_info(sdata, + "mlo: reconf: no response for links=0x%x\n", + link_mask); + goto disconnect; + } + + if (!sdata->u.mgd.reconf.added_links) + goto out; + + if (len < 1 || len < 1 + *pos) { + sdata_info(sdata, + "mlo: reconf: invalid group key data length"); + goto disconnect; + } + + /* The Group Key Data field must be present when links are added. This + * field should be processed by userland. 
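
The Group Key Data field handled next is a classic length-prefixed blob: one length octet followed by that many octets, which is why the bounds check above reads len < 1 + *pos before the cursor advances. A minimal sketch of the same skip-over parse, mirroring the pos/len bookkeeping below:

        #include <stdbool.h>
        #include <stddef.h>
        #include <stdint.h>

        /*
         * Skip one length-prefixed field; false means the buffer was
         * truncated. On success the cursor and remaining length move
         * past the field.
         */
        static bool skip_len_prefixed(const uint8_t **posp, size_t *lenp)
        {
                const uint8_t *pos = *posp;
                size_t len = *lenp;

                if (len < 1 || len < 1U + pos[0])
                        return false;   /* truncated field */

                *posp = pos + 1 + pos[0];
                *lenp = len - 1 - pos[0];
                return true;
        }
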
+ */ + group_key_data_len = *pos++; + + pos += group_key_data_len; + len -= group_key_data_len + 1; + + /* Process the information for the added links */ + sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); + if (WARN_ON(!sta)) + goto disconnect; + + valid_links = sdata->vif.valid_links; + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + if (!add_links_data->link[link_id].bss || + !(sdata->u.mgd.reconf.added_links & BIT(link_id))) + + continue; + + valid_links |= BIT(link_id); + if (ieee80211_sta_allocate_link(sta, link_id)) + goto disconnect; + } + + ieee80211_vif_set_links(sdata, valid_links, sdata->vif.dormant_links); + link_mask = 0; + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + struct cfg80211_bss *cbss = add_links_data->link[link_id].bss; + struct ieee80211_link_data *link; + struct link_sta_info *link_sta; + u64 changed = 0; + + if (!cbss) + continue; + + link = sdata_dereference(sdata->link[link_id], sdata); + if (WARN_ON(!link)) + goto disconnect; + + link_info(link, + "mlo: reconf: local address %pM, AP link address %pM\n", + add_links_data->link[link_id].addr, + add_links_data->link[link_id].bss->bssid); + + link_sta = rcu_dereference_protected(sta->link[link_id], + lockdep_is_held(&local->hw.wiphy->mtx)); + if (WARN_ON(!link_sta)) + goto disconnect; + + if (!link->u.mgd.have_beacon) { + const struct cfg80211_bss_ies *ies; + + rcu_read_lock(); + ies = rcu_dereference(cbss->beacon_ies); + if (ies) + link->u.mgd.have_beacon = true; + else + ies = rcu_dereference(cbss->ies); + ieee80211_get_dtim(ies, + &link->conf->sync_dtim_count, + &link->u.mgd.dtim_period); + link->conf->beacon_int = cbss->beacon_interval; + rcu_read_unlock(); + } + + link->conf->dtim_period = link->u.mgd.dtim_period ?: 1; + + link->u.mgd.conn = add_links_data->link[link_id].conn; + if (ieee80211_prep_channel(sdata, link, link_id, cbss, + true, &link->u.mgd.conn, + sdata->u.mgd.userspace_selectors)) { + link_info(link, "mlo: reconf: prep_channel failed\n"); + goto disconnect; + } + + if (ieee80211_mgd_setup_link_sta(link, sta, link_sta, + add_links_data->link[link_id].bss)) + goto disconnect; + + if (!ieee80211_assoc_config_link(link, link_sta, + add_links_data->link[link_id].bss, + mgmt, pos, len, + &changed)) + goto disconnect; + + /* The AP MLD indicated success for this link, but the station + * profile status indicated otherwise. 
Since there is an + * inconsistency in the ML reconfiguration response, disconnect + */ + if (add_links_data->link[link_id].status != WLAN_STATUS_SUCCESS) + goto disconnect; + + ieee80211_sta_init_nss(link_sta); + if (ieee80211_sta_activate_link(sta, link_id)) + goto disconnect; + + changed |= ieee80211_link_set_associated(link, cbss); + ieee80211_link_info_change_notify(sdata, link, changed); + + ieee80211_recalc_smps(sdata, link); + link_mask |= BIT(link_id); + } + + sdata_info(sdata, + "mlo: reconf: current valid_links=0x%x, added=0x%x\n", + valid_links, link_mask); + + /* links might have changed due to rejected ones, set them again */ + ieee80211_vif_set_links(sdata, valid_links, sdata->vif.dormant_links); + ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_VALID_LINKS); + + ieee80211_recalc_ps(local); + ieee80211_recalc_ps_vif(sdata); + + done_data.buf = (const u8 *)mgmt; + done_data.len = orig_len; + done_data.added_links = link_mask; + + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + done_data.links[link_id].bss = add_links_data->link[link_id].bss; + done_data.links[link_id].addr = + add_links_data->link[link_id].addr; + } + + cfg80211_mlo_reconf_add_done(sdata->dev, &done_data); + kfree(sdata->u.mgd.reconf.add_links_data); + sdata->u.mgd.reconf.add_links_data = NULL; +out: + ieee80211_ml_reconf_reset(sdata); + return; + +disconnect: + __ieee80211_disconnect(sdata); +} + +static struct sk_buff * +ieee80211_build_ml_reconf_req(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgd_assoc_data *add_links_data, + u16 removed_links, __le16 ext_mld_capa_ops) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_mgmt *mgmt; + struct ieee80211_multi_link_elem *ml_elem; + struct ieee80211_mle_basic_common_info *common; + enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); + struct sk_buff *skb; + size_t size; + unsigned int link_id; + __le16 eml_capa = 0, mld_capa_ops = 0; + struct ieee80211_tx_info *info; + u8 common_size, var_common_size; + u8 *ml_elem_len; + u16 capab = 0; + + size = local->hw.extra_tx_headroom + sizeof(*mgmt); + + /* Consider the maximal length of the reconfiguration ML element */ + size += sizeof(struct ieee80211_multi_link_elem); + + /* The Basic ML element and the Reconfiguration ML element have the same + * fixed common information fields in the context of ML reconfiguration + * action frame. 
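+ * (Here sizeof(*common) covers only the Common Info Length octet plus
+ * the MLD MAC address of struct ieee80211_mle_basic_common_info; the
+ * optional fields are accounted separately in var_common_size below.)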
The AP MLD MAC address must always be present + */ + common_size = sizeof(*common); + + /* when adding links, the MLD capabilities must be present */ + var_common_size = 0; + if (add_links_data) { + const struct wiphy_iftype_ext_capab *ift_ext_capa = + cfg80211_get_iftype_ext_capa(local->hw.wiphy, + ieee80211_vif_type_p2p(&sdata->vif)); + + if (ift_ext_capa) { + eml_capa = cpu_to_le16(ift_ext_capa->eml_capabilities); + mld_capa_ops = + cpu_to_le16(ift_ext_capa->mld_capa_and_ops); + } + + /* MLD capabilities and operation */ + var_common_size += 2; + + /* EML capabilities */ + if (eml_capa & cpu_to_le16((IEEE80211_EML_CAP_EMLSR_SUPP | + IEEE80211_EML_CAP_EMLMR_SUPPORT))) + var_common_size += 2; + } + + if (ext_mld_capa_ops) + var_common_size += 2; + + /* Add the common information length */ + size += common_size + var_common_size; + + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + struct cfg80211_bss *cbss; + size_t elems_len; + + if (removed_links & BIT(link_id)) { + size += sizeof(struct ieee80211_mle_per_sta_profile) + + ETH_ALEN; + continue; + } + + if (!add_links_data || !add_links_data->link[link_id].bss) + continue; + + elems_len = add_links_data->link[link_id].elems_len; + cbss = add_links_data->link[link_id].bss; + + /* should be the same across all BSSes */ + if (cbss->capability & WLAN_CAPABILITY_PRIVACY) + capab |= WLAN_CAPABILITY_PRIVACY; + + size += 2 + sizeof(struct ieee80211_mle_per_sta_profile) + + ETH_ALEN; + + /* WMM */ + size += 9; + size += ieee80211_link_common_elems_size(sdata, iftype, cbss, + elems_len); + } + + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) + return NULL; + + skb_reserve(skb, local->hw.extra_tx_headroom); + mgmt = skb_put_zero(skb, offsetofend(struct ieee80211_mgmt, + u.action.u.ml_reconf_req)); + + /* Add the MAC header */ + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); + + /* Add the action frame fixed fields */ + mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; + mgmt->u.action.u.ml_reconf_req.action_code = + WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_REQ; + + /* allocate a dialog token and store it */ + sdata->u.mgd.reconf.dialog_token = ++sdata->u.mgd.dialog_token_alloc; + mgmt->u.action.u.ml_reconf_req.dialog_token = + sdata->u.mgd.reconf.dialog_token; + + /* Add the ML reconfiguration element and the common information */ + skb_put_u8(skb, WLAN_EID_EXTENSION); + ml_elem_len = skb_put(skb, 1); + skb_put_u8(skb, WLAN_EID_EXT_EHT_MULTI_LINK); + ml_elem = skb_put(skb, sizeof(*ml_elem)); + ml_elem->control = + cpu_to_le16(IEEE80211_ML_CONTROL_TYPE_RECONF | + IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR); + common = skb_put(skb, common_size); + common->len = common_size + var_common_size; + memcpy(common->mld_mac_addr, sdata->vif.addr, ETH_ALEN); + + if (add_links_data) { + if (eml_capa & + cpu_to_le16((IEEE80211_EML_CAP_EMLSR_SUPP | + IEEE80211_EML_CAP_EMLMR_SUPPORT))) { + ml_elem->control |= + cpu_to_le16(IEEE80211_MLC_RECONF_PRES_EML_CAPA); + skb_put_data(skb, &eml_capa, sizeof(eml_capa)); + } + + ml_elem->control |= + cpu_to_le16(IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP); + + skb_put_data(skb, &mld_capa_ops, sizeof(mld_capa_ops)); + } + + if (ext_mld_capa_ops) { + ml_elem->control |= + cpu_to_le16(IEEE80211_MLC_RECONF_PRES_EXT_MLD_CAPA_OP); + skb_put_data(skb, &ext_mld_capa_ops, sizeof(ext_mld_capa_ops)); + } + + if 
(sdata->u.mgd.flags & IEEE80211_STA_ENABLE_RRM) + capab |= WLAN_CAPABILITY_RADIO_MEASURE; + + /* Add the per station profile */ + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + u8 *subelem_len = NULL; + u16 ctrl; + const u8 *addr; + + /* Skip links that are not changing */ + if (!(removed_links & BIT(link_id)) && + (!add_links_data || !add_links_data->link[link_id].bss)) + continue; + + ctrl = link_id | + IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT; + + if (removed_links & BIT(link_id)) { + struct ieee80211_bss_conf *conf = + sdata_dereference(sdata->vif.link_conf[link_id], + sdata); + if (!conf) + continue; + + addr = conf->addr; + ctrl |= u16_encode_bits(IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_DEL_LINK, + IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE); + } else { + addr = add_links_data->link[link_id].addr; + ctrl |= IEEE80211_MLE_STA_RECONF_CONTROL_COMPLETE_PROFILE | + u16_encode_bits(IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_ADD_LINK, + IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE); + } + + skb_put_u8(skb, IEEE80211_MLE_SUBELEM_PER_STA_PROFILE); + subelem_len = skb_put(skb, 1); + + put_unaligned_le16(ctrl, skb_put(skb, sizeof(ctrl))); + skb_put_u8(skb, 1 + ETH_ALEN); + skb_put_data(skb, addr, ETH_ALEN); + + if (!(removed_links & BIT(link_id))) { + u16 link_present_elems[PRESENT_ELEMS_MAX] = {}; + size_t extra_used; + void *capab_pos; + u8 qos_info; + + capab_pos = skb_put(skb, 2); + + extra_used = + ieee80211_add_link_elems(sdata, skb, &capab, NULL, + add_links_data->link[link_id].elems, + add_links_data->link[link_id].elems_len, + link_id, NULL, + link_present_elems, + add_links_data); + + if (add_links_data->link[link_id].elems) + skb_put_data(skb, + add_links_data->link[link_id].elems + + extra_used, + add_links_data->link[link_id].elems_len - + extra_used); + if (sdata->u.mgd.flags & IEEE80211_STA_UAPSD_ENABLED) { + qos_info = sdata->u.mgd.uapsd_queues; + qos_info |= (sdata->u.mgd.uapsd_max_sp_len << + IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT); + } else { + qos_info = 0; + } + + ieee80211_add_wmm_info_ie(skb_put(skb, 9), qos_info); + put_unaligned_le16(capab, capab_pos); + } + + ieee80211_fragment_element(skb, subelem_len, + IEEE80211_MLE_SUBELEM_FRAGMENT); + } + + ieee80211_fragment_element(skb, ml_elem_len, WLAN_EID_FRAGMENT); + + info = IEEE80211_SKB_CB(skb); + info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + + return skb; +} + +int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata, + struct cfg80211_ml_reconf_req *req) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_mgd_assoc_data *data = NULL; + struct sta_info *sta; + struct sk_buff *skb; + u16 added_links, new_valid_links; + int link_id, err; + + if (!ieee80211_vif_is_mld(&sdata->vif) || + !(sdata->vif.cfg.mld_capa_op & + IEEE80211_MLD_CAP_OP_LINK_RECONF_SUPPORT)) + return -EINVAL; + + /* No support for concurrent ML reconfiguration operation */ + if (sdata->u.mgd.reconf.added_links || + sdata->u.mgd.reconf.removed_links) + return -EBUSY; + + added_links = 0; + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + if (!req->add_links[link_id].bss) + continue; + + added_links |= BIT(link_id); + } + + sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); + if (WARN_ON(!sta)) + return -ENOLINK; + + /* Adding links to the set of valid link is done only after a successful + * ML reconfiguration frame exchange. 
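+ * (That happens in ieee80211_process_ml_reconf_resp() above, and only
+ * for links the AP MLD reported as successfully added.)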
Here prepare the data for the ML + * reconfiguration frame construction and allocate the required + * resources + */ + if (added_links) { + bool uapsd_supported; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->assoc_link_id = -1; + data->wmm = true; + + uapsd_supported = true; + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; + link_id++) { + struct ieee80211_supported_band *sband; + struct cfg80211_bss *link_cbss = + req->add_links[link_id].bss; + struct ieee80211_bss *bss; + + if (!link_cbss) + continue; + + bss = (void *)link_cbss->priv; + + if (!bss->wmm_used) { + err = -EINVAL; + goto err_free; + } + + if (link_cbss->channel->band == NL80211_BAND_S1GHZ) { + err = -EINVAL; + goto err_free; + } + + eth_random_addr(data->link[link_id].addr); + data->link[link_id].conn = + ieee80211_conn_settings_unlimited; + sband = + local->hw.wiphy->bands[link_cbss->channel->band]; + + ieee80211_determine_our_sta_mode(sdata, sband, + NULL, true, link_id, + &data->link[link_id].conn); + + data->link[link_id].bss = link_cbss; + data->link[link_id].disabled = + req->add_links[link_id].disabled; + data->link[link_id].elems = + (u8 *)req->add_links[link_id].elems; + data->link[link_id].elems_len = + req->add_links[link_id].elems_len; + + if (!bss->uapsd_supported) + uapsd_supported = false; + + if (data->link[link_id].conn.mode < + IEEE80211_CONN_MODE_EHT) { + err = -EINVAL; + goto err_free; + } + + err = ieee80211_mgd_get_ap_ht_vht_capa(sdata, data, + link_id); + if (err) { + err = -EINVAL; + goto err_free; + } + } + + /* Require U-APSD support if we enabled it */ + if (sdata->u.mgd.flags & IEEE80211_STA_UAPSD_ENABLED && + !uapsd_supported) { + err = -EINVAL; + sdata_info(sdata, "U-APSD on but not available on (all) new links\n"); + goto err_free; + } + + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; + link_id++) { + if (!data->link[link_id].bss) + continue; + + /* only used to verify the mode, nothing is allocated */ + err = ieee80211_prep_channel(sdata, NULL, link_id, + data->link[link_id].bss, + true, + &data->link[link_id].conn, + sdata->u.mgd.userspace_selectors); + if (err) + goto err_free; + } + } + + /* link removal is done before the ML reconfiguration frame exchange so + * that these links will not be used between their removal by the AP MLD + * and before the station got the ML reconfiguration response. Based on + * Section 35.3.6.4 in Draft P802.11be_D7.0 the AP MLD should accept the + * link removal request. + */ + if (req->rem_links) { + u16 new_active_links = + sdata->vif.active_links & ~req->rem_links; + + new_valid_links = sdata->vif.valid_links & ~req->rem_links; + + /* Should not be left with no valid links to perform the + * ML reconfiguration + */ + if (!new_valid_links || + !(new_valid_links & ~sdata->vif.dormant_links)) { + sdata_info(sdata, "mlo: reconf: no valid links\n"); + err = -EINVAL; + goto err_free; + } + + if (new_active_links != sdata->vif.active_links) { + if (!new_active_links) + new_active_links = + BIT(__ffs(new_valid_links & + ~sdata->vif.dormant_links)); + + err = ieee80211_set_active_links(&sdata->vif, + new_active_links); + if (err) { + sdata_info(sdata, + "mlo: reconf: failed set active links\n"); + goto err_free; + } + } + } + + /* Build the SKB before the link removal as the construction of the + * station info for removed links requires the local address. 
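+ * (The per-STA profile of a removed link carries that link's local MAC
+ * address, taken from vif.link_conf[link_id]->addr, which disappears
+ * once the link set is shrunk, hence this ordering.)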
+ * Invalidate the removed links, so that the transmission of the ML + * reconfiguration request frame would not be done using them, as the AP + * is expected to send the ML reconfiguration response frame on the link + * on which the request was received. + */ + skb = ieee80211_build_ml_reconf_req(sdata, data, req->rem_links, + cpu_to_le16(req->ext_mld_capa_ops)); + if (!skb) { + err = -ENOMEM; + goto err_free; + } + + if (req->rem_links) { + u16 new_dormant_links = + sdata->vif.dormant_links & ~req->rem_links; + + err = ieee80211_vif_set_links(sdata, new_valid_links, + new_dormant_links); + if (err) { + sdata_info(sdata, + "mlo: reconf: failed set valid links\n"); + kfree_skb(skb); + goto err_free; + } + + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; + link_id++) { + if (!(req->rem_links & BIT(link_id))) + continue; + + ieee80211_sta_remove_link(sta, link_id); + } + + /* notify the driver and upper layers */ + ieee80211_vif_cfg_change_notify(sdata, + BSS_CHANGED_MLD_VALID_LINKS); + cfg80211_links_removed(sdata->dev, req->rem_links); + } + + sdata_info(sdata, "mlo: reconf: adding=0x%x, removed=0x%x\n", + added_links, req->rem_links); + + ieee80211_tx_skb(sdata, skb); + + sdata->u.mgd.reconf.added_links = added_links; + sdata->u.mgd.reconf.add_links_data = data; + sdata->u.mgd.reconf.removed_links = req->rem_links; + wiphy_delayed_work_queue(sdata->local->hw.wiphy, + &sdata->u.mgd.reconf.wk, + IEEE80211_ASSOC_TIMEOUT_SHORT); + return 0; + + err_free: + kfree(data); + return err; +} + +static bool ieee80211_mgd_epcs_supp(struct ieee80211_sub_if_data *sdata) +{ + unsigned long valid_links = sdata->vif.valid_links; + u8 link_id; + + lockdep_assert_wiphy(sdata->local->hw.wiphy); + + if (!ieee80211_vif_is_mld(&sdata->vif)) + return false; + + for_each_set_bit(link_id, &valid_links, IEEE80211_MLD_MAX_NUM_LINKS) { + struct ieee80211_bss_conf *bss_conf = + sdata_dereference(sdata->vif.link_conf[link_id], sdata); + + if (WARN_ON(!bss_conf) || !bss_conf->epcs_support) + return false; + } + + return true; +} + +int ieee80211_mgd_set_epcs(struct ieee80211_sub_if_data *sdata, bool enable) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_mgmt *mgmt; + struct sk_buff *skb; + int frame_len = offsetofend(struct ieee80211_mgmt, + u.action.u.epcs) + (enable ? 1 : 0); + + if (!ieee80211_mgd_epcs_supp(sdata)) + return -EINVAL; + + if (sdata->u.mgd.epcs.enabled == enable && + !sdata->u.mgd.epcs.dialog_token) + return 0; + + /* Do not allow enabling EPCS if the AP didn't respond yet. + * However, allow disabling EPCS in such a case. 
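+ * (Sketch of the resulting state machine: a successful enable request
+ * leaves a non-zero dialog_token, further enables return -EALREADY
+ * until the response clears it, and teardown is accepted at any time.)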
+ */ + if (sdata->u.mgd.epcs.dialog_token && enable) + return -EALREADY; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + frame_len); + if (!skb) + return -ENOBUFS; + + skb_reserve(skb, local->hw.extra_tx_headroom); + mgmt = skb_put_zero(skb, frame_len); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); + + mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; + if (enable) { + u8 *pos = mgmt->u.action.u.epcs.variable; + + mgmt->u.action.u.epcs.action_code = + WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_REQ; + + *pos = ++sdata->u.mgd.dialog_token_alloc; + sdata->u.mgd.epcs.dialog_token = *pos; + } else { + mgmt->u.action.u.epcs.action_code = + WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN; + + ieee80211_epcs_teardown(sdata); + ieee80211_epcs_changed(sdata, false); + } + + ieee80211_tx_skb(sdata, skb); + return 0; +} + +static void ieee80211_ml_epcs(struct ieee80211_sub_if_data *sdata, + struct ieee802_11_elems *elems) +{ + const struct element *sub; + size_t scratch_len = elems->ml_epcs_len; + u8 *scratch __free(kfree) = kzalloc(scratch_len, GFP_KERNEL); + + lockdep_assert_wiphy(sdata->local->hw.wiphy); + + if (!ieee80211_vif_is_mld(&sdata->vif) || !elems->ml_epcs) + return; + + if (WARN_ON(!scratch)) + return; + + /* Directly parse the sub elements as the common information doesn't + * hold any useful information. + */ + for_each_mle_subelement(sub, (const u8 *)elems->ml_epcs, + elems->ml_epcs_len) { + struct ieee80211_link_data *link; + struct ieee802_11_elems *link_elems __free(kfree); + u8 *pos = (void *)sub->data; + u16 control; + ssize_t len; + u8 link_id; + + if (sub->id != IEEE80211_MLE_SUBELEM_PER_STA_PROFILE) + continue; + + if (sub->datalen < sizeof(control)) + break; + + control = get_unaligned_le16(pos); + link_id = control & IEEE80211_MLE_STA_EPCS_CONTROL_LINK_ID; + + link = sdata_dereference(sdata->link[link_id], sdata); + if (!link) + continue; + + len = cfg80211_defragment_element(sub, (u8 *)elems->ml_epcs, + elems->ml_epcs_len, + scratch, scratch_len, + IEEE80211_MLE_SUBELEM_FRAGMENT); + if (len < (ssize_t)sizeof(control)) + continue; + + pos = scratch + sizeof(control); + len -= sizeof(control); + + link_elems = ieee802_11_parse_elems(pos, len, false, NULL); + if (!link_elems) + continue; + + if (ieee80211_sta_wmm_params(sdata->local, link, + link_elems->wmm_param, + link_elems->wmm_param_len, + link_elems->mu_edca_param_set)) + ieee80211_link_info_change_notify(sdata, link, + BSS_CHANGED_QOS); + } +} + +void ieee80211_process_epcs_ena_resp(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len) +{ + struct ieee802_11_elems *elems __free(kfree) = NULL; + size_t ies_len; + u16 status_code; + u8 *pos, dialog_token; + + if (!ieee80211_mgd_epcs_supp(sdata)) + return; + + /* Handle dialog token and status code */ + pos = mgmt->u.action.u.epcs.variable; + dialog_token = *pos; + status_code = get_unaligned_le16(pos + 1); + + /* An EPCS enable response with dialog token == 0 is an unsolicited + * notification from the AP MLD. 
In such a case, EPCS should already be + * enabled and status must be success + */ + if (!dialog_token && + (!sdata->u.mgd.epcs.enabled || + status_code != WLAN_STATUS_SUCCESS)) + return; + + if (sdata->u.mgd.epcs.dialog_token != dialog_token) + return; + + sdata->u.mgd.epcs.dialog_token = 0; + + if (status_code != WLAN_STATUS_SUCCESS) + return; + + pos += IEEE80211_EPCS_ENA_RESP_BODY_LEN; + ies_len = len - offsetof(struct ieee80211_mgmt, + u.action.u.epcs.variable) - + IEEE80211_EPCS_ENA_RESP_BODY_LEN; + + elems = ieee802_11_parse_elems(pos, ies_len, true, NULL); + if (!elems) + return; + + ieee80211_ml_epcs(sdata, elems); + ieee80211_epcs_changed(sdata, true); +} + +void ieee80211_process_epcs_teardown(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *mgmt, size_t len) +{ + if (!ieee80211_vif_is_mld(&sdata->vif) || + !sdata->u.mgd.epcs.enabled) + return; + + ieee80211_epcs_teardown(sdata); + ieee80211_epcs_changed(sdata, false); +} diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c index 6218abc3e441..ece1e83c7b2f 100644 --- a/net/mac80211/ocb.c +++ b/net/mac80211/ocb.c @@ -230,7 +230,7 @@ int ieee80211_ocb_leave(struct ieee80211_sub_if_data *sdata) skb_queue_purge(&sdata->skb_queue); - del_timer_sync(&sdata->u.ocb.housekeeping_timer); + timer_delete_sync(&sdata->u.ocb.housekeeping_timer); /* If the timer fired while we waited for it, it will have * requeued the work. Now the work will be running again * but will not rearm the timer again because it checks diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 29fab7ae47b4..2b9abc27462e 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -30,9 +30,9 @@ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata) /* FIXME: what to do when local->pspolling is true? */ - del_timer_sync(&local->dynamic_ps_timer); - del_timer_sync(&ifmgd->bcn_mon_timer); - del_timer_sync(&ifmgd->conn_mon_timer); + timer_delete_sync(&local->dynamic_ps_timer); + timer_delete_sync(&ifmgd->bcn_mon_timer); + timer_delete_sync(&ifmgd->conn_mon_timer); wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work); diff --git a/net/mac80211/parse.c b/net/mac80211/parse.c index 279c5143b335..96584b39215e 100644 --- a/net/mac80211/parse.c +++ b/net/mac80211/parse.c @@ -44,6 +44,12 @@ struct ieee80211_elems_parse { /* The reconfiguration Multi-Link element in the original elements */ const struct element *ml_reconf_elem; + /* The EPCS Multi-Link element in the original elements */ + const struct element *ml_epcs_elem; + + bool multi_link_inner; + bool skip_vendor; + /* * scratch buffer that can be used for various element parsing related * tasks, e.g., element de-fragmentation etc. 
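 * A minimal sketch (not part of the patch) of how that scratch area is
 * consumed; the helper name example_defrag_into_scratch is hypothetical,
 * but the fields and the cfg80211_defragment_element() call mirror the
 * ml_basic/ml_reconf/ml_epcs handling in this file:
 */
static void example_defrag_into_scratch(struct ieee80211_elems_parse *ep,
					const struct element *elem,
					const u8 **out, size_t *out_len)
{
	struct ieee802_11_elems *elems = &ep->elems;
	ssize_t len;

	/* reassemble the (possibly fragmented) element into free scratch */
	len = cfg80211_defragment_element(elem, elems->ie_start,
					  elems->total_len, ep->scratch_pos,
					  ep->scratch + ep->scratch_len -
						ep->scratch_pos,
					  WLAN_EID_FRAGMENT);
	if (len < 0)
		return;

	/* publish the defragmented copy and consume the scratch space */
	*out = ep->scratch_pos;
	*out_len = len;
	ep->scratch_pos += len;
}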
@@ -149,16 +155,18 @@ ieee80211_parse_extension_element(u32 *crc, switch (le16_get_bits(mle->control, IEEE80211_ML_CONTROL_TYPE)) { case IEEE80211_ML_CONTROL_TYPE_BASIC: - if (elems_parse->ml_basic_elem) { + if (elems_parse->multi_link_inner) { elems->parse_error |= IEEE80211_PARSE_ERR_DUP_NEST_ML_BASIC; break; } - elems_parse->ml_basic_elem = elem; break; case IEEE80211_ML_CONTROL_TYPE_RECONF: elems_parse->ml_reconf_elem = elem; break; + case IEEE80211_ML_CONTROL_TYPE_PRIO_ACCESS: + elems_parse->ml_epcs_elem = elem; + break; default: break; } @@ -393,6 +401,9 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params, IEEE80211_PARSE_ERR_BAD_ELEM_SIZE; break; case WLAN_EID_VENDOR_SPECIFIC: + if (elems_parse->skip_vendor) + break; + if (elen >= 4 && pos[0] == 0x00 && pos[1] == 0x50 && pos[2] == 0xf2) { /* Microsoft OUI (00:50:F2) */ @@ -860,21 +871,36 @@ ieee80211_mle_get_sta_prof(struct ieee80211_elems_parse *elems_parse, } } -static void ieee80211_mle_parse_link(struct ieee80211_elems_parse *elems_parse, - struct ieee80211_elems_parse_params *params) +static const struct element * +ieee80211_prep_mle_link_parse(struct ieee80211_elems_parse *elems_parse, + struct ieee80211_elems_parse_params *params, + struct ieee80211_elems_parse_params *sub) { struct ieee802_11_elems *elems = &elems_parse->elems; struct ieee80211_mle_per_sta_profile *prof; - struct ieee80211_elems_parse_params sub = { - .mode = params->mode, - .action = params->action, - .from_ap = params->from_ap, - .link_id = -1, - }; - ssize_t ml_len = elems->ml_basic_len; - const struct element *non_inherit = NULL; + const struct element *tmp; + ssize_t ml_len; const u8 *end; + if (params->mode < IEEE80211_CONN_MODE_EHT) + return NULL; + + for_each_element_extid(tmp, WLAN_EID_EXT_EHT_MULTI_LINK, + elems->ie_start, elems->total_len) { + const struct ieee80211_multi_link_elem *mle = + (void *)tmp->data + 1; + + if (!ieee80211_mle_size_ok(tmp->data + 1, tmp->datalen - 1)) + continue; + + if (le16_get_bits(mle->control, IEEE80211_ML_CONTROL_TYPE) != + IEEE80211_ML_CONTROL_TYPE_BASIC) + continue; + + elems_parse->ml_basic_elem = tmp; + break; + } + ml_len = cfg80211_defragment_element(elems_parse->ml_basic_elem, elems->ie_start, elems->total_len, @@ -885,26 +911,26 @@ static void ieee80211_mle_parse_link(struct ieee80211_elems_parse *elems_parse, WLAN_EID_FRAGMENT); if (ml_len < 0) - return; + return NULL; elems->ml_basic = (const void *)elems_parse->scratch_pos; elems->ml_basic_len = ml_len; elems_parse->scratch_pos += ml_len; if (params->link_id == -1) - return; + return NULL; ieee80211_mle_get_sta_prof(elems_parse, params->link_id); prof = elems->prof; if (!prof) - return; + return NULL; /* check if we have the 4 bytes for the fixed part in assoc response */ if (elems->sta_prof_len < sizeof(*prof) + prof->sta_info_len - 1 + 4) { elems->prof = NULL; elems->sta_prof_len = 0; - return; + return NULL; } /* @@ -913,13 +939,17 @@ static void ieee80211_mle_parse_link(struct ieee80211_elems_parse *elems_parse, * the -1 is because the 'sta_info_len' is accounted to as part of the * per-STA profile, but not part of the 'u8 variable[]' portion. 
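 * (Worked example: with sta_info_len == 7 the six remaining STA info
 * octets sit in variable[0..5], the four fixed association response
 * octets follow, and the inner elements start at variable + 7 - 1 + 4.)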
*/ - sub.start = prof->variable + prof->sta_info_len - 1 + 4; + sub->start = prof->variable + prof->sta_info_len - 1 + 4; end = (const u8 *)prof + elems->sta_prof_len; - sub.len = end - sub.start; + sub->len = end - sub->start; + + sub->mode = params->mode; + sub->action = params->action; + sub->from_ap = params->from_ap; + sub->link_id = -1; - non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, - sub.start, sub.len); - _ieee802_11_parse_elems_full(&sub, elems_parse, non_inherit); + return cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, + sub->start, sub->len); } static void @@ -943,18 +973,43 @@ ieee80211_mle_defrag_reconf(struct ieee80211_elems_parse *elems_parse) elems_parse->scratch_pos += ml_len; } +static void +ieee80211_mle_defrag_epcs(struct ieee80211_elems_parse *elems_parse) +{ + struct ieee802_11_elems *elems = &elems_parse->elems; + ssize_t ml_len; + + ml_len = cfg80211_defragment_element(elems_parse->ml_epcs_elem, + elems->ie_start, + elems->total_len, + elems_parse->scratch_pos, + elems_parse->scratch + + elems_parse->scratch_len - + elems_parse->scratch_pos, + WLAN_EID_FRAGMENT); + if (ml_len < 0) + return; + elems->ml_epcs = (void *)elems_parse->scratch_pos; + elems->ml_epcs_len = ml_len; + elems_parse->scratch_pos += ml_len; +} + struct ieee802_11_elems * ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) { + struct ieee80211_elems_parse_params sub = {}; struct ieee80211_elems_parse *elems_parse; - struct ieee802_11_elems *elems; const struct element *non_inherit = NULL; - u8 *nontransmitted_profile; - int nontransmitted_profile_len = 0; + struct ieee802_11_elems *elems; size_t scratch_len = 3 * params->len; + bool multi_link_inner = false; BUILD_BUG_ON(offsetof(typeof(*elems_parse), elems) != 0); + /* cannot parse for both a specific link and non-transmitted BSS */ + if (WARN_ON(params->link_id >= 0 && params->bss)) + return NULL; + elems_parse = kzalloc(struct_size(elems_parse, scratch, scratch_len), GFP_ATOMIC); if (!elems_parse) @@ -971,36 +1026,55 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) ieee80211_clear_tpe(&elems->tpe); ieee80211_clear_tpe(&elems->csa_tpe); - nontransmitted_profile = elems_parse->scratch_pos; - nontransmitted_profile_len = - ieee802_11_find_bssid_profile(params->start, params->len, - elems, params->bss, - nontransmitted_profile); - elems_parse->scratch_pos += nontransmitted_profile_len; - non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, - nontransmitted_profile, - nontransmitted_profile_len); + /* + * If we're looking for a non-transmitted BSS then we cannot at + * the same time be looking for a second link as the two can only + * appear in the same frame carrying info for different BSSes. + * + * In any case, we only look for one at a time, as encoded by + * the WARN_ON above. 
+ */ + if (params->bss) { + int nontx_len = + ieee802_11_find_bssid_profile(params->start, + params->len, + elems, params->bss, + elems_parse->scratch_pos); + sub.start = elems_parse->scratch_pos; + sub.mode = params->mode; + sub.len = nontx_len; + sub.action = params->action; + sub.link_id = params->link_id; + + /* consume the space used for non-transmitted profile */ + elems_parse->scratch_pos += nontx_len; + + non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, + sub.start, nontx_len); + } else { + /* must always parse to get elems_parse->ml_basic_elem */ + non_inherit = ieee80211_prep_mle_link_parse(elems_parse, params, + &sub); + multi_link_inner = true; + } + elems_parse->skip_vendor = + cfg80211_find_elem(WLAN_EID_VENDOR_SPECIFIC, + sub.start, sub.len); elems->crc = _ieee802_11_parse_elems_full(params, elems_parse, non_inherit); - /* Override with nontransmitted profile, if found */ - if (nontransmitted_profile_len) { - struct ieee80211_elems_parse_params sub = { - .mode = params->mode, - .start = nontransmitted_profile, - .len = nontransmitted_profile_len, - .action = params->action, - .link_id = params->link_id, - }; - + /* Override with nontransmitted/per-STA profile if found */ + if (sub.len) { + elems_parse->multi_link_inner = multi_link_inner; + elems_parse->skip_vendor = false; _ieee802_11_parse_elems_full(&sub, elems_parse, NULL); } - ieee80211_mle_parse_link(elems_parse, params); - ieee80211_mle_defrag_reconf(elems_parse); + ieee80211_mle_defrag_epcs(elems_parse); + if (elems->tim && !elems->parse_error) { const struct ieee80211_tim_ie *tim_ie = elems->tim; @@ -1027,7 +1101,6 @@ int ieee80211_parse_bitrates(enum nl80211_chan_width width, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates) { - u32 rate_flags = ieee80211_chanwidth_rate_flags(width); struct ieee80211_rate *br; int brate, rate, i, j, count = 0; @@ -1038,8 +1111,6 @@ int ieee80211_parse_bitrates(enum nl80211_chan_width width, for (j = 0; j < sband->n_bitrates; j++) { br = &sband->bitrates[j]; - if ((rate_flags & br->flags) != rate_flags) - continue; brate = DIV_ROUND_UP(br->bitrate, 5); if (brate == rate) { diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index 7be52345f218..a9cc832240a5 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -69,14 +69,14 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) flush_workqueue(local->workqueue); /* Don't try to run timers while suspended. */ - del_timer_sync(&local->sta_cleanup); + timer_delete_sync(&local->sta_cleanup); /* * Note that this particular timer doesn't need to be * restarted at resume. 
*/ wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work); - del_timer_sync(&local->dynamic_ps_timer); + timer_delete_sync(&local->dynamic_ps_timer); local->wowlan = wowlan; if (local->wowlan) { diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 0d056db9f81e..3cb2ad6d0b28 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -368,9 +368,8 @@ static void __rate_control_send_low(struct ieee80211_hw *hw, struct ieee80211_tx_info *info, u32 rate_mask) { + u32 rate_flags = 0; int i; - u32 rate_flags = - ieee80211_chandef_rate_flags(&hw->conf.chandef); if (sband->band == NL80211_BAND_S1GHZ) { info->control.rates[0].flags |= IEEE80211_TX_RC_S1G_MCS; @@ -778,14 +777,9 @@ static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN], u16 vht_mask[NL80211_VHT_NSS_MAX]) { - u32 i, flags; + u32 i; *mask = sdata->rc_rateidx_mask[sband->band]; - flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); - for (i = 0; i < sband->n_bitrates; i++) { - if ((flags & sband->bitrates[i].flags) != flags) - *mask &= ~BIT(i); - } if (*mask == (1 << sband->n_bitrates) - 1 && !sdata->rc_has_mcs_mask[sband->band] && @@ -990,8 +984,6 @@ int rate_control_set_rates(struct ieee80211_hw *hw, if (sta->uploaded) drv_sta_rate_tbl_update(hw_to_local(hw), sta->sdata, pubsta); - ieee80211_sta_set_expected_throughput(pubsta, sta_get_expected_throughput(sta)); - return 0; } EXPORT_SYMBOL(rate_control_set_rates); diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 706cbc99f718..f66910013218 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1873,16 +1873,13 @@ minstrel_ht_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta) static void minstrel_ht_fill_rate_array(u8 *dest, struct ieee80211_supported_band *sband, - const s16 *bitrates, int n_rates, u32 rate_flags) + const s16 *bitrates, int n_rates) { int i, j; for (i = 0; i < sband->n_bitrates; i++) { struct ieee80211_rate *rate = &sband->bitrates[i]; - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - continue; - for (j = 0; j < n_rates; j++) { if (rate->bitrate != bitrates[j]) continue; @@ -1898,7 +1895,6 @@ minstrel_ht_init_cck_rates(struct minstrel_priv *mp) { static const s16 bitrates[4] = { 10, 20, 55, 110 }; struct ieee80211_supported_band *sband; - u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); memset(mp->cck_rates, 0xff, sizeof(mp->cck_rates)); sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ]; @@ -1908,8 +1904,7 @@ minstrel_ht_init_cck_rates(struct minstrel_priv *mp) BUILD_BUG_ON(ARRAY_SIZE(mp->cck_rates) != ARRAY_SIZE(bitrates)); minstrel_ht_fill_rate_array(mp->cck_rates, sband, minstrel_cck_bitrates, - ARRAY_SIZE(minstrel_cck_bitrates), - rate_flags); + ARRAY_SIZE(minstrel_cck_bitrates)); } static void @@ -1917,7 +1912,6 @@ minstrel_ht_init_ofdm_rates(struct minstrel_priv *mp, enum nl80211_band band) { static const s16 bitrates[8] = { 60, 90, 120, 180, 240, 360, 480, 540 }; struct ieee80211_supported_band *sband; - u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); memset(mp->ofdm_rates[band], 0xff, sizeof(mp->ofdm_rates[band])); sband = mp->hw->wiphy->bands[band]; @@ -1927,8 +1921,7 @@ minstrel_ht_init_ofdm_rates(struct minstrel_priv *mp, enum nl80211_band band) BUILD_BUG_ON(ARRAY_SIZE(mp->ofdm_rates[band]) != ARRAY_SIZE(bitrates)); minstrel_ht_fill_rate_array(mp->ofdm_rates[band], sband, minstrel_ofdm_bitrates, - 
ARRAY_SIZE(minstrel_ofdm_bitrates), - rate_flags); + ARRAY_SIZE(minstrel_ofdm_bitrates)); } static void * diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 2bec18fc1b03..09beb65d6108 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -6,7 +6,7 @@ * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ #include <linux/jiffies.h> @@ -1045,14 +1045,14 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) if (is_multicast_ether_addr(hdr->addr1)) { if (ieee80211_has_tods(hdr->frame_control) || !ieee80211_has_fromds(hdr->frame_control)) - return RX_DROP_MONITOR; + return RX_DROP; if (ether_addr_equal(hdr->addr3, dev_addr)) - return RX_DROP_MONITOR; + return RX_DROP; } else { if (!ieee80211_has_a4(hdr->frame_control)) - return RX_DROP_MONITOR; + return RX_DROP; if (ether_addr_equal(hdr->addr4, dev_addr)) - return RX_DROP_MONITOR; + return RX_DROP; } } @@ -1064,20 +1064,20 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) struct ieee80211_mgmt *mgmt; if (!ieee80211_is_mgmt(hdr->frame_control)) - return RX_DROP_MONITOR; + return RX_DROP; if (ieee80211_is_action(hdr->frame_control)) { u8 category; /* make sure category field is present */ if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE) - return RX_DROP_MONITOR; + return RX_DROP; mgmt = (struct ieee80211_mgmt *)hdr; category = mgmt->u.action.category; if (category != WLAN_CATEGORY_MESH_ACTION && category != WLAN_CATEGORY_SELF_PROTECTED) - return RX_DROP_MONITOR; + return RX_DROP; return RX_CONTINUE; } @@ -1087,7 +1087,7 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) ieee80211_is_auth(hdr->frame_control)) return RX_CONTINUE; - return RX_DROP_MONITOR; + return RX_DROP; } return RX_CONTINUE; @@ -1242,7 +1242,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, tid_agg_rx->reorder_time[j] + 1 + HT_RX_REORDER_BUF_TIMEOUT); } else { - del_timer(&tid_agg_rx->reorder_timer); + timer_delete(&tid_agg_rx->reorder_timer); } } @@ -1513,7 +1513,7 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx) hdrlen = ieee80211_hdrlen(hdr->frame_control); if (rx->skb->len < hdrlen + 8) - return RX_DROP_MONITOR; + return RX_DROP; skb_copy_bits(rx->skb, hdrlen + 6, &ethertype, 2); if (ethertype == rx->sdata->control_port_protocol) @@ -1526,7 +1526,7 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx) GFP_ATOMIC)) return RX_DROP_U_SPURIOUS; - return RX_DROP_MONITOR; + return RX_DROP; } return RX_CONTINUE; @@ -1862,7 +1862,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) cfg80211_rx_unexpected_4addr_frame( rx->sdata->dev, sta->sta.addr, GFP_ATOMIC); - return RX_DROP_M_UNEXPECTED_4ADDR_FRAME; + return RX_DROP_U_UNEXPECTED_4ADDR_FRAME; } /* * Update counter and free packet here to avoid @@ -1997,7 +1997,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, skb->data, skb->len); - return RX_DROP_M_BAD_BCN_KEYIDX; + return RX_DROP_U_BAD_BCN_KEYIDX; } rx->key = ieee80211_rx_get_bigtk(rx, mmie_keyidx); @@ -2011,11 +2011,11 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (mmie_keyidx < NUM_DEFAULT_KEYS || mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) - return RX_DROP_M_BAD_MGMT_KEYIDX; /* unexpected BIP keyidx */ + return RX_DROP_U_BAD_MGMT_KEYIDX; /*
unexpected BIP keyidx */ if (rx->link_sta) { if (ieee80211_is_group_privacy_action(skb) && test_sta_flag(rx->sta, WLAN_STA_MFP)) - return RX_DROP_MONITOR; + return RX_DROP; rx->key = rcu_dereference(rx->link_sta->gtk[mmie_keyidx]); } @@ -2100,11 +2100,11 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (rx->key) { if (unlikely(rx->key->flags & KEY_FLAG_TAINTED)) - return RX_DROP_MONITOR; + return RX_DROP; /* TODO: add threshold stuff again */ } else { - return RX_DROP_MONITOR; + return RX_DROP; } switch (rx->key->conf.cipher) { @@ -2278,7 +2278,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) goto out; if (is_multicast_ether_addr(hdr->addr1)) - return RX_DROP_MONITOR; + return RX_DROP; I802_DEBUG_INC(rx->local->rx_handlers_fragments); @@ -2333,7 +2333,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) rx->seqno_idx, hdr); if (!entry) { I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag); - return RX_DROP_MONITOR; + return RX_DROP; } /* "The receiver shall discard MSDUs and MMPDUs whose constituent @@ -2855,25 +2855,25 @@ ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta return RX_CONTINUE; if (!pskb_may_pull(skb, sizeof(*eth) + 6)) - return RX_DROP_MONITOR; + return RX_DROP; mesh_hdr = (struct ieee80211s_hdr *)(skb->data + sizeof(*eth)); mesh_hdrlen = ieee80211_get_mesh_hdrlen(mesh_hdr); if (!pskb_may_pull(skb, sizeof(*eth) + mesh_hdrlen)) - return RX_DROP_MONITOR; + return RX_DROP; eth = (struct ethhdr *)skb->data; multicast = is_multicast_ether_addr(eth->h_dest); mesh_hdr = (struct ieee80211s_hdr *)(eth + 1); if (!mesh_hdr->ttl) - return RX_DROP_MONITOR; + return RX_DROP; /* frame is in RMC, don't forward */ if (is_multicast_ether_addr(eth->h_dest) && mesh_rmc_check(sdata, eth->h_source, mesh_hdr)) - return RX_DROP_MONITOR; + return RX_DROP; /* forward packet */ if (sdata->crypto_tx_tailroom_needed_cnt) @@ -2890,7 +2890,7 @@ ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta /* has_a4 already checked in ieee80211_rx_mesh_check */ proxied_addr = mesh_hdr->eaddr2; else - return RX_DROP_MONITOR; + return RX_DROP; rcu_read_lock(); mppath = mpp_path_lookup(sdata, proxied_addr); @@ -2922,14 +2922,14 @@ ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta goto rx_accept; IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_ttl); - return RX_DROP_MONITOR; + return RX_DROP; } if (!ifmsh->mshcfg.dot11MeshForwarding) { if (is_multicast_ether_addr(eth->h_dest)) goto rx_accept; - return RX_DROP_MONITOR; + return RX_DROP; } skb_set_queue_mapping(skb, ieee802_1d_to_ac[skb->priority]); @@ -3001,6 +3001,7 @@ ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta } IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, fwded_frames); + ieee80211_set_qos_hdr(sdata, fwd_skb); ieee80211_add_pending_skb(local, fwd_skb); rx_accept: @@ -3034,8 +3035,7 @@ __ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx, u8 data_offset) check_da = NULL; break; case NL80211_IFTYPE_STATION: - if (!rx->sta || - !test_sta_flag(rx->sta, WLAN_STA_TDLS_PEER)) + if (!test_sta_flag(rx->sta, WLAN_STA_TDLS_PEER)) check_sa = NULL; break; case NL80211_IFTYPE_MESH_POINT: @@ -3122,7 +3122,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (unlikely(!ieee80211_is_data_present(fc))) - return RX_DROP_MONITOR; + return RX_DROP; if (unlikely(ieee80211_has_a4(hdr->frame_control))) { switch (rx->sdata->vif.type) { @@ -3179,19 +3179,16 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) return 
RX_CONTINUE; if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) - return RX_DROP_MONITOR; + return RX_DROP; - /* - * Send unexpected-4addr-frame event to hostapd. For older versions, - * also drop the frame to cooked monitor interfaces. - */ + /* Send unexpected-4addr-frame event to hostapd */ if (ieee80211_has_a4(hdr->frame_control) && sdata->vif.type == NL80211_IFTYPE_AP) { if (rx->sta && !test_and_set_sta_flag(rx->sta, WLAN_STA_4ADDR_EVENT)) cfg80211_rx_unexpected_4addr_frame( rx->sdata->dev, rx->sta->sta.addr, GFP_ATOMIC); - return RX_DROP_MONITOR; + return RX_DROP; } res = __ieee80211_data_to_8023(rx, &port_control); @@ -3203,7 +3200,7 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) return res; if (!ieee80211_frame_allowed(rx, fc)) - return RX_DROP_MONITOR; + return RX_DROP; /* directly handle TDLS channel switch requests/responses */ if (unlikely(((struct ethhdr *)rx->skb->data)->h_proto == @@ -3268,11 +3265,11 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) }; if (!rx->sta) - return RX_DROP_MONITOR; + return RX_DROP; if (skb_copy_bits(skb, offsetof(struct ieee80211_bar, control), &bar_data, sizeof(bar_data))) - return RX_DROP_MONITOR; + return RX_DROP; tid = le16_to_cpu(bar_data.control) >> 12; @@ -3284,7 +3281,7 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) tid_agg_rx = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]); if (!tid_agg_rx) - return RX_DROP_MONITOR; + return RX_DROP; start_seq_num = le16_to_cpu(bar_data.start_seq_num) >> 4; event.u.ba.tid = tid; @@ -3308,12 +3305,7 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) return RX_QUEUED; } - /* - * After this point, we only want management frames, - * so we can drop all remaining control frames to - * cooked monitor interfaces. - */ - return RX_DROP_MONITOR; + return RX_DROP; } static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata, @@ -3329,8 +3321,8 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata, return; } - if (!ether_addr_equal(mgmt->sa, sdata->deflink.u.mgd.bssid) || - !ether_addr_equal(mgmt->bssid, sdata->deflink.u.mgd.bssid)) { + if (!ether_addr_equal(mgmt->sa, sdata->vif.cfg.ap_addr) || + !ether_addr_equal(mgmt->bssid, sdata->vif.cfg.ap_addr)) { /* Not from the current AP or not associated yet. */ return; } @@ -3346,9 +3338,9 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata, skb_reserve(skb, local->hw.extra_tx_headroom); resp = skb_put_zero(skb, 24); - memcpy(resp->da, mgmt->sa, ETH_ALEN); + memcpy(resp->da, sdata->vif.cfg.ap_addr, ETH_ALEN); memcpy(resp->sa, sdata->vif.addr, ETH_ALEN); - memcpy(resp->bssid, sdata->deflink.u.mgd.bssid, ETH_ALEN); + memcpy(resp->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); resp->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION); skb_put(skb, 1 + sizeof(resp->u.action.u.sa_query)); @@ -3422,10 +3414,10 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) * and unknown (reserved) frames are useless. 
*/ if (rx->skb->len < 24) - return RX_DROP_MONITOR; + return RX_DROP; if (!ieee80211_is_mgmt(mgmt->frame_control)) - return RX_DROP_MONITOR; + return RX_DROP; /* drop too small action frames */ if (ieee80211_is_action(mgmt->frame_control) && @@ -3819,6 +3811,43 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) u.action.u.ttlm_res)) goto invalid; goto queue; + case WLAN_PROTECTED_EHT_ACTION_TTLM_TEARDOWN: + if (sdata->vif.type != NL80211_IFTYPE_STATION) + break; + + if (len < offsetofend(typeof(*mgmt), + u.action.u.ttlm_tear_down)) + goto invalid; + goto queue; + case WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_RESP: + if (sdata->vif.type != NL80211_IFTYPE_STATION) + break; + + /* The reconfiguration response action frame must contain + * at least one 'Status Duple' entry (3 octets) + */ + if (len < + offsetofend(typeof(*mgmt), + u.action.u.ml_reconf_resp) + 3) + goto invalid; + goto queue; + case WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_RESP: + if (sdata->vif.type != NL80211_IFTYPE_STATION) + break; + + if (len < offsetofend(typeof(*mgmt), + u.action.u.epcs) + + IEEE80211_EPCS_ENA_RESP_BODY_LEN) + goto invalid; + goto queue; + case WLAN_PROTECTED_EHT_ACTION_EPCS_ENABLE_TEARDOWN: + if (sdata->vif.type != NL80211_IFTYPE_STATION) + break; + + if (len < offsetofend(typeof(*mgmt), + u.action.u.epcs)) + goto invalid; + goto queue; default: break; } @@ -3939,17 +3968,16 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) * ones. For all other modes we will return them to the sender, * setting the 0x80 bit in the action category, as required by * 802.11-2012 9.24.4. - * Newer versions of hostapd shall also use the management frame - * registration mechanisms, but older ones still use cooked - * monitor interfaces so push all frames there. + * Newer versions of hostapd use the management frame registration + * mechanisms, and the old cooked monitor interface is no longer supported. 
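+ * (Such rejected frames are recognizable by bit 0x80 already being set
+ * in the category field, which is why the code below refuses to bounce
+ * them a second time.)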
*/ if (!(status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM) && (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN)) - return RX_DROP_MONITOR; + return RX_DROP; if (is_multicast_ether_addr(mgmt->da)) - return RX_DROP_MONITOR; + return RX_DROP; /* do not return rejected action frames */ if (mgmt->u.action.category & 0x80) @@ -3994,7 +4022,7 @@ ieee80211_rx_h_ext(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (sdata->vif.type != NL80211_IFTYPE_STATION) - return RX_DROP_MONITOR; + return RX_DROP; /* for now only beacons are ext, so queue them */ ieee80211_queue_skb_to_iface(sdata, rx->link_id, rx->sta, rx->skb); @@ -4015,7 +4043,7 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_OCB && sdata->vif.type != NL80211_IFTYPE_STATION) - return RX_DROP_MONITOR; + return RX_DROP; switch (stype) { case cpu_to_le16(IEEE80211_STYPE_AUTH): @@ -4026,32 +4054,32 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) case cpu_to_le16(IEEE80211_STYPE_DEAUTH): if (is_multicast_ether_addr(mgmt->da) && !is_broadcast_ether_addr(mgmt->da)) - return RX_DROP_MONITOR; + return RX_DROP; /* process only for station/IBSS */ if (sdata->vif.type != NL80211_IFTYPE_STATION && sdata->vif.type != NL80211_IFTYPE_ADHOC) - return RX_DROP_MONITOR; + return RX_DROP; break; case cpu_to_le16(IEEE80211_STYPE_ASSOC_RESP): case cpu_to_le16(IEEE80211_STYPE_REASSOC_RESP): case cpu_to_le16(IEEE80211_STYPE_DISASSOC): if (is_multicast_ether_addr(mgmt->da) && !is_broadcast_ether_addr(mgmt->da)) - return RX_DROP_MONITOR; + return RX_DROP; /* process only for station */ if (sdata->vif.type != NL80211_IFTYPE_STATION) - return RX_DROP_MONITOR; + return RX_DROP; break; case cpu_to_le16(IEEE80211_STYPE_PROBE_REQ): /* process only for ibss and mesh */ if (sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_MESH_POINT) - return RX_DROP_MONITOR; + return RX_DROP; break; default: - return RX_DROP_MONITOR; + return RX_DROP; } ieee80211_queue_skb_to_iface(sdata, rx->link_id, rx->sta, rx->skb); @@ -4059,82 +4087,9 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) return RX_QUEUED; } -static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, - struct ieee80211_rate *rate, - ieee80211_rx_result reason) -{ - struct ieee80211_sub_if_data *sdata; - struct ieee80211_local *local = rx->local; - struct sk_buff *skb = rx->skb, *skb2; - struct net_device *prev_dev = NULL; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - int needed_headroom; - - /* - * If cooked monitor has been processed already, then - * don't do it again. If not, set the flag. 
- */ - if (rx->flags & IEEE80211_RX_CMNTR) - goto out_free_skb; - rx->flags |= IEEE80211_RX_CMNTR; - - /* If there are no cooked monitor interfaces, just free the SKB */ - if (!local->cooked_mntrs) - goto out_free_skb; - - /* room for the radiotap header based on driver features */ - needed_headroom = ieee80211_rx_radiotap_hdrlen(local, status, skb); - - if (skb_headroom(skb) < needed_headroom && - pskb_expand_head(skb, needed_headroom, 0, GFP_ATOMIC)) - goto out_free_skb; - - /* prepend radiotap information */ - ieee80211_add_rx_radiotap_header(local, skb, rate, needed_headroom, - false); - - skb_reset_mac_header(skb); - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->pkt_type = PACKET_OTHERHOST; - skb->protocol = htons(ETH_P_802_2); - - list_for_each_entry_rcu(sdata, &local->interfaces, list) { - if (!ieee80211_sdata_running(sdata)) - continue; - - if (sdata->vif.type != NL80211_IFTYPE_MONITOR || - !(sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES)) - continue; - - if (prev_dev) { - skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2) { - skb2->dev = prev_dev; - netif_receive_skb(skb2); - } - } - - prev_dev = sdata->dev; - dev_sw_netstats_rx_add(sdata->dev, skb->len); - } - - if (prev_dev) { - skb->dev = prev_dev; - netif_receive_skb(skb); - return; - } - - out_free_skb: - kfree_skb_reason(skb, (__force u32)reason); -} - static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, ieee80211_rx_result res) { - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); - struct ieee80211_supported_band *sband; - struct ieee80211_rate *rate = NULL; - if (res == RX_QUEUED) { I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued); return; @@ -4146,23 +4101,13 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, rx->link_sta->rx_stats.dropped++; } - if (u32_get_bits((__force u32)res, SKB_DROP_REASON_SUBSYS_MASK) == - SKB_DROP_REASON_SUBSYS_MAC80211_UNUSABLE) { - kfree_skb_reason(rx->skb, (__force u32)res); - return; - } - - sband = rx->local->hw.wiphy->bands[status->band]; - if (status->encoding == RX_ENC_LEGACY) - rate = &sband->bitrates[status->rate_idx]; - - ieee80211_rx_cooked_monitor(rx, rate, res); + kfree_skb_reason(rx->skb, (__force u32)res); } static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) { - ieee80211_rx_result res = RX_DROP_MONITOR; + ieee80211_rx_result res = RX_DROP; struct sk_buff *skb; #define CALL_RXH(rxh) \ @@ -4226,7 +4171,7 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx) { struct sk_buff_head reorder_release; - ieee80211_rx_result res = RX_DROP_MONITOR; + ieee80211_rx_result res = RX_DROP; __skb_queue_head_init(&reorder_release); @@ -4562,7 +4507,9 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) return ieee80211_is_public_action(hdr, skb->len) || ieee80211_is_probe_req(hdr->frame_control) || ieee80211_is_probe_resp(hdr->frame_control) || - ieee80211_is_beacon(hdr->frame_control); + ieee80211_is_beacon(hdr->frame_control) || + (ieee80211_is_auth(hdr->frame_control) && + ether_addr_equal(sdata->vif.addr, hdr->addr1)); case NL80211_IFTYPE_NAN: /* Currently no frames on NAN interface are allowed */ return false; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index cb7079071885..cd8385ecafd9 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -240,6 +240,9 @@ static bool ieee80211_scan_accept_presp(struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, u32 
scan_flags, const u8 *da) { + struct ieee80211_link_data *link_sdata; + u8 link_id; + if (!sdata) return false; @@ -251,7 +254,20 @@ static bool ieee80211_scan_accept_presp(struct ieee80211_sub_if_data *sdata, if (scan_flags & NL80211_SCAN_FLAG_RANDOM_ADDR) return true; - return ether_addr_equal(da, sdata->vif.addr); + + if (ether_addr_equal(da, sdata->vif.addr)) + return true; + + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + link_sdata = rcu_dereference(sdata->link[link_id]); + if (!link_sdata) + continue; + + if (ether_addr_equal(da, link_sdata->conf->addr)) + return true; + } + + return false; } void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) @@ -260,6 +276,7 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) struct ieee80211_mgmt *mgmt = (void *)skb->data; struct ieee80211_bss *bss; struct ieee80211_channel *channel; + struct ieee80211_ext *ext; size_t min_hdr_len = offsetof(struct ieee80211_mgmt, u.probe_resp.variable); @@ -269,12 +286,10 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) return; if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { - if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) - min_hdr_len = offsetof(struct ieee80211_ext, - u.s1g_short_beacon.variable); - else - min_hdr_len = offsetof(struct ieee80211_ext, - u.s1g_beacon); + ext = (struct ieee80211_ext *)mgmt; + min_hdr_len = + offsetof(struct ieee80211_ext, u.s1g_beacon.variable) + + ieee80211_s1g_optional_len(ext->frame_control); } if (skb->len < min_hdr_len) diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index c6015cd00372..7422888d3640 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -147,14 +147,14 @@ validate_chandef_by_6ghz_he_eht_oper(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; u32 control_freq, center_freq1, center_freq2; enum nl80211_chan_width chan_width; - struct { - struct ieee80211_he_operation _oper; - struct ieee80211_he_6ghz_oper _6ghz_oper; - } __packed he; - struct { - struct ieee80211_eht_operation _oper; - struct ieee80211_eht_operation_info _oper_info; - } __packed eht; + DEFINE_RAW_FLEX(struct ieee80211_he_operation, he, optional, + sizeof(struct ieee80211_he_6ghz_oper)); + struct ieee80211_he_6ghz_oper *_6ghz_oper = + (struct ieee80211_he_6ghz_oper *)he->optional; + DEFINE_RAW_FLEX(struct ieee80211_eht_operation, eht, optional, + sizeof(struct ieee80211_eht_operation_info)); + struct ieee80211_eht_operation_info *_oper_info = + (struct ieee80211_eht_operation_info *)eht->optional; const struct ieee80211_eht_operation *eht_oper; if (conn->mode < IEEE80211_CONN_MODE_HE) { @@ -167,38 +167,38 @@ validate_chandef_by_6ghz_he_eht_oper(struct ieee80211_sub_if_data *sdata, center_freq2 = chandef->center_freq2; chan_width = chandef->width; - he._oper.he_oper_params = + he->he_oper_params = le32_encode_bits(1, IEEE80211_HE_OPERATION_6GHZ_OP_INFO); - he._6ghz_oper.primary = + _6ghz_oper->primary = ieee80211_frequency_to_channel(control_freq); - he._6ghz_oper.ccfs0 = ieee80211_frequency_to_channel(center_freq1); - he._6ghz_oper.ccfs1 = center_freq2 ? + _6ghz_oper->ccfs0 = ieee80211_frequency_to_channel(center_freq1); + _6ghz_oper->ccfs1 = center_freq2 ? ieee80211_frequency_to_channel(center_freq2) : 0; switch (chan_width) { case NL80211_CHAN_WIDTH_320: - he._6ghz_oper.ccfs1 = he._6ghz_oper.ccfs0; - he._6ghz_oper.ccfs0 += control_freq < center_freq1 ? 
-16 : 16; - he._6ghz_oper.control = IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ; + _6ghz_oper->ccfs1 = _6ghz_oper->ccfs0; + _6ghz_oper->ccfs0 += control_freq < center_freq1 ? -16 : 16; + _6ghz_oper->control = IEEE80211_EHT_OPER_CHAN_WIDTH_320MHZ; break; case NL80211_CHAN_WIDTH_160: - he._6ghz_oper.ccfs1 = he._6ghz_oper.ccfs0; - he._6ghz_oper.ccfs0 += control_freq < center_freq1 ? -8 : 8; + _6ghz_oper->ccfs1 = _6ghz_oper->ccfs0; + _6ghz_oper->ccfs0 += control_freq < center_freq1 ? -8 : 8; fallthrough; case NL80211_CHAN_WIDTH_80P80: - he._6ghz_oper.control = + _6ghz_oper->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; break; case NL80211_CHAN_WIDTH_80: - he._6ghz_oper.control = + _6ghz_oper->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ; break; case NL80211_CHAN_WIDTH_40: - he._6ghz_oper.control = + _6ghz_oper->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ; break; default: - he._6ghz_oper.control = + _6ghz_oper->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ; break; } @@ -206,15 +206,14 @@ validate_chandef_by_6ghz_he_eht_oper(struct ieee80211_sub_if_data *sdata, if (conn->mode < IEEE80211_CONN_MODE_EHT) { eht_oper = NULL; } else { - eht._oper.params = IEEE80211_EHT_OPER_INFO_PRESENT; - eht._oper_info.control = he._6ghz_oper.control; - eht._oper_info.ccfs0 = he._6ghz_oper.ccfs0; - eht._oper_info.ccfs1 = he._6ghz_oper.ccfs1; - eht_oper = &eht._oper; + eht->params = IEEE80211_EHT_OPER_INFO_PRESENT; + _oper_info->control = _6ghz_oper->control; + _oper_info->ccfs0 = _6ghz_oper->ccfs0; + _oper_info->ccfs1 = _6ghz_oper->ccfs1; + eht_oper = eht; } - if (!ieee80211_chandef_he_6ghz_oper(local, &he._oper, - eht_oper, chandef)) + if (!ieee80211_chandef_he_6ghz_oper(local, he, eht_oper, chandef)) chandef->chan = NULL; } diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index aa22f09e6d14..84b18be1f0b1 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation */ #include <linux/module.h> @@ -18,7 +18,6 @@ #include <linux/timer.h> #include <linux/rtnetlink.h> -#include <net/codel.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" @@ -509,6 +508,24 @@ static int sta_info_alloc_link(struct ieee80211_local *local, for (i = 0; i < ARRAY_SIZE(link_info->rx_stats_avg.chain_signal); i++) ewma_signal_init(&link_info->rx_stats_avg.chain_signal[i]); + link_info->rx_omi_bw_rx = IEEE80211_STA_RX_BW_MAX; + link_info->rx_omi_bw_tx = IEEE80211_STA_RX_BW_MAX; + link_info->rx_omi_bw_staging = IEEE80211_STA_RX_BW_MAX; + + /* + * Cause (a) warning(s) if IEEE80211_STA_RX_BW_MAX != 320 + * or if new values are added to the enum. 
+ */ + switch (link_info->cur_max_bandwidth) { + case IEEE80211_STA_RX_BW_20: + case IEEE80211_STA_RX_BW_40: + case IEEE80211_STA_RX_BW_80: + case IEEE80211_STA_RX_BW_160: + case IEEE80211_STA_RX_BW_MAX: + /* intentionally nothing */ + break; + } + return 0; } @@ -683,12 +700,6 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata, } } - sta->cparams.ce_threshold = CODEL_DISABLED_THRESHOLD; - sta->cparams.target = MS2TIME(20); - sta->cparams.interval = MS2TIME(100); - sta->cparams.ecn = true; - sta->cparams.ce_threshold_selector = 0; - sta->cparams.ce_threshold_mask = 0; sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr); @@ -1317,9 +1328,13 @@ static int _sta_info_move_state(struct sta_info *sta, sta->sta.addr, new_state); /* notify the driver before the actual changes so it can - * fail the transition + * fail the transition if the state is increasing. + * The driver is required not to fail when the transition + * is decreasing the state, so first, do all the preparation + * work and only then, notify the driver. */ - if (test_sta_flag(sta, WLAN_STA_INSERTED)) { + if (new_state > sta->sta_state && + test_sta_flag(sta, WLAN_STA_INSERTED)) { int err = drv_sta_state(sta->local, sta->sdata, sta, sta->sta_state, new_state); if (err) @@ -1395,6 +1410,16 @@ static int _sta_info_move_state(struct sta_info *sta, break; } + if (new_state < sta->sta_state && + test_sta_flag(sta, WLAN_STA_INSERTED)) { + int err = drv_sta_state(sta->local, sta->sdata, sta, + sta->sta_state, new_state); + + WARN_ONCE(err, + "Driver is not allowed to fail if the sta_state is transitioning down the list: %d\n", + err); + } + sta->sta_state = new_state; return 0; @@ -1560,14 +1585,14 @@ int sta_info_init(struct ieee80211_local *local) void sta_info_stop(struct ieee80211_local *local) { - del_timer_sync(&local->sta_cleanup); + timer_delete_sync(&local->sta_cleanup); rhltable_destroy(&local->sta_hash); rhltable_destroy(&local->link_sta_hash); } int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans, - int link_id) + int link_id, struct sta_info *do_not_flush_sta) { struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; @@ -1585,6 +1610,9 @@ int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans, (!vlans || sdata->bss != sta->sdata->bss)) continue; + if (sta == do_not_flush_sta) + continue; + if (link_id >= 0 && sta->sta.valid_links && !(sta->sta.valid_links & BIT(link_id))) continue; @@ -2563,6 +2591,39 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) return value; } +#ifdef CONFIG_MAC80211_MESH +static void sta_set_mesh_sinfo(struct sta_info *sta, + struct station_info *sinfo) +{ + struct ieee80211_local *local = sta->sdata->local; + + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_LLID) | + BIT_ULL(NL80211_STA_INFO_PLID) | + BIT_ULL(NL80211_STA_INFO_PLINK_STATE) | + BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | + BIT_ULL(NL80211_STA_INFO_PEER_PM) | + BIT_ULL(NL80211_STA_INFO_NONPEER_PM) | + BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE) | + BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_AS); + + sinfo->llid = sta->mesh->llid; + sinfo->plid = sta->mesh->plid; + sinfo->plink_state = sta->mesh->plink_state; + if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_T_OFFSET); + sinfo->t_offset = sta->mesh->t_offset; + } + sinfo->local_pm = sta->mesh->local_pm; + sinfo->peer_pm = sta->mesh->peer_pm; + sinfo->nonpeer_pm = sta->mesh->nonpeer_pm; + sinfo->connected_to_gate = sta->mesh->connected_to_gate; + sinfo->connected_to_as = 
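The value-less switch over link_info->cur_max_bandwidth above is a compile-time guard: with every enumerator listed and no default arm, -Wswitch makes the compiler warn as soon as a new enum ieee80211_sta_rx_bandwidth value appears without this code being updated. The same pattern in isolation, with a hypothetical enum:

enum demo_bw { DEMO_BW_20, DEMO_BW_40, DEMO_BW_MAX };

static void demo_check(enum demo_bw bw)
{
	/* intentionally no default: adding an enumerator to demo_bw
	 * without extending this list triggers a -Wswitch warning */
	switch (bw) {
	case DEMO_BW_20:
	case DEMO_BW_40:
	case DEMO_BW_MAX:
		break;
	}
}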
sta->mesh->connected_to_as; + + sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_LINK_METRIC); + sinfo->airtime_link_metric = airtime_link_metric_get(local, sta); +} +#endif + void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, bool tidstats) { @@ -2747,31 +2808,10 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sta_set_tidstats(sta, &sinfo->pertid[i], i); } - if (ieee80211_vif_is_mesh(&sdata->vif)) { #ifdef CONFIG_MAC80211_MESH - sinfo->filled |= BIT_ULL(NL80211_STA_INFO_LLID) | - BIT_ULL(NL80211_STA_INFO_PLID) | - BIT_ULL(NL80211_STA_INFO_PLINK_STATE) | - BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | - BIT_ULL(NL80211_STA_INFO_PEER_PM) | - BIT_ULL(NL80211_STA_INFO_NONPEER_PM) | - BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE) | - BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_AS); - - sinfo->llid = sta->mesh->llid; - sinfo->plid = sta->mesh->plid; - sinfo->plink_state = sta->mesh->plink_state; - if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { - sinfo->filled |= BIT_ULL(NL80211_STA_INFO_T_OFFSET); - sinfo->t_offset = sta->mesh->t_offset; - } - sinfo->local_pm = sta->mesh->local_pm; - sinfo->peer_pm = sta->mesh->peer_pm; - sinfo->nonpeer_pm = sta->mesh->nonpeer_pm; - sinfo->connected_to_gate = sta->mesh->connected_to_gate; - sinfo->connected_to_as = sta->mesh->connected_to_as; + if (ieee80211_vif_is_mesh(&sdata->vif)) + sta_set_mesh_sinfo(sta, sinfo); #endif - } sinfo->bss_param.flags = 0; if (sdata->vif.bss_conf.use_cts_prot) @@ -2827,12 +2867,6 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); } - - if (ieee80211_vif_is_mesh(&sdata->vif)) { - sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_LINK_METRIC); - sinfo->airtime_link_metric = - airtime_link_metric_get(local, sta); - } } u32 sta_get_expected_throughput(struct sta_info *sta) @@ -2864,27 +2898,6 @@ unsigned long ieee80211_sta_last_active(struct sta_info *sta) return sta->deflink.status_stats.last_ack; } -static void sta_update_codel_params(struct sta_info *sta, u32 thr) -{ - if (thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta) { - sta->cparams.target = MS2TIME(50); - sta->cparams.interval = MS2TIME(300); - sta->cparams.ecn = false; - } else { - sta->cparams.target = MS2TIME(20); - sta->cparams.interval = MS2TIME(100); - sta->cparams.ecn = true; - } -} - -void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta, - u32 thr) -{ - struct sta_info *sta = container_of(pubsta, struct sta_info, sta); - - sta_update_codel_params(sta, thr); -} - int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 9f89fb5bee37..7a95d8d34fca 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -466,14 +466,6 @@ struct ieee80211_fragment_cache { unsigned int next; }; -/* - * The bandwidth threshold below which the per-station CoDel parameters will be - * scaled to be more lenient (to prevent starvation of slow stations). This - * value will be scaled by the number of active stations when it is being - * applied. - */ -#define STA_SLOW_THRESHOLD 6000 /* 6 Mbps */ - /** * struct link_sta_info - Link STA information * All link specific sta info are stored here for reference. 
This can be @@ -512,6 +504,10 @@ struct ieee80211_fragment_cache { * @status_stats.avg_ack_signal: average ACK signal * @cur_max_bandwidth: maximum bandwidth to use for TX to the station, * taken from HT/VHT capabilities or VHT operating mode notification + * @rx_omi_bw_rx: RX OMI bandwidth restriction to apply for RX + * @rx_omi_bw_tx: RX OMI bandwidth restriction to apply for TX + * @rx_omi_bw_staging: RX OMI bandwidth restriction to apply later + * during finalize * @debugfs_dir: debug filesystem directory dentry * @pub: public (driver visible) link STA data * TODO Move other link params from sta_info as required for MLD operation @@ -561,6 +557,9 @@ struct link_sta_info { } tx_stats; enum ieee80211_sta_rx_bandwidth cur_max_bandwidth; + enum ieee80211_sta_rx_bandwidth rx_omi_bw_rx, + rx_omi_bw_tx, + rx_omi_bw_staging; #ifdef CONFIG_MAC80211_DEBUGFS struct dentry *debugfs_dir; @@ -619,7 +618,6 @@ struct link_sta_info { * @sta: station information we share with the driver * @sta_state: duplicates information about station state (for debug) * @rcu_head: RCU head used for freeing this station struct - * @cparams: CoDel parameters for this station. * @reserved_tid: reserved TID (if any, otherwise IEEE80211_TID_UNRESERVED) * @amsdu_mesh_control: track the mesh A-MSDU format used by the peer: * @@ -710,8 +708,6 @@ struct sta_info { struct dentry *debugfs_dir; #endif - struct codel_params cparams; - u8 reserved_tid; s8 amsdu_mesh_control; @@ -899,9 +895,10 @@ void sta_info_stop(struct ieee80211_local *local); * @link_id: if given (>=0), all those STA entries using @link_id only * will be removed. If -1 is passed, all STA entries will be * removed. + * @do_not_flush_sta: a station that shouldn't be flushed. */ int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans, - int link_id); + int link_id, struct sta_info *do_not_flush_sta); /** * sta_info_flush - flush matching STA entries from the STA table @@ -916,7 +913,7 @@ int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans, static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata, int link_id) { - return __sta_info_flush(sdata, false, link_id); + return __sta_info_flush(sdata, false, link_id, NULL); } void sta_set_rate_info_tx(struct sta_info *sta, diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 5f28f3633fa0..a362254b310c 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -895,8 +895,7 @@ static int ieee80211_tx_get_rates(struct ieee80211_hw *hw, } void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, - int retry_count, bool send_to_cooked, - struct ieee80211_tx_status *status) + int retry_count, struct ieee80211_tx_status *status) { struct sk_buff *skb2; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); @@ -930,10 +929,6 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, if (sdata->u.mntr.flags & MONITOR_FLAG_SKIP_TX) continue; - if ((sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) && - !send_to_cooked) - continue; - if (prev_dev) { skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) { @@ -964,7 +959,6 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, struct ieee80211_tx_info *info = status->info; struct sta_info *sta; __le16 fc; - bool send_to_cooked; bool acked; bool noack_success; struct ieee80211_bar *bar; @@ -1091,28 +1085,16 @@ static void __ieee80211_tx_status(struct ieee80211_hw *hw, ieee80211_report_used_skb(local, skb, false, status->ack_hwtstamp); - /* this was a transmitted frame, but now we want to 
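The new @do_not_flush_sta argument to __sta_info_flush() above lets a caller drop every matching entry while keeping exactly one station alive; passing NULL preserves the old behaviour, as the sta_info_flush() wrapper shows. A hedged usage sketch (the caller is hypothetical; in-tree users pass the peer that must survive the flush):

/* remove all stations on all links except "keep" (hypothetical) */
__sta_info_flush(sdata, false, -1, keep);

/* unchanged common case: flush everything */
sta_info_flush(sdata, -1);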
reuse it */ - skb_orphan(skb); - - /* Need to make a copy before skb->cb gets cleared */ - send_to_cooked = !!(info->flags & IEEE80211_TX_CTL_INJECTED) || - !(ieee80211_is_data(fc)); - /* * This is a bit racy but we can avoid a lot of work * with this test... */ - if (!local->tx_mntrs && (!send_to_cooked || !local->cooked_mntrs)) { - if (status->free_list) - list_add_tail(&skb->list, status->free_list); - else - dev_kfree_skb(skb); - return; - } - - /* send to monitor interfaces */ - ieee80211_tx_monitor(local, skb, retry_count, - send_to_cooked, status); + if (local->tx_mntrs) + ieee80211_tx_monitor(local, skb, retry_count, status); + else if (status->free_list) + list_add_tail(&skb->list, status->free_list); + else + dev_kfree_skb(skb); } void ieee80211_tx_status_skb(struct ieee80211_hw *hw, struct sk_buff *skb) diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 2f92e7c7f203..94714f8ffd22 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -382,8 +382,8 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_link_data *link, if (WARN_ON_ONCE(!sband)) return; - ieee80211_put_srates_elem(skb, sband, 0, 0, 0, WLAN_EID_SUPP_RATES); - ieee80211_put_srates_elem(skb, sband, 0, 0, 0, WLAN_EID_EXT_SUPP_RATES); + ieee80211_put_srates_elem(skb, sband, 0, 0, WLAN_EID_SUPP_RATES); + ieee80211_put_srates_elem(skb, sband, 0, 0, WLAN_EID_EXT_SUPP_RATES); ieee80211_tdls_add_supp_channels(sdata, skb); /* add any custom IEs that go before Extended Capabilities */ diff --git a/net/mac80211/tests/Makefile b/net/mac80211/tests/Makefile index 511dfa226699..3b0c08356fc5 100644 --- a/net/mac80211/tests/Makefile +++ b/net/mac80211/tests/Makefile @@ -1,3 +1,3 @@ -mac80211-tests-y += module.o elems.o mfp.o tpe.o +mac80211-tests-y += module.o util.o elems.o mfp.o tpe.o chan-mode.o obj-$(CONFIG_MAC80211_KUNIT_TEST) += mac80211-tests.o diff --git a/net/mac80211/tests/chan-mode.c b/net/mac80211/tests/chan-mode.c new file mode 100644 index 000000000000..96c7b3ab2744 --- /dev/null +++ b/net/mac80211/tests/chan-mode.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KUnit tests for channel mode functions + * + * Copyright (C) 2024 Intel Corporation + */ +#include <net/cfg80211.h> +#include <kunit/test.h> + +#include "util.h" + +MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); + +static const struct determine_chan_mode_case { + const char *desc; + u8 extra_supp_rate; + enum ieee80211_conn_mode conn_mode; + enum ieee80211_conn_mode expected_mode; + bool strict; + u8 userspace_selector; + struct ieee80211_ht_cap ht_capa_mask; + struct ieee80211_vht_cap vht_capa; + struct ieee80211_vht_cap vht_capa_mask; + u8 vht_basic_mcs_1_4_set:1, + vht_basic_mcs_5_8_set:1, + he_basic_mcs_1_4_set:1, + he_basic_mcs_5_8_set:1; + u8 vht_basic_mcs_1_4, vht_basic_mcs_5_8; + u8 he_basic_mcs_1_4, he_basic_mcs_5_8; + u8 eht_mcs7_min_nss; + int error; +} determine_chan_mode_cases[] = { + { + .desc = "Normal case, EHT is working", + .conn_mode = IEEE80211_CONN_MODE_EHT, + .expected_mode = IEEE80211_CONN_MODE_EHT, + }, { + .desc = "Requiring EHT support is fine", + .conn_mode = IEEE80211_CONN_MODE_EHT, + .expected_mode = IEEE80211_CONN_MODE_EHT, + .extra_supp_rate = 0x80 | BSS_MEMBERSHIP_SELECTOR_EHT_PHY, + }, { + .desc = "Lowering the mode limits us", + .conn_mode = IEEE80211_CONN_MODE_VHT, + .expected_mode = IEEE80211_CONN_MODE_VHT, + }, { + .desc = "Requesting a basic rate/selector that we do not support", + .conn_mode = IEEE80211_CONN_MODE_EHT, + .extra_supp_rate = 0x80 | (BSS_MEMBERSHIP_SELECTOR_MIN - 1), + 
.error = EINVAL, + }, { + .desc = "As before, but userspace says it is taking care of it", + .conn_mode = IEEE80211_CONN_MODE_EHT, + .userspace_selector = BSS_MEMBERSHIP_SELECTOR_MIN - 1, + .extra_supp_rate = 0x80 | (BSS_MEMBERSHIP_SELECTOR_MIN - 1), + .expected_mode = IEEE80211_CONN_MODE_EHT, + }, { + .desc = "Masking out a supported rate in HT capabilities", + .conn_mode = IEEE80211_CONN_MODE_EHT, + .expected_mode = IEEE80211_CONN_MODE_LEGACY, + .ht_capa_mask = { + .mcs.rx_mask[0] = 0xf7, + }, + }, { + .desc = "Masking out a RX rate in VHT capabilities", + .conn_mode = IEEE80211_CONN_MODE_EHT, + .expected_mode = IEEE80211_CONN_MODE_HT, + /* Only one RX stream at MCS 0-7 */ + .vht_capa = { + .supp_mcs.rx_mcs_map = + cpu_to_le16(IEEE80211_VHT_MCS_SUPPORT_0_7), + }, + .vht_capa_mask = { + .supp_mcs.rx_mcs_map = cpu_to_le16(0xffff), + }, + .strict = true, + }, { + .desc = "Masking out a TX rate in VHT capabilities", + .conn_mode = IEEE80211_CONN_MODE_EHT, + .expected_mode = IEEE80211_CONN_MODE_HT, + /* Only one TX stream at MCS 0-7 */ + .vht_capa = { + .supp_mcs.tx_mcs_map = + cpu_to_le16(IEEE80211_VHT_MCS_SUPPORT_0_7), + }, + .vht_capa_mask = { + .supp_mcs.tx_mcs_map = cpu_to_le16(0xffff), + }, + .strict = true, + }, { + .desc = "AP has higher VHT requirement than client", + .conn_mode = IEEE80211_CONN_MODE_EHT, + .expected_mode = IEEE80211_CONN_MODE_HT, + .vht_basic_mcs_5_8_set = 1, + .vht_basic_mcs_5_8 = 0xFE, /* require 5th stream */ + .strict = true, + }, { + .desc = "all zero VHT basic rates are ignored (many APs broken)", + .conn_mode = IEEE80211_CONN_MODE_VHT, + .expected_mode = IEEE80211_CONN_MODE_VHT, + .vht_basic_mcs_1_4_set = 1, + .vht_basic_mcs_5_8_set = 1, + }, { + .desc = "AP requires 3 HE streams but client only has two", + .conn_mode = IEEE80211_CONN_MODE_EHT, + .expected_mode = IEEE80211_CONN_MODE_VHT, + .he_basic_mcs_1_4 = 0b11001010, + .he_basic_mcs_1_4_set = 1, + }, { + .desc = "all zero HE basic rates are ignored (iPhone workaround)", + .conn_mode = IEEE80211_CONN_MODE_HE, + .expected_mode = IEEE80211_CONN_MODE_HE, + .he_basic_mcs_1_4_set = 1, + .he_basic_mcs_5_8_set = 1, + }, { + .desc = "AP requires too many RX streams with EHT MCS 7", + .conn_mode = IEEE80211_CONN_MODE_EHT, + .expected_mode = IEEE80211_CONN_MODE_HE, + .eht_mcs7_min_nss = 0x15, + }, { + .desc = "AP requires too many TX streams with EHT MCS 7", + .conn_mode = IEEE80211_CONN_MODE_EHT, + .expected_mode = IEEE80211_CONN_MODE_HE, + .eht_mcs7_min_nss = 0x51, + }, { + .desc = "AP requires too many RX streams with EHT MCS 7 and EHT is required", + .extra_supp_rate = 0x80 | BSS_MEMBERSHIP_SELECTOR_EHT_PHY, + .conn_mode = IEEE80211_CONN_MODE_EHT, + .eht_mcs7_min_nss = 0x15, + .error = EINVAL, + } +}; +KUNIT_ARRAY_PARAM_DESC(determine_chan_mode, determine_chan_mode_cases, desc) + +static void test_determine_chan_mode(struct kunit *test) +{ + const struct determine_chan_mode_case *params = test->param_value; + struct t_sdata *t_sdata = T_SDATA(test); + struct ieee80211_conn_settings conn = { + .mode = params->conn_mode, + .bw_limit = IEEE80211_CONN_BW_LIMIT_20, + }; + struct cfg80211_bss cbss = { + .channel = &t_sdata->band_5ghz.channels[0], + }; + unsigned long userspace_selectors[BITS_TO_LONGS(128)] = {}; + u8 bss_ies[] = { + /* Supported Rates */ + WLAN_EID_SUPP_RATES, 0x08, + 0x82, 0x84, 0x8b, 0x96, 0xc, 0x12, 0x18, 0x24, + /* Extended Supported Rates */ + WLAN_EID_EXT_SUPP_RATES, 0x05, + 0x30, 0x48, 0x60, 0x6c, params->extra_supp_rate, + /* HT Capabilities */ + WLAN_EID_HT_CAPABILITY, 0x1a, + 0x0c, 0x00, 
0x1b, 0xff, 0xff, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, + /* HT Information (0xff for 1 stream) */ + WLAN_EID_HT_OPERATION, 0x16, + 0x24, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + /* VHT Capabilities */ + WLAN_EID_VHT_CAPABILITY, 0xc, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, + 0xff, 0xff, 0x00, 0x00, + /* VHT Operation */ + WLAN_EID_VHT_OPERATION, 0x05, + 0x00, 0x00, 0x00, + params->vht_basic_mcs_1_4_set ? + params->vht_basic_mcs_1_4 : + le16_get_bits(t_sdata->band_5ghz.vht_cap.vht_mcs.rx_mcs_map, 0xff), + params->vht_basic_mcs_5_8_set ? + params->vht_basic_mcs_5_8 : + le16_get_bits(t_sdata->band_5ghz.vht_cap.vht_mcs.rx_mcs_map, 0xff00), + /* HE Capabilities */ + WLAN_EID_EXTENSION, 0x16, WLAN_EID_EXT_HE_CAPABILITY, + 0x01, 0x78, 0xc8, 0x1a, 0x40, 0x00, 0x00, 0xbf, + 0xce, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0xfa, 0xff, 0xfa, 0xff, + /* HE Operation (permit overriding values) */ + WLAN_EID_EXTENSION, 0x07, WLAN_EID_EXT_HE_OPERATION, + 0xf0, 0x3f, 0x00, 0xb0, + params->he_basic_mcs_1_4_set ? params->he_basic_mcs_1_4 : 0xfc, + params->he_basic_mcs_5_8_set ? params->he_basic_mcs_5_8 : 0xff, + /* EHT Capabilities */ + WLAN_EID_EXTENSION, 0x12, WLAN_EID_EXT_EHT_CAPABILITY, + 0x07, 0x00, 0x1c, 0x00, 0x00, 0xfe, 0xff, 0xff, + 0x7f, 0x01, 0x00, 0x88, 0x88, 0x88, 0x00, 0x00, + 0x00, + /* EHT Operation */ + WLAN_EID_EXTENSION, 0x09, WLAN_EID_EXT_EHT_OPERATION, + 0x01, params->eht_mcs7_min_nss ? params->eht_mcs7_min_nss : 0x11, + 0x00, 0x00, 0x00, 0x00, 0x24, 0x00, + }; + struct ieee80211_chan_req chanreq = {}; + struct cfg80211_chan_def ap_chandef = {}; + struct ieee802_11_elems *elems; + + if (params->strict) + set_bit(IEEE80211_HW_STRICT, t_sdata->local.hw.flags); + else + clear_bit(IEEE80211_HW_STRICT, t_sdata->local.hw.flags); + + t_sdata->sdata->u.mgd.ht_capa_mask = params->ht_capa_mask; + t_sdata->sdata->u.mgd.vht_capa = params->vht_capa; + t_sdata->sdata->u.mgd.vht_capa_mask = params->vht_capa_mask; + + if (params->userspace_selector) + set_bit(params->userspace_selector, userspace_selectors); + + rcu_assign_pointer(cbss.ies, + kunit_kzalloc(test, + sizeof(cbss) + sizeof(bss_ies), + GFP_KERNEL)); + KUNIT_ASSERT_NOT_NULL(test, rcu_access_pointer(cbss.ies)); + ((struct cfg80211_bss_ies *)rcu_access_pointer(cbss.ies))->len = sizeof(bss_ies); + + memcpy((void *)rcu_access_pointer(cbss.ies)->data, bss_ies, + sizeof(bss_ies)); + + rcu_read_lock(); + elems = ieee80211_determine_chan_mode(t_sdata->sdata, &conn, &cbss, + 0, &chanreq, &ap_chandef, + userspace_selectors); + rcu_read_unlock(); + + /* We do not need elems, free them if they are valid. 
*/ + if (!IS_ERR_OR_NULL(elems)) + kfree(elems); + + if (params->error) { + KUNIT_ASSERT_TRUE(test, IS_ERR(elems)); + KUNIT_ASSERT_EQ(test, PTR_ERR(elems), -params->error); + } else { + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, elems); + KUNIT_ASSERT_EQ(test, conn.mode, params->expected_mode); + } +} + +static struct kunit_case chan_mode_cases[] = { + KUNIT_CASE_PARAM(test_determine_chan_mode, + determine_chan_mode_gen_params), + {} +}; + +static struct kunit_suite chan_mode = { + .name = "mac80211-mlme-chan-mode", + .test_cases = chan_mode_cases, +}; + +kunit_test_suite(chan_mode); diff --git a/net/mac80211/tests/util.c b/net/mac80211/tests/util.c new file mode 100644 index 000000000000..9c2d63a5cd2b --- /dev/null +++ b/net/mac80211/tests/util.c @@ -0,0 +1,309 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Utilities for mac80211 unit testing + * + * Copyright (C) 2024 Intel Corporation + */ +#include <linux/ieee80211.h> +#include <net/mac80211.h> +#include <kunit/test.h> +#include <kunit/test-bug.h> +#include "util.h" + +#define CHAN2G(_freq) { \ + .band = NL80211_BAND_2GHZ, \ + .center_freq = (_freq), \ + .hw_value = (_freq), \ +} + +static const struct ieee80211_channel channels_2ghz[] = { + CHAN2G(2412), /* Channel 1 */ + CHAN2G(2417), /* Channel 2 */ + CHAN2G(2422), /* Channel 3 */ + CHAN2G(2427), /* Channel 4 */ + CHAN2G(2432), /* Channel 5 */ + CHAN2G(2437), /* Channel 6 */ + CHAN2G(2442), /* Channel 7 */ + CHAN2G(2447), /* Channel 8 */ + CHAN2G(2452), /* Channel 9 */ + CHAN2G(2457), /* Channel 10 */ + CHAN2G(2462), /* Channel 11 */ + CHAN2G(2467), /* Channel 12 */ + CHAN2G(2472), /* Channel 13 */ + CHAN2G(2484), /* Channel 14 */ +}; + +#define CHAN5G(_freq) { \ + .band = NL80211_BAND_5GHZ, \ + .center_freq = (_freq), \ + .hw_value = (_freq), \ +} + +static const struct ieee80211_channel channels_5ghz[] = { + CHAN5G(5180), /* Channel 36 */ + CHAN5G(5200), /* Channel 40 */ + CHAN5G(5220), /* Channel 44 */ + CHAN5G(5240), /* Channel 48 */ +}; + +static const struct ieee80211_rate bitrates[] = { + { .bitrate = 10 }, + { .bitrate = 20, .flags = IEEE80211_RATE_SHORT_PREAMBLE }, + { .bitrate = 55, .flags = IEEE80211_RATE_SHORT_PREAMBLE }, + { .bitrate = 110, .flags = IEEE80211_RATE_SHORT_PREAMBLE }, + { .bitrate = 60 }, + { .bitrate = 90 }, + { .bitrate = 120 }, + { .bitrate = 180 }, + { .bitrate = 240 }, + { .bitrate = 360 }, + { .bitrate = 480 }, + { .bitrate = 540 } +}; + +/* Copied from hwsim except that it only supports 4 EHT streams and STA/P2P mode */ +static const struct ieee80211_sband_iftype_data sband_capa_5ghz[] = { + { + .types_mask = BIT(NL80211_IFTYPE_STATION) | + BIT(NL80211_IFTYPE_P2P_CLIENT), + .he_cap = { + .has_he = true, + .he_cap_elem = { + .mac_cap_info[0] = + IEEE80211_HE_MAC_CAP0_HTC_HE, + .mac_cap_info[1] = + IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US | + IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8, + .mac_cap_info[2] = + IEEE80211_HE_MAC_CAP2_BSR | + IEEE80211_HE_MAC_CAP2_MU_CASCADING | + IEEE80211_HE_MAC_CAP2_ACK_EN, + .mac_cap_info[3] = + IEEE80211_HE_MAC_CAP3_OMI_CONTROL | + IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3, + .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU, + .phy_cap_info[0] = + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G | + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G | + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G, + .phy_cap_info[1] = + IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK | + IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A | + IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD | + 
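The chan-mode suite registered above is a table-driven KUnit test: KUNIT_ARRAY_PARAM_DESC() generates a determine_chan_mode_gen_params() iterator over the case array, naming each case by its .desc field, and KUNIT_CASE_PARAM() hooks that iterator into the suite. The same machinery stripped to a skeleton, with a made-up case struct:

#include <kunit/test.h>

struct times_two_case { const char *desc; int in, out; };

static const struct times_two_case times_two_cases[] = {
	{ .desc = "two doubles to four", .in = 2, .out = 4 },
};
/* generates times_two_gen_params(), used below */
KUNIT_ARRAY_PARAM_DESC(times_two, times_two_cases, desc)

static void times_two_test(struct kunit *test)
{
	const struct times_two_case *p = test->param_value;

	KUNIT_EXPECT_EQ(test, p->in * 2, p->out);
}

static struct kunit_case times_two_test_cases[] = {
	KUNIT_CASE_PARAM(times_two_test, times_two_gen_params),
	{}
};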
IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS, + .phy_cap_info[2] = + IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US | + IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ | + IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ | + IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO | + IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO, + + /* Leave all the other PHY capability bytes + * unset, as DCM, beam forming, RU and PPE + * threshold information are not supported + */ + }, + .he_mcs_nss_supp = { + .rx_mcs_80 = cpu_to_le16(0xfffa), + .tx_mcs_80 = cpu_to_le16(0xfffa), + .rx_mcs_160 = cpu_to_le16(0xfffa), + .tx_mcs_160 = cpu_to_le16(0xfffa), + .rx_mcs_80p80 = cpu_to_le16(0xfffa), + .tx_mcs_80p80 = cpu_to_le16(0xfffa), + }, + }, + .eht_cap = { + .has_eht = true, + .eht_cap_elem = { + .mac_cap_info[0] = + IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS | + IEEE80211_EHT_MAC_CAP0_OM_CONTROL | + IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1, + .phy_cap_info[0] = + IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ | + IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI | + IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO | + IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER | + IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE | + IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK, + .phy_cap_info[1] = + IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK | + IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK, + .phy_cap_info[2] = + IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK | + IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK, + .phy_cap_info[3] = + IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK | + IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK | + IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK | + IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK | + IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK | + IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK | + IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK, + .phy_cap_info[4] = + IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO | + IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP | + IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP | + IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI | + IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK, + .phy_cap_info[5] = + IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK | + IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP | + IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP | + IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT | + IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK | + IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK, + .phy_cap_info[6] = + IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK | + IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK, + .phy_cap_info[7] = + IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW | + IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ | + IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ | + IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ | + IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ, + }, + + /* For all MCS and bandwidth, set 4 NSS for both Tx and + * Rx + */ + .eht_mcs_nss_supp = { + /* + * As B1 and B2 are set in the supported + * channel width set field in the HE PHY + * capabilities information field include all + * the following MCS/NSS. 
+ */ + .bw._80 = { + .rx_tx_mcs9_max_nss = 0x44, + .rx_tx_mcs11_max_nss = 0x44, + .rx_tx_mcs13_max_nss = 0x44, + }, + .bw._160 = { + .rx_tx_mcs9_max_nss = 0x44, + .rx_tx_mcs11_max_nss = 0x44, + .rx_tx_mcs13_max_nss = 0x44, + }, + }, + /* PPE threshold information is not supported */ + }, + }, +}; + +int t_sdata_init(struct kunit_resource *resource, void *ctx) +{ + struct kunit *test = kunit_get_current_test(); + struct t_sdata *t_sdata; + + t_sdata = kzalloc(sizeof(*t_sdata), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, t_sdata); + + resource->data = t_sdata; + resource->name = "sdata"; + + t_sdata->sdata = kzalloc(sizeof(*t_sdata->sdata), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, t_sdata->sdata); + + t_sdata->wiphy = kzalloc(sizeof(*t_sdata->wiphy), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, t_sdata->wiphy); + + strscpy(t_sdata->sdata->name, "kunit"); + + t_sdata->sdata->local = &t_sdata->local; + t_sdata->sdata->local->hw.wiphy = t_sdata->wiphy; + t_sdata->sdata->wdev.wiphy = t_sdata->wiphy; + t_sdata->sdata->vif.type = NL80211_IFTYPE_STATION; + + t_sdata->sdata->deflink.sdata = t_sdata->sdata; + t_sdata->sdata->deflink.link_id = 0; + + t_sdata->wiphy->bands[NL80211_BAND_2GHZ] = &t_sdata->band_2ghz; + t_sdata->wiphy->bands[NL80211_BAND_5GHZ] = &t_sdata->band_5ghz; + + for (int band = NL80211_BAND_2GHZ; band <= NL80211_BAND_5GHZ; band++) { + struct ieee80211_supported_band *sband; + + sband = t_sdata->wiphy->bands[band]; + sband->band = band; + + sband->bitrates = + kmemdup(bitrates, sizeof(bitrates), GFP_KERNEL); + sband->n_bitrates = ARRAY_SIZE(bitrates); + + /* Initialize channels, feel free to add more channels/bands */ + switch (band) { + case NL80211_BAND_2GHZ: + sband->channels = kmemdup(channels_2ghz, + sizeof(channels_2ghz), + GFP_KERNEL); + sband->n_channels = ARRAY_SIZE(channels_2ghz); + sband->bitrates = kmemdup(bitrates, + sizeof(bitrates), + GFP_KERNEL); + sband->n_bitrates = ARRAY_SIZE(bitrates); + break; + case NL80211_BAND_5GHZ: + sband->channels = kmemdup(channels_5ghz, + sizeof(channels_5ghz), + GFP_KERNEL); + sband->n_channels = ARRAY_SIZE(channels_5ghz); + sband->bitrates = kmemdup(bitrates, + sizeof(bitrates), + GFP_KERNEL); + sband->n_bitrates = ARRAY_SIZE(bitrates); + + sband->vht_cap.vht_supported = true; + sband->vht_cap.cap = + IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 | + IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ | + IEEE80211_VHT_CAP_RXLDPC | + IEEE80211_VHT_CAP_SHORT_GI_80 | + IEEE80211_VHT_CAP_SHORT_GI_160 | + IEEE80211_VHT_CAP_TXSTBC | + IEEE80211_VHT_CAP_RXSTBC_4 | + IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; + sband->vht_cap.vht_mcs.rx_mcs_map = + cpu_to_le16(IEEE80211_VHT_MCS_SUPPORT_0_9 << 0 | + IEEE80211_VHT_MCS_SUPPORT_0_9 << 2 | + IEEE80211_VHT_MCS_SUPPORT_0_9 << 4 | + IEEE80211_VHT_MCS_SUPPORT_0_9 << 6); + sband->vht_cap.vht_mcs.tx_mcs_map = + sband->vht_cap.vht_mcs.rx_mcs_map; + break; + default: + continue; + } + + sband->ht_cap.ht_supported = band != NL80211_BAND_6GHZ; + sband->ht_cap.cap = IEEE80211_HT_CAP_SUP_WIDTH_20_40 | + IEEE80211_HT_CAP_GRN_FLD | + IEEE80211_HT_CAP_SGI_20 | + IEEE80211_HT_CAP_SGI_40 | + IEEE80211_HT_CAP_DSSSCCK40; + sband->ht_cap.ampdu_factor = 0x3; + sband->ht_cap.ampdu_density = 0x6; + memset(&sband->ht_cap.mcs, 0, sizeof(sband->ht_cap.mcs)); + sband->ht_cap.mcs.rx_mask[0] = 0xff; + sband->ht_cap.mcs.rx_mask[1] = 0xff; + sband->ht_cap.mcs.tx_params = IEEE80211_HT_MCS_TX_DEFINED; + } + + ieee80211_set_sband_iftype_data(&t_sdata->band_5ghz, sband_capa_5ghz); + + return 0; +} + +void t_sdata_exit(struct 
kunit_resource *resource) +{ + struct t_sdata *t_sdata = resource->data; + + kfree(t_sdata->band_2ghz.channels); + kfree(t_sdata->band_2ghz.bitrates); + kfree(t_sdata->band_5ghz.channels); + kfree(t_sdata->band_5ghz.bitrates); + + kfree(t_sdata->sdata); + kfree(t_sdata->wiphy); + + kfree(t_sdata); +} diff --git a/net/mac80211/tests/util.h b/net/mac80211/tests/util.h new file mode 100644 index 000000000000..6615880c123f --- /dev/null +++ b/net/mac80211/tests/util.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Utilities for mac80211 unit testing + * + * Copyright (C) 2024 Intel Corporation + */ +#ifndef __MAC80211_UTILS_H +#define __MAC80211_UTILS_H + +#include "../ieee80211_i.h" + +struct t_sdata { + struct ieee80211_sub_if_data *sdata; + struct wiphy *wiphy; + struct ieee80211_local local; + + void *ctx; + + struct ieee80211_supported_band band_2ghz; + struct ieee80211_supported_band band_5ghz; +}; + +#define T_SDATA(test) ({ \ + struct t_sdata *__t_sdata = \ + kunit_alloc_resource(test, t_sdata_init, \ + t_sdata_exit, \ + GFP_KERNEL, NULL); \ + \ + KUNIT_ASSERT_NOT_NULL(test, __t_sdata); \ + __t_sdata; \ + }) + +int t_sdata_init(struct kunit_resource *resource, void *data); +void t_sdata_exit(struct kunit_resource *resource); + +#endif /* __MAC80211_UTILS_H */ diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 7a4985fc2b16..72fad8ea8bb9 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -2173,13 +2173,14 @@ DEFINE_EVENT(chanswitch_evt, drv_channel_switch_rx_beacon, TRACE_EVENT(drv_get_txpower, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - int dbm, int ret), + unsigned int link_id, int dbm, int ret), - TP_ARGS(local, sdata, dbm, ret), + TP_ARGS(local, sdata, link_id, dbm, ret), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY + __field(unsigned int, link_id) __field(int, dbm) __field(int, ret) ), @@ -2187,13 +2188,14 @@ TRACE_EVENT(drv_get_txpower, TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; + __entry->link_id = link_id; __entry->dbm = dbm; __entry->ret = ret; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT " dbm:%d ret:%d", - LOCAL_PR_ARG, VIF_PR_ARG, __entry->dbm, __entry->ret + LOCAL_PR_FMT VIF_PR_FMT " link_id:%d dbm:%d ret:%d", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id, __entry->dbm, __entry->ret ) ); @@ -2588,6 +2590,45 @@ TRACE_EVENT(drv_change_sta_links, * Tracing for API calls that drivers call. 
*/ +TRACE_EVENT(api_return_bool, + TP_PROTO(struct ieee80211_local *local, bool result), + + TP_ARGS(local, result), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(bool, result) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->result = result; + ), + + TP_printk( + LOCAL_PR_FMT " result=%d", + LOCAL_PR_ARG, __entry->result + ) +); + +TRACE_EVENT(api_return_void, + TP_PROTO(struct ieee80211_local *local), + + TP_ARGS(local), + + TP_STRUCT__entry( + LOCAL_ENTRY + ), + + TP_fast_assign( + LOCAL_ASSIGN; + ), + + TP_printk( + LOCAL_PR_FMT, LOCAL_PR_ARG + ) +); + TRACE_EVENT(api_start_tx_ba_session, TP_PROTO(struct ieee80211_sta *sta, u16 tid), @@ -3052,6 +3093,65 @@ TRACE_EVENT(api_request_smps, ) ); +TRACE_EVENT(api_prepare_rx_omi_bw, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct link_sta_info *link_sta, + enum ieee80211_sta_rx_bandwidth bw), + + TP_ARGS(local, sdata, link_sta, bw), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + STA_ENTRY + __field(int, link_id) + __field(u32, bw) + __field(bool, result) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + STA_NAMED_ASSIGN(link_sta->sta); + __entry->link_id = link_sta->link_id; + __entry->bw = bw; + ), + + TP_printk( + LOCAL_PR_FMT " " VIF_PR_FMT " " STA_PR_FMT " link:%d, bw:%d", + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, + __entry->link_id, __entry->bw + ) +); + +TRACE_EVENT(api_finalize_rx_omi_bw, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct link_sta_info *link_sta), + + TP_ARGS(local, sdata, link_sta), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + STA_ENTRY + __field(int, link_id) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + STA_NAMED_ASSIGN(link_sta->sta); + __entry->link_id = link_sta->link_id; + ), + + TP_printk( + LOCAL_PR_FMT " " VIF_PR_FMT " " STA_PR_FMT " link:%d", + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->link_id + ) +); + /* * Tracing for internal functions * (which may also be called in response to driver calls) @@ -3059,49 +3159,55 @@ TRACE_EVENT(api_request_smps, TRACE_EVENT(wake_queue, TP_PROTO(struct ieee80211_local *local, u16 queue, - enum queue_stop_reason reason), + enum queue_stop_reason reason, int refcount), - TP_ARGS(local, queue, reason), + TP_ARGS(local, queue, reason, refcount), TP_STRUCT__entry( LOCAL_ENTRY __field(u16, queue) __field(u32, reason) + __field(int, refcount) ), TP_fast_assign( LOCAL_ASSIGN; __entry->queue = queue; __entry->reason = reason; + __entry->refcount = refcount; ), TP_printk( - LOCAL_PR_FMT " queue:%d, reason:%d", - LOCAL_PR_ARG, __entry->queue, __entry->reason + LOCAL_PR_FMT " queue:%d, reason:%d, refcount: %d", + LOCAL_PR_ARG, __entry->queue, __entry->reason, + __entry->refcount ) ); TRACE_EVENT(stop_queue, TP_PROTO(struct ieee80211_local *local, u16 queue, - enum queue_stop_reason reason), + enum queue_stop_reason reason, int refcount), - TP_ARGS(local, queue, reason), + TP_ARGS(local, queue, reason, refcount), TP_STRUCT__entry( LOCAL_ENTRY __field(u16, queue) __field(u32, reason) + __field(int, refcount) ), TP_fast_assign( LOCAL_ASSIGN; __entry->queue = queue; __entry->reason = reason; + __entry->refcount = refcount; ), TP_printk( - LOCAL_PR_FMT " queue:%d, reason:%d", - LOCAL_PR_ARG, __entry->queue, __entry->reason + LOCAL_PR_FMT " queue:%d, reason:%d, refcount: %d", + LOCAL_PR_ARG, __entry->queue, __entry->reason, + __entry->refcount ) ); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index a24636bda679..d8d4f3d7d7f2 100644 --- a/net/mac80211/tx.c +++ 
b/net/mac80211/tx.c @@ -26,6 +26,7 @@ #include <net/codel_impl.h> #include <linux/unaligned.h> #include <net/fq_impl.h> +#include <net/sock.h> #include <net/gso.h> #include "ieee80211_i.h" @@ -49,19 +50,11 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, struct ieee80211_supported_band *sband; struct ieee80211_hdr *hdr; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - struct ieee80211_chanctx_conf *chanctx_conf; - u32 rate_flags = 0; /* assume HW handles this */ if (tx->rate.flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS)) return 0; - rcu_read_lock(); - chanctx_conf = rcu_dereference(tx->sdata->vif.bss_conf.chanctx_conf); - if (chanctx_conf) - rate_flags = ieee80211_chandef_rate_flags(&chanctx_conf->def); - rcu_read_unlock(); - /* uh huh? */ if (WARN_ON_ONCE(tx->rate.idx < 0)) return 0; @@ -138,9 +131,6 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, if (r->bitrate > txrate->bitrate) break; - if ((rate_flags & r->flags) != rate_flags) - continue; - if (tx->sdata->vif.bss_conf.basic_rates & BIT(i)) rate = r->bitrate; @@ -1182,7 +1172,8 @@ void ieee80211_aggr_check(struct ieee80211_sub_if_data *sdata, if (!ref || !(ref->ops->capa & RATE_CTRL_CAPA_AMPDU_TRIGGER)) return; - if (!sta || !sta->sta.deflink.ht_cap.ht_supported || + if (!sta || + (!sta->sta.valid_links && !sta->sta.deflink.ht_cap.ht_supported) || !sta->sta.wme || skb_get_queue_mapping(skb) == IEEE80211_AC_VO || skb->protocol == sdata->control_port_protocol) return; @@ -1401,16 +1392,9 @@ static struct sk_buff *fq_tin_dequeue_func(struct fq *fq, local = container_of(fq, struct ieee80211_local, fq); txqi = container_of(tin, struct txq_info, tin); + cparams = &local->cparams; cstats = &txqi->cstats; - if (txqi->txq.sta) { - struct sta_info *sta = container_of(txqi->txq.sta, - struct sta_info, sta); - cparams = &sta->cparams; - } else { - cparams = &local->cparams; - } - if (flow == &tin->default_flow) cvars = &txqi->def_cvars; else @@ -2875,8 +2859,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, } if (unlikely(!multicast && - ((skb->sk && - skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) || + (sk_requests_wifi_status(skb->sk) || ctrl_flags & IEEE80211_TX_CTL_REQ_TX_STATUS))) info_id = ieee80211_store_ack_skb(local, skb, &info_flags, cookie); @@ -3773,7 +3756,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, return false; /* don't handle TX status request here either */ - if (skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) + if (sk_requests_wifi_status(skb->sk)) return false; if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { @@ -4525,8 +4508,10 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, IEEE80211_TX_CTRL_MLO_LINK_UNSPEC, NULL); } else if (ieee80211_vif_is_mld(&sdata->vif) && - sdata->vif.type == NL80211_IFTYPE_AP && - !ieee80211_hw_check(&sdata->local->hw, MLO_MCAST_MULTI_LINK_TX)) { + ((sdata->vif.type == NL80211_IFTYPE_AP && + !ieee80211_hw_check(&sdata->local->hw, MLO_MCAST_MULTI_LINK_TX)) || + (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && + !sdata->wdev.use_4addr))) { ieee80211_mlo_multicast_tx(dev, skb); } else { normal: @@ -4663,8 +4648,7 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, memcpy(IEEE80211_SKB_CB(seg), info, sizeof(*info)); } - if (unlikely(skb->sk && - skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) { + if (unlikely(sk_requests_wifi_status(skb->sk))) { info->status_data = ieee80211_store_ack_skb(local, skb, &info->flags, NULL); if 
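Several tx.c call sites above fold the open-coded wifi-status check into the new sk_requests_wifi_status() helper from net/sock.h. The predicate being replaced is visible in the removed lines; since callers now pass skb->sk unconditionally, the helper presumably also absorbs the NULL check:

/* open-coded form removed by this patch (shown for comparison) */
static inline bool requests_wifi_status(struct sk_buff *skb)
{
	return skb->sk &&
	       (skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS);
}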
(info->status_data) @@ -5617,7 +5601,7 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, if (!copy) return bcn; - ieee80211_tx_monitor(hw_to_local(hw), copy, 1, false, NULL); + ieee80211_tx_monitor(hw_to_local(hw), copy, 1, NULL); return bcn; } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index dc0b74443c8d..27d414efa3fd 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -6,7 +6,7 @@ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation * * utilities for mac80211 */ @@ -437,8 +437,6 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, { struct ieee80211_local *local = hw_to_local(hw); - trace_wake_queue(local, queue, reason); - if (WARN_ON(queue >= hw->queues)) return; @@ -456,6 +454,9 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, if (local->q_stop_reasons[queue][reason] == 0) __clear_bit(reason, &local->queue_stop_reasons[queue]); + trace_wake_queue(local, queue, reason, + local->q_stop_reasons[queue][reason]); + if (local->queue_stop_reasons[queue] != 0) /* someone still has this queue stopped */ return; @@ -502,8 +503,6 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, { struct ieee80211_local *local = hw_to_local(hw); - trace_stop_queue(local, queue, reason); - if (WARN_ON(queue >= hw->queues)) return; @@ -512,6 +511,9 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, else local->q_stop_reasons[queue][reason]++; + trace_stop_queue(local, queue, reason, + local->q_stop_reasons[queue][reason]); + set_bit(reason, &local->queue_stop_reasons[queue]); } @@ -685,7 +687,7 @@ void __ieee80211_flush_queues(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, unsigned int queues, bool drop) { - if (!local->ops->flush) + if (!local->ops->flush && !drop) return; /* @@ -712,7 +714,8 @@ void __ieee80211_flush_queues(struct ieee80211_local *local, } } - drv_flush(local, sdata, queues, drop); + if (local->ops->flush) + drv_flush(local, sdata, queues, drop); ieee80211_wake_queues_by_reason(&local->hw, queues, IEEE80211_QUEUE_STOP_REASON_FLUSH, @@ -1201,7 +1204,6 @@ static int ieee80211_put_preq_ies_band(struct sk_buff *skb, struct ieee80211_supported_band *sband; int i, err; size_t noffset; - u32 rate_flags; bool have_80mhz = false; *offset = 0; @@ -1210,13 +1212,11 @@ static int ieee80211_put_preq_ies_band(struct sk_buff *skb, if (WARN_ON_ONCE(!sband)) return 0; - rate_flags = ieee80211_chandef_rate_flags(chandef); - /* For direct scan add S1G IE and consider its override bits */ if (band == NL80211_BAND_S1GHZ) return ieee80211_put_s1g_cap(skb, &sband->s1g_cap); - err = ieee80211_put_srates_elem(skb, sband, 0, rate_flags, + err = ieee80211_put_srates_elem(skb, sband, 0, ~rate_mask, WLAN_EID_SUPP_RATES); if (err) return err; @@ -1238,7 +1238,7 @@ static int ieee80211_put_preq_ies_band(struct sk_buff *skb, *offset = noffset; } - err = ieee80211_put_srates_elem(skb, sband, 0, rate_flags, + err = ieee80211_put_srates_elem(skb, sband, 0, ~rate_mask, WLAN_EID_EXT_SUPP_RATES); if (err) return err; @@ -1519,16 +1519,13 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, { struct ieee80211_supported_band *sband; size_t num_rates; - u32 supp_rates, rate_flags; + u32 supp_rates; int i, j; sband = sdata->local->hw.wiphy->bands[band]; if 
(WARN_ON(!sband)) return 1; - rate_flags = - ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); - num_rates = sband->n_bitrates; supp_rates = 0; for (i = 0; i < elems->supp_rates_len + @@ -1548,12 +1545,7 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, continue; for (j = 0; j < num_rates; j++) { - int brate; - if ((rate_flags & sband->bitrates[j].flags) - != rate_flags) - continue; - - brate = sband->bitrates[j].bitrate; + int brate = sband->bitrates[j].bitrate; if (brate == own_rate) { supp_rates |= BIT(j); @@ -2153,7 +2145,8 @@ int ieee80211_reconfig(struct ieee80211_local *local) wake_up: - if (local->monitors == local->open_count && local->monitors > 0) + if (local->virt_monitors > 0 && + local->virt_monitors == local->open_count) ieee80211_add_virtual_monitor(local); /* @@ -2190,8 +2183,10 @@ int ieee80211_reconfig(struct ieee80211_local *local) ieee80211_reconfig_roc(local); /* Requeue all works */ - list_for_each_entry(sdata, &local->interfaces, list) - wiphy_work_queue(local->hw.wiphy, &sdata->work); + list_for_each_entry(sdata, &local->interfaces, list) { + if (ieee80211_sdata_running(sdata)) + wiphy_work_queue(local->hw.wiphy, &sdata->work); + } } ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP, @@ -2748,6 +2743,7 @@ u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef) { struct ieee80211_he_operation *he_oper; struct ieee80211_he_6ghz_oper *he_6ghz_op; + struct cfg80211_chan_def he_chandef; u32 he_oper_params; u8 ie_len = 1 + sizeof(struct ieee80211_he_operation); @@ -2779,27 +2775,33 @@ u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef) if (chandef->chan->band != NL80211_BAND_6GHZ) goto out; + cfg80211_chandef_create(&he_chandef, chandef->chan, NL80211_CHAN_NO_HT); + he_chandef.center_freq1 = chandef->center_freq1; + he_chandef.center_freq2 = chandef->center_freq2; + he_chandef.width = chandef->width; + /* TODO add VHT operational */ he_6ghz_op = (struct ieee80211_he_6ghz_oper *)pos; he_6ghz_op->minrate = 6; /* 6 Mbps */ he_6ghz_op->primary = - ieee80211_frequency_to_channel(chandef->chan->center_freq); + ieee80211_frequency_to_channel(he_chandef.chan->center_freq); he_6ghz_op->ccfs0 = - ieee80211_frequency_to_channel(chandef->center_freq1); - if (chandef->center_freq2) + ieee80211_frequency_to_channel(he_chandef.center_freq1); + if (he_chandef.center_freq2) he_6ghz_op->ccfs1 = - ieee80211_frequency_to_channel(chandef->center_freq2); + ieee80211_frequency_to_channel(he_chandef.center_freq2); else he_6ghz_op->ccfs1 = 0; - switch (chandef->width) { + switch (he_chandef.width) { case NL80211_CHAN_WIDTH_320: - /* - * TODO: mesh operation is not defined over 6GHz 320 MHz - * channels. + /* Downgrade EHT 320 MHz BW to 160 MHz for HE and set new + * center_freq1 */ - WARN_ON(1); - break; + ieee80211_chandef_downgrade(&he_chandef, NULL); + he_6ghz_op->ccfs0 = + ieee80211_frequency_to_channel(he_chandef.center_freq1); + fallthrough; case NL80211_CHAN_WIDTH_160: /* Convert 160 MHz channel width to new style as interop * workaround. 
@@ -2807,7 +2809,7 @@ u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef) he_6ghz_op->control = IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; he_6ghz_op->ccfs1 = he_6ghz_op->ccfs0; - if (chandef->chan->center_freq < chandef->center_freq1) + if (he_chandef.chan->center_freq < he_chandef.center_freq1) he_6ghz_op->ccfs0 -= 8; else he_6ghz_op->ccfs0 += 8; @@ -3210,15 +3212,13 @@ bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, int ieee80211_put_srates_elem(struct sk_buff *skb, const struct ieee80211_supported_band *sband, - u32 basic_rates, u32 rate_flags, u32 masked_rates, + u32 basic_rates, u32 masked_rates, u8 element_id) { u8 i, rates, skip; rates = 0; for (i = 0; i < sband->n_bitrates; i++) { - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - continue; if (masked_rates & BIT(i)) continue; rates++; @@ -3244,8 +3244,6 @@ int ieee80211_put_srates_elem(struct sk_buff *skb, int rate; u8 basic; - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - continue; if (masked_rates & BIT(i)) continue; @@ -3647,31 +3645,6 @@ again: WARN_ON_ONCE(!cfg80211_chandef_valid(c)); } -/* - * Returns true if smps_mode_new is strictly more restrictive than - * smps_mode_old. - */ -bool ieee80211_smps_is_restrictive(enum ieee80211_smps_mode smps_mode_old, - enum ieee80211_smps_mode smps_mode_new) -{ - if (WARN_ON_ONCE(smps_mode_old == IEEE80211_SMPS_AUTOMATIC || - smps_mode_new == IEEE80211_SMPS_AUTOMATIC)) - return false; - - switch (smps_mode_old) { - case IEEE80211_SMPS_STATIC: - return false; - case IEEE80211_SMPS_DYNAMIC: - return smps_mode_new == IEEE80211_SMPS_STATIC; - case IEEE80211_SMPS_OFF: - return smps_mode_new != IEEE80211_SMPS_OFF; - default: - WARN_ON(1); - } - - return false; -} - int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings *csa_settings) { diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 6a20fa099190..c5c5d16ed6c8 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -350,9 +350,9 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, } /* FIXME: move this to some better location - parses HE/EHT now */ -enum ieee80211_sta_rx_bandwidth -_ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta, - struct cfg80211_chan_def *chandef) +static enum ieee80211_sta_rx_bandwidth +__ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta, + struct cfg80211_chan_def *chandef) { unsigned int link_id = link_sta->link_id; struct ieee80211_sub_if_data *sdata = link_sta->sta->sdata; @@ -423,6 +423,28 @@ _ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta, return IEEE80211_STA_RX_BW_80; } +enum ieee80211_sta_rx_bandwidth +_ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta, + struct cfg80211_chan_def *chandef) +{ + /* + * With RX OMI, also pretend that the STA's capability changed. + * Of course this isn't really true, it didn't change, only our + * RX capability was changed by notifying RX OMI to the STA. + * The purpose, however, is to save power, and that requires + * changing also transmissions to the AP and the chanctx. The + * transmissions depend on link_sta->bandwidth which is set in + * _ieee80211_sta_cur_vht_bw() below, but the chanctx depends + * on the result of this function which is also called by + * _ieee80211_sta_cur_vht_bw(), so we need to do that here as + * well. 
This is sufficient for the steady state, but during + * the transition we already need to change TX/RX separately, + * so _ieee80211_sta_cur_vht_bw() below applies the _tx one. + */ + return min(__ieee80211_sta_cap_rx_bw(link_sta, chandef), + link_sta->rx_omi_bw_rx); +} + enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct link_sta_info *link_sta) { @@ -503,8 +525,11 @@ _ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta, rcu_read_unlock(); } - bw = _ieee80211_sta_cap_rx_bw(link_sta, chandef); + /* intentionally do not take rx_bw_omi_rx into account */ + bw = __ieee80211_sta_cap_rx_bw(link_sta, chandef); bw = min(bw, link_sta->cur_max_bandwidth); + /* but do apply rx_omi_bw_tx */ + bw = min(bw, link_sta->rx_omi_bw_tx); /* Don't consider AP's bandwidth for TDLS peers, section 11.23.1 of * IEEE80211-2016 specification makes higher bandwidth operation diff --git a/net/mac80211/wbrf.c b/net/mac80211/wbrf.c index 3a8612309137..478b34b81919 100644 --- a/net/mac80211/wbrf.c +++ b/net/mac80211/wbrf.c @@ -2,6 +2,7 @@ /* * Wifi Band Exclusion Interface for WLAN * Copyright (C) 2023 Advanced Micro Devices + * Copyright (C) 2025 Intel Corporation * */ @@ -45,7 +46,7 @@ static void get_ranges_from_chandef(struct cfg80211_chan_def *chandef, u64 start_freq2, end_freq2; int bandwidth; - bandwidth = nl80211_chan_width_to_mhz(chandef->width); + bandwidth = cfg80211_chandef_get_width(chandef); get_chan_freq_boundary(chandef->center_freq1, bandwidth, &start_freq1, &end_freq1); diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index 08dd521a51a5..8f2bff268392 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -194,9 +194,6 @@ int ieee802154_mlme_tx_locked(struct ieee802154_local *local, struct ieee802154_sub_if_data *sdata, struct sk_buff *skb); void ieee802154_mlme_op_post(struct ieee802154_local *local); -int ieee802154_mlme_tx_one(struct ieee802154_local *local, - struct ieee802154_sub_if_data *sdata, - struct sk_buff *skb); int ieee802154_mlme_tx_one_locked(struct ieee802154_local *local, struct ieee802154_sub_if_data *sdata, struct sk_buff *skb); diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 21b7c3b280b4..ea1efef3572a 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -213,8 +213,8 @@ int ieee802154_register_hw(struct ieee802154_hw *hw) goto out_wq; } - hrtimer_init(&local->ifs_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - local->ifs_timer.function = ieee802154_xmit_ifs_timer; + hrtimer_setup(&local->ifs_timer, ieee802154_xmit_ifs_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); wpan_phy_set_dev(local->phy, local->hw.parent); diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index 337d6faf0d2a..4d13f18f6f2c 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -178,19 +178,6 @@ void ieee802154_mlme_op_post(struct ieee802154_local *local) ieee802154_release_queue(local); } -int ieee802154_mlme_tx_one(struct ieee802154_local *local, - struct ieee802154_sub_if_data *sdata, - struct sk_buff *skb) -{ - int ret; - - ieee802154_mlme_op_pre(local); - ret = ieee802154_mlme_tx(local, sdata, skb); - ieee802154_mlme_op_post(local); - - return ret; -} - int ieee802154_mlme_tx_one_locked(struct ieee802154_local *local, struct ieee802154_sub_if_data *sdata, struct sk_buff *skb) diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index f6de136008f6..9b12ca97f412 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -630,6 +630,9 @@ static int mctp_sk_hash(struct sock *sk) { struct net *net = sock_net(sk); + /* Bind 
lookup runs under RCU, remain live during that. */ + sock_set_flag(sk, SOCK_RCU_FREE); + mutex_lock(&net->mctp.bind_lock); sk_add_node_rcu(sk, &net->mctp.binds); mutex_unlock(&net->mctp.bind_lock); @@ -663,7 +666,7 @@ static void mctp_sk_unhash(struct sock *sk) * keys), stop any pending expiry events. the timer cannot be re-queued * as the sk is no longer observable */ - del_timer_sync(&msk->key_expiry); + timer_delete_sync(&msk->key_expiry); } static void mctp_sk_destruct(struct sock *sk) diff --git a/net/mctp/device.c b/net/mctp/device.c index 26ce34b7e88e..4d404edd7446 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -20,8 +20,7 @@ #include <net/sock.h> struct mctp_dump_cb { - int h; - int idx; + unsigned long ifindex; size_t a_idx; }; @@ -115,43 +114,36 @@ static int mctp_dump_addrinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct mctp_dump_cb *mcb = (void *)cb->ctx; struct net *net = sock_net(skb->sk); - struct hlist_head *head; struct net_device *dev; struct ifaddrmsg *hdr; struct mctp_dev *mdev; - int ifindex; - int idx = 0, rc; - - hdr = nlmsg_data(cb->nlh); - // filter by ifindex if requested - ifindex = hdr->ifa_index; + int ifindex = 0, rc; + + /* Filter by ifindex if a header is provided */ + hdr = nlmsg_payload(cb->nlh, sizeof(*hdr)); + if (hdr) { + ifindex = hdr->ifa_index; + } else { + if (cb->strict_check) { + NL_SET_ERR_MSG(cb->extack, "mctp: Invalid header for addr dump request"); + return -EINVAL; + } + } rcu_read_lock(); - for (; mcb->h < NETDEV_HASHENTRIES; mcb->h++, mcb->idx = 0) { - idx = 0; - head = &net->dev_index_head[mcb->h]; - hlist_for_each_entry_rcu(dev, head, index_hlist) { - if (idx >= mcb->idx && - (ifindex == 0 || ifindex == dev->ifindex)) { - mdev = __mctp_dev_get(dev); - if (mdev) { - rc = mctp_dump_dev_addrinfo(mdev, - skb, cb); - mctp_dev_put(mdev); - // Error indicates full buffer, this - // callback will get retried. 
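The mctp_dump_addrinfo() rewrite here drops the manual dev_index_head bucket walk in favour of for_each_netdev_dump(), which iterates the per-netns ifindex xarray and resumes from a cursor kept in the netlink callback context; nlmsg_payload() additionally replaces the unchecked nlmsg_data() by returning NULL when the message is shorter than the requested header. The resumable-dump shape in isolation (hypothetical callback):

static int demo_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	unsigned long *ifindex = (unsigned long *)cb->ctx;
	struct net *net = sock_net(skb->sk);
	struct net_device *dev;

	rcu_read_lock();
	for_each_netdev_dump(net, dev, *ifindex) {
		/* emit one entry per device; on a full skb, break out
		 * and the next invocation resumes at *ifindex */
	}
	rcu_read_unlock();

	return skb->len;
}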
- if (rc < 0) - goto out; - } - } - idx++; - // reset for next iteration - mcb->a_idx = 0; - } + for_each_netdev_dump(net, dev, mcb->ifindex) { + if (ifindex && ifindex != dev->ifindex) + continue; + mdev = __mctp_dev_get(dev); + if (!mdev) + continue; + rc = mctp_dump_dev_addrinfo(mdev, skb, cb); + mctp_dev_put(mdev); + if (rc < 0) + break; + mcb->a_idx = 0; } -out: rcu_read_unlock(); - mcb->idx = idx; return skb->len; } @@ -531,9 +523,12 @@ static struct notifier_block mctp_dev_nb = { }; static const struct rtnl_msg_handler mctp_device_rtnl_msg_handlers[] = { - {THIS_MODULE, PF_MCTP, RTM_NEWADDR, mctp_rtm_newaddr, NULL, 0}, - {THIS_MODULE, PF_MCTP, RTM_DELADDR, mctp_rtm_deladdr, NULL, 0}, - {THIS_MODULE, PF_MCTP, RTM_GETADDR, NULL, mctp_dump_addrinfo, 0}, + {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_NEWADDR, + .doit = mctp_rtm_newaddr}, + {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_DELADDR, + .doit = mctp_rtm_deladdr}, + {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_GETADDR, + .dumpit = mctp_dump_addrinfo}, }; int __init mctp_device_init(void) diff --git a/net/mctp/neigh.c b/net/mctp/neigh.c index 590f642413e4..05b899f22d90 100644 --- a/net/mctp/neigh.c +++ b/net/mctp/neigh.c @@ -250,7 +250,10 @@ static int mctp_rtm_getneigh(struct sk_buff *skb, struct netlink_callback *cb) int idx; } *cbctx = (void *)cb->ctx; - ndmsg = nlmsg_data(cb->nlh); + ndmsg = nlmsg_payload(cb->nlh, sizeof(*ndmsg)); + if (!ndmsg) + return -EINVAL; + req_ifindex = ndmsg->ndm_ifindex; idx = 0; diff --git a/net/mctp/route.c b/net/mctp/route.c index 3f2bd65ff5e3..d9c8e5a5f9ce 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -313,8 +313,10 @@ static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) key = flow->key; - if (WARN_ON(key->dev && key->dev != dev)) + if (key->dev) { + WARN_ON(key->dev != dev); return; + } mctp_dev_set_key(dev, key); } @@ -332,8 +334,14 @@ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) & MCTP_HDR_SEQ_MASK; if (!key->reasm_head) { - key->reasm_head = skb; - key->reasm_tailp = &(skb_shinfo(skb)->frag_list); + /* Since we're manipulating the shared frag_list, ensure it isn't + * shared with any other SKBs. + */ + key->reasm_head = skb_unshare(skb, GFP_ATOMIC); + if (!key->reasm_head) + return -ENOMEM; + + key->reasm_tailp = &(skb_shinfo(key->reasm_head)->frag_list); key->last_seq = this_seq; return 0; } diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c index 17165b86ce22..06c1897b685a 100644 --- a/net/mctp/test/route-test.c +++ b/net/mctp/test/route-test.c @@ -921,6 +921,114 @@ static void mctp_test_route_input_sk_fail_frag(struct kunit *test) __mctp_route_test_fini(test, dev, rt, sock); } +/* Input route to socket, using a fragmented message created from clones. 
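For context on the mctp_frag_queue() change above: skb_clone() shares the data area, and with it skb_shinfo() and frag_list, between all clones, so queueing fragments onto a cloned reassembly head would corrupt every other clone (exactly the case the new KUnit test below constructs). skb_unshare() copies only when needed and otherwise returns the original skb. A small stand-alone sketch of the invariant, using only core skbuff calls:

/* A reassembly head must own its frag_list exclusively. */
static struct sk_buff *demo_reasm_head_prepare(struct sk_buff *skb)
{
	struct sk_buff *head;

	/* copies the skb if it was cloned; on allocation failure the
	 * input skb has already been released by skb_unshare()
	 */
	head = skb_unshare(skb, GFP_ATOMIC);
	if (!head)
		return NULL;

	/* now safe to append to skb_shinfo(head)->frag_list */
	return head;
}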
+ */ +static void mctp_test_route_input_cloned_frag(struct kunit *test) +{ + /* 5 packet fragments, forming 2 complete messages */ + const struct mctp_hdr hdrs[5] = { + RX_FRAG(FL_S, 0), + RX_FRAG(0, 1), + RX_FRAG(FL_E, 2), + RX_FRAG(FL_S, 0), + RX_FRAG(FL_E, 1), + }; + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct sk_buff *skb[5]; + struct sk_buff *rx_skb; + struct socket *sock; + size_t data_len; + u8 compare[100]; + u8 flat[100]; + size_t total; + void *p; + int rc; + + /* Arbitrary length */ + data_len = 3; + total = data_len + sizeof(struct mctp_hdr); + + __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + + /* Create a single skb initially with concatenated packets */ + skb[0] = mctp_test_create_skb(&hdrs[0], 5 * total); + mctp_test_skb_set_dev(skb[0], dev); + memset(skb[0]->data, 0 * 0x11, skb[0]->len); + memcpy(skb[0]->data, &hdrs[0], sizeof(struct mctp_hdr)); + + /* Extract and populate packets */ + for (int i = 1; i < 5; i++) { + skb[i] = skb_clone(skb[i - 1], GFP_ATOMIC); + KUNIT_ASSERT_TRUE(test, skb[i]); + p = skb_pull(skb[i], total); + KUNIT_ASSERT_TRUE(test, p); + skb_reset_network_header(skb[i]); + memcpy(skb[i]->data, &hdrs[i], sizeof(struct mctp_hdr)); + memset(&skb[i]->data[sizeof(struct mctp_hdr)], i * 0x11, data_len); + } + for (int i = 0; i < 5; i++) + skb_trim(skb[i], total); + + /* SOM packets have a type byte to match the socket */ + skb[0]->data[4] = 0; + skb[3]->data[4] = 0; + + skb_dump("pkt1 ", skb[0], false); + skb_dump("pkt2 ", skb[1], false); + skb_dump("pkt3 ", skb[2], false); + skb_dump("pkt4 ", skb[3], false); + skb_dump("pkt5 ", skb[4], false); + + for (int i = 0; i < 5; i++) { + KUNIT_EXPECT_EQ(test, refcount_read(&skb[i]->users), 1); + /* Take a reference so we can check refcounts at the end */ + skb_get(skb[i]); + } + + /* Feed the fragments into MCTP core */ + for (int i = 0; i < 5; i++) { + rc = mctp_route_input(&rt->rt, skb[i]); + KUNIT_EXPECT_EQ(test, rc, 0); + } + + /* Receive first reassembled message */ + rx_skb = skb_recv_datagram(sock->sk, MSG_DONTWAIT, &rc); + KUNIT_EXPECT_EQ(test, rc, 0); + KUNIT_EXPECT_EQ(test, rx_skb->len, 3 * data_len); + rc = skb_copy_bits(rx_skb, 0, flat, rx_skb->len); + for (int i = 0; i < rx_skb->len; i++) + compare[i] = (i / data_len) * 0x11; + /* Set type byte */ + compare[0] = 0; + + KUNIT_EXPECT_MEMEQ(test, flat, compare, rx_skb->len); + KUNIT_EXPECT_EQ(test, refcount_read(&rx_skb->users), 1); + kfree_skb(rx_skb); + + /* Receive second reassembled message */ + rx_skb = skb_recv_datagram(sock->sk, MSG_DONTWAIT, &rc); + KUNIT_EXPECT_EQ(test, rc, 0); + KUNIT_EXPECT_EQ(test, rx_skb->len, 2 * data_len); + rc = skb_copy_bits(rx_skb, 0, flat, rx_skb->len); + for (int i = 0; i < rx_skb->len; i++) + compare[i] = (i / data_len + 3) * 0x11; + /* Set type byte */ + compare[0] = 0; + + KUNIT_EXPECT_MEMEQ(test, flat, compare, rx_skb->len); + KUNIT_EXPECT_EQ(test, refcount_read(&rx_skb->users), 1); + kfree_skb(rx_skb); + + /* Check input skb refcounts */ + for (int i = 0; i < 5; i++) { + KUNIT_EXPECT_EQ(test, refcount_read(&skb[i]->users), 1); + kfree_skb(skb[i]); + } + + __mctp_route_test_fini(test, dev, rt, sock); +} + #if IS_ENABLED(CONFIG_MCTP_FLOWS) static void mctp_test_flow_init(struct kunit *test, @@ -1144,6 +1252,7 @@ static struct kunit_case mctp_test_cases[] = { KUNIT_CASE(mctp_test_packet_flow), KUNIT_CASE(mctp_test_fragment_flow), KUNIT_CASE(mctp_test_route_output_key_create), + KUNIT_CASE(mctp_test_route_input_cloned_frag), {} }; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c 
index 1f63b32d76d6..d536c97144e9 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2095,12 +2095,12 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct rtmsg *rtm; int err, i; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for FIB dump request"); return -EINVAL; } - rtm = nlmsg_data(nlh); if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) { @@ -2288,7 +2288,8 @@ static int mpls_valid_getroute_req(struct sk_buff *skb, struct rtmsg *rtm; int i, err; - if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { + rtm = nlmsg_payload(nlh, sizeof(*rtm)); + if (!rtm) { NL_SET_ERR_MSG_MOD(extack, "Invalid header for get route request"); return -EINVAL; @@ -2298,7 +2299,6 @@ static int mpls_valid_getroute_req(struct sk_buff *skb, return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, extack); - rtm = nlmsg_data(nlh); if ((rtm->rtm_dst_len && rtm->rtm_dst_len != 20) || rtm->rtm_src_len || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { diff --git a/net/mpls/internal.h b/net/mpls/internal.h index b9f492ddf93b..83c629529b57 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -33,7 +33,7 @@ struct mpls_dev { #define MPLS_INC_STATS_LEN(mdev, len, pkts_field, bytes_field) \ do { \ - __typeof__(*(mdev)->stats) *ptr = \ + TYPEOF_UNQUAL(*(mdev)->stats) *ptr = \ raw_cpu_ptr((mdev)->stats); \ local_bh_disable(); \ u64_stats_update_begin(&ptr->syncp); \ @@ -45,7 +45,7 @@ struct mpls_dev { #define MPLS_INC_STATS(mdev, field) \ do { \ - __typeof__(*(mdev)->stats) *ptr = \ + TYPEOF_UNQUAL(*(mdev)->stats) *ptr = \ raw_cpu_ptr((mdev)->stats); \ local_bh_disable(); \ u64_stats_update_begin(&ptr->syncp); \ diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index bcf1dbf3a432..89bf6c47c818 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -3,7 +3,7 @@ obj-$(CONFIG_MPTCP) += mptcp.o mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o sched.o \ - mptcp_pm_gen.o + mptcp_pm_gen.o pm_kernel.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index b0dd008e2114..d9290c5bb6c7 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -32,12 +32,14 @@ struct mptcp_pernet { unsigned int close_timeout; unsigned int stale_loss_cnt; atomic_t active_disable_times; + u8 syn_retrans_before_tcp_fallback; unsigned long active_disable_stamp; u8 mptcp_enabled; u8 checksum_enabled; u8 allow_join_initial_addr_port; u8 pm_type; char scheduler[MPTCP_SCHED_NAME_MAX]; + char path_manager[MPTCP_PM_NAME_MAX]; }; static struct mptcp_pernet *mptcp_get_pernet(const struct net *net) @@ -82,6 +84,11 @@ int mptcp_get_pm_type(const struct net *net) return mptcp_get_pernet(net)->pm_type; } +const char *mptcp_get_path_manager(const struct net *net) +{ + return mptcp_get_pernet(net)->path_manager; +} + const char *mptcp_get_scheduler(const struct net *net) { return mptcp_get_pernet(net)->scheduler; @@ -92,6 +99,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) pernet->mptcp_enabled = 1; pernet->add_addr_timeout = TCP_RTO_MAX; pernet->blackhole_timeout = 3600; + pernet->syn_retrans_before_tcp_fallback = 2; atomic_set(&pernet->active_disable_times, 0); 
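Back on the TYPEOF_UNQUAL() substitution in net/mpls/internal.h above: __typeof__ propagates qualifiers, including the __percpu address space, so deriving a plain pointer type from a percpu expression no longer type-checks cleanly; TYPEOF_UNQUAL() yields the unqualified type. A self-contained sketch with a hypothetical demo_stats structure:

struct demo_stats {
	u64 packets;
	struct u64_stats_sync syncp;
};

static void demo_stats_inc(struct demo_stats __percpu *stats)
{
	/* plain __typeof__(*stats) would keep the __percpu qualifier
	 * on 'ptr'; TYPEOF_UNQUAL() drops it, giving an ordinary
	 * kernel pointer for the local update
	 */
	TYPEOF_UNQUAL(*stats) *ptr = raw_cpu_ptr(stats);

	local_bh_disable();
	u64_stats_update_begin(&ptr->syncp);
	ptr->packets++;
	u64_stats_update_end(&ptr->syncp);
	local_bh_enable();
}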
pernet->close_timeout = TCP_TIMEWAIT_LEN; pernet->checksum_enabled = 0; @@ -99,6 +107,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) pernet->stale_loss_cnt = 4; pernet->pm_type = MPTCP_PM_TYPE_KERNEL; strscpy(pernet->scheduler, "default", sizeof(pernet->scheduler)); + strscpy(pernet->path_manager, "kernel", sizeof(pernet->path_manager)); } #ifdef CONFIG_SYSCTL @@ -172,6 +181,96 @@ static int proc_blackhole_detect_timeout(const struct ctl_table *table, return ret; } +static int mptcp_set_path_manager(char *path_manager, const char *name) +{ + struct mptcp_pm_ops *pm_ops; + int ret = 0; + + rcu_read_lock(); + pm_ops = mptcp_pm_find(name); + if (pm_ops) + strscpy(path_manager, name, MPTCP_PM_NAME_MAX); + else + ret = -ENOENT; + rcu_read_unlock(); + + return ret; +} + +static int proc_path_manager(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct mptcp_pernet *pernet = container_of(ctl->data, + struct mptcp_pernet, + path_manager); + char (*path_manager)[MPTCP_PM_NAME_MAX] = ctl->data; + char pm_name[MPTCP_PM_NAME_MAX]; + const struct ctl_table tbl = { + .data = pm_name, + .maxlen = MPTCP_PM_NAME_MAX, + }; + int ret; + + strscpy(pm_name, *path_manager, MPTCP_PM_NAME_MAX); + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (write && ret == 0) { + ret = mptcp_set_path_manager(*path_manager, pm_name); + if (ret == 0) { + u8 pm_type = __MPTCP_PM_TYPE_NR; + + if (strncmp(pm_name, "kernel", MPTCP_PM_NAME_MAX) == 0) + pm_type = MPTCP_PM_TYPE_KERNEL; + else if (strncmp(pm_name, "userspace", MPTCP_PM_NAME_MAX) == 0) + pm_type = MPTCP_PM_TYPE_USERSPACE; + pernet->pm_type = pm_type; + } + } + + return ret; +} + +static int proc_pm_type(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct mptcp_pernet *pernet = container_of(ctl->data, + struct mptcp_pernet, + pm_type); + int ret; + + ret = proc_dou8vec_minmax(ctl, write, buffer, lenp, ppos); + if (write && ret == 0) { + u8 pm_type = READ_ONCE(*(u8 *)ctl->data); + char *pm_name = ""; + + if (pm_type == MPTCP_PM_TYPE_KERNEL) + pm_name = "kernel"; + else if (pm_type == MPTCP_PM_TYPE_USERSPACE) + pm_name = "userspace"; + mptcp_set_path_manager(pernet->path_manager, pm_name); + } + + return ret; +} + +static int proc_available_path_managers(const struct ctl_table *ctl, + int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct ctl_table tbl = { .maxlen = MPTCP_PM_BUF_MAX, }; + int ret; + + tbl.data = kmalloc(tbl.maxlen, GFP_USER); + if (!tbl.data) + return -ENOMEM; + + mptcp_pm_get_available(tbl.data, MPTCP_PM_BUF_MAX); + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + kfree(tbl.data); + + return ret; +} + static struct ctl_table mptcp_sysctl_table[] = { { .procname = "enabled", @@ -216,7 +315,7 @@ static struct ctl_table mptcp_sysctl_table[] = { .procname = "pm_type", .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dou8vec_minmax, + .proc_handler = proc_pm_type, .extra1 = SYSCTL_ZERO, .extra2 = &mptcp_pm_type_max }, @@ -245,6 +344,24 @@ static struct ctl_table mptcp_sysctl_table[] = { .proc_handler = proc_blackhole_detect_timeout, .extra1 = SYSCTL_ZERO, }, + { + .procname = "syn_retrans_before_tcp_fallback", + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + }, + { + .procname = "path_manager", + .maxlen = MPTCP_PM_NAME_MAX, + .mode = 0644, + .proc_handler = proc_path_manager, + }, + { + .procname = "available_path_managers", + .maxlen = MPTCP_PM_BUF_MAX, + .mode = 0444, + 
.proc_handler = proc_available_path_managers, + }, }; static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) @@ -269,6 +386,9 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) /* table[7] is for available_schedulers which is read-only info */ table[8].data = &pernet->close_timeout; table[9].data = &pernet->blackhole_timeout; + table[10].data = &pernet->syn_retrans_before_tcp_fallback; + table[11].data = &pernet->path_manager; + /* table[12] is for available_path_managers which is read-only info */ hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table, ARRAY_SIZE(mptcp_sysctl_table)); @@ -392,22 +512,30 @@ void mptcp_active_enable(struct sock *sk) void mptcp_active_detect_blackhole(struct sock *ssk, bool expired) { struct mptcp_subflow_context *subflow; - u32 timeouts; + u8 timeouts, to_max; + struct net *net; - if (!sk_is_mptcp(ssk)) + /* Only check MPTCP SYN ... */ + if (likely(!sk_is_mptcp(ssk) || ssk->sk_state != TCP_SYN_SENT)) return; - timeouts = inet_csk(ssk)->icsk_retransmits; subflow = mptcp_subflow_ctx(ssk); - if (subflow->request_mptcp && ssk->sk_state == TCP_SYN_SENT) { - if (timeouts == 2 || (timeouts < 2 && expired)) { - MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEACTIVEDROP); - subflow->mpc_drop = 1; - mptcp_subflow_early_fallback(mptcp_sk(subflow->conn), subflow); - } else { - subflow->mpc_drop = 0; - } + /* ... + MP_CAPABLE */ + if (!subflow->request_mptcp) { + /* Mark as blackhole iif the 1st non-MPTCP SYN is accepted */ + subflow->mpc_drop = 0; + return; + } + + net = sock_net(ssk); + timeouts = inet_csk(ssk)->icsk_retransmits; + to_max = mptcp_get_pernet(net)->syn_retrans_before_tcp_fallback; + + if (timeouts == to_max || (timeouts < to_max && expired)) { + MPTCP_INC_STATS(net, MPTCP_MIB_MPCAPABLEACTIVEDROP); + subflow->mpc_drop = 1; + mptcp_subflow_early_fallback(mptcp_sk(subflow->conn), subflow); } } diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c index 02205f7994d7..70cf9ebce833 100644 --- a/net/mptcp/diag.c +++ b/net/mptcp/diag.c @@ -12,7 +12,7 @@ #include <net/netlink.h> #include "protocol.h" -static int subflow_get_info(struct sock *sk, struct sk_buff *skb) +static int subflow_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin) { struct mptcp_subflow_context *sf; struct nlattr *start; @@ -56,15 +56,6 @@ static int subflow_get_info(struct sock *sk, struct sk_buff *skb) if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_REM, sf->remote_token) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_LOC, sf->token) || - nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ, - sf->rel_write_seq) || - nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, sf->map_seq, - MPTCP_SUBFLOW_ATTR_PAD) || - nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ, - sf->map_subflow_seq) || - nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, sf->ssn_offset) || - nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN, - sf->map_data_len) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_FLAGS, flags) || nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_REM, sf->remote_id) || nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, subflow_get_local_id(sf))) { @@ -72,6 +63,21 @@ static int subflow_get_info(struct sock *sk, struct sk_buff *skb) goto nla_failure; } + /* Only export seq related counters to user with CAP_NET_ADMIN */ + if (net_admin && + (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ, + sf->rel_write_seq) || + nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, sf->map_seq, + MPTCP_SUBFLOW_ATTR_PAD) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ, + 
sf->map_subflow_seq) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, sf->ssn_offset) || + nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN, + sf->map_data_len))) { + err = -EMSGSIZE; + goto nla_failure; + } + rcu_read_unlock(); unlock_sock_fast(sk, slow); nla_nest_end(skb, start); @@ -84,22 +90,26 @@ nla_failure: return err; } -static size_t subflow_get_info_size(const struct sock *sk) +static size_t subflow_get_info_size(const struct sock *sk, bool net_admin) { size_t size = 0; size += nla_total_size(0) + /* INET_ULP_INFO_MPTCP */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_REM */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_LOC */ - nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */ - nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */ - nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */ - nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */ - nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_FLAGS */ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_REM */ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_LOC */ 0; + + if (net_admin) + size += nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */ + nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */ + nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */ + 0; + return size; } diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c index a29ff901df75..b9e451197902 100644 --- a/net/mptcp/fastopen.c +++ b/net/mptcp/fastopen.c @@ -40,17 +40,17 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf tp->copied_seq += skb->len; subflow->ssn_offset += skb->len; - /* initialize a dummy sequence number, we will update it at MPC - * completion, if needed - */ + /* Only the sequence delta is relevant */ MPTCP_SKB_CB(skb)->map_seq = -skb->len; MPTCP_SKB_CB(skb)->end_seq = 0; MPTCP_SKB_CB(skb)->offset = 0; MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; + MPTCP_SKB_CB(skb)->cant_coalesce = 1; mptcp_data_lock(sk); + DEBUG_NET_WARN_ON_ONCE(sock_owned_by_user_nocheck(sk)); - mptcp_set_owner_r(skb, sk); + skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); mptcp_sk(sk)->bytes_received += skb->len; @@ -58,22 +58,3 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf mptcp_data_unlock(sk); } - -void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, - const struct mptcp_options_received *mp_opt) -{ - struct sock *sk = (struct sock *)msk; - struct sk_buff *skb; - - skb = skb_peek_tail(&sk->sk_receive_queue); - if (skb) { - WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq); - pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx\n", sk, - MPTCP_SKB_CB(skb)->map_seq, MPTCP_SKB_CB(skb)->map_seq + msk->ack_seq, - MPTCP_SKB_CB(skb)->end_seq, MPTCP_SKB_CB(skb)->end_seq + msk->ack_seq); - MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq; - MPTCP_SKB_CB(skb)->end_seq += msk->ack_seq; - } - - pr_debug("msk=%p ack_seq=%llx\n", msk, msk->ack_seq); -} diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index 19eb9292bd60..0c24545f0e8d 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -28,6 +28,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC), SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), + 
SNMP_MIB_ITEM("MPJoinRejected", MPTCP_MIB_JOINREJECTED), SNMP_MIB_ITEM("MPJoinSynTx", MPTCP_MIB_JOINSYNTX), SNMP_MIB_ITEM("MPJoinSynTxCreatSkErr", MPTCP_MIB_JOINSYNTXCREATSKERR), SNMP_MIB_ITEM("MPJoinSynTxBindErr", MPTCP_MIB_JOINSYNTXBINDERR), diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 128282982843..250c6b77977e 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -23,6 +23,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */ MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */ MPTCP_MIB_JOINACKMAC, /* HMAC was wrong on ACK + MP_JOIN */ + MPTCP_MIB_JOINREJECTED, /* The PM rejected the JOIN request */ MPTCP_MIB_JOINSYNTX, /* Sending a SYN + MP_JOIN */ MPTCP_MIB_JOINSYNTXCREATSKERR, /* Not able to create a socket when sending a SYN + MP_JOIN */ MPTCP_MIB_JOINSYNTXBINDERR, /* Not able to bind() the address when sending a SYN + MP_JOIN */ diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 123f3f297284..421ced031289 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -108,7 +108,6 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->suboptions |= OPTION_MPTCP_DSS; mp_opt->use_map = 1; mp_opt->mpc_map = 1; - mp_opt->use_ack = 0; mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; } @@ -157,11 +156,6 @@ static void mptcp_parse_option(const struct sk_buff *skb, pr_debug("DSS\n"); ptr++; - /* we must clear 'mpc_map' be able to detect MP_CAPABLE - * map vs DSS map in mptcp_incoming_options(), and reconstruct - * map info accordingly - */ - mp_opt->mpc_map = 0; flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; @@ -369,8 +363,11 @@ void mptcp_get_options(const struct sk_buff *skb, const unsigned char *ptr; int length; - /* initialize option status */ - mp_opt->suboptions = 0; + /* Ensure that casting the whole status to u32 is efficient and safe */ + BUILD_BUG_ON(sizeof_field(struct mptcp_options_received, status) != sizeof(u32)); + BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct mptcp_options_received, status), + sizeof(u32))); + *(u32 *)&mp_opt->status = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); @@ -435,7 +432,6 @@ static void clear_3rdack_retransmission(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); sk_stop_timer(sk, &icsk->icsk_delack_timer); - icsk->icsk_ack.timeout = 0; icsk->icsk_ack.ato = 0; icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER); } @@ -654,6 +650,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * struct mptcp_sock *msk = mptcp_sk(subflow->conn); bool drop_other_suboptions = false; unsigned int opt_size = *size; + struct mptcp_addr_info addr; bool echo; int len; @@ -662,7 +659,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * */ if (!mptcp_pm_should_add_signal(msk) || (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) || - !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr, + !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &addr, &echo, &drop_other_suboptions)) return false; @@ -675,7 +672,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * else if (opts->suboptions & OPTION_MPTCP_DSS) return false; - len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port); + len = mptcp_add_addr_len(addr.family, echo, !!addr.port); if (remaining < len) return false; @@ -692,6 +689,7 
@@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * opts->ahmac = 0; *size -= opt_size; } + opts->addr = addr; opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX); diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 16c336c51940..1306d4dc287b 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -5,12 +5,393 @@ */ #define pr_fmt(fmt) "MPTCP: " fmt -#include <linux/kernel.h> -#include <net/mptcp.h> +#include <linux/rculist.h> +#include <linux/spinlock.h> #include "protocol.h" - #include "mib.h" +#define ADD_ADDR_RETRANS_MAX 3 + +struct mptcp_pm_add_entry { + struct list_head list; + struct mptcp_addr_info addr; + u8 retrans_times; + struct timer_list add_timer; + struct mptcp_sock *sock; +}; + +static DEFINE_SPINLOCK(mptcp_pm_list_lock); +static LIST_HEAD(mptcp_pm_list); + +/* path manager helpers */ + +/* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses, + * otherwise allow any matching local/remote pair + */ +bool mptcp_pm_addr_families_match(const struct sock *sk, + const struct mptcp_addr_info *loc, + const struct mptcp_addr_info *rem) +{ + bool mptcp_is_v4 = sk->sk_family == AF_INET; + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6); + bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6); + + if (mptcp_is_v4) + return loc_is_v4 && rem_is_v4; + + if (ipv6_only_sock(sk)) + return !loc_is_v4 && !rem_is_v4; + + return loc_is_v4 == rem_is_v4; +#else + return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET; +#endif +} + +bool mptcp_addresses_equal(const struct mptcp_addr_info *a, + const struct mptcp_addr_info *b, bool use_port) +{ + bool addr_equals = false; + + if (a->family == b->family) { + if (a->family == AF_INET) + addr_equals = a->addr.s_addr == b->addr.s_addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else + addr_equals = ipv6_addr_equal(&a->addr6, &b->addr6); + } else if (a->family == AF_INET) { + if (ipv6_addr_v4mapped(&b->addr6)) + addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3]; + } else if (b->family == AF_INET) { + if (ipv6_addr_v4mapped(&a->addr6)) + addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr; +#endif + } + + if (!addr_equals) + return false; + if (!use_port) + return true; + + return a->port == b->port; +} + +void mptcp_local_address(const struct sock_common *skc, + struct mptcp_addr_info *addr) +{ + addr->family = skc->skc_family; + addr->port = htons(skc->skc_num); + if (addr->family == AF_INET) + addr->addr.s_addr = skc->skc_rcv_saddr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->family == AF_INET6) + addr->addr6 = skc->skc_v6_rcv_saddr; +#endif +} + +void mptcp_remote_address(const struct sock_common *skc, + struct mptcp_addr_info *addr) +{ + addr->family = skc->skc_family; + addr->port = skc->skc_dport; + if (addr->family == AF_INET) + addr->addr.s_addr = skc->skc_daddr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->family == AF_INET6) + addr->addr6 = skc->skc_v6_daddr; +#endif +} + +static bool mptcp_pm_is_init_remote_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *remote) +{ + struct mptcp_addr_info mpc_remote; + + mptcp_remote_address((struct sock_common *)msk, &mpc_remote); + return mptcp_addresses_equal(&mpc_remote, remote, remote->port); +} + +bool mptcp_lookup_subflow_by_saddr(const struct list_head *list, + const struct mptcp_addr_info *saddr) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_addr_info 
cur; + struct sock_common *skc; + + list_for_each_entry(subflow, list, node) { + skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); + + mptcp_local_address(skc, &cur); + if (mptcp_addresses_equal(&cur, saddr, saddr->port)) + return true; + } + + return false; +} + +static struct mptcp_pm_add_entry * +mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + struct mptcp_pm_add_entry *entry; + + lockdep_assert_held(&msk->pm.lock); + + list_for_each_entry(entry, &msk->pm.anno_list, list) { + if (mptcp_addresses_equal(&entry->addr, addr, true)) + return entry; + } + + return NULL; +} + +bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + struct mptcp_pm_add_entry *entry; + bool ret; + + entry = mptcp_pm_del_add_timer(msk, addr, false); + ret = entry; + kfree(entry); + + return ret; +} + +bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) +{ + struct mptcp_pm_add_entry *entry; + struct mptcp_addr_info saddr; + bool ret = false; + + mptcp_local_address((struct sock_common *)sk, &saddr); + + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(entry, &msk->pm.anno_list, list) { + if (mptcp_addresses_equal(&entry->addr, &saddr, true)) { + ret = true; + goto out; + } + } + +out: + spin_unlock_bh(&msk->pm.lock); + return ret; +} + +static void __mptcp_pm_send_ack(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow, + bool prio, bool backup) +{ + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; + + pr_debug("send ack for %s\n", + prio ? "mp_prio" : + (mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr")); + + slow = lock_sock_fast(ssk); + if (prio) { + subflow->send_mp_prio = 1; + subflow->request_bkup = backup; + } + + __mptcp_subflow_send_ack(ssk); + unlock_sock_fast(ssk, slow); +} + +void mptcp_pm_send_ack(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow, + bool prio, bool backup) +{ + spin_unlock_bh(&msk->pm.lock); + __mptcp_pm_send_ack(msk, subflow, prio, backup); + spin_lock_bh(&msk->pm.lock); +} + +void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow, *alt = NULL; + + msk_owned_by_me(msk); + lockdep_assert_held(&msk->pm.lock); + + if (!mptcp_pm_should_add_signal(msk) && + !mptcp_pm_should_rm_signal(msk)) + return; + + mptcp_for_each_subflow(msk, subflow) { + if (__mptcp_subflow_active(subflow)) { + if (!subflow->stale) { + mptcp_pm_send_ack(msk, subflow, false, false); + return; + } + + if (!alt) + alt = subflow; + } + } + + if (alt) + mptcp_pm_send_ack(msk, alt, false, false); +} + +int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + struct mptcp_addr_info *rem, + u8 bkup) +{ + struct mptcp_subflow_context *subflow; + + pr_debug("bkup=%d\n", bkup); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + struct mptcp_addr_info local, remote; + + mptcp_local_address((struct sock_common *)ssk, &local); + if (!mptcp_addresses_equal(&local, addr, addr->port)) + continue; + + if (rem && rem->family != AF_UNSPEC) { + mptcp_remote_address((struct sock_common *)ssk, &remote); + if (!mptcp_addresses_equal(&remote, rem, rem->port)) + continue; + } + + __mptcp_pm_send_ack(msk, subflow, true, bkup); + return 0; + } + + return -EINVAL; +} + +static void mptcp_pm_add_timer(struct timer_list *timer) +{ + struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer); + struct mptcp_sock *msk = 
entry->sock; + struct sock *sk = (struct sock *)msk; + + pr_debug("msk=%p\n", msk); + + if (!msk) + return; + + if (inet_sk_state_load(sk) == TCP_CLOSE) + return; + + if (!entry->addr.id) + return; + + if (mptcp_pm_should_add_signal_addr(msk)) { + sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); + goto out; + } + + spin_lock_bh(&msk->pm.lock); + + if (!mptcp_pm_should_add_signal_addr(msk)) { + pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id); + mptcp_pm_announce_addr(msk, &entry->addr, false); + mptcp_pm_add_addr_send_ack(msk); + entry->retrans_times++; + } + + if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) + sk_reset_timer(sk, timer, + jiffies + mptcp_get_add_addr_timeout(sock_net(sk))); + + spin_unlock_bh(&msk->pm.lock); + + if (entry->retrans_times == ADD_ADDR_RETRANS_MAX) + mptcp_pm_subflow_established(msk); + +out: + __sock_put(sk); +} + +struct mptcp_pm_add_entry * +mptcp_pm_del_add_timer(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr, bool check_id) +{ + struct mptcp_pm_add_entry *entry; + struct sock *sk = (struct sock *)msk; + struct timer_list *add_timer = NULL; + + spin_lock_bh(&msk->pm.lock); + entry = mptcp_lookup_anno_list_by_saddr(msk, addr); + if (entry && (!check_id || entry->addr.id == addr->id)) { + entry->retrans_times = ADD_ADDR_RETRANS_MAX; + add_timer = &entry->add_timer; + } + if (!check_id && entry) + list_del(&entry->list); + spin_unlock_bh(&msk->pm.lock); + + /* no lock, because sk_stop_timer_sync() is calling timer_delete_sync() */ + if (add_timer) + sk_stop_timer_sync(sk, add_timer); + + return entry; +} + +bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + struct mptcp_pm_add_entry *add_entry = NULL; + struct sock *sk = (struct sock *)msk; + struct net *net = sock_net(sk); + + lockdep_assert_held(&msk->pm.lock); + + add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr); + + if (add_entry) { + if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk))) + return false; + + sk_reset_timer(sk, &add_entry->add_timer, + jiffies + mptcp_get_add_addr_timeout(net)); + return true; + } + + add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); + if (!add_entry) + return false; + + list_add(&add_entry->list, &msk->pm.anno_list); + + add_entry->addr = *addr; + add_entry->sock = msk; + add_entry->retrans_times = 0; + + timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); + sk_reset_timer(sk, &add_entry->add_timer, + jiffies + mptcp_get_add_addr_timeout(net)); + + return true; +} + +static void mptcp_pm_free_anno_list(struct mptcp_sock *msk) +{ + struct mptcp_pm_add_entry *entry, *tmp; + struct sock *sk = (struct sock *)msk; + LIST_HEAD(free_list); + + pr_debug("msk=%p\n", msk); + + spin_lock_bh(&msk->pm.lock); + list_splice_init(&msk->pm.anno_list, &free_list); + spin_unlock_bh(&msk->pm.lock); + + list_for_each_entry_safe(entry, tmp, &free_list, list) { + sk_stop_timer_sync(sk, &entry->add_timer); + kfree(entry); + } +} + /* path manager command handlers */ int mptcp_pm_announce_addr(struct mptcp_sock *msk, @@ -56,7 +437,7 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_ msk->pm.rm_list_tx = *rm_list; rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL); WRITE_ONCE(msk->pm.addr_signal, rm_addr); - mptcp_pm_nl_addr_send_ack(msk); + mptcp_pm_addr_send_ack(msk); return 0; } @@ -138,13 +519,13 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk) * be sure to serve this event only once. 
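Pulling the anno_list machinery above together: announcing an address arms a per-entry timer that refires until ADD_ADDR_RETRANS_MAX retransmissions have gone out, and the peer's echo tears the entry down through mptcp_pm_del_add_timer(). A condensed caller-side sketch, using only the helpers from this file (the wrapper name is invented):

/* Announce one address; msk->pm.lock must be held, as the helpers assert. */
static void demo_announce_addr(struct mptcp_sock *msk,
			       const struct mptcp_addr_info *addr)
{
	/* queues an mptcp_pm_add_entry and arms its add_timer */
	if (!mptcp_pm_alloc_anno_list(msk, addr))
		return;		/* GFP_ATOMIC failure: skip the announce */

	mptcp_pm_announce_addr(msk, addr, false);
	mptcp_pm_addr_send_ack(msk);

	/* from here mptcp_pm_add_timer() retransmits the ADD_ADDR up to
	 * ADD_ADDR_RETRANS_MAX times; an echo ends in
	 * mptcp_pm_del_add_timer(), which stops the timer and unlinks
	 * the entry
	 */
}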
*/ if (READ_ONCE(pm->work_pending) && - !(msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED))) + !(pm->status & BIT(MPTCP_PM_ALREADY_ESTABLISHED))) mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED); - if ((msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0) + if ((pm->status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0) announce = true; - msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED); + pm->status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED); spin_unlock_bh(&pm->lock); if (announce) @@ -230,7 +611,7 @@ void mptcp_pm_add_addr_received(const struct sock *ssk, __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } /* id0 should not have a different address */ - } else if ((addr->id == 0 && !mptcp_pm_nl_is_init_remote_addr(msk, addr)) || + } else if ((addr->id == 0 && !mptcp_pm_is_init_remote_addr(msk, addr)) || (addr->id > 0 && !READ_ONCE(pm->accept_addr))) { mptcp_pm_announce_addr(msk, addr, true); mptcp_pm_add_addr_send_ack(msk); @@ -250,6 +631,9 @@ void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, pr_debug("msk=%p\n", msk); + if (!READ_ONCE(pm->work_pending)) + return; + spin_lock_bh(&pm->lock); if (mptcp_lookup_anno_list_by_saddr(msk, addr) && READ_ONCE(pm->work_pending)) @@ -266,6 +650,80 @@ void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk) mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK); } +static void mptcp_pm_rm_addr_or_subflow(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list, + enum linux_mptcp_mib_field rm_type) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct sock *sk = (struct sock *)msk; + u8 i; + + pr_debug("%s rm_list_nr %d\n", + rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", rm_list->nr); + + msk_owned_by_me(msk); + + if (sk->sk_state == TCP_LISTEN) + return; + + if (!rm_list->nr) + return; + + if (list_empty(&msk->conn_list)) + return; + + for (i = 0; i < rm_list->nr; i++) { + u8 rm_id = rm_list->ids[i]; + bool removed = false; + + mptcp_for_each_subflow_safe(msk, subflow, tmp) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + u8 remote_id = READ_ONCE(subflow->remote_id); + int how = RCV_SHUTDOWN | SEND_SHUTDOWN; + u8 id = subflow_get_local_id(subflow); + + if ((1 << inet_sk_state_load(ssk)) & + (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE)) + continue; + if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id) + continue; + if (rm_type == MPTCP_MIB_RMSUBFLOW && id != rm_id) + continue; + + pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u\n", + rm_type == MPTCP_MIB_RMADDR ? 
"address" : "subflow", + i, rm_id, id, remote_id, msk->mpc_endpoint_id); + spin_unlock_bh(&msk->pm.lock); + mptcp_subflow_shutdown(sk, ssk, how); + removed |= subflow->request_join; + + /* the following takes care of updating the subflows counter */ + mptcp_close_ssk(sk, ssk, subflow); + spin_lock_bh(&msk->pm.lock); + + if (rm_type == MPTCP_MIB_RMSUBFLOW) + __MPTCP_INC_STATS(sock_net(sk), rm_type); + } + + if (rm_type == MPTCP_MIB_RMADDR) { + __MPTCP_INC_STATS(sock_net(sk), rm_type); + if (removed && mptcp_pm_is_kernel(msk)) + mptcp_pm_nl_rm_addr(msk, rm_id); + } + } +} + +static void mptcp_pm_rm_addr_recv(struct mptcp_sock *msk) +{ + mptcp_pm_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); +} + +void mptcp_pm_rm_subflow(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list) +{ + mptcp_pm_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); +} + void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list) { @@ -321,8 +779,6 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) } } -/* path manager helpers */ - bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, unsigned int opt_size, unsigned int remaining, struct mptcp_addr_info *addr, bool *echo, @@ -402,7 +858,7 @@ out_unlock: int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) { - struct mptcp_addr_info skc_local; + struct mptcp_pm_addr_entry skc_local = { 0 }; struct mptcp_addr_info msk_local; if (WARN_ON_ONCE(!msk)) @@ -412,10 +868,13 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) * addr */ mptcp_local_address((struct sock_common *)msk, &msk_local); - mptcp_local_address((struct sock_common *)skc, &skc_local); - if (mptcp_addresses_equal(&msk_local, &skc_local, false)) + mptcp_local_address((struct sock_common *)skc, &skc_local.addr); + if (mptcp_addresses_equal(&msk_local, &skc_local.addr, false)) return 0; + skc_local.addr.id = 0; + skc_local.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; + if (mptcp_pm_is_userspace(msk)) return mptcp_userspace_pm_get_local_id(msk, &skc_local); return mptcp_pm_nl_get_local_id(msk, &skc_local); @@ -433,27 +892,41 @@ bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc) return mptcp_pm_nl_is_backup(msk, &skc_local); } -int mptcp_pm_get_addr(struct sk_buff *skb, struct genl_info *info) +static void mptcp_pm_subflows_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) { - if (info->attrs[MPTCP_PM_ATTR_TOKEN]) - return mptcp_userspace_pm_get_addr(skb, info); - return mptcp_pm_nl_get_addr(skb, info); -} - -int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) -{ - const struct genl_info *info = genl_info_dump(cb); - - if (info->attrs[MPTCP_PM_ATTR_TOKEN]) - return mptcp_userspace_pm_dump_addr(msg, cb); - return mptcp_pm_nl_dump_addr(msg, cb); -} + struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = (struct sock *)msk; + unsigned int active_max_loss_cnt; + struct net *net = sock_net(sk); + unsigned int stale_loss_cnt; + bool slow; + + stale_loss_cnt = mptcp_stale_loss_cnt(net); + if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt) + return; -int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info) -{ - if (info->attrs[MPTCP_PM_ATTR_TOKEN]) - return mptcp_userspace_pm_set_flags(skb, info); - return mptcp_pm_nl_set_flags(skb, info); + /* look for another available subflow not in loss state */ + active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 
1); + mptcp_for_each_subflow(msk, iter) { + if (iter != subflow && mptcp_subflow_active(iter) && + iter->stale_count < active_max_loss_cnt) { + /* we have some alternatives, try to mark this subflow as idle ...*/ + slow = lock_sock_fast(ssk); + if (!tcp_rtx_and_write_queues_empty(ssk)) { + subflow->stale = 1; + __mptcp_retransmit_pending_data(sk); + MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE); + } + unlock_sock_fast(ssk, slow); + + /* always try to push the pending data regardless of re-injections: + * we can possibly use backup subflows now, and subflow selection + * is cheap under the msk socket lock + */ + __mptcp_push_pending(sk, 0); + return; + } + } } void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) @@ -468,36 +941,44 @@ void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) } else if (subflow->stale_rcv_tstamp == rcv_tstamp) { if (subflow->stale_count < U8_MAX) subflow->stale_count++; - mptcp_pm_nl_subflow_chk_stale(msk, ssk); + mptcp_pm_subflows_chk_stale(msk, ssk); } else { subflow->stale_count = 0; mptcp_subflow_set_active(subflow); } } -/* if sk is ipv4 or ipv6_only allows only same-family local and remote addresses, - * otherwise allow any matching local/remote pair - */ -bool mptcp_pm_addr_families_match(const struct sock *sk, - const struct mptcp_addr_info *loc, - const struct mptcp_addr_info *rem) +void mptcp_pm_worker(struct mptcp_sock *msk) { - bool mptcp_is_v4 = sk->sk_family == AF_INET; + struct mptcp_pm_data *pm = &msk->pm; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6); - bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6); + msk_owned_by_me(msk); - if (mptcp_is_v4) - return loc_is_v4 && rem_is_v4; + if (!(pm->status & MPTCP_PM_WORK_MASK)) + return; - if (ipv6_only_sock(sk)) - return !loc_is_v4 && !rem_is_v4; + spin_lock_bh(&msk->pm.lock); - return loc_is_v4 == rem_is_v4; -#else - return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET; -#endif + pr_debug("msk=%p status=%x\n", msk, pm->status); + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); + mptcp_pm_addr_send_ack(msk); + } + if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { + pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); + mptcp_pm_rm_addr_recv(msk); + } + __mptcp_pm_kernel_worker(msk); + + spin_unlock_bh(&msk->pm.lock); +} + +void mptcp_pm_destroy(struct mptcp_sock *msk) +{ + mptcp_pm_free_anno_list(msk); + + if (mptcp_pm_is_userspace(msk)) + mptcp_userspace_pm_free_local_addr_list(msk); } void mptcp_pm_data_reset(struct mptcp_sock *msk) @@ -505,10 +986,7 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk)); struct mptcp_pm_data *pm = &msk->pm; - pm->add_addr_signaled = 0; - pm->add_addr_accepted = 0; - pm->local_addr_used = 0; - pm->subflows = 0; + memset(&pm->reset, 0, sizeof(pm->reset)); pm->rm_list_tx.nr = 0; pm->rm_list_rx.nr = 0; WRITE_ONCE(pm->pm_type, pm_type); @@ -527,16 +1005,9 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) !!mptcp_pm_get_add_addr_accept_max(msk) && subflows_allowed); WRITE_ONCE(pm->accept_subflow, subflows_allowed); - } else { - WRITE_ONCE(pm->work_pending, 0); - WRITE_ONCE(pm->accept_addr, 0); - WRITE_ONCE(pm->accept_subflow, 0); - } - WRITE_ONCE(pm->addr_signal, 0); - WRITE_ONCE(pm->remote_deny_join_id0, false); - pm->status = 0; - bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + 
bitmap_fill(pm->id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + } } void mptcp_pm_data_init(struct mptcp_sock *msk) @@ -549,5 +1020,75 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) void __init mptcp_pm_init(void) { + mptcp_pm_kernel_register(); + mptcp_pm_userspace_register(); mptcp_pm_nl_init(); } + +/* Must be called with rcu read lock held */ +struct mptcp_pm_ops *mptcp_pm_find(const char *name) +{ + struct mptcp_pm_ops *pm_ops; + + list_for_each_entry_rcu(pm_ops, &mptcp_pm_list, list) { + if (!strcmp(pm_ops->name, name)) + return pm_ops; + } + + return NULL; +} + +int mptcp_pm_validate(struct mptcp_pm_ops *pm_ops) +{ + return 0; +} + +int mptcp_pm_register(struct mptcp_pm_ops *pm_ops) +{ + int ret; + + ret = mptcp_pm_validate(pm_ops); + if (ret) + return ret; + + spin_lock(&mptcp_pm_list_lock); + if (mptcp_pm_find(pm_ops->name)) { + spin_unlock(&mptcp_pm_list_lock); + return -EEXIST; + } + list_add_tail_rcu(&pm_ops->list, &mptcp_pm_list); + spin_unlock(&mptcp_pm_list_lock); + + pr_debug("%s registered\n", pm_ops->name); + return 0; +} + +void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops) +{ + /* skip unregistering the default path manager */ + if (WARN_ON_ONCE(pm_ops == &mptcp_pm_kernel)) + return; + + spin_lock(&mptcp_pm_list_lock); + list_del_rcu(&pm_ops->list); + spin_unlock(&mptcp_pm_list_lock); +} + +/* Build string with list of available path manager values. + * Similar to tcp_get_available_congestion_control() + */ +void mptcp_pm_get_available(char *buf, size_t maxlen) +{ + struct mptcp_pm_ops *pm_ops; + size_t offs = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(pm_ops, &mptcp_pm_list, list) { + offs += snprintf(buf + offs, maxlen - offs, "%s%s", + offs == 0 ? "" : " ", pm_ops->name); + + if (WARN_ON_ONCE(offs >= maxlen)) + break; + } + rcu_read_unlock(); +} diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c new file mode 100644 index 000000000000..d39e7c178460 --- /dev/null +++ b/net/mptcp/pm_kernel.c @@ -0,0 +1,1412 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2025, Matthieu Baerts. 
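The mptcp_pm_ops registry above deliberately mirrors tcp_register_congestion_control(): entries sit on an RCU-protected list keyed by name, and the path_manager sysctl added earlier resolves names via mptcp_pm_find(). A minimal sketch of plugging in an additional path manager; everything named demo_* is hypothetical, and only the .name field, which this patch clearly uses, is populated:

static struct mptcp_pm_ops demo_pm_ops = {
	.name = "demo",
};

static int __init demo_pm_init(void)
{
	/* -EEXIST if another PM already registered this name */
	return mptcp_pm_register(&demo_pm_ops);
}

static void __exit demo_pm_exit(void)
{
	/* unregistering the built-in "kernel" PM is refused with a WARN */
	mptcp_pm_unregister(&demo_pm_ops);
}

Once registered, the name shows up in the read-only available_path_managers sysctl and becomes selectable per namespace through path_manager.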
+ */ + +#define pr_fmt(fmt) "MPTCP: " fmt + +#include <net/netns/generic.h> + +#include "protocol.h" +#include "mib.h" +#include "mptcp_pm_gen.h" + +static int pm_nl_pernet_id; + +struct pm_nl_pernet { + /* protects pernet updates */ + spinlock_t lock; + struct list_head local_addr_list; + unsigned int addrs; + unsigned int stale_loss_cnt; + unsigned int add_addr_signal_max; + unsigned int add_addr_accept_max; + unsigned int local_addr_max; + unsigned int subflows_max; + unsigned int next_id; + DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); +}; + +#define MPTCP_PM_ADDR_MAX 8 + +static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net) +{ + return net_generic(net, pm_nl_pernet_id); +} + +static struct pm_nl_pernet * +pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk) +{ + return pm_nl_get_pernet(sock_net((struct sock *)msk)); +} + +static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) +{ + return pm_nl_get_pernet(genl_info_net(info)); +} + +unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) +{ + const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->add_addr_signal_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); + +unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->add_addr_accept_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); + +unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->subflows_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); + +unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->local_addr_max); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); + +static bool lookup_subflow_by_daddr(const struct list_head *list, + const struct mptcp_addr_info *daddr) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_addr_info cur; + + list_for_each_entry(subflow, list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (!((1 << inet_sk_state_load(ssk)) & + (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV))) + continue; + + mptcp_remote_address((struct sock_common *)ssk, &cur); + if (mptcp_addresses_equal(&cur, daddr, daddr->port)) + return true; + } + + return false; +} + +static bool +select_local_address(const struct pm_nl_pernet *pernet, + const struct mptcp_sock *msk, + struct mptcp_pm_local *new_local) +{ + struct mptcp_pm_addr_entry *entry; + bool found = false; + + msk_owned_by_me(msk); + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) + continue; + + if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) + continue; + + new_local->addr = entry->addr; + new_local->flags = entry->flags; + new_local->ifindex = entry->ifindex; + found = true; + break; + } + rcu_read_unlock(); + + return found; +} + +static bool +select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk, + struct mptcp_pm_local *new_local) +{ + struct mptcp_pm_addr_entry *entry; + bool found = false; + + rcu_read_lock(); + /* do not keep any additional per socket state, just signal + * the address list in order. 
+ * Note: removal from the local address list during the msk life-cycle + * can lead to additional addresses not being announced. + */ + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) + continue; + + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) + continue; + + new_local->addr = entry->addr; + new_local->flags = entry->flags; + new_local->ifindex = entry->ifindex; + found = true; + break; + } + rcu_read_unlock(); + + return found; +} + +/* Fill all the remote addresses into the array addrs[], + * and return the array size. + */ +static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, + struct mptcp_addr_info *local, + bool fullmesh, + struct mptcp_addr_info *addrs) +{ + bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); + struct sock *sk = (struct sock *)msk, *ssk; + struct mptcp_subflow_context *subflow; + struct mptcp_addr_info remote = { 0 }; + unsigned int subflows_max; + int i = 0; + + subflows_max = mptcp_pm_get_subflows_max(msk); + mptcp_remote_address((struct sock_common *)sk, &remote); + + /* Non-fullmesh endpoint, fill in the single entry + * corresponding to the primary MPC subflow remote address + */ + if (!fullmesh) { + if (deny_id0) + return 0; + + if (!mptcp_pm_addr_families_match(sk, local, &remote)) + return 0; + + msk->pm.subflows++; + addrs[i++] = remote; + } else { + DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + + /* Forbid creation of new subflows matching existing + * ones, possibly already created by incoming ADD_ADDR + */ + bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + mptcp_for_each_subflow(msk, subflow) + if (READ_ONCE(subflow->local_id) == local->id) + __set_bit(subflow->remote_id, unavail_id); + + mptcp_for_each_subflow(msk, subflow) { + ssk = mptcp_subflow_tcp_sock(subflow); + mptcp_remote_address((struct sock_common *)ssk, &addrs[i]); + addrs[i].id = READ_ONCE(subflow->remote_id); + if (deny_id0 && !addrs[i].id) + continue; + + if (test_bit(addrs[i].id, unavail_id)) + continue; + + if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) + continue; + + if (msk->pm.subflows < subflows_max) { + /* forbid creating multiple address towards + * this id + */ + __set_bit(addrs[i].id, unavail_id); + msk->pm.subflows++; + i++; + } + } + } + + return i; +} + +static struct mptcp_pm_addr_entry * +__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) +{ + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, + lockdep_is_held(&pernet->lock)) { + if (entry->addr.id == id) + return entry; + } + return NULL; +} + +static struct mptcp_pm_addr_entry * +__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) +{ + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, + lockdep_is_held(&pernet->lock)) { + if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) + return entry; + } + return NULL; +} + +static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + unsigned int add_addr_signal_max; + bool signal_and_subflow = false; + unsigned int local_addr_max; + struct pm_nl_pernet *pernet; + struct mptcp_pm_local local; + unsigned int subflows_max; + + pernet = pm_nl_get_pernet(sock_net(sk)); + + add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); + local_addr_max = mptcp_pm_get_local_addr_max(msk); + subflows_max = mptcp_pm_get_subflows_max(msk); + + /* do lazy 
endpoint usage accounting for the MPC subflows */ + if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first); + struct mptcp_pm_addr_entry *entry; + struct mptcp_addr_info mpc_addr; + bool backup = false; + + mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); + rcu_read_lock(); + entry = __lookup_addr(pernet, &mpc_addr); + if (entry) { + __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); + msk->mpc_endpoint_id = entry->addr.id; + backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + } + rcu_read_unlock(); + + if (backup) + mptcp_pm_send_ack(msk, subflow, true, backup); + + msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED); + } + + pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", + msk->pm.local_addr_used, local_addr_max, + msk->pm.add_addr_signaled, add_addr_signal_max, + msk->pm.subflows, subflows_max); + + /* check first for announce */ + if (msk->pm.add_addr_signaled < add_addr_signal_max) { + /* due to racing events on both ends we can reach here while + * previous add address is still running: if we invoke now + * mptcp_pm_announce_addr(), that will fail and the + * corresponding id will be marked as used. + * Instead let the PM machinery reschedule us when the + * current address announce will be completed. + */ + if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) + return; + + if (!select_signal_address(pernet, msk, &local)) + goto subflow; + + /* If the alloc fails, we are on memory pressure, not worth + * continuing, and trying to create subflows. + */ + if (!mptcp_pm_alloc_anno_list(msk, &local.addr)) + return; + + __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); + msk->pm.add_addr_signaled++; + + /* Special case for ID0: set the correct ID */ + if (local.addr.id == msk->mpc_endpoint_id) + local.addr.id = 0; + + mptcp_pm_announce_addr(msk, &local.addr, false); + mptcp_pm_addr_send_ack(msk); + + if (local.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) + signal_and_subflow = true; + } + +subflow: + /* check if should create a new subflow */ + while (msk->pm.local_addr_used < local_addr_max && + msk->pm.subflows < subflows_max) { + struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; + bool fullmesh; + int i, nr; + + if (signal_and_subflow) + signal_and_subflow = false; + else if (!select_local_address(pernet, msk, &local)) + break; + + fullmesh = !!(local.flags & MPTCP_PM_ADDR_FLAG_FULLMESH); + + __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); + + /* Special case for ID0: set the correct ID */ + if (local.addr.id == msk->mpc_endpoint_id) + local.addr.id = 0; + else /* local_addr_used is not decr for ID 0 */ + msk->pm.local_addr_used++; + + nr = fill_remote_addresses_vec(msk, &local.addr, fullmesh, addrs); + if (nr == 0) + continue; + + spin_unlock_bh(&msk->pm.lock); + for (i = 0; i < nr; i++) + __mptcp_subflow_connect(sk, &local, &addrs[i]); + spin_lock_bh(&msk->pm.lock); + } + mptcp_pm_nl_check_work_pending(msk); +} + +static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk) +{ + mptcp_pm_create_subflow_or_signal_addr(msk); +} + +static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) +{ + mptcp_pm_create_subflow_or_signal_addr(msk); +} + +/* Fill all the local addresses into the array addrs[], + * and return the array size. 
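One subtlety in mptcp_pm_create_subflow_or_signal_addr() above is the twice-repeated "Special case for ID0" block: the endpoint that matches the initial (MPC) subflow address is always advertised as id 0 on the wire and is deliberately excluded from the local_addr_used budget. The rule in isolation, as a sketch reusing the patch's own field names:

static void demo_take_local_endpoint(struct mptcp_sock *msk,
				     struct mptcp_pm_local *local)
{
	/* every endpoint id is used at most once per connection */
	__clear_bit(local->addr.id, msk->pm.id_avail_bitmap);

	if (local->addr.id == msk->mpc_endpoint_id)
		local->addr.id = 0;	/* initial path: id 0 on the wire */
	else
		msk->pm.local_addr_used++;	/* id 0 is never counted */
}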
+ */ +static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, + struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals) +{ + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *entry; + struct mptcp_addr_info mpc_addr; + struct pm_nl_pernet *pernet; + unsigned int subflows_max; + int i = 0; + + pernet = pm_nl_get_pernet_from_msk(msk); + subflows_max = mptcp_pm_get_subflows_max(msk); + + mptcp_local_address((struct sock_common *)msk, &mpc_addr); + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) + continue; + + if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) + continue; + + if (msk->pm.subflows < subflows_max) { + locals[i].addr = entry->addr; + locals[i].flags = entry->flags; + locals[i].ifindex = entry->ifindex; + + /* Special case for ID0: set the correct ID */ + if (mptcp_addresses_equal(&locals[i].addr, &mpc_addr, locals[i].addr.port)) + locals[i].addr.id = 0; + + msk->pm.subflows++; + i++; + } + } + rcu_read_unlock(); + + /* If the array is empty, fill in the single + * 'IPADDRANY' local address + */ + if (!i) { + memset(&locals[i], 0, sizeof(locals[i])); + locals[i].addr.family = +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + remote->family == AF_INET6 && + ipv6_addr_v4mapped(&remote->addr6) ? AF_INET : +#endif + remote->family; + + if (!mptcp_pm_addr_families_match(sk, &locals[i].addr, remote)) + return 0; + + msk->pm.subflows++; + i++; + } + + return i; +} + +static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) +{ + struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; + struct sock *sk = (struct sock *)msk; + unsigned int add_addr_accept_max; + struct mptcp_addr_info remote; + unsigned int subflows_max; + bool sf_created = false; + int i, nr; + + add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); + subflows_max = mptcp_pm_get_subflows_max(msk); + + pr_debug("accepted %d:%d remote family %d\n", + msk->pm.add_addr_accepted, add_addr_accept_max, + msk->pm.remote.family); + + remote = msk->pm.remote; + mptcp_pm_announce_addr(msk, &remote, true); + mptcp_pm_addr_send_ack(msk); + + if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) + return; + + /* if no port is provided, reuse the port of the initial (id 0) remote address */ + if (!remote.port) + remote.port = sk->sk_dport; + + /* connect to the specified remote address, using whatever + * local address the routing configuration will pick. + */ + nr = fill_local_addresses_vec(msk, &remote, locals); + if (nr == 0) + return; + + spin_unlock_bh(&msk->pm.lock); + for (i = 0; i < nr; i++) + if (__mptcp_subflow_connect(sk, &locals[i], &remote) == 0) + sf_created = true; + spin_lock_bh(&msk->pm.lock); + + if (sf_created) { + /* add_addr_accepted is not decr for ID 0 */ + if (remote.id) + msk->pm.add_addr_accepted++; + if (msk->pm.add_addr_accepted >= add_addr_accept_max || + msk->pm.subflows >= subflows_max) + WRITE_ONCE(msk->pm.accept_addr, false); + } +} + +void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) +{ + if (rm_id && !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { + /* Note: if the subflow has been closed before, this + * add_addr_accepted counter will not be decremented.
+ */ + if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) + WRITE_ONCE(msk->pm.accept_addr, true); + } +} + +static bool address_use_port(struct mptcp_pm_addr_entry *entry) +{ + return (entry->flags & + (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) == + MPTCP_PM_ADDR_FLAG_SIGNAL; +} + +/* caller must ensure the RCU grace period is already elapsed */ +static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) +{ + if (entry->lsk) + sock_release(entry->lsk); + kfree(entry); +} + +static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, + struct mptcp_pm_addr_entry *entry, + bool needs_id, bool replace) +{ + struct mptcp_pm_addr_entry *cur, *del_entry = NULL; + unsigned int addr_max; + int ret = -EINVAL; + + spin_lock_bh(&pernet->lock); + /* to keep the code simple, don't do IDR-like allocation for address ID, + * just bail when we exceed limits + */ + if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID) + pernet->next_id = 1; + if (pernet->addrs >= MPTCP_PM_ADDR_MAX) { + ret = -ERANGE; + goto out; + } + if (test_bit(entry->addr.id, pernet->id_bitmap)) { + ret = -EBUSY; + goto out; + } + + /* do not insert duplicate addresses; only signal endpoints using a + * port are differentiated by it + */ + if (!address_use_port(entry)) + entry->addr.port = 0; + list_for_each_entry(cur, &pernet->local_addr_list, list) { + if (mptcp_addresses_equal(&cur->addr, &entry->addr, + cur->addr.port || entry->addr.port)) { + /* allow replacing the existing endpoint only if such + * endpoint is an implicit one and user-space + * did not provide an endpoint id + */ + if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) { + ret = -EEXIST; + goto out; + } + if (entry->addr.id) + goto out; + + /* allow callers that only need to look up the local + * addr's id to skip replacement. This allows them to + * avoid calling synchronize_rcu in the packet recv + * path.
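address_use_port() above encodes a small but load-bearing rule: an endpoint keeps its configured port only when it is a pure signal endpoint, i.e. SIGNAL set and SUBFLOW clear. A standalone restatement with stand-in flag values:

#include <assert.h>
#include <stdbool.h>

#define FLAG_SIGNAL	0x1	/* stand-in for MPTCP_PM_ADDR_FLAG_SIGNAL */
#define FLAG_SUBFLOW	0x2	/* stand-in for MPTCP_PM_ADDR_FLAG_SUBFLOW */

static bool use_port(unsigned int flags)
{
	return (flags & (FLAG_SIGNAL | FLAG_SUBFLOW)) == FLAG_SIGNAL;
}

int main(void)
{
	assert(use_port(FLAG_SIGNAL));			/* signal only: keep port */
	assert(!use_port(FLAG_SIGNAL | FLAG_SUBFLOW));	/* mixed: port dropped */
	assert(!use_port(FLAG_SUBFLOW));		/* subflow only: port dropped */
	return 0;
}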
+ */ + if (!replace) { + kfree(entry); + ret = cur->addr.id; + goto out; + } + + pernet->addrs--; + entry->addr.id = cur->addr.id; + list_del_rcu(&cur->list); + del_entry = cur; + break; + } + } + + if (!entry->addr.id && needs_id) { +find_next: + entry->addr.id = find_next_zero_bit(pernet->id_bitmap, + MPTCP_PM_MAX_ADDR_ID + 1, + pernet->next_id); + if (!entry->addr.id && pernet->next_id != 1) { + pernet->next_id = 1; + goto find_next; + } + } + + if (!entry->addr.id && needs_id) + goto out; + + __set_bit(entry->addr.id, pernet->id_bitmap); + if (entry->addr.id > pernet->next_id) + pernet->next_id = entry->addr.id; + + if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { + addr_max = pernet->add_addr_signal_max; + WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + addr_max = pernet->local_addr_max; + WRITE_ONCE(pernet->local_addr_max, addr_max + 1); + } + + pernet->addrs++; + if (!entry->addr.port) + list_add_tail_rcu(&entry->list, &pernet->local_addr_list); + else + list_add_rcu(&entry->list, &pernet->local_addr_list); + ret = entry->addr.id; + +out: + spin_unlock_bh(&pernet->lock); + + /* just replaced an existing entry, free it */ + if (del_entry) { + synchronize_rcu(); + __mptcp_pm_release_addr_entry(del_entry); + } + return ret; +} + +static struct lock_class_key mptcp_slock_keys[2]; +static struct lock_class_key mptcp_keys[2]; + +static int mptcp_pm_nl_create_listen_socket(struct sock *sk, + struct mptcp_pm_addr_entry *entry) +{ + bool is_ipv6 = sk->sk_family == AF_INET6; + int addrlen = sizeof(struct sockaddr_in); + struct sockaddr_storage addr; + struct sock *newsk, *ssk; + int backlog = 1024; + int err; + + err = sock_create_kern(sock_net(sk), entry->addr.family, + SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk); + if (err) + return err; + + newsk = entry->lsk->sk; + if (!newsk) + return -EINVAL; + + /* The subflow socket lock is acquired in a nested to the msk one + * in several places, even by the TCP stack, and this msk is a kernel + * socket: lockdep complains. Instead of propagating the _nested + * modifiers in several places, re-init the lock class for the msk + * socket to an mptcp specific one. + */ + sock_lock_init_class_and_name(newsk, + is_ipv6 ? "mlock-AF_INET6" : "mlock-AF_INET", + &mptcp_slock_keys[is_ipv6], + is_ipv6 ? "msk_lock-AF_INET6" : "msk_lock-AF_INET", + &mptcp_keys[is_ipv6]); + + lock_sock(newsk); + ssk = __mptcp_nmpc_sk(mptcp_sk(newsk)); + release_sock(newsk); + if (IS_ERR(ssk)) + return PTR_ERR(ssk); + + mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (entry->addr.family == AF_INET6) + addrlen = sizeof(struct sockaddr_in6); +#endif + if (ssk->sk_family == AF_INET) + err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (ssk->sk_family == AF_INET6) + err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); +#endif + if (err) + return err; + + /* We don't use mptcp_set_state() here because it needs to be called + * under the msk socket lock. For the moment, that will not bring + * anything more than only calling inet_sk_state_store(), because the + * old status is known (TCP_CLOSE). 
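The listener being built in mptcp_pm_nl_create_listen_socket() here is the in-kernel counterpart of an ordinary userspace MPTCP listening socket. A sketch of that userspace equivalent, assuming a Linux host; the port is an arbitrary example, not taken from the patch:

#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262
#endif

int main(void)
{
	struct sockaddr_in addr;
	int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(10000);	/* example port */
	addr.sin_addr.s_addr = htonl(INADDR_ANY);

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 1024) < 0) {
		perror("bind/listen");
		close(fd);
		return 1;
	}

	close(fd);
	return 0;
}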
+ */ + inet_sk_state_store(newsk, TCP_LISTEN); + lock_sock(ssk); + WRITE_ONCE(mptcp_subflow_ctx(ssk)->pm_listener, true); + err = __inet_listen_sk(ssk, backlog); + if (!err) + mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); + release_sock(ssk); + return err; +} + +int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *skc) +{ + struct mptcp_pm_addr_entry *entry; + struct pm_nl_pernet *pernet; + int ret; + + pernet = pm_nl_get_pernet_from_msk(msk); + + rcu_read_lock(); + entry = __lookup_addr(pernet, &skc->addr); + ret = entry ? entry->addr.id : -1; + rcu_read_unlock(); + if (ret >= 0) + return ret; + + /* address not found, add to local list */ + entry = kmemdup(skc, sizeof(*skc), GFP_ATOMIC); + if (!entry) + return -ENOMEM; + + entry->addr.port = 0; + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true, false); + if (ret < 0) + kfree(entry); + + return ret; +} + +bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + struct mptcp_pm_addr_entry *entry; + bool backup; + + rcu_read_lock(); + entry = __lookup_addr(pernet, skc); + backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + rcu_read_unlock(); + + return backup; +} + +static int mptcp_nl_add_subflow_or_signal_addr(struct net *net, + struct mptcp_addr_info *addr) +{ + struct mptcp_sock *msk; + long s_slot = 0, s_num = 0; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + struct mptcp_addr_info mpc_addr; + + if (!READ_ONCE(msk->fully_established) || + mptcp_pm_is_userspace(msk)) + goto next; + + /* if the endp linked to the init sf is re-added with a != ID */ + mptcp_local_address((struct sock_common *)msk, &mpc_addr); + + lock_sock(sk); + spin_lock_bh(&msk->pm.lock); + if (mptcp_addresses_equal(addr, &mpc_addr, addr->port)) + msk->mpc_endpoint_id = addr->id; + mptcp_pm_create_subflow_or_signal_addr(msk); + spin_unlock_bh(&msk->pm.lock); + release_sock(sk); + +next: + sock_put(sk); + cond_resched(); + } + + return 0; +} + +static bool mptcp_pm_has_addr_attr_id(const struct nlattr *attr, + struct genl_info *info) +{ + struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; + + if (!nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, + mptcp_pm_address_nl_policy, info->extack) && + tb[MPTCP_PM_ADDR_ATTR_ID]) + return true; + return false; +} + +/* Add an MPTCP endpoint */ +int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry addr, *entry; + struct nlattr *attr; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) + return -EINVAL; + + attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; + ret = mptcp_pm_parse_entry(attr, info, true, &addr); + if (ret < 0) + return ret; + + if (addr.addr.port && !address_use_port(&addr)) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "flags must have signal and not subflow when using port"); + return -EINVAL; + } + + if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL && + addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "flags mustn't have both signal and fullmesh"); + return -EINVAL; + } + + if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "can't create IMPLICIT endpoint"); + return -EINVAL; + } + + entry = kmemdup(&addr, sizeof(addr), GFP_KERNEL_ACCOUNT); + if (!entry) { + GENL_SET_ERR_MSG(info, "can't 
allocate addr"); + return -ENOMEM; + } + + if (entry->addr.port) { + ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry); + if (ret) { + GENL_SET_ERR_MSG_FMT(info, "create listen socket error: %d", ret); + goto out_free; + } + } + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, + !mptcp_pm_has_addr_attr_id(attr, info), + true); + if (ret < 0) { + GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret); + goto out_free; + } + + mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk), &entry->addr); + return 0; + +out_free: + __mptcp_pm_release_addr_entry(entry); + return ret; +} + +static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + return msk->mpc_endpoint_id == addr->id ? 0 : addr->id; +} + +static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr, + bool force) +{ + struct mptcp_rm_list list = { .nr = 0 }; + bool ret; + + list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); + + ret = mptcp_remove_anno_list_by_saddr(msk, addr); + if (ret || force) { + spin_lock_bh(&msk->pm.lock); + if (ret) { + __set_bit(addr->id, msk->pm.id_avail_bitmap); + msk->pm.add_addr_signaled--; + } + mptcp_pm_remove_addr(msk, &list); + spin_unlock_bh(&msk->pm.lock); + } + return ret; +} + +static void __mark_subflow_endp_available(struct mptcp_sock *msk, u8 id) +{ + /* If it was marked as used, and not ID 0, decrement local_addr_used */ + if (!__test_and_set_bit(id ? : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap) && + id && !WARN_ON_ONCE(msk->pm.local_addr_used == 0)) + msk->pm.local_addr_used--; +} + +static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, + const struct mptcp_pm_addr_entry *entry) +{ + const struct mptcp_addr_info *addr = &entry->addr; + struct mptcp_rm_list list = { .nr = 1 }; + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; + + pr_debug("remove_id=%d\n", addr->id); + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + bool remove_subflow; + + if (mptcp_pm_is_userspace(msk)) + goto next; + + lock_sock(sk); + remove_subflow = mptcp_lookup_subflow_by_saddr(&msk->conn_list, addr); + mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && + !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); + + list.ids[0] = mptcp_endp_get_local_id(msk, addr); + if (remove_subflow) { + spin_lock_bh(&msk->pm.lock); + mptcp_pm_rm_subflow(msk, &list); + spin_unlock_bh(&msk->pm.lock); + } + + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + spin_lock_bh(&msk->pm.lock); + __mark_subflow_endp_available(msk, list.ids[0]); + spin_unlock_bh(&msk->pm.lock); + } + + if (msk->mpc_endpoint_id == entry->addr.id) + msk->mpc_endpoint_id = 0; + release_sock(sk); + +next: + sock_put(sk); + cond_resched(); + } + + return 0; +} + +static int mptcp_nl_remove_id_zero_address(struct net *net, + struct mptcp_addr_info *addr) +{ + struct mptcp_rm_list list = { .nr = 0 }; + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; + + list.ids[list.nr++] = 0; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + struct mptcp_addr_info msk_local; + + if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) + goto next; + + mptcp_local_address((struct sock_common *)msk, &msk_local); + if (!mptcp_addresses_equal(&msk_local, addr, addr->port)) + goto next; + + lock_sock(sk); + spin_lock_bh(&msk->pm.lock); + mptcp_pm_remove_addr(msk, &list); + mptcp_pm_rm_subflow(msk, &list); + 
__mark_subflow_endp_available(msk, 0); + spin_unlock_bh(&msk->pm.lock); + release_sock(sk); + +next: + sock_put(sk); + cond_resched(); + } + + return 0; +} + +/* Remove an MPTCP endpoint */ +int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry addr, *entry; + unsigned int addr_max; + struct nlattr *attr; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) + return -EINVAL; + + attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; + ret = mptcp_pm_parse_entry(attr, info, false, &addr); + if (ret < 0) + return ret; + + /* the zero id address is special: the first address used by the msk + * always gets such an id, so different subflows can have different zero + * id addresses. Additionally zero id is not accounted for in id_bitmap. + * Let's use an 'mptcp_rm_list' instead of the common remove code. + */ + if (addr.addr.id == 0) + return mptcp_nl_remove_id_zero_address(sock_net(skb->sk), &addr.addr); + + spin_lock_bh(&pernet->lock); + entry = __lookup_addr_by_id(pernet, addr.addr.id); + if (!entry) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found"); + spin_unlock_bh(&pernet->lock); + return -EINVAL; + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { + addr_max = pernet->add_addr_signal_max; + WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + addr_max = pernet->local_addr_max; + WRITE_ONCE(pernet->local_addr_max, addr_max - 1); + } + + pernet->addrs--; + list_del_rcu(&entry->list); + __clear_bit(entry->addr.id, pernet->id_bitmap); + spin_unlock_bh(&pernet->lock); + + mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry); + synchronize_rcu(); + __mptcp_pm_release_addr_entry(entry); + + return ret; +} + +static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk, + struct list_head *rm_list) +{ + struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 }; + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry(entry, rm_list, list) { + if (slist.nr < MPTCP_RM_IDS_MAX && + mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) + slist.ids[slist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); + + if (alist.nr < MPTCP_RM_IDS_MAX && + mptcp_remove_anno_list_by_saddr(msk, &entry->addr)) + alist.ids[alist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); + } + + spin_lock_bh(&msk->pm.lock); + if (alist.nr) { + msk->pm.add_addr_signaled -= alist.nr; + mptcp_pm_remove_addr(msk, &alist); + } + if (slist.nr) + mptcp_pm_rm_subflow(msk, &slist); + /* Reset counters: maybe some subflows have been removed before */ + bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + msk->pm.local_addr_used = 0; + spin_unlock_bh(&msk->pm.lock); +} + +static void mptcp_nl_flush_addrs_list(struct net *net, + struct list_head *rm_list) +{ + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; + + if (list_empty(rm_list)) + return; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + + if (!mptcp_pm_is_userspace(msk)) { + lock_sock(sk); + mptcp_pm_flush_addrs_and_subflows(msk, rm_list); + release_sock(sk); + } + + sock_put(sk); + cond_resched(); + } +} + +/* caller must ensure the RCU grace period is already elapsed */ +static void __flush_addrs(struct list_head *list) +{ + while (!list_empty(list)) { + struct mptcp_pm_addr_entry *cur; + + cur = list_entry(list->next, + struct mptcp_pm_addr_entry, list); + 
list_del_rcu(&cur->list); + __mptcp_pm_release_addr_entry(cur); + } +} + +static void __reset_counters(struct pm_nl_pernet *pernet) +{ + WRITE_ONCE(pernet->add_addr_signal_max, 0); + WRITE_ONCE(pernet->add_addr_accept_max, 0); + WRITE_ONCE(pernet->local_addr_max, 0); + pernet->addrs = 0; +} + +int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + LIST_HEAD(free_list); + + spin_lock_bh(&pernet->lock); + list_splice_init(&pernet->local_addr_list, &free_list); + __reset_counters(pernet); + pernet->next_id = 1; + bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + spin_unlock_bh(&pernet->lock); + mptcp_nl_flush_addrs_list(sock_net(skb->sk), &free_list); + synchronize_rcu(); + __flush_addrs(&free_list); + return 0; +} + +int mptcp_pm_nl_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, + struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry *entry; + int ret = -EINVAL; + + rcu_read_lock(); + entry = __lookup_addr_by_id(pernet, id); + if (entry) { + *addr = *entry; + ret = 0; + } + rcu_read_unlock(); + + return ret; +} + +int mptcp_pm_nl_dump_addr(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct net *net = sock_net(msg->sk); + struct mptcp_pm_addr_entry *entry; + struct pm_nl_pernet *pernet; + int id = cb->args[0]; + int i; + + pernet = pm_nl_get_pernet(net); + + rcu_read_lock(); + for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) { + if (test_bit(i, pernet->id_bitmap)) { + entry = __lookup_addr_by_id(pernet, i); + if (!entry) + break; + + if (entry->addr.id <= id) + continue; + + if (mptcp_pm_genl_fill_addr(msg, cb, entry) < 0) + break; + + id = entry->addr.id; + } + } + rcu_read_unlock(); + + cb->args[0] = id; + return msg->len; +} + +static int parse_limit(struct genl_info *info, int id, unsigned int *limit) +{ + struct nlattr *attr = info->attrs[id]; + + if (!attr) + return 0; + + *limit = nla_get_u32(attr); + if (*limit > MPTCP_PM_ADDR_MAX) { + NL_SET_ERR_MSG_ATTR_FMT(info->extack, attr, + "limit greater than maximum (%u)", + MPTCP_PM_ADDR_MAX); + return -EINVAL; + } + return 0; +} + +int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + unsigned int rcv_addrs, subflows; + int ret; + + spin_lock_bh(&pernet->lock); + rcv_addrs = pernet->add_addr_accept_max; + ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); + if (ret) + goto unlock; + + subflows = pernet->subflows_max; + ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); + if (ret) + goto unlock; + + WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); + WRITE_ONCE(pernet->subflows_max, subflows); + +unlock: + spin_unlock_bh(&pernet->lock); + return ret; +} + +int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct sk_buff *msg; + void *reply; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, + MPTCP_PM_CMD_GET_LIMITS); + if (!reply) + goto fail; + + if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, + READ_ONCE(pernet->add_addr_accept_max))) + goto fail; + + if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, + READ_ONCE(pernet->subflows_max))) + goto fail; + + genlmsg_end(msg, reply); + return genlmsg_reply(msg, info); + +fail: + GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); + 
nlmsg_free(msg); + return -EMSGSIZE; +} + +static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, + struct mptcp_addr_info *addr) +{ + struct mptcp_rm_list list = { .nr = 0 }; + + list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); + + spin_lock_bh(&msk->pm.lock); + mptcp_pm_rm_subflow(msk, &list); + __mark_subflow_endp_available(msk, list.ids[0]); + mptcp_pm_create_subflow_or_signal_addr(msk); + spin_unlock_bh(&msk->pm.lock); +} + +static void mptcp_pm_nl_set_flags_all(struct net *net, + struct mptcp_pm_addr_entry *local, + u8 changed) +{ + u8 is_subflow = !!(local->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW); + u8 bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP); + long s_slot = 0, s_num = 0; + struct mptcp_sock *msk; + + if (changed == MPTCP_PM_ADDR_FLAG_FULLMESH && !is_subflow) + return; + + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { + struct sock *sk = (struct sock *)msk; + + if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) + goto next; + + lock_sock(sk); + if (changed & MPTCP_PM_ADDR_FLAG_BACKUP) + mptcp_pm_mp_prio_send_ack(msk, &local->addr, NULL, bkup); + /* Subflows will only be recreated if the SUBFLOW flag is set */ + if (is_subflow && (changed & MPTCP_PM_ADDR_FLAG_FULLMESH)) + mptcp_pm_nl_fullmesh(msk, &local->addr); + release_sock(sk); + +next: + sock_put(sk); + cond_resched(); + } +} + +int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, + struct genl_info *info) +{ + struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP | + MPTCP_PM_ADDR_FLAG_FULLMESH; + struct net *net = genl_info_net(info); + struct mptcp_pm_addr_entry *entry; + struct pm_nl_pernet *pernet; + u8 lookup_by_id = 0; + + pernet = pm_nl_get_pernet(net); + + if (local->addr.family == AF_UNSPEC) { + lookup_by_id = 1; + if (!local->addr.id) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "missing address ID"); + return -EOPNOTSUPP; + } + } + + spin_lock_bh(&pernet->lock); + entry = lookup_by_id ? 
__lookup_addr_by_id(pernet, local->addr.id) : + __lookup_addr(pernet, &local->addr); + if (!entry) { + spin_unlock_bh(&pernet->lock); + NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found"); + return -EINVAL; + } + if ((local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && + (entry->flags & (MPTCP_PM_ADDR_FLAG_SIGNAL | + MPTCP_PM_ADDR_FLAG_IMPLICIT))) { + spin_unlock_bh(&pernet->lock); + NL_SET_ERR_MSG_ATTR(info->extack, attr, "invalid addr flags"); + return -EINVAL; + } + + changed = (local->flags ^ entry->flags) & mask; + entry->flags = (entry->flags & ~mask) | (local->flags & mask); + *local = *entry; + spin_unlock_bh(&pernet->lock); + + mptcp_pm_nl_set_flags_all(net, local, changed); + return 0; +} + +bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || + (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, + MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { + WRITE_ONCE(msk->pm.work_pending, false); + return false; + } + return true; +} + +/* Called under PM lock */ +void __mptcp_pm_kernel_worker(struct mptcp_sock *msk) +{ + struct mptcp_pm_data *pm = &msk->pm; + + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); + mptcp_pm_nl_add_addr_received(msk); + } + if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); + mptcp_pm_nl_fully_established(msk); + } + if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); + mptcp_pm_nl_subflow_established(msk); + } +} + +static int __net_init pm_nl_init_net(struct net *net) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); + + INIT_LIST_HEAD_RCU(&pernet->local_addr_list); + + /* Cit. 2 subflows ought to be enough for anybody. */ + pernet->subflows_max = 2; + pernet->next_id = 1; + pernet->stale_loss_cnt = 4; + spin_lock_init(&pernet->lock); + + /* No need to initialize other pernet fields, the struct is zeroed at + * allocation time. + */ + + return 0; +} + +static void __net_exit pm_nl_exit_net(struct list_head *net_list) +{ + struct net *net; + + list_for_each_entry(net, net_list, exit_list) { + struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); + + /* net is removed from namespace list, can't race with + * other modifiers, also netns core already waited for a + * RCU grace period. 
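__mptcp_pm_kernel_worker() above follows a test-and-clear dispatch pattern: each pending event is a bit in pm->status, and the bit is cleared before its handler runs so that a re-arm during handling is not lost. A standalone sketch of the same pattern, with made-up event names:

#include <stdint.h>
#include <stdio.h>

#define EV_ADD_ADDR_RECEIVED	(1u << 0)
#define EV_ESTABLISHED		(1u << 1)

static void worker(uint32_t *status)
{
	if (*status & EV_ADD_ADDR_RECEIVED) {
		*status &= ~EV_ADD_ADDR_RECEIVED;	/* clear, then handle */
		puts("handle ADD_ADDR");
	}
	if (*status & EV_ESTABLISHED) {
		*status &= ~EV_ESTABLISHED;
		puts("handle established");
	}
}

int main(void)
{
	uint32_t status = EV_ADD_ADDR_RECEIVED | EV_ESTABLISHED;

	worker(&status);
	return 0;
}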
+ */ + __flush_addrs(&pernet->local_addr_list); + } +} + +static struct pernet_operations mptcp_pm_pernet_ops = { + .init = pm_nl_init_net, + .exit_batch = pm_nl_exit_net, + .id = &pm_nl_pernet_id, + .size = sizeof(struct pm_nl_pernet), +}; + +struct mptcp_pm_ops mptcp_pm_kernel = { + .name = "kernel", + .owner = THIS_MODULE, +}; + +void __init mptcp_pm_kernel_register(void) +{ + if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0) + panic("Failed to register MPTCP PM pernet subsystem.\n"); + + mptcp_pm_register(&mptcp_pm_kernel); +} diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 7a0f7998376a..50aaf259959a 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -6,1186 +6,9 @@ #define pr_fmt(fmt) "MPTCP: " fmt -#include <linux/inet.h> -#include <linux/kernel.h> -#include <net/inet_common.h> -#include <net/netns/generic.h> -#include <net/mptcp.h> - #include "protocol.h" -#include "mib.h" #include "mptcp_pm_gen.h" -static int pm_nl_pernet_id; - -struct mptcp_pm_add_entry { - struct list_head list; - struct mptcp_addr_info addr; - u8 retrans_times; - struct timer_list add_timer; - struct mptcp_sock *sock; -}; - -struct pm_nl_pernet { - /* protects pernet updates */ - spinlock_t lock; - struct list_head local_addr_list; - unsigned int addrs; - unsigned int stale_loss_cnt; - unsigned int add_addr_signal_max; - unsigned int add_addr_accept_max; - unsigned int local_addr_max; - unsigned int subflows_max; - unsigned int next_id; - DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); -}; - -#define MPTCP_PM_ADDR_MAX 8 -#define ADD_ADDR_RETRANS_MAX 3 - -static struct pm_nl_pernet *pm_nl_get_pernet(const struct net *net) -{ - return net_generic(net, pm_nl_pernet_id); -} - -static struct pm_nl_pernet * -pm_nl_get_pernet_from_msk(const struct mptcp_sock *msk) -{ - return pm_nl_get_pernet(sock_net((struct sock *)msk)); -} - -bool mptcp_addresses_equal(const struct mptcp_addr_info *a, - const struct mptcp_addr_info *b, bool use_port) -{ - bool addr_equals = false; - - if (a->family == b->family) { - if (a->family == AF_INET) - addr_equals = a->addr.s_addr == b->addr.s_addr; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else - addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6); - } else if (a->family == AF_INET) { - if (ipv6_addr_v4mapped(&b->addr6)) - addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3]; - } else if (b->family == AF_INET) { - if (ipv6_addr_v4mapped(&a->addr6)) - addr_equals = a->addr6.s6_addr32[3] == b->addr.s_addr; -#endif - } - - if (!addr_equals) - return false; - if (!use_port) - return true; - - return a->port == b->port; -} - -void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr) -{ - addr->family = skc->skc_family; - addr->port = htons(skc->skc_num); - if (addr->family == AF_INET) - addr->addr.s_addr = skc->skc_rcv_saddr; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (addr->family == AF_INET6) - addr->addr6 = skc->skc_v6_rcv_saddr; -#endif -} - -static void remote_address(const struct sock_common *skc, - struct mptcp_addr_info *addr) -{ - addr->family = skc->skc_family; - addr->port = skc->skc_dport; - if (addr->family == AF_INET) - addr->addr.s_addr = skc->skc_daddr; -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (addr->family == AF_INET6) - addr->addr6 = skc->skc_v6_daddr; -#endif -} - -static bool lookup_subflow_by_saddr(const struct list_head *list, - const struct mptcp_addr_info *saddr) -{ - struct mptcp_subflow_context *subflow; - struct mptcp_addr_info cur; - struct sock_common *skc; - - list_for_each_entry(subflow, 
list, node) { - skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); - - mptcp_local_address(skc, &cur); - if (mptcp_addresses_equal(&cur, saddr, saddr->port)) - return true; - } - - return false; -} - -static bool lookup_subflow_by_daddr(const struct list_head *list, - const struct mptcp_addr_info *daddr) -{ - struct mptcp_subflow_context *subflow; - struct mptcp_addr_info cur; - - list_for_each_entry(subflow, list, node) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - if (!((1 << inet_sk_state_load(ssk)) & - (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV))) - continue; - - remote_address((struct sock_common *)ssk, &cur); - if (mptcp_addresses_equal(&cur, daddr, daddr->port)) - return true; - } - - return false; -} - -static bool -select_local_address(const struct pm_nl_pernet *pernet, - const struct mptcp_sock *msk, - struct mptcp_pm_local *new_local) -{ - struct mptcp_pm_addr_entry *entry; - bool found = false; - - msk_owned_by_me(msk); - - rcu_read_lock(); - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) - continue; - - if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) - continue; - - new_local->addr = entry->addr; - new_local->flags = entry->flags; - new_local->ifindex = entry->ifindex; - found = true; - break; - } - rcu_read_unlock(); - - return found; -} - -static bool -select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk, - struct mptcp_pm_local *new_local) -{ - struct mptcp_pm_addr_entry *entry; - bool found = false; - - rcu_read_lock(); - /* do not keep any additional per socket state, just signal - * the address list in order. - * Note: removal from the local address list during the msk life-cycle - * can lead to additional addresses not being announced. 
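select_local_address() and select_signal_address() here perform the same first-match scan: walk the endpoint list in order, skip entries lacking the wanted flag or whose ID is no longer available, and copy out the first survivor. A standalone sketch of that scan, with simplified types:

#include <stdbool.h>
#include <stddef.h>

struct endpoint {
	unsigned int id;
	unsigned int flags;
	bool id_available;
};

static const struct endpoint *
select_first(const struct endpoint *list, size_t n, unsigned int wanted_flag)
{
	for (size_t i = 0; i < n; i++) {
		if (!(list[i].flags & wanted_flag))
			continue;
		if (!list[i].id_available)
			continue;
		return &list[i];	/* first usable entry wins */
	}
	return NULL;
}

int main(void)
{
	const struct endpoint eps[] = {
		{ .id = 1, .flags = 0x1, .id_available = false },
		{ .id = 2, .flags = 0x1, .id_available = true },
	};

	return select_first(eps, 2, 0x1)->id == 2 ? 0 : 1;
}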
- */ - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) - continue; - - if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) - continue; - - new_local->addr = entry->addr; - new_local->flags = entry->flags; - new_local->ifindex = entry->ifindex; - found = true; - break; - } - rcu_read_unlock(); - - return found; -} - -unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) -{ - const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - return READ_ONCE(pernet->add_addr_signal_max); -} -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); - -unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - return READ_ONCE(pernet->add_addr_accept_max); -} -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); - -unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - return READ_ONCE(pernet->subflows_max); -} -EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); - -unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - return READ_ONCE(pernet->local_addr_max); -} -EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); - -bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - - if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || - (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, - MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { - WRITE_ONCE(msk->pm.work_pending, false); - return false; - } - return true; -} - -struct mptcp_pm_add_entry * -mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, - const struct mptcp_addr_info *addr) -{ - struct mptcp_pm_add_entry *entry; - - lockdep_assert_held(&msk->pm.lock); - - list_for_each_entry(entry, &msk->pm.anno_list, list) { - if (mptcp_addresses_equal(&entry->addr, addr, true)) - return entry; - } - - return NULL; -} - -bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk) -{ - struct mptcp_pm_add_entry *entry; - struct mptcp_addr_info saddr; - bool ret = false; - - mptcp_local_address((struct sock_common *)sk, &saddr); - - spin_lock_bh(&msk->pm.lock); - list_for_each_entry(entry, &msk->pm.anno_list, list) { - if (mptcp_addresses_equal(&entry->addr, &saddr, true)) { - ret = true; - goto out; - } - } - -out: - spin_unlock_bh(&msk->pm.lock); - return ret; -} - -static void mptcp_pm_add_timer(struct timer_list *timer) -{ - struct mptcp_pm_add_entry *entry = from_timer(entry, timer, add_timer); - struct mptcp_sock *msk = entry->sock; - struct sock *sk = (struct sock *)msk; - - pr_debug("msk=%p\n", msk); - - if (!msk) - return; - - if (inet_sk_state_load(sk) == TCP_CLOSE) - return; - - if (!entry->addr.id) - return; - - if (mptcp_pm_should_add_signal_addr(msk)) { - sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); - goto out; - } - - spin_lock_bh(&msk->pm.lock); - - if (!mptcp_pm_should_add_signal_addr(msk)) { - pr_debug("retransmit ADD_ADDR id=%d\n", entry->addr.id); - mptcp_pm_announce_addr(msk, &entry->addr, false); - mptcp_pm_add_addr_send_ack(msk); - entry->retrans_times++; - } - - if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) - sk_reset_timer(sk, timer, - jiffies + mptcp_get_add_addr_timeout(sock_net(sk))); - - spin_unlock_bh(&msk->pm.lock); - - if (entry->retrans_times 
== ADD_ADDR_RETRANS_MAX) - mptcp_pm_subflow_established(msk); - -out: - __sock_put(sk); -} - -struct mptcp_pm_add_entry * -mptcp_pm_del_add_timer(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr, bool check_id) -{ - struct mptcp_pm_add_entry *entry; - struct sock *sk = (struct sock *)msk; - struct timer_list *add_timer = NULL; - - spin_lock_bh(&msk->pm.lock); - entry = mptcp_lookup_anno_list_by_saddr(msk, addr); - if (entry && (!check_id || entry->addr.id == addr->id)) { - entry->retrans_times = ADD_ADDR_RETRANS_MAX; - add_timer = &entry->add_timer; - } - if (!check_id && entry) - list_del(&entry->list); - spin_unlock_bh(&msk->pm.lock); - - /* no lock, because sk_stop_timer_sync() is calling del_timer_sync() */ - if (add_timer) - sk_stop_timer_sync(sk, add_timer); - - return entry; -} - -bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr) -{ - struct mptcp_pm_add_entry *add_entry = NULL; - struct sock *sk = (struct sock *)msk; - struct net *net = sock_net(sk); - - lockdep_assert_held(&msk->pm.lock); - - add_entry = mptcp_lookup_anno_list_by_saddr(msk, addr); - - if (add_entry) { - if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk))) - return false; - - sk_reset_timer(sk, &add_entry->add_timer, - jiffies + mptcp_get_add_addr_timeout(net)); - return true; - } - - add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); - if (!add_entry) - return false; - - list_add(&add_entry->list, &msk->pm.anno_list); - - add_entry->addr = *addr; - add_entry->sock = msk; - add_entry->retrans_times = 0; - - timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); - sk_reset_timer(sk, &add_entry->add_timer, - jiffies + mptcp_get_add_addr_timeout(net)); - - return true; -} - -void mptcp_pm_free_anno_list(struct mptcp_sock *msk) -{ - struct mptcp_pm_add_entry *entry, *tmp; - struct sock *sk = (struct sock *)msk; - LIST_HEAD(free_list); - - pr_debug("msk=%p\n", msk); - - spin_lock_bh(&msk->pm.lock); - list_splice_init(&msk->pm.anno_list, &free_list); - spin_unlock_bh(&msk->pm.lock); - - list_for_each_entry_safe(entry, tmp, &free_list, list) { - sk_stop_timer_sync(sk, &entry->add_timer); - kfree(entry); - } -} - -/* Fill all the remote addresses into the array addrs[], - * and return the array size. 
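mptcp_pm_add_timer() above bounds its retries with ADD_ADDR_RETRANS_MAX (3 in this file): the announce is repeated until the peer echoes it or the retry budget is spent, re-arming the timeout after each attempt. A standalone sketch of that loop, with a stubbed-out echo check:

#include <stdbool.h>
#include <stdio.h>

#define ADD_ADDR_RETRANS_MAX 3

static bool peer_echoed(unsigned int attempt)
{
	return attempt == 2;	/* stub: pretend the echo arrives on try 3 */
}

int main(void)
{
	for (unsigned int retrans = 0; retrans < ADD_ADDR_RETRANS_MAX; retrans++) {
		printf("announce ADD_ADDR, attempt %u\n", retrans + 1);
		if (peer_echoed(retrans)) {
			puts("echo received, timer stopped");
			return 0;
		}
		/* the real code re-arms the add_timer here and returns */
	}
	puts("giving up after ADD_ADDR_RETRANS_MAX attempts");
	return 0;
}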
- */ -static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, - struct mptcp_addr_info *local, - bool fullmesh, - struct mptcp_addr_info *addrs) -{ - bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); - struct sock *sk = (struct sock *)msk, *ssk; - struct mptcp_subflow_context *subflow; - struct mptcp_addr_info remote = { 0 }; - unsigned int subflows_max; - int i = 0; - - subflows_max = mptcp_pm_get_subflows_max(msk); - remote_address((struct sock_common *)sk, &remote); - - /* Non-fullmesh endpoint, fill in the single entry - * corresponding to the primary MPC subflow remote address - */ - if (!fullmesh) { - if (deny_id0) - return 0; - - if (!mptcp_pm_addr_families_match(sk, local, &remote)) - return 0; - - msk->pm.subflows++; - addrs[i++] = remote; - } else { - DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); - - /* Forbid creation of new subflows matching existing - * ones, possibly already created by incoming ADD_ADDR - */ - bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); - mptcp_for_each_subflow(msk, subflow) - if (READ_ONCE(subflow->local_id) == local->id) - __set_bit(subflow->remote_id, unavail_id); - - mptcp_for_each_subflow(msk, subflow) { - ssk = mptcp_subflow_tcp_sock(subflow); - remote_address((struct sock_common *)ssk, &addrs[i]); - addrs[i].id = READ_ONCE(subflow->remote_id); - if (deny_id0 && !addrs[i].id) - continue; - - if (test_bit(addrs[i].id, unavail_id)) - continue; - - if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) - continue; - - if (msk->pm.subflows < subflows_max) { - /* forbid creating multiple address towards - * this id - */ - __set_bit(addrs[i].id, unavail_id); - msk->pm.subflows++; - i++; - } - } - } - - return i; -} - -static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, - bool prio, bool backup) -{ - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - bool slow; - - pr_debug("send ack for %s\n", - prio ? "mp_prio" : (mptcp_pm_should_add_signal(msk) ? 
"add_addr" : "rm_addr")); - - slow = lock_sock_fast(ssk); - if (prio) { - subflow->send_mp_prio = 1; - subflow->request_bkup = backup; - } - - __mptcp_subflow_send_ack(ssk); - unlock_sock_fast(ssk, slow); -} - -static void mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, - bool prio, bool backup) -{ - spin_unlock_bh(&msk->pm.lock); - __mptcp_pm_send_ack(msk, subflow, prio, backup); - spin_lock_bh(&msk->pm.lock); -} - -static struct mptcp_pm_addr_entry * -__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) -{ - struct mptcp_pm_addr_entry *entry; - - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, - lockdep_is_held(&pernet->lock)) { - if (entry->addr.id == id) - return entry; - } - return NULL; -} - -static struct mptcp_pm_addr_entry * -__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) -{ - struct mptcp_pm_addr_entry *entry; - - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, - lockdep_is_held(&pernet->lock)) { - if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) - return entry; - } - return NULL; -} - -static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) -{ - struct sock *sk = (struct sock *)msk; - unsigned int add_addr_signal_max; - bool signal_and_subflow = false; - unsigned int local_addr_max; - struct pm_nl_pernet *pernet; - struct mptcp_pm_local local; - unsigned int subflows_max; - - pernet = pm_nl_get_pernet(sock_net(sk)); - - add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); - local_addr_max = mptcp_pm_get_local_addr_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); - - /* do lazy endpoint usage accounting for the MPC subflows */ - if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first); - struct mptcp_pm_addr_entry *entry; - struct mptcp_addr_info mpc_addr; - bool backup = false; - - mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); - rcu_read_lock(); - entry = __lookup_addr(pernet, &mpc_addr); - if (entry) { - __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); - msk->mpc_endpoint_id = entry->addr.id; - backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); - } - rcu_read_unlock(); - - if (backup) - mptcp_pm_send_ack(msk, subflow, true, backup); - - msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED); - } - - pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", - msk->pm.local_addr_used, local_addr_max, - msk->pm.add_addr_signaled, add_addr_signal_max, - msk->pm.subflows, subflows_max); - - /* check first for announce */ - if (msk->pm.add_addr_signaled < add_addr_signal_max) { - /* due to racing events on both ends we can reach here while - * previous add address is still running: if we invoke now - * mptcp_pm_announce_addr(), that will fail and the - * corresponding id will be marked as used. - * Instead let the PM machinery reschedule us when the - * current address announce will be completed. - */ - if (msk->pm.addr_signal & BIT(MPTCP_ADD_ADDR_SIGNAL)) - return; - - if (!select_signal_address(pernet, msk, &local)) - goto subflow; - - /* If the alloc fails, we are on memory pressure, not worth - * continuing, and trying to create subflows. 
- */ - if (!mptcp_pm_alloc_anno_list(msk, &local.addr)) - return; - - __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); - msk->pm.add_addr_signaled++; - - /* Special case for ID0: set the correct ID */ - if (local.addr.id == msk->mpc_endpoint_id) - local.addr.id = 0; - - mptcp_pm_announce_addr(msk, &local.addr, false); - mptcp_pm_nl_addr_send_ack(msk); - - if (local.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) - signal_and_subflow = true; - } - -subflow: - /* check if should create a new subflow */ - while (msk->pm.local_addr_used < local_addr_max && - msk->pm.subflows < subflows_max) { - struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; - bool fullmesh; - int i, nr; - - if (signal_and_subflow) - signal_and_subflow = false; - else if (!select_local_address(pernet, msk, &local)) - break; - - fullmesh = !!(local.flags & MPTCP_PM_ADDR_FLAG_FULLMESH); - - __clear_bit(local.addr.id, msk->pm.id_avail_bitmap); - - /* Special case for ID0: set the correct ID */ - if (local.addr.id == msk->mpc_endpoint_id) - local.addr.id = 0; - else /* local_addr_used is not decr for ID 0 */ - msk->pm.local_addr_used++; - - nr = fill_remote_addresses_vec(msk, &local.addr, fullmesh, addrs); - if (nr == 0) - continue; - - spin_unlock_bh(&msk->pm.lock); - for (i = 0; i < nr; i++) - __mptcp_subflow_connect(sk, &local, &addrs[i]); - spin_lock_bh(&msk->pm.lock); - } - mptcp_pm_nl_check_work_pending(msk); -} - -static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk) -{ - mptcp_pm_create_subflow_or_signal_addr(msk); -} - -static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) -{ - mptcp_pm_create_subflow_or_signal_addr(msk); -} - -/* Fill all the local addresses into the array addrs[], - * and return the array size. - */ -static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, - struct mptcp_addr_info *remote, - struct mptcp_pm_local *locals) -{ - struct sock *sk = (struct sock *)msk; - struct mptcp_pm_addr_entry *entry; - struct mptcp_addr_info mpc_addr; - struct pm_nl_pernet *pernet; - unsigned int subflows_max; - int i = 0; - - pernet = pm_nl_get_pernet_from_msk(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); - - mptcp_local_address((struct sock_common *)msk, &mpc_addr); - - rcu_read_lock(); - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { - if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) - continue; - - if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) - continue; - - if (msk->pm.subflows < subflows_max) { - locals[i].addr = entry->addr; - locals[i].flags = entry->flags; - locals[i].ifindex = entry->ifindex; - - /* Special case for ID0: set the correct ID */ - if (mptcp_addresses_equal(&locals[i].addr, &mpc_addr, locals[i].addr.port)) - locals[i].addr.id = 0; - - msk->pm.subflows++; - i++; - } - } - rcu_read_unlock(); - - /* If the array is empty, fill in the single - * 'IPADDRANY' local address - */ - if (!i) { - memset(&locals[i], 0, sizeof(locals[i])); - locals[i].addr.family = -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - remote->family == AF_INET6 && - ipv6_addr_v4mapped(&remote->addr6) ? 
AF_INET : -#endif - remote->family; - - if (!mptcp_pm_addr_families_match(sk, &locals[i].addr, remote)) - return 0; - - msk->pm.subflows++; - i++; - } - - return i; -} - -static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) -{ - struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; - struct sock *sk = (struct sock *)msk; - unsigned int add_addr_accept_max; - struct mptcp_addr_info remote; - unsigned int subflows_max; - bool sf_created = false; - int i, nr; - - add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); - - pr_debug("accepted %d:%d remote family %d\n", - msk->pm.add_addr_accepted, add_addr_accept_max, - msk->pm.remote.family); - - remote = msk->pm.remote; - mptcp_pm_announce_addr(msk, &remote, true); - mptcp_pm_nl_addr_send_ack(msk); - - if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) - return; - - /* pick id 0 port, if none is provided the remote address */ - if (!remote.port) - remote.port = sk->sk_dport; - - /* connect to the specified remote address, using whatever - * local address the routing configuration will pick. - */ - nr = fill_local_addresses_vec(msk, &remote, locals); - if (nr == 0) - return; - - spin_unlock_bh(&msk->pm.lock); - for (i = 0; i < nr; i++) - if (__mptcp_subflow_connect(sk, &locals[i], &remote) == 0) - sf_created = true; - spin_lock_bh(&msk->pm.lock); - - if (sf_created) { - /* add_addr_accepted is not decr for ID 0 */ - if (remote.id) - msk->pm.add_addr_accepted++; - if (msk->pm.add_addr_accepted >= add_addr_accept_max || - msk->pm.subflows >= subflows_max) - WRITE_ONCE(msk->pm.accept_addr, false); - } -} - -bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, - const struct mptcp_addr_info *remote) -{ - struct mptcp_addr_info mpc_remote; - - remote_address((struct sock_common *)msk, &mpc_remote); - return mptcp_addresses_equal(&mpc_remote, remote, remote->port); -} - -void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow, *alt = NULL; - - msk_owned_by_me(msk); - lockdep_assert_held(&msk->pm.lock); - - if (!mptcp_pm_should_add_signal(msk) && - !mptcp_pm_should_rm_signal(msk)) - return; - - mptcp_for_each_subflow(msk, subflow) { - if (__mptcp_subflow_active(subflow)) { - if (!subflow->stale) { - mptcp_pm_send_ack(msk, subflow, false, false); - return; - } - - if (!alt) - alt = subflow; - } - } - - if (alt) - mptcp_pm_send_ack(msk, alt, false, false); -} - -int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, - struct mptcp_addr_info *rem, - u8 bkup) -{ - struct mptcp_subflow_context *subflow; - - pr_debug("bkup=%d\n", bkup); - - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - struct mptcp_addr_info local, remote; - - mptcp_local_address((struct sock_common *)ssk, &local); - if (!mptcp_addresses_equal(&local, addr, addr->port)) - continue; - - if (rem && rem->family != AF_UNSPEC) { - remote_address((struct sock_common *)ssk, &remote); - if (!mptcp_addresses_equal(&remote, rem, rem->port)) - continue; - } - - __mptcp_pm_send_ack(msk, subflow, true, bkup); - return 0; - } - - return -EINVAL; -} - -static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, - const struct mptcp_rm_list *rm_list, - enum linux_mptcp_mib_field rm_type) -{ - struct mptcp_subflow_context *subflow, *tmp; - struct sock *sk = (struct sock *)msk; - u8 i; - - pr_debug("%s rm_list_nr %d\n", - rm_type == MPTCP_MIB_RMADDR ? 
"address" : "subflow", rm_list->nr); - - msk_owned_by_me(msk); - - if (sk->sk_state == TCP_LISTEN) - return; - - if (!rm_list->nr) - return; - - if (list_empty(&msk->conn_list)) - return; - - for (i = 0; i < rm_list->nr; i++) { - u8 rm_id = rm_list->ids[i]; - bool removed = false; - - mptcp_for_each_subflow_safe(msk, subflow, tmp) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - u8 remote_id = READ_ONCE(subflow->remote_id); - int how = RCV_SHUTDOWN | SEND_SHUTDOWN; - u8 id = subflow_get_local_id(subflow); - - if ((1 << inet_sk_state_load(ssk)) & - (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE)) - continue; - if (rm_type == MPTCP_MIB_RMADDR && remote_id != rm_id) - continue; - if (rm_type == MPTCP_MIB_RMSUBFLOW && id != rm_id) - continue; - - pr_debug(" -> %s rm_list_ids[%d]=%u local_id=%u remote_id=%u mpc_id=%u\n", - rm_type == MPTCP_MIB_RMADDR ? "address" : "subflow", - i, rm_id, id, remote_id, msk->mpc_endpoint_id); - spin_unlock_bh(&msk->pm.lock); - mptcp_subflow_shutdown(sk, ssk, how); - removed |= subflow->request_join; - - /* the following takes care of updating the subflows counter */ - mptcp_close_ssk(sk, ssk, subflow); - spin_lock_bh(&msk->pm.lock); - - if (rm_type == MPTCP_MIB_RMSUBFLOW) - __MPTCP_INC_STATS(sock_net(sk), rm_type); - } - - if (rm_type == MPTCP_MIB_RMADDR) - __MPTCP_INC_STATS(sock_net(sk), rm_type); - - if (!removed) - continue; - - if (!mptcp_pm_is_kernel(msk)) - continue; - - if (rm_type == MPTCP_MIB_RMADDR && rm_id && - !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { - /* Note: if the subflow has been closed before, this - * add_addr_accepted counter will not be decremented. - */ - if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) - WRITE_ONCE(msk->pm.accept_addr, true); - } - } -} - -static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) -{ - mptcp_pm_nl_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); -} - -static void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, - const struct mptcp_rm_list *rm_list) -{ - mptcp_pm_nl_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); -} - -void mptcp_pm_nl_work(struct mptcp_sock *msk) -{ - struct mptcp_pm_data *pm = &msk->pm; - - msk_owned_by_me(msk); - - if (!(pm->status & MPTCP_PM_WORK_MASK)) - return; - - spin_lock_bh(&msk->pm.lock); - - pr_debug("msk=%p status=%x\n", msk, pm->status); - if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { - pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); - mptcp_pm_nl_add_addr_received(msk); - } - if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { - pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); - mptcp_pm_nl_addr_send_ack(msk); - } - if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { - pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); - mptcp_pm_nl_rm_addr_received(msk); - } - if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); - mptcp_pm_nl_fully_established(msk); - } - if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { - pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); - mptcp_pm_nl_subflow_established(msk); - } - - spin_unlock_bh(&msk->pm.lock); -} - -static bool address_use_port(struct mptcp_pm_addr_entry *entry) -{ - return (entry->flags & - (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) == - MPTCP_PM_ADDR_FLAG_SIGNAL; -} - -/* caller must ensure the RCU grace period is already elapsed */ -static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry) -{ - if (entry->lsk) - sock_release(entry->lsk); - kfree(entry); -} - -static int 
mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, - struct mptcp_pm_addr_entry *entry, - bool needs_id) -{ - struct mptcp_pm_addr_entry *cur, *del_entry = NULL; - unsigned int addr_max; - int ret = -EINVAL; - - spin_lock_bh(&pernet->lock); - /* to keep the code simple, don't do IDR-like allocation for address ID, - * just bail when we exceed limits - */ - if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID) - pernet->next_id = 1; - if (pernet->addrs >= MPTCP_PM_ADDR_MAX) { - ret = -ERANGE; - goto out; - } - if (test_bit(entry->addr.id, pernet->id_bitmap)) { - ret = -EBUSY; - goto out; - } - - /* do not insert duplicate address, differentiate on port only - * singled addresses - */ - if (!address_use_port(entry)) - entry->addr.port = 0; - list_for_each_entry(cur, &pernet->local_addr_list, list) { - if (mptcp_addresses_equal(&cur->addr, &entry->addr, - cur->addr.port || entry->addr.port)) { - /* allow replacing the exiting endpoint only if such - * endpoint is an implicit one and the user-space - * did not provide an endpoint id - */ - if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) { - ret = -EEXIST; - goto out; - } - if (entry->addr.id) - goto out; - - pernet->addrs--; - entry->addr.id = cur->addr.id; - list_del_rcu(&cur->list); - del_entry = cur; - break; - } - } - - if (!entry->addr.id && needs_id) { -find_next: - entry->addr.id = find_next_zero_bit(pernet->id_bitmap, - MPTCP_PM_MAX_ADDR_ID + 1, - pernet->next_id); - if (!entry->addr.id && pernet->next_id != 1) { - pernet->next_id = 1; - goto find_next; - } - } - - if (!entry->addr.id && needs_id) - goto out; - - __set_bit(entry->addr.id, pernet->id_bitmap); - if (entry->addr.id > pernet->next_id) - pernet->next_id = entry->addr.id; - - if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); - } - if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max + 1); - } - - pernet->addrs++; - if (!entry->addr.port) - list_add_tail_rcu(&entry->list, &pernet->local_addr_list); - else - list_add_rcu(&entry->list, &pernet->local_addr_list); - ret = entry->addr.id; - -out: - spin_unlock_bh(&pernet->lock); - - /* just replaced an existing entry, free it */ - if (del_entry) { - synchronize_rcu(); - __mptcp_pm_release_addr_entry(del_entry); - } - return ret; -} - -static struct lock_class_key mptcp_slock_keys[2]; -static struct lock_class_key mptcp_keys[2]; - -static int mptcp_pm_nl_create_listen_socket(struct sock *sk, - struct mptcp_pm_addr_entry *entry) -{ - bool is_ipv6 = sk->sk_family == AF_INET6; - int addrlen = sizeof(struct sockaddr_in); - struct sockaddr_storage addr; - struct sock *newsk, *ssk; - int backlog = 1024; - int err; - - err = sock_create_kern(sock_net(sk), entry->addr.family, - SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk); - if (err) - return err; - - newsk = entry->lsk->sk; - if (!newsk) - return -EINVAL; - - /* The subflow socket lock is acquired in a nested to the msk one - * in several places, even by the TCP stack, and this msk is a kernel - * socket: lockdep complains. Instead of propagating the _nested - * modifiers in several places, re-init the lock class for the msk - * socket to an mptcp specific one. - */ - sock_lock_init_class_and_name(newsk, - is_ipv6 ? "mlock-AF_INET6" : "mlock-AF_INET", - &mptcp_slock_keys[is_ipv6], - is_ipv6 ? 
"msk_lock-AF_INET6" : "msk_lock-AF_INET", - &mptcp_keys[is_ipv6]); - - lock_sock(newsk); - ssk = __mptcp_nmpc_sk(mptcp_sk(newsk)); - release_sock(newsk); - if (IS_ERR(ssk)) - return PTR_ERR(ssk); - - mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (entry->addr.family == AF_INET6) - addrlen = sizeof(struct sockaddr_in6); -#endif - if (ssk->sk_family == AF_INET) - err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - else if (ssk->sk_family == AF_INET6) - err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); -#endif - if (err) - return err; - - /* We don't use mptcp_set_state() here because it needs to be called - * under the msk socket lock. For the moment, that will not bring - * anything more than only calling inet_sk_state_store(), because the - * old status is known (TCP_CLOSE). - */ - inet_sk_state_store(newsk, TCP_LISTEN); - lock_sock(ssk); - WRITE_ONCE(mptcp_subflow_ctx(ssk)->pm_listener, true); - err = __inet_listen_sk(ssk, backlog); - if (!err) - mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED); - release_sock(ssk); - return err; -} - -int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc) -{ - struct mptcp_pm_addr_entry *entry; - struct pm_nl_pernet *pernet; - int ret; - - pernet = pm_nl_get_pernet_from_msk(msk); - - rcu_read_lock(); - entry = __lookup_addr(pernet, skc); - ret = entry ? entry->addr.id : -1; - rcu_read_unlock(); - if (ret >= 0) - return ret; - - /* address not found, add to local list */ - entry = kmalloc(sizeof(*entry), GFP_ATOMIC); - if (!entry) - return -ENOMEM; - - entry->addr = *skc; - entry->addr.id = 0; - entry->addr.port = 0; - entry->ifindex = 0; - entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; - entry->lsk = NULL; - ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true); - if (ret < 0) - kfree(entry); - - return ret; -} - -bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - struct mptcp_pm_addr_entry *entry; - bool backup; - - rcu_read_lock(); - entry = __lookup_addr(pernet, skc); - backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); - rcu_read_unlock(); - - return backup; -} - #define MPTCP_PM_CMD_GRP_OFFSET 0 #define MPTCP_PM_EV_GRP_OFFSET 1 @@ -1196,43 +19,6 @@ static const struct genl_multicast_group mptcp_pm_mcgrps[] = { }, }; -void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) -{ - struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk); - struct sock *sk = (struct sock *)msk; - unsigned int active_max_loss_cnt; - struct net *net = sock_net(sk); - unsigned int stale_loss_cnt; - bool slow; - - stale_loss_cnt = mptcp_stale_loss_cnt(net); - if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt) - return; - - /* look for another available subflow not in loss state */ - active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1); - mptcp_for_each_subflow(msk, iter) { - if (iter != subflow && mptcp_subflow_active(iter) && - iter->stale_count < active_max_loss_cnt) { - /* we have some alternatives, try to mark this subflow as idle ...*/ - slow = lock_sock_fast(ssk); - if (!tcp_rtx_and_write_queues_empty(ssk)) { - subflow->stale = 1; - __mptcp_retransmit_pending_data(sk); - MPTCP_INC_STATS(net, MPTCP_MIB_SUBFLOWSTALE); - } - unlock_sock_fast(ssk, slow); - - /* always try to push the pending data regardless of re-injections: - * we can 
possibly use backup subflows now, and subflow selection - * is cheap under the msk socket lock - */ - __mptcp_push_pending(sk, 0); - return; - } - } -} - static int mptcp_pm_family_to_addr(int family) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -1341,420 +127,8 @@ int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, return 0; } -static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) -{ - return pm_nl_get_pernet(genl_info_net(info)); -} - -static int mptcp_nl_add_subflow_or_signal_addr(struct net *net, - struct mptcp_addr_info *addr) -{ - struct mptcp_sock *msk; - long s_slot = 0, s_num = 0; - - while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { - struct sock *sk = (struct sock *)msk; - struct mptcp_addr_info mpc_addr; - - if (!READ_ONCE(msk->fully_established) || - mptcp_pm_is_userspace(msk)) - goto next; - - /* if the endp linked to the init sf is re-added with a != ID */ - mptcp_local_address((struct sock_common *)msk, &mpc_addr); - - lock_sock(sk); - spin_lock_bh(&msk->pm.lock); - if (mptcp_addresses_equal(addr, &mpc_addr, addr->port)) - msk->mpc_endpoint_id = addr->id; - mptcp_pm_create_subflow_or_signal_addr(msk); - spin_unlock_bh(&msk->pm.lock); - release_sock(sk); - -next: - sock_put(sk); - cond_resched(); - } - - return 0; -} - -static bool mptcp_pm_has_addr_attr_id(const struct nlattr *attr, - struct genl_info *info) -{ - struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; - - if (!nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, - mptcp_pm_address_nl_policy, info->extack) && - tb[MPTCP_PM_ADDR_ATTR_ID]) - return true; - return false; -} - -int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - struct mptcp_pm_addr_entry addr, *entry; - int ret; - - ret = mptcp_pm_parse_entry(attr, info, true, &addr); - if (ret < 0) - return ret; - - if (addr.addr.port && !address_use_port(&addr)) { - GENL_SET_ERR_MSG(info, "flags must have signal and not subflow when using port"); - return -EINVAL; - } - - if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL && - addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { - GENL_SET_ERR_MSG(info, "flags mustn't have both signal and fullmesh"); - return -EINVAL; - } - - if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) { - GENL_SET_ERR_MSG(info, "can't create IMPLICIT endpoint"); - return -EINVAL; - } - - entry = kzalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT); - if (!entry) { - GENL_SET_ERR_MSG(info, "can't allocate addr"); - return -ENOMEM; - } - - *entry = addr; - if (entry->addr.port) { - ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry); - if (ret) { - GENL_SET_ERR_MSG_FMT(info, "create listen socket error: %d", ret); - goto out_free; - } - } - ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, - !mptcp_pm_has_addr_attr_id(attr, info)); - if (ret < 0) { - GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret); - goto out_free; - } - - mptcp_nl_add_subflow_or_signal_addr(sock_net(skb->sk), &entry->addr); - return 0; - -out_free: - __mptcp_pm_release_addr_entry(entry); - return ret; -} - -static bool remove_anno_list_by_saddr(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr) -{ - struct mptcp_pm_add_entry *entry; - - entry = mptcp_pm_del_add_timer(msk, addr, false); - if (entry) { - kfree(entry); - return true; - } - - return false; -} - -static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr) -{ - return 
msk->mpc_endpoint_id == addr->id ? 0 : addr->id; -} - -static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr, - bool force) -{ - struct mptcp_rm_list list = { .nr = 0 }; - bool ret; - - list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); - - ret = remove_anno_list_by_saddr(msk, addr); - if (ret || force) { - spin_lock_bh(&msk->pm.lock); - if (ret) { - __set_bit(addr->id, msk->pm.id_avail_bitmap); - msk->pm.add_addr_signaled--; - } - mptcp_pm_remove_addr(msk, &list); - spin_unlock_bh(&msk->pm.lock); - } - return ret; -} - -static void __mark_subflow_endp_available(struct mptcp_sock *msk, u8 id) -{ - /* If it was marked as used, and not ID 0, decrement local_addr_used */ - if (!__test_and_set_bit(id ? : msk->mpc_endpoint_id, msk->pm.id_avail_bitmap) && - id && !WARN_ON_ONCE(msk->pm.local_addr_used == 0)) - msk->pm.local_addr_used--; -} - -static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, - const struct mptcp_pm_addr_entry *entry) -{ - const struct mptcp_addr_info *addr = &entry->addr; - struct mptcp_rm_list list = { .nr = 1 }; - long s_slot = 0, s_num = 0; - struct mptcp_sock *msk; - - pr_debug("remove_id=%d\n", addr->id); - - while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { - struct sock *sk = (struct sock *)msk; - bool remove_subflow; - - if (mptcp_pm_is_userspace(msk)) - goto next; - - if (list_empty(&msk->conn_list)) { - mptcp_pm_remove_anno_addr(msk, addr, false); - goto next; - } - - lock_sock(sk); - remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr); - mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && - !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); - - list.ids[0] = mptcp_endp_get_local_id(msk, addr); - if (remove_subflow) { - spin_lock_bh(&msk->pm.lock); - mptcp_pm_nl_rm_subflow_received(msk, &list); - spin_unlock_bh(&msk->pm.lock); - } - - if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - spin_lock_bh(&msk->pm.lock); - __mark_subflow_endp_available(msk, list.ids[0]); - spin_unlock_bh(&msk->pm.lock); - } - - if (msk->mpc_endpoint_id == entry->addr.id) - msk->mpc_endpoint_id = 0; - release_sock(sk); - -next: - sock_put(sk); - cond_resched(); - } - - return 0; -} - -static int mptcp_nl_remove_id_zero_address(struct net *net, - struct mptcp_addr_info *addr) -{ - struct mptcp_rm_list list = { .nr = 0 }; - long s_slot = 0, s_num = 0; - struct mptcp_sock *msk; - - list.ids[list.nr++] = 0; - - while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { - struct sock *sk = (struct sock *)msk; - struct mptcp_addr_info msk_local; - - if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) - goto next; - - mptcp_local_address((struct sock_common *)msk, &msk_local); - if (!mptcp_addresses_equal(&msk_local, addr, addr->port)) - goto next; - - lock_sock(sk); - spin_lock_bh(&msk->pm.lock); - mptcp_pm_remove_addr(msk, &list); - mptcp_pm_nl_rm_subflow_received(msk, &list); - __mark_subflow_endp_available(msk, 0); - spin_unlock_bh(&msk->pm.lock); - release_sock(sk); - -next: - sock_put(sk); - cond_resched(); - } - - return 0; -} - -int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - struct mptcp_pm_addr_entry addr, *entry; - unsigned int addr_max; - int ret; - - ret = mptcp_pm_parse_entry(attr, info, false, &addr); - if (ret < 0) - return ret; - - /* the zero id address is special: the first address used by the msk - * always gets 
such an id, so different subflows can have different zero - * id addresses. Additionally zero id is not accounted for in id_bitmap. - * Let's use an 'mptcp_rm_list' instead of the common remove code. - */ - if (addr.addr.id == 0) - return mptcp_nl_remove_id_zero_address(sock_net(skb->sk), &addr.addr); - - spin_lock_bh(&pernet->lock); - entry = __lookup_addr_by_id(pernet, addr.addr.id); - if (!entry) { - GENL_SET_ERR_MSG(info, "address not found"); - spin_unlock_bh(&pernet->lock); - return -EINVAL; - } - if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); - } - if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max - 1); - } - - pernet->addrs--; - list_del_rcu(&entry->list); - __clear_bit(entry->addr.id, pernet->id_bitmap); - spin_unlock_bh(&pernet->lock); - - mptcp_nl_remove_subflow_and_signal_addr(sock_net(skb->sk), entry); - synchronize_rcu(); - __mptcp_pm_release_addr_entry(entry); - - return ret; -} - -/* Called from the userspace PM only */ -void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list) -{ - struct mptcp_rm_list alist = { .nr = 0 }; - struct mptcp_pm_addr_entry *entry; - int anno_nr = 0; - - list_for_each_entry(entry, rm_list, list) { - if (alist.nr >= MPTCP_RM_IDS_MAX) - break; - - /* only delete if either announced or matching a subflow */ - if (remove_anno_list_by_saddr(msk, &entry->addr)) - anno_nr++; - else if (!lookup_subflow_by_saddr(&msk->conn_list, - &entry->addr)) - continue; - - alist.ids[alist.nr++] = entry->addr.id; - } - - if (alist.nr) { - spin_lock_bh(&msk->pm.lock); - msk->pm.add_addr_signaled -= anno_nr; - mptcp_pm_remove_addr(msk, &alist); - spin_unlock_bh(&msk->pm.lock); - } -} - -/* Called from the in-kernel PM only */ -static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk, - struct list_head *rm_list) -{ - struct mptcp_rm_list alist = { .nr = 0 }, slist = { .nr = 0 }; - struct mptcp_pm_addr_entry *entry; - - list_for_each_entry(entry, rm_list, list) { - if (slist.nr < MPTCP_RM_IDS_MAX && - lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) - slist.ids[slist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); - - if (alist.nr < MPTCP_RM_IDS_MAX && - remove_anno_list_by_saddr(msk, &entry->addr)) - alist.ids[alist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr); - } - - spin_lock_bh(&msk->pm.lock); - if (alist.nr) { - msk->pm.add_addr_signaled -= alist.nr; - mptcp_pm_remove_addr(msk, &alist); - } - if (slist.nr) - mptcp_pm_nl_rm_subflow_received(msk, &slist); - /* Reset counters: maybe some subflows have been removed before */ - bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); - msk->pm.local_addr_used = 0; - spin_unlock_bh(&msk->pm.lock); -} - -static void mptcp_nl_flush_addrs_list(struct net *net, - struct list_head *rm_list) -{ - long s_slot = 0, s_num = 0; - struct mptcp_sock *msk; - - if (list_empty(rm_list)) - return; - - while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { - struct sock *sk = (struct sock *)msk; - - if (!mptcp_pm_is_userspace(msk)) { - lock_sock(sk); - mptcp_pm_flush_addrs_and_subflows(msk, rm_list); - release_sock(sk); - } - - sock_put(sk); - cond_resched(); - } -} - -/* caller must ensure the RCU grace period is already elapsed */ -static void __flush_addrs(struct list_head *list) -{ - while (!list_empty(list)) { - struct mptcp_pm_addr_entry *cur; - - cur = list_entry(list->next, - 
struct mptcp_pm_addr_entry, list); - list_del_rcu(&cur->list); - __mptcp_pm_release_addr_entry(cur); - } -} - -static void __reset_counters(struct pm_nl_pernet *pernet) -{ - WRITE_ONCE(pernet->add_addr_signal_max, 0); - WRITE_ONCE(pernet->add_addr_accept_max, 0); - WRITE_ONCE(pernet->local_addr_max, 0); - pernet->addrs = 0; -} - -int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - LIST_HEAD(free_list); - - spin_lock_bh(&pernet->lock); - list_splice_init(&pernet->local_addr_list, &free_list); - __reset_counters(pernet); - pernet->next_id = 1; - bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); - spin_unlock_bh(&pernet->lock); - mptcp_nl_flush_addrs_list(sock_net(skb->sk), &free_list); - synchronize_rcu(); - __flush_addrs(&free_list); - return 0; -} - -int mptcp_nl_fill_addr(struct sk_buff *skb, - struct mptcp_pm_addr_entry *entry) +static int mptcp_nl_fill_addr(struct sk_buff *skb, + struct mptcp_pm_addr_entry *entry) { struct mptcp_addr_info *addr = &entry->addr; struct nlattr *attr; @@ -1792,15 +166,26 @@ nla_put_failure: return -EMSGSIZE; } -int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info) +static int mptcp_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, + struct genl_info *info) { - struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - struct mptcp_pm_addr_entry addr, *entry; + if (info->attrs[MPTCP_PM_ATTR_TOKEN]) + return mptcp_userspace_pm_get_addr(id, addr, info); + return mptcp_pm_nl_get_addr(id, addr, info); +} + +int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct mptcp_pm_addr_entry addr; + struct nlattr *attr; struct sk_buff *msg; void *reply; int ret; + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) + return -EINVAL; + + attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; @@ -1817,257 +202,83 @@ int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info) goto fail; } - rcu_read_lock(); - entry = __lookup_addr_by_id(pernet, addr.addr.id); - if (!entry) { - GENL_SET_ERR_MSG(info, "address not found"); - ret = -EINVAL; - goto unlock_fail; + ret = mptcp_pm_get_addr(addr.addr.id, &addr, info); + if (ret) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found"); + goto fail; } - ret = mptcp_nl_fill_addr(msg, entry); + ret = mptcp_nl_fill_addr(msg, &addr); if (ret) - goto unlock_fail; + goto fail; genlmsg_end(msg, reply); ret = genlmsg_reply(msg, info); - rcu_read_unlock(); return ret; -unlock_fail: - rcu_read_unlock(); - fail: nlmsg_free(msg); return ret; } -int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info) -{ - return mptcp_pm_get_addr(skb, info); -} - -int mptcp_pm_nl_dump_addr(struct sk_buff *msg, - struct netlink_callback *cb) +int mptcp_pm_genl_fill_addr(struct sk_buff *msg, + struct netlink_callback *cb, + struct mptcp_pm_addr_entry *entry) { - struct net *net = sock_net(msg->sk); - struct mptcp_pm_addr_entry *entry; - struct pm_nl_pernet *pernet; - int id = cb->args[0]; void *hdr; - int i; - - pernet = pm_nl_get_pernet(net); - - rcu_read_lock(); - for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) { - if (test_bit(i, pernet->id_bitmap)) { - entry = __lookup_addr_by_id(pernet, i); - if (!entry) - break; - - if (entry->addr.id <= id) - continue; - - hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, &mptcp_genl_family, - 
NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); - if (!hdr) - break; - - if (mptcp_nl_fill_addr(msg, entry) < 0) { - genlmsg_cancel(msg, hdr); - break; - } - - id = entry->addr.id; - genlmsg_end(msg, hdr); - } - } - rcu_read_unlock(); - - cb->args[0] = id; - return msg->len; -} -int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - return mptcp_pm_dump_addr(msg, cb); -} - -static int parse_limit(struct genl_info *info, int id, unsigned int *limit) -{ - struct nlattr *attr = info->attrs[id]; - - if (!attr) - return 0; + hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &mptcp_genl_family, + NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); + if (!hdr) + return -EINVAL; - *limit = nla_get_u32(attr); - if (*limit > MPTCP_PM_ADDR_MAX) { - GENL_SET_ERR_MSG(info, "limit greater than maximum"); + if (mptcp_nl_fill_addr(msg, entry) < 0) { + genlmsg_cancel(msg, hdr); return -EINVAL; } - return 0; -} - -int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) -{ - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - unsigned int rcv_addrs, subflows; - int ret; - - spin_lock_bh(&pernet->lock); - rcv_addrs = pernet->add_addr_accept_max; - ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); - if (ret) - goto unlock; - subflows = pernet->subflows_max; - ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); - if (ret) - goto unlock; - - WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); - WRITE_ONCE(pernet->subflows_max, subflows); - -unlock: - spin_unlock_bh(&pernet->lock); - return ret; + genlmsg_end(msg, hdr); + return 0; } -int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) +static int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) { - struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - struct sk_buff *msg; - void *reply; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, - MPTCP_PM_CMD_GET_LIMITS); - if (!reply) - goto fail; - - if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, - READ_ONCE(pernet->add_addr_accept_max))) - goto fail; - - if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, - READ_ONCE(pernet->subflows_max))) - goto fail; + const struct genl_info *info = genl_info_dump(cb); - genlmsg_end(msg, reply); - return genlmsg_reply(msg, info); - -fail: - GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); - nlmsg_free(msg); - return -EMSGSIZE; + if (info->attrs[MPTCP_PM_ATTR_TOKEN]) + return mptcp_userspace_pm_dump_addr(msg, cb); + return mptcp_pm_nl_dump_addr(msg, cb); } -static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, - struct mptcp_addr_info *addr) +int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) { - struct mptcp_rm_list list = { .nr = 0 }; - - list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); - - spin_lock_bh(&msk->pm.lock); - mptcp_pm_nl_rm_subflow_received(msk, &list); - __mark_subflow_endp_available(msk, list.ids[0]); - mptcp_pm_create_subflow_or_signal_addr(msk); - spin_unlock_bh(&msk->pm.lock); + return mptcp_pm_dump_addr(msg, cb); } -static int mptcp_nl_set_flags(struct net *net, - struct mptcp_addr_info *addr, - u8 bkup, u8 changed) +static int mptcp_pm_set_flags(struct genl_info *info) { - long s_slot = 0, s_num = 0; - struct mptcp_sock *msk; + struct mptcp_pm_addr_entry loc = { .addr = { .family = AF_UNSPEC }, }; + struct nlattr *attr_loc; int ret = -EINVAL; - while ((msk = mptcp_token_iter_next(net, 
&s_slot, &s_num)) != NULL) { - struct sock *sk = (struct sock *)msk; - - if (list_empty(&msk->conn_list) || mptcp_pm_is_userspace(msk)) - goto next; - - lock_sock(sk); - if (changed & MPTCP_PM_ADDR_FLAG_BACKUP) - ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, NULL, bkup); - if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH) - mptcp_pm_nl_fullmesh(msk, addr); - release_sock(sk); - -next: - sock_put(sk); - cond_resched(); - } - - return ret; -} - -int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info) -{ - struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }; - struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; - u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP | - MPTCP_PM_ADDR_FLAG_FULLMESH; - struct net *net = sock_net(skb->sk); - struct mptcp_pm_addr_entry *entry; - struct pm_nl_pernet *pernet; - u8 lookup_by_id = 0; - u8 bkup = 0; - int ret; - - pernet = pm_nl_get_pernet(net); + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR)) + return ret; - ret = mptcp_pm_parse_entry(attr, info, false, &addr); + attr_loc = info->attrs[MPTCP_PM_ATTR_ADDR]; + ret = mptcp_pm_parse_entry(attr_loc, info, false, &loc); if (ret < 0) return ret; - if (addr.addr.family == AF_UNSPEC) { - lookup_by_id = 1; - if (!addr.addr.id) { - GENL_SET_ERR_MSG(info, "missing required inputs"); - return -EOPNOTSUPP; - } - } - - if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) - bkup = 1; - - spin_lock_bh(&pernet->lock); - entry = lookup_by_id ? __lookup_addr_by_id(pernet, addr.addr.id) : - __lookup_addr(pernet, &addr.addr); - if (!entry) { - spin_unlock_bh(&pernet->lock); - GENL_SET_ERR_MSG(info, "address not found"); - return -EINVAL; - } - if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && - (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { - spin_unlock_bh(&pernet->lock); - GENL_SET_ERR_MSG(info, "invalid addr flags"); - return -EINVAL; - } - - changed = (addr.flags ^ entry->flags) & mask; - entry->flags = (entry->flags & ~mask) | (addr.flags & mask); - addr = *entry; - spin_unlock_bh(&pernet->lock); - - mptcp_nl_set_flags(net, &addr.addr, bkup, changed); - return 0; + if (info->attrs[MPTCP_PM_ATTR_TOKEN]) + return mptcp_userspace_pm_set_flags(&loc, info); + return mptcp_pm_nl_set_flags(&loc, info); } int mptcp_pm_nl_set_flags_doit(struct sk_buff *skb, struct genl_info *info) { - return mptcp_pm_set_flags(skb, info); + return mptcp_pm_set_flags(info); } static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp) @@ -2100,9 +311,7 @@ static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk) break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { - const struct ipv6_pinfo *np = inet6_sk(ssk); - - if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) + if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &issk->pinet6->saddr)) return -EMSGSIZE; if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &ssk->sk_v6_daddr)) return -EMSGSIZE; @@ -2329,9 +538,7 @@ void mptcp_event_pm_listener(const struct sock *ssk, break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { - const struct ipv6_pinfo *np = inet6_sk(ssk); - - if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) + if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &issk->pinet6->saddr)) goto nla_put_failure; break; } @@ -2419,52 +626,8 @@ struct genl_family mptcp_genl_family __ro_after_init = { .n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps), }; -static int __net_init pm_nl_init_net(struct net *net) -{ - struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); - - INIT_LIST_HEAD_RCU(&pernet->local_addr_list); - - /* Cit. 
2 subflows ought to be enough for anybody. */ - pernet->subflows_max = 2; - pernet->next_id = 1; - pernet->stale_loss_cnt = 4; - spin_lock_init(&pernet->lock); - - /* No need to initialize other pernet fields, the struct is zeroed at - * allocation time. - */ - - return 0; -} - -static void __net_exit pm_nl_exit_net(struct list_head *net_list) -{ - struct net *net; - - list_for_each_entry(net, net_list, exit_list) { - struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); - - /* net is removed from namespace list, can't race with - * other modifiers, also netns core already waited for a - * RCU grace period. - */ - __flush_addrs(&pernet->local_addr_list); - } -} - -static struct pernet_operations mptcp_pm_pernet_ops = { - .init = pm_nl_init_net, - .exit_batch = pm_nl_exit_net, - .id = &pm_nl_pernet_id, - .size = sizeof(struct pm_nl_pernet), -}; - void __init mptcp_pm_nl_init(void) { - if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0) - panic("Failed to register MPTCP PM pernet subsystem.\n"); - if (genl_register_family(&mptcp_genl_family)) panic("Failed to register MPTCP PM netlink family\n"); } diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index e35178f5205f..a715dcbe0146 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -8,15 +8,16 @@ #include "mib.h" #include "mptcp_pm_gen.h" -void mptcp_free_local_addr_list(struct mptcp_sock *msk) +#define mptcp_for_each_userspace_pm_addr(__msk, __entry) \ + list_for_each_entry(__entry, \ + &((__msk)->pm.userspace_pm_local_addr_list), list) + +void mptcp_userspace_pm_free_local_addr_list(struct mptcp_sock *msk) { struct mptcp_pm_addr_entry *entry, *tmp; struct sock *sk = (struct sock *)msk; LIST_HEAD(free_list); - if (!mptcp_pm_is_userspace(msk)) - return; - spin_lock_bh(&msk->pm.lock); list_splice_init(&msk->pm.userspace_pm_local_addr_list, &free_list); spin_unlock_bh(&msk->pm.lock); @@ -26,12 +27,24 @@ void mptcp_free_local_addr_list(struct mptcp_sock *msk) } } +static struct mptcp_pm_addr_entry * +mptcp_userspace_pm_lookup_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + struct mptcp_pm_addr_entry *entry; + + mptcp_for_each_userspace_pm_addr(msk, entry) { + if (mptcp_addresses_equal(&entry->addr, addr, false)) + return entry; + } + return NULL; +} + static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *entry, bool needs_id) { DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); - struct mptcp_pm_addr_entry *match = NULL; struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *e; bool addr_match = false; @@ -41,31 +54,26 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, bitmap_zero(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); spin_lock_bh(&msk->pm.lock); - list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) { + mptcp_for_each_userspace_pm_addr(msk, e) { addr_match = mptcp_addresses_equal(&e->addr, &entry->addr, true); if (addr_match && entry->addr.id == 0 && needs_id) entry->addr.id = e->addr.id; id_match = (e->addr.id == entry->addr.id); - if (addr_match && id_match) { - match = e; - break; - } else if (addr_match || id_match) { + if (addr_match || id_match) break; - } __set_bit(e->addr.id, id_bitmap); } - if (!match && !addr_match && !id_match) { + if (!addr_match && !id_match) { /* Memory for the entry is allocated from the * sock option buffer. 
*/ - e = sock_kmalloc(sk, sizeof(*e), GFP_ATOMIC); + e = sock_kmemdup(sk, entry, sizeof(*entry), GFP_ATOMIC); if (!e) { ret = -ENOMEM; goto append_err; } - *e = *entry; if (!e->addr.id && needs_id) e->addr.id = find_next_zero_bit(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, @@ -73,7 +81,7 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, list_add_tail_rcu(&e->list, &msk->pm.userspace_pm_local_addr_list); msk->pm.local_addr_used++; ret = e->addr.id; - } else if (match) { + } else if (addr_match && id_match) { ret = entry->addr.id; } @@ -90,22 +98,20 @@ append_err: static int mptcp_userspace_pm_delete_local_addr(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *addr) { - struct mptcp_pm_addr_entry *entry, *tmp; struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *entry; - list_for_each_entry_safe(entry, tmp, &msk->pm.userspace_pm_local_addr_list, list) { - if (mptcp_addresses_equal(&entry->addr, &addr->addr, false)) { - /* TODO: a refcount is needed because the entry can - * be used multiple times (e.g. fullmesh mode). - */ - list_del_rcu(&entry->list); - sock_kfree_s(sk, entry, sizeof(*entry)); - msk->pm.local_addr_used--; - return 0; - } - } - - return -EINVAL; + entry = mptcp_userspace_pm_lookup_addr(msk, &addr->addr); + if (!entry) + return -EINVAL; + + /* TODO: a refcount is needed because the entry can + * be used multiple times (e.g. fullmesh mode). + */ + list_del_rcu(&entry->list); + sock_kfree_s(sk, entry, sizeof(*entry)); + msk->pm.local_addr_used--; + return 0; } static struct mptcp_pm_addr_entry * @@ -113,7 +119,7 @@ mptcp_userspace_pm_lookup_addr_by_id(struct mptcp_sock *msk, unsigned int id) { struct mptcp_pm_addr_entry *entry; - list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + mptcp_for_each_userspace_pm_addr(msk, entry) { if (entry->addr.id == id) return entry; } @@ -121,97 +127,101 @@ mptcp_userspace_pm_lookup_addr_by_id(struct mptcp_sock *msk, unsigned int id) } int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, - struct mptcp_addr_info *skc) + struct mptcp_pm_addr_entry *skc) { - struct mptcp_pm_addr_entry *entry = NULL, *e, new_entry; __be16 msk_sport = ((struct inet_sock *) inet_sk((struct sock *)msk))->inet_sport; + struct mptcp_pm_addr_entry *entry; spin_lock_bh(&msk->pm.lock); - list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) { - if (mptcp_addresses_equal(&e->addr, skc, false)) { - entry = e; - break; - } - } + entry = mptcp_userspace_pm_lookup_addr(msk, &skc->addr); spin_unlock_bh(&msk->pm.lock); if (entry) return entry->addr.id; - memset(&new_entry, 0, sizeof(struct mptcp_pm_addr_entry)); - new_entry.addr = *skc; - new_entry.addr.id = 0; - new_entry.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; - - if (new_entry.addr.port == msk_sport) - new_entry.addr.port = 0; + if (skc->addr.port == msk_sport) + skc->addr.port = 0; - return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry, true); + return mptcp_userspace_pm_append_new_local_addr(msk, skc, true); } bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc) { struct mptcp_pm_addr_entry *entry; - bool backup = false; + bool backup; spin_lock_bh(&msk->pm.lock); - list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { - if (mptcp_addresses_equal(&entry->addr, skc, false)) { - backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); - break; - } - } + entry = mptcp_userspace_pm_lookup_addr(msk, skc); + backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); 
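+ /* the msk PM spinlock is held across both the lookup and the flag read + * above: the entry cannot be freed from under us, no refcount is needed + */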
spin_unlock_bh(&msk->pm.lock); return backup; } +static struct mptcp_sock *mptcp_userspace_pm_get_sock(const struct genl_info *info) +{ + struct mptcp_sock *msk; + struct nlattr *token; + + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_TOKEN)) + return NULL; + + token = info->attrs[MPTCP_PM_ATTR_TOKEN]; + msk = mptcp_token_get_sock(genl_info_net(info), nla_get_u32(token)); + if (!msk) { + NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + return NULL; + } + + if (!mptcp_pm_is_userspace(msk)) { + NL_SET_ERR_MSG_ATTR(info->extack, token, + "userspace PM not selected"); + sock_put((struct sock *)msk); + return NULL; + } + + return msk; +} + int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; - struct nlattr *addr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry addr_val; struct mptcp_sock *msk; + struct nlattr *addr; int err = -EINVAL; struct sock *sk; - u32 token_val; - if (!addr || !token) { - GENL_SET_ERR_MSG(info, "missing required inputs"); + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR)) return err; - } - token_val = nla_get_u32(token); - - msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return err; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + addr = info->attrs[MPTCP_PM_ATTR_ADDR]; + err = mptcp_pm_parse_entry(addr, info, true, &addr_val); + if (err < 0) goto announce_err; - } - err = mptcp_pm_parse_entry(addr, info, true, &addr_val); - if (err < 0) { - GENL_SET_ERR_MSG(info, "error parsing local address"); + if (addr_val.addr.id == 0) { + NL_SET_ERR_MSG_ATTR(info->extack, addr, "invalid addr id"); + err = -EINVAL; goto announce_err; } - if (addr_val.addr.id == 0 || !(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { - GENL_SET_ERR_MSG(info, "invalid addr id or flags"); + if (!(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { + NL_SET_ERR_MSG_ATTR(info->extack, addr, "invalid addr flags"); err = -EINVAL; goto announce_err; } err = mptcp_userspace_pm_append_new_local_addr(msk, &addr_val, false); if (err < 0) { - GENL_SET_ERR_MSG(info, "did not match address and id"); + NL_SET_ERR_MSG_ATTR(info->extack, addr, + "did not match address and id"); goto announce_err; } @@ -221,7 +231,7 @@ int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) if (mptcp_pm_alloc_anno_list(msk, &addr_val.addr)) { msk->pm.add_addr_signaled++; mptcp_pm_announce_addr(msk, &addr_val.addr, false); - mptcp_pm_nl_addr_send_ack(msk); + mptcp_pm_addr_send_ack(msk); } spin_unlock_bh(&msk->pm.lock); @@ -233,8 +243,7 @@ int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) return err; } -static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk, - struct genl_info *info) +static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk) { struct mptcp_rm_list list = { .nr = 0 }; struct mptcp_subflow_context *subflow; @@ -249,10 +258,8 @@ static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk, break; } } - if (!has_id_0) { - GENL_SET_ERR_MSG(info, "address with id 0 not found"); + if (!has_id_0) goto remove_err; - } list.ids[list.nr++] = 0; @@ -267,42 +274,49 @@ remove_err: return err; } +void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *entry) +{ + struct 
mptcp_rm_list alist = { .nr = 0 }; + int anno_nr = 0; + + /* only delete if either announced or matching a subflow */ + if (mptcp_remove_anno_list_by_saddr(msk, &entry->addr)) + anno_nr++; + else if (!mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) + return; + + alist.ids[alist.nr++] = entry->addr.id; + + spin_lock_bh(&msk->pm.lock); + msk->pm.add_addr_signaled -= anno_nr; + mptcp_pm_remove_addr(msk, &alist); + spin_unlock_bh(&msk->pm.lock); +} + int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; - struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; struct mptcp_pm_addr_entry *match; - struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; - LIST_HEAD(free_list); + struct nlattr *id; int err = -EINVAL; struct sock *sk; - u32 token_val; u8 id_val; - if (!id || !token) { - GENL_SET_ERR_MSG(info, "missing required inputs"); + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_LOC_ID)) return err; - } + id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; id_val = nla_get_u8(id); - token_val = nla_get_u32(token); - msk = mptcp_token_get_sock(sock_net(skb->sk), token_val); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return err; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); - goto out; - } - if (id_val == 0) { - err = mptcp_userspace_pm_remove_id_zero_address(msk, info); + err = mptcp_userspace_pm_remove_id_zero_address(msk); goto out; } @@ -311,80 +325,71 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) spin_lock_bh(&msk->pm.lock); match = mptcp_userspace_pm_lookup_addr_by_id(msk, id_val); if (!match) { - GENL_SET_ERR_MSG(info, "address with specified id not found"); spin_unlock_bh(&msk->pm.lock); release_sock(sk); goto out; } - list_move(&match->list, &free_list); + list_del_rcu(&match->list); spin_unlock_bh(&msk->pm.lock); - mptcp_pm_remove_addrs(msk, &free_list); + mptcp_pm_remove_addr_entry(msk, match); release_sock(sk); - list_for_each_entry_safe(match, entry, &free_list, list) { - sock_kfree_s(sk, match, sizeof(*match)); - } + kfree_rcu_mightsleep(match); + /* Adjust sk_omem_alloc like sock_kfree_s() does, to match + * with allocation of this memory by sock_kmemdup() + */ + atomic_sub(sizeof(*match), &sk->sk_omem_alloc); err = 0; out: + if (err) + NL_SET_ERR_MSG_ATTR_FMT(info->extack, id, + "address with id %u not found", + id_val); + sock_put(sk); return err; } int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; - struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry entry = { 0 }; struct mptcp_addr_info addr_r; + struct nlattr *raddr, *laddr; struct mptcp_pm_local local; struct mptcp_sock *msk; int err = -EINVAL; struct sock *sk; - u32 token_val; - if (!laddr || !raddr || !token) { - GENL_SET_ERR_MSG(info, "missing required inputs"); + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR) || + GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR_REMOTE)) return err; - } - - token_val = nla_get_u32(token); - msk = mptcp_token_get_sock(genl_info_net(info), token_val); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return err; - } sk = (struct sock *)msk; 
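+ /* mptcp_userspace_pm_get_sock() has already verified that this msk uses + * the userspace PM and has taken a reference, released by sock_put() below + */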
- if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); - goto create_err; - } - + laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; err = mptcp_pm_parse_entry(laddr, info, true, &entry); - if (err < 0) { - NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); + if (err < 0) goto create_err; - } if (entry.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - GENL_SET_ERR_MSG(info, "invalid addr flags"); + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "invalid addr flags"); err = -EINVAL; goto create_err; } entry.flags |= MPTCP_PM_ADDR_FLAG_SUBFLOW; + raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; err = mptcp_pm_parse_addr(raddr, info, &addr_r); - if (err < 0) { - NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); + if (err < 0) goto create_err; - } if (!mptcp_pm_addr_families_match(sk, &entry.addr, &addr_r)) { GENL_SET_ERR_MSG(info, "families mismatch"); @@ -394,7 +399,8 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) err = mptcp_userspace_pm_append_new_local_addr(msk, &entry, false); if (err < 0) { - GENL_SET_ERR_MSG(info, "did not match address and id"); + NL_SET_ERR_MSG_ATTR(info->extack, laddr, + "did not match address and id"); goto create_err; } @@ -406,6 +412,9 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) err = __mptcp_subflow_connect(sk, &local, &addr_r); release_sock(sk); + if (err) + GENL_SET_ERR_MSG_FMT(info, "connect error: %d", err); + spin_lock_bh(&msk->pm.lock); if (err) mptcp_userspace_pm_delete_local_addr(msk, &entry); @@ -446,9 +455,7 @@ static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { - const struct ipv6_pinfo *pinfo = inet6_sk(ssk); - - if (!ipv6_addr_equal(&local->addr6, &pinfo->saddr) || + if (!ipv6_addr_equal(&local->addr6, &issk->pinet6->saddr) || !ipv6_addr_equal(&remote->addr6, &ssk->sk_v6_daddr)) continue; break; @@ -468,86 +475,76 @@ static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; - struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; - struct mptcp_addr_info addr_l; + struct mptcp_pm_addr_entry addr_l; struct mptcp_addr_info addr_r; + struct nlattr *raddr, *laddr; struct mptcp_sock *msk; struct sock *sk, *ssk; int err = -EINVAL; - u32 token_val; - if (!laddr || !raddr || !token) { - GENL_SET_ERR_MSG(info, "missing required inputs"); + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR) || + GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR_REMOTE)) return err; - } - - token_val = nla_get_u32(token); - msk = mptcp_token_get_sock(genl_info_net(info), token_val); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return err; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; + err = mptcp_pm_parse_entry(laddr, info, true, &addr_l); + if (err < 0) goto destroy_err; - } - - err = mptcp_pm_parse_addr(laddr, info, &addr_l); - if (err < 0) { - NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); - goto destroy_err; - } + raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; err = mptcp_pm_parse_addr(raddr, info, &addr_r); - if (err < 0) 
{ - NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); + if (err < 0) goto destroy_err; - } #if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (addr_l.family == AF_INET && ipv6_addr_v4mapped(&addr_r.addr6)) { - ipv6_addr_set_v4mapped(addr_l.addr.s_addr, &addr_l.addr6); - addr_l.family = AF_INET6; + if (addr_l.addr.family == AF_INET && ipv6_addr_v4mapped(&addr_r.addr6)) { + ipv6_addr_set_v4mapped(addr_l.addr.addr.s_addr, &addr_l.addr.addr6); + addr_l.addr.family = AF_INET6; } - if (addr_r.family == AF_INET && ipv6_addr_v4mapped(&addr_l.addr6)) { + if (addr_r.family == AF_INET && ipv6_addr_v4mapped(&addr_l.addr.addr6)) { ipv6_addr_set_v4mapped(addr_r.addr.s_addr, &addr_r.addr6); addr_r.family = AF_INET6; } #endif - if (addr_l.family != addr_r.family) { + if (addr_l.addr.family != addr_r.family) { GENL_SET_ERR_MSG(info, "address families do not match"); err = -EINVAL; goto destroy_err; } - if (!addr_l.port || !addr_r.port) { - GENL_SET_ERR_MSG(info, "missing local or remote port"); + if (!addr_l.addr.port) { + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "missing local port"); err = -EINVAL; goto destroy_err; } - lock_sock(sk); - ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r); - if (ssk) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); - struct mptcp_pm_addr_entry entry = { .addr = addr_l }; + if (!addr_r.port) { + NL_SET_ERR_MSG_ATTR(info->extack, raddr, "missing remote port"); + err = -EINVAL; + goto destroy_err; + } - spin_lock_bh(&msk->pm.lock); - mptcp_userspace_pm_delete_local_addr(msk, &entry); - spin_unlock_bh(&msk->pm.lock); - mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN); - mptcp_close_ssk(sk, ssk, subflow); - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW); - err = 0; - } else { + lock_sock(sk); + ssk = mptcp_nl_find_ssk(msk, &addr_l.addr, &addr_r); + if (!ssk) { + GENL_SET_ERR_MSG(info, "subflow not found"); err = -ESRCH; + goto release_sock; } + + spin_lock_bh(&msk->pm.lock); + mptcp_userspace_pm_delete_local_addr(msk, &addr_l); + spin_unlock_bh(&msk->pm.lock); + mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN); + mptcp_close_ssk(sk, ssk, mptcp_subflow_ctx(ssk)); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW); +release_sock: release_sock(sk); destroy_err: @@ -555,71 +552,67 @@ destroy_err: return err; } -int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info) +int mptcp_userspace_pm_set_flags(struct mptcp_pm_addr_entry *local, + struct genl_info *info) { - struct mptcp_pm_addr_entry loc = { .addr = { .family = AF_UNSPEC }, }; - struct mptcp_pm_addr_entry rem = { .addr = { .family = AF_UNSPEC }, }; - struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; - struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; - struct net *net = sock_net(skb->sk); + struct mptcp_addr_info rem = { .family = AF_UNSPEC, }; struct mptcp_pm_addr_entry *entry; + struct nlattr *attr, *attr_rem; struct mptcp_sock *msk; int ret = -EINVAL; struct sock *sk; - u32 token_val; u8 bkup = 0; - token_val = nla_get_u32(token); + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR_REMOTE)) + return ret; - msk = mptcp_token_get_sock(net, token_val); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return ret; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "userspace PM not selected"); + attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + if 
(local->addr.family == AF_UNSPEC) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "invalid local address family"); + ret = -EINVAL; goto set_flags_err; } - ret = mptcp_pm_parse_entry(attr, info, false, &loc); + attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; + ret = mptcp_pm_parse_addr(attr_rem, info, &rem); if (ret < 0) goto set_flags_err; - if (attr_rem) { - ret = mptcp_pm_parse_entry(attr_rem, info, false, &rem); - if (ret < 0) - goto set_flags_err; - } - - if (loc.addr.family == AF_UNSPEC || - rem.addr.family == AF_UNSPEC) { - GENL_SET_ERR_MSG(info, "invalid address families"); + if (rem.family == AF_UNSPEC) { + NL_SET_ERR_MSG_ATTR(info->extack, attr_rem, + "invalid remote address family"); ret = -EINVAL; goto set_flags_err; } - if (loc.flags & MPTCP_PM_ADDR_FLAG_BACKUP) + if (local->flags & MPTCP_PM_ADDR_FLAG_BACKUP) bkup = 1; spin_lock_bh(&msk->pm.lock); - list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { - if (mptcp_addresses_equal(&entry->addr, &loc.addr, false)) { - if (bkup) - entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; - else - entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; - } + entry = mptcp_userspace_pm_lookup_addr(msk, &local->addr); + if (entry) { + if (bkup) + entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; + else + entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; } spin_unlock_bh(&msk->pm.lock); lock_sock(sk); - ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc.addr, &rem.addr, bkup); + ret = mptcp_pm_mp_prio_send_ack(msk, &local->addr, &rem, bkup); release_sock(sk); + /* mptcp_pm_mp_prio_send_ack() only fails in one case */ + if (ret < 0) + GENL_SET_ERR_MSG(info, "subflow not found"); + set_flags_err: sock_put(sk); return ret; @@ -632,129 +625,74 @@ int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, DECLARE_BITMAP(map, MPTCP_PM_MAX_ADDR_ID + 1); } *bitmap; const struct genl_info *info = genl_info_dump(cb); - struct net *net = sock_net(msg->sk); struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; - struct nlattr *token; int ret = -EINVAL; struct sock *sk; - void *hdr; + + BUILD_BUG_ON(sizeof(struct id_bitmap) > sizeof(cb->ctx)); bitmap = (struct id_bitmap *)cb->ctx; - token = info->attrs[MPTCP_PM_ATTR_TOKEN]; - msk = mptcp_token_get_sock(net, nla_get_u32(token)); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return ret; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); - goto out; - } - lock_sock(sk); spin_lock_bh(&msk->pm.lock); - list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + mptcp_for_each_userspace_pm_addr(msk, entry) { if (test_bit(entry->addr.id, bitmap->map)) continue; - hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, &mptcp_genl_family, - NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); - if (!hdr) + if (mptcp_pm_genl_fill_addr(msg, cb, entry) < 0) break; - if (mptcp_nl_fill_addr(msg, entry) < 0) { - genlmsg_cancel(msg, hdr); - break; - } - __set_bit(entry->addr.id, bitmap->map); - genlmsg_end(msg, hdr); } spin_unlock_bh(&msk->pm.lock); release_sock(sk); ret = msg->len; -out: sock_put(sk); return ret; } -int mptcp_userspace_pm_get_addr(struct sk_buff *skb, +int mptcp_userspace_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, struct genl_info *info) { - struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; - struct mptcp_pm_addr_entry addr, *entry; - struct net *net = 
sock_net(skb->sk); + struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; - struct sk_buff *msg; int ret = -EINVAL; struct sock *sk; - void *reply; - msk = mptcp_token_get_sock(net, nla_get_u32(token)); - if (!msk) { - NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); + msk = mptcp_userspace_pm_get_sock(info); + if (!msk) return ret; - } sk = (struct sock *)msk; - if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); - goto out; - } - - ret = mptcp_pm_parse_entry(attr, info, false, &addr); - if (ret < 0) - goto out; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) { - ret = -ENOMEM; - goto out; - } - - reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, - info->genlhdr->cmd); - if (!reply) { - GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); - ret = -EMSGSIZE; - goto fail; - } - lock_sock(sk); spin_lock_bh(&msk->pm.lock); - entry = mptcp_userspace_pm_lookup_addr_by_id(msk, addr.addr.id); - if (!entry) { - GENL_SET_ERR_MSG(info, "address not found"); - ret = -EINVAL; - goto unlock_fail; + entry = mptcp_userspace_pm_lookup_addr_by_id(msk, id); + if (entry) { + *addr = *entry; + ret = 0; } - - ret = mptcp_nl_fill_addr(msg, entry); - if (ret) - goto unlock_fail; - - genlmsg_end(msg, reply); - ret = genlmsg_reply(msg, info); spin_unlock_bh(&msk->pm.lock); release_sock(sk); - sock_put(sk); - return ret; -unlock_fail: - spin_unlock_bh(&msk->pm.lock); - release_sock(sk); -fail: - nlmsg_free(msg); -out: sock_put(sk); return ret; } + +static struct mptcp_pm_ops mptcp_pm_userspace = { + .name = "userspace", + .owner = THIS_MODULE, +}; + +void __init mptcp_pm_userspace_register(void) +{ + mptcp_pm_register(&mptcp_pm_userspace); +} diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 1b2e7cbb577f..0749733ea897 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -46,8 +46,10 @@ static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_sm static void __mptcp_destroy_sock(struct sock *sk); static void mptcp_check_send_data_fin(struct sock *sk); -DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); -static struct net_device mptcp_napi_dev; +DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; +static struct net_device *mptcp_napi_dev; /* Returns end sequence number of the receiver's advertised window */ static u64 mptcp_wnd_end(const struct mptcp_sock *msk) @@ -118,24 +120,14 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); } -static void mptcp_rmem_fwd_alloc_add(struct sock *sk, int size) -{ - WRITE_ONCE(mptcp_sk(sk)->rmem_fwd_alloc, - mptcp_sk(sk)->rmem_fwd_alloc + size); -} - -static void mptcp_rmem_charge(struct sock *sk, int size) -{ - mptcp_rmem_fwd_alloc_add(sk, -size); -} - static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from) { bool fragstolen; int delta; - if (MPTCP_SKB_CB(from)->offset || + if (unlikely(MPTCP_SKB_CB(to)->cant_coalesce) || + MPTCP_SKB_CB(from)->offset || ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) || !skb_try_coalesce(to, from, &fragstolen, &delta)) return false; @@ -150,7 +142,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, * negative one */ atomic_add(delta, &sk->sk_rmem_alloc); - mptcp_rmem_charge(sk, delta); + sk_mem_charge(sk, delta); kfree_skb_partial(from, fragstolen); return true; @@ -165,44 +157,6 @@ static bool mptcp_ooo_try_coalesce(struct 
mptcp_sock *msk, struct sk_buff *to, return mptcp_try_coalesce((struct sock *)msk, to, from); } -static void __mptcp_rmem_reclaim(struct sock *sk, int amount) -{ - amount >>= PAGE_SHIFT; - mptcp_rmem_charge(sk, amount << PAGE_SHIFT); - __sk_mem_reduce_allocated(sk, amount); -} - -static void mptcp_rmem_uncharge(struct sock *sk, int size) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - int reclaimable; - - mptcp_rmem_fwd_alloc_add(sk, size); - reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk); - - /* see sk_mem_uncharge() for the rationale behind the following schema */ - if (unlikely(reclaimable >= PAGE_SIZE)) - __mptcp_rmem_reclaim(sk, reclaimable); -} - -static void mptcp_rfree(struct sk_buff *skb) -{ - unsigned int len = skb->truesize; - struct sock *sk = skb->sk; - - atomic_sub(len, &sk->sk_rmem_alloc); - mptcp_rmem_uncharge(sk, len); -} - -void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk) -{ - skb_orphan(skb); - skb->sk = sk; - skb->destructor = mptcp_rfree; - atomic_add(skb->truesize, &sk->sk_rmem_alloc); - mptcp_rmem_charge(sk, skb->truesize); -} - /* "inspired" by tcp_data_queue_ofo(), main differences: * - use mptcp seqs * - don't cope with sacks @@ -315,25 +269,7 @@ merge_right: end: skb_condense(skb); - mptcp_set_owner_r(skb, sk); -} - -static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - int amt, amount; - - if (size <= msk->rmem_fwd_alloc) - return true; - - size -= msk->rmem_fwd_alloc; - amt = sk_mem_pages(size); - amount = amt << PAGE_SHIFT; - if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) - return false; - - mptcp_rmem_fwd_alloc_add(sk, amount); - return true; + skb_set_owner_r(skb, sk); } static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, @@ -351,7 +287,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, skb_orphan(skb); /* try to fetch required memory from subflow */ - if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) { + if (!sk_rmem_schedule(sk, skb, skb->truesize)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); goto drop; } @@ -366,6 +302,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; MPTCP_SKB_CB(skb)->offset = offset; MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; + MPTCP_SKB_CB(skb)->cant_coalesce = 0; if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ @@ -375,7 +312,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, if (tail && mptcp_try_coalesce(sk, tail, skb)) return true; - mptcp_set_owner_r(skb, sk); + skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); return true; } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { @@ -487,7 +424,7 @@ static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subfl const struct sock *ssk = mptcp_subflow_tcp_sock(subflow); return inet_csk(ssk)->icsk_pending && !subflow->stale_count ? 
- inet_csk(ssk)->icsk_timeout - jiffies : 0; + icsk_timeout(inet_csk(ssk)) - jiffies : 0; } static void mptcp_set_timeout(struct sock *sk) @@ -561,7 +498,7 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied) bool cleanup, rx_empty; cleanup = (space > 0) && (space >= (old_space << 1)) && copied; - rx_empty = !__mptcp_rmem(sk) && copied; + rx_empty = !sk_rmem_alloc_get(sk) && copied; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -634,27 +571,13 @@ static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk) } static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, - struct sock *ssk, - unsigned int *bytes) + struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; - unsigned int moved = 0; bool more_data_avail; struct tcp_sock *tp; - bool done = false; - int sk_rbuf; - - sk_rbuf = READ_ONCE(sk->sk_rcvbuf); - - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); - - if (unlikely(ssk_rbuf > sk_rbuf)) { - WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf); - sk_rbuf = ssk_rbuf; - } - } + bool ret = false; pr_debug("msk=%p ssk=%p\n", msk, ssk); tp = tcp_sk(ssk); @@ -664,20 +587,16 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct sk_buff *skb; bool fin; + if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) + break; + /* try to move as much data as available */ map_remaining = subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); skb = skb_peek(&ssk->sk_receive_queue); - if (!skb) { - /* With racing move_skbs_to_msk() and __mptcp_move_skbs(), - * a different CPU can have already processed the pending - * data, stop here or we can enter an infinite loop - */ - if (!moved) - done = true; + if (unlikely(!skb)) break; - } if (__mptcp_check_fallback(msk)) { /* Under fallback skbs have no MPTCP extension and TCP could @@ -690,19 +609,13 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, offset = seq - TCP_SKB_CB(skb)->seq; fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; - if (fin) { - done = true; + if (fin) seq++; - } if (offset < skb->len) { size_t len = skb->len - offset; - if (tp->urg_data) - done = true; - - if (__mptcp_move_skb(msk, ssk, skb, offset, len)) - moved += len; + ret = __mptcp_move_skb(msk, ssk, skb, offset, len) || ret; seq += len; if (unlikely(map_remaining < len)) { @@ -716,22 +629,16 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, } sk_eat_skb(ssk, skb); - done = true; } WRITE_ONCE(tp->copied_seq, seq); more_data_avail = mptcp_subflow_data_available(ssk); - if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) { - done = true; - break; - } } while (more_data_avail); - if (moved > 0) + if (ret) msk->last_data_recv = tcp_jiffies32; - *bytes += moved; - return done; + return ret; } static bool __mptcp_ofo_queue(struct mptcp_sock *msk) @@ -825,9 +732,9 @@ void __mptcp_error_report(struct sock *sk) static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; - unsigned int moved = 0; + bool moved; - __mptcp_move_skbs_from_subflow(msk, ssk, &moved); + moved = __mptcp_move_skbs_from_subflow(msk, ssk); __mptcp_ofo_queue(msk); if (unlikely(ssk->sk_err)) { if (!sock_owned_by_user(sk)) @@ -843,14 +750,29 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) */ if (mptcp_pending_data_fin(sk, NULL)) mptcp_schedule_work(sk); - return moved > 0; + return moved; +} + +static void 
__mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk) +{ + if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf)) + WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf); +} + +static void __mptcp_data_ready(struct sock *sk, struct sock *ssk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + __mptcp_rcvbuf_update(sk, ssk); + + /* Wake-up the reader only for in-sequence data */ + if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) + sk->sk_data_ready(sk); } void mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); - struct mptcp_sock *msk = mptcp_sk(sk); - int sk_rbuf, ssk_rbuf; /* The peer can send data while we are shutting down this * subflow at msk destruction time, but we must avoid enqueuing @@ -859,19 +781,11 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) if (unlikely(subflow->disposable)) return; - ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); - sk_rbuf = READ_ONCE(sk->sk_rcvbuf); - if (unlikely(ssk_rbuf > sk_rbuf)) - sk_rbuf = ssk_rbuf; - - /* over limit? can't append more skbs to msk, Also, no need to wake-up*/ - if (__mptcp_rmem(sk) > sk_rbuf) - return; - - /* Wake-up the reader only for in-sequence data */ mptcp_data_lock(sk); - if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) - sk->sk_data_ready(sk); + if (!sock_owned_by_user(sk)) + __mptcp_data_ready(sk, ssk); + else + __set_bit(MPTCP_DEQUEUE, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } @@ -950,20 +864,6 @@ bool mptcp_schedule_work(struct sock *sk) return false; } -static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - - msk_owned_by_me(msk); - - mptcp_for_each_subflow(msk, subflow) { - if (READ_ONCE(subflow->data_avail)) - return mptcp_subflow_tcp_sock(subflow); - } - - return NULL; -} - static bool mptcp_skb_can_collapse_to(u64 write_seq, const struct sk_buff *skb, const struct mptcp_ext *mpext) @@ -1767,8 +1667,10 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, * see mptcp_disconnect(). * Attempt it again outside the problematic scope. 
*/ - if (!mptcp_disconnect(sk, 0)) + if (!mptcp_disconnect(sk, 0)) { + sk->sk_disconnects++; sk->sk_socket->state = SS_UNCONNECTED; + } } inet_clear_bit(DEFER_CONNECT, sk); @@ -1942,16 +1844,17 @@ do_error: static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); -static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, +static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, size_t len, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags) { + struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *skb, *tmp; int copied = 0; - skb_queue_walk_safe(&msk->receive_queue, skb, tmp) { + skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { u32 offset = MPTCP_SKB_CB(skb)->offset; u32 data_len = skb->len - offset; u32 count = min_t(size_t, len - copied, data_len); @@ -1983,10 +1886,11 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, } if (!(flags & MSG_PEEK)) { - /* we will bulk release the skb memory later */ + /* avoid the indirect call, we know the destructor is sock_rfree */ skb->destructor = NULL; - WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize); - __skb_unlink(skb, &msk->receive_queue); + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, skb->truesize); + __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); msk->bytes_consumed += count; } @@ -2099,66 +2003,65 @@ new_measure: msk->rcvq_space.time = mstamp; } -static void __mptcp_update_rmem(struct sock *sk) +static struct mptcp_subflow_context * +__mptcp_first_ready_from(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) { - struct mptcp_sock *msk = mptcp_sk(sk); - - if (!msk->rmem_released) - return; + struct mptcp_subflow_context *start_subflow = subflow; - atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc); - mptcp_rmem_uncharge(sk, msk->rmem_released); - WRITE_ONCE(msk->rmem_released, 0); + while (!READ_ONCE(subflow->data_avail)) { + subflow = mptcp_next_subflow(msk, subflow); + if (subflow == start_subflow) + return NULL; + } + return subflow; } -static void __mptcp_splice_receive_queue(struct sock *sk) +static bool __mptcp_move_skbs(struct sock *sk) { + struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); + bool ret = false; - skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); -} + if (list_empty(&msk->conn_list)) + return false; -static bool __mptcp_move_skbs(struct mptcp_sock *msk) -{ - struct sock *sk = (struct sock *)msk; - unsigned int moved = 0; - bool ret, done; + /* verify we can move any data from the subflow, updating the msk rcvbuf if needed */ + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) + mptcp_for_each_subflow(msk, subflow) + __mptcp_rcvbuf_update(sk, subflow->tcp_sock); - do { - struct sock *ssk = mptcp_subflow_recv_lookup(msk); + subflow = list_first_entry(&msk->conn_list, + struct mptcp_subflow_context, node); + for (;;) { + struct sock *ssk; bool slowpath; - /* we can have data pending in the subflows only if the msk - * receive buffer was full at subflow_data_ready() time, - * that is an unlikely slow path. + /* + * As an optimization avoid traversing the subflows list + * and possibly 
acquiring the subflow socket lock before bailing out */ - if (likely(!ssk)) + if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) break; - slowpath = lock_sock_fast(ssk); - mptcp_data_lock(sk); - __mptcp_update_rmem(sk); - done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); - mptcp_data_unlock(sk); + subflow = __mptcp_first_ready_from(msk, subflow); + if (!subflow) + break; + ssk = mptcp_subflow_tcp_sock(subflow); + slowpath = lock_sock_fast(ssk); + ret = __mptcp_move_skbs_from_subflow(msk, ssk) || ret; if (unlikely(ssk->sk_err)) __mptcp_error_report(sk); unlock_sock_fast(ssk, slowpath); - } while (!done); - /* acquire the data lock only if some input data is pending */ - ret = moved > 0; - if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) || - !skb_queue_empty_lockless(&sk->sk_receive_queue)) { - mptcp_data_lock(sk); - __mptcp_update_rmem(sk); - ret |= __mptcp_ofo_queue(msk); - __mptcp_splice_receive_queue(sk); - mptcp_data_unlock(sk); + subflow = mptcp_next_subflow(msk, subflow); } + + __mptcp_ofo_queue(msk); if (ret) mptcp_check_data_fin((struct sock *)msk); - return !skb_queue_empty(&msk->receive_queue); + return ret; } static unsigned int mptcp_inq_hint(const struct sock *sk) @@ -2166,7 +2069,7 @@ static unsigned int mptcp_inq_hint(const struct sock *sk) const struct mptcp_sock *msk = mptcp_sk(sk); const struct sk_buff *skb; - skb = skb_peek(&msk->receive_queue); + skb = skb_peek(&sk->sk_receive_queue); if (skb) { u64 hint_val = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq; @@ -2212,7 +2115,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, while (copied < len) { int err, bytes_read; - bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags); + bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; @@ -2221,7 +2124,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, copied += bytes_read; - if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) + if (skb_queue_empty(&sk->sk_receive_queue) && __mptcp_move_skbs(sk)) continue; /* only the MPTCP socket status is relevant here. 
The exit @@ -2247,7 +2150,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, /* race breaker: the shutdown could be after the * previous receive queue check */ - if (__mptcp_move_skbs(msk)) + if (__mptcp_move_skbs(sk)) continue; break; } @@ -2291,9 +2194,8 @@ out_err: } } - pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n", - msk, skb_queue_empty_lockless(&sk->sk_receive_queue), - skb_queue_empty(&msk->receive_queue), copied); + pr_debug("msk=%p rx queue empty=%d copied=%d\n", + msk, skb_queue_empty(&sk->sk_receive_queue), copied); release_sock(sk); return copied; @@ -2781,7 +2683,7 @@ static void mptcp_worker(struct work_struct *work) mptcp_check_fastclose(msk); - mptcp_pm_nl_work(msk); + mptcp_pm_worker(msk); mptcp_check_send_data_fin(sk); mptcp_check_data_fin_ack(sk); @@ -2820,11 +2722,8 @@ static void __mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); INIT_WORK(&msk->work, mptcp_worker); - __skb_queue_head_init(&msk->receive_queue); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; - WRITE_ONCE(msk->rmem_fwd_alloc, 0); - WRITE_ONCE(msk->rmem_released, 0); msk->timer_ival = TCP_RTO_MIN; msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; @@ -3050,8 +2949,6 @@ static void __mptcp_destroy_sock(struct sock *sk) sk->sk_prot->destroy(sk); - WARN_ON_ONCE(READ_ONCE(msk->rmem_fwd_alloc)); - WARN_ON_ONCE(msk->rmem_released); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); @@ -3247,9 +3144,9 @@ static int mptcp_disconnect(struct sock *sk, int flags) #if IS_ENABLED(CONFIG_MPTCP_IPV6) static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) { - unsigned int offset = sizeof(struct mptcp6_sock) - sizeof(struct ipv6_pinfo); + struct mptcp6_sock *msk6 = container_of(mptcp_sk(sk), struct mptcp6_sock, msk); - return (struct ipv6_pinfo *)(((u8 *)sk) + offset); + return &msk6->np; } static void mptcp_copy_ip6_options(struct sock *newsk, const struct sock *sk) @@ -3283,12 +3180,9 @@ static void mptcp_copy_ip_options(struct sock *newsk, const struct sock *sk) rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { - newopt = sock_kmalloc(newsk, sizeof(*inet_opt) + + newopt = sock_kmemdup(newsk, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen, GFP_ATOMIC); - if (newopt) - memcpy(newopt, inet_opt, sizeof(*inet_opt) + - inet_opt->opt.optlen); - else + if (!newopt) net_warn_ratelimited("%s: Failed to copy ip options\n", __func__); } RCU_INIT_POINTER(newinet->inet_opt, newopt); @@ -3403,21 +3297,14 @@ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); - /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ - mptcp_data_lock(sk); - skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); __skb_queue_purge(&sk->sk_receive_queue); skb_rbtree_purge(&msk->out_of_order_queue); - mptcp_data_unlock(sk); /* move all the rx fwd alloc into the sk_mem_reclaim_final in * inet_sock_destruct() will dispose it */ - sk_forward_alloc_add(sk, msk->rmem_fwd_alloc); - WRITE_ONCE(msk->rmem_fwd_alloc, 0); mptcp_token_destroy(msk); - mptcp_pm_free_anno_list(msk); - mptcp_free_local_addr_list(msk); + mptcp_pm_destroy(msk); } static void mptcp_destroy(struct sock *sk) @@ -3451,7 +3338,8 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk) #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ BIT(MPTCP_RETRANSMIT) | \ - 
BIT(MPTCP_FLUSH_JOIN_LIST)) + BIT(MPTCP_FLUSH_JOIN_LIST) | \ + BIT(MPTCP_DEQUEUE)) /* processes deferred events and flush wmem */ static void mptcp_release_cb(struct sock *sk) @@ -3485,6 +3373,11 @@ static void mptcp_release_cb(struct sock *sk) __mptcp_push_pending(sk, 0); if (flags & BIT(MPTCP_RETRANSMIT)) __mptcp_retrans(sk); + if ((flags & BIT(MPTCP_DEQUEUE)) && __mptcp_move_skbs(sk)) { + /* notify ack seq update */ + mptcp_cleanup_rbuf(msk, 0); + sk->sk_data_ready(sk); + } cond_resched(); spin_lock_bh(&sk->sk_lock.slock); @@ -3504,8 +3397,6 @@ static void mptcp_release_cb(struct sock *sk) if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags)) __mptcp_sync_sndbuf(sk); } - - __mptcp_update_rmem(sk); } /* MP_JOIN client subflow must wait for 4th ack before sending any data: @@ -3531,7 +3422,6 @@ static void schedule_3rdack_retransmission(struct sock *ssk) WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); smp_store_release(&icsk->icsk_ack.pending, icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER); - icsk->icsk_ack.timeout = timeout; sk_reset_timer(ssk, &icsk->icsk_delack_timer, timeout); } @@ -3639,8 +3529,10 @@ bool mptcp_finish_join(struct sock *ssk) return true; } - if (!mptcp_pm_allow_new_subflow(msk)) + if (!mptcp_pm_allow_new_subflow(msk)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_JOINREJECTED); goto err_prohibited; + } /* If we can't acquire msk socket lock here, let the release callback * handle it @@ -3676,12 +3568,6 @@ static void mptcp_shutdown(struct sock *sk, int how) __mptcp_wr_shutdown(sk); } -static int mptcp_forward_alloc_get(const struct sock *sk) -{ - return READ_ONCE(sk->sk_forward_alloc) + - READ_ONCE(mptcp_sk(sk)->rmem_fwd_alloc); -} - static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) { const struct sock *sk = (void *)msk; @@ -3722,7 +3608,8 @@ static int mptcp_ioctl(struct sock *sk, int cmd, int *karg) return -EINVAL; lock_sock(sk); - __mptcp_move_skbs(msk); + if (__mptcp_move_skbs(sk)) + mptcp_cleanup_rbuf(msk, 0); *karg = mptcp_inq_hint(sk); release_sock(sk); break; @@ -3839,7 +3726,6 @@ static struct proto mptcp_prot = { .hash = mptcp_hash, .unhash = mptcp_unhash, .get_port = mptcp_get_port, - .forward_alloc_get = mptcp_forward_alloc_get, .stream_memory_free = mptcp_stream_memory_free, .sockets_allocated = &mptcp_sockets_allocated, @@ -4147,11 +4033,13 @@ void __init mptcp_proto_init(void) if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL)) panic("Failed to allocate MPTCP pcpu counter\n"); - init_dummy_netdev(&mptcp_napi_dev); + mptcp_napi_dev = alloc_netdev_dummy(0); + if (!mptcp_napi_dev) + panic("Failed to allocate MPTCP dummy netdev\n"); for_each_possible_cpu(cpu) { delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu); INIT_LIST_HEAD(&delegated->head); - netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi, + netif_napi_add_tx(mptcp_napi_dev, &delegated->napi, mptcp_napi_poll); napi_enable(&delegated->napi); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 73526f1d768f..3dd11dd3ba16 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -124,12 +124,14 @@ #define MPTCP_FLUSH_JOIN_LIST 5 #define MPTCP_SYNC_STATE 6 #define MPTCP_SYNC_SNDBUF 7 +#define MPTCP_DEQUEUE 8 struct mptcp_skb_cb { u64 map_seq; u64 end_seq; u32 offset; - u8 has_rxtstamp:1; + u8 has_rxtstamp; + u8 cant_coalesce; }; #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) @@ -149,22 +151,24 @@ struct mptcp_options_received { u32 subflow_seq; u16 data_len; __sum16 csum; - u16 suboptions; + 
struct_group(status, + u16 suboptions; + u16 use_map:1, + dsn64:1, + data_fin:1, + use_ack:1, + ack64:1, + mpc_map:1, + reset_reason:4, + reset_transient:1, + echo:1, + backup:1, + deny_join_id0:1, + __unused:2; + ); + u8 join_id; u32 token; u32 nonce; - u16 use_map:1, - dsn64:1, - data_fin:1, - use_ack:1, - ack64:1, - mpc_map:1, - reset_reason:4, - reset_transient:1, - echo:1, - backup:1, - deny_join_id0:1, - __unused:2; - u8 join_id; u64 thmac; u8 hmac[MPTCPOPT_HMAC_LEN]; struct mptcp_addr_info addr; @@ -219,6 +223,8 @@ struct mptcp_pm_data { spinlock_t lock; /*protects the whole PM data */ + struct_group(reset, + u8 addr_signal; bool server_side; bool work_pending; @@ -231,6 +237,9 @@ struct mptcp_pm_data { u8 pm_type; u8 subflows; u8 status; + + ); + DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); struct mptcp_rm_list rm_list_tx; struct mptcp_rm_list rm_list_rx; @@ -277,7 +286,6 @@ struct mptcp_sock { u64 rcv_data_fin_seq; u64 bytes_retrans; u64 bytes_consumed; - int rmem_fwd_alloc; int snd_burst; int old_wspace; u64 recovery_snd_nxt; /* in recovery mode accept up to this seq; @@ -292,7 +300,6 @@ struct mptcp_sock { u32 last_ack_recv; unsigned long timer_ival; u32 token; - int rmem_released; unsigned long flags; unsigned long cb_flags; bool recovery; /* closing subflow write queue reinjected */ @@ -322,7 +329,6 @@ struct mptcp_sock { struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; - struct sk_buff_head receive_queue; struct list_head conn_list; struct list_head rtx_queue; struct mptcp_data_frag *first_pending; @@ -353,6 +359,8 @@ struct mptcp_sock { list_for_each_entry(__subflow, &((__msk)->conn_list), node) #define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \ list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node) +#define mptcp_next_subflow(__msk, __subflow) \ + list_next_entry_circular(__subflow, &((__msk)->conn_list), node) extern struct genl_family mptcp_genl_family; @@ -379,14 +387,6 @@ static inline void msk_owned_by_me(const struct mptcp_sock *msk) #define mptcp_sk(ptr) container_of_const(ptr, struct mptcp_sock, sk.icsk_inet.sk) #endif -/* the msk socket don't use the backlog, also account for the bulk - * free memory - */ -static inline int __mptcp_rmem(const struct sock *sk) -{ - return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released); -} - static inline int mptcp_win_from_space(const struct sock *sk, int space) { return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); @@ -399,7 +399,8 @@ static inline int mptcp_space_from_win(const struct sock *sk, int win) static inline int __mptcp_space(const struct sock *sk) { - return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); + return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - + sk_rmem_alloc_get(sk)); } static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) @@ -478,6 +479,7 @@ mptcp_subflow_rsk(const struct request_sock *rsk) struct mptcp_delegated_action { struct napi_struct napi; + local_lock_t bh_lock; struct list_head head; }; @@ -669,9 +671,11 @@ static inline void mptcp_subflow_delegate(struct mptcp_subflow_context *subflow, if (WARN_ON_ONCE(!list_empty(&subflow->delegated_node))) return; + local_lock_nested_bh(&mptcp_delegated_actions.bh_lock); delegated = this_cpu_ptr(&mptcp_delegated_actions); schedule = list_empty(&delegated->head); list_add_tail(&subflow->delegated_node, &delegated->head); + local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); 
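/* the reference taken by the sock_hold() below is dropped by mptcp_napi_poll() once the delegated action has been processed */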
sock_hold(mptcp_subflow_tcp_sock(subflow)); if (schedule) napi_schedule(&delegated->napi); @@ -683,11 +687,15 @@ mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated) { struct mptcp_subflow_context *ret; - if (list_empty(&delegated->head)) + local_lock_nested_bh(&mptcp_delegated_actions.bh_lock); + if (list_empty(&delegated->head)) { + local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); return NULL; + } ret = list_first_entry(&delegated->head, struct mptcp_subflow_context, delegated_node); list_del_init(&ret->delegated_node); + local_unlock_nested_bh(&mptcp_delegated_actions.bh_lock); return ret; } @@ -698,6 +706,7 @@ int mptcp_allow_join_id0(const struct net *net); unsigned int mptcp_stale_loss_cnt(const struct net *net); unsigned int mptcp_close_timeout(const struct sock *sk); int mptcp_get_pm_type(const struct net *net); +const char *mptcp_get_path_manager(const struct net *net); const char *mptcp_get_scheduler(const struct net *net); void mptcp_active_disable(struct sock *sk); @@ -724,12 +733,14 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); void mptcp_cancel_work(struct sock *sk); void __mptcp_unaccepted_force_close(struct sock *sk); -void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk); void mptcp_set_state(struct sock *sk, int state); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, const struct mptcp_addr_info *b, bool use_port); -void mptcp_local_address(const struct sock_common *skc, struct mptcp_addr_info *addr); +void mptcp_local_address(const struct sock_common *skc, + struct mptcp_addr_info *addr); +void mptcp_remote_address(const struct sock_common *skc, + struct mptcp_addr_info *addr); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, @@ -740,6 +751,7 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); struct mptcp_sched_ops *mptcp_sched_find(const char *name); +int mptcp_validate_scheduler(struct mptcp_sched_ops *sched); int mptcp_register_scheduler(struct mptcp_sched_ops *sched); void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched); void mptcp_sched_init(void); @@ -988,6 +1000,7 @@ __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum su void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); void mptcp_pm_data_reset(struct mptcp_sock *msk); +void mptcp_pm_destroy(struct mptcp_sock *msk); int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, struct mptcp_addr_info *addr); int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, @@ -997,7 +1010,6 @@ bool mptcp_pm_addr_families_match(const struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *rem); void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); -void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); @@ -1011,37 +1023,52 @@ void mptcp_pm_add_addr_received(const struct sock *ssk, void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); -bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, - 
const struct mptcp_addr_info *remote); -void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk); +void mptcp_pm_send_ack(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow, + bool prio, bool backup); +void mptcp_pm_addr_send_ack(struct mptcp_sock *msk); +void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id); +void mptcp_pm_rm_subflow(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq); -int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, - struct mptcp_addr_info *rem, - u8 bkup); +int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + struct mptcp_addr_info *rem, + u8 bkup); bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); -void mptcp_pm_free_anno_list(struct mptcp_sock *msk); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * mptcp_pm_del_add_timer(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool check_id); -struct mptcp_pm_add_entry * -mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk, - const struct mptcp_addr_info *addr); -int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info); -int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info); -int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info); +bool mptcp_lookup_subflow_by_saddr(const struct list_head *list, + const struct mptcp_addr_info *saddr); +bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr); +int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, + struct genl_info *info); +int mptcp_userspace_pm_set_flags(struct mptcp_pm_addr_entry *local, + struct genl_info *info); int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo); int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); -void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list); +void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *entry); + +/* the default path manager, used in mptcp_pm_unregister */ +extern struct mptcp_pm_ops mptcp_pm_kernel; -void mptcp_free_local_addr_list(struct mptcp_sock *msk); +struct mptcp_pm_ops *mptcp_pm_find(const char *name); +int mptcp_pm_register(struct mptcp_pm_ops *pm_ops); +void mptcp_pm_unregister(struct mptcp_pm_ops *pm_ops); +int mptcp_pm_validate(struct mptcp_pm_ops *pm_ops); +void mptcp_pm_get_available(char *buf, size_t maxlen); + +void mptcp_userspace_pm_free_local_addr_list(struct mptcp_sock *msk); void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); @@ -1051,12 +1078,11 @@ void mptcp_event_pm_listener(const struct sock *ssk, enum mptcp_event_type event); bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); -void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, - const struct mptcp_options_received *mp_opt); void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, struct request_sock *req); -int mptcp_nl_fill_addr(struct sk_buff *skb, - struct mptcp_pm_addr_entry *entry); +int mptcp_pm_genl_fill_addr(struct sk_buff *msg, + struct 
netlink_callback *cb, + struct mptcp_pm_addr_entry *entry); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { @@ -1119,19 +1145,20 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); -int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); -int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); +int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *skc); +int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *skc); bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc); bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); -int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); int mptcp_pm_nl_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); -int mptcp_pm_get_addr(struct sk_buff *skb, struct genl_info *info); -int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info); -int mptcp_userspace_pm_get_addr(struct sk_buff *skb, +int mptcp_pm_nl_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, + struct genl_info *info); +int mptcp_userspace_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, struct genl_info *info); static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflow) @@ -1143,8 +1170,11 @@ static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflo return local_id; } +void __init mptcp_pm_kernel_register(void); +void __init mptcp_pm_userspace_register(void); void __init mptcp_pm_nl_init(void); -void mptcp_pm_nl_work(struct mptcp_sock *msk); +void mptcp_pm_worker(struct mptcp_sock *msk); +void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); @@ -1192,6 +1222,8 @@ static inline void __mptcp_do_fallback(struct mptcp_sock *msk) pr_debug("TCP fallback already done (msk=%p)\n", msk); return; } + if (WARN_ON_ONCE(!READ_ONCE(msk->allow_infinite_fallback))) + return; set_bit(MPTCP_FALLBACK_DONE, &msk->flags); } diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index df7dbcfa3b71..1e59072d478c 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -16,13 +16,23 @@ static DEFINE_SPINLOCK(mptcp_sched_list_lock); static LIST_HEAD(mptcp_sched_list); -static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk, - struct mptcp_sched_data *data) +static int mptcp_sched_default_get_send(struct mptcp_sock *msk) { struct sock *ssk; - ssk = data->reinject ? 
mptcp_subflow_get_retrans(msk) : - mptcp_subflow_get_send(msk); + ssk = mptcp_subflow_get_send(msk); + if (!ssk) + return -EINVAL; + + mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); + return 0; +} + +static int mptcp_sched_default_get_retrans(struct mptcp_sock *msk) +{ + struct sock *ssk; + + ssk = mptcp_subflow_get_retrans(msk); if (!ssk) return -EINVAL; @@ -31,7 +41,8 @@ static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk, } static struct mptcp_sched_ops mptcp_sched_default = { - .get_subflow = mptcp_sched_default_get_subflow, + .get_send = mptcp_sched_default_get_send, + .get_retrans = mptcp_sched_default_get_retrans, .name = "default", .owner = THIS_MODULE, }; @@ -71,10 +82,23 @@ void mptcp_get_available_schedulers(char *buf, size_t maxlen) rcu_read_unlock(); } -int mptcp_register_scheduler(struct mptcp_sched_ops *sched) +int mptcp_validate_scheduler(struct mptcp_sched_ops *sched) { - if (!sched->get_subflow) + if (!sched->get_send) { + pr_err("%s does not implement required ops\n", sched->name); return -EINVAL; + } + + return 0; +} + +int mptcp_register_scheduler(struct mptcp_sched_ops *sched) +{ + int ret; + + ret = mptcp_validate_scheduler(sched); + if (ret) + return ret; spin_lock(&mptcp_sched_list_lock); if (mptcp_sched_find(sched->name)) { @@ -144,7 +168,6 @@ void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, int mptcp_sched_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct mptcp_sched_data data; msk_owned_by_me(msk); @@ -164,16 +187,14 @@ int mptcp_sched_get_send(struct mptcp_sock *msk) return 0; } - data.reinject = false; if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_subflow(msk, &data); - return msk->sched->get_subflow(msk, &data); + return mptcp_sched_default_get_send(msk); + return msk->sched->get_send(msk); } int mptcp_sched_get_retrans(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct mptcp_sched_data data; msk_owned_by_me(msk); @@ -186,8 +207,9 @@ int mptcp_sched_get_retrans(struct mptcp_sock *msk) return 0; } - data.reinject = true; if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_subflow(msk, &data); - return msk->sched->get_subflow(msk, &data); + return mptcp_sched_default_get_retrans(msk); + if (msk->sched->get_retrans) + return msk->sched->get_retrans(msk); + return msk->sched->get_send(msk); } diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 505445a9598f..3caa0a9d3b38 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1419,6 +1419,12 @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname, switch (optname) { case IP_TOS: return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->tos)); + case IP_FREEBIND: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(FREEBIND, sk)); + case IP_TRANSPARENT: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(TRANSPARENT, sk)); case IP_BIND_ADDRESS_NO_PORT: return mptcp_put_int_option(msk, optval, optlen, inet_test_bit(BIND_ADDRESS_NO_PORT, sk)); @@ -1430,6 +1436,26 @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname, return -EOPNOTSUPP; } +static int mptcp_getsockopt_v6(struct mptcp_sock *msk, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = (void *)msk; + + switch (optname) { + case IPV6_V6ONLY: + return mptcp_put_int_option(msk, optval, optlen, + sk->sk_ipv6only); + case IPV6_TRANSPARENT: + return 
mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(TRANSPARENT, sk)); + case IPV6_FREEBIND: + return mptcp_put_int_option(msk, optval, optlen, + inet_test_bit(FREEBIND, sk)); + } + + return -EOPNOTSUPP; +} + static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname, char __user *optval, int __user *optlen) { @@ -1469,6 +1495,8 @@ int mptcp_getsockopt(struct sock *sk, int level, int optname, if (level == SOL_IP) return mptcp_getsockopt_v4(msk, optname, optval, option); + if (level == SOL_IPV6) + return mptcp_getsockopt_v6(msk, optname, optval, option); if (level == SOL_TCP) return mptcp_getsockopt_sol_tcp(msk, optname, optval, option); if (level == SOL_MPTCP) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index fd021cf8286e..15613d691bfe 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -247,6 +247,7 @@ again: if (unlikely(req->syncookie)) { if (!mptcp_can_accept_new_subflow(subflow_req->msk)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED); subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); return -EPERM; } @@ -745,17 +746,11 @@ struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *op EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc); /* validate hmac received in third ACK */ -static bool subflow_hmac_valid(const struct request_sock *req, +static bool subflow_hmac_valid(const struct mptcp_subflow_request_sock *subflow_req, const struct mptcp_options_received *mp_opt) { - const struct mptcp_subflow_request_sock *subflow_req; + struct mptcp_sock *msk = subflow_req->msk; u8 hmac[SHA256_DIGEST_SIZE]; - struct mptcp_sock *msk; - - subflow_req = mptcp_subflow_rsk(req); - msk = subflow_req->msk; - if (!msk) - return false; subflow_generate_hmac(READ_ONCE(msk->remote_key), READ_ONCE(msk->local_key), @@ -802,9 +797,6 @@ void __mptcp_subflow_fully_established(struct mptcp_sock *msk, subflow_set_remote_key(msk, subflow, mp_opt); WRITE_ONCE(subflow->fully_established, true); WRITE_ONCE(msk->fully_established, true); - - if (subflow->is_mptfo) - __mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt); } static struct sock *subflow_syn_recv_sock(const struct sock *sk, @@ -853,12 +845,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, } else if (subflow_req->mp_join) { mptcp_get_options(skb, &mp_opt); - if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK) || - !subflow_hmac_valid(req, &mp_opt) || - !mptcp_can_accept_new_subflow(subflow_req->msk)) { - SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); + if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK)) fallback = true; - } } create_child: @@ -908,6 +896,18 @@ create_child: goto dispose_child; } + if (!subflow_hmac_valid(subflow_req, &mp_opt)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); + goto dispose_child; + } + + if (!mptcp_can_accept_new_subflow(owner)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINREJECTED); + subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT); + goto dispose_child; + } + /* move the msk reference ownership to the subflow */ subflow_req->msk = NULL; ctx->conn = (struct sock *)owner; @@ -1142,7 +1142,6 @@ static enum mapping_status get_mapping_status(struct sock *ssk, if (data_len == 0) { pr_debug("infinite mapping received\n"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); - subflow->map_data_len = 0; return MAPPING_INVALID; } @@ -1271,7 +1270,12 @@ out: subflow->map_valid = 0; } -/* sched mptcp worker to remove the subflow if no more data is pending */ +static bool subflow_is_done(const 
struct sock *sk) +{ + return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; +} + +/* sched mptcp worker for subflow cleanup if no more data is pending */ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; @@ -1281,21 +1285,19 @@ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ss inet_sk_state_load(sk) != TCP_ESTABLISHED))) return; - if (skb_queue_empty(&ssk->sk_receive_queue) && - !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) - mptcp_schedule_work(sk); -} + if (!skb_queue_empty(&ssk->sk_receive_queue)) + return; -static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) -{ - struct mptcp_sock *msk = mptcp_sk(subflow->conn); + if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + mptcp_schedule_work(sk); - if (subflow->mp_join) - return false; - else if (READ_ONCE(msk->csum_enabled)) - return !subflow->valid_csum_seen; - else - return READ_ONCE(msk->allow_infinite_fallback); + /* when the fallback subflow closes the rx side, trigger a 'dummy' + * ingress data fin, so that the msk state will follow along + */ + if (__mptcp_check_fallback(msk) && subflow_is_done(ssk) && + msk->first == ssk && + mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) + mptcp_schedule_work(sk); } static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) @@ -1393,7 +1395,7 @@ fallback: return true; } - if (!subflow_can_fallback(subflow) && subflow->map_data_len) { + if (!READ_ONCE(msk->allow_infinite_fallback)) { /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers */ @@ -1772,10 +1774,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, * needs it. * Update ns_tracker to current stack trace and refcounted tracker. 
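* The open-coded sequence removed below is now folded into the * sk_net_refcnt_upgrade() helper.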
*/ - __netns_tracker_free(net, &sf->sk->ns_tracker, false); - sf->sk->sk_net_refcnt = 1; - get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sf->sk); err = tcp_set_ulp(sf->sk, "mptcp"); if (err) goto err_free; @@ -1842,11 +1841,6 @@ static void __subflow_state_change(struct sock *sk) rcu_read_unlock(); } -static bool subflow_is_done(const struct sock *sk) -{ - return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; -} - static void subflow_state_change(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -1873,13 +1867,6 @@ static void subflow_state_change(struct sock *sk) subflow_error_report(sk); subflow_sched_work_if_closed(mptcp_sk(parent), sk); - - /* when the fallback subflow closes the rx side, trigger a 'dummy' - * ingress data fin, so that the msk state will follow along - */ - if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk && - mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) - mptcp_schedule_work(parent); } void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 4e0842df5234..e76c6de0c784 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -143,16 +143,15 @@ struct ncsi_channel_vlan_filter { }; struct ncsi_channel_stats { - u32 hnc_cnt_hi; /* Counter cleared */ - u32 hnc_cnt_lo; /* Counter cleared */ - u32 hnc_rx_bytes; /* Rx bytes */ - u32 hnc_tx_bytes; /* Tx bytes */ - u32 hnc_rx_uc_pkts; /* Rx UC packets */ - u32 hnc_rx_mc_pkts; /* Rx MC packets */ - u32 hnc_rx_bc_pkts; /* Rx BC packets */ - u32 hnc_tx_uc_pkts; /* Tx UC packets */ - u32 hnc_tx_mc_pkts; /* Tx MC packets */ - u32 hnc_tx_bc_pkts; /* Tx BC packets */ + u64 hnc_cnt; /* Counter cleared */ + u64 hnc_rx_bytes; /* Rx bytes */ + u64 hnc_tx_bytes; /* Tx bytes */ + u64 hnc_rx_uc_pkts; /* Rx UC packets */ + u64 hnc_rx_mc_pkts; /* Rx MC packets */ + u64 hnc_rx_bc_pkts; /* Rx BC packets */ + u64 hnc_tx_uc_pkts; /* Tx UC packets */ + u64 hnc_tx_mc_pkts; /* Tx MC packets */ + u64 hnc_tx_bc_pkts; /* Tx BC packets */ u32 hnc_fcs_err; /* FCS errors */ u32 hnc_align_err; /* Alignment errors */ u32 hnc_false_carrier; /* False carrier detection */ @@ -181,7 +180,7 @@ struct ncsi_channel_stats { u32 hnc_tx_1023_frames; /* Tx 512-1023 bytes frames */ u32 hnc_tx_1522_frames; /* Tx 1024-1522 bytes frames */ u32 hnc_tx_9022_frames; /* Tx 1523-9022 bytes frames */ - u32 hnc_rx_valid_bytes; /* Rx valid bytes */ + u64 hnc_rx_valid_bytes; /* Rx valid bytes */ u32 hnc_rx_runt_pkts; /* Rx error runt packets */ u32 hnc_rx_jabber_pkts; /* Rx error jabber packets */ u32 ncsi_rx_cmds; /* Rx NCSI commands */ @@ -323,7 +322,7 @@ struct ncsi_dev_priv { #define NCSI_DEV_RESHUFFLE 4 #define NCSI_DEV_RESET 8 /* Reset state of NC */ unsigned int gma_flag; /* OEM GMA flag */ - struct sockaddr pending_mac; /* MAC address received from GMA */ + struct sockaddr_storage pending_mac; /* MAC address received from GMA */ spinlock_t lock; /* Protect the NCSI device */ unsigned int package_probe_id;/* Current ID during probe */ unsigned int package_num; /* Number of packages */ diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index bf276eaf9330..b36947063783 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -189,7 +189,7 @@ void ncsi_stop_channel_monitor(struct ncsi_channel *nc) nc->monitor.enabled = false; spin_unlock_irqrestore(&nc->lock, flags); - del_timer_sync(&nc->monitor.timer); + 
timer_delete_sync(&nc->monitor.timer); } struct ncsi_channel *ncsi_find_channel(struct ncsi_package *np, @@ -396,7 +396,7 @@ void ncsi_free_request(struct ncsi_request *nr) if (nr->enabled) { nr->enabled = false; - del_timer_sync(&nr->timer); + timer_delete_sync(&nr->timer); } spin_lock_irqsave(&ndp->lock, flags); @@ -1385,6 +1385,12 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nd->state = ncsi_dev_state_probe_package; break; case ncsi_dev_state_probe_package: + if (ndp->package_probe_id >= 8) { + /* Last package probed, finishing */ + ndp->flags |= NCSI_DEV_PROBED; + break; + } + ndp->pending_req_num = 1; nca.type = NCSI_PKT_CMD_SP; @@ -1501,13 +1507,8 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) if (ret) goto error; - /* Probe next package */ + /* Probe next package after receiving response */ ndp->package_probe_id++; - if (ndp->package_probe_id >= 8) { - /* Probe finished */ - ndp->flags |= NCSI_DEV_PROBED; - break; - } nd->state = ncsi_dev_state_probe_package; ndp->active_package = NULL; break; diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index f2f3b5c1b941..24edb2737972 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -252,16 +252,15 @@ struct ncsi_rsp_gp_pkt { /* Get Controller Packet Statistics */ struct ncsi_rsp_gcps_pkt { struct ncsi_rsp_pkt_hdr rsp; /* Response header */ - __be32 cnt_hi; /* Counter cleared */ - __be32 cnt_lo; /* Counter cleared */ - __be32 rx_bytes; /* Rx bytes */ - __be32 tx_bytes; /* Tx bytes */ - __be32 rx_uc_pkts; /* Rx UC packets */ - __be32 rx_mc_pkts; /* Rx MC packets */ - __be32 rx_bc_pkts; /* Rx BC packets */ - __be32 tx_uc_pkts; /* Tx UC packets */ - __be32 tx_mc_pkts; /* Tx MC packets */ - __be32 tx_bc_pkts; /* Tx BC packets */ + __be64 cnt; /* Counter cleared */ + __be64 rx_bytes; /* Rx bytes */ + __be64 tx_bytes; /* Tx bytes */ + __be64 rx_uc_pkts; /* Rx UC packets */ + __be64 rx_mc_pkts; /* Rx MC packets */ + __be64 rx_bc_pkts; /* Rx BC packets */ + __be64 tx_uc_pkts; /* Tx UC packets */ + __be64 tx_mc_pkts; /* Tx MC packets */ + __be64 tx_bc_pkts; /* Tx BC packets */ __be32 fcs_err; /* FCS errors */ __be32 align_err; /* Alignment errors */ __be32 false_carrier; /* False carrier detection */ @@ -290,11 +289,11 @@ struct ncsi_rsp_gcps_pkt { __be32 tx_1023_frames; /* Tx 512-1023 bytes frames */ __be32 tx_1522_frames; /* Tx 1024-1522 bytes frames */ __be32 tx_9022_frames; /* Tx 1523-9022 bytes frames */ - __be32 rx_valid_bytes; /* Rx valid bytes */ + __be64 rx_valid_bytes; /* Rx valid bytes */ __be32 rx_runt_pkts; /* Rx error runt packets */ __be32 rx_jabber_pkts; /* Rx error jabber packets */ __be32 checksum; /* Checksum */ -}; +} __packed __aligned(4); /* Get NCSI Statistics */ struct ncsi_rsp_gns_pkt { diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 14bd66909ca4..472cc68ad86f 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -628,7 +628,7 @@ static int ncsi_rsp_handler_snfc(struct ncsi_request *nr) static int ncsi_rsp_handler_oem_gma(struct ncsi_request *nr, int mfr_id) { struct ncsi_dev_priv *ndp = nr->ndp; - struct sockaddr *saddr = &ndp->pending_mac; + struct sockaddr_storage *saddr = &ndp->pending_mac; struct net_device *ndev = ndp->ndev.dev; struct ncsi_rsp_oem_pkt *rsp; u32 mac_addr_off = 0; @@ -644,11 +644,11 @@ static int ncsi_rsp_handler_oem_gma(struct ncsi_request *nr, int mfr_id) else if (mfr_id == NCSI_OEM_MFR_INTEL_ID) mac_addr_off = INTEL_MAC_ADDR_OFFSET; - saddr->sa_family = ndev->type; - memcpy(saddr->sa_data, &rsp->data[mac_addr_off], ETH_ALEN); + 
saddr->ss_family = ndev->type; + memcpy(saddr->__data, &rsp->data[mac_addr_off], ETH_ALEN); if (mfr_id == NCSI_OEM_MFR_BCM_ID || mfr_id == NCSI_OEM_MFR_INTEL_ID) - eth_addr_inc((u8 *)saddr->sa_data); - if (!is_valid_ether_addr((const u8 *)saddr->sa_data)) + eth_addr_inc(saddr->__data); + if (!is_valid_ether_addr(saddr->__data)) return -ENXIO; /* Set the flag for GMA command which should only be called once */ @@ -926,16 +926,15 @@ static int ncsi_rsp_handler_gcps(struct ncsi_request *nr) /* Update HNC's statistics */ ncs = &nc->stats; - ncs->hnc_cnt_hi = ntohl(rsp->cnt_hi); - ncs->hnc_cnt_lo = ntohl(rsp->cnt_lo); - ncs->hnc_rx_bytes = ntohl(rsp->rx_bytes); - ncs->hnc_tx_bytes = ntohl(rsp->tx_bytes); - ncs->hnc_rx_uc_pkts = ntohl(rsp->rx_uc_pkts); - ncs->hnc_rx_mc_pkts = ntohl(rsp->rx_mc_pkts); - ncs->hnc_rx_bc_pkts = ntohl(rsp->rx_bc_pkts); - ncs->hnc_tx_uc_pkts = ntohl(rsp->tx_uc_pkts); - ncs->hnc_tx_mc_pkts = ntohl(rsp->tx_mc_pkts); - ncs->hnc_tx_bc_pkts = ntohl(rsp->tx_bc_pkts); + ncs->hnc_cnt = be64_to_cpu(rsp->cnt); + ncs->hnc_rx_bytes = be64_to_cpu(rsp->rx_bytes); + ncs->hnc_tx_bytes = be64_to_cpu(rsp->tx_bytes); + ncs->hnc_rx_uc_pkts = be64_to_cpu(rsp->rx_uc_pkts); + ncs->hnc_rx_mc_pkts = be64_to_cpu(rsp->rx_mc_pkts); + ncs->hnc_rx_bc_pkts = be64_to_cpu(rsp->rx_bc_pkts); + ncs->hnc_tx_uc_pkts = be64_to_cpu(rsp->tx_uc_pkts); + ncs->hnc_tx_mc_pkts = be64_to_cpu(rsp->tx_mc_pkts); + ncs->hnc_tx_bc_pkts = be64_to_cpu(rsp->tx_bc_pkts); ncs->hnc_fcs_err = ntohl(rsp->fcs_err); ncs->hnc_align_err = ntohl(rsp->align_err); ncs->hnc_false_carrier = ntohl(rsp->false_carrier); @@ -964,7 +963,7 @@ static int ncsi_rsp_handler_gcps(struct ncsi_request *nr) ncs->hnc_tx_1023_frames = ntohl(rsp->tx_1023_frames); ncs->hnc_tx_1522_frames = ntohl(rsp->tx_1522_frames); ncs->hnc_tx_9022_frames = ntohl(rsp->tx_9022_frames); - ncs->hnc_rx_valid_bytes = ntohl(rsp->rx_valid_bytes); + ncs->hnc_rx_valid_bytes = be64_to_cpu(rsp->rx_valid_bytes); ncs->hnc_rx_runt_pkts = ntohl(rsp->rx_runt_pkts); ncs->hnc_rx_jabber_pkts = ntohl(rsp->rx_jabber_pkts); @@ -1089,14 +1088,12 @@ static int ncsi_rsp_handler_netlink(struct ncsi_request *nr) static int ncsi_rsp_handler_gmcma(struct ncsi_request *nr) { struct ncsi_dev_priv *ndp = nr->ndp; + struct sockaddr_storage *saddr = &ndp->pending_mac; struct net_device *ndev = ndp->ndev.dev; struct ncsi_rsp_gmcma_pkt *rsp; - struct sockaddr saddr; - int ret = -1; int i; rsp = (struct ncsi_rsp_gmcma_pkt *)skb_network_header(nr->rsp); - saddr.sa_family = ndev->type; ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netdev_info(ndev, "NCSI: Received %d provisioned MAC addresses\n", @@ -1108,20 +1105,20 @@ static int ncsi_rsp_handler_gmcma(struct ncsi_request *nr) rsp->addresses[i][4], rsp->addresses[i][5]); } + saddr->ss_family = ndev->type; for (i = 0; i < rsp->address_count; i++) { - memcpy(saddr.sa_data, &rsp->addresses[i], ETH_ALEN); - ret = ndev->netdev_ops->ndo_set_mac_address(ndev, &saddr); - if (ret < 0) { + if (!is_valid_ether_addr(rsp->addresses[i])) { netdev_warn(ndev, "NCSI: Unable to assign %pM to device\n", - saddr.sa_data); + rsp->addresses[i]); continue; } - netdev_warn(ndev, "NCSI: Set MAC address to %pM\n", saddr.sa_data); + memcpy(saddr->__data, rsp->addresses[i], ETH_ALEN); + netdev_warn(ndev, "NCSI: Will set MAC address to %pM\n", saddr->__data); break; } - ndp->gma_flag = ret == 0; - return ret; + ndp->gma_flag = 1; + return 0; } static struct ncsi_rsp_handler { diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index df2dc21304ef..2560416218d0 100644 --- 
a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -212,7 +212,7 @@ config NF_CT_PROTO_SCTP bool 'SCTP protocol connection tracking support' depends on NETFILTER_ADVANCED default y - select LIBCRC32C + select NET_CRC32C help With this option enabled, the layer 3 independent connection tracking code will be able to do state tracking on SCTP connections. @@ -475,7 +475,7 @@ endif # NF_CONNTRACK config NF_TABLES select NETFILTER_NETLINK - select LIBCRC32C + select NET_CRC32C tristate "Netfilter nf_tables support" help nftables is the new packet classification framework that intends to @@ -1180,7 +1180,7 @@ config NETFILTER_XT_MATCH_CGROUP tristate '"control group" match support' depends on NETFILTER_ADVANCED depends on CGROUPS - select CGROUP_NET_CLASSID + select SOCK_CGROUP_DATA help Socket/process control group matching allows you to match locally generated packets based on which net_cls control group processes diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b9f551f02c81..11a702065bab 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -31,9 +31,6 @@ const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_ipv6_ops); -DEFINE_PER_CPU(bool, nf_skb_duplicated); -EXPORT_SYMBOL_GPL(nf_skb_duplicated); - #ifdef CONFIG_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index cb48a2b9cb9f..6ae042f702d2 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -294,7 +294,7 @@ mtype_cancel_gc(struct ip_set *set) struct mtype *map = set->data; if (SET_WITH_TIMEOUT(set)) - del_timer_sync(&map->gc); + timer_delete_sync(&map->gc); } static const struct ip_set_type_variant mtype = { diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index cf3ce72c3de6..5251524b96af 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -64,7 +64,7 @@ struct hbucket { #define ahash_sizeof_regions(htable_bits) \ (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region)) #define ahash_region(n, htable_bits) \ - ((n) % ahash_numof_locks(htable_bits)) + ((n) / jhash_size(HTABLE_REGION_BITS)) #define ahash_bucket_start(h, htable_bits) \ ((htable_bits) < HTABLE_REGION_BITS ? 0 \ : (h) * jhash_size(HTABLE_REGION_BITS)) diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 2a3017b9c001..c203252e856d 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -105,7 +105,7 @@ config IP_VS_PROTO_AH config IP_VS_PROTO_SCTP bool "SCTP load balancing support" - select LIBCRC32C + select NET_CRC32C help This option enables support for load balancing SCTP transport protocol. Say Y if unsure. 
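/* Aside: NF_CT_PROTO_SCTP, NF_TABLES and IP_VS_PROTO_SCTP above now pull in
 * NET_CRC32C (the successor of the LIBCRC32C option) because SCTP frames are
 * protected by a CRC32c checksum rather than the usual Internet checksum.
 * A minimal sketch of the verification pattern these users rely on, assuming
 * only crc32c() from <linux/crc32c.h>; sctp_crc32c_ok() is a hypothetical
 * helper, the kernel's real one is sctp_compute_cksum() in
 * <net/sctp/checksum.h>.
 */
#include <linux/crc32c.h>
#include <linux/sctp.h>

static bool sctp_crc32c_ok(struct sctphdr *sh, unsigned int len)
{
	__le32 saved = sh->checksum;
	u32 crc;

	/* the CRC is seeded with ~0, computed over the whole packet with
	 * the checksum field zeroed, then complemented and stored
	 * little-endian in the SCTP header
	 */
	sh->checksum = 0;
	crc = crc32c(~(u32)0, sh, len);
	sh->checksum = saved;

	return saved == cpu_to_le32(~crc);
}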
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index c0289f83f96d..8699944c0baf 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -822,7 +822,7 @@ static void ip_vs_conn_rcu_free(struct rcu_head *head) /* Try to delete connection while not holding reference */ static void ip_vs_conn_del(struct ip_vs_conn *cp) { - if (del_timer(&cp->timer)) { + if (timer_delete(&cp->timer)) { /* Drop cp->control chain too */ if (cp->control) cp->timeout = 0; @@ -833,7 +833,7 @@ static void ip_vs_conn_del(struct ip_vs_conn *cp) /* Try to delete connection while holding reference */ static void ip_vs_conn_del_put(struct ip_vs_conn *cp) { - if (del_timer(&cp->timer)) { + if (timer_delete(&cp->timer)) { /* Drop cp->control chain too */ if (cp->control) cp->timeout = 0; @@ -860,7 +860,7 @@ static void ip_vs_conn_expire(struct timer_list *t) struct ip_vs_conn *ct = cp->control; /* delete the timer if it is activated by other users */ - del_timer(&cp->timer); + timer_delete(&cp->timer); /* does anybody control me? */ if (ct) { @@ -1046,28 +1046,35 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, #ifdef CONFIG_PROC_FS struct ip_vs_iter_state { struct seq_net_private p; - struct hlist_head *l; + unsigned int bucket; + unsigned int skip_elems; }; -static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) +static void *ip_vs_conn_array(struct ip_vs_iter_state *iter) { int idx; struct ip_vs_conn *cp; - struct ip_vs_iter_state *iter = seq->private; - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + for (idx = iter->bucket; idx < ip_vs_conn_tab_size; idx++) { + unsigned int skip = 0; + hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { /* __ip_vs_conn_get() is not needed by * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show */ - if (pos-- == 0) { - iter->l = &ip_vs_conn_tab[idx]; + if (skip >= iter->skip_elems) { + iter->bucket = idx; return cp; } + + ++skip; } + + iter->skip_elems = 0; cond_resched_rcu(); } + iter->bucket = idx; return NULL; } @@ -1076,9 +1083,14 @@ static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) { struct ip_vs_iter_state *iter = seq->private; - iter->l = NULL; rcu_read_lock(); - return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; + if (*pos == 0) { + iter->skip_elems = 0; + iter->bucket = 0; + return SEQ_START_TOKEN; + } + + return ip_vs_conn_array(iter); } static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) @@ -1086,28 +1098,22 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct ip_vs_conn *cp = v; struct ip_vs_iter_state *iter = seq->private; struct hlist_node *e; - struct hlist_head *l = iter->l; - int idx; ++*pos; if (v == SEQ_START_TOKEN) - return ip_vs_conn_array(seq, 0); + return ip_vs_conn_array(iter); /* more on same hash chain? 
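If so, count it into skip_elems so that a rescan after an RCU restart resumes past the entries already dumped.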
*/ e = rcu_dereference(hlist_next_rcu(&cp->c_list)); - if (e) + if (e) { + iter->skip_elems++; return hlist_entry(e, struct ip_vs_conn, c_list); - - idx = l - ip_vs_conn_tab; - while (++idx < ip_vs_conn_tab_size) { - hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) { - iter->l = &ip_vs_conn_tab[idx]; - return cp; - } - cond_resched_rcu(); } - iter->l = NULL; - return NULL; + + iter->skip_elems = 0; + iter->bucket++; + + return ip_vs_conn_array(iter); } static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 7d13110ce188..7d5b7418f8c7 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -848,7 +848,7 @@ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) { struct ip_vs_dest *dest, *nxt; - del_timer_sync(&ipvs->dest_trash_timer); + timer_delete_sync(&ipvs->dest_trash_timer); /* No need to use dest_trash_lock */ list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) { list_del(&dest->t_list); @@ -3091,12 +3091,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IP_VS_SO_GET_SERVICES: { struct ip_vs_get_services *get; - int size; + size_t size; get = (struct ip_vs_get_services *)arg; size = struct_size(get, entrytable, get->num_services); if (*len != size) { - pr_err("length: %u != %u\n", *len, size); + pr_err("length: %u != %zu\n", *len, size); ret = -EINVAL; goto out; } @@ -3132,12 +3132,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IP_VS_SO_GET_DESTS: { struct ip_vs_get_dests *get; - int size; + size_t size; get = (struct ip_vs_get_dests *)arg; size = struct_size(get, entrytable, get->num_dests); if (*len != size) { - pr_err("length: %u != %u\n", *len, size); + pr_err("length: %u != %zu\n", *len, size); ret = -EINVAL; goto out; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 3313bceb6cc9..014f07740369 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -119,13 +119,12 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) return false; } -/* Get route to daddr, update *saddr, optionally bind route to saddr */ +/* Get route to daddr, optionally bind route to saddr */ static struct rtable *do_output_route4(struct net *net, __be32 daddr, - int rt_mode, __be32 *saddr) + int rt_mode, __be32 *ret_saddr) { struct flowi4 fl4; struct rtable *rt; - bool loop = false; memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; @@ -135,23 +134,17 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr, retry: rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { - /* Invalid saddr ? 
*/ - if (PTR_ERR(rt) == -EINVAL && *saddr && - rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { - *saddr = 0; - flowi4_update_output(&fl4, 0, daddr, 0); - goto retry; - } IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); return NULL; - } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { + } + if (rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { ip_rt_put(rt); - *saddr = fl4.saddr; flowi4_update_output(&fl4, 0, daddr, fl4.saddr); - loop = true; + rt_mode = 0; goto retry; } - *saddr = fl4.saddr; + if (ret_saddr) + *ret_saddr = fl4.saddr; return rt; } @@ -344,19 +337,15 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (ret_saddr) *ret_saddr = dest_dst->dst_saddr.ip; } else { - __be32 saddr = htonl(INADDR_ANY); - noref = 0; /* For such unconfigured boxes avoid many route lookups * for performance reasons because we do not remember saddr */ rt_mode &= ~IP_VS_RT_MODE_CONNECT; - rt = do_output_route4(net, daddr, rt_mode, &saddr); + rt = do_output_route4(net, daddr, rt_mode, ret_saddr); if (!rt) goto err_unreach; - if (ret_saddr) - *ret_saddr = saddr; } local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 4890af4dc263..913ede2f57f9 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -132,7 +132,7 @@ static int __nf_conncount_add(struct net *net, struct nf_conn *found_ct; unsigned int collect = 0; - if (time_is_after_eq_jiffies((unsigned long)list->last_gc)) + if ((u32)jiffies == list->last_gc) goto add_new_node; /* check the saved connections */ @@ -234,7 +234,7 @@ bool nf_conncount_gc_list(struct net *net, bool ret = false; /* don't bother if we just did GC */ - if (time_is_after_eq_jiffies((unsigned long)READ_ONCE(list->last_gc))) + if ((u32)jiffies == READ_ONCE(list->last_gc)) return false; /* don't bother if other cpu is already doing GC */ @@ -377,6 +377,8 @@ restart: conn->tuple = *tuple; conn->zone = *zone; + conn->cpu = raw_smp_processor_id(); + conn->jiffies32 = (u32)jiffies; memcpy(rbconn->key, key, sizeof(u32) * data->keylen); nf_conncount_list_init(&rbconn->list); diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index d011d2eb0848..7be4c35e4795 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ b/net/netfilter/nf_conntrack_amanda.c @@ -106,7 +106,7 @@ static int amanda_help(struct sk_buff *skb, /* increase the UDP timeout of the master connection as replies from * Amanda clients to the server can be quite delayed */ - nf_ct_refresh(ct, skb, master_timeout * HZ); + nf_ct_refresh(ct, master_timeout * HZ); /* No data? 
*/ dataoff = protoff + sizeof(struct udphdr); diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index cfa0fe0356de..a7552a46d6ac 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -75,7 +75,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, nf_ct_expect_related(exp, 0); nf_ct_expect_put(exp); - nf_ct_refresh(ct, skb, timeout * HZ); + nf_ct_refresh(ct, timeout * HZ); out: return NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 456446d7af20..201d3c4ec623 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -505,6 +505,11 @@ u32 nf_ct_get_id(const struct nf_conn *ct) } EXPORT_SYMBOL_GPL(nf_ct_get_id); +static u32 nf_conntrack_get_id(const struct nf_conntrack *nfct) +{ + return nf_ct_get_id(nf_ct_to_nf_conn(nfct)); +} + static void clean_from_lists(struct nf_conn *ct) { @@ -531,10 +536,8 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, p = tmpl; tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); - if (tmpl != p) { - tmpl = (struct nf_conn *)NFCT_ALIGN((unsigned long)p); + if (tmpl != p) tmpl->proto.tmpl_padto = (char *)tmpl - (char *)p; - } } else { tmpl = kzalloc(sizeof(*tmpl), flags); if (!tmpl) @@ -1544,12 +1547,6 @@ static void gc_worker(struct work_struct *work) tmp = nf_ct_tuplehash_to_ctrack(h); - if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { - nf_ct_offload_timeout(tmp); - if (!nf_conntrack_max95) - continue; - } - if (expired_count > GC_SCAN_EXPIRED_MAX) { rcu_read_unlock(); @@ -2089,9 +2086,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_in); /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, - const struct sk_buff *skb, u32 extra_jiffies, - bool do_acct) + unsigned int bytes) { /* Only update if this is not a fixed timeout */ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) @@ -2104,8 +2100,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, if (READ_ONCE(ct->timeout) != extra_jiffies) WRITE_ONCE(ct->timeout, extra_jiffies); acct: - if (do_acct) - nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); + if (bytes) + nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); @@ -2719,6 +2715,7 @@ static const struct nf_ct_hook nf_conntrack_hook = { .attach = nf_conntrack_attach, .set_closing = nf_conntrack_set_closing, .confirm = __nf_conntrack_confirm, + .get_id = nf_conntrack_get_id, }; void nf_conntrack_init_end(void) diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 69948e1d6974..af68c64acaab 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -162,6 +162,14 @@ static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e, return ret; } +static void nf_ct_ecache_tstamp_refresh(struct nf_conntrack_ecache *e) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + if (local64_read(&e->timestamp)) + local64_set(&e->timestamp, ktime_get_real_ns()); +#endif +} + int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct, u32 portid, int report) { @@ -186,6 +194,8 @@ int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct, /* This is a resent of a destroy event? If so, skip missed */ missed = e->portid ? 
0 : e->missed; + nf_ct_ecache_tstamp_refresh(e); + ret = __nf_conntrack_eventmask_report(e, events, missed, &item); if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) { /* This is a destroy event that has been triggered by a process, @@ -297,6 +307,18 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) } } +static void nf_ct_ecache_tstamp_new(const struct nf_conn *ct, struct nf_conntrack_ecache *e) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + u64 ts = 0; + + if (nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP)) + ts = ktime_get_real_ns(); + + local64_set(&e->timestamp, ts); +#endif +} + bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) { struct net *net = nf_ct_net(ct); @@ -326,6 +348,7 @@ bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp); if (e) { + nf_ct_ecache_tstamp_new(ct, e); e->ctmask = ctmask; e->expmask = expmask; } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 21fa550966f0..21d22fa22e4e 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -118,7 +118,7 @@ nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, bool nf_ct_remove_expect(struct nf_conntrack_expect *exp) { - if (del_timer(&exp->timeout)) { + if (timer_delete(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); return true; @@ -214,11 +214,11 @@ nf_ct_find_expectation(struct net *net, if (exp->flags & NF_CT_EXPECT_PERMANENT || !unlink) { refcount_inc(&exp->use); return exp; - } else if (del_timer(&exp->timeout)) { + } else if (timer_delete(&exp->timeout)) { nf_ct_unlink_expect(exp); return exp; } - /* Undo exp->master refcnt increase, if del_timer() failed */ + /* Undo exp->master refcnt increase, if timer_delete() failed */ nf_ct_put(exp->master); return NULL; @@ -520,7 +520,7 @@ void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, vo hlist_for_each_entry_safe(exp, next, &nf_ct_expect_hash[i], hnode) { - if (iter(exp, data) && del_timer(&exp->timeout)) { + if (iter(exp, data) && timer_delete(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } @@ -550,7 +550,7 @@ void nf_ct_expect_iterate_net(struct net *net, if (!net_eq(nf_ct_exp_net(exp), net)) continue; - if (iter(exp, data) && del_timer(&exp->timeout)) { + if (iter(exp, data) && timer_delete(&exp->timeout)) { nf_ct_unlink_expect_report(exp, portid, report); nf_ct_expect_put(exp); } diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 5a9bce24f3c3..14f73872f647 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -1385,7 +1385,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, if (info->timeout > 0) { pr_debug("nf_ct_ras: set RAS connection timeout to " "%u seconds\n", info->timeout); - nf_ct_refresh(ct, skb, info->timeout * HZ); + nf_ct_refresh(ct, info->timeout * HZ); /* Set expect timeout */ spin_lock_bh(&nf_conntrack_expect_lock); @@ -1433,7 +1433,7 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct, info->sig_port[!dir] = 0; /* Give it 30 seconds for UCF or URJ */ - nf_ct_refresh(ct, skb, 30 * HZ); + nf_ct_refresh(ct, 30 * HZ); return 0; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 36168f8b6efa..2cc0fde23344 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -357,11 
+357,11 @@ nla_put_failure: static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_secctx; - int len, ret; - char *secctx; + struct lsm_context ctx; + int ret; - ret = security_secid_to_secctx(ct->secmark, &secctx, &len); - if (ret) + ret = security_secid_to_secctx(ct->secmark, &ctx); + if (ret < 0) return 0; ret = -1; @@ -369,13 +369,13 @@ static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) if (!nest_secctx) goto nla_put_failure; - if (nla_put_string(skb, CTA_SECCTX_NAME, secctx)) + if (nla_put_string(skb, CTA_SECCTX_NAME, ctx.context)) goto nla_put_failure; nla_nest_end(skb, nest_secctx); ret = 0; nla_put_failure: - security_release_secctx(secctx, len); + security_release_secctx(&ctx); return ret; } #else @@ -383,6 +383,23 @@ nla_put_failure: #endif #ifdef CONFIG_NF_CONNTRACK_EVENTS +static int +ctnetlink_dump_event_timestamp(struct sk_buff *skb, const struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + const struct nf_conntrack_ecache *e = nf_ct_ecache_find(ct); + + if (e) { + u64 ts = local64_read(&e->timestamp); + + if (ts) + return nla_put_be64(skb, CTA_TIMESTAMP_EVENT, + cpu_to_be64(ts), CTA_TIMESTAMP_PAD); + } +#endif + return 0; +} + static inline int ctnetlink_label_size(const struct nf_conn *ct) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); @@ -663,14 +680,14 @@ static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) static inline int ctnetlink_secctx_size(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_SECMARK - int len, ret; + int ret; - ret = security_secid_to_secctx(ct->secmark, NULL, &len); - if (ret) + ret = security_secid_to_secctx(ct->secmark, NULL); + if (ret < 0) return 0; return nla_total_size(0) /* CTA_SECCTX */ - + nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */ + + nla_total_size(sizeof(char) * ret); /* CTA_SECCTX_NAME */ #else return 0; #endif @@ -717,6 +734,9 @@ static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) #endif + ctnetlink_proto_size(ct) + ctnetlink_label_size(ct) +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + + nla_total_size(sizeof(u64)) /* CTA_TIMESTAMP_EVENT */ +#endif ; } @@ -838,6 +858,10 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) if (ctnetlink_dump_mark(skb, ct, events & (1 << IPCT_MARK))) goto nla_put_failure; #endif + + if (ctnetlink_dump_event_timestamp(skb, ct)) + goto nla_put_failure; + nlmsg_end(skb, nlh); err = nfnetlink_send(skb, net, item->portid, group, item->report, GFP_ATOMIC); @@ -1557,6 +1581,7 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { .len = NF_CT_LABELS_MAX_SIZE }, [CTA_FILTER] = { .type = NLA_NESTED }, [CTA_STATUS_MASK] = { .type = NLA_U32 }, + [CTA_TIMESTAMP_EVENT] = { .type = NLA_REJECT }, }; static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) @@ -3423,7 +3448,7 @@ static int ctnetlink_del_expect(struct sk_buff *skb, /* after list removal, usage count == 1 */ spin_lock_bh(&nf_conntrack_expect_lock); - if (del_timer(&exp->timeout)) { + if (timer_delete(&exp->timeout)) { nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, nlmsg_report(info->nlh)); nf_ct_expect_put(exp); @@ -3452,7 +3477,7 @@ ctnetlink_change_expect(struct nf_conntrack_expect *x, const struct nlattr * const cda[]) { if (cda[CTA_EXPECT_TIMEOUT]) { - if (!del_timer(&x->timeout)) + if (!timer_delete(&x->timeout)) return -ETIME; x->timeout.expires = jiffies + diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 
4cc97f971264..7c6f7c9f7332 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -39,20 +39,15 @@ static const char *const sctp_conntrack_names[] = { [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT", }; -#define SECS * HZ -#define MINS * 60 SECS -#define HOURS * 60 MINS -#define DAYS * 24 HOURS - static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { - [SCTP_CONNTRACK_CLOSED] = 10 SECS, - [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, - [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, - [SCTP_CONNTRACK_ESTABLISHED] = 210 SECS, - [SCTP_CONNTRACK_SHUTDOWN_SENT] = 3 SECS, - [SCTP_CONNTRACK_SHUTDOWN_RECD] = 3 SECS, - [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, - [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, + [SCTP_CONNTRACK_CLOSED] = secs_to_jiffies(10), + [SCTP_CONNTRACK_COOKIE_WAIT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_COOKIE_ECHOED] = secs_to_jiffies(3), + [SCTP_CONNTRACK_ESTABLISHED] = secs_to_jiffies(210), + [SCTP_CONNTRACK_SHUTDOWN_SENT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_SHUTDOWN_RECD] = secs_to_jiffies(3), + [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_HEARTBEAT_SENT] = secs_to_jiffies(30), }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index d0eac27f6ba0..ca748f8dbff1 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1553,7 +1553,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff, if (dataoff >= skb->len) return NF_ACCEPT; - nf_ct_refresh(ct, skb, sip_timeout * HZ); + nf_ct_refresh(ct, sip_timeout * HZ); if (unlikely(skb_linearize(skb))) return NF_DROP; @@ -1624,7 +1624,7 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff, if (dataoff >= skb->len) return NF_ACCEPT; - nf_ct_refresh(ct, skb, sip_timeout * HZ); + nf_ct_refresh(ct, sip_timeout * HZ); if (unlikely(skb_linearize(skb))) return NF_DROP; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 7d4f0fa8b609..6c4cff10357d 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -98,69 +98,87 @@ struct ct_iter_state { struct seq_net_private p; struct hlist_nulls_head *hash; unsigned int htable_size; + unsigned int skip_elems; unsigned int bucket; u_int64_t time_now; }; -static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) +static struct nf_conntrack_tuple_hash *ct_get_next(const struct net *net, + struct ct_iter_state *st) { - struct ct_iter_state *st = seq->private; + struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; + unsigned int i; - for (st->bucket = 0; - st->bucket < st->htable_size; - st->bucket++) { - n = rcu_dereference( - hlist_nulls_first_rcu(&st->hash[st->bucket])); - if (!is_a_nulls(n)) - return n; - } - return NULL; -} + for (i = st->bucket; i < st->htable_size; i++) { + unsigned int skip = 0; -static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, - struct hlist_nulls_node *head) -{ - struct ct_iter_state *st = seq->private; +restart: + hlist_nulls_for_each_entry_rcu(h, n, &st->hash[i], hnnode) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + struct hlist_nulls_node *tmp = n; + + if (!net_eq(net, nf_ct_net(ct))) + continue; + + if (++skip <= st->skip_elems) + continue; - head = rcu_dereference(hlist_nulls_next_rcu(head)); - while (is_a_nulls(head)) { - if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= st->htable_size) - return 
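The SCTP timeout table above drops the old token-pasting SECS/MINS macros in favor of secs_to_jiffies(). A small userspace check that the two spellings agree, assuming HZ=250 purely for the demonstration (the kernel helper is essentially seconds * HZ with type handling):

	#include <assert.h>

	#define HZ 250	/* assumed for the demonstration only */
	#define SECS * HZ
	#define MINS * 60 SECS

	static unsigned long secs_to_jiffies(unsigned long secs)
	{
		return secs * HZ;	/* models the kernel helper */
	}

	int main(void)
	{
		assert((210 SECS) == secs_to_jiffies(210));
		assert((3 MINS) == secs_to_jiffies(180));
		return 0;
	}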
NULL; + /* h should be returned, skip to nulls marker. */ + while (!is_a_nulls(tmp)) + tmp = rcu_dereference(hlist_nulls_next_rcu(tmp)); + + /* check if h is still linked to hash[i] */ + if (get_nulls_value(tmp) != i) { + skip = 0; + goto restart; + } + + st->skip_elems = skip; + st->bucket = i; + return h; } - head = rcu_dereference( - hlist_nulls_first_rcu(&st->hash[st->bucket])); - } - return head; -} -static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) -{ - struct hlist_nulls_node *head = ct_get_first(seq); + skip = 0; + if (get_nulls_value(n) != i) + goto restart; + + st->skip_elems = 0; + } - if (head) - while (pos && (head = ct_get_next(seq, head))) - pos--; - return pos ? NULL : head; + st->bucket = i; + return NULL; } static void *ct_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct ct_iter_state *st = seq->private; + struct net *net = seq_file_net(seq); st->time_now = ktime_get_real_ns(); rcu_read_lock(); nf_conntrack_get_ht(&st->hash, &st->htable_size); - return ct_get_idx(seq, *pos); + + if (*pos == 0) { + st->skip_elems = 0; + st->bucket = 0; + } else if (st->skip_elems) { + /* resume from last dumped entry */ + st->skip_elems--; + } + + return ct_get_next(net, st); } static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) { + struct ct_iter_state *st = s->private; + struct net *net = seq_file_net(s); + (*pos)++; - return ct_get_next(s, v); + return ct_get_next(net, st); } static void ct_seq_stop(struct seq_file *s, void *v) @@ -172,17 +190,16 @@ static void ct_seq_stop(struct seq_file *s, void *v) #ifdef CONFIG_NF_CONNTRACK_SECMARK static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) { + struct lsm_context ctx; int ret; - u32 len; - char *secctx; - ret = security_secid_to_secctx(ct->secmark, &secctx, &len); - if (ret) + ret = security_secid_to_secctx(ct->secmark, &ctx); + if (ret < 0) return; - seq_printf(s, "secctx=%s ", secctx); + seq_printf(s, "secctx=%s ", ctx.context); - security_release_secctx(secctx, len); + security_release_secctx(&ctx); } #else static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) @@ -619,7 +636,9 @@ static struct ctl_table nf_ct_sysctl_table[] = { .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_COUNT] = { .procname = "nf_conntrack_count", @@ -655,7 +674,9 @@ static struct ctl_table nf_ct_sysctl_table[] = { .data = &nf_ct_expect_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_ACCT] = { .procname = "nf_conntrack_acct", @@ -948,7 +969,9 @@ static struct ctl_table nf_ct_netfilter_table[] = { .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, }; diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c index a8e2425e43b0..fab8b9011098 100644 --- a/net/netfilter/nf_dup_netdev.c +++ b/net/netfilter/nf_dup_netdev.c @@ -15,12 +15,26 @@ #define NF_RECURSION_LIMIT 2 -static DEFINE_PER_CPU(u8, nf_dup_skb_recursion); +#ifndef CONFIG_PREEMPT_RT +static u8 *nf_get_nf_dup_skb_recursion(void) +{ + return this_cpu_ptr(&softnet_data.xmit.nf_dup_skb_recursion); +} +#else + +static u8 *nf_get_nf_dup_skb_recursion(void) +{ + 
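On PREEMPT_RT, softirq processing runs in preemptible task context, so the per-CPU recursion counter the nf_dup_netdev hunk below replaces could be touched after a migration; keeping the counter in current->net_xmit ties it to the task instead. A userspace analogy (a sketch of the idea, not the kernel mechanism) using C11 thread-local storage in place of task_struct state:

	#include <stdio.h>

	#define NF_RECURSION_LIMIT 2

	/* per-"task" counter, standing in for current->net_xmit.nf_dup_skb_recursion */
	static _Thread_local unsigned char dup_recursion;

	static void do_egress(int depth)
	{
		if (dup_recursion > NF_RECURSION_LIMIT) {
			printf("dropped at depth %d\n", depth);	/* kfree_skb() in the kernel */
			return;
		}
		dup_recursion++;
		do_egress(depth + 1);	/* models dev_queue_xmit() re-entering the hook */
		dup_recursion--;
	}

	int main(void)
	{
		do_egress(0);
		return 0;
	}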
return &current->net_xmit.nf_dup_skb_recursion; + } + + #endif static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, enum nf_dev_hooks hook) { - if (__this_cpu_read(nf_dup_skb_recursion) > NF_RECURSION_LIMIT) + u8 *nf_dup_skb_recursion = nf_get_nf_dup_skb_recursion(); + + if (*nf_dup_skb_recursion > NF_RECURSION_LIMIT) goto err; if (hook == NF_NETDEV_INGRESS && skb_mac_header_was_set(skb)) { @@ -32,9 +46,9 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev, skb->dev = dev; skb_clear_tstamp(skb); - __this_cpu_inc(nf_dup_skb_recursion); + (*nf_dup_skb_recursion)++; dev_queue_xmit(skb); - __this_cpu_dec(nf_dup_skb_recursion); + (*nf_dup_skb_recursion)--; return; err: kfree_skb(skb); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index df72b0376970..9441ac3d8c1a 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -161,42 +161,86 @@ void flow_offload_route_init(struct flow_offload *flow, } EXPORT_SYMBOL_GPL(flow_offload_route_init); -static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) +static inline bool nf_flow_has_expired(const struct flow_offload *flow) +{ + return nf_flow_timeout_delta(flow->timeout) <= 0; +} + +static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state) { + struct ip_ct_tcp *tcp = &ct->proto.tcp; + + spin_lock_bh(&ct->lock); + if (tcp->state != tcp_state) + tcp->state = tcp_state; + + /* syn packet triggers the TCP reopen case from conntrack. */ + if (tcp->state == TCP_CONNTRACK_CLOSE) + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + + /* Conntrack state is outdated due to offload bypass. * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntracks * TCP reset validation will fail. */ tcp->seen[0].td_maxwin = 0; + tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET; tcp->seen[1].td_maxwin = 0; + tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET; + spin_unlock_bh(&ct->lock); } -static void flow_offload_fixup_ct(struct nf_conn *ct) +static void flow_offload_fixup_ct(struct flow_offload *flow) { + struct nf_conn *ct = flow->ct; struct net *net = nf_ct_net(ct); int l4num = nf_ct_protonum(ct); + bool expired, closing = false; + u32 offload_timeout = 0; s32 timeout; if (l4num == IPPROTO_TCP) { - struct nf_tcp_net *tn = nf_tcp_pernet(net); - - flow_offload_fixup_tcp(&ct->proto.tcp); + const struct nf_tcp_net *tn = nf_tcp_pernet(net); + u8 tcp_state; + + /* Enter CLOSE state if fin/rst packet has been seen, this + * allows TCP reopen from conntrack. Otherwise, pick up from + * the last seen TCP state. + */ + closing = test_bit(NF_FLOW_CLOSING, &flow->flags); + if (closing) { + flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE); + timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]); + expired = false; + } else { + tcp_state = READ_ONCE(ct->proto.tcp.state); + flow_offload_fixup_tcp(ct, tcp_state); + timeout = READ_ONCE(tn->timeouts[tcp_state]); + expired = nf_flow_has_expired(flow); + } + offload_timeout = READ_ONCE(tn->offload_timeout); - timeout = tn->timeouts[ct->proto.tcp.state]; - timeout -= tn->offload_timeout; } else if (l4num == IPPROTO_UDP) { - struct nf_udp_net *tn = nf_udp_pernet(net); + const struct nf_udp_net *tn = nf_udp_pernet(net); enum udp_conntrack state = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
UDP_CT_REPLIED : UDP_CT_UNREPLIED; - timeout = tn->timeouts[state]; - timeout -= tn->offload_timeout; + timeout = READ_ONCE(tn->timeouts[state]); + expired = nf_flow_has_expired(flow); + offload_timeout = READ_ONCE(tn->offload_timeout); } else { return; } + if (expired) + timeout -= offload_timeout; + if (timeout < 0) timeout = 0; - if (nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout) - WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout); + if (closing || + nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout) + nf_ct_refresh(ct, timeout); } static void flow_offload_route_release(struct flow_offload *flow) @@ -294,7 +338,7 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) return err; } - nf_ct_offload_timeout(flow->ct); + nf_ct_refresh(flow->ct, NF_CT_DAY); if (nf_flowtable_hw_offload(flow_table)) { __set_bit(NF_FLOW_HW, &flow->flags); @@ -316,18 +360,14 @@ void flow_offload_refresh(struct nf_flowtable *flow_table, else return; - if (likely(!nf_flowtable_hw_offload(flow_table))) + if (likely(!nf_flowtable_hw_offload(flow_table)) || + test_bit(NF_FLOW_CLOSING, &flow->flags)) return; nf_flow_offload_add(flow_table, flow); } EXPORT_SYMBOL_GPL(flow_offload_refresh); -static inline bool nf_flow_has_expired(const struct flow_offload *flow) -{ - return nf_flow_timeout_delta(flow->timeout) <= 0; -} - static void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow) { @@ -343,8 +383,8 @@ static void flow_offload_del(struct nf_flowtable *flow_table, void flow_offload_teardown(struct flow_offload *flow) { clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); - set_bit(NF_FLOW_TEARDOWN, &flow->flags); - flow_offload_fixup_ct(flow->ct); + if (!test_and_set_bit(NF_FLOW_TEARDOWN, &flow->flags)) + flow_offload_fixup_ct(flow); } EXPORT_SYMBOL_GPL(flow_offload_teardown); @@ -414,15 +454,118 @@ static bool nf_flow_custom_gc(struct nf_flowtable *flow_table, return flow_table->type->gc && flow_table->type->gc(flow); } +/** + * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry + * @ct: Flowtable offloaded tcp ct + * + * Return: number of seconds when ct entry should expire. + */ +static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct) +{ + u8 state = READ_ONCE(ct->proto.tcp.state); + + switch (state) { + case TCP_CONNTRACK_SYN_SENT: + case TCP_CONNTRACK_SYN_RECV: + return 0; + case TCP_CONNTRACK_ESTABLISHED: + return NF_CT_DAY; + case TCP_CONNTRACK_FIN_WAIT: + case TCP_CONNTRACK_CLOSE_WAIT: + case TCP_CONNTRACK_LAST_ACK: + case TCP_CONNTRACK_TIME_WAIT: + return 5 * 60 * HZ; + case TCP_CONNTRACK_CLOSE: + return 0; + } + + return 0; +} + +/** + * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry + * @ct: Flowtable offloaded ct + * + * Datapath lookups in the conntrack table will evict nf_conn entries + * if they have expired. + * + * Once nf_conn entries have been offloaded, nf_conntrack might not see any + * packets anymore. Thus ct->timeout is no longer refreshed and ct can + * be evicted. + * + * To avoid the need for an additional check on the offload bit for every + * packet processed via nf_conntrack_in(), set an arbitrary timeout large + * enough not to ever expire, this save us a check for the IPS_OFFLOAD_BIT + * from the packet path via nf_ct_is_expired(). + */ +static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct) +{ + static const u32 min_timeout = 5 * 60 * HZ; + u32 expires = nf_ct_expires(ct); + + /* normal case: large enough timeout, nothing to do. 
*/ + if (likely(expires >= min_timeout)) + return; + + /* must check offload bit after this, we do not hold any locks. + * flowtable and ct entries could have been removed on another CPU. + */ + if (!refcount_inc_not_zero(&ct->ct_general.use)) + return; + + /* load ct->status after refcount increase */ + smp_acquire__after_ctrl_dep(); + + if (nf_ct_is_confirmed(ct) && + test_bit(IPS_OFFLOAD_BIT, &ct->status)) { + u8 l4proto = nf_ct_protonum(ct); + u32 new_timeout = true; + + switch (l4proto) { + case IPPROTO_UDP: + new_timeout = NF_CT_DAY; + break; + case IPPROTO_TCP: + new_timeout = nf_flow_table_tcp_timeout(ct); + break; + default: + WARN_ON_ONCE(1); + break; + } + + /* Update to ct->timeout from nf_conntrack happens + * without holding ct->lock. + * + * Use cmpxchg to ensure timeout extension doesn't + * happen when we race with conntrack datapath. + * + * The inverse -- datapath updating ->timeout right + * after this -- is fine, datapath is authoritative. + */ + if (new_timeout) { + new_timeout += nfct_time_stamp; + cmpxchg(&ct->timeout, expires, new_timeout); + } + } + + nf_ct_put(ct); +} + static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, struct flow_offload *flow, void *data) { + bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags); + if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || - nf_flow_custom_gc(flow_table, flow)) + nf_flow_custom_gc(flow_table, flow)) { flow_offload_teardown(flow); + teardown = true; + } else if (!teardown) { + nf_flow_table_extend_ct_timeout(flow->ct); + } - if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { + if (teardown) { if (test_bit(NF_FLOW_HW, &flow->flags)) { if (!test_bit(NF_FLOW_HW_DYING, &flow->flags)) nf_flow_offload_del(flow_table, flow); @@ -431,6 +574,10 @@ static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, } else { flow_offload_del(flow_table, flow); } + } else if (test_bit(NF_FLOW_CLOSING, &flow->flags) && + test_bit(NF_FLOW_HW, &flow->flags) && + !test_bit(NF_FLOW_HW_DYING, &flow->flags)) { + nf_flow_offload_del(flow_table, flow); } else if (test_bit(NF_FLOW_HW, &flow->flags)) { nf_flow_offload_stats(flow_table, flow); } diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 98edcaa37b38..8cd4cf7ae211 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -28,11 +28,15 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto, return 0; tcph = (void *)(skb_network_header(skb) + thoff); - if (unlikely(tcph->fin || tcph->rst)) { + if (tcph->syn && test_bit(NF_FLOW_CLOSING, &flow->flags)) { flow_offload_teardown(flow); return -1; } + if ((tcph->fin || tcph->rst) && + !test_bit(NF_FLOW_CLOSING, &flow->flags)) + set_bit(NF_FLOW_CLOSING, &flow->flags); + return 0; } diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index 58402226045e..86d5fc5d28e3 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -216,7 +216,9 @@ nf_log_dump_tcp_header(struct nf_log_buf *m, /* Max length: 9 "RES=0x3C " */ nf_log_buf_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); - /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + /* Max length: 35 "AE CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->ae) + nf_log_buf_add(m, "AE "); if (th->cwr) nf_log_buf_add(m, "CWR "); if (th->ece) @@ -516,7 +518,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m, /* Proto Max log string length */ /* IP: 40+46+6+11+127 = 230 */ - /* TCP: 
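The cmpxchg() described in the comments above extends ct->timeout only if the datapath has not refreshed it concurrently; if the compare fails, the datapath's fresher value stands. A minimal userspace model with C11 atomics (plain numbers stand in for the nfct_time_stamp arithmetic):

	#include <stdatomic.h>
	#include <stdio.h>

	static _Atomic unsigned int ct_timeout;

	static void extend_timeout(unsigned int expected, unsigned int new_timeout)
	{
		/* succeeds only if nobody refreshed ct_timeout since we read it */
		atomic_compare_exchange_strong(&ct_timeout, &expected, new_timeout);
	}

	int main(void)
	{
		atomic_store(&ct_timeout, 100);
		extend_timeout(100, 5000);	/* wins: 100 -> 5000 */
		extend_timeout(100, 9999);	/* loses: value is no longer 100 */
		printf("%u\n", atomic_load(&ct_timeout));	/* 5000 */
		return 0;
	}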
10+max(25,20+30+13+9+32+11+127) = 252 */ + /* TCP: 10+max(25,20+30+13+9+35+11+127) = 255 */ /* UDP: 10+max(25,20) = 35 */ /* UDPLITE: 14+max(25,20) = 39 */ /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ @@ -526,7 +528,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m, /* (ICMP allows recursion one level deep) */ /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ - /* maxlen = 230+ 91 + 230 + 252 = 803 */ + /* maxlen = 230+ 91 + 230 + 255 = 806 */ } static noinline_for_stack void diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index aad84aabd7f1..f391cd267922 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -248,7 +248,7 @@ static noinline bool nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_ct) { - static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST_BIT; + static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST; const struct nf_conntrack_tuple_hash *thash; const struct nf_conntrack_zone *zone; struct nf_conn *ct; @@ -287,8 +287,14 @@ nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple, zone = nf_ct_zone(ignored_ct); thash = nf_conntrack_find_get(net, zone, tuple); - if (unlikely(!thash)) /* clashing entry went away */ - return false; + if (unlikely(!thash)) { + struct nf_conntrack_tuple reply; + + nf_ct_invert_tuple(&reply, tuple); + thash = nf_conntrack_find_get(net, zone, &reply); + if (!thash) /* clashing entry went away */ + return false; + } ct = nf_ct_tuplehash_to_ctrack(thash); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c4af283356e7..24c71ecb2179 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -34,7 +34,6 @@ unsigned int nf_tables_net_id __read_mostly; static LIST_HEAD(nf_tables_expressions); static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); -static LIST_HEAD(nf_tables_destroy_list); static LIST_HEAD(nf_tables_gc_list); static DEFINE_SPINLOCK(nf_tables_destroy_list_lock); static DEFINE_SPINLOCK(nf_tables_gc_list_lock); @@ -125,7 +124,6 @@ static void nft_validate_state_update(struct nft_table *table, u8 new_validate_s table->validate_state = new_validate_state; } static void nf_tables_trans_destroy_work(struct work_struct *w); -static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); static void nft_trans_gc_work(struct work_struct *work); static DECLARE_WORK(trans_gc_work, nft_trans_gc_work); @@ -302,40 +300,75 @@ void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain) static int nft_netdev_register_hooks(struct net *net, struct list_head *hook_list) { + struct nf_hook_ops *ops; struct nft_hook *hook; int err, j; j = 0; list_for_each_entry(hook, hook_list, list) { - err = nf_register_net_hook(net, &hook->ops); - if (err < 0) - goto err_register; + list_for_each_entry(ops, &hook->ops_list, list) { + err = nf_register_net_hook(net, ops); + if (err < 0) + goto err_register; - j++; + j++; + } } return 0; err_register: list_for_each_entry(hook, hook_list, list) { - if (j-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (j-- <= 0) + break; - nf_unregister_net_hook(net, &hook->ops); + nf_unregister_net_hook(net, ops); + } } return err; } +static void nft_netdev_hook_free_ops(struct nft_hook *hook) +{ + struct nf_hook_ops *ops, *next; + + list_for_each_entry_safe(ops, next, &hook->ops_list, list) { + list_del(&ops->list); + kfree(ops); + } +} + +static void 
nft_netdev_hook_free(struct nft_hook *hook) +{ + nft_netdev_hook_free_ops(hook); + kfree(hook); +} + +static void __nft_netdev_hook_free_rcu(struct rcu_head *rcu) +{ + struct nft_hook *hook = container_of(rcu, struct nft_hook, rcu); + + nft_netdev_hook_free(hook); +} + +static void nft_netdev_hook_free_rcu(struct nft_hook *hook) +{ + call_rcu(&hook->rcu, __nft_netdev_hook_free_rcu); +} + static void nft_netdev_unregister_hooks(struct net *net, struct list_head *hook_list, bool release_netdev) { struct nft_hook *hook, *next; + struct nf_hook_ops *ops; list_for_each_entry_safe(hook, next, hook_list, list) { - nf_unregister_net_hook(net, &hook->ops); + list_for_each_entry(ops, &hook->ops_list, list) + nf_unregister_net_hook(net, ops); if (release_netdev) { list_del(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } } @@ -1956,15 +1989,16 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, if (!first) first = hook; - if (nla_put_string(skb, NFTA_DEVICE_NAME, - hook->ops.dev->name)) + if (nla_put(skb, NFTA_DEVICE_NAME, + hook->ifnamelen, hook->ifname)) goto nla_put_failure; n++; } nla_nest_end(skb, nest_devs); if (n == 1 && - nla_put_string(skb, NFTA_HOOK_DEV, first->ops.dev->name)) + nla_put(skb, NFTA_HOOK_DEV, + first->ifnamelen, first->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -2254,7 +2288,7 @@ void nf_tables_chain_destroy(struct nft_chain *chain) list_for_each_entry_safe(hook, next, &basechain->hook_list, list) { list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } module_put(basechain->type->owner); @@ -2275,34 +2309,43 @@ void nf_tables_chain_destroy(struct nft_chain *chain) static struct nft_hook *nft_netdev_hook_alloc(struct net *net, const struct nlattr *attr) { + struct nf_hook_ops *ops; struct net_device *dev; - char ifname[IFNAMSIZ]; struct nft_hook *hook; int err; hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT); - if (!hook) { - err = -ENOMEM; - goto err_hook_alloc; - } + if (!hook) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&hook->ops_list); + + err = nla_strscpy(hook->ifname, attr, IFNAMSIZ); + if (err < 0) + goto err_hook_free; + + hook->ifnamelen = nla_len(attr); - nla_strscpy(ifname, attr, IFNAMSIZ); /* nf_tables_netdev_event() is called under rtnl_mutex, this is * indirectly serializing all the other holders of the commit_mutex with * the rtnl_mutex. 
*/ - dev = __dev_get_by_name(net, ifname); - if (!dev) { - err = -ENOENT; - goto err_hook_dev; - } - hook->ops.dev = dev; + for_each_netdev(net, dev) { + if (strncmp(dev->name, hook->ifname, hook->ifnamelen)) + continue; + ops = kzalloc(sizeof(struct nf_hook_ops), GFP_KERNEL_ACCOUNT); + if (!ops) { + err = -ENOMEM; + goto err_hook_free; + } + ops->dev = dev; + list_add_tail(&ops->list, &hook->ops_list); + } return hook; -err_hook_dev: - kfree(hook); -err_hook_alloc: +err_hook_free: + nft_netdev_hook_free(hook); return ERR_PTR(err); } @@ -2312,7 +2355,8 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, struct nft_hook *hook; list_for_each_entry(hook, hook_list, list) { - if (this->ops.dev == hook->ops.dev) + if (!strncmp(hook->ifname, this->ifname, + min(hook->ifnamelen, this->ifnamelen))) return hook; } @@ -2342,7 +2386,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net, } if (nft_hook_list_find(hook_list, hook)) { NL_SET_BAD_ATTR(extack, tmp); - kfree(hook); + nft_netdev_hook_free(hook); err = -EEXIST; goto err_hook; } @@ -2360,7 +2404,7 @@ static int nf_tables_parse_netdev_hooks(struct net *net, err_hook: list_for_each_entry_safe(hook, next, hook_list, list) { list_del(&hook->list); - kfree(hook); + nft_netdev_hook_free(hook); } return err; } @@ -2503,7 +2547,7 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook) list_for_each_entry_safe(h, next, &hook->list, list) { list_del(&h->list); - kfree(h); + nft_netdev_hook_free(h); } module_put(hook->type->owner); } @@ -2556,6 +2600,7 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, struct nft_chain_hook *hook, u32 flags) { struct nft_chain *chain; + struct nf_hook_ops *ops; struct nft_hook *h; basechain->type = hook->type; @@ -2564,8 +2609,10 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, if (nft_base_chain_netdev(family, hook->num)) { list_splice_init(&hook->list, &basechain->hook_list); - list_for_each_entry(h, &basechain->hook_list, list) - nft_basechain_hook_init(&h->ops, family, hook, chain); + list_for_each_entry(h, &basechain->hook_list, list) { + list_for_each_entry(ops, &h->ops_list, list) + nft_basechain_hook_init(ops, family, hook, chain); + } } nft_basechain_hook_init(&basechain->ops, family, hook, chain); @@ -2598,9 +2645,8 @@ int nft_chain_add(struct nft_table *table, struct nft_chain *chain) static u64 chain_id; -static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, - u8 policy, u32 flags, - struct netlink_ext_ack *extack) +static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 policy, + u32 flags, struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; @@ -2785,15 +2831,17 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) { list_for_each_entry_safe(h, next, &hook.list, list) { - h->ops.pf = basechain->ops.pf; - h->ops.hooknum = basechain->ops.hooknum; - h->ops.priority = basechain->ops.priority; - h->ops.priv = basechain->ops.priv; - h->ops.hook = basechain->ops.hook; + list_for_each_entry(ops, &h->ops_list, list) { + ops->pf = basechain->ops.pf; + ops->hooknum = basechain->ops.hooknum; + ops->priority = basechain->ops.priority; + ops->priv = basechain->ops.priv; + ops->hook = basechain->ops.hook; + } if (nft_hook_list_find(&basechain->hook_list, h)) { list_del(&h->list); - kfree(h); + nft_netdev_hook_free(h); } } } else { @@ -2837,11 
+2885,11 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, err = nft_netdev_register_hooks(ctx->net, &hook.list); if (err < 0) goto err_hooks; + + unregister = true; } } - unregister = true; - if (nla[NFTA_CHAIN_COUNTERS]) { if (!nft_is_base_chain(chain)) { err = -EOPNOTSUPP; @@ -2911,10 +2959,12 @@ err_trans: err_hooks: if (nla[NFTA_CHAIN_HOOK]) { list_for_each_entry_safe(h, next, &hook.list, list) { - if (unregister) - nf_unregister_net_hook(ctx->net, &h->ops); + if (unregister) { + list_for_each_entry(ops, &h->ops_list, list) + nf_unregister_net_hook(ctx->net, ops); + } list_del(&h->list); - kfree_rcu(h, rcu); + nft_netdev_hook_free_rcu(h); } module_put(hook.type->owner); } @@ -3038,7 +3088,7 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info, extack); } - return nf_tables_addchain(&ctx, family, genmask, policy, flags, extack); + return nf_tables_addchain(&ctx, family, policy, flags, extack); } static int nft_delchain_hook(struct nft_ctx *ctx, @@ -4567,6 +4617,8 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_HANDLE] = { .type = NLA_U64 }, [NFTA_SET_EXPR] = { .type = NLA_NESTED }, [NFTA_SET_EXPRESSIONS] = NLA_POLICY_NESTED_ARRAY(nft_expr_policy), + [NFTA_SET_TYPE] = { .type = NLA_REJECT }, + [NFTA_SET_COUNT] = { .type = NLA_REJECT }, }; static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = { @@ -4753,6 +4805,35 @@ static int nf_tables_fill_set_concat(struct sk_buff *skb, return 0; } +static u32 nft_set_userspace_size(const struct nft_set_ops *ops, u32 size) +{ + if (ops->usize) + return ops->usize(size); + + return size; +} + +static noinline_for_stack int +nf_tables_fill_set_info(struct sk_buff *skb, const struct nft_set *set) +{ + unsigned int nelems; + char str[40]; + int ret; + + ret = snprintf(str, sizeof(str), "%ps", set->ops); + + /* Not expected to happen and harmless: NFTA_SET_TYPE is dumped + * to userspace purely for informational/debug purposes. 
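nf_tables_fill_set_info() above relies on snprintf() returning the would-be output length to detect truncation. A userspace sketch of that check; "%ps" is a kernel-only format, so plain "%s" with a made-up ops name stands in here:

	#include <stdio.h>

	int main(void)
	{
		char str[8];
		int ret = snprintf(str, sizeof(str), "%s", "nft_rhash_ops");

		if (ret >= (int)sizeof(str))	/* the DEBUG_NET_WARN_ON_ONCE() condition */
			fprintf(stderr, "truncated: kept \"%s\" of %d bytes\n", str, ret);
		return 0;
	}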
+ */ + DEBUG_NET_WARN_ON_ONCE(ret >= sizeof(str)); + + if (nla_put_string(skb, NFTA_SET_TYPE, str)) + return -EMSGSIZE; + + nelems = nft_set_userspace_size(set->ops, atomic_read(&set->nelems)); + return nla_put_be32(skb, NFTA_SET_COUNT, htonl(nelems)); +} + static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { @@ -4823,7 +4904,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, if (!nest) goto nla_put_failure; if (set->size && - nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) + nla_put_be32(skb, NFTA_SET_DESC_SIZE, + htonl(nft_set_userspace_size(set->ops, set->size)))) goto nla_put_failure; if (set->field_count > 1 && @@ -4832,6 +4914,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nla_nest_end(skb, nest); + if (nf_tables_fill_set_info(skb, set)) + goto nla_put_failure; + if (set->num_exprs == 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR); if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0) @@ -5065,7 +5150,7 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr, static int nft_set_desc_concat(struct nft_set_desc *desc, const struct nlattr *nla) { - u32 num_regs = 0, key_num_regs = 0; + u32 len = 0, num_regs; struct nlattr *attr; int rem, err, i; @@ -5079,12 +5164,12 @@ static int nft_set_desc_concat(struct nft_set_desc *desc, } for (i = 0; i < desc->field_count; i++) - num_regs += DIV_ROUND_UP(desc->field_len[i], sizeof(u32)); + len += round_up(desc->field_len[i], sizeof(u32)); - key_num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); - if (key_num_regs != num_regs) + if (len != desc->klen) return -EINVAL; + num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); if (num_regs > NFT_REG32_COUNT) return -E2BIG; @@ -5191,6 +5276,15 @@ static bool nft_set_is_same(const struct nft_set *set, return true; } +static u32 nft_set_kernel_size(const struct nft_set_ops *ops, + const struct nft_set_desc *desc) +{ + if (ops->ksize) + return ops->ksize(desc->size); + + return desc->size; +} + static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { @@ -5373,6 +5467,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (err < 0) return err; + if (desc.size) + desc.size = nft_set_kernel_size(set->ops, &desc); + err = 0; if (!nft_set_is_same(set, &desc, exprs, num_exprs, flags)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); @@ -5395,6 +5492,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (IS_ERR(ops)) return PTR_ERR(ops); + if (desc.size) + desc.size = nft_set_kernel_size(ops, &desc); + udlen = 0; if (nla[NFTA_SET_USERDATA]) udlen = nla_len(nla[NFTA_SET_USERDATA]); @@ -7051,6 +7151,27 @@ static bool nft_setelem_valid_key_end(const struct nft_set *set, return true; } +static u32 nft_set_maxsize(const struct nft_set *set) +{ + u32 maxsize, delta; + + if (!set->size) + return UINT_MAX; + + if (set->ops->adjust_maxsize) + delta = set->ops->adjust_maxsize(set); + else + delta = 0; + + if (check_add_overflow(set->size, set->ndeact, &maxsize)) + return UINT_MAX; + + if (check_add_overflow(maxsize, delta, &maxsize)) + return UINT_MAX; + + return maxsize; +} + static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr, u32 nlmsg_flags) { @@ -7423,7 +7544,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } if (!(flags & NFT_SET_ELEM_CATCHALL)) { - unsigned 
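nft_set_maxsize() above clamps to UINT_MAX with check_add_overflow() so that set->size plus set->ndeact (plus the backend's delta) can never wrap around. A userspace sketch of the same logic, assuming the GCC/Clang builtin that the kernel macro wraps on those compilers:

	#include <limits.h>
	#include <stdio.h>

	/* userspace stand-in for the kernel's check_add_overflow() */
	#define check_add_overflow(a, b, d) __builtin_add_overflow(a, b, d)

	static unsigned int set_maxsize(unsigned int size, unsigned int ndeact,
					unsigned int delta)
	{
		unsigned int maxsize;

		if (!size)
			return UINT_MAX;	/* unbounded set */
		if (check_add_overflow(size, ndeact, &maxsize))
			return UINT_MAX;
		if (check_add_overflow(maxsize, delta, &maxsize))
			return UINT_MAX;
		return maxsize;
	}

	int main(void)
	{
		printf("%u\n", set_maxsize(100, 5, 0));	/* 105 */
		printf("%u\n", set_maxsize(UINT_MAX - 1, 5, 0));	/* clamped to UINT_MAX */
		return 0;
	}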
int max = set->size ? set->size + set->ndeact : UINT_MAX; + unsigned int max = nft_set_maxsize(set); if (!atomic_add_unless(&set->nelems, 1, max)) { err = -ENFILE; @@ -8712,6 +8833,7 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, struct netlink_ext_ack *extack, bool add) { struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; + struct nf_hook_ops *ops; struct nft_hook *hook; int hooknum, priority; int err; @@ -8766,11 +8888,13 @@ static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, } list_for_each_entry(hook, &flowtable_hook->list, list) { - hook->ops.pf = NFPROTO_NETDEV; - hook->ops.hooknum = flowtable_hook->num; - hook->ops.priority = flowtable_hook->priority; - hook->ops.priv = &flowtable->data; - hook->ops.hook = flowtable->data.type->hook; + list_for_each_entry(ops, &hook->ops_list, list) { + ops->pf = NFPROTO_NETDEV; + ops->hooknum = flowtable_hook->num; + ops->priority = flowtable_hook->priority; + ops->priv = &flowtable->data; + ops->hook = flowtable->data.type->hook; + } } return err; @@ -8812,12 +8936,12 @@ nft_flowtable_type_get(struct net *net, u8 family) } /* Only called from error and netdev event paths. */ -static void nft_unregister_flowtable_hook(struct net *net, - struct nft_flowtable *flowtable, - struct nft_hook *hook) +static void nft_unregister_flowtable_ops(struct net *net, + struct nft_flowtable *flowtable, + struct nf_hook_ops *ops) { - nf_unregister_net_hook(net, &hook->ops); - flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + nf_unregister_net_hook(net, ops); + flowtable->data.type->setup(&flowtable->data, ops->dev, FLOW_BLOCK_UNBIND); } @@ -8827,14 +8951,14 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net, bool release_netdev) { struct nft_hook *hook, *next; + struct nf_hook_ops *ops; list_for_each_entry_safe(hook, next, hook_list, list) { - nf_unregister_net_hook(net, &hook->ops); - flowtable->data.type->setup(&flowtable->data, hook->ops.dev, - FLOW_BLOCK_UNBIND); + list_for_each_entry(ops, &hook->ops_list, list) + nft_unregister_flowtable_ops(net, flowtable, ops); if (release_netdev) { list_del(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } } @@ -8846,13 +8970,34 @@ static void nft_unregister_flowtable_net_hooks(struct net *net, __nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false); } +static int nft_register_flowtable_ops(struct net *net, + struct nft_flowtable *flowtable, + struct nf_hook_ops *ops) +{ + int err; + + err = flowtable->data.type->setup(&flowtable->data, + ops->dev, FLOW_BLOCK_BIND); + if (err < 0) + return err; + + err = nf_register_net_hook(net, ops); + if (!err) + return 0; + + flowtable->data.type->setup(&flowtable->data, + ops->dev, FLOW_BLOCK_UNBIND); + return err; +} + static int nft_register_flowtable_net_hooks(struct net *net, struct nft_table *table, struct list_head *hook_list, struct nft_flowtable *flowtable) { - struct nft_hook *hook, *hook2, *next; + struct nft_hook *hook, *next; struct nft_flowtable *ft; + struct nf_hook_ops *ops; int err, i = 0; list_for_each_entry(hook, hook_list, list) { @@ -8860,42 +9005,33 @@ static int nft_register_flowtable_net_hooks(struct net *net, if (!nft_is_active_next(net, ft)) continue; - list_for_each_entry(hook2, &ft->hook_list, list) { - if (hook->ops.dev == hook2->ops.dev && - hook->ops.pf == hook2->ops.pf) { - err = -EEXIST; - goto err_unregister_net_hooks; - } + if (nft_hook_list_find(&ft->hook_list, hook)) { + err = -EEXIST; + goto err_unregister_net_hooks; } } - err = 
flowtable->data.type->setup(&flowtable->data, - hook->ops.dev, - FLOW_BLOCK_BIND); - if (err < 0) - goto err_unregister_net_hooks; + list_for_each_entry(ops, &hook->ops_list, list) { + err = nft_register_flowtable_ops(net, flowtable, ops); + if (err < 0) + goto err_unregister_net_hooks; - err = nf_register_net_hook(net, &hook->ops); - if (err < 0) { - flowtable->data.type->setup(&flowtable->data, - hook->ops.dev, - FLOW_BLOCK_UNBIND); - goto err_unregister_net_hooks; + i++; } - - i++; } return 0; err_unregister_net_hooks: list_for_each_entry_safe(hook, next, hook_list, list) { - if (i-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (i-- <= 0) + break; - nft_unregister_flowtable_hook(net, flowtable, hook); + nft_unregister_flowtable_ops(net, flowtable, ops); + } list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } return err; @@ -8907,7 +9043,7 @@ static void nft_hooks_destroy(struct list_head *hook_list) list_for_each_entry_safe(hook, next, hook_list, list) { list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } } @@ -8918,6 +9054,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, const struct nlattr * const *nla = ctx->nla; struct nft_flowtable_hook flowtable_hook; struct nft_hook *hook, *next; + struct nf_hook_ops *ops; struct nft_trans *trans; bool unregister = false; u32 flags; @@ -8931,7 +9068,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { if (nft_hook_list_find(&flowtable->hook_list, hook)) { list_del(&hook->list); - kfree(hook); + nft_netdev_hook_free(hook); } } @@ -8975,10 +9112,13 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, err_flowtable_update_hook: list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { - if (unregister) - nft_unregister_flowtable_hook(ctx->net, flowtable, hook); + if (unregister) { + list_for_each_entry(ops, &hook->ops_list, list) + nft_unregister_flowtable_ops(ctx->net, + flowtable, ops); + } list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } return err; @@ -9124,7 +9264,7 @@ static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook list_for_each_entry_safe(this, next, &flowtable_hook->list, list) { list_del(&this->list); - kfree(this); + nft_netdev_hook_free(this); } } @@ -9279,7 +9419,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, list_for_each_entry_rcu(hook, hook_list, list, lockdep_commit_lock_is_held(net)) { - if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name)) + if (nla_put(skb, NFTA_DEVICE_NAME, + hook->ifnamelen, hook->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest_devs); @@ -9486,7 +9627,7 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) flowtable->data.type->free(&flowtable->data); list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + nft_netdev_hook_free_rcu(hook); } kfree(flowtable->name); module_put(flowtable->data.type->owner); @@ -9519,46 +9660,190 @@ nla_put_failure: return -EMSGSIZE; } -static void nft_flowtable_event(unsigned long event, struct net_device *dev, - struct nft_flowtable *flowtable) +struct nf_hook_ops *nft_hook_find_ops(const struct nft_hook *hook, + const struct net_device *dev) +{ + struct nf_hook_ops *ops; + + list_for_each_entry(ops, 
&hook->ops_list, list) { + if (ops->dev == dev) + return ops; + } + return NULL; +} +EXPORT_SYMBOL_GPL(nft_hook_find_ops); + +struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, + const struct net_device *dev) +{ + struct nf_hook_ops *ops; + + list_for_each_entry_rcu(ops, &hook->ops_list, list) { + if (ops->dev == dev) + return ops; + } + return NULL; +} +EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu); + +static void +nf_tables_device_notify(const struct nft_table *table, int attr, + const char *name, const struct nft_hook *hook, + const struct net_device *dev, int event) +{ + struct net *net = dev_net(dev); + struct nlmsghdr *nlh; + struct sk_buff *skb; + u16 flags = 0; + + if (!nfnetlink_has_listeners(net, NFNLGRP_NFT_DEV)) + return; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + goto err; + + event = event == NETDEV_REGISTER ? NFT_MSG_NEWDEV : NFT_MSG_DELDEV; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); + nlh = nfnl_msg_put(skb, 0, 0, event, flags, table->family, + NFNETLINK_V0, nft_base_seq(net)); + if (!nlh) + goto err; + + if (nla_put_string(skb, NFTA_DEVICE_TABLE, table->name) || + nla_put_string(skb, attr, name) || + nla_put(skb, NFTA_DEVICE_SPEC, hook->ifnamelen, hook->ifname) || + nla_put_string(skb, NFTA_DEVICE_NAME, dev->name)) + goto err; + + nlmsg_end(skb, nlh); + nfnetlink_send(skb, net, 0, NFNLGRP_NFT_DEV, + nlmsg_report(nlh), GFP_KERNEL); + return; +err: + if (skb) + kfree_skb(skb); + nfnetlink_set_err(net, 0, NFNLGRP_NFT_DEV, -ENOBUFS); +} + +void +nf_tables_chain_device_notify(const struct nft_chain *chain, + const struct nft_hook *hook, + const struct net_device *dev, int event) +{ + nf_tables_device_notify(chain->table, NFTA_DEVICE_CHAIN, + chain->name, hook, dev, event); +} + +static void +nf_tables_flowtable_device_notify(const struct nft_flowtable *ft, + const struct nft_hook *hook, + const struct net_device *dev, int event) { + nf_tables_device_notify(ft->table, NFTA_DEVICE_FLOWTABLE, + ft->name, hook, dev, event); +} + +static int nft_flowtable_event(unsigned long event, struct net_device *dev, + struct nft_flowtable *flowtable, bool changename) +{ + struct nf_hook_ops *ops; struct nft_hook *hook; + bool match; list_for_each_entry(hook, &flowtable->hook_list, list) { - if (hook->ops.dev != dev) - continue; + ops = nft_hook_find_ops(hook, dev); + match = !strncmp(hook->ifname, dev->name, hook->ifnamelen); - /* flow_offload_netdev_event() cleans up entries for us. */ - nft_unregister_flowtable_hook(dev_net(dev), flowtable, hook); - list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); + switch (event) { + case NETDEV_UNREGISTER: + /* NOP if not found or new name still matching */ + if (!ops || (changename && match)) + continue; + + /* flow_offload_netdev_event() cleans up entries for us. 
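With nft_hook now storing the ifname/ifnamelen pair taken from netlink rather than one resolved device, hooks match devices by name, which is what lets the NETDEV_REGISTER handling above attach hooks to interfaces that appear later. A minimal sketch of the strncmp() match, assuming ifnamelen counts the attribute's trailing NUL as nla_len() does for string attributes:

	#include <stdio.h>
	#include <string.h>

	struct hook_model {
		char ifname[16];	/* IFNAMSIZ stand-in */
		unsigned char ifnamelen;	/* nla_len() of the name attribute */
	};

	static int hook_matches(const struct hook_model *hook, const char *devname)
	{
		/* mirrors: !strncmp(dev->name, hook->ifname, hook->ifnamelen) */
		return !strncmp(devname, hook->ifname, hook->ifnamelen);
	}

	int main(void)
	{
		struct hook_model hook = { .ifname = "eth0", .ifnamelen = 5 };

		printf("%d\n", hook_matches(&hook, "eth0"));	/* 1 */
		printf("%d\n", hook_matches(&hook, "eth0.1"));	/* 0: NUL is compared too */
		return 0;
	}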
*/ + nft_unregister_flowtable_ops(dev_net(dev), + flowtable, ops); + list_del_rcu(&ops->list); + kfree_rcu(ops, rcu); + break; + case NETDEV_REGISTER: + /* NOP if not matching or already registered */ + if (!match || (changename && ops)) + continue; + + ops = kzalloc(sizeof(struct nf_hook_ops), + GFP_KERNEL_ACCOUNT); + if (!ops) + return 1; + + ops->pf = NFPROTO_NETDEV; + ops->hooknum = flowtable->hooknum; + ops->priority = flowtable->data.priority; + ops->priv = &flowtable->data; + ops->hook = flowtable->data.type->hook; + ops->dev = dev; + if (nft_register_flowtable_ops(dev_net(dev), + flowtable, ops)) { + kfree(ops); + return 1; + } + list_add_tail_rcu(&ops->list, &hook->ops_list); + break; + } + nf_tables_flowtable_device_notify(flowtable, hook, dev, event); break; } + return 0; +} + +static int __nf_tables_flowtable_event(unsigned long event, + struct net_device *dev, + bool changename) +{ + struct nftables_pernet *nft_net = nft_pernet(dev_net(dev)); + struct nft_flowtable *flowtable; + struct nft_table *table; + + list_for_each_entry(table, &nft_net->tables, list) { + list_for_each_entry(flowtable, &table->flowtables, list) { + if (nft_flowtable_event(event, dev, + flowtable, changename)) + return 1; + } + } + return 0; } static int nf_tables_flowtable_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct nft_flowtable *flowtable; struct nftables_pernet *nft_net; - struct nft_table *table; + int ret = NOTIFY_DONE; struct net *net; - if (event != NETDEV_UNREGISTER) - return 0; + if (event != NETDEV_REGISTER && + event != NETDEV_UNREGISTER && + event != NETDEV_CHANGENAME) + return NOTIFY_DONE; net = dev_net(dev); nft_net = nft_pernet(net); mutex_lock(&nft_net->commit_mutex); - list_for_each_entry(table, &nft_net->tables, list) { - list_for_each_entry(flowtable, &table->flowtables, list) { - nft_flowtable_event(event, dev, flowtable); + + if (event == NETDEV_CHANGENAME) { + if (__nf_tables_flowtable_event(NETDEV_REGISTER, dev, true)) { + ret = NOTIFY_BAD; + goto out_unlock; } + __nf_tables_flowtable_event(NETDEV_UNREGISTER, dev, true); + } else if (__nf_tables_flowtable_event(event, dev, false)) { + ret = NOTIFY_BAD; } +out_unlock: mutex_unlock(&nft_net->commit_mutex); - - return NOTIFY_DONE; + return ret; } static struct notifier_block nf_tables_flowtable_notifier = { @@ -9959,11 +10244,12 @@ static void nft_commit_release(struct nft_trans *trans) static void nf_tables_trans_destroy_work(struct work_struct *w) { + struct nftables_pernet *nft_net = container_of(w, struct nftables_pernet, destroy_work); struct nft_trans *trans, *next; LIST_HEAD(head); spin_lock(&nf_tables_destroy_list_lock); - list_splice_init(&nf_tables_destroy_list, &head); + list_splice_init(&nft_net->destroy_list, &head); spin_unlock(&nf_tables_destroy_list_lock); if (list_empty(&head)) @@ -9977,9 +10263,11 @@ static void nf_tables_trans_destroy_work(struct work_struct *w) } } -void nf_tables_trans_destroy_flush_work(void) +void nf_tables_trans_destroy_flush_work(struct net *net) { - flush_work(&trans_destroy_work); + struct nftables_pernet *nft_net = nft_pernet(net); + + flush_work(&nft_net->destroy_work); } EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work); @@ -10437,11 +10725,11 @@ static void nf_tables_commit_release(struct net *net) trans->put_net = true; spin_lock(&nf_tables_destroy_list_lock); - list_splice_tail_init(&nft_net->commit_list, &nf_tables_destroy_list); + list_splice_tail_init(&nft_net->commit_list, 
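Moving the destroy list and work into struct nftables_pernet (hunks above and below) keeps the classic splice-then-release pattern: take everything off the shared list under the spinlock, then free the entries without holding it. A userspace sketch of that pattern, with a pthread mutex standing in for nf_tables_destroy_list_lock:

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct trans {
		struct trans *next;
		int id;
	};

	static pthread_mutex_t destroy_lock = PTHREAD_MUTEX_INITIALIZER;
	static struct trans *destroy_list;	/* one list per netns in the kernel */

	static void destroy_work(void)
	{
		struct trans *head;

		/* splice the whole pending list out under the lock... */
		pthread_mutex_lock(&destroy_lock);
		head = destroy_list;
		destroy_list = NULL;
		pthread_mutex_unlock(&destroy_lock);

		/* ...then release entries without holding it */
		while (head) {
			struct trans *next = head->next;

			printf("destroying trans %d\n", head->id);
			free(head);
			head = next;
		}
	}

	int main(void)
	{
		for (int id = 0; id < 2; id++) {
			struct trans *t = malloc(sizeof(*t));

			if (!t)
				return 1;
			t->id = id;
			pthread_mutex_lock(&destroy_lock);
			t->next = destroy_list;
			destroy_list = t;
			pthread_mutex_unlock(&destroy_lock);
		}
		destroy_work();
		return 0;
	}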
&nft_net->destroy_list); spin_unlock(&nf_tables_destroy_list_lock); nf_tables_module_autoload_cleanup(net); - schedule_work(&trans_destroy_work); + schedule_work(&nft_net->destroy_work); mutex_unlock(&nft_net->commit_mutex); } @@ -11694,47 +11982,6 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, } EXPORT_SYMBOL_GPL(nft_data_dump); -static void __nft_release_basechain_now(struct nft_ctx *ctx) -{ - struct nft_rule *rule, *nr; - - list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { - list_del(&rule->list); - nf_tables_rule_release(ctx, rule); - } - nf_tables_chain_destroy(ctx->chain); -} - -int __nft_release_basechain(struct nft_ctx *ctx) -{ - struct nft_rule *rule; - - if (WARN_ON_ONCE(!nft_is_base_chain(ctx->chain))) - return 0; - - nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); - list_for_each_entry(rule, &ctx->chain->rules, list) - nft_use_dec(&ctx->chain->use); - - nft_chain_del(ctx->chain); - nft_use_dec(&ctx->table->use); - - if (!maybe_get_net(ctx->net)) { - __nft_release_basechain_now(ctx); - return 0; - } - - /* wait for ruleset dumps to complete. Owning chain is no longer in - * lists, so new dumps can't find any of these rules anymore. - */ - synchronize_rcu(); - - __nft_release_basechain_now(ctx); - put_net(ctx->net); - return 0; -} -EXPORT_SYMBOL_GPL(__nft_release_basechain); - static void __nft_release_hook(struct net *net, struct nft_table *table) { struct nft_flowtable *flowtable; @@ -11847,7 +12094,7 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, gc_seq = nft_gc_seq_begin(nft_net); - nf_tables_trans_destroy_flush_work(); + nf_tables_trans_destroy_flush_work(net); again: list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && @@ -11889,6 +12136,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->tables); INIT_LIST_HEAD(&nft_net->commit_list); + INIT_LIST_HEAD(&nft_net->destroy_list); INIT_LIST_HEAD(&nft_net->commit_set_list); INIT_LIST_HEAD(&nft_net->binding_list); INIT_LIST_HEAD(&nft_net->module_list); @@ -11897,6 +12145,7 @@ static int __net_init nf_tables_init_net(struct net *net) nft_net->base_seq = 1; nft_net->gc_seq = 0; nft_net->validate_state = NFT_VALIDATE_SKIP; + INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work); return 0; } @@ -11925,14 +12174,17 @@ static void __net_exit nf_tables_exit_net(struct net *net) if (!list_empty(&nft_net->module_list)) nf_tables_module_autoload_cleanup(net); + cancel_work_sync(&nft_net->destroy_work); __nft_release_tables(net); nft_gc_seq_end(nft_net, gc_seq); mutex_unlock(&nft_net->commit_mutex); + WARN_ON_ONCE(!list_empty(&nft_net->tables)); WARN_ON_ONCE(!list_empty(&nft_net->module_list)); WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); + WARN_ON_ONCE(!list_empty(&nft_net->destroy_list)); } static void nf_tables_exit_batch(struct list_head *net_exit_list) @@ -12023,10 +12275,8 @@ static void __exit nf_tables_module_exit(void) unregister_netdevice_notifier(&nf_tables_flowtable_notifier); nft_chain_filter_fini(); nft_chain_route_fini(); - nf_tables_trans_destroy_flush_work(); unregister_pernet_subsys(&nf_tables_net_ops); cancel_work_sync(&trans_gc_work); - cancel_work_sync(&trans_destroy_work); rcu_barrier(); rhltable_destroy(&nft_objname_ht); nf_tables_core_module_exit(); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 75598520b0fa..6557a4018c09 100644 --- a/net/netfilter/nf_tables_core.c +++ 
b/net/netfilter/nf_tables_core.c @@ -21,25 +21,22 @@ #include <net/netfilter/nf_log.h> #include <net/netfilter/nft_meta.h> -#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_X86) - +#ifdef CONFIG_MITIGATION_RETPOLINE static struct static_key_false nf_tables_skip_direct_calls; -static bool nf_skip_indirect_calls(void) +static inline bool nf_skip_indirect_calls(void) { return static_branch_likely(&nf_tables_skip_direct_calls); } -static void __init nf_skip_indirect_calls_enable(void) +static inline void __init nf_skip_indirect_calls_enable(void) { if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE)) static_branch_enable(&nf_tables_skip_direct_calls); } #else -static inline bool nf_skip_indirect_calls(void) { return false; } - static inline void nf_skip_indirect_calls_enable(void) { } -#endif +#endif /* CONFIG_MITIGATION_RETPOLINE */ static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt, const struct nft_verdict *verdict, diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 64675f1c7f29..fd30e205de84 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -220,6 +220,7 @@ static int nft_chain_offload_priority(const struct nft_base_chain *basechain) bool nft_chain_offload_support(const struct nft_base_chain *basechain) { + struct nf_hook_ops *ops; struct net_device *dev; struct nft_hook *hook; @@ -227,13 +228,16 @@ bool nft_chain_offload_support(const struct nft_base_chain *basechain) return false; list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.pf != NFPROTO_NETDEV || - hook->ops.hooknum != NF_NETDEV_INGRESS) - return false; - - dev = hook->ops.dev; - if (!dev->netdev_ops->ndo_setup_tc && !flow_indr_dev_exists()) - return false; + list_for_each_entry(ops, &hook->ops_list, list) { + if (ops->pf != NFPROTO_NETDEV || + ops->hooknum != NF_NETDEV_INGRESS) + return false; + + dev = ops->dev; + if (!dev->netdev_ops->ndo_setup_tc && + !flow_indr_dev_exists()) + return false; + } } return true; @@ -455,34 +459,37 @@ static int nft_flow_block_chain(struct nft_base_chain *basechain, const struct net_device *this_dev, enum flow_block_command cmd) { - struct net_device *dev; + struct nf_hook_ops *ops; struct nft_hook *hook; int err, i = 0; list_for_each_entry(hook, &basechain->hook_list, list) { - dev = hook->ops.dev; - if (this_dev && this_dev != dev) - continue; + list_for_each_entry(ops, &hook->ops_list, list) { + if (this_dev && this_dev != ops->dev) + continue; - err = nft_chain_offload_cmd(basechain, dev, cmd); - if (err < 0 && cmd == FLOW_BLOCK_BIND) { - if (!this_dev) - goto err_flow_block; + err = nft_chain_offload_cmd(basechain, ops->dev, cmd); + if (err < 0 && cmd == FLOW_BLOCK_BIND) { + if (!this_dev) + goto err_flow_block; - return err; + return err; + } + i++; } - i++; } return 0; err_flow_block: list_for_each_entry(hook, &basechain->hook_list, list) { - if (i-- <= 0) - break; + list_for_each_entry(ops, &hook->ops_list, list) { + if (i-- <= 0) + break; - dev = hook->ops.dev; - nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND); + nft_chain_offload_cmd(basechain, ops->dev, + FLOW_BLOCK_UNBIND); + } } return err; } @@ -638,7 +645,7 @@ static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *n found = NULL; basechain = nft_base_chain(chain); list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.dev != dev) + if (!nft_hook_find_ops(hook, dev)) continue; found = hook; diff --git a/net/netfilter/nf_tables_trace.c 
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 580c55268f65..ae3fe87195ab 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -15,6 +15,7 @@ #include <linux/netfilter.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> @@ -90,6 +91,49 @@ static int nf_trace_fill_dev_info(struct sk_buff *nlskb, return 0; } +static int nf_trace_fill_ct_info(struct sk_buff *nlskb, + const struct sk_buff *skb) +{ + const struct nf_ct_hook *ct_hook; + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + u32 state; + + ct_hook = rcu_dereference(nf_ct_hook); + if (!ct_hook) + return 0; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) { + if (ctinfo != IP_CT_UNTRACKED) /* not seen by conntrack or invalid */ + return 0; + + state = NF_CT_STATE_UNTRACKED_BIT; + } else { + state = NF_CT_STATE_BIT(ctinfo); + } + + if (nla_put_be32(nlskb, NFTA_TRACE_CT_STATE, htonl(state))) + return -1; + + if (ct) { + u32 id = ct_hook->get_id(&ct->ct_general); + u32 status = READ_ONCE(ct->status); + u8 dir = CTINFO2DIR(ctinfo); + + if (nla_put_u8(nlskb, NFTA_TRACE_CT_DIRECTION, dir)) + return -1; + + if (nla_put_be32(nlskb, NFTA_TRACE_CT_ID, (__force __be32)id)) + return -1; + + if (status && nla_put_be32(nlskb, NFTA_TRACE_CT_STATUS, htonl(status))) + return -1; + } + + return 0; +} + static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, const struct nft_pktinfo *pkt) { @@ -210,7 +254,11 @@ void nft_trace_notify(const struct nft_pktinfo *pkt, nla_total_size(sizeof(__be32)) + /* trace type */ nla_total_size(0) + /* VERDICT, nested */ nla_total_size(sizeof(u32)) + /* verdict code */ - nla_total_size(sizeof(u32)) + /* id */ + nla_total_size(sizeof(u32)) + /* ct id */ + nla_total_size(sizeof(u8)) + /* ct direction */ + nla_total_size(sizeof(u32)) + /* ct state */ + nla_total_size(sizeof(u32)) + /* ct status */ + nla_total_size(sizeof(u32)) + /* trace id */ nla_total_size(NFT_TRACETYPE_LL_HSIZE) + nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) + nla_total_size(NFT_TRACETYPE_TRANSPORT_HSIZE) + @@ -291,6 +339,10 @@ void nft_trace_notify(const struct nft_pktinfo *pkt, if (nf_trace_fill_pkt_info(skb, pkt)) goto nla_put_failure; + + if (nf_trace_fill_ct_info(skb, pkt->skb)) + goto nla_put_failure; + info->packet_dumped = true; }
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index e598a2a252b0..ac77fc21632d 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -86,6 +86,7 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = { [NFNLGRP_NFTABLES] = NFNL_SUBSYS_NFTABLES, [NFNLGRP_ACCT_QUOTA] = NFNL_SUBSYS_ACCT, [NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES, + [NFNLGRP_NFT_DEV] = NFNL_SUBSYS_NFTABLES, }; static struct nfnl_net *nfnl_pernet(struct net *net)
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 134e05d31061..882962f3c84d 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -381,7 +381,7 @@ static void __nfulnl_flush(struct nfulnl_instance *inst) { /* timer holds a reference */ - if (del_timer(&inst->timer)) + if (timer_delete(&inst->timer)) instance_put(inst); if (inst->skb) __nfulnl_send(inst);
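The new nf_trace_fill_ct_info() above lets each nftables trace message carry the packet's conntrack state, direction, id and status, so a ruleset trace (e.g. via nft monitor trace) can show whether a packet was new, established or untracked without a separate lookup. The state mapping reduces to the uapi NF_CT_STATE_* helpers; a sketch under that assumption:

#include <linux/netfilter/nf_conntrack_common.h>
#include <net/netfilter/nf_conntrack.h>

static u32 demo_trace_ct_state(const struct sk_buff *skb)
{
	enum ip_conntrack_info ctinfo;
	const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

	if (!ct)	/* untracked is reported; "never seen" is skipped */
		return ctinfo == IP_CT_UNTRACKED ? NF_CT_STATE_UNTRACKED_BIT : 0;

	return NF_CT_STATE_BIT(ctinfo);	/* new/established/related, reply-aware */
}

Note also the nla_total_size() bookkeeping in nft_trace_notify(): the old catch-all "id" slot becomes "trace id" and four new ct slots are reserved, keeping the allocation estimate in step with the attributes the fill function may emit.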
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index d2773ce9b585..8b7b39d8a109 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -470,18 +470,18 @@ static int nfqnl_put_sk_classid(struct sk_buff *skb, struct sock *sk) return 0; } -static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata) +static int nfqnl_get_sk_secctx(struct sk_buff *skb, struct lsm_context *ctx) { - u32 seclen = 0; + int seclen = 0; #if IS_ENABLED(CONFIG_NETWORK_SECMARK) + if (!skb || !sk_fullsock(skb->sk)) return 0; read_lock_bh(&skb->sk->sk_callback_lock); if (skb->secmark) - security_secid_to_secctx(skb->secmark, secdata, &seclen); - + seclen = security_secid_to_secctx(skb->secmark, ctx); read_unlock_bh(&skb->sk->sk_callback_lock); #endif return seclen; @@ -567,8 +567,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, enum ip_conntrack_info ctinfo = 0; const struct nfnl_ct_hook *nfnl_ct; bool csum_verify; - char *secdata = NULL; - u32 seclen = 0; + struct lsm_context ctx = { NULL, 0, 0 }; + int seclen = 0; ktime_t tstamp; size = nlmsg_total_size(sizeof(struct nfgenmsg)) @@ -642,7 +642,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) { - seclen = nfqnl_get_sk_secctx(entskb, &secdata); + seclen = nfqnl_get_sk_secctx(entskb, &ctx); + if (seclen < 0) + return NULL; if (seclen) size += nla_total_size(seclen); } @@ -782,7 +784,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, if (nfqnl_put_sk_classid(skb, entskb->sk) < 0) goto nla_put_failure; - if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata)) + if (seclen > 0 && nla_put(skb, NFQA_SECCTX, ctx.len, ctx.context)) goto nla_put_failure; if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0) goto nla_put_failure; @@ -810,8 +812,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } nlh->nlmsg_len = skb->len; - if (seclen) - security_release_secctx(secdata, seclen); + if (seclen >= 0) + security_release_secctx(&ctx); return skb; nla_put_failure: @@ -819,8 +821,8 @@ nla_put_failure: kfree_skb(skb); net_err_ratelimited("nf_queue: error creating packet message\n"); nlmsg_failure: - if (seclen) - security_release_secctx(secdata, seclen); + if (seclen >= 0) + security_release_secctx(&ctx); return NULL; }
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 7010541fcca6..846d48ba8965 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -318,65 +318,74 @@ static const struct nft_chain_type nft_chain_filter_netdev = { }, }; -static void nft_netdev_event(unsigned long event, struct net_device *dev, - struct nft_ctx *ctx) +static int nft_netdev_event(unsigned long event, struct net_device *dev, + struct nft_base_chain *basechain, bool changename) { - struct nft_base_chain *basechain = nft_base_chain(ctx->chain); - struct nft_hook *hook, *found = NULL; - int n = 0; + struct nft_table *table = basechain->chain.table; + struct nf_hook_ops *ops; + struct nft_hook *hook; + bool match; list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.dev == dev) - found = hook; + ops = nft_hook_find_ops(hook, dev); + match = !strncmp(hook->ifname, dev->name, hook->ifnamelen); - n++; - } - if (!found) - return; + switch (event) { + case NETDEV_UNREGISTER: + /* NOP if not found or new name still matching */ + if (!ops || (changename && match)) + continue; - if (n > 1) { - if (!(ctx->chain->table->flags & NFT_TABLE_F_DORMANT)) - nf_unregister_net_hook(ctx->net, &found->ops); + if (!(table->flags & NFT_TABLE_F_DORMANT)) + nf_unregister_net_hook(dev_net(dev), ops); - list_del_rcu(&found->list); - kfree_rcu(found, rcu); - return; - } +
list_del_rcu(&ops->list); + kfree_rcu(ops, rcu); + break; + case NETDEV_REGISTER: + /* NOP if not matching or already registered */ + if (!match || (changename && ops)) + continue; - /* UNREGISTER events are also happening on netns exit. - * - * Although nf_tables core releases all tables/chains, only this event - * handler provides guarantee that hook->ops.dev is still accessible, - * so we cannot skip exiting net namespaces. - */ - __nft_release_basechain(ctx); + ops = kmemdup(&basechain->ops, + sizeof(struct nf_hook_ops), + GFP_KERNEL_ACCOUNT); + if (!ops) + return 1; + + ops->dev = dev; + + if (!(table->flags & NFT_TABLE_F_DORMANT) && + nf_register_net_hook(dev_net(dev), ops)) { + kfree(ops); + return 1; + } + list_add_tail_rcu(&ops->list, &hook->ops_list); + break; + } + nf_tables_chain_device_notify(&basechain->chain, + hook, dev, event); + break; + } + return 0; } -static int nf_tables_netdev_event(struct notifier_block *this, - unsigned long event, void *ptr) +static int __nf_tables_netdev_event(unsigned long event, + struct net_device *dev, + bool changename) { - struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nft_base_chain *basechain; struct nftables_pernet *nft_net; - struct nft_chain *chain, *nr; + struct nft_chain *chain; struct nft_table *table; - struct nft_ctx ctx = { - .net = dev_net(dev), - }; - if (event != NETDEV_UNREGISTER) - return NOTIFY_DONE; - - nft_net = nft_pernet(ctx.net); - mutex_lock(&nft_net->commit_mutex); + nft_net = nft_pernet(dev_net(dev)); list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV && table->family != NFPROTO_INET) continue; - ctx.family = table->family; - ctx.table = table; - list_for_each_entry_safe(chain, nr, &table->chains, list) { + list_for_each_entry(chain, &table->chains, list) { if (!nft_is_base_chain(chain)) continue; @@ -385,13 +394,40 @@ static int nf_tables_netdev_event(struct notifier_block *this, basechain->ops.hooknum != NF_INET_INGRESS) continue; - ctx.chain = chain; - nft_netdev_event(event, dev, &ctx); + if (nft_netdev_event(event, dev, basechain, changename)) + return 1; } } - mutex_unlock(&nft_net->commit_mutex); + return 0; +} + +static int nf_tables_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct nftables_pernet *nft_net; + int ret = NOTIFY_DONE; - return NOTIFY_DONE; + if (event != NETDEV_REGISTER && + event != NETDEV_UNREGISTER && + event != NETDEV_CHANGENAME) + return NOTIFY_DONE; + + nft_net = nft_pernet(dev_net(dev)); + mutex_lock(&nft_net->commit_mutex); + + if (event == NETDEV_CHANGENAME) { + if (__nf_tables_netdev_event(NETDEV_REGISTER, dev, true)) { + ret = NOTIFY_BAD; + goto out_unlock; + } + __nf_tables_netdev_event(NETDEV_UNREGISTER, dev, true); + } else if (__nf_tables_netdev_event(event, dev, false)) { + ret = NOTIFY_BAD; + } +out_unlock: + mutex_unlock(&nft_net->commit_mutex); + return ret; } static struct notifier_block nf_tables_netdev_notifier = { diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 7ca4f0d21fe2..72711d62fddf 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -228,7 +228,7 @@ static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) return 0; } -static void nft_compat_wait_for_destructors(void) +static void nft_compat_wait_for_destructors(struct net *net) { /* xtables matches or targets can have side effects, e.g. * creation/destruction of /proc files. 
@@ -236,7 +236,7 @@ static void nft_compat_wait_for_destructors(void) * work queue. If we have pending invocations we thus * need to wait for those to finish. */ - nf_tables_trans_destroy_flush_work(); + nf_tables_trans_destroy_flush_work(net); } static int @@ -262,7 +262,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv); - nft_compat_wait_for_destructors(); + nft_compat_wait_for_destructors(ctx->net); ret = xt_check_target(&par, size, proto, inv); if (ret < 0) { @@ -515,7 +515,7 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv); - nft_compat_wait_for_destructors(); + nft_compat_wait_for_destructors(ctx->net); return xt_check_match(&par, size, proto, inv); } diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 67a41cd2baaf..d526e69a2a2b 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -230,6 +230,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, enum ip_conntrack_info ctinfo; u16 value = nft_reg_load16(®s->data[priv->sreg]); struct nf_conn *ct; + int oldcnt; ct = nf_ct_get(skb, &ctinfo); if (ct) /* already tracked */ @@ -250,10 +251,11 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, ct = this_cpu_read(nft_ct_pcpu_template); - if (likely(refcount_read(&ct->ct_general.use) == 1)) { - refcount_inc(&ct->ct_general.use); + __refcount_inc(&ct->ct_general.use, &oldcnt); + if (likely(oldcnt == 1)) { nf_ct_zone_add(ct, &zone); } else { + refcount_dec(&ct->ct_general.use); /* previous skb got queued to userspace, allocate temporary * one until percpu template can be reused. */ @@ -929,7 +931,7 @@ static void nft_ct_timeout_obj_eval(struct nft_object *obj, */ values = nf_ct_timeout_data(timeout); if (values) - nf_ct_refresh(ct, pkt->skb, values[0]); + nf_ct_refresh(ct, values[0]); } static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index b8d03364566c..c74012c99125 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -85,7 +85,6 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb, unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; struct iphdr *iph, _iph; - unsigned int start; bool found = false; __be32 info; int optlen; @@ -93,7 +92,6 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb, iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); if (!iph) return -EBADMSG; - start = sizeof(struct iphdr); optlen = iph->ihl * 4 - (int)sizeof(struct iphdr); if (optlen <= 0) @@ -103,7 +101,7 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb, /* Copy the options since __ip_options_compile() modifies * the options. */ - if (skb_copy_bits(skb, start, opt->__data, optlen)) + if (skb_copy_bits(skb, sizeof(struct iphdr), opt->__data, optlen)) return -EBADMSG; opt->optlen = optlen; @@ -118,18 +116,18 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb, found = target == IPOPT_SSRR ? 
opt->is_strictroute : !opt->is_strictroute; if (found) - *offset = opt->srr + start; + *offset = opt->srr; break; case IPOPT_RR: if (!opt->rr) break; - *offset = opt->rr + start; + *offset = opt->rr; found = true; break; case IPOPT_RA: if (!opt->router_alert) break; - *offset = opt->router_alert + start; + *offset = opt->router_alert; found = true; break; default: diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 3b474d235663..225ff293cd50 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -175,7 +175,7 @@ static bool nft_flowtable_find_dev(const struct net_device *dev, bool found = false; list_for_each_entry_rcu(hook, &ft->hook_list, list) { - if (hook->ops.dev != dev) + if (!nft_hook_find_ops_rcu(hook, dev)) continue; found = true; @@ -289,6 +289,15 @@ static bool nft_flow_offload_skip(struct sk_buff *skb, int family) return false; } +static void flow_offload_ct_tcp(struct nf_conn *ct) +{ + /* conntrack will not see all packets, disable tcp window validation. */ + spin_lock_bh(&ct->lock); + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + spin_unlock_bh(&ct->lock); +} + static void nft_flow_offload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -356,11 +365,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, goto err_flow_alloc; flow_offload_route_init(flow, &route); - - if (tcph) { - ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; - ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; - } + if (tcph) + flow_offload_ct_tcp(ct); __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); ret = flow_offload_add(flowtable, flow); diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 817ab978d24a..c4569d4b9228 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -23,7 +23,14 @@ #include <linux/ip.h> #include <linux/ipv6.h> -static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx); +struct nft_inner_tun_ctx_locked { + struct nft_inner_tun_ctx ctx; + local_lock_t bh_lock; +}; + +static DEFINE_PER_CPU(struct nft_inner_tun_ctx_locked, nft_pcpu_tun_ctx) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; /* Same layout as nft_expr but it embeds the private expression data area. 
*/ struct __nft_expr { @@ -237,12 +244,15 @@ static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *this_cpu_tun_ctx; local_bh_disable(); - this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); + local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx); if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) { local_bh_enable(); + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); return false; } *tun_ctx = *this_cpu_tun_ctx; + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); local_bh_enable(); return true; @@ -254,9 +264,11 @@ static void nft_inner_save_tun_ctx(const struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *this_cpu_tun_ctx; local_bh_disable(); - this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); + local_lock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); + this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx.ctx); if (this_cpu_tun_ctx->cookie != tun_ctx->cookie) *this_cpu_tun_ctx = *tun_ctx; + local_unlock_nested_bh(&nft_pcpu_tun_ctx.bh_lock); local_bh_enable(); } diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 9b2d7463d3d3..df0798da2329 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -19,10 +19,16 @@ struct nft_quota { }; static inline bool nft_overquota(struct nft_quota *priv, - const struct sk_buff *skb) + const struct sk_buff *skb, + bool *report) { - return atomic64_add_return(skb->len, priv->consumed) >= - atomic64_read(&priv->quota); + u64 consumed = atomic64_add_return(skb->len, priv->consumed); + u64 quota = atomic64_read(&priv->quota); + + if (report) + *report = consumed >= quota; + + return consumed > quota; } static inline bool nft_quota_invert(struct nft_quota *priv) @@ -34,7 +40,7 @@ static inline void nft_quota_do_eval(struct nft_quota *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - if (nft_overquota(priv, pkt->skb) ^ nft_quota_invert(priv)) + if (nft_overquota(priv, pkt->skb, NULL) ^ nft_quota_invert(priv)) regs->verdict.code = NFT_BREAK; } @@ -51,13 +57,13 @@ static void nft_quota_obj_eval(struct nft_object *obj, const struct nft_pktinfo *pkt) { struct nft_quota *priv = nft_obj_data(obj); - bool overquota; + bool overquota, report; - overquota = nft_overquota(priv, pkt->skb); + overquota = nft_overquota(priv, pkt->skb, &report); if (overquota ^ nft_quota_invert(priv)) regs->verdict.code = NFT_BREAK; - if (overquota && + if (report && !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) nft_obj_notify(nft_net(pkt), obj->key.table, obj, 0, 0, NFT_MSG_NEWOBJ, 0, nft_pf(pkt), 0, GFP_ATOMIC); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 8bfac4185ac7..abb0c8ec6371 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -309,7 +309,8 @@ static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, nft_setelem_expr_foreach(expr, elem_expr, size) { if (expr->ops->gc && - expr->ops->gc(read_pnet(&set->net), expr)) + expr->ops->gc(read_pnet(&set->net), expr) && + set->flags & NFT_SET_EVAL) return true; } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 7be342b495f5..c5855069bdab 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -663,6 +663,9 @@ static int pipapo_realloc_mt(struct nft_pipapo_field *f, check_add_overflow(rules, extra, &rules_alloc)) return -EOVERFLOW; + if (rules_alloc > (INT_MAX / sizeof(*new_mt))) + return -ENOMEM; + new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), 
GFP_KERNEL_ACCOUNT); if (!new_mt) return -ENOMEM; @@ -683,6 +686,30 @@ out_free: return 0; } + +/** + * lt_calculate_size() - Get storage size for lookup table with overflow check + * @groups: Amount of bit groups + * @bb: Number of bits grouped together in lookup table buckets + * @bsize: Size of each bucket in lookup table, in longs + * + * Return: allocation size including alignment overhead, negative on overflow + */ +static ssize_t lt_calculate_size(unsigned int groups, unsigned int bb, + unsigned int bsize) +{ + ssize_t ret = groups * NFT_PIPAPO_BUCKETS(bb) * sizeof(long); + + if (check_mul_overflow(ret, bsize, &ret)) + return -1; + if (check_add_overflow(ret, NFT_PIPAPO_ALIGN_HEADROOM, &ret)) + return -1; + if (ret > INT_MAX) + return -1; + + return ret; +} + /** * pipapo_resize() - Resize lookup or mapping table, or both * @f: Field containing lookup and mapping tables @@ -701,6 +728,7 @@ static int pipapo_resize(struct nft_pipapo_field *f, long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p; unsigned int new_bucket_size, copy; int group, bucket, err; + ssize_t lt_size; if (rules >= NFT_PIPAPO_RULE0_MAX) return -ENOSPC; @@ -719,10 +747,11 @@ static int pipapo_resize(struct nft_pipapo_field *f, else copy = new_bucket_size; - new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS(f->bb) * - new_bucket_size * sizeof(*new_lt) + - NFT_PIPAPO_ALIGN_HEADROOM, - GFP_KERNEL); + lt_size = lt_calculate_size(f->groups, f->bb, new_bucket_size); + if (lt_size < 0) + return -ENOMEM; + + new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) return -ENOMEM; @@ -907,7 +936,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) { unsigned int groups, bb; unsigned long *new_lt; - size_t lt_size; + ssize_t lt_size; lt_size = f->groups * NFT_PIPAPO_BUCKETS(f->bb) * f->bsize * sizeof(*f->lt); @@ -917,15 +946,17 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) groups = f->groups * 2; bb = NFT_PIPAPO_GROUP_BITS_LARGE_SET; - lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * - sizeof(*f->lt); + lt_size = lt_calculate_size(groups, bb, f->bsize); + if (lt_size < 0) + return; } else if (f->bb == NFT_PIPAPO_GROUP_BITS_LARGE_SET && lt_size < NFT_PIPAPO_LT_SIZE_LOW) { groups = f->groups / 2; bb = NFT_PIPAPO_GROUP_BITS_SMALL_SET; - lt_size = groups * NFT_PIPAPO_BUCKETS(bb) * f->bsize * - sizeof(*f->lt); + lt_size = lt_calculate_size(groups, bb, f->bsize); + if (lt_size < 0) + return; /* Don't increase group width if the resulting lookup table size * would exceed the upper size threshold for a "small" set. 
@@ -936,7 +967,7 @@ static void pipapo_lt_bits_adjust(struct nft_pipapo_field *f) return; } - new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL_ACCOUNT); + new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) return; @@ -1451,13 +1482,15 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) for (i = 0; i < old->field_count; i++) { unsigned long *new_lt; + ssize_t lt_size; memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)); - new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) * - src->bsize * sizeof(*dst->lt) + - NFT_PIPAPO_ALIGN_HEADROOM, - GFP_KERNEL_ACCOUNT); + lt_size = lt_calculate_size(src->groups, src->bb, src->bsize); + if (lt_size < 0) + goto out_lt; + + new_lt = kvzalloc(lt_size, GFP_KERNEL_ACCOUNT); if (!new_lt) goto out_lt; @@ -1469,6 +1502,9 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) src->groups * NFT_PIPAPO_BUCKETS(src->bb)); if (src->rules > 0) { + if (src->rules_alloc > (INT_MAX / sizeof(*src->mt))) + goto out_mt; + dst->mt = kvmalloc_array(src->rules_alloc, sizeof(*src->mt), GFP_KERNEL_ACCOUNT); diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index b8d3c3213efe..be7c16c79f71 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -994,8 +994,9 @@ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill, NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize); NFT_PIPAPO_AVX2_AND(6, 2, 3); + NFT_PIPAPO_AVX2_AND(3, 4, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize); - NFT_PIPAPO_AVX2_AND(0, 4, 5); + NFT_PIPAPO_AVX2_AND(0, 3, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize); NFT_PIPAPO_AVX2_AND(2, 6, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize); @@ -1113,6 +1114,25 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, } /** + * pipapo_resmap_init_avx2() - Initialise result map before first use + * @m: Matching data, including mapping table + * @res_map: Result map + * + * Like pipapo_resmap_init() but do not set start map bits covered by the first field. + */ +static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, unsigned long *res_map) +{ + const struct nft_pipapo_field *f = m->f; + int i; + + /* Starting map doesn't need to be set to all-ones for this implementation, + * but we do need to zero the remaining bits, if any. + */ + for (i = f->bsize; i < m->bsize_max; i++) + res_map[i] = 0ul; +} + +/** * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation * @net: Network namespace * @set: nftables API set representation @@ -1170,7 +1190,7 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, res = scratch->map + (map_index ? m->bsize_max : 0); fill = scratch->map + (map_index ? 0 : m->bsize_max); - /* Starting map doesn't need to be set for this implementation */ + pipapo_resmap_init_avx2(m, res); nft_pipapo_avx2_prepare(); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index b7ea21327549..2e8ef16ff191 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -750,6 +750,46 @@ static void nft_rbtree_gc_init(const struct nft_set *set) priv->last_gc = jiffies; } +/* rbtree stores ranges as singleton elements, each range is composed of two + * elements ... + */ +static u32 nft_rbtree_ksize(u32 size) +{ + return size * 2; +} + +/* ... hide this detail to userspace. 
*/ +static u32 nft_rbtree_usize(u32 size) +{ + if (!size) + return 0; + + return size / 2; +} + +static u32 nft_rbtree_adjust_maxsize(const struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe; + struct rb_node *node; + const void *key; + + node = rb_last(&priv->root); + if (!node) + return 0; + + rbe = rb_entry(node, struct nft_rbtree_elem, node); + if (!nft_rbtree_interval_end(rbe)) + return 0; + + key = nft_set_ext_key(&rbe->ext); + if (memchr(key, 1, set->klen)) + return 0; + + /* this is the all-zero no-match element. */ + return 1; +} + const struct nft_set_type nft_set_rbtree_type = { .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { @@ -768,5 +808,8 @@ const struct nft_set_type nft_set_rbtree_type = { .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, .get = nft_rbtree_get, + .ksize = nft_rbtree_ksize, + .usize = nft_rbtree_usize, + .adjust_maxsize = nft_rbtree_adjust_maxsize, }, };
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 681301b46aa4..a12486ae089d 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -335,13 +335,13 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr, static const struct nla_policy nft_tunnel_opts_geneve_policy[NFTA_TUNNEL_KEY_GENEVE_MAX + 1] = { [NFTA_TUNNEL_KEY_GENEVE_CLASS] = { .type = NLA_U16 }, [NFTA_TUNNEL_KEY_GENEVE_TYPE] = { .type = NLA_U8 }, - [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 }, + [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 }, }; static int nft_tunnel_obj_geneve_init(const struct nlattr *attr, struct nft_tunnel_opts *opts) { - struct geneve_opt *opt = (struct geneve_opt *)opts->u.data + opts->len; + struct geneve_opt *opt = (struct geneve_opt *)(opts->u.data + opts->len); struct nlattr *tb[NFTA_TUNNEL_KEY_GENEVE_MAX + 1]; int err, data_len; @@ -621,11 +621,11 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, struct geneve_opt *opt; int offset = 0; - inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); - if (!inner) - goto failure; while (opts->len > offset) { - opt = (struct geneve_opt *)opts->u.data + offset; + inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_GENEVE); + if (!inner) + goto failure; + opt = (struct geneve_opt *)(opts->u.data + offset); if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS, opt->opt_class) || nla_put_u8(skb, NFTA_TUNNEL_KEY_GENEVE_TYPE, @@ -634,8 +634,8 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb, opt->length * 4, opt->opt_data)) goto inner_failure; offset += sizeof(*opt) + opt->length * 4; + nla_nest_end(skb, inner); } - nla_nest_end(skb, inner); } nla_nest_end(skb, nest); return 0;
diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index 8a07b46cc8fb..3210cfc966ab 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -112,7 +112,8 @@ static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode) return true; } - return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL; + return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL || + mode == XFRM_MODE_IPTFS; } static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
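Two independent geneve fixes sit in the nft_tunnel.c hunks above: the dump path now opens one NFTA_TUNNEL_KEY_OPTS_GENEVE nest per option instead of a single nest around the whole loop, and the option pointer arithmetic drops a cast-precedence bug. On the latter: a cast binds tighter than '+', so the old expression advanced the pointer in sizeof(struct geneve_opt) units rather than bytes. A self-contained illustration with a hypothetical 4-byte header type:

struct demo_opt { unsigned char hdr[4]; };	/* stand-in for struct geneve_opt */

static void demo_pointer_math(unsigned char *data, unsigned int off)
{
	struct demo_opt *wrong = (struct demo_opt *)data + off;	/* data + 4 * off */
	struct demo_opt *right = (struct demo_opt *)(data + off);	/* data + off */

	(void)wrong;
	(void)right;
}

Since the code accumulates byte offsets as sizeof(*opt) + opt->length * 4, only the parenthesized form lands on the next option header.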
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 9f54819eb52c..9082155ee558 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -168,7 +168,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) INIT_WORK(&info->timer->work, idletimer_tg_work); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); return 0; @@ -229,7 +229,7 @@ static int idletimer_tg_create_v1(struct idletimer_tg_info_v1 *info) } else { timer_setup(&info->timer->timer, idletimer_tg_expired, 0); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); } return 0; @@ -254,7 +254,7 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb, info->label, info->timeout); mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); return XT_CONTINUE; } @@ -275,7 +275,7 @@ static unsigned int idletimer_tg_target_v1(struct sk_buff *skb, alarm_start_relative(&info->timer->alarm, tout); } else { mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); } return XT_CONTINUE; @@ -320,7 +320,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par) if (info->timer) { info->timer->refcnt++; mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt); @@ -382,7 +382,7 @@ static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) } } else { mod_timer(&info->timer->timer, - msecs_to_jiffies(info->timeout * 1000) + jiffies); + secs_to_jiffies(info->timeout) + jiffies); } pr_debug("increased refcnt of timer %s to %u\n", info->label, info->timer->refcnt);
diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 30e99464171b..93f064306901 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -91,7 +91,7 @@ tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par) return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb)); } -#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par) { @@ -119,7 +119,7 @@ static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = { .targetsize = sizeof(struct xt_tcpoptstrip_target_info), .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_IP6_NF_MANGLE) +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "TCPOPTSTRIP", .family = NFPROTO_IPV6,
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index c0f5e9a4f3c6..c437fbd59ec1 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -23,6 +23,8 @@ MODULE_DESCRIPTION("Xtables: process control group matching"); MODULE_ALIAS("ipt_cgroup"); MODULE_ALIAS("ip6t_cgroup"); +#define NET_CLS_CLASSID_INVALID_MSG "xt_cgroup: classid invalid without net_cls cgroups\n" + static int cgroup_mt_check_v0(const struct xt_mtchk_param *par) { struct xt_cgroup_info_v0 *info = par->matchinfo; @@ -30,6 +32,11 @@ static int cgroup_mt_check_v0(const struct xt_mtchk_param *par) if (info->invert & ~1) return -EINVAL; + if (!IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + return 0; } @@ -51,6 +58,11 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par) return -EINVAL; } + if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + info->priv = NULL; if (info->has_path) { cgrp = cgroup_get_from_path(info->path); @@ -83,6 +95,11
@@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) return -EINVAL; } + if (info->has_classid && !IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)) { + pr_info(NET_CLS_CLASSID_INVALID_MSG); + return -EINVAL; + } + info->priv = NULL; if (info->has_path) { cgrp = cgroup_get_from_path(info->path); @@ -100,6 +117,7 @@ static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) static bool cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { +#ifdef CONFIG_CGROUP_NET_CLASSID const struct xt_cgroup_info_v0 *info = par->matchinfo; struct sock *sk = skb->sk; @@ -108,6 +126,8 @@ cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^ info->invert; +#endif + return false; } static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) @@ -123,9 +143,12 @@ static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) if (ancestor) return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ info->invert_path; +#ifdef CONFIG_CGROUP_NET_CLASSID else return (info->classid == sock_cgroup_classid(skcd)) ^ info->invert_classid; +#endif + return false; } static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) @@ -141,9 +164,12 @@ static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) if (ancestor) return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ info->invert_path; +#ifdef CONFIG_CGROUP_NET_CLASSID else return (info->classid == sock_cgroup_classid(skcd)) ^ info->invert_classid; +#endif + return false; } static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par) diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 0859b8f76764..3b507694e81e 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -15,7 +15,6 @@ #include <linux/random.h> #include <linux/jhash.h> #include <linux/slab.h> -#include <linux/vmalloc.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/list.h> @@ -294,8 +293,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, if (size < 16) size = 16; } - /* FIXME: don't use vmalloc() here or anywhere else -HW */ - hinfo = vmalloc(struct_size(hinfo, hash, size)); + hinfo = kvmalloc(struct_size(hinfo, hash, size), GFP_KERNEL); if (hinfo == NULL) return -ENOMEM; *out_hinfo = hinfo; @@ -303,7 +301,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, /* copy match config into hashtable config */ ret = cfg_copy(&hinfo->cfg, (void *)cfg, 3); if (ret) { - vfree(hinfo); + kvfree(hinfo); return ret; } @@ -322,7 +320,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, hinfo->rnd_initialized = false; hinfo->name = kstrdup(name, GFP_KERNEL); if (!hinfo->name) { - vfree(hinfo); + kvfree(hinfo); return -ENOMEM; } spin_lock_init(&hinfo->lock); @@ -344,7 +342,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, ops, hinfo); if (hinfo->pde == NULL) { kfree(hinfo->name); - vfree(hinfo); + kvfree(hinfo); return -ENOMEM; } hinfo->net = net; @@ -363,11 +361,15 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, bool select unsigned int i; for (i = 0; i < ht->cfg.size; i++) { + struct hlist_head *head = &ht->hash[i]; struct dsthash_ent *dh; struct hlist_node *n; + if (hlist_empty(head)) + continue; + spin_lock_bh(&ht->lock); - hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) { + hlist_for_each_entry_safe(dh, n, head, node) { 
if (time_after_eq(jiffies, dh->expires) || select_all) dsthash_free(ht, dh); } @@ -429,7 +431,7 @@ static void htable_put(struct xt_hashlimit_htable *hinfo) cancel_delayed_work_sync(&hinfo->gc_work); htable_selective_cleanup(hinfo, true); kfree(hinfo->name); - vfree(hinfo); + kvfree(hinfo); } } diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 65b965ca40ea..59b9d04400ca 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -48,7 +48,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = { .targetsize = sizeof(struct xt_mark_tginfo2), .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES) +#if IS_ENABLED(CONFIG_IP_NF_ARPTABLES) || IS_ENABLED(CONFIG_NFT_COMPAT_ARP) { .name = "MARK", .revision = 2, diff --git a/net/netfilter/xt_repldata.h b/net/netfilter/xt_repldata.h index 5d1fb7018dba..600060ca940a 100644 --- a/net/netfilter/xt_repldata.h +++ b/net/netfilter/xt_repldata.h @@ -29,7 +29,7 @@ if (tbl == NULL) \ return NULL; \ term = (struct type##_error *)&(((char *)tbl)[term_offset]); \ - strscpy_pad(tbl->repl.name, info->name, sizeof(tbl->repl.name)); \ + strscpy(tbl->repl.name, info->name); \ *term = (struct type##_error)typ2##_ERROR_INIT; \ tbl->repl.valid_hooks = hook_mask; \ tbl->repl.num_entries = nhooks + 1; \ diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index cd9160bbc919..33b77084a4e5 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -1165,6 +1165,11 @@ int netlbl_conn_setattr(struct sock *sk, break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: + if (sk->sk_family != AF_INET6) { + ret_val = -EAFNOSUPPORT; + goto conn_setattr_return; + } + addr6 = (struct sockaddr_in6 *)addr; entry = netlbl_domhsh_getentry_af6(secattr->domain, &addr6->sin6_addr); diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 1bc2d0890a9f..dfda9ea61971 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -374,8 +374,7 @@ int netlbl_unlhsh_add(struct net *net, struct net_device *dev; struct netlbl_unlhsh_iface *iface; struct audit_buffer *audit_buf = NULL; - char *secctx = NULL; - u32 secctx_len; + struct lsm_context ctx; if (addr_len != sizeof(struct in_addr) && addr_len != sizeof(struct in6_addr)) @@ -438,11 +437,9 @@ int netlbl_unlhsh_add(struct net *net, unlhsh_add_return: rcu_read_unlock(); if (audit_buf != NULL) { - if (security_secid_to_secctx(secid, - &secctx, - &secctx_len) == 0) { - audit_log_format(audit_buf, " sec_obj=%s", secctx); - security_release_secctx(secctx, secctx_len); + if (security_secid_to_secctx(secid, &ctx) >= 0) { + audit_log_format(audit_buf, " sec_obj=%s", ctx.context); + security_release_secctx(&ctx); } audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 
1 : 0); audit_log_end(audit_buf); @@ -473,8 +470,7 @@ static int netlbl_unlhsh_remove_addr4(struct net *net, struct netlbl_unlhsh_addr4 *entry; struct audit_buffer *audit_buf; struct net_device *dev; - char *secctx; - u32 secctx_len; + struct lsm_context ctx; spin_lock(&netlbl_unlhsh_lock); list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr, @@ -494,10 +490,9 @@ static int netlbl_unlhsh_remove_addr4(struct net *net, addr->s_addr, mask->s_addr); dev_put(dev); if (entry != NULL && - security_secid_to_secctx(entry->secid, - &secctx, &secctx_len) == 0) { - audit_log_format(audit_buf, " sec_obj=%s", secctx); - security_release_secctx(secctx, secctx_len); + security_secid_to_secctx(entry->secid, &ctx) >= 0) { + audit_log_format(audit_buf, " sec_obj=%s", ctx.context); + security_release_secctx(&ctx); } audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); audit_log_end(audit_buf); @@ -534,8 +529,7 @@ static int netlbl_unlhsh_remove_addr6(struct net *net, struct netlbl_unlhsh_addr6 *entry; struct audit_buffer *audit_buf; struct net_device *dev; - char *secctx; - u32 secctx_len; + struct lsm_context ctx; spin_lock(&netlbl_unlhsh_lock); list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list); @@ -554,10 +548,9 @@ static int netlbl_unlhsh_remove_addr6(struct net *net, addr, mask); dev_put(dev); if (entry != NULL && - security_secid_to_secctx(entry->secid, - &secctx, &secctx_len) == 0) { - audit_log_format(audit_buf, " sec_obj=%s", secctx); - security_release_secctx(secctx, secctx_len); + security_secid_to_secctx(entry->secid, &ctx) >= 0) { + audit_log_format(audit_buf, " sec_obj=%s", ctx.context); + security_release_secctx(&ctx); } audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); audit_log_end(audit_buf); @@ -1069,10 +1062,9 @@ static int netlbl_unlabel_staticlist_gen(u32 cmd, int ret_val = -ENOMEM; struct netlbl_unlhsh_walk_arg *cb_arg = arg; struct net_device *dev; + struct lsm_context ctx; void *data; u32 secid; - char *secctx; - u32 secctx_len; data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid, cb_arg->seq, &netlbl_unlabel_gnl_family, @@ -1127,14 +1119,14 @@ static int netlbl_unlabel_staticlist_gen(u32 cmd, secid = addr6->secid; } - ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len); - if (ret_val != 0) + ret_val = security_secid_to_secctx(secid, &ctx); + if (ret_val < 0) goto list_cb_failure; ret_val = nla_put(cb_arg->skb, NLBL_UNLABEL_A_SECCTX, - secctx_len, - secctx); - security_release_secctx(secctx, secctx_len); + ctx.len, + ctx.context); + security_release_secctx(&ctx); if (ret_val != 0) goto list_cb_failure; diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 81635a13987b..0d04d23aafe7 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -84,8 +84,7 @@ struct audit_buffer *netlbl_audit_start_common(int type, struct netlbl_audit *audit_info) { struct audit_buffer *audit_buf; - char *secctx; - u32 secctx_len; + struct lsm_context ctx; if (audit_enabled == AUDIT_OFF) return NULL; @@ -99,10 +98,9 @@ struct audit_buffer *netlbl_audit_start_common(int type, audit_info->sessionid); if (lsmprop_is_set(&audit_info->prop) && - security_lsmprop_to_secctx(&audit_info->prop, &secctx, - &secctx_len) == 0) { - audit_log_format(audit_buf, " subj=%s", secctx); - security_release_secctx(secctx, secctx_len); + security_lsmprop_to_secctx(&audit_info->prop, &ctx) > 0) { + audit_log_format(audit_buf, " subj=%s", ctx.context); + security_release_secctx(&ctx); } return audit_buf; 
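The netlabel conversions above (and the nfnetlink_queue one earlier) all track the same LSM API change: security_secid_to_secctx() and security_lsmprop_to_secctx() now fill a caller-provided struct lsm_context and return a negative value on failure, with the non-negative return carrying the context length, and security_release_secctx() takes the same struct back. A minimal sketch of the new calling convention:

#include <linux/audit.h>
#include <linux/security.h>

static void demo_audit_secctx(struct audit_buffer *ab, u32 secid)
{
	struct lsm_context ctx;

	if (security_secid_to_secctx(secid, &ctx) >= 0) {
		audit_log_format(ab, " sec_obj=%s", ctx.context);
		security_release_secctx(&ctx);	/* releases ctx.context */
	}
}

The hunks keep the per-caller subtleties: netlabel treats any non-negative return as success, while the nfqueue path holds on to the signed length and only emits the NFQA_SECCTX attribute when it is strictly positive.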
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f4e7b5e4bb59..e8972a857e51 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -771,6 +771,7 @@ static int netlink_release(struct socket *sock) nlk->cb.done(&nlk->cb); module_put(nlk->cb.module); kfree_skb(nlk->cb.skb); + WRITE_ONCE(nlk->cb_running, false); } module_put(nlk->module); @@ -795,16 +796,6 @@ static int netlink_release(struct socket *sock) sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); - /* Because struct net might disappear soon, do not keep a pointer. */ - if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) { - __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); - /* Because of deferred_put_nlk_sk and use of work queue, - * it is possible netns will be freed before this socket. - */ - sock_net_set(sk, &init_net); - __netns_tracker_alloc(&init_net, &sk->ns_tracker, - false, GFP_KERNEL); - } call_rcu(&nlk->rcu, deferred_put_nlk_sk); return 0; } @@ -1287,6 +1278,7 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) { int delta; + skb_assert_len(skb); WARN_ON(skb->sk != NULL); delta = skb->end - skb->tail; if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) diff --git a/net/netlink/policy.c b/net/netlink/policy.c index 1f8909c16f14..99458da6be32 100644 --- a/net/netlink/policy.c +++ b/net/netlink/policy.c @@ -311,6 +311,8 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state, NL_POLICY_TYPE_ATTR_PAD)) goto nla_put_failure; break; + } else if (pt->validation_type == NLA_VALIDATE_FUNCTION) { + break; } nla_get_range_unsigned(pt, &range); @@ -340,6 +342,9 @@ __netlink_policy_dump_write_attr(struct netlink_policy_dump_state *state, else type = NL_ATTR_TYPE_SINT; + if (pt->validation_type == NLA_VALIDATE_FUNCTION) + break; + nla_get_range_signed(pt, &range); if (nla_put_s64(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_S, diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c index 511819fbfa67..7a9d765b30c0 100644 --- a/net/netrom/nr_loopback.c +++ b/net/netrom/nr_loopback.c @@ -68,6 +68,6 @@ static void nr_loopback_timer(struct timer_list *unused) void nr_loopback_clear(void) { - del_timer_sync(&loopback_timer); + timer_delete_sync(&loopback_timer); skb_queue_purge(&loopback_queue); } diff --git a/net/nfc/core.c b/net/nfc/core.c index e58dc6405054..75ed8a9146ba 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -464,7 +464,7 @@ int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx, u8 mode) } if (dev->ops->check_presence) - del_timer_sync(&dev->check_pres_timer); + timer_delete_sync(&dev->check_pres_timer); dev->ops->deactivate_target(dev, dev->active_target, mode); dev->active_target = NULL; @@ -509,7 +509,7 @@ int nfc_data_exchange(struct nfc_dev *dev, u32 target_idx, struct sk_buff *skb, } if (dev->ops->check_presence) - del_timer_sync(&dev->check_pres_timer); + timer_delete_sync(&dev->check_pres_timer); rc = dev->ops->im_transceive(dev, dev->active_target, skb, cb, cb_context); @@ -1172,7 +1172,7 @@ void nfc_unregister_device(struct nfc_dev *dev) device_unlock(&dev->dev); if (dev->ops->check_presence) { - del_timer_sync(&dev->check_pres_timer); + timer_delete_sync(&dev->check_pres_timer); cancel_work_sync(&dev->check_pres_work); } diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index ceb87db57cdb..aa493344d93e 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -148,7 +148,7 @@ static void nfc_hci_msg_rx_work(struct work_struct *work) static void __nfc_hci_cmd_completion(struct nfc_hci_dev *hdev, 
int err, struct sk_buff *skb) { - del_timer_sync(&hdev->cmd_timer); + timer_delete_sync(&hdev->cmd_timer); if (hdev->cmd_pending_msg->cb) hdev->cmd_pending_msg->cb(hdev->cmd_pending_msg->cb_context, @@ -1046,7 +1046,7 @@ void nfc_hci_unregister_device(struct nfc_hci_dev *hdev) mutex_unlock(&hdev->msg_tx_mutex); - del_timer_sync(&hdev->cmd_timer); + timer_delete_sync(&hdev->cmd_timer); cancel_work_sync(&hdev->msg_tx_work); cancel_work_sync(&hdev->msg_rx_work); diff --git a/net/nfc/hci/llc.c b/net/nfc/hci/llc.c index ba91284f4086..e6cf4eb06b46 100644 --- a/net/nfc/hci/llc.c +++ b/net/nfc/hci/llc.c @@ -78,17 +78,6 @@ static struct nfc_llc_engine *nfc_llc_name_to_engine(const char *name) return NULL; } -void nfc_llc_unregister(const char *name) -{ - struct nfc_llc_engine *llc_engine; - - llc_engine = nfc_llc_name_to_engine(name); - if (llc_engine == NULL) - return; - - nfc_llc_del_engine(llc_engine); -} - struct nfc_llc *nfc_llc_allocate(const char *name, struct nfc_hci_dev *hdev, xmit_to_drv_t xmit_to_drv, rcv_to_hci_t rcv_to_hci, int tx_headroom, diff --git a/net/nfc/hci/llc.h b/net/nfc/hci/llc.h index d66271d211a5..09914608ec43 100644 --- a/net/nfc/hci/llc.h +++ b/net/nfc/hci/llc.h @@ -40,7 +40,6 @@ struct nfc_llc { void *nfc_llc_get_data(struct nfc_llc *llc); int nfc_llc_register(const char *name, const struct nfc_llc_ops *ops); -void nfc_llc_unregister(const char *name); int nfc_llc_nop_register(void); diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index e90f70385813..ce9c683a3ead 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -198,7 +198,7 @@ static void llc_shdlc_reset_t2(struct llc_shdlc *shdlc, int y_nr) if (skb_queue_empty(&shdlc->ack_pending_q)) { if (shdlc->t2_active) { - del_timer_sync(&shdlc->t2_timer); + timer_delete_sync(&shdlc->t2_timer); shdlc->t2_active = false; pr_debug("All sent frames acked. 
Stopped T2(retransmit)\n"); @@ -289,7 +289,7 @@ static void llc_shdlc_rcv_rej(struct llc_shdlc *shdlc, int y_nr) if (llc_shdlc_x_lteq_y_lt_z(shdlc->dnr, y_nr, shdlc->ns)) { if (shdlc->t2_active) { - del_timer_sync(&shdlc->t2_timer); + timer_delete_sync(&shdlc->t2_timer); shdlc->t2_active = false; pr_debug("Stopped T2(retransmit)\n"); } @@ -342,7 +342,7 @@ static void llc_shdlc_connect_complete(struct llc_shdlc *shdlc, int r) { pr_debug("result=%d\n", r); - del_timer_sync(&shdlc->connect_timer); + timer_delete_sync(&shdlc->connect_timer); if (r == 0) { shdlc->ns = 0; @@ -526,7 +526,7 @@ static void llc_shdlc_handle_send_queue(struct llc_shdlc *shdlc) (shdlc->rnr == false)) { if (shdlc->t1_active) { - del_timer_sync(&shdlc->t1_timer); + timer_delete_sync(&shdlc->t1_timer); shdlc->t1_active = false; pr_debug("Stopped T1(send ack)\n"); } diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index 18be13fb9b75..27e863f96ed1 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -160,14 +160,14 @@ static struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local) static void local_cleanup(struct nfc_llcp_local *local) { nfc_llcp_socket_release(local, false, ENXIO); - del_timer_sync(&local->link_timer); + timer_delete_sync(&local->link_timer); skb_queue_purge(&local->tx_queue); cancel_work_sync(&local->tx_work); cancel_work_sync(&local->rx_work); cancel_work_sync(&local->timeout_work); kfree_skb(local->rx_pending); local->rx_pending = NULL; - del_timer_sync(&local->sdreq_timer); + timer_delete_sync(&local->sdreq_timer); cancel_work_sync(&local->sdreq_timeout_work); nfc_llcp_free_sdp_tlv_list(&local->pending_sdreqs); } @@ -1536,7 +1536,7 @@ static void nfc_llcp_rx_work(struct work_struct *work) static void __nfc_llcp_recv(struct nfc_llcp_local *local, struct sk_buff *skb) { local->rx_pending = skb; - del_timer(&local->link_timer); + timer_delete(&local->link_timer); schedule_work(&local->rx_work); } diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 1ec5955fe469..0171bf3c7016 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -565,8 +565,8 @@ static int nci_close_device(struct nci_dev *ndev) * there is a queued/running cmd_work */ flush_workqueue(ndev->cmd_wq); - del_timer_sync(&ndev->cmd_timer); - del_timer_sync(&ndev->data_timer); + timer_delete_sync(&ndev->cmd_timer); + timer_delete_sync(&ndev->data_timer); mutex_unlock(&ndev->req_lock); return 0; } @@ -597,7 +597,7 @@ static int nci_close_device(struct nci_dev *ndev) /* Flush cmd wq */ flush_workqueue(ndev->cmd_wq); - del_timer_sync(&ndev->cmd_timer); + timer_delete_sync(&ndev->cmd_timer); /* Clear flags except NCI_UNREG */ ndev->flags &= BIT(NCI_UNREG); diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index 3d36ea5701f0..78f4131af3cf 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -42,7 +42,7 @@ void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb, pr_debug("len %d, err %d\n", skb ? 
skb->len : 0, err); /* data exchange is complete, stop the data timer */ - del_timer_sync(&ndev->data_timer); + timer_delete_sync(&ndev->data_timer); clear_bit(NCI_DATA_EXCHANGE_TO, &ndev->flags); if (cb) { diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index de175318a3a0..082ab66f120b 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -542,6 +542,8 @@ static u8 nci_hci_create_pipe(struct nci_dev *ndev, u8 dest_host, pr_debug("pipe created=%d\n", pipe); + if (pipe >= NCI_HCI_MAX_PIPES) + pipe = NCI_HCI_INVALID_PIPE; return pipe; } diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c index b911ab78bed9..9eeb862825c5 100644 --- a/net/nfc/nci/rsp.c +++ b/net/nfc/nci/rsp.c @@ -347,7 +347,7 @@ void nci_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) __u16 rsp_opcode = nci_opcode(skb->data); /* we got a rsp, stop the cmd timer */ - del_timer(&ndev->cmd_timer); + timer_delete(&ndev->cmd_timer); pr_debug("NCI RX: MT=rsp, PBF=%d, GID=0x%x, OID=0x%x, plen=%d\n", nci_pbf(skb->data), diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 2535f3f9f462..e6aaee92dba4 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -11,8 +11,8 @@ config OPENVSWITCH (!NF_NAT || NF_NAT) && \ (!NETFILTER_CONNCOUNT || NETFILTER_CONNCOUNT))) depends on PSAMPLE || !PSAMPLE - select LIBCRC32C select MPLS + select NET_CRC32C select NET_MPLS_GSO select DST_CACHE select NET_NSH diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 704c858cf209..e7269a3eec79 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -39,56 +39,18 @@ #include "flow_netlink.h" #include "openvswitch_trace.h" -struct deferred_action { - struct sk_buff *skb; - const struct nlattr *actions; - int actions_len; - - /* Store pkt_key clone when creating deferred action. */ - struct sw_flow_key pkt_key; -}; - -#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN) -struct ovs_frag_data { - unsigned long dst; - struct vport *vport; - struct ovs_skb_cb cb; - __be16 inner_protocol; - u16 network_offset; /* valid only for MPLS */ - u16 vlan_tci; - __be16 vlan_proto; - unsigned int l2_len; - u8 mac_proto; - u8 l2_data[MAX_L2_LEN]; -}; - -static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage); - -#define DEFERRED_ACTION_FIFO_SIZE 10 -#define OVS_RECURSION_LIMIT 5 -#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2) -struct action_fifo { - int head; - int tail; - /* Deferred action fifo queue storage. */ - struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; +DEFINE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; -struct action_flow_keys { - struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD]; -}; - -static struct action_fifo __percpu *action_fifos; -static struct action_flow_keys __percpu *flow_keys; -static DEFINE_PER_CPU(int, exec_actions_level); - /* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys' * space. Return NULL if out of key spaces. 
*/ static struct sw_flow_key *clone_key(const struct sw_flow_key *key_) { - struct action_flow_keys *keys = this_cpu_ptr(flow_keys); - int level = this_cpu_read(exec_actions_level); + struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(&ovs_pcpu_storage); + struct action_flow_keys *keys = &ovs_pcpu->flow_keys; + int level = ovs_pcpu->exec_level; struct sw_flow_key *key = NULL; if (level <= OVS_DEFERRED_ACTION_THRESHOLD) { @@ -132,10 +94,9 @@ static struct deferred_action *add_deferred_actions(struct sk_buff *skb, const struct nlattr *actions, const int actions_len) { - struct action_fifo *fifo; + struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage.action_fifos); struct deferred_action *da; - fifo = this_cpu_ptr(action_fifos); da = action_fifo_put(fifo); if (da) { da->skb = skb; @@ -794,7 +755,7 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage); + struct ovs_frag_data *data = this_cpu_ptr(&ovs_pcpu_storage.frag_data); struct vport *vport = data->vport; if (skb_cow_head(skb, data->l2_len) < 0) { @@ -846,7 +807,7 @@ static void prepare_frag(struct vport *vport, struct sk_buff *skb, unsigned int hlen = skb_network_offset(skb); struct ovs_frag_data *data; - data = this_cpu_ptr(&ovs_frag_data_storage); + data = this_cpu_ptr(&ovs_pcpu_storage.frag_data); data->dst = skb->_skb_refdst; data->vport = vport; data->cb = *OVS_CB(skb); @@ -947,12 +908,6 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, pskb_trim(skb, ovs_mac_header_len(key)); } - /* Need to set the pkt_type to involve the routing layer. The - * packet movement through the OVS datapath doesn't generally - * use routing, but this is needed for tunnel cases. - */ - skb->pkt_type = PACKET_OUTGOING; - if (likely(!mru || (skb->len <= mru + vport->dev->hard_header_len))) { ovs_vport_send(vport, skb, ovs_key_mac_proto(key)); @@ -981,8 +936,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, upcall.cmd = OVS_PACKET_CMD_ACTION; upcall.mru = OVS_CB(skb)->mru; - for (a = nla_data(attr), rem = nla_len(attr); rem > 0; - a = nla_next(a, &rem)) { + nla_for_each_nested(a, attr, rem) { switch (nla_type(a)) { case OVS_USERSPACE_ATTR_USERDATA: upcall.userdata = a; @@ -1615,13 +1569,13 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb, if (actions) { /* Sample action */ if (clone_flow_key) - __this_cpu_inc(exec_actions_level); + __this_cpu_inc(ovs_pcpu_storage.exec_level); err = do_execute_actions(dp, skb, clone, actions, len); if (clone_flow_key) - __this_cpu_dec(exec_actions_level); + __this_cpu_dec(ovs_pcpu_storage.exec_level); } else { /* Recirc action */ clone->recirc_id = recirc_id; ovs_dp_process_packet(skb, clone); @@ -1657,7 +1611,7 @@ static int clone_execute(struct datapath *dp, struct sk_buff *skb, static void process_deferred_actions(struct datapath *dp) { - struct action_fifo *fifo = this_cpu_ptr(action_fifos); + struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage.action_fifos); /* Do not touch the FIFO in case there is no deferred actions. 
*/ if (action_fifo_is_empty(fifo)) @@ -1688,7 +1642,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, { int err, level; - level = __this_cpu_inc_return(exec_actions_level); + level = __this_cpu_inc_return(ovs_pcpu_storage.exec_level); if (unlikely(level > OVS_RECURSION_LIMIT)) { net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n", ovs_dp_name(dp)); @@ -1705,27 +1659,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, process_deferred_actions(dp); out: - __this_cpu_dec(exec_actions_level); + __this_cpu_dec(ovs_pcpu_storage.exec_level); return err; } - -int action_fifos_init(void) -{ - action_fifos = alloc_percpu(struct action_fifo); - if (!action_fifos) - return -ENOMEM; - - flow_keys = alloc_percpu(struct action_flow_keys); - if (!flow_keys) { - free_percpu(action_fifos); - return -ENOMEM; - } - - return 0; -} - -void action_fifos_exit(void) -{ - free_percpu(action_fifos); - free_percpu(flow_keys); -} diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 3bb4810234aa..e573e9221302 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1368,8 +1368,11 @@ bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr) attr == OVS_KEY_ATTR_CT_MARK) return true; if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && - attr == OVS_KEY_ATTR_CT_LABELS) - return true; + attr == OVS_KEY_ATTR_CT_LABELS) { + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + return ovs_net->xt_label; + } return false; } @@ -1378,7 +1381,6 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, bool log) { - unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE; struct ovs_conntrack_info ct_info; const char *helper = NULL; u16 family; @@ -1407,12 +1409,6 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, return -ENOMEM; } - if (nf_connlabels_get(net, n_bits - 1)) { - nf_ct_tmpl_free(ct_info.ct); - OVS_NLERR(log, "Failed to set connlabel length"); - return -EOPNOTSUPP; - } - if (ct_info.timeout[0]) { if (nf_ct_set_timeout(net, ct_info.ct, family, key->ip.proto, ct_info.timeout)) @@ -1581,7 +1577,6 @@ static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info) if (ct_info->ct) { if (ct_info->timeout[0]) nf_ct_destroy_timeout(ct_info->ct); - nf_connlabels_put(nf_ct_net(ct_info->ct)); nf_ct_tmpl_free(ct_info->ct); } } @@ -2006,9 +2001,17 @@ struct genl_family dp_ct_limit_genl_family __ro_after_init = { int ovs_ct_init(struct net *net) { -#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) + unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE; struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + if (nf_connlabels_get(net, n_bits - 1)) { + ovs_net->xt_label = false; + OVS_NLERR(true, "Failed to set connlabel length"); + } else { + ovs_net->xt_label = true; + } + +#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) return ovs_ct_limit_init(net, ovs_net); #else return 0; @@ -2017,9 +2020,12 @@ int ovs_ct_init(struct net *net) void ovs_ct_exit(struct net *net) { -#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) struct ovs_net *ovs_net = net_generic(net, ovs_net_id); +#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) ovs_ct_limit_exit(net, ovs_net); #endif + + if (ovs_net->xt_label) + nf_connlabels_put(net); } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 225f6048867f..6a304ae2d959 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ 
-244,11 +244,13 @@ void ovs_dp_detach_port(struct vport *p) /* Must be called with rcu_read_lock. */ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) { + struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(&ovs_pcpu_storage); const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; struct sw_flow_actions *sf_acts; struct dp_stats_percpu *stats; + bool ovs_pcpu_locked = false; u64 *stats_counter; u32 n_mask_hit; u32 n_cache_hit; @@ -290,10 +292,26 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) ovs_flow_stats_update(flow, key->tp.flags, skb); sf_acts = rcu_dereference(flow->sf_acts); + /* This path can be invoked recursively: Use the current task to + * identify recursive invocation - the lock must be acquired only once. + * Even with disabled bottom halves this can be preempted on PREEMPT_RT. + * Limit the locking to RT to avoid assigning `owner' if it can be + * avoided. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && ovs_pcpu->owner != current) { + local_lock_nested_bh(&ovs_pcpu_storage.bh_lock); + ovs_pcpu->owner = current; + ovs_pcpu_locked = true; + } + error = ovs_execute_actions(dp, skb, sf_acts, key); if (unlikely(error)) net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n", ovs_dp_name(dp), error); + if (ovs_pcpu_locked) { + ovs_pcpu->owner = NULL; + local_unlock_nested_bh(&ovs_pcpu_storage.bh_lock); + } stats_counter = &stats->n_hit; @@ -671,7 +689,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) sf_acts = rcu_dereference(flow->sf_acts); local_bh_disable(); + local_lock_nested_bh(&ovs_pcpu_storage.bh_lock); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + this_cpu_write(ovs_pcpu_storage.owner, current); err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + this_cpu_write(ovs_pcpu_storage.owner, NULL); + local_unlock_nested_bh(&ovs_pcpu_storage.bh_lock); local_bh_enable(); rcu_read_unlock(); @@ -2101,6 +2125,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, { struct ovs_header *ovs_header; struct ovs_vport_stats vport_stats; + struct net *net_vport; int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family, @@ -2117,12 +2142,15 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, vport->dev->ifindex)) goto nla_put_failure; - if (!net_eq(net, dev_net(vport->dev))) { - int id = peernet2id_alloc(net, dev_net(vport->dev), gfp); + rcu_read_lock(); + net_vport = dev_net_rcu(vport->dev); + if (!net_eq(net, net_vport)) { + int id = peernet2id_alloc(net, net_vport, GFP_ATOMIC); if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id)) - goto nla_put_failure; + goto nla_put_failure_unlock; } + rcu_read_unlock(); ovs_vport_get_stats(vport, &vport_stats); if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS, @@ -2143,6 +2171,8 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, genlmsg_end(skb, ovs_header); return 0; +nla_put_failure_unlock: + rcu_read_unlock(); nla_put_failure: err = -EMSGSIZE; error: @@ -2723,13 +2753,9 @@ static int __init dp_init(void) pr_info("Open vSwitch switching datapath\n"); - err = action_fifos_init(); - if (err) - goto error; - err = ovs_internal_dev_rtnl_link_register(); if (err) - goto error_action_fifos_exit; + goto error; err = ovs_flow_init(); if (err) @@ -2772,8 +2798,6 @@ error_flow_exit: ovs_flow_exit(); error_unreg_rtnl_link: 
ovs_internal_dev_rtnl_link_unregister(); -error_action_fifos_exit: - action_fifos_exit(); error: return err; } @@ -2789,7 +2813,6 @@ static void dp_cleanup(void) ovs_vport_exit(); ovs_flow_exit(); ovs_internal_dev_rtnl_link_unregister(); - action_fifos_exit(); } module_init(dp_init); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 365b9bb7f546..1b5348b0f559 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -13,6 +13,7 @@ #include <linux/skbuff.h> #include <linux/u64_stats_sync.h> #include <net/ip_tunnels.h> +#include <net/mpls.h> #include "conntrack.h" #include "flow.h" @@ -29,8 +30,8 @@ * datapath. * @n_hit: Number of received packets for which a matching flow was found in * the flow table. - * @n_miss: Number of received packets that had no matching flow in the flow - * table. The sum of @n_hit and @n_miss is the number of packets that have + * @n_missed: Number of received packets that had no matching flow in the flow + * table. The sum of @n_hit and @n_missed is the number of packets that have * been received by the datapath. * @n_lost: Number of received packets that had no matching flow in the flow * table that could not be sent to userspace (normally due to an overflow in @@ -40,6 +41,7 @@ * up per packet. * @n_cache_hit: The number of received packets that had their mask found using * the mask cache. + * @syncp: Synchronization point for 64bit counters. */ struct dp_stats_percpu { u64 n_hit; @@ -74,8 +76,10 @@ struct dp_nlsk_pids { * ovs_mutex and RCU. * @stats_percpu: Per-CPU datapath statistics. * @net: Reference to net namespace. - * @max_headroom: the maximum headroom of all vports in this datapath; it will + * @user_features: Bitmap of enabled %OVS_DP_F_* features. + * @max_headroom: The maximum headroom of all vports in this datapath; it will * be used by all the internal vports in this dp. + * @meter_tbl: Meter table. * @upcall_portids: RCU protected 'struct dp_nlsk_pids'. * * Context: See the comment on locking at the top of datapath.c for additional @@ -128,10 +132,13 @@ struct ovs_skb_cb { #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) /** - * struct dp_upcall - metadata to include with a packet to send to userspace + * struct dp_upcall_info - metadata to include with a packet sent to userspace * @cmd: One of %OVS_PACKET_CMD_*. * @userdata: If nonnull, its variable-length value is passed to userspace as * %OVS_PACKET_ATTR_USERDATA. + * @actions: If nonnull, its variable-length value is passed to userspace as + * %OVS_PACKET_ATTR_ACTIONS. + * @actions_len: The length of the @actions. * @portid: Netlink portid to which packet should be sent. If @portid is 0 * then no packet is sent and the packet is accounted in the datapath's @n_lost * counter. @@ -152,6 +159,10 @@ struct dp_upcall_info { * struct ovs_net - Per net-namespace data for ovs. * @dps: List of datapaths to enable dumping them all out. * Protected by genl_mutex. + * @dp_notify_work: A work notifier to handle port unregistering. + * @masks_rebalance: A work to periodically optimize flow table caches. + * @ct_limit_info: A hash table of conntrack zone connection limits. + * @xt_label: Whether connlabels are configured for the network or not.
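
The conntrack hunks above move the connlabel reservation from per-flow setup in ovs_ct_copy_action() to namespace initialisation, recording the outcome in ovs_net->xt_label so that ovs_ct_verify() can refuse OVS_KEY_ATTR_CT_LABELS when labels are unavailable. A condensed sketch of that lifecycle; demo_* names are illustrative, the calls mirror the hunks:

    /* Per-netns init: reserve connlabel space once and remember whether
     * labels are usable; failure is no longer fatal here.
     */
    static int demo_ct_init(struct net *net)
    {
            unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE;
            struct ovs_net *ovs_net = net_generic(net, ovs_net_id);

            ovs_net->xt_label = !nf_connlabels_get(net, n_bits - 1);
            return 0;
    }

    /* Per-netns exit: drop the reference only if init managed to take it. */
    static void demo_ct_exit(struct net *net)
    {
            struct ovs_net *ovs_net = net_generic(net, ovs_net_id);

            if (ovs_net->xt_label)
                    nf_connlabels_put(net);
    }

Failure to reserve the labels is thereby downgraded from a per-flow -EOPNOTSUPP to a one-time warning at namespace setup.
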
*/ struct ovs_net { struct list_head dps; @@ -160,8 +171,57 @@ struct ovs_net { #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) struct ovs_ct_limit_info *ct_limit_info; #endif + bool xt_label; }; +#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN) +struct ovs_frag_data { + unsigned long dst; + struct vport *vport; + struct ovs_skb_cb cb; + __be16 inner_protocol; + u16 network_offset; /* valid only for MPLS */ + u16 vlan_tci; + __be16 vlan_proto; + unsigned int l2_len; + u8 mac_proto; + u8 l2_data[MAX_L2_LEN]; +}; + +struct deferred_action { + struct sk_buff *skb; + const struct nlattr *actions; + int actions_len; + + /* Store pkt_key clone when creating deferred action. */ + struct sw_flow_key pkt_key; +}; + +#define DEFERRED_ACTION_FIFO_SIZE 10 +#define OVS_RECURSION_LIMIT 5 +#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2) + +struct action_fifo { + int head; + int tail; + /* Deferred action fifo queue storage. */ + struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; +}; + +struct action_flow_keys { + struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD]; +}; + +struct ovs_pcpu_storage { + struct action_fifo action_fifos; + struct action_flow_keys flow_keys; + struct ovs_frag_data frag_data; + int exec_level; + struct task_struct *owner; + local_lock_t bh_lock; +}; +DECLARE_PER_CPU(struct ovs_pcpu_storage, ovs_pcpu_storage); + /** * enum ovs_pkt_hash_types - hash info to include with a packet * to send to userspace. @@ -270,9 +330,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, void ovs_dp_notify_wq(struct work_struct *work); -int action_fifos_init(void); -void action_fifos_exit(void); - /* 'KEY' must not have any bits set outside of the 'MASK' */ #define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) #define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK)) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 8a848ce72e29..b80bd3a90773 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -788,7 +788,7 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) memset(&key->ipv4, 0, sizeof(key->ipv4)); } } else if (eth_p_mpls(key->eth.type)) { - u8 label_count = 1; + size_t label_count = 1; memset(&key->mpls, 0, sizeof(key->mpls)); skb_set_inner_network_header(skb, skb->mac_len); diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 881ddd3696d5..ad64bb9ab5e2 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2317,14 +2317,10 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb) OVS_FLOW_ATTR_MASK, true, skb); } -#define MAX_ACTIONS_BUFSIZE (32 * 1024) - static struct sw_flow_actions *nla_alloc_flow_actions(int size) { struct sw_flow_actions *sfa; - WARN_ON_ONCE(size > MAX_ACTIONS_BUFSIZE); - sfa = kmalloc(kmalloc_size_roundup(sizeof(*sfa) + size), GFP_KERNEL); if (!sfa) return ERR_PTR(-ENOMEM); @@ -2480,15 +2476,6 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2); - if (new_acts_size > MAX_ACTIONS_BUFSIZE) { - if ((next_offset + req_size) > MAX_ACTIONS_BUFSIZE) { - OVS_NLERR(log, "Flow action size exceeds max %u", - MAX_ACTIONS_BUFSIZE); - return ERR_PTR(-EMSGSIZE); - } - new_acts_size = MAX_ACTIONS_BUFSIZE; - } - acts = nla_alloc_flow_actions(new_acts_size); if (IS_ERR(acts)) return ERR_CAST(acts); @@ -2889,7 +2876,8 @@ static int validate_set(const struct nlattr *a, size_t key_len; /* There can be only one key in a 
action */ - if (nla_total_size(nla_len(ovs_key)) != nla_len(a)) + if (!nla_ok(ovs_key, nla_len(a)) || + nla_total_size(nla_len(ovs_key)) != nla_len(a)) return -EINVAL; key_len = nla_len(ovs_key); @@ -3061,7 +3049,8 @@ static int validate_userspace(const struct nlattr *attr) struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; int error; - error = nla_parse_nested_deprecated(a, OVS_USERSPACE_ATTR_MAX, attr, + error = nla_parse_deprecated_strict(a, OVS_USERSPACE_ATTR_MAX, + nla_data(attr), nla_len(attr), userspace_policy, NULL); if (error) return error; @@ -3545,7 +3534,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, int err; u32 mpls_label_count = 0; - *sfa = nla_alloc_flow_actions(min(nla_len(attr), MAX_ACTIONS_BUFSIZE)); + *sfa = nla_alloc_flow_actions(nla_len(attr)); if (IS_ERR(*sfa)) return PTR_ERR(*sfa); diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 2412d7813d24..125d310871e9 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -149,7 +149,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) /* Restrict bridge port to current netns. */ if (vport->port_no == OVSP_LOCAL) - vport->dev->netns_local = true; + vport->dev->netns_immutable = true; rtnl_lock(); err = register_netdevice(vport->dev); diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 3e71ca8ad8a7..9f67b9dd49f9 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -97,6 +97,8 @@ struct vport { * @desired_ifindex: New vport's ifindex. * @dp: New vport's datapath. * @port_no: New vport's port number. + * @upcall_portids: %OVS_VPORT_ATTR_UPCALL_PID attribute from Netlink message, + * %NULL if none was supplied. */ struct vport_parms { const char *name; @@ -125,6 +127,8 @@ struct vport_parms { * have any configuration. * @send: Send a packet on the device. * zero for dropped packets or negative for error. + * @owner: Module that implements this vport type. + * @list: List entry in the global list of vport types. */ struct vport_ops { enum ovs_vport_type type; @@ -144,6 +148,7 @@ struct vport_ops { /** * struct vport_upcall_stats_percpu - per-cpu packet upcall statistics for * a given vport. + * @syncp: Synchronization point for 64bit counters. * @n_success: Number of packets that upcall to userspace succeed. * @n_fail: Number of packets that upcall to userspace failed. */ @@ -164,6 +169,8 @@ void ovs_vport_free(struct vport *); * * @vport: vport to access * + * Returns: A void pointer to a private data allocated in the @vport. + * * If a nonzero size was passed in priv_size of vport_alloc() a private data * area was allocated on creation. This allows that area to be accessed and * used for any purpose needed by the vport implementer. @@ -178,6 +185,8 @@ static inline void *vport_priv(const struct vport *vport) * * @priv: Start of private data area. * + * Returns: A reference to a vport structure that contains @priv. + * * It is sometimes useful to translate from a pointer to the private data * area to the vport, such as in the case where the private data pointer is * the result of a hash table lookup. 
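
As the kernel-doc above explains, a vport's private area is allocated immediately after struct vport itself, so vport_priv() and vport_from_priv() form an inverse pair of plain pointer adjustments. A hypothetical vport implementation using them (struct demo_vport_priv is an assumption for illustration):

    struct demo_vport_priv {            /* illustrative private data */
            u32 tunnel_id;
    };

    static void demo_use_priv(struct vport *vport)
    {
            struct demo_vport_priv *priv = vport_priv(vport);

            priv->tunnel_id = 42;       /* memory just past the vport */
    }

    static struct vport *demo_priv_to_vport(struct demo_vport_priv *priv)
    {
            /* e.g. after a hash-table lookup returned the private data */
            return vport_from_priv(priv);
    }
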
@priv must point to the start of the diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 2d73769d67f4..20be2c47cf41 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -581,7 +581,7 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) { - del_timer_sync(&pkc->retire_blk_timer); + timer_delete_sync(&pkc->retire_blk_timer); } static void prb_shutdown_retire_blk_timer(struct packet_sock *po, @@ -2102,8 +2102,8 @@ retry: skb->protocol = proto; skb->dev = dev; - skb->priority = READ_ONCE(sk->sk_priority); - skb->mark = READ_ONCE(sk->sk_mark); + skb->priority = sockc.priority; + skb->mark = sockc.mark; skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); skb_setup_tx_timestamp(skb, &sockc); @@ -2634,8 +2634,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->protocol = proto; skb->dev = dev; - skb->priority = READ_ONCE(po->sk.sk_priority); - skb->mark = READ_ONCE(po->sk.sk_mark); + skb->priority = sockc->priority; + skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid); skb_setup_tx_timestamp(skb, sockc); skb_zcopy_set_nouarg(skb, ph.raw); @@ -3039,7 +3039,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_unlock; sockcm_init(&sockc, sk); - sockc.mark = READ_ONCE(sk->sk_mark); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) @@ -3112,7 +3111,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) skb->protocol = proto; skb->dev = dev; - skb->priority = READ_ONCE(sk->sk_priority); + skb->priority = sockc.priority; skb->mark = sockc.mark; skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); @@ -3714,15 +3713,15 @@ static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, } static void packet_dev_mclist_delete(struct net_device *dev, - struct packet_mclist **mlp) + struct packet_mclist **mlp, + struct list_head *list) { struct packet_mclist *ml; while ((ml = *mlp) != NULL) { if (ml->ifindex == dev->ifindex) { - packet_dev_mc(dev, ml, -1); + list_add(&ml->remove_list, list); *mlp = ml->next; - kfree(ml); } else mlp = &ml->next; } @@ -3770,6 +3769,7 @@ static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) memcpy(i->addr, mreq->mr_address, i->alen); memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen); i->count = 1; + INIT_LIST_HEAD(&i->remove_list); i->next = po->mclist; po->mclist = i; err = packet_dev_mc(dev, i, 1); @@ -4234,9 +4234,11 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, static int packet_notifier(struct notifier_block *this, unsigned long msg, void *ptr) { - struct sock *sk; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); + struct packet_mclist *ml, *tmp; + LIST_HEAD(mclist); + struct sock *sk; rcu_read_lock(); sk_for_each_rcu(sk, &net->packet.sklist) { @@ -4245,7 +4247,8 @@ static int packet_notifier(struct notifier_block *this, switch (msg) { case NETDEV_UNREGISTER: if (po->mclist) - packet_dev_mclist_delete(dev, &po->mclist); + packet_dev_mclist_delete(dev, &po->mclist, + &mclist); fallthrough; case NETDEV_DOWN: @@ -4278,6 +4281,13 @@ static int packet_notifier(struct notifier_block *this, } } rcu_read_unlock(); + + /* packet_dev_mc might grab instance locks so can't run under rcu */ + list_for_each_entry_safe(ml, tmp, &mclist, remove_list) { 
+ packet_dev_mc(dev, ml, -1); + kfree(ml); + } + return NOTIFY_DONE; } diff --git a/net/packet/internal.h b/net/packet/internal.h index d5d70712007a..1e743d0316fd 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -11,6 +11,7 @@ struct packet_mclist { unsigned short type; unsigned short alen; unsigned char addr[MAX_ADDR_LEN]; + struct list_head remove_list; }; /* kbdq - kernel block descriptor queue */ diff --git a/net/rds/connection.c b/net/rds/connection.c index c749c5525b40..d62f486ab29f 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -749,8 +749,7 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer) cinfo->laddr = conn->c_laddr.s6_addr32[3]; cinfo->faddr = conn->c_faddr.s6_addr32[3]; cinfo->tos = conn->c_tos; - strncpy(cinfo->transport, conn->c_trans->t_name, - sizeof(cinfo->transport)); + strscpy_pad(cinfo->transport, conn->c_trans->t_name); cinfo->flags = 0; rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), @@ -775,8 +774,7 @@ static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer) cinfo6->next_rx_seq = cp->cp_next_rx_seq; cinfo6->laddr = conn->c_laddr; cinfo6->faddr = conn->c_faddr; - strncpy(cinfo6->transport, conn->c_trans->t_name, - sizeof(cinfo6->transport)); + strscpy_pad(cinfo6->transport, conn->c_trans->t_name); cinfo6->flags = 0; rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags), diff --git a/net/rds/page.c b/net/rds/page.c index 7cc57e098ddb..afb151eac271 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -40,10 +40,12 @@ struct rds_page_remainder { struct page *r_page; unsigned long r_offset; + local_lock_t bh_lock; }; -static -DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; /** * rds_page_remainder_alloc - build up regions of a message. @@ -69,7 +71,6 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, gfp_t gfp) { struct rds_page_remainder *rem; - unsigned long flags; struct page *page; int ret; @@ -87,8 +88,9 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, goto out; } - rem = &per_cpu(rds_page_remainders, get_cpu()); - local_irq_save(flags); + local_bh_disable(); + local_lock_nested_bh(&rds_page_remainders.bh_lock); + rem = this_cpu_ptr(&rds_page_remainders); while (1) { /* avoid a tiny region getting stuck by tossing it */ @@ -116,13 +118,14 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, } /* alloc if there is nothing for us to use */ - local_irq_restore(flags); - put_cpu(); + local_unlock_nested_bh(&rds_page_remainders.bh_lock); + local_bh_enable(); page = alloc_page(gfp); - rem = &per_cpu(rds_page_remainders, get_cpu()); - local_irq_save(flags); + local_bh_disable(); + local_lock_nested_bh(&rds_page_remainders.bh_lock); + rem = this_cpu_ptr(&rds_page_remainders); if (!page) { ret = -ENOMEM; @@ -140,8 +143,8 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, rem->r_offset = 0; } - local_irq_restore(flags); - put_cpu(); + local_unlock_nested_bh(&rds_page_remainders.bh_lock); + local_bh_enable(); out: rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret, ret ? NULL : sg_page(scat), ret ? 
0 : scat->offset, diff --git a/net/rds/stats.c b/net/rds/stats.c index 9e87da43c004..cb2e3d2cdf73 100644 --- a/net/rds/stats.c +++ b/net/rds/stats.c @@ -89,8 +89,7 @@ void rds_stats_info_copy(struct rds_info_iterator *iter, for (i = 0; i < nr; i++) { BUG_ON(strlen(names[i]) >= sizeof(ctr.name)); - strncpy(ctr.name, names[i], sizeof(ctr.name) - 1); - ctr.name[sizeof(ctr.name) - 1] = '\0'; + strscpy_pad(ctr.name, names[i]); ctr.value = values[i]; rds_info_copy(iter, &ctr, sizeof(ctr)); diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 0581c53e6517..3cc2f303bf78 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -504,12 +504,8 @@ bool rds_tcp_tune(struct socket *sock) release_sock(sk); return false; } - /* Update ns_tracker to current stack trace and refcounted tracker */ - __netns_tracker_free(net, &sk->ns_tracker, false); - - sk->sk_net_refcnt = 1; - netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sk); + put_net(net); } rtn = net_generic(net, rds_tcp_netid); if (rtn->sndbuf_size > 0) { diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 9fa019e0dcad..41e657e97761 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -162,6 +162,9 @@ static int rfkill_gpio_probe(struct platform_device *pdev) if (!rfkill->rfkill_dev) return -ENOMEM; + if (device_property_present(&pdev->dev, "default-blocked")) + rfkill_init_sw_state(rfkill->rfkill_dev, true); + ret = rfkill_register(rfkill->rfkill_dev); if (ret < 0) goto err_destroy; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 59050caab65c..a4a668b88a8f 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -397,15 +397,15 @@ static int rose_setsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); - int opt; + unsigned int opt; if (level != SOL_ROSE) return -ENOPROTOOPT; - if (optlen < sizeof(int)) + if (optlen < sizeof(unsigned int)) return -EINVAL; - if (copy_from_sockptr(&opt, optval, sizeof(int))) + if (copy_from_sockptr(&opt, optval, sizeof(unsigned int))) return -EFAULT; switch (optname) { @@ -414,31 +414,31 @@ static int rose_setsockopt(struct socket *sock, int level, int optname, return 0; case ROSE_T1: - if (opt < 1) + if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; rose->t1 = opt * HZ; return 0; case ROSE_T2: - if (opt < 1) + if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; rose->t2 = opt * HZ; return 0; case ROSE_T3: - if (opt < 1) + if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; rose->t3 = opt * HZ; return 0; case ROSE_HOLDBACK: - if (opt < 1) + if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; rose->hb = opt * HZ; return 0; case ROSE_IDLE: - if (opt < 0) + if (opt > UINT_MAX / (60 * HZ)) return -EINVAL; rose->idle = opt * 60 * HZ; return 0; @@ -701,11 +701,9 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct net_device *dev; ax25_address *source; ax25_uid_assoc *user; + int err = -EINVAL; int n; - if (!sock_flag(sk, SOCK_ZAPPED)) - return -EINVAL; - if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose)) return -EINVAL; @@ -718,8 +716,15 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS) return -EINVAL; - if ((dev = rose_dev_get(&addr->srose_addr)) == NULL) - return -EADDRNOTAVAIL; + lock_sock(sk); + + if (!sock_flag(sk, SOCK_ZAPPED)) + goto out_release; + + err = -EADDRNOTAVAIL; + dev = 
rose_dev_get(&addr->srose_addr); + if (!dev) + goto out_release; source = &addr->srose_call; @@ -730,7 +735,8 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } else { if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { dev_put(dev); - return -EACCES; + err = -EACCES; + goto out_release; } rose->source_call = *source; } @@ -753,8 +759,10 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) rose_insert_socket(sk); sock_reset_flag(sk, SOCK_ZAPPED); - - return 0; + err = 0; +out_release: + release_sock(sk); + return err; } static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index 0f77ae8ef944..9f9629e6fdae 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -32,7 +32,7 @@ static void rose_transmit_restart_request(struct rose_neigh *neigh); void rose_start_ftimer(struct rose_neigh *neigh) { - del_timer(&neigh->ftimer); + timer_delete(&neigh->ftimer); neigh->ftimer.function = rose_ftimer_expiry; neigh->ftimer.expires = @@ -43,7 +43,7 @@ void rose_start_ftimer(struct rose_neigh *neigh) static void rose_start_t0timer(struct rose_neigh *neigh) { - del_timer(&neigh->t0timer); + timer_delete(&neigh->t0timer); neigh->t0timer.function = rose_t0timer_expiry; neigh->t0timer.expires = @@ -54,12 +54,12 @@ static void rose_start_t0timer(struct rose_neigh *neigh) void rose_stop_ftimer(struct rose_neigh *neigh) { - del_timer(&neigh->ftimer); + timer_delete(&neigh->ftimer); } void rose_stop_t0timer(struct rose_neigh *neigh) { - del_timer(&neigh->t0timer); + timer_delete(&neigh->t0timer); } int rose_ftimer_running(struct rose_neigh *neigh) diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c index 036d92c0ad79..b538e39b3df5 100644 --- a/net/rose/rose_loopback.c +++ b/net/rose/rose_loopback.c @@ -124,7 +124,7 @@ void __exit rose_loopback_clear(void) { struct sk_buff *skb; - del_timer(&loopback_timer); + timer_delete(&loopback_timer); while ((skb = skb_dequeue(&loopback_queue)) != NULL) { skb->sk = NULL; diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index fee772b4637c..2dd6bd3a3011 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -227,8 +227,8 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh) { struct rose_neigh *s; - del_timer_sync(&rose_neigh->ftimer); - del_timer_sync(&rose_neigh->t0timer); + timer_delete_sync(&rose_neigh->ftimer); + timer_delete_sync(&rose_neigh->t0timer); skb_queue_purge(&rose_neigh->queue); diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index f06ddbed3fed..1525773e94aa 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -122,6 +122,10 @@ static void rose_heartbeat_expiry(struct timer_list *t) struct rose_sock *rose = rose_sk(sk); bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ/20); + goto out; + } switch (rose->state) { case ROSE_STATE_0: /* Magic here: If we listen() and a new link dies before it @@ -152,6 +156,7 @@ static void rose_heartbeat_expiry(struct timer_list *t) } rose_start_heartbeat(sk); +out: bh_unlock_sock(sk); sock_put(sk); } @@ -162,6 +167,10 @@ static void rose_timer_expiry(struct timer_list *t) struct sock *sk = &rose->sock; bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + sk_reset_timer(sk, &rose->timer, jiffies + HZ/20); + goto out; + } switch (rose->state) { case ROSE_STATE_1: /* T1 */ case ROSE_STATE_4: /* T2 */ @@ -182,6 +191,7 @@ static void 
rose_timer_expiry(struct timer_list *t) } break; } +out: bh_unlock_sock(sk); sock_put(sk); } @@ -192,6 +202,10 @@ static void rose_idletimer_expiry(struct timer_list *t) struct sock *sk = &rose->sock; bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + sk_reset_timer(sk, &rose->idletimer, jiffies + HZ/20); + goto out; + } rose_clear_queues(sk); rose_write_internal(sk, ROSE_CLEAR_REQUEST); @@ -207,6 +221,7 @@ static void rose_idletimer_expiry(struct timer_list *t) sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); } +out: bh_unlock_sock(sk); sock_put(sk); } diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index a20986806fea..f60b81c66078 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -67,6 +67,29 @@ config RXKAD See Documentation/networking/rxrpc.rst. +config RXGK + bool "RxRPC GSSAPI security" + select CRYPTO_KRB5 + select CRYPTO_MANAGER + select CRYPTO_KRB5ENC + select CRYPTO_AUTHENC + select CRYPTO_SKCIPHER + select CRYPTO_HASH_INFO + select CRYPTO_HMAC + select CRYPTO_CMAC + select CRYPTO_SHA1 + select CRYPTO_SHA256 + select CRYPTO_SHA512 + select CRYPTO_CBC + select CRYPTO_CTS + select CRYPTO_AES + select CRYPTO_CAMELLIA + help + Provide the GSSAPI-based RxGK security class for AFS. Keys are added + with add_key(). + + See Documentation/networking/rxrpc.rst. + config RXPERF tristate "RxRPC test service" help diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index ac5caf5a48e1..c0542bae719e 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -16,6 +16,7 @@ rxrpc-y := \ conn_object.o \ conn_service.o \ input.o \ + input_rack.o \ insecure.o \ io_thread.o \ key.o \ @@ -23,6 +24,7 @@ rxrpc-y := \ local_object.o \ misc.o \ net_ns.o \ + oob.o \ output.o \ peer_event.o \ peer_object.o \ @@ -38,6 +40,9 @@ rxrpc-y := \ rxrpc-$(CONFIG_PROC_FS) += proc.o rxrpc-$(CONFIG_RXKAD) += rxkad.o rxrpc-$(CONFIG_SYSCTL) += sysctl.o - +rxrpc-$(CONFIG_RXGK) += \ + rxgk.o \ + rxgk_app.o \ + rxgk_kdf.o obj-$(CONFIG_RXPERF) += rxperf.o diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 9d8bd0b37e41..36df0274d7b7 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -265,7 +265,10 @@ static int rxrpc_listen(struct socket *sock, int backlog) * @gfp: Allocation flags * * Lookup or create a remote transport endpoint record for the specified - * address and return it with a ref held. + * address. + * + * Return: The peer record found with a reference, %NULL if no record is found + * or a negative error code if the address is invalid or unsupported. */ struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock, struct sockaddr_rxrpc *srx, gfp_t gfp) @@ -283,9 +286,11 @@ EXPORT_SYMBOL(rxrpc_kernel_lookup_peer); /** * rxrpc_kernel_get_peer - Get a reference on a peer - * @peer: The peer to get a reference on. + * @peer: The peer to get a reference on (may be NULL). + * + * Get a reference for a remote peer record (if not NULL). * - * Get a record for the remote peer in a call. + * Return: The @peer argument. */ struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer) { @@ -296,6 +301,8 @@ EXPORT_SYMBOL(rxrpc_kernel_get_peer); /** * rxrpc_kernel_put_peer - Allow a kernel app to drop a peer reference * @peer: The peer to drop a ref on + * + * Drop a reference on a peer record. */ void rxrpc_kernel_put_peer(struct rxrpc_peer *peer) { @@ -320,10 +327,12 @@ EXPORT_SYMBOL(rxrpc_kernel_put_peer); * * Allow a kernel service to begin a call on the nominated socket. 
This just * sets up all the internal tracking structures and allocates connection and - * call IDs as appropriate. The call to be used is returned. + * call IDs as appropriate. * * The default socket destination address and security may be overridden by * supplying @srx and @key. + * + * Return: The new call or an error code. */ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct rxrpc_peer *peer, @@ -408,9 +417,9 @@ void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call) /* Make sure we're not going to call back into a kernel service */ if (call->notify_rx) { - spin_lock(&call->notify_lock); + spin_lock_irq(&call->notify_lock); call->notify_rx = rxrpc_dummy_notify_rx; - spin_unlock(&call->notify_lock); + spin_unlock_irq(&call->notify_lock); } } mutex_unlock(&call->user_mutex); @@ -437,6 +446,8 @@ EXPORT_SYMBOL(rxrpc_kernel_put_call); * * Allow a kernel service to find out whether a call is still alive - whether * it has completed successfully and all received data has been consumed. + * + * Return: %true if the call is still ongoing and %false if it has completed. */ bool rxrpc_kernel_check_life(const struct socket *sock, const struct rxrpc_call *call) @@ -450,63 +461,20 @@ bool rxrpc_kernel_check_life(const struct socket *sock, EXPORT_SYMBOL(rxrpc_kernel_check_life); /** - * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call. - * @sock: The socket the call is on - * @call: The call to query - * - * Allow a kernel service to retrieve the epoch value from a service call to - * see if the client at the other end rebooted. - */ -u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call) -{ - return call->conn->proto.epoch; -} -EXPORT_SYMBOL(rxrpc_kernel_get_epoch); - -/** - * rxrpc_kernel_new_call_notification - Get notifications of new calls - * @sock: The socket to intercept received messages on - * @notify_new_call: Function to be called when new calls appear - * @discard_new_call: Function to discard preallocated calls + * rxrpc_kernel_set_notifications - Set table of callback operations + * @sock: The socket to install table upon + * @app_ops: Callback operation table to set * - * Allow a kernel service to be given notifications about new calls. + * Allow a kernel service to set a table of event notifications on a socket. */ -void rxrpc_kernel_new_call_notification( - struct socket *sock, - rxrpc_notify_new_call_t notify_new_call, - rxrpc_discard_new_call_t discard_new_call) +void rxrpc_kernel_set_notifications(struct socket *sock, + const struct rxrpc_kernel_ops *app_ops) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - rx->notify_new_call = notify_new_call; - rx->discard_new_call = discard_new_call; + rx->app_ops = app_ops; } -EXPORT_SYMBOL(rxrpc_kernel_new_call_notification); - -/** - * rxrpc_kernel_set_max_life - Set maximum lifespan on a call - * @sock: The socket the call is on - * @call: The call to configure - * @hard_timeout: The maximum lifespan of the call in ms - * - * Set the maximum lifespan of a call. The call will end with ETIME or - * ETIMEDOUT if it takes longer than this. 
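
rxrpc_kernel_set_notifications() above replaces the old pair of registration calls with a single callback table installed on the socket. A sketch of how a kernel service might populate it; the callback signatures follow the old rxrpc_notify_new_call_t/rxrpc_discard_new_call_t typedefs, and the member names are assumptions about struct rxrpc_kernel_ops rather than confirmed fields:

    static void demo_notify_new_call(struct sock *sk, struct rxrpc_call *call,
                                     unsigned long user_call_ID)
    {
            /* queue the incoming call for the service to accept */
    }

    static void demo_discard_new_call(struct rxrpc_call *call,
                                      unsigned long user_call_ID)
    {
            /* release resources attached to a preallocated call */
    }

    static const struct rxrpc_kernel_ops demo_app_ops = {
            .notify_new_call        = demo_notify_new_call,
            .discard_new_call       = demo_discard_new_call,
    };

    /* Installed once, right after the socket is created:
     *      rxrpc_kernel_set_notifications(sock, &demo_app_ops);
     */
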
- */ -void rxrpc_kernel_set_max_life(struct socket *sock, struct rxrpc_call *call, - unsigned long hard_timeout) -{ - ktime_t delay = ms_to_ktime(hard_timeout), expect_term_by; - - mutex_lock(&call->user_mutex); - - expect_term_by = ktime_add(ktime_get_real(), delay); - WRITE_ONCE(call->expect_term_by, expect_term_by); - trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_hard); - rxrpc_poke_call(call, rxrpc_call_poke_set_timeout); - - mutex_unlock(&call->user_mutex); -} -EXPORT_SYMBOL(rxrpc_kernel_set_max_life); +EXPORT_SYMBOL(rxrpc_kernel_set_notifications); /* * connect an RxRPC socket @@ -624,7 +592,10 @@ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) fallthrough; case RXRPC_SERVER_BOUND: case RXRPC_SERVER_LISTENING: - ret = rxrpc_do_sendmsg(rx, m, len); + if (m->msg_flags & MSG_OOB) + ret = rxrpc_sendmsg_oob(rx, m, len); + else + ret = rxrpc_do_sendmsg(rx, m, len); /* The socket has been unlocked */ goto out; default: @@ -659,7 +630,7 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); - unsigned int min_sec_level; + unsigned int min_sec_level, val; u16 service_upgrade[2]; int ret; @@ -740,6 +711,26 @@ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, rx->service_upgrade.to = service_upgrade[1]; goto success; + case RXRPC_MANAGE_RESPONSE: + ret = -EINVAL; + if (optlen != sizeof(unsigned int)) + goto error; + ret = -EISCONN; + if (rx->sk.sk_state != RXRPC_UNBOUND) + goto error; + ret = copy_safe_from_sockptr(&val, sizeof(val), + optval, optlen); + if (ret) + goto error; + ret = -EINVAL; + if (val > 1) + goto error; + if (val) + set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + else + clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + goto success; + default: break; } @@ -846,6 +837,8 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, rx->calls = RB_ROOT; spin_lock_init(&rx->incoming_lock); + skb_queue_head_init(&rx->recvmsg_oobq); + rx->pending_oobq = RB_ROOT; INIT_LIST_HEAD(&rx->sock_calls); INIT_LIST_HEAD(&rx->to_be_accepted); INIT_LIST_HEAD(&rx->recvmsg_q); @@ -879,8 +872,10 @@ static int rxrpc_shutdown(struct socket *sock, int flags) lock_sock(sk); if (sk->sk_state < RXRPC_CLOSE) { + spin_lock_irq(&rx->recvmsg_lock); sk->sk_state = RXRPC_CLOSE; sk->sk_shutdown = SHUTDOWN_MASK; + spin_unlock_irq(&rx->recvmsg_lock); } else { ret = -ESHUTDOWN; } @@ -892,12 +887,30 @@ static int rxrpc_shutdown(struct socket *sock, int flags) } /* + * Purge the out-of-band queue. 
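
The RXRPC_MANAGE_RESPONSE case above follows a common boolean-sockopt shape: a fixed-size copy with copy_safe_from_sockptr(), strict 0/1 validation, then setting or clearing a socket flag bit. Distilled into a hypothetical helper:

    static int demo_setsockopt_flag(unsigned long *flags, int bit,
                                    sockptr_t optval, unsigned int optlen)
    {
            unsigned int val;
            int ret;

            if (optlen != sizeof(val))
                    return -EINVAL;
            ret = copy_safe_from_sockptr(&val, sizeof(val), optval, optlen);
            if (ret)
                    return ret;
            if (val > 1)                /* only 0 or 1 are meaningful */
                    return -EINVAL;
            if (val)
                    set_bit(bit, flags);
            else
                    clear_bit(bit, flags);
            return 0;
    }
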
+ */ +static void rxrpc_purge_oob_queue(struct sock *sk) +{ + struct rxrpc_sock *rx = rxrpc_sk(sk); + struct sk_buff *skb; + + while ((skb = skb_dequeue(&rx->recvmsg_oobq))) + rxrpc_kernel_free_oob(skb); + while (!RB_EMPTY_ROOT(&rx->pending_oobq)) { + skb = rb_entry(rx->pending_oobq.rb_node, struct sk_buff, rbnode); + rb_erase(&skb->rbnode, &rx->pending_oobq); + rxrpc_kernel_free_oob(skb); + } +} + +/* * RxRPC socket destructor */ static void rxrpc_sock_destructor(struct sock *sk) { _enter("%p", sk); + rxrpc_purge_oob_queue(sk); rxrpc_purge_queue(&sk->sk_receive_queue); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); @@ -936,7 +949,9 @@ static int rxrpc_release_sock(struct sock *sk) break; } + spin_lock_irq(&rx->recvmsg_lock); sk->sk_state = RXRPC_CLOSE; + spin_unlock_irq(&rx->recvmsg_lock); if (rx->local && rx->local->service == rx) { write_lock(&rx->local->services_lock); @@ -948,6 +963,7 @@ static int rxrpc_release_sock(struct sock *sk) rxrpc_discard_prealloc(rx); rxrpc_release_calls_on_socket(rx); flush_workqueue(rxrpc_workqueue); + rxrpc_purge_oob_queue(sk); rxrpc_purge_queue(&sk->sk_receive_queue); rxrpc_unuse_local(rx->local, rxrpc_local_unuse_release_sock); diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index d0fd37bdcfe9..5bd3922c310d 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -30,6 +30,8 @@ struct rxrpc_crypt { struct key_preparsed_payload; struct rxrpc_connection; struct rxrpc_txbuf; +struct rxrpc_txqueue; +struct rxgk_context; /* * Mark applied to socket buffers in skb->mark. skb->priority is used @@ -38,6 +40,7 @@ struct rxrpc_txbuf; enum rxrpc_skb_mark { RXRPC_SKB_MARK_PACKET, /* Received packet */ RXRPC_SKB_MARK_ERROR, /* Error notification */ + RXRPC_SKB_MARK_CHALLENGE, /* Challenge notification */ RXRPC_SKB_MARK_SERVICE_CONN_SECURED, /* Service connection response has been verified */ RXRPC_SKB_MARK_REJECT_BUSY, /* Reject with BUSY */ RXRPC_SKB_MARK_REJECT_ABORT, /* Reject with ABORT (code in skb->priority) */ @@ -98,6 +101,7 @@ struct rxrpc_net { atomic_t stat_tx_data_send; atomic_t stat_tx_data_send_frag; atomic_t stat_tx_data_send_fail; + atomic_t stat_tx_data_send_msgsize; atomic_t stat_tx_data_underflow; atomic_t stat_tx_data_cwnd_reset; atomic_t stat_rx_data; @@ -109,6 +113,8 @@ struct rxrpc_net { atomic_t stat_tx_ack_skip; atomic_t stat_tx_acks[256]; atomic_t stat_rx_acks[256]; + atomic_t stat_tx_jumbo[10]; + atomic_t stat_rx_jumbo[10]; atomic_t stat_why_req_ack[8]; @@ -142,10 +148,12 @@ struct rxrpc_backlog { struct rxrpc_sock { /* WARNING: sk has to be the first member */ struct sock sk; - rxrpc_notify_new_call_t notify_new_call; /* Func to notify of new call */ - rxrpc_discard_new_call_t discard_new_call; /* Func to discard a new call */ + const struct rxrpc_kernel_ops *app_ops; /* Table of kernel app notification funcs */ struct rxrpc_local *local; /* local endpoint */ struct rxrpc_backlog *backlog; /* Preallocation for services */ + struct sk_buff_head recvmsg_oobq; /* OOB messages for recvmsg to pick up */ + struct rb_root pending_oobq; /* OOB messages awaiting userspace to respond to */ + u64 oob_id_counter; /* OOB message ID counter */ spinlock_t incoming_lock; /* Incoming call vs service shutdown lock */ struct list_head sock_calls; /* List of calls owned by this socket */ struct list_head to_be_accepted; /* calls awaiting acceptance */ @@ -156,6 +164,7 @@ struct rxrpc_sock { struct rb_root calls; /* User ID -> call mapping */ unsigned long flags; #define RXRPC_SOCK_CONNECTED 0 /* connect_srx is set */ +#define 
RXRPC_SOCK_MANAGE_RESPONSE 1 /* User wants to manage RESPONSE packets */ rwlock_t call_lock; /* lock for calls */ u32 min_sec_level; /* minimum security level */ #define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT @@ -199,7 +208,7 @@ struct rxrpc_host_header { */ struct rxrpc_skb_priv { union { - struct rxrpc_connection *conn; /* Connection referred to (poke packet) */ + struct rxrpc_connection *poke_conn; /* Conn referred to (poke packet) */ struct { u16 offset; /* Offset of data */ u16 len; /* Length of data */ @@ -210,10 +219,22 @@ struct rxrpc_skb_priv { rxrpc_seq_t first_ack; /* First packet in acks table */ rxrpc_seq_t prev_ack; /* Highest seq seen */ rxrpc_serial_t acked_serial; /* Packet in response to (or 0) */ + u16 nr_acks; /* Number of acks+nacks */ u8 reason; /* Reason for ack */ - u8 nr_acks; /* Number of acks+nacks */ - u8 nr_nacks; /* Number of nacks */ } ack; + struct { + struct rxrpc_connection *conn; /* Connection referred to */ + union { + u32 rxkad_nonce; + }; + } chall; + struct { + rxrpc_serial_t challenge_serial; + u32 kvno; + u32 version; + u16 len; + u16 ticket_len; + } resp; }; struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */ }; @@ -267,9 +288,24 @@ struct rxrpc_security { /* issue a challenge */ int (*issue_challenge)(struct rxrpc_connection *); + /* Validate a challenge packet */ + bool (*validate_challenge)(struct rxrpc_connection *conn, + struct sk_buff *skb); + + /* Fill out the cmsg for recvmsg() to pass on a challenge to userspace. + * The security class gets to add additional information. + */ + int (*challenge_to_recvmsg)(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct msghdr *msg); + + /* Parse sendmsg() control message and respond to challenge. */ + int (*sendmsg_respond_to_challenge)(struct sk_buff *challenge, + struct msghdr *msg); + /* respond to a challenge */ - int (*respond_to_challenge)(struct rxrpc_connection *, - struct sk_buff *); + int (*respond_to_challenge)(struct rxrpc_connection *conn, + struct sk_buff *challenge); /* verify a response */ int (*verify_response)(struct rxrpc_connection *, @@ -277,6 +313,11 @@ struct rxrpc_security { /* clear connection security */ void (*clear)(struct rxrpc_connection *); + + /* Default ticket -> key decoder */ + int (*default_decode_ticket)(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key); }; /* @@ -320,6 +361,12 @@ struct rxrpc_local { struct list_head new_client_calls; /* Newly created client calls need connection */ spinlock_t client_call_lock; /* Lock for ->new_client_calls */ struct sockaddr_rxrpc srx; /* local address */ + /* Provide a kvec table sufficiently large to manage either a DATA + * packet with a maximum set of jumbo subpackets or a PING ACK padded + * out to 64K with zeropages for PMTUD. + */ + struct kvec kvec[1 + RXRPC_MAX_NR_JUMBO > 3 + 16 ? + 1 + RXRPC_MAX_NR_JUMBO : 3 + 16]; }; /* @@ -335,28 +382,31 @@ struct rxrpc_peer { struct hlist_head error_targets; /* targets for net error distribution */ struct rb_root service_conns; /* Service connections */ struct list_head keepalive_link; /* Link in net->peer_keepalive[] */ + unsigned long app_data; /* Application data (e.g. 
afs_server) */ time64_t last_tx_at; /* Last time packet sent here */ seqlock_t service_conn_lock; spinlock_t lock; /* access lock */ - unsigned int if_mtu; /* interface MTU for this peer */ - unsigned int mtu; /* network MTU for this peer */ - unsigned int maxdata; /* data size (MTU - hdrsize) */ - unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ int debug_id; /* debug ID for printks */ struct sockaddr_rxrpc srx; /* remote address */ - /* calculated RTT cache */ -#define RXRPC_RTT_CACHE_SIZE 32 - spinlock_t rtt_input_lock; /* RTT lock for input routine */ - ktime_t rtt_last_req; /* Time of last RTT request */ - unsigned int rtt_count; /* Number of samples we've got */ + /* Path MTU discovery [RFC8899] */ + unsigned int pmtud_trial; /* Current MTU probe size */ + unsigned int pmtud_good; /* Largest working MTU probe we've tried */ + unsigned int pmtud_bad; /* Smallest non-working MTU probe we've tried */ + bool pmtud_lost; /* T if MTU probe was lost */ + bool pmtud_probing; /* T if we have an active probe outstanding */ + bool pmtud_pending; /* T if a call to this peer should send a probe */ + u8 pmtud_jumbo; /* Max jumbo packets for the MTU */ + bool ackr_adv_pmtud; /* T if the peer advertises path-MTU */ + unsigned int ackr_max_data; /* Maximum data advertised by peer */ + unsigned int if_mtu; /* Local interface MTU (- hdrsize) for this peer */ + unsigned int max_data; /* Maximum packet data capacity for this peer */ + unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ + unsigned short tx_seg_max; /* Maximum number of transmissible segments */ - u32 srtt_us; /* smoothed round trip time << 3 in usecs */ - u32 mdev_us; /* medium deviation */ - u32 mdev_max_us; /* maximal mdev for the last rtt period */ - u32 rttvar_us; /* smoothed mdev_max */ - u32 rto_us; /* Retransmission timeout in usec */ - u8 backoff; /* Backoff timeout (as shift) */ + /* Calculated RTT cache */ + unsigned int recent_srtt_us; + unsigned int recent_rto_us; u8 cong_ssthresh; /* Congestion slow-start threshold */ }; @@ -514,7 +564,17 @@ struct rxrpc_connection { struct rxrpc_crypt csum_iv; /* packet checksum base */ u32 nonce; /* response re-use preventer */ } rxkad; + struct { + struct rxgk_context *keys[4]; /* (Re-)keying buffer */ + u64 start_time; /* The start time for TK derivation */ + u8 nonce[20]; /* Response re-use preventer */ + u32 enctype; /* Kerberos 5 encoding type */ + u32 key_number; /* Current key number */ + } rxgk; }; + rwlock_t security_use_lock; /* Security use/modification lock */ + struct sk_buff *tx_response; /* Response packet to be transmitted */ + unsigned long flags; unsigned long events; unsigned long idle_timestamp; /* Time at which last became idle */ @@ -525,6 +585,8 @@ struct rxrpc_connection { int debug_id; /* debug ID for printks */ rxrpc_serial_t tx_serial; /* Outgoing packet serial number counter */ unsigned int hi_serial; /* highest serial number received */ + rxrpc_serial_t pmtud_probe; /* Serial of MTU probe (or 0) */ + unsigned int pmtud_call; /* ID of call used for probe */ u32 service_id; /* Service ID, possibly upgraded */ u32 security_level; /* Security level selected */ u8 security_ix; /* security type */ @@ -557,6 +619,7 @@ enum rxrpc_call_flag { RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */ RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */ RXRPC_CALL_TX_ALL_ACKED, /* Last packet has been hard-acked */ + RXRPC_CALL_TX_NO_MORE, /* No more data to transmit (MSG_MORE deasserted) */ RXRPC_CALL_SEND_PING, /* A ping will need
to be sent */ RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */ RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */ @@ -567,6 +630,7 @@ enum rxrpc_call_flag { RXRPC_CALL_EXCLUSIVE, /* The call uses a once-only connection */ RXRPC_CALL_RX_IS_IDLE, /* recvmsg() is idle - send an ACK */ RXRPC_CALL_RECVMSG_READ_ALL, /* recvmsg() read all of the received data */ + RXRPC_CALL_CONN_CHALLENGING, /* The connection is being challenged */ }; /* @@ -587,7 +651,6 @@ enum rxrpc_call_state { RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */ - RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ @@ -599,13 +662,25 @@ enum rxrpc_call_state { /* * Call Tx congestion management modes. */ -enum rxrpc_congest_mode { - RXRPC_CALL_SLOW_START, - RXRPC_CALL_CONGEST_AVOIDANCE, - RXRPC_CALL_PACKET_LOSS, - RXRPC_CALL_FAST_RETRANSMIT, - NR__RXRPC_CONGEST_MODES -}; +enum rxrpc_ca_state { + RXRPC_CA_SLOW_START, + RXRPC_CA_CONGEST_AVOIDANCE, + RXRPC_CA_PACKET_LOSS, + RXRPC_CA_FAST_RETRANSMIT, + NR__RXRPC_CA_STATES +} __mode(byte); + +/* + * Current purpose of call RACK timer. According to the RACK-TLP protocol + * [RFC8985], the transmission timer (call->rack_timo_at) may only be used for + * one of these at once. + */ +enum rxrpc_rack_timer_mode { + RXRPC_CALL_RACKTIMER_OFF, /* Timer not running */ + RXRPC_CALL_RACKTIMER_RACK_REORDER, /* RACK reordering timer */ + RXRPC_CALL_RACKTIMER_TLP_PTO, /* TLP timeout */ + RXRPC_CALL_RACKTIMER_RTO, /* Retransmission timeout */ +} __mode(byte); /* * RxRPC call definition @@ -624,8 +699,7 @@ struct rxrpc_call { struct mutex user_mutex; /* User access mutex */ struct sockaddr_rxrpc dest_srx; /* Destination address */ ktime_t delay_ack_at; /* When DELAY ACK needs to happen */ - ktime_t ack_lost_at; /* When ACK is figured as lost */ - ktime_t resend_at; /* When next resend needs to happen */ + ktime_t rack_timo_at; /* When ACK is figured as lost */ ktime_t ping_at; /* When next to send a ping */ ktime_t keepalive_at; /* When next to send a keepalive ping */ ktime_t expect_rx_by; /* When we expect to get a packet by */ @@ -666,25 +740,35 @@ struct rxrpc_call { u32 call_id; /* call ID on connection */ u32 cid; /* connection ID plus channel index */ u32 security_level; /* Security level selected */ + u32 security_enctype; /* Security-specific encoding type (or 0) */ int debug_id; /* debug ID for printks */ unsigned short rx_pkt_offset; /* Current recvmsg packet offset */ unsigned short rx_pkt_len; /* Current recvmsg packet len */ + /* Sendmsg data tracking. */ + rxrpc_seq_t send_top; /* Highest Tx slot filled by sendmsg. */ + struct rxrpc_txqueue *send_queue; /* Queue that sendmsg is writing into */ + /* Transmitted data tracking. 
*/ - spinlock_t tx_lock; /* Transmit queue lock */ - struct list_head tx_sendmsg; /* Sendmsg prepared packets */ - struct list_head tx_buffer; /* Buffer of transmissible packets */ + struct rxrpc_txqueue *tx_queue; /* Start of transmission buffers */ + struct rxrpc_txqueue *tx_qtail; /* End of transmission buffers */ + rxrpc_seq_t tx_qbase; /* First slot in tx_queue */ rxrpc_seq_t tx_bottom; /* First packet in buffer */ rxrpc_seq_t tx_transmitted; /* Highest packet transmitted */ - rxrpc_seq_t tx_prepared; /* Highest Tx slot prepared. */ rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */ + rxrpc_serial_t tx_last_serial; /* Serial of last DATA transmitted */ u16 tx_backoff; /* Delay to insert due to Tx failure (ms) */ - u8 tx_winsize; /* Maximum size of Tx window */ + u16 tx_nr_sent; /* Number of packets sent, but unacked */ + u16 tx_nr_lost; /* Number of packets marked lost */ + u16 tx_nr_resent; /* Number of packets resent, but unacked */ + u16 tx_winsize; /* Maximum size of Tx window */ #define RXRPC_TX_MAX_WINDOW 128 + u8 tx_jumbo_max; /* Maximum subpkts peer will accept */ ktime_t tx_last_sent; /* Last time a transmission occurred */ /* Received data tracking */ struct sk_buff_head recvmsg_queue; /* Queue of packets ready for recvmsg() */ + struct sk_buff_head rx_queue; /* Queue of packets for this call to receive */ struct sk_buff_head rx_oos_queue; /* Queue of out of sequence packets */ rxrpc_seq_t rx_highest_seq; /* Higest sequence number received */ @@ -698,14 +782,32 @@ struct rxrpc_call { */ #define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN #define RXRPC_MIN_CWND 4 - u8 cong_cwnd; /* Congestion window size */ + enum rxrpc_ca_state cong_ca_state; /* Congestion control state */ u8 cong_extra; /* Extra to send for congestion management */ - u8 cong_ssthresh; /* Slow-start threshold */ - enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */ - u8 cong_dup_acks; /* Count of ACKs showing missing packets */ - u8 cong_cumul_acks; /* Cumulative ACK count */ + u16 cong_cwnd; /* Congestion window size */ + u16 cong_ssthresh; /* Slow-start threshold */ + u16 cong_dup_acks; /* Count of ACKs showing missing packets */ + u16 cong_cumul_acks; /* Cumulative ACK count */ ktime_t cong_tstamp; /* Last time cwnd was changed */ - struct sk_buff *cong_last_nack; /* Last ACK with nacks received */ + + /* RACK-TLP [RFC8985] state. */ + ktime_t rack_xmit_ts; /* Latest transmission timestamp */ + ktime_t rack_rtt; /* RTT of most recently ACK'd segment */ + ktime_t rack_rtt_ts; /* Timestamp of rack_rtt */ + ktime_t rack_reo_wnd; /* Reordering window */ + unsigned int rack_reo_wnd_mult; /* Multiplier applied to rack_reo_wnd */ + int rack_reo_wnd_persist; /* Num loss recoveries before reset reo_wnd */ + rxrpc_seq_t rack_fack; /* Highest sequence so far ACK'd */ + rxrpc_seq_t rack_end_seq; /* Highest sequence seen */ + rxrpc_seq_t rack_dsack_round; /* DSACK opt recv'd in latest roundtrip */ + bool rack_dsack_round_none; /* T if dsack_round is "None" */ + bool rack_reordering_seen; /* T if detected reordering event */ + enum rxrpc_rack_timer_mode rack_timer_mode; /* Current mode of RACK timer */ + bool tlp_is_retrans; /* T if unacked TLP retransmission */ + rxrpc_serial_t tlp_serial; /* Serial of TLP probe (or 0 if none in progress) */ + rxrpc_seq_t tlp_seq; /* Sequence of TLP probe */ + unsigned int tlp_rtt_taken; /* Last time RTT taken */ + ktime_t tlp_max_ack_delay; /* Sender budget for max delayed ACK interval */ /* Receive-phase ACK management (ACKs we send). 
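
The rack_* fields above carry the RACK-TLP [RFC8985] state. Roughly, a transmitted segment is declared lost once something sent after it has been ACK'd and the reordering window on top of the measured RTT has elapsed since the segment went out. A simplified, illustrative form of that test, not the kernel's actual implementation:

    /* Simplified RFC 8985-style loss check, for illustration only. */
    static bool demo_rack_segment_lost(ktime_t seg_xmit_ts, ktime_t rack_xmit_ts,
                                       ktime_t rack_rtt, ktime_t reo_wnd,
                                       ktime_t now)
    {
            /* Nothing sent later has been ACK'd yet: can't judge this one. */
            if (!ktime_after(rack_xmit_ts, seg_xmit_ts))
                    return false;

            /* Deemed lost once the reordering window on top of the measured
             * RTT has passed since this segment's transmission.
             */
            return ktime_after(now, ktime_add(ktime_add(seg_xmit_ts, rack_rtt),
                                              reo_wnd));
    }
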
*/ u8 ackr_reason; /* reason to ACK */ @@ -730,32 +832,45 @@ struct rxrpc_call { /* Transmission-phase ACK management (ACKs we've received). */ ktime_t acks_latest_ts; /* Timestamp of latest ACK received */ - rxrpc_seq_t acks_first_seq; /* first sequence number received */ + rxrpc_seq_t acks_hard_ack; /* Highest sequence hard acked */ rxrpc_seq_t acks_prev_seq; /* Highest previousPacket received */ - rxrpc_seq_t acks_hard_ack; /* Latest hard-ack point */ rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */ rxrpc_serial_t acks_highest_serial; /* Highest serial number ACK'd */ + unsigned short acks_nr_sacks; /* Number of soft acks recorded */ + unsigned short acks_nr_snacks; /* Number of soft nacks recorded */ + + /* Calculated RTT cache */ + ktime_t rtt_last_req; /* Time of last RTT request */ + unsigned int rtt_count; /* Number of samples we've got */ + unsigned int rtt_taken; /* Number of samples taken (wrapping) */ + struct minmax min_rtt; /* Estimated minimum RTT */ + u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + u32 mdev_us; /* medium deviation */ + u32 mdev_max_us; /* maximal mdev for the last rtt period */ + u32 rttvar_us; /* smoothed mdev_max */ + u32 rto_us; /* Retransmission timeout in usec */ + u8 backoff; /* Backoff timeout (as shift) */ }; /* * Summary of a new ACK and the changes it made to the Tx buffer packet states. */ struct rxrpc_ack_summary { - u16 nr_acks; /* Number of ACKs in packet */ - u16 nr_new_acks; /* Number of new ACKs in packet */ - u16 nr_new_nacks; /* Number of new nacks in packet */ - u16 nr_retained_nacks; /* Number of nacks retained between ACKs */ - u8 ack_reason; - bool saw_nacks; /* Saw NACKs in packet */ - bool new_low_nack; /* T if new low NACK found */ - bool retrans_timeo; /* T if reTx due to timeout happened */ - u8 flight_size; /* Number of unreceived transmissions */ - /* Place to stash values for tracing */ - enum rxrpc_congest_mode mode:8; - u8 cwnd; - u8 ssthresh; - u8 dup_acks; - u8 cumulative_acks; + rxrpc_serial_t ack_serial; /* Serial number of ACK */ + rxrpc_serial_t acked_serial; /* Serial number ACK'd */ + u16 in_flight; /* Number of unreceived transmissions */ + u16 nr_new_hacks; /* Number of rotated new ACKs */ + u16 nr_new_sacks; /* Number of new soft ACKs in packet */ + u16 nr_new_snacks; /* Number of new soft nacks in packet */ + u8 ack_reason; + bool new_low_snack:1; /* T if new low soft NACK found */ + bool retrans_timeo:1; /* T if reTx due to timeout happened */ + bool need_retransmit:1; /* T if we need transmission */ + bool rtt_sample_avail:1; /* T if RTT sample available */ + bool in_fast_or_rto_recovery:1; + bool exiting_fast_or_rto_recovery:1; + bool tlp_probe_acked:1; /* T if the TLP probe seq was acked */ + u8 /*enum rxrpc_congest_change*/ change; }; /* @@ -793,25 +908,24 @@ struct rxrpc_send_params { * Buffer of data to be output as a packet. 
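The per-call RTT cache above mirrors the classic TCP estimator: srtt_us is stored left-shifted by 3 so the 1/8 smoothing gain stays in integer arithmetic, and the retransmission timeout is the smoothed RTT plus four times the deviation [RFC6298]. A rough userspace sketch of one sample update, assuming microsecond samples (self-consistent, but simplified relative to the kernel's code):

#include <stdint.h>

struct rtt_est {
	uint32_t srtt_us;	/* smoothed RTT << 3, as in the field above */
	uint32_t rttvar_us;	/* smoothed deviation */
	uint32_t rto_us;	/* retransmission timeout */
};

static void rtt_sample(struct rtt_est *r, uint32_t rtt_us)
{
	if (!r->srtt_us) {
		r->srtt_us = rtt_us << 3;	/* SRTT := R */
		r->rttvar_us = rtt_us / 2;	/* RTTVAR := R/2 */
	} else {
		int32_t err = (int32_t)rtt_us - (int32_t)(r->srtt_us >> 3);
		int32_t aerr = err < 0 ? -err : err;

		r->srtt_us += err;	/* SRTT += err/8 (stored << 3) */
		r->rttvar_us += (aerr - (int32_t)r->rttvar_us) / 4;
	}
	r->rto_us = (r->srtt_us >> 3) + 4 * r->rttvar_us;	/* RFC 6298 */
}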
*/ struct rxrpc_txbuf { - struct list_head call_link; /* Link in call->tx_sendmsg/tx_buffer */ - struct list_head tx_link; /* Link in live Enc queue or Tx queue */ - ktime_t last_sent; /* Time at which last transmitted */ refcount_t ref; rxrpc_seq_t seq; /* Sequence number of this packet */ rxrpc_serial_t serial; /* Last serial number transmitted with */ unsigned int call_debug_id; unsigned int debug_id; - unsigned int len; /* Amount of data in buffer */ - unsigned int space; /* Remaining data space */ - unsigned int offset; /* Offset of fill point */ + unsigned short len; /* Amount of data in buffer */ + unsigned short space; /* Remaining data space */ + unsigned short offset; /* Offset of fill point */ + unsigned short crypto_header; /* Size of crypto header */ + unsigned short sec_header; /* Size of security header */ + unsigned short pkt_len; /* Size of packet content */ + unsigned short alloc_size; /* Amount of bufferage allocated */ unsigned int flags; #define RXRPC_TXBUF_WIRE_FLAGS 0xff /* The wire protocol flags */ #define RXRPC_TXBUF_RESENT 0x100 /* Set if has been resent */ __be16 cksum; /* Checksum to go in header */ - unsigned short ack_rwind; /* ACK receive window */ - u8 /*enum rxrpc_propose_ack_trace*/ ack_why; /* If ack, why */ - u8 nr_kvec; /* Amount of kvec[] used */ - struct kvec kvec[3]; + bool jumboable; /* Can be non-terminal jumbo subpacket */ + void *data; /* Data with preceding jumbo header */ }; static inline bool rxrpc_sending_to_server(const struct rxrpc_txbuf *txb) @@ -824,6 +938,46 @@ static inline bool rxrpc_sending_to_client(const struct rxrpc_txbuf *txb) return !rxrpc_sending_to_server(txb); } +/* + * Transmit queue element, including RACK [RFC8985] per-segment metadata. The + * transmission timestamp is in usec from the base. + */ +struct rxrpc_txqueue { + /* Start with the members we want to prefetch. */ + struct rxrpc_txqueue *next; + ktime_t xmit_ts_base; + rxrpc_seq_t qbase; + u8 nr_reported_acks; /* Number of segments explicitly acked/nacked */ + unsigned long segment_acked; /* Bit-per-buf: Set if ACK'd */ + unsigned long segment_lost; /* Bit-per-buf: Set if declared lost */ + unsigned long segment_retransmitted; /* Bit-per-buf: Set if retransmitted */ + unsigned long rtt_samples; /* Bit-per-buf: Set if available for RTT */ + unsigned long ever_retransmitted; /* Bit-per-buf: Set if ever retransmitted */ + + /* The arrays we want to pack into as few cache lines as possible. */ + struct { +#define RXRPC_NR_TXQUEUE BITS_PER_LONG +#define RXRPC_TXQ_MASK (RXRPC_NR_TXQUEUE - 1) + struct rxrpc_txbuf *bufs[RXRPC_NR_TXQUEUE]; + unsigned int segment_serial[RXRPC_NR_TXQUEUE]; + unsigned int segment_xmit_ts[RXRPC_NR_TXQUEUE]; + } ____cacheline_aligned; +}; + +/* + * Data transmission request. + */ +struct rxrpc_send_data_req { + ktime_t now; /* Current time */ + struct rxrpc_txqueue *tq; /* Tx queue segment holding first DATA */ + rxrpc_seq_t seq; /* Sequence of first data */ + int n; /* Number of DATA packets to glue into jumbo */ + bool retrans; /* T if this is a retransmission */ + bool did_send; /* T if did actually send */ + bool tlp_probe; /* T if this is a TLP probe */ + int /* enum rxrpc_txdata_trace */ trace; +}; + +#include <trace/events/rxrpc.h> /* @@ -841,6 +995,21 @@ static inline rxrpc_serial_t rxrpc_get_next_serial(struct rxrpc_connection *conn) } /* + * Allocate the next n serial numbers on a connection. Serial number 0 must be skipped. 
+ */ +static inline rxrpc_serial_t rxrpc_get_next_serials(struct rxrpc_connection *conn, + unsigned int n) +{ + rxrpc_serial_t serial; + + serial = conn->tx_serial; + if (serial + n <= n) + serial = 1; + conn->tx_serial = serial + n; + return serial; +} + +/* * af_rxrpc.c */ extern atomic_t rxrpc_n_rx_skbs; @@ -865,10 +1034,10 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial, enum rxrpc_propose_ack_trace why); void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t, enum rxrpc_propose_ack_trace); -void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *); -void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb); - -bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb); +void rxrpc_resend_tlp(struct rxrpc_call *call); +void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit, + enum rxrpc_txdata_trace trace); +bool rxrpc_input_call_event(struct rxrpc_call *call); /* * call_object.c @@ -883,7 +1052,9 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int); struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, struct rxrpc_conn_parameters *, struct rxrpc_call_params *, gfp_t, - unsigned int) + unsigned int) + __releases(&rx->sk.sk_lock) + __acquires(&call->user_mutex); void rxrpc_start_call_timer(struct rxrpc_call *call); void rxrpc_incoming_call(struct rxrpc_sock *, struct rxrpc_call *, struct sk_buff *); @@ -1047,6 +1218,32 @@ void rxrpc_input_call_packet(struct rxrpc_call *, struct sk_buff *); void rxrpc_implicit_end_call(struct rxrpc_call *, struct sk_buff *); /* + * input_rack.c + */ +void rxrpc_input_rack_one(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix); +void rxrpc_input_rack(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned long new_acks); +void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary); +ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now); +void rxrpc_tlp_send_probe(struct rxrpc_call *call); +void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary); +void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by); + +/* Initialise TLP state [RFC8985 7.1]. 
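rxrpc_tlp_calc_pto(), declared just above, schedules the Tail Loss Probe. [RFC8985 7.2] sets the probe timeout to roughly twice the smoothed RTT, adds an allowance for the peer's delayed-ACK timer when only one segment is in flight, and never lets it exceed the retransmission timeout. A hedged sketch of that arithmetic (microsecond values; the names are illustrative, not the kernel's):

#include <stdint.h>

static inline uint64_t umin64(uint64_t a, uint64_t b) { return a < b ? a : b; }

/* PTO = 2*SRTT (+ worst-case delayed-ACK allowance if exactly one segment
 * is outstanding), clamped to the retransmission timeout [RFC8985 7.2].
 */
static uint64_t tlp_calc_pto(uint64_t srtt_us, uint64_t rto_us,
			     uint64_t max_ack_delay_us, unsigned int in_flight)
{
	uint64_t pto = 2 * srtt_us;

	if (in_flight == 1)
		pto += max_ack_delay_us;
	return umin64(pto, rto_us);
}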
*/ +static inline void rxrpc_tlp_init(struct rxrpc_call *call) +{ + call->tlp_serial = 0; + call->tlp_seq = call->acks_hard_ack; + call->tlp_is_retrans = false; +} + +/* * io_thread.c */ int rxrpc_encap_rcv(struct sock *, struct sk_buff *); @@ -1054,8 +1251,11 @@ void rxrpc_error_report(struct sock *); bool rxrpc_direct_abort(struct sk_buff *skb, enum rxrpc_abort_reason why, s32 abort_code, int err); int rxrpc_io_thread(void *data); +void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb); static inline void rxrpc_wake_up_io_thread(struct rxrpc_local *local) { + if (!local->io_thread) + return; wake_up_process(READ_ONCE(local->io_thread)); } @@ -1145,21 +1345,33 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net) } /* + * out_of_band.c + */ +void rxrpc_notify_socket_oob(struct rxrpc_call *call, struct sk_buff *skb); +void rxrpc_add_pending_oob(struct rxrpc_sock *rx, struct sk_buff *skb); +int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len); + +/* * output.c */ +ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len); void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why); +void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call); int rxrpc_send_abort_packet(struct rxrpc_call *); +void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req); void rxrpc_send_conn_abort(struct rxrpc_connection *conn); void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb); void rxrpc_send_keepalive(struct rxrpc_peer *); -void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb); +void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *skb); /* * peer_event.c */ void rxrpc_input_error(struct rxrpc_local *, struct sk_buff *); void rxrpc_peer_keepalive_worker(struct work_struct *); +void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial, + bool sendmsg_fail); /* * peer_object.c @@ -1208,10 +1420,17 @@ static inline int rxrpc_abort_eproto(struct rxrpc_call *call, /* * rtt.c */ -void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, int, - rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t); -ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans); -void rxrpc_peer_init_rtt(struct rxrpc_peer *); +void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, + int rtt_slot, + rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, + ktime_t send_time, ktime_t resp_time); +ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans); +void rxrpc_call_init_rtt(struct rxrpc_call *call); + +/* + * rxgk.c + */ +extern const struct rxrpc_security rxgk_yfs; /* * rxkad.c @@ -1284,8 +1503,6 @@ static inline void rxrpc_sysctl_exit(void) {} extern atomic_t rxrpc_nr_txbuf; struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, size_t data_align, gfp_t gfp); -struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_size); -void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what); @@ -1311,6 +1528,53 @@ static inline bool after_eq(u32 seq1, u32 seq2) return (s32)(seq1 - seq2) >= 0; } +static inline u32 earliest(u32 seq1, u32 seq2) +{ + return before(seq1, seq2) ? 
seq1 : seq2; +} + +static inline u32 latest(u32 seq1, u32 seq2) +{ + return after(seq1, seq2) ? seq1 : seq2; +} + +static inline bool rxrpc_seq_in_txq(const struct rxrpc_txqueue *tq, rxrpc_seq_t seq) +{ + return (seq & ~RXRPC_TXQ_MASK) == tq->qbase; +} + +static inline void rxrpc_queue_rx_call_packet(struct rxrpc_call *call, struct sk_buff *skb) +{ + rxrpc_get_skb(skb, rxrpc_skb_get_call_rx); + __skb_queue_tail(&call->rx_queue, skb); + rxrpc_poke_call(call, rxrpc_call_poke_rx_packet); +} + +/* + * Calculate how much space there is for transmitting more DATA packets. + */ +static inline unsigned int rxrpc_tx_window_space(const struct rxrpc_call *call) +{ + int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra); + int transmitted = call->tx_top - call->tx_bottom; + + return max(winsize - transmitted, 0); +} + +static inline unsigned int rxrpc_left_out(const struct rxrpc_call *call) +{ + return call->acks_nr_sacks + call->tx_nr_lost; +} + +/* + * Calculate the number of transmitted DATA packets assumed to be in flight + * [approx RFC6675]. + */ +static inline unsigned int rxrpc_tx_in_flight(const struct rxrpc_call *call) +{ + return call->tx_nr_sent - rxrpc_left_out(call) + call->tx_nr_resent; +} + /* * debug tracing */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 0f5a1d77b890..a4b363b47cca 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -34,7 +34,6 @@ static void rxrpc_dummy_notify(struct sock *sk, struct rxrpc_call *call, static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, struct rxrpc_backlog *b, rxrpc_notify_rx_t notify_rx, - rxrpc_user_attach_call_t user_attach_call, unsigned long user_call_ID, gfp_t gfp, unsigned int debug_id) { @@ -123,9 +122,10 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx, call->user_call_ID = user_call_ID; call->notify_rx = notify_rx; - if (user_attach_call) { + if (rx->app_ops && + rx->app_ops->user_attach_call) { rxrpc_get_call(call, rxrpc_call_get_kernel_service); - user_attach_call(call, user_call_ID); + rx->app_ops->user_attach_call(call, user_call_ID); } rxrpc_get_call(call, rxrpc_call_get_userid); @@ -188,8 +188,8 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) /* Make sure that there aren't any incoming calls in progress before we * clear the preallocation buffers. */ - spin_lock(&rx->incoming_lock); - spin_unlock(&rx->incoming_lock); + spin_lock_irq(&rx->incoming_lock); + spin_unlock_irq(&rx->incoming_lock); head = b->peer_backlog_head; tail = b->peer_backlog_tail; @@ -219,9 +219,10 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx) while (CIRC_CNT(head, tail, size) > 0) { struct rxrpc_call *call = b->call_backlog[tail]; rcu_assign_pointer(call->socket, rx); - if (rx->discard_new_call) { + if (rx->app_ops && + rx->app_ops->discard_new_call) { _debug("discard %lx", call->user_call_ID); - rx->discard_new_call(call, call->user_call_ID); + rx->app_ops->discard_new_call(call, call->user_call_ID); if (call->notify_rx) call->notify_rx = rxrpc_dummy_notify; rxrpc_put_call(call, rxrpc_call_put_kernel); @@ -343,7 +344,7 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, if (sp->hdr.type != RXRPC_PACKET_TYPE_DATA) return rxrpc_protocol_error(skb, rxrpc_eproto_no_service_call); - read_lock(&local->services_lock); + read_lock_irq(&local->services_lock); /* Weed out packets to services we're not offering. 
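The inline helpers above follow the RFC 6675 pipe estimate: packets counted in flight are those sent minus those soft-ACK'd or presumed lost, plus retransmissions still awaiting acknowledgment. A toy walk-through in plain C mirroring the helpers (simplified types, not the kernel's):

#include <stdio.h>

struct call_tx {
	unsigned int tx_winsize, cong_cwnd, cong_extra;
	unsigned int tx_top, tx_bottom;
	unsigned int tx_nr_sent, tx_nr_lost, tx_nr_resent;
	unsigned int acks_nr_sacks;
};

static unsigned int tx_window_space(const struct call_tx *c)
{
	unsigned int w = c->cong_cwnd + c->cong_extra;
	int space;

	if (w > c->tx_winsize)
		w = c->tx_winsize;
	space = (int)w - (int)(c->tx_top - c->tx_bottom);
	return space > 0 ? space : 0;
}

static unsigned int tx_in_flight(const struct call_tx *c)
{
	/* pipe ~= sent - (sacked + lost) + retransmitted [RFC6675] */
	return c->tx_nr_sent - (c->acks_nr_sacks + c->tx_nr_lost) +
	       c->tx_nr_resent;
}

int main(void)
{
	struct call_tx c = {
		.tx_winsize = 10, .cong_cwnd = 8, .cong_extra = 0,
		.tx_top = 8, .tx_bottom = 0,
		.tx_nr_sent = 8, .tx_nr_lost = 2, .tx_nr_resent = 1,
		.acks_nr_sacks = 3,
	};

	printf("space=%u in_flight=%u\n", tx_window_space(&c), tx_in_flight(&c));
	/* space=0: all 8 of min(10, 8) slots used; in_flight=4: 8-(3+2)+1 */
	return 0;
}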
Packets that would * begin a call are explicitly rejected and the rest are just @@ -387,8 +388,9 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, rxrpc_incoming_call(rx, call, skb); conn = call->conn; - if (rx->notify_new_call) - rx->notify_new_call(&rx->sk, call, call->user_call_ID); + if (rx->app_ops && + rx->app_ops->notify_new_call) + rx->app_ops->notify_new_call(&rx->sk, call, call->user_call_ID); spin_lock(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE_UNSECURED) { @@ -399,34 +401,34 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local, spin_unlock(&conn->state_lock); spin_unlock(&rx->incoming_lock); - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); if (hlist_unhashed(&call->error_link)) { - spin_lock(&call->peer->lock); + spin_lock_irq(&call->peer->lock); hlist_add_head(&call->error_link, &call->peer->error_targets); - spin_unlock(&call->peer->lock); + spin_unlock_irq(&call->peer->lock); } _leave(" = %p{%d}", call, call->debug_id); - rxrpc_input_call_event(call, skb); + rxrpc_queue_rx_call_packet(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); return true; unsupported_service: - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, RX_INVALID_OPERATION, -EOPNOTSUPP); unsupported_security: - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered, RX_INVALID_OPERATION, -EKEYREJECTED); no_call: spin_unlock(&rx->incoming_lock); - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); _leave(" = f [%u]", skb->mark); return false; discard: - read_unlock(&local->services_lock); + read_unlock_irq(&local->services_lock); return true; } @@ -440,8 +442,7 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID) if (rx->sk.sk_state == RXRPC_CLOSE) return -ESHUTDOWN; - return rxrpc_service_prealloc_one(rx, b, NULL, NULL, user_call_ID, - GFP_KERNEL, + return rxrpc_service_prealloc_one(rx, b, NULL, user_call_ID, GFP_KERNEL, atomic_inc_return(&rxrpc_debug_id)); } @@ -449,20 +450,18 @@ int rxrpc_user_charge_accept(struct rxrpc_sock *rx, unsigned long user_call_ID) * rxrpc_kernel_charge_accept - Charge up socket with preallocated calls * @sock: The socket on which to preallocate * @notify_rx: Event notification function for the call * @user_call_ID: The tag to attach to the preallocated call * @gfp: The allocation conditions. * @debug_id: The tracing debug ID. * - * Charge up the socket with preallocated calls, each with a user ID. A - * function should be provided to effect the attachment from the user's side. - * The user is given a ref to hold on the call. + * Charge up the socket with preallocated calls, each with a user ID. The + * ->user_attach_call() callback function should be provided to effect the + * attachment from the user's side. The user is given a ref to hold on the + * call. * * Note that the call may become connected before this function returns. 
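The accept path above now reaches application callbacks through a single rx->app_ops table rather than per-socket function pointers, so each use is guarded by a NULL check on both the table and the member. A schematic of the pattern (all names invented for illustration):

#include <stddef.h>
#include <stdio.h>

struct app_call;	/* opaque to this sketch */

struct app_ops {
	/* All members are optional; callers must check before use. */
	void (*user_attach_call)(struct app_call *call, unsigned long id);
	void (*discard_new_call)(struct app_call *call, unsigned long id);
};

static void maybe_attach(const struct app_ops *ops,
			 struct app_call *call, unsigned long id)
{
	if (ops && ops->user_attach_call)
		ops->user_attach_call(call, id);	/* optional hook */
}

static void attach(struct app_call *call, unsigned long id)
{
	(void)call;
	printf("attached call %lu\n", id);
}

int main(void)
{
	struct app_ops ops = { .user_attach_call = attach };

	maybe_attach(&ops, NULL, 1);	/* hook present: runs */
	maybe_attach(NULL, NULL, 2);	/* no table: silently skipped */
	return 0;
}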
*/ -int rxrpc_kernel_charge_accept(struct socket *sock, - rxrpc_notify_rx_t notify_rx, - rxrpc_user_attach_call_t user_attach_call, +int rxrpc_kernel_charge_accept(struct socket *sock, rxrpc_notify_rx_t notify_rx, unsigned long user_call_ID, gfp_t gfp, unsigned int debug_id) { @@ -472,8 +471,7 @@ int rxrpc_kernel_charge_accept(struct socket *sock, if (sock->sk->sk_state == RXRPC_CLOSE) return -ESHUTDOWN; - return rxrpc_service_prealloc_one(rx, b, notify_rx, - user_attach_call, user_call_ID, + return rxrpc_service_prealloc_one(rx, b, notify_rx, user_call_ID, gfp, debug_id); } EXPORT_SYMBOL(rxrpc_kernel_charge_accept); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 7bbb68504766..fec59d9338b9 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -44,8 +44,8 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial, trace_rxrpc_propose_ack(call, why, RXRPC_ACK_DELAY, serial); - if (call->peer->srtt_us) - delay = (call->peer->srtt_us >> 3) * NSEC_PER_USEC; + if (call->srtt_us) + delay = (call->srtt_us >> 3) * NSEC_PER_USEC; else delay = ms_to_ktime(READ_ONCE(rxrpc_soft_ack_delay)); ktime_add_ms(delay, call->tx_backoff); @@ -55,147 +55,104 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial, } /* - * Handle congestion being detected by the retransmit timeout. + * Retransmit one or more packets. */ -static void rxrpc_congestion_timeout(struct rxrpc_call *call) +static bool rxrpc_retransmit_data(struct rxrpc_call *call, + struct rxrpc_send_data_req *req) { - set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags); + struct rxrpc_txqueue *tq = req->tq; + unsigned int ix = req->seq & RXRPC_TXQ_MASK; + struct rxrpc_txbuf *txb = tq->bufs[ix]; + + _enter("%x,%x,%x,%x", tq->qbase, req->seq, ix, txb->debug_id); + + req->retrans = true; + trace_rxrpc_retransmit(call, req, txb); + + txb->flags |= RXRPC_TXBUF_RESENT; + rxrpc_send_data_packet(call, req); + rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); + + req->tq = NULL; + req->n = 0; + req->did_send = true; + req->now = ktime_get_real(); + return true; } /* * Perform retransmission of NAK'd and unack'd packets. */ -void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb) +static void rxrpc_resend(struct rxrpc_call *call) { - struct rxrpc_ackpacket *ack = NULL; - struct rxrpc_skb_priv *sp; - struct rxrpc_txbuf *txb; - rxrpc_seq_t transmitted = call->tx_transmitted; - ktime_t next_resend = KTIME_MAX, rto = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC); - ktime_t resend_at = KTIME_MAX, now, delay; - bool unacked = false, did_send = false; - unsigned int i; - - _enter("{%d,%d}", call->acks_hard_ack, call->tx_top); - - now = ktime_get_real(); - - if (list_empty(&call->tx_buffer)) - goto no_resend; + struct rxrpc_send_data_req req = { + .now = ktime_get_real(), + .trace = rxrpc_txdata_retransmit, + }; + struct rxrpc_txqueue *tq; - trace_rxrpc_resend(call, ack_skb); - txb = list_first_entry(&call->tx_buffer, struct rxrpc_txbuf, call_link); + _enter("{%d,%d}", call->tx_bottom, call->tx_top); - /* Scan the soft ACK table without dropping the lock and resend any - * explicitly NAK'd packets. - */ - if (ack_skb) { - sp = rxrpc_skb(ack_skb); - ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header); + trace_rxrpc_resend(call, call->acks_highest_serial); - for (i = 0; i < sp->ack.nr_acks; i++) { - rxrpc_seq_t seq; + /* Scan the transmission queue, looking for lost packets. 
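The resend scan that follows walks per-queue bitmaps rather than a list: each 64-entry transmission queue keeps one bit per buffer in segment_lost, and the loop repeatedly extracts the lowest set bit. The idiom in isolation, using the GCC/Clang builtin in place of the kernel's __ffs()/__clear_bit():

#include <stdio.h>

static void scan_lost(unsigned long lost, unsigned int qbase)
{
	while (lost) {
		unsigned int ix = __builtin_ctzl(lost);	/* lowest set bit */

		lost &= lost - 1;	/* clear that bit */
		printf("retransmit seq %u\n", qbase + ix);
	}
}

int main(void)
{
	scan_lost(0x29UL, 64);	/* bits 0, 3, 5 -> seqs 64, 67, 69 */
	return 0;
}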
*/ + for (tq = call->tx_queue; tq; tq = tq->next) { + unsigned long lost = tq->segment_lost; - if (ack->acks[i] & 1) - continue; - seq = sp->ack.first_ack + i; - if (after(txb->seq, transmitted)) - break; - if (after(txb->seq, seq)) - continue; /* A new hard ACK probably came in */ - list_for_each_entry_from(txb, &call->tx_buffer, call_link) { - if (txb->seq == seq) - goto found_txb; - } - goto no_further_resend; - - found_txb: - resend_at = ktime_add(txb->last_sent, rto); - if (after(txb->serial, call->acks_highest_serial)) { - if (ktime_after(resend_at, now) && - ktime_before(resend_at, next_resend)) - next_resend = resend_at; - continue; /* Ack point not yet reached */ - } + if (after(tq->qbase, call->tx_transmitted)) + break; - rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked); + _debug("retr %16lx %u c=%08x [%x]", + tq->segment_acked, tq->nr_reported_acks, call->debug_id, tq->qbase); + _debug("lost %16lx", lost); - trace_rxrpc_retransmit(call, txb->seq, txb->serial, - ktime_sub(resend_at, now)); + trace_rxrpc_resend_lost(call, tq, lost); + while (lost) { + unsigned int ix = __ffs(lost); + struct rxrpc_txbuf *txb = tq->bufs[ix]; - txb->flags |= RXRPC_TXBUF_RESENT; - rxrpc_transmit_one(call, txb); - did_send = true; - now = ktime_get_real(); + __clear_bit(ix, &lost); + rxrpc_see_txbuf(txb, rxrpc_txbuf_see_lost); - if (list_is_last(&txb->call_link, &call->tx_buffer)) - goto no_further_resend; - txb = list_next_entry(txb, call_link); + req.tq = tq; + req.seq = tq->qbase + ix; + req.n = 1; + rxrpc_retransmit_data(call, &req); } } - /* Fast-forward through the Tx queue to the point the peer says it has - * seen. Anything between the soft-ACK table and that point will get - * ACK'd or NACK'd in due course, so don't worry about it here; here we - * need to consider retransmitting anything beyond that point. - */ - if (after_eq(call->acks_prev_seq, call->tx_transmitted)) - goto no_further_resend; - - list_for_each_entry_from(txb, &call->tx_buffer, call_link) { - resend_at = ktime_add(txb->last_sent, rto); - - if (before_eq(txb->seq, call->acks_prev_seq)) - continue; - if (after(txb->seq, call->tx_transmitted)) - break; /* Not transmitted yet */ - - if (ack && ack->reason == RXRPC_ACK_PING_RESPONSE && - before(txb->serial, ntohl(ack->serial))) - goto do_resend; /* Wasn't accounted for by a more recent ping. */ - - if (ktime_after(resend_at, now)) { - if (ktime_before(resend_at, next_resend)) - next_resend = resend_at; - continue; - } - - do_resend: - unacked = true; - - txb->flags |= RXRPC_TXBUF_RESENT; - rxrpc_transmit_one(call, txb); - did_send = true; - rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans); - now = ktime_get_real(); - } + rxrpc_get_rto_backoff(call, req.did_send); + _leave(""); +} -no_further_resend: -no_resend: - if (resend_at < KTIME_MAX) { - delay = rxrpc_get_rto_backoff(call->peer, did_send); - resend_at = ktime_add(resend_at, delay); - trace_rxrpc_timer_set(call, resend_at - now, rxrpc_timer_trace_resend_reset); +/* + * Resend the highest-seq DATA packet so far transmitted for RACK-TLP [RFC8985 7.3]. + */ +void rxrpc_resend_tlp(struct rxrpc_call *call) +{ + struct rxrpc_send_data_req req = { + .now = ktime_get_real(), + .seq = call->tx_transmitted, + .n = 1, + .tlp_probe = true, + .trace = rxrpc_txdata_tlp_retransmit, + }; + + /* There's a chance it'll be on the tail segment of the queue. 
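rxrpc_get_rto_backoff(), called at the end of the resend pass above, applies binary exponential backoff by keeping the backoff as a shift count rather than a multiplier (see the u8 backoff field in the RTT cache earlier). A compact sketch of that scheme; the ceiling value here is illustrative, not the kernel's:

#include <stdint.h>

#define RTO_MAX_US	(120u * 1000 * 1000)	/* assumed 120s cap */

static uint64_t rto_backoff(uint32_t rto_us, uint8_t *backoff, int failed)
{
	uint64_t t = (uint64_t)rto_us << *backoff;

	if (failed && *backoff < 8)
		(*backoff)++;	/* double the timeout next time */
	return t > RTO_MAX_US ? RTO_MAX_US : t;
}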
*/ + req.tq = READ_ONCE(call->tx_qtail); + if (req.tq && + before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) { + rxrpc_retransmit_data(call, &req); + return; } - call->resend_at = resend_at; - - if (unacked) - rxrpc_congestion_timeout(call); - - /* If there was nothing that needed retransmission then it's likely - * that an ACK got lost somewhere. Send a ping to find out instead of - * retransmitting data. - */ - if (!did_send) { - ktime_t next_ping = ktime_add_us(call->acks_latest_ts, - call->peer->srtt_us >> 3); - if (ktime_sub(next_ping, now) <= 0) - rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, - rxrpc_propose_ack_ping_for_0_retrans); + for (req.tq = call->tx_queue; req.tq; req.tq = req.tq->next) { + if (after_eq(call->tx_transmitted, req.tq->qbase) && + before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) { + rxrpc_retransmit_data(call, &req); + return; + } } - - _leave(""); } /* @@ -231,68 +188,93 @@ static void rxrpc_close_tx_phase(struct rxrpc_call *call) } } -static bool rxrpc_tx_window_has_space(struct rxrpc_call *call) -{ - unsigned int winsize = min_t(unsigned int, call->tx_winsize, - call->cong_cwnd + call->cong_extra); - rxrpc_seq_t window = call->acks_hard_ack, wtop = window + winsize; - rxrpc_seq_t tx_top = call->tx_top; - int space; - - space = wtop - tx_top; - return space > 0; -} - /* - * Decant some if the sendmsg prepared queue into the transmission buffer. + * Transmit some as-yet untransmitted data, to a maximum of the supplied limit. */ -static void rxrpc_decant_prepared_tx(struct rxrpc_call *call) +static void rxrpc_transmit_fresh_data(struct rxrpc_call *call, unsigned int limit, + enum rxrpc_txdata_trace trace) { - struct rxrpc_txbuf *txb; + int space = rxrpc_tx_window_space(call); if (!test_bit(RXRPC_CALL_EXPOSED, &call->flags)) { - if (list_empty(&call->tx_sendmsg)) + if (call->send_top == call->tx_top) return; rxrpc_expose_client_call(call); } - while ((txb = list_first_entry_or_null(&call->tx_sendmsg, - struct rxrpc_txbuf, call_link))) { - spin_lock(&call->tx_lock); - list_del(&txb->call_link); - spin_unlock(&call->tx_lock); + while (space > 0) { + struct rxrpc_send_data_req req = { + .now = ktime_get_real(), + .seq = call->tx_transmitted + 1, + .n = 0, + .trace = trace, + }; + struct rxrpc_txqueue *tq; + struct rxrpc_txbuf *txb; + rxrpc_seq_t send_top, seq; + int limit = min(space, max(call->peer->pmtud_jumbo, 1)); + + /* Order send_top before the contents of the new txbufs and + * txqueue pointers + */ + send_top = smp_load_acquire(&call->send_top); + if (call->tx_top == send_top) + break; - call->tx_top = txb->seq; - list_add_tail(&txb->call_link, &call->tx_buffer); + trace_rxrpc_transmit(call, send_top, space); - if (txb->flags & RXRPC_LAST_PACKET) - rxrpc_close_tx_phase(call); + tq = call->tx_qtail; + seq = call->tx_top; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_decant); - rxrpc_transmit_one(call, txb); + do { + int ix; - if (!rxrpc_tx_window_has_space(call)) - break; + seq++; + ix = seq & RXRPC_TXQ_MASK; + if (!ix) { + tq = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_decant_advance); + } + if (!req.tq) + req.tq = tq; + txb = tq->bufs[ix]; + req.n++; + if (!txb->jumboable) + break; + } while (req.n < limit && before(seq, send_top)); + + if (txb->flags & RXRPC_LAST_PACKET) { + rxrpc_close_tx_phase(call); + tq = NULL; + } + call->tx_qtail = tq; + call->tx_top = seq; + + space -= req.n; + rxrpc_send_data_packet(call, &req); } } -static void rxrpc_transmit_some_data(struct rxrpc_call *call) +void rxrpc_transmit_some_data(struct 
rxrpc_call *call, unsigned int limit, + enum rxrpc_txdata_trace trace) { switch (__rxrpc_call_state(call)) { case RXRPC_CALL_SERVER_ACK_REQUEST: - if (list_empty(&call->tx_sendmsg)) + if (call->tx_bottom == READ_ONCE(call->send_top)) return; rxrpc_begin_service_reply(call); fallthrough; case RXRPC_CALL_SERVER_SEND_REPLY: case RXRPC_CALL_CLIENT_SEND_REQUEST: - if (!rxrpc_tx_window_has_space(call)) + if (!rxrpc_tx_window_space(call)) return; - if (list_empty(&call->tx_sendmsg)) { + if (call->tx_bottom == READ_ONCE(call->send_top)) { rxrpc_inc_stat(call->rxnet, stat_tx_data_underflow); return; } - rxrpc_decant_prepared_tx(call); + rxrpc_transmit_fresh_data(call, limit, trace); break; default: return; @@ -305,8 +287,8 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call) */ static void rxrpc_send_initial_ping(struct rxrpc_call *call) { - if (call->peer->rtt_count < 3 || - ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), + if (call->rtt_count < 3 || + ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real())) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_params); @@ -315,10 +297,11 @@ static void rxrpc_send_initial_ping(struct rxrpc_call *call) /* * Handle retransmission and deferred ACK/abort generation. */ -bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) +bool rxrpc_input_call_event(struct rxrpc_call *call) { + struct sk_buff *skb; ktime_t now, t; - bool resend = false; + bool did_receive = false, saw_ack = false; s32 abort_code; rxrpc_see_call(call, rxrpc_call_see_input); @@ -328,9 +311,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) call->debug_id, rxrpc_call_states[__rxrpc_call_state(call)], call->events); - if (__rxrpc_call_is_complete(call)) - goto out; - /* Handle abort request locklessly, vs rxrpc_propose_abort(). */ abort_code = smp_load_acquire(&call->send_abort); if (abort_code) { @@ -339,11 +319,33 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) goto out; } - if (skb && skb->mark == RXRPC_SKB_MARK_ERROR) - goto out; + do { + skb = __skb_dequeue(&call->rx_queue); + if (skb) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + if (__rxrpc_call_is_complete(call) || + skb->mark == RXRPC_SKB_MARK_ERROR) { + rxrpc_free_skb(skb, rxrpc_skb_put_call_rx); + goto out; + } + + saw_ack |= sp->hdr.type == RXRPC_PACKET_TYPE_ACK; + + rxrpc_input_call_packet(call, skb); + rxrpc_free_skb(skb, rxrpc_skb_put_call_rx); + did_receive = true; + } - if (skb) - rxrpc_input_call_packet(call, skb); + t = ktime_sub(call->rack_timo_at, ktime_get_real()); + if (t <= 0) { + trace_rxrpc_timer_exp(call, t, + rxrpc_timer_trace_rack_off + call->rack_timer_mode); + call->rack_timo_at = KTIME_MAX; + rxrpc_rack_timer_expired(call, t); + } + + } while (!skb_queue_empty(&call->rx_queue)); /* If we see our async-event poke, check for timeout trippage. 
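rxrpc_transmit_fresh_data() above glues consecutive sendmsg buffers into one jumbo datagram: it keeps appending while each buffer is flagged jumboable and the count stays within both the window space and the peer's advertised jumbo limit, and a non-jumboable buffer terminates the group. The grouping rule, reduced to a loop over a plain array (illustrative types, not the kernel's txqueue):

#include <stdbool.h>
#include <stdio.h>

struct buf { bool jumboable; };

/* Return how many buffers, starting at index @i, to pack into one datagram. */
static int jumbo_group(const struct buf *b, int nr, int i,
		       int space, int peer_jumbo_max)
{
	int limit = space < peer_jumbo_max ? space : peer_jumbo_max;
	int n = 0;

	while (i + n < nr && n < limit) {
		n++;
		if (!b[i + n - 1].jumboable)
			break;	/* non-jumboable buffer ends the group */
	}
	return n;
}

int main(void)
{
	struct buf b[] = { {true}, {true}, {false}, {true} };

	/* Packs 3: two jumboable subpackets plus the terminating one. */
	printf("%d\n", jumbo_group(b, 4, 0, 8, 4));
	return 0;
}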
*/ now = ktime_get_real(); @@ -376,13 +378,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_propose_ack_delayed_ack); } - t = ktime_sub(call->ack_lost_at, now); - if (t <= 0) { - trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_lost_ack); - call->ack_lost_at = KTIME_MAX; - set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events); - } - t = ktime_sub(call->ping_at, now); if (t <= 0) { trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_ping); @@ -391,15 +386,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_propose_ack_ping_for_keepalive); } - t = ktime_sub(call->resend_at, now); - if (t <= 0) { - trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_resend); - call->resend_at = KTIME_MAX; - resend = true; - } - - rxrpc_transmit_some_data(call); - now = ktime_get_real(); t = ktime_sub(call->keepalive_at, now); if (t <= 0) { @@ -409,35 +395,40 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) rxrpc_propose_ack_ping_for_keepalive); } - if (skb) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - - if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK) - rxrpc_congestion_degrade(call); - } - if (test_and_clear_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events)) rxrpc_send_initial_ping(call); + rxrpc_transmit_some_data(call, UINT_MAX, rxrpc_txdata_new_data); + + if (saw_ack) + rxrpc_congestion_degrade(call); + + if (did_receive && + (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_SEND_REQUEST || + __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SEND_REPLY)) { + t = ktime_sub(call->rack_timo_at, ktime_get_real()); + trace_rxrpc_rack(call, t); + } + /* Process events */ if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_lost_ack); - if (resend && + if (call->tx_nr_lost > 0 && __rxrpc_call_state(call) != RXRPC_CALL_CLIENT_RECV_REPLY && !test_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags)) - rxrpc_resend(call, NULL); + rxrpc_resend(call); if (test_and_clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags)) rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0, rxrpc_propose_ack_rx_idle); if (call->ackr_nr_unacked > 2) { - if (call->peer->rtt_count < 3) + if (call->rtt_count < 3) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_rtt); - else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), + else if (ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real())) rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, rxrpc_propose_ack_ping_for_old_rtt); @@ -455,8 +446,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) set(call->expect_req_by); set(call->expect_rx_by); set(call->delay_ack_at); - set(call->ack_lost_at); - set(call->resend_at); + set(call->rack_timo_at); set(call->keepalive_at); set(call->ping_at); @@ -467,7 +457,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) } else { unsigned long nowj = jiffies, delayj, nextj; - delayj = max(nsecs_to_jiffies(delay), 1); + delayj = umax(nsecs_to_jiffies(delay), 1); nextj = nowj + delayj; if (time_before(nextj, call->timer.expires) || !timer_pending(&call->timer)) { @@ -479,14 +469,17 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb) out: if (__rxrpc_call_is_complete(call)) { - del_timer_sync(&call->timer); + timer_delete_sync(&call->timer); if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) rxrpc_disconnect_call(call); if (call->security) call->security->free_call_crypto(call); + } else { + if (did_receive && + 
call->peer->ackr_adv_pmtud && + call->peer->pmtud_pending) + rxrpc_send_probe_for_pmtud(call); } - if (call->acks_hard_ack != call->tx_bottom) - rxrpc_shrink_call_tx_buffer(call); _leave(""); return true; diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index f9e983a12c14..e9e8f0ef3fd5 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -22,7 +22,6 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc", - [RXRPC_CALL_SERVER_SECURING] = "SvSecure", [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", [RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq", [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", @@ -49,7 +48,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) bool busy; if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) { - spin_lock_bh(&local->lock); + spin_lock_irq(&local->lock); busy = !list_empty(&call->attend_link); trace_rxrpc_poke_call(call, busy, what); if (!busy && !rxrpc_try_get_call(call, rxrpc_call_get_poke)) @@ -57,7 +56,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what) if (!busy) { list_add_tail(&call->attend_link, &local->call_attend_q); } - spin_unlock_bh(&local->lock); + spin_unlock_irq(&local->lock); if (!busy) rxrpc_wake_up_io_thread(local); } @@ -146,23 +145,21 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, INIT_LIST_HEAD(&call->recvmsg_link); INIT_LIST_HEAD(&call->sock_link); INIT_LIST_HEAD(&call->attend_link); - INIT_LIST_HEAD(&call->tx_sendmsg); - INIT_LIST_HEAD(&call->tx_buffer); skb_queue_head_init(&call->recvmsg_queue); + skb_queue_head_init(&call->rx_queue); skb_queue_head_init(&call->rx_oos_queue); init_waitqueue_head(&call->waitq); spin_lock_init(&call->notify_lock); - spin_lock_init(&call->tx_lock); refcount_set(&call->ref, 1); call->debug_id = debug_id; call->tx_total_len = -1; + call->tx_jumbo_max = 1; call->next_rx_timo = 20 * HZ; call->next_req_timo = 1 * HZ; call->ackr_window = 1; call->ackr_wtop = 1; call->delay_ack_at = KTIME_MAX; - call->ack_lost_at = KTIME_MAX; - call->resend_at = KTIME_MAX; + call->rack_timo_at = KTIME_MAX; call->ping_at = KTIME_MAX; call->keepalive_at = KTIME_MAX; call->expect_rx_by = KTIME_MAX; @@ -177,6 +174,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, call->cong_cwnd = RXRPC_MIN_CWND; call->cong_ssthresh = RXRPC_TX_MAX_WINDOW; + rxrpc_call_init_rtt(call); + call->rxnet = rxnet; call->rtt_avail = RXRPC_CALL_RTT_AVAIL_MASK; atomic_inc(&rxnet->nr_calls); @@ -220,9 +219,9 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, __set_bit(RXRPC_CALL_EXCLUSIVE, &call->flags); if (p->timeouts.normal) - call->next_rx_timo = min(p->timeouts.normal, 1); + call->next_rx_timo = umin(p->timeouts.normal, 1); if (p->timeouts.idle) - call->next_req_timo = min(p->timeouts.idle, 1); + call->next_req_timo = umin(p->timeouts.idle, 1); if (p->timeouts.hard) call->hard_timo = p->timeouts.hard; @@ -302,9 +301,9 @@ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp) trace_rxrpc_client(NULL, -1, rxrpc_client_queue_new_call); rxrpc_get_call(call, rxrpc_call_get_io_thread); - spin_lock(&local->client_call_lock); + spin_lock_irq(&local->client_call_lock); list_add_tail(&call->wait_link, &local->new_client_calls); - spin_unlock(&local->client_call_lock); + spin_unlock_irq(&local->client_call_lock); rxrpc_wake_up_io_thread(local); return 0; @@ 
-323,7 +322,7 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, struct rxrpc_call_params *p, gfp_t gfp, unsigned int debug_id) - __releases(&rx->sk.sk_lock.slock) + __releases(&rx->sk.sk_lock) __acquires(&call->user_mutex) { struct rxrpc_call *call, *xcall; @@ -434,7 +433,7 @@ error_attached_to_socket: /* * Set up an incoming call. call->conn points to the connection. - * This is called in BH context and isn't allowed to fail. + * This is called with interrupts disabled and isn't allowed to fail. */ void rxrpc_incoming_call(struct rxrpc_sock *rx, struct rxrpc_call *call, @@ -453,17 +452,16 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, call->cong_tstamp = skb->tstamp; __set_bit(RXRPC_CALL_EXPOSED, &call->flags); - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); spin_lock(&conn->state_lock); switch (conn->state) { case RXRPC_CONN_SERVICE_UNSECURED: case RXRPC_CONN_SERVICE_CHALLENGING: - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); + __set_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags); break; case RXRPC_CONN_SERVICE: - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); break; case RXRPC_CONN_ABORTED: @@ -531,11 +529,29 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why) } /* - * Clean up the Rx skb ring. + * Clean up the transmission buffers. + */ +static void rxrpc_cleanup_tx_buffers(struct rxrpc_call *call) +{ + struct rxrpc_txqueue *tq, *next; + + for (tq = call->tx_queue; tq; tq = next) { + next = tq->next; + for (int i = 0; i < RXRPC_NR_TXQUEUE; i++) + if (tq->bufs[i]) + rxrpc_put_txbuf(tq->bufs[i], rxrpc_txbuf_put_cleaned); + trace_rxrpc_tq(call, tq, 0, rxrpc_tq_cleaned); + kfree(tq); + } +} + +/* + * Clean up the receive buffers. 
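rxrpc_cleanup_tx_buffers() above tears down a singly linked chain of transmission queues, dropping each occupied slot before freeing the node; the next pointer must be saved before the node goes away. The same shape in plain C, with refcounting reduced to free():

#include <stdlib.h>

#define NR_SLOTS 64

struct txq {
	struct txq *next;
	void *bufs[NR_SLOTS];
};

static void cleanup_chain(struct txq *head)
{
	struct txq *tq, *next;

	for (tq = head; tq; tq = next) {
		next = tq->next;	/* save before freeing the node */
		for (int i = 0; i < NR_SLOTS; i++)
			free(tq->bufs[i]);	/* put each live buffer */
		free(tq);
	}
}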
*/ -static void rxrpc_cleanup_ring(struct rxrpc_call *call) +static void rxrpc_cleanup_rx_buffers(struct rxrpc_call *call) { rxrpc_purge_queue(&call->recvmsg_queue); + rxrpc_purge_queue(&call->rx_queue); rxrpc_purge_queue(&call->rx_oos_queue); } @@ -558,7 +574,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) rxrpc_put_call_slot(call); /* Make sure we don't get any more notifications */ - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); if (!list_empty(&call->recvmsg_link)) { _debug("unlinking once-pending call %p { e=%lx f=%lx }", @@ -571,7 +587,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call) call->recvmsg_link.next = NULL; call->recvmsg_link.prev = NULL; - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); if (put) rxrpc_put_call(call, rxrpc_call_put_unnotify); @@ -671,23 +687,11 @@ static void rxrpc_rcu_free_call(struct rcu_head *rcu) static void rxrpc_destroy_call(struct work_struct *work) { struct rxrpc_call *call = container_of(work, struct rxrpc_call, destroyer); - struct rxrpc_txbuf *txb; - - del_timer_sync(&call->timer); - rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); - rxrpc_cleanup_ring(call); - while ((txb = list_first_entry_or_null(&call->tx_sendmsg, - struct rxrpc_txbuf, call_link))) { - list_del(&txb->call_link); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); - } - while ((txb = list_first_entry_or_null(&call->tx_buffer, - struct rxrpc_txbuf, call_link))) { - list_del(&txb->call_link); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned); - } + timer_delete_sync(&call->timer); + rxrpc_cleanup_tx_buffers(call); + rxrpc_cleanup_rx_buffers(call); rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned); rxrpc_put_connection(call->conn, rxrpc_conn_put_call); rxrpc_deactivate_bundle(call->bundle); @@ -707,7 +711,7 @@ void rxrpc_cleanup_call(struct rxrpc_call *call) ASSERTCMP(__rxrpc_call_state(call), ==, RXRPC_CALL_COMPLETE); ASSERT(test_bit(RXRPC_CALL_RELEASED, &call->flags)); - del_timer(&call->timer); + timer_delete(&call->timer); if (rcu_read_lock_held()) /* Can't use the rxrpc workqueue as we need to cancel/flush @@ -756,3 +760,23 @@ void rxrpc_destroy_all_calls(struct rxrpc_net *rxnet) atomic_dec(&rxnet->nr_calls); wait_var_event(&rxnet->nr_calls, !atomic_read(&rxnet->nr_calls)); } + +/** + * rxrpc_kernel_query_call_security - Query call's security parameters + * @call: The call to query + * @_service_id: Where to return the service ID + * @_enctype: Where to return the "encoding type" + * + * This queries the security parameters of a call, setting *@_service_id and + * *@_enctype and returning the security class. + * + * Return: The security class protocol number. 
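A hedged usage sketch for the accessor documented above, as it might appear in an in-kernel client; the surrounding function and log text are invented for illustration and this is kernel-context code, not a standalone program:

/* Log which security class a connected call negotiated. */
static void example_log_call_security(struct rxrpc_call *call)
{
	u16 service_id;
	u32 enctype;
	u8 sec_class;

	sec_class = rxrpc_kernel_query_call_security(call, &service_id, &enctype);
	pr_info("call to service %u: security class %u, enctype %u\n",
		service_id, sec_class, enctype);
}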
+ */ +u8 rxrpc_kernel_query_call_security(struct rxrpc_call *call, + u16 *_service_id, u32 *_enctype) +{ + *_service_id = call->dest_srx.srx_service; + *_enctype = call->security_enctype; + return call->security_ix; +} +EXPORT_SYMBOL(rxrpc_kernel_query_call_security); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index bb11e8289d6d..63bbcc567f59 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -231,7 +231,7 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) distance = id - id_cursor; if (distance < 0) distance = -distance; - limit = max_t(unsigned long, atomic_read(&rxnet->nr_conns) * 4, 1024); + limit = umax(atomic_read(&rxnet->nr_conns) * 4, 1024); if (distance > limit) goto mark_dont_reuse; @@ -437,9 +437,9 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, call->dest_srx.srx_service = conn->service_id; call->cong_ssthresh = call->peer->cong_ssthresh; if (call->cong_cwnd >= call->cong_ssthresh) - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; else - call->cong_mode = RXRPC_CALL_SLOW_START; + call->cong_ca_state = RXRPC_CA_SLOW_START; chan->call_id = call_id; chan->call_debug_id = call->debug_id; @@ -508,16 +508,18 @@ static void rxrpc_activate_channels(struct rxrpc_bundle *bundle) void rxrpc_connect_client_calls(struct rxrpc_local *local) { struct rxrpc_call *call; + LIST_HEAD(new_client_calls); - while ((call = list_first_entry_or_null(&local->new_client_calls, - struct rxrpc_call, wait_link)) - ) { + spin_lock_irq(&local->client_call_lock); + list_splice_tail_init(&local->new_client_calls, &new_client_calls); + spin_unlock_irq(&local->client_call_lock); + + while ((call = list_first_entry_or_null(&new_client_calls, + struct rxrpc_call, wait_link))) { struct rxrpc_bundle *bundle = call->bundle; - spin_lock(&local->client_call_lock); list_move_tail(&call->wait_link, &bundle->waiting_calls); rxrpc_see_call(call, rxrpc_call_see_waiting_call); - spin_unlock(&local->client_call_lock); if (rxrpc_bundle_has_space(bundle)) rxrpc_activate_channels(bundle); @@ -545,9 +547,9 @@ void rxrpc_expose_client_call(struct rxrpc_call *call) set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); trace_rxrpc_client(conn, channel, rxrpc_client_exposed); - spin_lock(&call->peer->lock); + spin_lock_irq(&call->peer->lock); hlist_add_head(&call->error_link, &call->peer->error_targets); - spin_unlock(&call->peer->lock); + spin_unlock_irq(&call->peer->lock); } } @@ -588,9 +590,9 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call ASSERTCMP(call->call_id, ==, 0); ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags)); /* May still be on ->new_client_calls. */ - spin_lock(&local->client_call_lock); + spin_lock_irq(&local->client_call_lock); list_del_init(&call->wait_link); - spin_unlock(&local->client_call_lock); + spin_unlock_irq(&local->client_call_lock); return; } @@ -816,7 +818,7 @@ void rxrpc_clean_up_local_conns(struct rxrpc_local *local) local->kill_all_client_conns = true; - del_timer_sync(&local->client_conn_reap_timer); + timer_delete_sync(&local->client_conn_reap_timer); while ((conn = list_first_entry_or_null(&local->idle_client_conns, struct rxrpc_connection, cache_link))) { diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 598b4ee389fc..232b6986da83 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -19,14 +19,14 @@ /* * Set the completion state on an aborted connection. 
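rxrpc_connect_client_calls() above now splices the pending-call list onto a private head inside the lock and walks it outside, keeping the IRQ-disabled critical section down to two list operations. The shape of the pattern, as a userspace analogue with a mutex standing in for the irq spinlock:

#include <pthread.h>
#include <stdio.h>

struct node { struct node *next; int id; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *pending;	/* shared, lock-protected */

static void process_pending(void)
{
	struct node *batch;

	pthread_mutex_lock(&lock);
	batch = pending;	/* splice the whole list out... */
	pending = NULL;
	pthread_mutex_unlock(&lock);

	for (struct node *n = batch; n; n = n->next)
		printf("activating call %d\n", n->id);	/* ...work unlocked */
}

int main(void)
{
	struct node b = { NULL, 2 }, a = { &b, 1 };

	pending = &a;
	process_pending();
	return 0;
}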
*/ -static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff *skb, +static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, s32 abort_code, int err, enum rxrpc_call_completion compl) { bool aborted = false; if (conn->state != RXRPC_CONN_ABORTED) { - spin_lock(&conn->state_lock); + spin_lock_irq(&conn->state_lock); if (conn->state != RXRPC_CONN_ABORTED) { conn->abort_code = abort_code; conn->error = err; @@ -37,7 +37,7 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff set_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events); aborted = true; } - spin_unlock(&conn->state_lock); + spin_unlock_irq(&conn->state_lock); } return aborted; @@ -49,12 +49,20 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, s32 abort_code, int err, enum rxrpc_abort_reason why) { - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - if (rxrpc_set_conn_aborted(conn, skb, abort_code, err, + u32 cid = conn->proto.cid, call = 0, seq = 0; + + if (skb) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + cid = sp->hdr.cid; + call = sp->hdr.callNumber; + seq = sp->hdr.seq; + } + + if (rxrpc_set_conn_aborted(conn, abort_code, err, RXRPC_CALL_LOCALLY_ABORTED)) { - trace_rxrpc_abort(0, why, sp->hdr.cid, sp->hdr.callNumber, - sp->hdr.seq, abort_code, err); + trace_rxrpc_abort(0, why, cid, call, seq, abort_code, err); rxrpc_poke_conn(conn, rxrpc_conn_get_poke_abort); } return -EPROTO; @@ -63,11 +71,12 @@ int rxrpc_abort_conn(struct rxrpc_connection *conn, struct sk_buff *skb, /* * Mark a connection as being remotely aborted. */ -static bool rxrpc_input_conn_abort(struct rxrpc_connection *conn, +static void rxrpc_input_conn_abort(struct rxrpc_connection *conn, struct sk_buff *skb) { - return rxrpc_set_conn_aborted(conn, skb, skb->priority, -ECONNABORTED, - RXRPC_CALL_REMOTELY_ABORTED); + trace_rxrpc_rx_conn_abort(conn, skb); + rxrpc_set_conn_aborted(conn, skb->priority, -ECONNABORTED, + RXRPC_CALL_REMOTELY_ABORTED); } /* @@ -91,7 +100,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, struct rxrpc_acktrailer trailer; size_t len; int ret, ioc; - u32 serial, mtu, call_id, padding; + u32 serial, max_mtu, if_mtu, call_id, padding; _enter("%d", conn->debug_id); @@ -149,8 +158,13 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, break; case RXRPC_PACKET_TYPE_ACK: - mtu = conn->peer->if_mtu; - mtu -= conn->peer->hdrsize; + if_mtu = conn->peer->if_mtu - conn->peer->hdrsize; + if (conn->peer->ackr_adv_pmtud) { + max_mtu = umax(conn->peer->max_data, rxrpc_rx_mtu); + } else { + if_mtu = umin(1444, if_mtu); + max_mtu = if_mtu; + } pkt.ack.bufferSpace = 0; pkt.ack.maxSkew = htons(skb ? skb->priority : 0); pkt.ack.firstPacket = htonl(chan->last_seq + 1); @@ -158,10 +172,10 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, pkt.ack.serial = htonl(skb ? sp->hdr.serial : 0); pkt.ack.reason = skb ? 
RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE; pkt.ack.nAcks = 0; - trailer.maxMTU = htonl(rxrpc_rx_mtu); - trailer.ifMTU = htonl(mtu); + trailer.maxMTU = htonl(max_mtu); + trailer.ifMTU = htonl(if_mtu); trailer.rwind = htonl(rxrpc_rx_window_size); - trailer.jumbo_max = htonl(rxrpc_rx_jumbo_max); + trailer.jumbo_max = 0; pkt.whdr.flags |= RXRPC_SLOW_START_OK; padding = 0; iov[0].iov_len += sizeof(pkt.ack); @@ -171,7 +185,8 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn, trace_rxrpc_tx_ack(chan->call_debug_id, serial, ntohl(pkt.ack.firstPacket), ntohl(pkt.ack.serial), - pkt.ack.reason, 0, rxrpc_rx_window_size); + pkt.ack.reason, 0, rxrpc_rx_window_size, + rxrpc_propose_ack_retransmit); break; default: @@ -202,11 +217,14 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn) for (i = 0; i < RXRPC_MAXCALLS; i++) { call = conn->channels[i].call; - if (call) + if (call) { + rxrpc_see_call(call, rxrpc_call_see_conn_abort); rxrpc_set_call_completion(call, conn->completion, conn->abort_code, conn->error); + rxrpc_poke_call(call, rxrpc_call_poke_conn_abort); + } } _leave(""); @@ -218,10 +236,8 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn) */ static void rxrpc_call_is_secure(struct rxrpc_call *call) { - if (call && __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SECURING) { - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); + if (call && __test_and_clear_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags)) rxrpc_notify_socket(call); - } } /* @@ -240,7 +256,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_CHALLENGE: - return conn->security->respond_to_challenge(conn, skb); + ret = conn->security->respond_to_challenge(conn, skb); + sp->chall.conn = NULL; + rxrpc_put_connection(conn, rxrpc_conn_put_challenge_input); + return ret; case RXRPC_PACKET_TYPE_RESPONSE: ret = conn->security->verify_response(conn, skb); @@ -252,16 +271,18 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, if (ret < 0) return ret; - spin_lock(&conn->state_lock); + spin_lock_irq(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING) conn->state = RXRPC_CONN_SERVICE; - spin_unlock(&conn->state_lock); + spin_unlock_irq(&conn->state_lock); if (conn->state == RXRPC_CONN_SERVICE) { /* Offload call state flipping to the I/O thread. As * we've already received the packet, put it on the * front of the queue. */ + sp->poke_conn = rxrpc_get_connection( + conn, rxrpc_conn_get_poke_secured); skb->mark = RXRPC_SKB_MARK_SERVICE_CONN_SECURED; rxrpc_get_skb(skb, rxrpc_skb_get_conn_secured); skb_queue_head(&conn->local->rx_queue, skb); @@ -383,6 +404,61 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, } /* + * Post a CHALLENGE packet to the socket of one of a connection's calls so that + * it can get application data to include in the packet, possibly querying + * userspace. 
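The retransmitted-ACK trailer built above advertises two MTU figures: ifMTU derived from the local interface less header overhead, and maxMTU, which only exceeds a conservative 1444-byte floor when the peer has indicated it participates in path-MTU discovery. Worked through numerically (the constants and the rx_mtu value are illustrative):

#include <stdio.h>

static unsigned int umin_u(unsigned int a, unsigned int b) { return a < b ? a : b; }
static unsigned int umax_u(unsigned int a, unsigned int b) { return a > b ? a : b; }

int main(void)
{
	unsigned int peer_if_mtu = 1500, hdrsize = 28;	/* IPv4 + UDP */
	unsigned int peer_max_data = 1444, rx_mtu = 5692;
	int peer_does_pmtud = 0;

	unsigned int if_mtu = peer_if_mtu - hdrsize;	/* 1472 */
	unsigned int max_mtu;

	if (peer_does_pmtud) {
		max_mtu = umax_u(peer_max_data, rx_mtu);
	} else {
		if_mtu = umin_u(1444, if_mtu);	/* clamp to the safe floor */
		max_mtu = if_mtu;
	}
	printf("ifMTU=%u maxMTU=%u\n", if_mtu, max_mtu);	/* 1444 1444 */
	return 0;
}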
+ */ +static bool rxrpc_post_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_call *call = NULL; + struct rxrpc_sock *rx; + bool respond = false; + + sp->chall.conn = + rxrpc_get_connection(conn, rxrpc_conn_get_challenge_input); + + if (!conn->security->challenge_to_recvmsg) { + rxrpc_post_packet_to_conn(conn, skb); + return true; + } + + rcu_read_lock(); + + for (int i = 0; i < ARRAY_SIZE(conn->channels); i++) { + if (conn->channels[i].call) { + call = conn->channels[i].call; + rx = rcu_dereference(call->socket); + if (!rx) { + call = NULL; + continue; + } + + respond = true; + if (test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags)) + break; + call = NULL; + } + } + + if (!respond) { + rcu_read_unlock(); + rxrpc_put_connection(conn, rxrpc_conn_put_challenge_input); + sp->chall.conn = NULL; + return false; + } + + if (call) + rxrpc_notify_socket_oob(call, skb); + rcu_read_unlock(); + + if (!call) + rxrpc_post_packet_to_conn(conn, skb); + return true; +} + +/* * Input a connection-level packet. */ bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) @@ -402,6 +478,16 @@ bool rxrpc_input_conn_packet(struct rxrpc_connection *conn, struct sk_buff *skb) return true; case RXRPC_PACKET_TYPE_CHALLENGE: + rxrpc_see_skb(skb, rxrpc_skb_see_oob_challenge); + if (rxrpc_is_conn_aborted(conn)) { + if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED) + rxrpc_send_conn_abort(conn); + return true; + } + if (!conn->security->validate_challenge(conn, skb)) + return false; + return rxrpc_post_challenge(conn, skb); + case RXRPC_PACKET_TYPE_RESPONSE: if (rxrpc_is_conn_aborted(conn)) { if (conn->completion == RXRPC_CALL_LOCALLY_ABORTED) @@ -427,17 +513,60 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) if (test_and_clear_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events)) rxrpc_abort_calls(conn); - switch (skb->mark) { - case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: - if (conn->state != RXRPC_CONN_SERVICE) - break; + if (conn->tx_response) { + struct sk_buff *skb; - for (loop = 0; loop < RXRPC_MAXCALLS; loop++) - rxrpc_call_is_secure(conn->channels[loop].call); - break; + spin_lock_irq(&conn->local->lock); + skb = conn->tx_response; + conn->tx_response = NULL; + spin_unlock_irq(&conn->local->lock); + + if (conn->state != RXRPC_CONN_ABORTED) + rxrpc_send_response(conn, skb); + rxrpc_free_skb(skb, rxrpc_skb_put_response); + } + + if (skb) { + switch (skb->mark) { + case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: + if (conn->state != RXRPC_CONN_SERVICE) + break; + + for (loop = 0; loop < RXRPC_MAXCALLS; loop++) + rxrpc_call_is_secure(conn->channels[loop].call); + break; + } } /* Process delayed ACKs whose time has come. */ if (conn->flags & RXRPC_CONN_FINAL_ACK_MASK) rxrpc_process_delayed_final_acks(conn, false); } + +/* + * Post a RESPONSE message to the I/O thread for transmission. + */ +void rxrpc_post_response(struct rxrpc_connection *conn, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_local *local = conn->local; + struct sk_buff *old; + + _enter("%x", sp->resp.challenge_serial); + + spin_lock_irq(&local->lock); + old = conn->tx_response; + if (old) { + struct rxrpc_skb_priv *osp = rxrpc_skb(old); + + /* Always go with the response to the most recent challenge. 
*/ + if (after(sp->resp.challenge_serial, osp->resp.challenge_serial)) + conn->tx_response = skb; + else + old = skb; + } else { + conn->tx_response = skb; + } + spin_unlock_irq(&local->lock); + rxrpc_free_skb(old, rxrpc_skb_put_response); + rxrpc_poke_conn(conn, rxrpc_conn_get_poke_response); +} diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 694c4df7a1a3..37340becb224 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -31,13 +31,13 @@ void rxrpc_poke_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why) if (WARN_ON_ONCE(!local)) return; - spin_lock_bh(&local->lock); + spin_lock_irq(&local->lock); busy = !list_empty(&conn->attend_link); if (!busy) { rxrpc_get_connection(conn, why); list_add_tail(&conn->attend_link, &local->conn_attend_q); } - spin_unlock_bh(&local->lock); + spin_unlock_irq(&local->lock); rxrpc_wake_up_io_thread(local); } @@ -67,11 +67,13 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, INIT_WORK(&conn->destructor, rxrpc_clean_up_connection); INIT_LIST_HEAD(&conn->proc_link); INIT_LIST_HEAD(&conn->link); + INIT_LIST_HEAD(&conn->attend_link); mutex_init(&conn->security_lock); mutex_init(&conn->tx_data_alloc_lock); skb_queue_head_init(&conn->rx_queue); conn->rxnet = rxnet; conn->security = &rxrpc_no_security; + rwlock_init(&conn->security_use_lock); spin_lock_init(&conn->state_lock); conn->debug_id = atomic_inc_return(&rxrpc_debug_id); conn->idle_timestamp = jiffies; @@ -196,9 +198,9 @@ void rxrpc_disconnect_call(struct rxrpc_call *call) call->peer->cong_ssthresh = call->cong_ssthresh; if (!hlist_unhashed(&call->error_link)) { - spin_lock(&call->peer->lock); + spin_lock_irq(&call->peer->lock); hlist_del_init(&call->error_link); - spin_unlock(&call->peer->lock); + spin_unlock_irq(&call->peer->lock); } if (rxrpc_is_client_call(call)) { @@ -313,15 +315,22 @@ static void rxrpc_clean_up_connection(struct work_struct *work) !conn->channels[3].call); ASSERT(list_empty(&conn->cache_link)); - del_timer_sync(&conn->timer); + timer_delete_sync(&conn->timer); cancel_work_sync(&conn->processor); /* Processing may restart the timer */ - del_timer_sync(&conn->timer); + timer_delete_sync(&conn->timer); write_lock(&rxnet->conn_lock); list_del_init(&conn->proc_link); write_unlock(&rxnet->conn_lock); + if (conn->pmtud_probe) { + trace_rxrpc_pmtud_lost(conn, 0); + conn->peer->pmtud_probing = false; + conn->peer->pmtud_pending = true; + } + rxrpc_purge_queue(&conn->rx_queue); + rxrpc_free_skb(conn->tx_response, rxrpc_skb_put_response); rxrpc_kill_client_conn(conn); @@ -358,7 +367,7 @@ void rxrpc_put_connection(struct rxrpc_connection *conn, dead = __refcount_dec_and_test(&conn->ref, &r); trace_rxrpc_conn(debug_id, r - 1, why); if (dead) { - del_timer(&conn->timer); + timer_delete(&conn->timer); cancel_work(&conn->processor); if (in_softirq() || work_busy(&conn->processor) || @@ -463,7 +472,7 @@ void rxrpc_destroy_all_connections(struct rxrpc_net *rxnet) atomic_dec(&rxnet->nr_conns); - del_timer_sync(&rxnet->service_conn_reap_timer); + timer_delete_sync(&rxnet->service_conn_reap_timer); rxrpc_queue_work(&rxnet->service_conn_reaper); flush_workqueue(rxrpc_workqueue); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 16d49a861dbb..24aceb183c2c 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -27,80 +27,68 @@ static void rxrpc_proto_abort(struct rxrpc_call *call, rxrpc_seq_t seq, } /* - * Do TCP-style congestion management [RFC 5681]. + * Do TCP-style congestion management [RFC5681]. 
*/ static void rxrpc_congestion_management(struct rxrpc_call *call, - struct sk_buff *skb, - struct rxrpc_ack_summary *summary, - rxrpc_serial_t acked_serial) + struct rxrpc_ack_summary *summary) { - enum rxrpc_congest_change change = rxrpc_cong_no_change; - unsigned int cumulative_acks = call->cong_cumul_acks; - unsigned int cwnd = call->cong_cwnd; - bool resend = false; - - summary->flight_size = - (call->tx_top - call->acks_hard_ack) - summary->nr_acks; + summary->change = rxrpc_cong_no_change; + summary->in_flight = rxrpc_tx_in_flight(call); if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) { summary->retrans_timeo = true; - call->cong_ssthresh = max_t(unsigned int, - summary->flight_size / 2, 2); - cwnd = 1; - if (cwnd >= call->cong_ssthresh && - call->cong_mode == RXRPC_CALL_SLOW_START) { - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; - call->cong_tstamp = skb->tstamp; - cumulative_acks = 0; + call->cong_ssthresh = umax(summary->in_flight / 2, 2); + call->cong_cwnd = 1; + if (call->cong_cwnd >= call->cong_ssthresh && + call->cong_ca_state == RXRPC_CA_SLOW_START) { + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; + call->cong_tstamp = call->acks_latest_ts; + call->cong_cumul_acks = 0; } } - cumulative_acks += summary->nr_new_acks; - if (cumulative_acks > 255) - cumulative_acks = 255; - - summary->cwnd = call->cong_cwnd; - summary->ssthresh = call->cong_ssthresh; - summary->cumulative_acks = cumulative_acks; - summary->dup_acks = call->cong_dup_acks; + call->cong_cumul_acks += summary->nr_new_sacks; + call->cong_cumul_acks += summary->nr_new_hacks; + if (call->cong_cumul_acks > 255) + call->cong_cumul_acks = 255; - switch (call->cong_mode) { - case RXRPC_CALL_SLOW_START: - if (summary->saw_nacks) + switch (call->cong_ca_state) { + case RXRPC_CA_SLOW_START: + if (call->acks_nr_snacks > 0) goto packet_loss_detected; - if (summary->cumulative_acks > 0) - cwnd += 1; - if (cwnd >= call->cong_ssthresh) { - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; - call->cong_tstamp = skb->tstamp; + if (call->cong_cumul_acks > 0) + call->cong_cwnd += 1; + if (call->cong_cwnd >= call->cong_ssthresh) { + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; + call->cong_tstamp = call->acks_latest_ts; } goto out; - case RXRPC_CALL_CONGEST_AVOIDANCE: - if (summary->saw_nacks) + case RXRPC_CA_CONGEST_AVOIDANCE: + if (call->acks_nr_snacks > 0) goto packet_loss_detected; /* We analyse the number of packets that get ACK'd per RTT * period and increase the window if we managed to fill it. 
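 *
 * (A sketch of the check that follows, noting that srtt_us is kept
 * TCP-style with 3 bits of fraction, so srtt_us >> 3 is the smoothed
 * RTT in microseconds:
 *
 *	window_end = cong_tstamp + (srtt_us >> 3) microseconds;
 *	if (ack_time < window_end)
 *		return;			// window not yet filled
 * )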
*/ - if (call->peer->rtt_count == 0) + if (call->rtt_count == 0) goto out; - if (ktime_before(skb->tstamp, + if (ktime_before(call->acks_latest_ts, ktime_add_us(call->cong_tstamp, - call->peer->srtt_us >> 3))) + call->srtt_us >> 3))) goto out_no_clear_ca; - change = rxrpc_cong_rtt_window_end; - call->cong_tstamp = skb->tstamp; - if (cumulative_acks >= cwnd) - cwnd++; + summary->change = rxrpc_cong_rtt_window_end; + call->cong_tstamp = call->acks_latest_ts; + if (call->cong_cumul_acks >= call->cong_cwnd) + call->cong_cwnd++; goto out; - case RXRPC_CALL_PACKET_LOSS: - if (!summary->saw_nacks) + case RXRPC_CA_PACKET_LOSS: + if (call->acks_nr_snacks == 0) goto resume_normality; - if (summary->new_low_nack) { - change = rxrpc_cong_new_low_nack; + if (summary->new_low_snack) { + summary->change = rxrpc_cong_new_low_nack; call->cong_dup_acks = 1; if (call->cong_extra > 1) call->cong_extra = 1; @@ -111,31 +99,35 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, if (call->cong_dup_acks < 3) goto send_extra_data; - change = rxrpc_cong_begin_retransmission; - call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT; - call->cong_ssthresh = max_t(unsigned int, - summary->flight_size / 2, 2); - cwnd = call->cong_ssthresh + 3; + summary->change = rxrpc_cong_begin_retransmission; + call->cong_ca_state = RXRPC_CA_FAST_RETRANSMIT; + call->cong_ssthresh = umax(summary->in_flight / 2, 2); + call->cong_cwnd = call->cong_ssthresh + 3; call->cong_extra = 0; call->cong_dup_acks = 0; - resend = true; + summary->need_retransmit = true; + summary->in_fast_or_rto_recovery = true; goto out; - case RXRPC_CALL_FAST_RETRANSMIT: - if (!summary->new_low_nack) { - if (summary->nr_new_acks == 0) - cwnd += 1; + case RXRPC_CA_FAST_RETRANSMIT: + rxrpc_tlp_init(call); + summary->in_fast_or_rto_recovery = true; + if (!summary->new_low_snack) { + if (summary->nr_new_sacks == 0) + call->cong_cwnd += 1; call->cong_dup_acks++; if (call->cong_dup_acks == 2) { - change = rxrpc_cong_retransmit_again; + summary->change = rxrpc_cong_retransmit_again; call->cong_dup_acks = 0; - resend = true; + summary->need_retransmit = true; } } else { - change = rxrpc_cong_progress; - cwnd = call->cong_ssthresh; - if (!summary->saw_nacks) + summary->change = rxrpc_cong_progress; + call->cong_cwnd = call->cong_ssthresh; + if (call->acks_nr_snacks == 0) { + summary->exiting_fast_or_rto_recovery = true; goto resume_normality; + } } goto out; @@ -145,30 +137,25 @@ static void rxrpc_congestion_management(struct rxrpc_call *call, } resume_normality: - change = rxrpc_cong_cleared_nacks; + summary->change = rxrpc_cong_cleared_nacks; call->cong_dup_acks = 0; call->cong_extra = 0; - call->cong_tstamp = skb->tstamp; - if (cwnd < call->cong_ssthresh) - call->cong_mode = RXRPC_CALL_SLOW_START; + call->cong_tstamp = call->acks_latest_ts; + if (call->cong_cwnd < call->cong_ssthresh) + call->cong_ca_state = RXRPC_CA_SLOW_START; else - call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE; + call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE; out: - cumulative_acks = 0; + call->cong_cumul_acks = 0; out_no_clear_ca: - if (cwnd >= RXRPC_TX_MAX_WINDOW) - cwnd = RXRPC_TX_MAX_WINDOW; - call->cong_cwnd = cwnd; - call->cong_cumul_acks = cumulative_acks; - summary->mode = call->cong_mode; - trace_rxrpc_congest(call, summary, acked_serial, change); - if (resend) - rxrpc_resend(call, skb); + if (call->cong_cwnd >= RXRPC_TX_MAX_WINDOW) + call->cong_cwnd = RXRPC_TX_MAX_WINDOW; + trace_rxrpc_congest(call, summary); return; packet_loss_detected: - change = rxrpc_cong_saw_nack; - 
call->cong_mode = RXRPC_CALL_PACKET_LOSS; + summary->change = rxrpc_cong_saw_nack; + call->cong_ca_state = RXRPC_CA_PACKET_LOSS; call->cong_dup_acks = 0; goto send_extra_data; @@ -177,7 +164,7 @@ send_extra_data: * state. */ if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) || - summary->nr_acks != call->tx_top - call->acks_hard_ack) { + call->acks_nr_sacks != call->tx_top - call->tx_bottom) { call->cong_extra++; wake_up(&call->waitq); } @@ -189,26 +176,42 @@ send_extra_data: */ void rxrpc_congestion_degrade(struct rxrpc_call *call) { - ktime_t rtt, now; + ktime_t rtt, now, time_since; - if (call->cong_mode != RXRPC_CALL_SLOW_START && - call->cong_mode != RXRPC_CALL_CONGEST_AVOIDANCE) + if (call->cong_ca_state != RXRPC_CA_SLOW_START && + call->cong_ca_state != RXRPC_CA_CONGEST_AVOIDANCE) return; if (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_AWAIT_REPLY) return; - rtt = ns_to_ktime(call->peer->srtt_us * (1000 / 8)); + rtt = ns_to_ktime(call->srtt_us * (NSEC_PER_USEC / 8)); now = ktime_get_real(); - if (!ktime_before(ktime_add(call->tx_last_sent, rtt), now)) + time_since = ktime_sub(now, call->tx_last_sent); + if (ktime_before(time_since, rtt)) return; - trace_rxrpc_reset_cwnd(call, now); + trace_rxrpc_reset_cwnd(call, time_since, rtt); rxrpc_inc_stat(call->rxnet, stat_tx_data_cwnd_reset); call->tx_last_sent = now; - call->cong_mode = RXRPC_CALL_SLOW_START; - call->cong_ssthresh = max_t(unsigned int, call->cong_ssthresh, - call->cong_cwnd * 3 / 4); - call->cong_cwnd = max_t(unsigned int, call->cong_cwnd / 2, RXRPC_MIN_CWND); + call->cong_ca_state = RXRPC_CA_SLOW_START; + call->cong_ssthresh = umax(call->cong_ssthresh, call->cong_cwnd * 3 / 4); + call->cong_cwnd = umax(call->cong_cwnd / 2, RXRPC_MIN_CWND); +} + +/* + * Add an RTT sample derived from an ACK'd DATA packet. + */ +static void rxrpc_add_data_rtt_sample(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + int ix) +{ + ktime_t xmit_ts = ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]); + + rxrpc_call_add_rtt(call, rxrpc_rtt_rx_data_ack, -1, + summary->acked_serial, summary->ack_serial, + xmit_ts, call->acks_latest_ts); + __clear_bit(ix, &tq->rtt_samples); /* Prevent repeat RTT sample */ } /* @@ -217,37 +220,120 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call) static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to, struct rxrpc_ack_summary *summary) { - struct rxrpc_txbuf *txb; - bool rot_last = false; + struct rxrpc_txqueue *tq = call->tx_queue; + rxrpc_seq_t seq = call->tx_bottom + 1; + bool rot_last = false, trace = false; - list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) { - if (before_eq(txb->seq, call->acks_hard_ack)) - continue; - if (txb->flags & RXRPC_LAST_PACKET) { + _enter("%x,%x", call->tx_bottom, to); + + trace_rxrpc_tx_rotate(call, seq, to); + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate); + + if (call->acks_lowest_nak == call->tx_bottom) { + call->acks_lowest_nak = to; + } else if (after(to, call->acks_lowest_nak)) { + summary->new_low_snack = true; + call->acks_lowest_nak = to; + } + + /* We may have a left over fully-consumed buffer at the front that we + * couldn't drop before (rotate_and_keep below). 
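+ *
+ * (Bucket arithmetic used throughout this function, as a sketch: each
+ * rxrpc_txqueue covers RXRPC_NR_TXQUEUE consecutive packets, so
+ *
+ *	ix = seq - call->tx_qbase;	// slot in tq->bufs[]
+ *
+ * and the front queue becomes freeable once the rotation point
+ * reaches tx_qbase + RXRPC_NR_TXQUEUE.)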
+ */ + if (seq == call->tx_qbase + RXRPC_NR_TXQUEUE) { + call->tx_qbase += RXRPC_NR_TXQUEUE; + call->tx_queue = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free); + kfree(tq); + tq = call->tx_queue; + } + + do { + unsigned int ix = seq - call->tx_qbase; + + _debug("tq=%x seq=%x i=%d f=%x", tq->qbase, seq, ix, tq->bufs[ix]->flags); + if (tq->bufs[ix]->flags & RXRPC_LAST_PACKET) { set_bit(RXRPC_CALL_TX_LAST, &call->flags); rot_last = true; } - if (txb->seq == to) - break; - } - if (rot_last) - set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags); + if (summary->acked_serial == tq->segment_serial[ix] && + test_bit(ix, &tq->rtt_samples)) + rxrpc_add_data_rtt_sample(call, summary, tq, ix); + + if (ix == tq->nr_reported_acks) { + /* Packet directly hard ACK'd. */ + tq->nr_reported_acks++; + rxrpc_input_rack_one(call, summary, tq, ix); + if (seq == call->tlp_seq) + summary->tlp_probe_acked = true; + summary->nr_new_hacks++; + __set_bit(ix, &tq->segment_acked); + trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_hack); + } else if (test_bit(ix, &tq->segment_acked)) { + /* Soft ACK -> hard ACK. */ + call->acks_nr_sacks--; + trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_sack); + } else { + /* Soft NAK -> hard ACK. */ + call->acks_nr_snacks--; + rxrpc_input_rack_one(call, summary, tq, ix); + if (seq == call->tlp_seq) + summary->tlp_probe_acked = true; + summary->nr_new_hacks++; + __set_bit(ix, &tq->segment_acked); + trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_snak); + } - _enter("%x,%x,%x,%d", to, call->acks_hard_ack, call->tx_top, rot_last); + call->tx_nr_sent--; + if (__test_and_clear_bit(ix, &tq->segment_lost)) + call->tx_nr_lost--; + if (__test_and_clear_bit(ix, &tq->segment_retransmitted)) + call->tx_nr_resent--; + __clear_bit(ix, &tq->ever_retransmitted); - if (call->acks_lowest_nak == call->acks_hard_ack) { - call->acks_lowest_nak = to; - } else if (after(to, call->acks_lowest_nak)) { - summary->new_low_nack = true; - call->acks_lowest_nak = to; + rxrpc_put_txbuf(tq->bufs[ix], rxrpc_txbuf_put_rotated); + tq->bufs[ix] = NULL; + + WRITE_ONCE(call->tx_bottom, seq); + trace_rxrpc_txqueue(call, (rot_last ? + rxrpc_txqueue_rotate_last : + rxrpc_txqueue_rotate)); + + seq++; + trace = true; + if (!(seq & RXRPC_TXQ_MASK)) { + trace_rxrpc_rack_update(call, summary); + trace = false; + prefetch(tq->next); + if (tq != call->tx_qtail) { + call->tx_qbase += RXRPC_NR_TXQUEUE; + call->tx_queue = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free); + kfree(tq); + tq = call->tx_queue; + } else { + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_keep); + tq = NULL; + break; + } + } + + } while (before_eq(seq, to)); + + if (trace) + trace_rxrpc_rack_update(call, summary); + + if (rot_last) { + set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags); + if (tq) { + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free); + kfree(tq); + call->tx_queue = NULL; + } } - smp_store_release(&call->acks_hard_ack, to); + _debug("%x,%x,%x,%d", to, call->tx_bottom, call->tx_top, rot_last); - trace_rxrpc_txqueue(call, (rot_last ? 
- rxrpc_txqueue_rotate_last : - rxrpc_txqueue_rotate)); wake_up(&call->waitq); return rot_last; } @@ -263,13 +349,10 @@ static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun, { ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags)); - call->resend_at = KTIME_MAX; - trace_rxrpc_timer_can(call, rxrpc_timer_trace_resend); - - if (unlikely(call->cong_last_nack)) { - rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); - call->cong_last_nack = NULL; - } + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF; + call->rack_timo_at = KTIME_MAX; + trace_rxrpc_rack_timer(call, 0, false); + trace_rxrpc_timer_can(call, rxrpc_timer_trace_rack_off + call->rack_timer_mode); switch (__rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_SEND_REQUEST: @@ -365,11 +448,19 @@ static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); bool last = sp->hdr.flags & RXRPC_LAST_PACKET; + spin_lock_irq(&call->recvmsg_queue.lock); + __skb_queue_tail(&call->recvmsg_queue, skb); rxrpc_input_update_ack_window(call, window, wtop); trace_rxrpc_receive(call, last ? why + 1 : why, sp->hdr.serial, sp->hdr.seq); if (last) + /* Change the state inside the lock so that recvmsg syncs + * correctly with it and using sendmsg() to send a reply + * doesn't race. + */ rxrpc_end_rx_phase(call, sp->hdr.serial); + + spin_unlock_irq(&call->recvmsg_queue.lock); } /* @@ -442,7 +533,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_get_skb(skb, rxrpc_skb_get_to_recvmsg); - spin_lock(&call->recvmsg_queue.lock); rxrpc_input_queue_data(call, skb, window, wtop, rxrpc_receive_queue); *_notify = true; @@ -464,8 +554,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb, rxrpc_receive_queue_oos); } - spin_unlock(&call->recvmsg_queue.lock); - call->ackr_sack_base = sack; } else { unsigned int slot; @@ -530,7 +618,7 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb unsigned int offset = sizeof(struct rxrpc_wire_header); unsigned int len = skb->len - offset; bool notify = false; - int ack_reason = 0; + int ack_reason = 0, count = 1, stat_ix; while (sp->hdr.flags & RXRPC_JUMBO_PACKET) { if (len < RXRPC_JUMBO_SUBPKTLEN) @@ -559,12 +647,16 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb sp->hdr.serial++; offset += RXRPC_JUMBO_SUBPKTLEN; len -= RXRPC_JUMBO_SUBPKTLEN; + count++; } sp->offset = offset; sp->len = len; rxrpc_input_data_one(call, skb, ¬ify, &ack_serial, &ack_reason); + stat_ix = umin(count, ARRAY_SIZE(call->rxnet->stat_rx_jumbo)) - 1; + atomic_inc(&call->rxnet->stat_rx_jumbo[stat_ix]); + if (ack_reason > 0) { rxrpc_send_ACK(call, ack_reason, ack_serial, rxrpc_propose_ack_input_data); @@ -573,7 +665,7 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb rxrpc_propose_delay_ACK(call, sp->hdr.serial, rxrpc_propose_ack_input_data); } - if (notify) { + if (notify && !test_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags)) { trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial); rxrpc_notify_socket(call); } @@ -667,7 +759,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); smp_mb(); /* Read data before setting avail bit */ set_bit(i, &call->rtt_avail); - rxrpc_peer_add_rtt(call, type, i, acked_serial, ack_serial, + rxrpc_call_add_rtt(call, type, i, acked_serial, ack_serial, sent_at, resp_time); matched = true; } @@ -677,7 
+769,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, */ if (after(acked_serial, orig_serial)) { trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_obsolete, i, - orig_serial, acked_serial, 0, 0); + orig_serial, acked_serial, 0, 0, 0); clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); smp_wmb(); set_bit(i, &call->rtt_avail); @@ -685,7 +777,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call, } if (!matched) - trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0); + trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0, 0); } /* @@ -695,10 +787,13 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb struct rxrpc_acktrailer *trailer) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - struct rxrpc_peer *peer; - unsigned int mtu; + struct rxrpc_peer *peer = call->peer; + unsigned int max_data, capacity; bool wake = false; - u32 rwind = ntohl(trailer->rwind); + u32 max_mtu = ntohl(trailer->maxMTU); + //u32 if_mtu = ntohl(trailer->ifMTU); + u32 rwind = ntohl(trailer->rwind); + u32 jumbo_max = ntohl(trailer->jumbo_max); if (rwind > RXRPC_TX_MAX_WINDOW) rwind = RXRPC_TX_MAX_WINDOW; @@ -709,54 +804,147 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb call->tx_winsize = rwind; } - mtu = min(ntohl(trailer->maxMTU), ntohl(trailer->ifMTU)); + max_mtu = clamp(max_mtu, 500, 65535); + peer->ackr_max_data = max_mtu; + + if (max_mtu < peer->max_data) { + trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_mtu, + rxrpc_pmtud_reduce_ack); + peer->max_data = max_mtu; + } + + max_data = umin(max_mtu, peer->max_data); + capacity = max_data; + capacity += sizeof(struct rxrpc_jumbo_header); /* First subpacket has main hdr, not jumbo */ + capacity /= sizeof(struct rxrpc_jumbo_header) + RXRPC_JUMBO_DATALEN; - peer = call->peer; - if (mtu < peer->maxdata) { - spin_lock(&peer->lock); - peer->maxdata = mtu; - peer->mtu = mtu + peer->hdrsize; - spin_unlock(&peer->lock); + if (jumbo_max == 0) { + /* The peer says it supports pmtu discovery */ + peer->ackr_adv_pmtud = true; + } else { + peer->ackr_adv_pmtud = false; + capacity = clamp(capacity, 1, jumbo_max); } + call->tx_jumbo_max = capacity; + if (wake) wake_up(&call->waitq); } +#if defined(CONFIG_X86) && __GNUC__ && !defined(__clang__) +/* Clang doesn't support the %z constraint modifier */ +#define shiftr_adv_rotr(shift_from, rotate_into) ({ \ + asm(" shr%z1 %1\n" \ + " inc %0\n" \ + " rcr%z2 %2\n" \ + : "+d"(shift_from), "+m"(*(shift_from)), "+rm"(rotate_into) \ + ); \ + }) +#else +#define shiftr_adv_rotr(shift_from, rotate_into) ({ \ + typeof(rotate_into) __bit0 = *(shift_from) & 1; \ + *(shift_from) >>= 1; \ + shift_from++; \ + rotate_into >>= 1; \ + rotate_into |= __bit0 << (sizeof(rotate_into) * 8 - 1); \ + }) +#endif + /* - * Determine how many nacks from the previous ACK have now been satisfied. + * Deal with RTT samples from soft ACKs. 
*/ -static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, - struct rxrpc_ack_summary *summary, - rxrpc_seq_t seq) +static void rxrpc_input_soft_rtt(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq) { - struct sk_buff *skb = call->cong_last_nack; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - unsigned int i, new_acks = 0, retained_nacks = 0; - rxrpc_seq_t old_seq = sp->ack.first_ack; - u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); + for (int ix = 0; ix < RXRPC_NR_TXQUEUE; ix++) + if (summary->acked_serial == tq->segment_serial[ix]) + return rxrpc_add_data_rtt_sample(call, summary, tq, ix); +} - if (after_eq(seq, old_seq + sp->ack.nr_acks)) { - summary->nr_new_acks += sp->ack.nr_nacks; - summary->nr_new_acks += seq - (old_seq + sp->ack.nr_acks); - summary->nr_retained_nacks = 0; - } else if (seq == old_seq) { - summary->nr_retained_nacks = sp->ack.nr_nacks; - } else { - for (i = 0; i < sp->ack.nr_acks; i++) { - if (acks[i] == RXRPC_ACK_TYPE_NACK) { - if (before(old_seq + i, seq)) - new_acks++; - else - retained_nacks++; - } +/* + * Process a batch of soft ACKs specific to a transmission queue segment. + */ +static void rxrpc_input_soft_ack_tq(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned long extracted_acks, + int nr_reported, + rxrpc_seq_t seq, + rxrpc_seq_t *lowest_nak) +{ + unsigned long old_reported = 0, flipped, new_acks = 0; + unsigned long a_to_n, n_to_a = 0; + int new, a, n; + + if (tq->nr_reported_acks > 0) + old_reported = ~0UL >> (RXRPC_NR_TXQUEUE - tq->nr_reported_acks); + + _enter("{%x,%lx,%d},%lx,%d,%x", + tq->qbase, tq->segment_acked, tq->nr_reported_acks, + extracted_acks, nr_reported, seq); + + _debug("[%x]", tq->qbase); + _debug("tq %16lx %u", tq->segment_acked, tq->nr_reported_acks); + _debug("sack %16lx %u", extracted_acks, nr_reported); + + /* See how many previously logged ACKs/NAKs have flipped. */ + flipped = (tq->segment_acked ^ extracted_acks) & old_reported; + if (flipped) { + n_to_a = ~tq->segment_acked & flipped; /* Old NAK -> ACK */ + a_to_n = tq->segment_acked & flipped; /* Old ACK -> NAK */ + a = hweight_long(n_to_a); + n = hweight_long(a_to_n); + _debug("flip %16lx", flipped); + _debug("ntoa %16lx %d", n_to_a, a); + _debug("aton %16lx %d", a_to_n, n); + call->acks_nr_sacks += a - n; + call->acks_nr_snacks += n - a; + summary->nr_new_sacks += a; + summary->nr_new_snacks += n; + } + + /* See how many new ACKs/NAKs have been acquired. 
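+	 *
+	 * (Purely illustrative: if nr_reported_acks was 3, and assuming
+	 * RXRPC_NR_TXQUEUE is the width of unsigned long, then
+	 *
+	 *	old_reported = ~0UL >> (RXRPC_NR_TXQUEUE - 3);	// 0b111
+	 *
+	 * so new_acks = extracted_acks & ~old_reported keeps only the
+	 * slots never reported by a previous ACK.)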
*/ + new = nr_reported - tq->nr_reported_acks; + if (new > 0) { + new_acks = extracted_acks & ~old_reported; + if (new_acks) { + a = hweight_long(new_acks); + n = new - a; + _debug("new_a %16lx new=%d a=%d n=%d", new_acks, new, a, n); + call->acks_nr_sacks += a; + call->acks_nr_snacks += n; + summary->nr_new_sacks += a; + summary->nr_new_snacks += n; + } else { + call->acks_nr_snacks += new; + summary->nr_new_snacks += new; } + } + + tq->nr_reported_acks = nr_reported; + tq->segment_acked = extracted_acks; + trace_rxrpc_apply_acks(call, tq); - summary->nr_new_acks += new_acks; - summary->nr_retained_nacks = retained_nacks; + if (extracted_acks != ~0UL) { + rxrpc_seq_t lowest = seq + ffz(extracted_acks); + + if (before(lowest, *lowest_nak)) + *lowest_nak = lowest; } - return old_seq + sp->ack.nr_acks; + if (summary->acked_serial) + rxrpc_input_soft_rtt(call, summary, tq); + + new_acks |= n_to_a; + if (new_acks) + rxrpc_input_rack(call, summary, tq, new_acks); + + if (call->tlp_serial && + rxrpc_seq_in_txq(tq, call->tlp_seq) && + test_bit(call->tlp_seq - tq->qbase, &new_acks)) + summary->tlp_probe_acked = true; } /* @@ -770,39 +958,50 @@ static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call, */ static void rxrpc_input_soft_acks(struct rxrpc_call *call, struct rxrpc_ack_summary *summary, - struct sk_buff *skb, - rxrpc_seq_t seq, - rxrpc_seq_t since) + struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - unsigned int i, old_nacks = 0; + struct rxrpc_txqueue *tq = call->tx_queue; + unsigned long extracted = ~0UL; + unsigned int nr = 0; + rxrpc_seq_t seq = call->acks_hard_ack + 1; rxrpc_seq_t lowest_nak = seq + sp->ack.nr_acks; u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); - for (i = 0; i < sp->ack.nr_acks; i++) { - if (acks[i] == RXRPC_ACK_TYPE_ACK) { - summary->nr_acks++; - if (after_eq(seq, since)) - summary->nr_new_acks++; - } else { - summary->saw_nacks = true; - if (before(seq, since)) { - /* Overlap with previous ACK */ - old_nacks++; - } else { - summary->nr_new_nacks++; - sp->ack.nr_nacks++; - } + _enter("%x,%x,%u", tq->qbase, seq, sp->ack.nr_acks); + + while (after(seq, tq->qbase + RXRPC_NR_TXQUEUE - 1)) + tq = tq->next; - if (before(seq, lowest_nak)) - lowest_nak = seq; + for (unsigned int i = 0; i < sp->ack.nr_acks; i++) { + /* Decant ACKs until we hit a txqueue boundary. */ + shiftr_adv_rotr(acks, extracted); + if (i == 256) { + acks -= i; + i = 0; } seq++; + nr++; + if ((seq & RXRPC_TXQ_MASK) != 0) + continue; + + _debug("bound %16lx %u", extracted, nr); + + rxrpc_input_soft_ack_tq(call, summary, tq, extracted, RXRPC_NR_TXQUEUE, + seq - RXRPC_NR_TXQUEUE, &lowest_nak); + extracted = ~0UL; + nr = 0; + tq = tq->next; + prefetch(tq); } - if (lowest_nak != call->acks_lowest_nak) { - call->acks_lowest_nak = lowest_nak; - summary->new_low_nack = true; + if (nr) { + unsigned int nr_reported = seq & RXRPC_TXQ_MASK; + + extracted >>= RXRPC_NR_TXQUEUE - nr_reported; + _debug("tail %16lx %u", extracted, nr_reported); + rxrpc_input_soft_ack_tq(call, summary, tq, extracted, nr_reported, + seq & ~RXRPC_TXQ_MASK, &lowest_nak); } /* We *can* have more nacks than we did - the peer is permitted to drop @@ -810,9 +1009,14 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, * possible for the nack distribution to change whilst the number of * nacks stays the same or goes down. 
*/ - if (old_nacks < summary->nr_retained_nacks) - summary->nr_new_acks += summary->nr_retained_nacks - old_nacks; - summary->nr_retained_nacks = old_nacks; + if (lowest_nak != call->acks_lowest_nak) { + call->acks_lowest_nak = lowest_nak; + summary->new_low_snack = true; + } + + _debug("summary A=%d+%d N=%d+%d", + call->acks_nr_sacks, summary->nr_new_sacks, + call->acks_nr_snacks, summary->nr_new_snacks); } /* @@ -820,21 +1024,21 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, * with respect to the ack state conveyed by preceding ACKs. */ static bool rxrpc_is_ack_valid(struct rxrpc_call *call, - rxrpc_seq_t first_pkt, rxrpc_seq_t prev_pkt) + rxrpc_seq_t hard_ack, rxrpc_seq_t prev_pkt) { - rxrpc_seq_t base = READ_ONCE(call->acks_first_seq); + rxrpc_seq_t base = READ_ONCE(call->acks_hard_ack); - if (after(first_pkt, base)) + if (after(hard_ack, base)) return true; /* The window advanced */ - if (before(first_pkt, base)) + if (before(hard_ack, base)) return false; /* firstPacket regressed */ if (after_eq(prev_pkt, call->acks_prev_seq)) return true; /* previousPacket hasn't regressed. */ /* Some rx implementations put a serial number in previousPacket. */ - if (after_eq(prev_pkt, base + call->tx_winsize)) + if (after(prev_pkt, base + call->tx_winsize)) return false; return true; } @@ -852,53 +1056,34 @@ static bool rxrpc_is_ack_valid(struct rxrpc_call *call, static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_ack_summary summary = { 0 }; - struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_acktrailer trailer; - rxrpc_serial_t ack_serial, acked_serial; - rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt, since; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt; int nr_acks, offset, ioffset; _enter(""); offset = sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket); - ack_serial = sp->hdr.serial; - acked_serial = sp->ack.acked_serial; - first_soft_ack = sp->ack.first_ack; - prev_pkt = sp->ack.prev_ack; - nr_acks = sp->ack.nr_acks; - hard_ack = first_soft_ack - 1; - summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ? - sp->ack.reason : RXRPC_ACK__INVALID); - - trace_rxrpc_rx_ack(call, ack_serial, acked_serial, - first_soft_ack, prev_pkt, - summary.ack_reason, nr_acks); - rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]); + summary.ack_serial = sp->hdr.serial; + first_soft_ack = sp->ack.first_ack; + prev_pkt = sp->ack.prev_ack; + nr_acks = sp->ack.nr_acks; + hard_ack = first_soft_ack - 1; + summary.acked_serial = sp->ack.acked_serial; + summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ? + sp->ack.reason : RXRPC_ACK__INVALID); - if (acked_serial != 0) { - switch (summary.ack_reason) { - case RXRPC_ACK_PING_RESPONSE: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_ping_response); - break; - case RXRPC_ACK_REQUESTED: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_requested_ack); - break; - default: - rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial, - rxrpc_rtt_rx_other_ack); - break; - } - } + trace_rxrpc_rx_ack(call, sp); + rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]); + prefetch(call->tx_queue); /* If we get an EXCEEDS_WINDOW ACK from the server, it probably * indicates that the client address changed due to NAT. The server * lost the call because it switched to a different peer. 
*/ if (unlikely(summary.ack_reason == RXRPC_ACK_EXCEEDS_WINDOW) && - first_soft_ack == 1 && + hard_ack == 0 && prev_pkt == 0 && rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, @@ -911,9 +1096,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) * if we still have it buffered to the beginning. */ if (unlikely(summary.ack_reason == RXRPC_ACK_OUT_OF_SEQUENCE) && - first_soft_ack == 1 && + hard_ack == 0 && prev_pkt == 0 && - call->acks_hard_ack == 0 && + call->tx_bottom == 0 && rxrpc_is_client_call(call)) { rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, 0, -ENETRESET); @@ -921,11 +1106,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) } /* Discard any out-of-order or duplicate ACKs (outside lock). */ - if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) { - trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial, - first_soft_ack, call->acks_first_seq, - prev_pkt, call->acks_prev_seq); - goto send_response; + if (!rxrpc_is_ack_valid(call, hard_ack, prev_pkt)) { + trace_rxrpc_rx_discard_ack(call, summary.ack_serial, hard_ack, prev_pkt); + goto send_response; /* Still respond if requested. */ } trailer.maxMTU = 0; @@ -937,34 +1120,30 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (nr_acks > 0) skb_condense(skb); - if (call->cong_last_nack) { - since = rxrpc_input_check_prev_ack(call, &summary, first_soft_ack); - rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack); - call->cong_last_nack = NULL; - } else { - summary.nr_new_acks = first_soft_ack - call->acks_first_seq; - call->acks_lowest_nak = first_soft_ack + nr_acks; - since = first_soft_ack; - } - - call->acks_latest_ts = skb->tstamp; - call->acks_first_seq = first_soft_ack; + call->acks_latest_ts = ktime_get_real(); + call->acks_hard_ack = hard_ack; call->acks_prev_seq = prev_pkt; - switch (summary.ack_reason) { - case RXRPC_ACK_PING: - break; - default: - if (acked_serial && after(acked_serial, call->acks_highest_serial)) - call->acks_highest_serial = acked_serial; - break; + if (summary.acked_serial) { + switch (summary.ack_reason) { + case RXRPC_ACK_PING_RESPONSE: + rxrpc_complete_rtt_probe(call, call->acks_latest_ts, + summary.acked_serial, summary.ack_serial, + rxrpc_rtt_rx_ping_response); + break; + default: + if (after(summary.acked_serial, call->acks_highest_serial)) + call->acks_highest_serial = summary.acked_serial; + summary.rtt_sample_avail = true; + break; + } } /* Parse rwind and mtu sizes if provided. */ if (trailer.maxMTU) rxrpc_input_ack_trailer(call, skb, &trailer); - if (first_soft_ack == 0) + if (hard_ack + 1 == 0) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_zero); /* Ignore ACKs unless we are or have just been transmitting. 
*/ @@ -978,13 +1157,13 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) goto send_response; } - if (before(hard_ack, call->acks_hard_ack) || + if (before(hard_ack, call->tx_bottom) || after(hard_ack, call->tx_top)) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_outside_window); if (nr_acks > call->tx_top - hard_ack) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_sack_overflow); - if (after(hard_ack, call->acks_hard_ack)) { + if (after(hard_ack, call->tx_bottom)) { if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) { rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ack); goto send_response; @@ -994,25 +1173,30 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) if (nr_acks > 0) { if (offset > (int)skb->len - nr_acks) return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_short_sack); - rxrpc_input_soft_acks(call, &summary, skb, first_soft_ack, since); - rxrpc_get_skb(skb, rxrpc_skb_get_last_nack); - call->cong_last_nack = skb; + rxrpc_input_soft_acks(call, &summary, skb); } if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) && - summary.nr_acks == call->tx_top - hard_ack && + call->acks_nr_sacks == call->tx_top - hard_ack && rxrpc_is_client_call(call)) - rxrpc_propose_ping(call, ack_serial, + rxrpc_propose_ping(call, summary.ack_serial, rxrpc_propose_ack_ping_for_lost_reply); - rxrpc_congestion_management(call, skb, &summary, acked_serial); + /* Drive the congestion management algorithm first and then RACK-TLP as + * the latter depends on the state/change in state in the former. + */ + rxrpc_congestion_management(call, &summary); + rxrpc_rack_detect_loss_and_arm_timer(call, &summary); + rxrpc_tlp_process_ack(call, &summary); + if (call->tlp_serial && after_eq(summary.acked_serial, call->tlp_serial)) + call->tlp_serial = 0; send_response: if (summary.ack_reason == RXRPC_ACK_PING) - rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial, + rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, summary.ack_serial, rxrpc_propose_ack_respond_to_ping); else if (sp->hdr.flags & RXRPC_REQUEST_ACK) - rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, ack_serial, + rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, summary.ack_serial, rxrpc_propose_ack_respond_to_ack); } @@ -1111,5 +1295,5 @@ void rxrpc_implicit_end_call(struct rxrpc_call *call, struct sk_buff *skb) break; } - rxrpc_input_call_event(call, skb); + rxrpc_input_call_event(call); } diff --git a/net/rxrpc/input_rack.c b/net/rxrpc/input_rack.c new file mode 100644 index 000000000000..13c371261e0a --- /dev/null +++ b/net/rxrpc/input_rack.c @@ -0,0 +1,418 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* RACK-TLP [RFC8958] Implementation + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "ar-internal.h" + +static bool rxrpc_rack_sent_after(ktime_t t1, rxrpc_seq_t seq1, + ktime_t t2, rxrpc_seq_t seq2) +{ + if (ktime_after(t1, t2)) + return true; + return t1 == t2 && after(seq1, seq2); +} + +/* + * Mark a packet lost. + */ +static void rxrpc_rack_mark_lost(struct rxrpc_call *call, + struct rxrpc_txqueue *tq, unsigned int ix) +{ + if (__test_and_set_bit(ix, &tq->segment_lost)) { + if (__test_and_clear_bit(ix, &tq->segment_retransmitted)) + call->tx_nr_resent--; + } else { + call->tx_nr_lost++; + } + tq->segment_xmit_ts[ix] = UINT_MAX; +} + +/* + * Get the transmission time of a packet in the Tx queue. 
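+ *
+ * (A sketch of the encoding assumed here: per-segment times are
+ * 32-bit microsecond offsets from tq->xmit_ts_base, with UINT_MAX as
+ * the "marked lost" sentinel, i.e.
+ *
+ *	ts = off == UINT_MAX ? KTIME_MAX : ktime_add_us(base, off);
+ * )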
+ */ +static ktime_t rxrpc_get_xmit_ts(const struct rxrpc_txqueue *tq, unsigned int ix) +{ + if (tq->segment_xmit_ts[ix] == UINT_MAX) + return KTIME_MAX; + return ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]); +} + +/* + * Get a bitmask of nack bits for a queue segment and mask off any that aren't + * yet reported. + */ +static unsigned long rxrpc_tq_nacks(const struct rxrpc_txqueue *tq) +{ + unsigned long nacks = ~tq->segment_acked; + + if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE) + nacks &= (1UL << tq->nr_reported_acks) - 1; + return nacks; +} + +/* + * Update the RACK state for the most recently sent packet that has been + * delivered [RFC8958 6.2 Step 2]. + */ +static void rxrpc_rack_update(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix) +{ + rxrpc_seq_t seq = tq->qbase + ix; + ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); + ktime_t rtt = ktime_sub(call->acks_latest_ts, xmit_ts); + + if (__test_and_clear_bit(ix, &tq->segment_lost)) + call->tx_nr_lost--; + + if (test_bit(ix, &tq->segment_retransmitted)) { + /* Use Rx.serial instead of TCP.ACK.ts_option.echo_reply. */ + if (before(call->acks_highest_serial, tq->segment_serial[ix])) + return; + if (rtt < minmax_get(&call->min_rtt)) + return; + } + + /* The RACK algorithm requires the segment ACKs to be traversed in + * order of segment transmission - but the only thing this seems to + * matter for is that RACK.rtt is set to the rtt of the most recently + * transmitted segment. We should be able to achieve the same by only + * setting RACK.rtt if the xmit time is greater. + */ + if (ktime_after(xmit_ts, call->rack_rtt_ts)) { + call->rack_rtt = rtt; + call->rack_rtt_ts = xmit_ts; + } + + if (rxrpc_rack_sent_after(xmit_ts, seq, call->rack_xmit_ts, call->rack_end_seq)) { + call->rack_rtt = rtt; + call->rack_xmit_ts = xmit_ts; + call->rack_end_seq = seq; + } +} + +/* + * Detect data segment reordering [RFC8958 6.2 Step 3]. + */ +static void rxrpc_rack_detect_reordering(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix) +{ + rxrpc_seq_t seq = tq->qbase + ix; + + /* Track the highest sequence number so far ACK'd. This is not + * necessarily the same as ack.firstPacket + ack.nAcks - 1 as the peer + * could put a NACK in the last SACK slot. + */ + if (after(seq, call->rack_fack)) + call->rack_fack = seq; + else if (before(seq, call->rack_fack) && + test_bit(ix, &tq->segment_retransmitted)) + call->rack_reordering_seen = true; +} + +void rxrpc_input_rack_one(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned int ix) +{ + rxrpc_rack_update(call, summary, tq, ix); + rxrpc_rack_detect_reordering(call, summary, tq, ix); +} + +void rxrpc_input_rack(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary, + struct rxrpc_txqueue *tq, + unsigned long new_acks) +{ + while (new_acks) { + unsigned int ix = __ffs(new_acks); + + __clear_bit(ix, &new_acks); + rxrpc_input_rack_one(call, summary, tq, ix); + } + + trace_rxrpc_rack_update(call, summary); +} + +/* + * Update the reordering window [RFC8958 6.2 Step 4]. Returns the updated + * duration of the reordering window. + * + * Note that the Rx protocol doesn't have a 'DSACK option' per se, but ACKs can + * be given a 'DUPLICATE' reason with the serial number referring to the + * duplicated DATA packet. 
Rx does not inform as to whether this was a + * reception of the same packet twice or of a retransmission of a packet we + * already received (though this could be determined by the transmitter based + * on the serial number). + */ +static ktime_t rxrpc_rack_update_reo_wnd(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary) +{ + rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */ + rxrpc_seq_t snd_nxt = call->tx_transmitted + 1; /* Next seq to be sent */ + bool have_dsack_option = summary->ack_reason == RXRPC_ACK_DUPLICATE; + int dup_thresh = 3; + + /* DSACK-based reordering window adaptation */ + if (!call->rack_dsack_round_none && + after_eq(snd_una, call->rack_dsack_round)) + call->rack_dsack_round_none = true; + + /* Grow the reordering window per round that sees DSACK. Reset the + * window after 16 DSACK-free recoveries. + */ + if (call->rack_dsack_round_none && have_dsack_option) { + call->rack_dsack_round_none = false; + call->rack_dsack_round = snd_nxt; + call->rack_reo_wnd_mult++; + call->rack_reo_wnd_persist = 16; + } else if (summary->exiting_fast_or_rto_recovery) { + call->rack_reo_wnd_persist--; + if (call->rack_reo_wnd_persist <= 0) + call->rack_reo_wnd_mult = 1; + } + + if (!call->rack_reordering_seen) { + if (summary->in_fast_or_rto_recovery) + return 0; + if (call->acks_nr_sacks >= dup_thresh) + return 0; + } + + return us_to_ktime(umin(call->rack_reo_wnd_mult * minmax_get(&call->min_rtt) / 4, + call->srtt_us >> 3)); +} + +/* + * Detect losses [RFC8958 6.2 Step 5]. + */ +static ktime_t rxrpc_rack_detect_loss(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary) +{ + struct rxrpc_txqueue *tq; + ktime_t timeout = 0, lost_after, now = ktime_get_real(); + + call->rack_reo_wnd = rxrpc_rack_update_reo_wnd(call, summary); + lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd); + trace_rxrpc_rack_scan_loss(call); + + for (tq = call->tx_queue; tq; tq = tq->next) { + unsigned long nacks = rxrpc_tq_nacks(tq); + + if (after(tq->qbase, call->tx_transmitted)) + break; + trace_rxrpc_rack_scan_loss_tq(call, tq, nacks); + + /* Skip ones marked lost but not yet retransmitted */ + nacks &= ~tq->segment_lost | tq->segment_retransmitted; + + while (nacks) { + unsigned int ix = __ffs(nacks); + rxrpc_seq_t seq = tq->qbase + ix; + ktime_t remaining; + ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); + + __clear_bit(ix, &nacks); + + if (rxrpc_rack_sent_after(call->rack_xmit_ts, call->rack_end_seq, + xmit_ts, seq)) { + remaining = ktime_sub(ktime_add(xmit_ts, lost_after), now); + if (remaining <= 0) { + rxrpc_rack_mark_lost(call, tq, ix); + trace_rxrpc_rack_detect_loss(call, summary, seq); + } else { + timeout = max(remaining, timeout); + } + } + } + } + + return timeout; +} + +/* + * Detect losses and set a timer to retry the detection [RFC8958 6.2 Step 5]. + */ +void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call, + struct rxrpc_ack_summary *summary) +{ + ktime_t timeout = rxrpc_rack_detect_loss(call, summary); + + if (timeout) { + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RACK_REORDER; + call->rack_timo_at = ktime_add(ktime_get_real(), timeout); + trace_rxrpc_rack_timer(call, timeout, false); + trace_rxrpc_timer_set(call, timeout, rxrpc_timer_trace_rack_reo); + } +} + +/* + * Handle RACK-TLP RTO expiration [RFC8958 6.3]. 
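+ *
+ * (The marking rule applied below, as a sketch: an unacked segment is
+ * declared lost on RTO if it is the lowest outstanding sequence or if
+ *
+ *	xmit_ts < now - (RACK.rtt + RACK.reo_wnd)
+ *
+ * i.e. it was transmitted longer ago than the loss deadline.)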
+ */ +static void rxrpc_rack_mark_losses_on_rto(struct rxrpc_call *call) +{ + struct rxrpc_txqueue *tq; + rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */ + ktime_t lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd); + ktime_t deadline = ktime_sub(ktime_get_real(), lost_after); + + for (tq = call->tx_queue; tq; tq = tq->next) { + unsigned long unacked = ~tq->segment_acked; + + trace_rxrpc_rack_mark_loss_tq(call, tq); + while (unacked) { + unsigned int ix = __ffs(unacked); + rxrpc_seq_t seq = tq->qbase + ix; + ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix); + + if (after(seq, call->tx_transmitted)) + return; + __clear_bit(ix, &unacked); + + if (seq == snd_una || + ktime_before(xmit_ts, deadline)) + rxrpc_rack_mark_lost(call, tq, ix); + } + } +} + +/* + * Calculate the TLP loss probe timeout (PTO) [RFC8958 7.2]. + */ +ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now) +{ + unsigned int flight_size = rxrpc_tx_in_flight(call); + ktime_t rto_at = ktime_add(call->tx_last_sent, + rxrpc_get_rto_backoff(call, false)); + ktime_t pto; + + if (call->rtt_count > 0) { + /* Use 2*SRTT as the timeout. */ + pto = ns_to_ktime(call->srtt_us * NSEC_PER_USEC / 4); + if (flight_size) + pto = ktime_add(pto, call->tlp_max_ack_delay); + } else { + pto = NSEC_PER_SEC; + } + + if (ktime_after(ktime_add(now, pto), rto_at)) + pto = ktime_sub(rto_at, now); + return pto; +} + +/* + * Send a TLP loss probe on PTO expiration [RFC8958 7.3]. + */ +void rxrpc_tlp_send_probe(struct rxrpc_call *call) +{ + unsigned int in_flight = rxrpc_tx_in_flight(call); + + if (after_eq(call->acks_hard_ack, call->tx_transmitted)) + return; /* Everything we transmitted has been acked. */ + + /* There must be no other loss probe still in flight and we need to + * have taken a new RTT sample since last probe or the start of + * connection. + */ + if (!call->tlp_serial && + call->tlp_rtt_taken != call->rtt_taken) { + call->tlp_is_retrans = false; + if (after(call->send_top, call->tx_transmitted) && + rxrpc_tx_window_space(call) > 0) { + /* Transmit the lowest-sequence unsent DATA */ + call->tx_last_serial = 0; + rxrpc_transmit_some_data(call, 1, rxrpc_txdata_tlp_new_data); + call->tlp_serial = call->tx_last_serial; + call->tlp_seq = call->tx_transmitted; + trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_transmit_new); + in_flight = rxrpc_tx_in_flight(call); + } else { + /* Retransmit the highest-sequence DATA sent */ + call->tx_last_serial = 0; + rxrpc_resend_tlp(call); + call->tlp_is_retrans = true; + trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_retransmit); + } + } else { + trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_busy); + } + + if (in_flight != 0) { + ktime_t rto = rxrpc_get_rto_backoff(call, false); + + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RTO; + call->rack_timo_at = ktime_add(ktime_get_real(), rto); + trace_rxrpc_rack_timer(call, rto, false); + trace_rxrpc_timer_set(call, rto, rxrpc_timer_trace_rack_rto); + } +} + +/* + * Detect losses using the ACK of a TLP loss probe [RFC8958 7.4]. 
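+ *
+ * (In brief, the cases handled below: a new-data probe that got
+ * hard-ACK'd needs no repair; a DUPLICATE ACK naming tlp_serial shows
+ * the retransmitted probe duplicated data that had already arrived,
+ * so RACK covers any other loss; and
+ *
+ *	after(call->acks_hard_ack, call->tlp_seq)
+ *
+ * means the probe repaired a single tail loss.)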
+ */
+void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary)
+{
+	if (!call->tlp_serial || after(call->tlp_seq, call->acks_hard_ack))
+		return;
+
+	if (!call->tlp_is_retrans) {
+		/* TLP of new data delivered */
+		trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_new_data);
+		call->tlp_serial = 0;
+	} else if (summary->ack_reason == RXRPC_ACK_DUPLICATE &&
+		   summary->acked_serial == call->tlp_serial) {
+		/* General Case: Detected packet losses using RACK [7.4.1] */
+		trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_dup_acked);
+		call->tlp_serial = 0;
+	} else if (after(call->acks_hard_ack, call->tlp_seq)) {
+		/* Repaired the single loss */
+		trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_hard_beyond);
+		call->tlp_serial = 0;
+		// TODO: Invoke congestion control to react to the loss
+		// event the probe has repaired
+	} else if (summary->tlp_probe_acked) {
+		trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_acked);
+		/* Special Case: Detected a single loss repaired by the loss
+		 * probe [7.4.2]
+		 */
+		call->tlp_serial = 0;
+	} else {
+		trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_incomplete);
+	}
+}
+
+/*
+ * Handle RACK timer expiration.
+ */
+void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by)
+{
+	struct rxrpc_ack_summary summary = {};
+	enum rxrpc_rack_timer_mode mode = call->rack_timer_mode;
+
+	trace_rxrpc_rack_timer(call, overran_by, true);
+	call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF;
+
+	switch (mode) {
+	case RXRPC_CALL_RACKTIMER_RACK_REORDER:
+		rxrpc_rack_detect_loss_and_arm_timer(call, &summary);
+		break;
+	case RXRPC_CALL_RACKTIMER_TLP_PTO:
+		rxrpc_tlp_send_probe(call);
+		break;
+	case RXRPC_CALL_RACKTIMER_RTO:
+		// Might need to poke the congestion algo in some way
+		rxrpc_rack_mark_losses_on_rto(call);
+		break;
+	//case RXRPC_CALL_RACKTIMER_ZEROWIN:
+	default:
+		pr_warn("Unexpected rack timer %u", mode);
+	}
+}
 diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c
index 6716c021a532..0a260df45d25 100644
--- a/net/rxrpc/insecure.c
+++ b/net/rxrpc/insecure.c
@@ -19,11 +19,14 @@ static int none_init_connection_security(struct rxrpc_connection *conn,
  */
 static struct rxrpc_txbuf *none_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp)
 {
-	return rxrpc_alloc_data_txbuf(call, min_t(size_t, remain, RXRPC_JUMBO_DATALEN), 1, gfp);
+	return rxrpc_alloc_data_txbuf(call, umin(remain, RXRPC_JUMBO_DATALEN), 1, gfp);
 }
 
 static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
 {
+	txb->pkt_len = txb->len;
+	if (txb->len == RXRPC_JUMBO_DATALEN)
+		txb->jumboable = true;
 	return 0;
 }
 
@@ -39,11 +42,18 @@ static void none_free_call_crypto(struct rxrpc_call *call)
 {
 }
 
-static int none_respond_to_challenge(struct rxrpc_connection *conn,
-				     struct sk_buff *skb)
+static bool none_validate_challenge(struct rxrpc_connection *conn,
+				    struct sk_buff *skb)
 {
-	return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO,
-				rxrpc_eproto_rxnull_challenge);
+	rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO,
+			 rxrpc_eproto_rxnull_challenge);
+	return true;
+}
+
+static int none_sendmsg_respond_to_challenge(struct sk_buff *challenge,
+					     struct msghdr *msg)
+{
+	return -EINVAL;
 }
 
 static int none_verify_response(struct rxrpc_connection *conn,
@@ -79,7 +89,8 @@ const struct rxrpc_security rxrpc_no_security = {
 	.alloc_txbuf		= none_alloc_txbuf,
 	.secure_packet		= none_secure_packet,
 	.verify_packet		= none_verify_packet,
-
.respond_to_challenge = none_respond_to_challenge, + .validate_challenge = none_validate_challenge, + .sendmsg_respond_to_challenge = none_sendmsg_respond_to_challenge, .verify_response = none_verify_response, .clear = none_clear, }; diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c index 07c74c77d802..27b650d30f4d 100644 --- a/net/rxrpc/io_thread.c +++ b/net/rxrpc/io_thread.c @@ -338,7 +338,6 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, struct rxrpc_channel *chan; struct rxrpc_call *call = NULL; unsigned int channel; - bool ret; if (sp->hdr.securityIndex != conn->security_ix) return rxrpc_direct_abort(skb, rxrpc_eproto_wrong_security, @@ -364,6 +363,12 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, if (sp->hdr.callNumber == 0) return rxrpc_input_conn_packet(conn, skb); + /* Deal with path MTU discovery probing. */ + if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK && + conn->pmtud_probe && + after_eq(sp->ack.acked_serial, conn->pmtud_probe)) + rxrpc_input_probe_for_pmtud(conn, sp->ack.acked_serial, false); + /* Call-bound packets are routed by connection channel. */ channel = sp->hdr.cid & RXRPC_CHANNELMASK; chan = &conn->channels[channel]; @@ -419,9 +424,9 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn, peer_srx, skb); } - ret = rxrpc_input_call_event(call, skb); + rxrpc_queue_rx_call_packet(call, skb); rxrpc_put_call(call, rxrpc_call_put_input); - return ret; + return true; } /* @@ -438,6 +443,8 @@ int rxrpc_io_thread(void *data) ktime_t now; #endif bool should_stop; + LIST_HEAD(conn_attend_q); + LIST_HEAD(call_attend_q); complete(&local->io_thread_ready); @@ -448,43 +455,26 @@ int rxrpc_io_thread(void *data) for (;;) { rxrpc_inc_stat(local->rxnet, stat_io_loop); - /* Deal with connections that want immediate attention. */ - conn = list_first_entry_or_null(&local->conn_attend_q, - struct rxrpc_connection, - attend_link); - if (conn) { - spin_lock_bh(&local->lock); - list_del_init(&conn->attend_link); - spin_unlock_bh(&local->lock); - - rxrpc_input_conn_event(conn, NULL); - rxrpc_put_connection(conn, rxrpc_conn_put_poke); - continue; + /* Inject a delay into packets if requested. */ +#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY + now = ktime_get_real(); + while ((skb = skb_peek(&local->rx_delay_queue))) { + if (ktime_before(now, skb->tstamp)) + break; + skb = skb_dequeue(&local->rx_delay_queue); + skb_queue_tail(&local->rx_queue, skb); } +#endif - if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER, - &local->client_conn_flags)) - rxrpc_discard_expired_client_conns(local); - - /* Deal with calls that want immediate attention. */ - if ((call = list_first_entry_or_null(&local->call_attend_q, - struct rxrpc_call, - attend_link))) { - spin_lock_bh(&local->lock); - list_del_init(&call->attend_link); - spin_unlock_bh(&local->lock); - - trace_rxrpc_call_poked(call); - rxrpc_input_call_event(call, NULL); - rxrpc_put_call(call, rxrpc_call_put_poke); - continue; + if (!skb_queue_empty(&local->rx_queue)) { + spin_lock_irq(&local->rx_queue.lock); + skb_queue_splice_tail_init(&local->rx_queue, &rx_queue); + spin_unlock_irq(&local->rx_queue.lock); + trace_rxrpc_iothread_rx(local, skb_queue_len(&rx_queue)); } - if (!list_empty(&local->new_client_calls)) - rxrpc_connect_client_calls(local); - - /* Process received packets and errors. */ - if ((skb = __skb_dequeue(&rx_queue))) { + /* Distribute packets and errors. 
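+		 *
+		 * (All the queues feeding this loop use the same
+		 * splice-then-drain shape, as a generic sketch:
+		 *
+		 *	spin_lock_irq(&lock);
+		 *	list_splice_tail_init(&shared_q, &mine);
+		 *	spin_unlock_irq(&lock);
+		 *	while (!list_empty(&mine))
+		 *		process_one(&mine);	// mostly lock-free
+		 * )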
*/ + while ((skb = __skb_dequeue(&rx_queue))) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); switch (skb->mark) { case RXRPC_SKB_MARK_PACKET: @@ -499,8 +489,8 @@ int rxrpc_io_thread(void *data) rxrpc_free_skb(skb, rxrpc_skb_put_error_report); break; case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: - rxrpc_input_conn_event(sp->conn, skb); - rxrpc_put_connection(sp->conn, rxrpc_conn_put_poke); + rxrpc_input_conn_event(sp->poke_conn, skb); + rxrpc_put_connection(sp->poke_conn, rxrpc_conn_put_poke); rxrpc_free_skb(skb, rxrpc_skb_put_conn_secured); break; default: @@ -508,27 +498,48 @@ int rxrpc_io_thread(void *data) rxrpc_free_skb(skb, rxrpc_skb_put_unknown); break; } - continue; } - /* Inject a delay into packets if requested. */ -#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY - now = ktime_get_real(); - while ((skb = skb_peek(&local->rx_delay_queue))) { - if (ktime_before(now, skb->tstamp)) - break; - skb = skb_dequeue(&local->rx_delay_queue); - skb_queue_tail(&local->rx_queue, skb); + /* Deal with connections that want immediate attention. */ + if (!list_empty_careful(&local->conn_attend_q)) { + spin_lock_irq(&local->lock); + list_splice_tail_init(&local->conn_attend_q, &conn_attend_q); + spin_unlock_irq(&local->lock); } -#endif - if (!skb_queue_empty(&local->rx_queue)) { - spin_lock_irq(&local->rx_queue.lock); - skb_queue_splice_tail_init(&local->rx_queue, &rx_queue); - spin_unlock_irq(&local->rx_queue.lock); - continue; + while ((conn = list_first_entry_or_null(&conn_attend_q, + struct rxrpc_connection, + attend_link))) { + spin_lock_irq(&local->lock); + list_del_init(&conn->attend_link); + spin_unlock_irq(&local->lock); + rxrpc_input_conn_event(conn, NULL); + rxrpc_put_connection(conn, rxrpc_conn_put_poke); + } + + if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER, + &local->client_conn_flags)) + rxrpc_discard_expired_client_conns(local); + + /* Deal with calls that want immediate attention. */ + spin_lock_irq(&local->lock); + list_splice_tail_init(&local->call_attend_q, &call_attend_q); + spin_unlock_irq(&local->lock); + + while ((call = list_first_entry_or_null(&call_attend_q, + struct rxrpc_call, + attend_link))) { + spin_lock_irq(&local->lock); + list_del_init(&call->attend_link); + spin_unlock_irq(&local->lock); + trace_rxrpc_call_poked(call); + rxrpc_input_call_event(call); + rxrpc_put_call(call, rxrpc_call_put_poke); } + if (!list_empty(&local->new_client_calls)) + rxrpc_connect_client_calls(local); + set_current_state(TASK_INTERRUPTIBLE); should_stop = kthread_should_stop(); if (!skb_queue_empty(&local->rx_queue) || @@ -558,7 +569,7 @@ int rxrpc_io_thread(void *data) } timeout = nsecs_to_jiffies(delay_ns); - timeout = max(timeout, 1UL); + timeout = umax(timeout, 1); schedule_timeout(timeout); __set_current_state(TASK_RUNNING); continue; diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 33e8302a79e3..9fdc1f031c9d 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -129,6 +129,160 @@ static int rxrpc_preparse_xdr_rxkad(struct key_preparsed_payload *prep, return 0; } +static u64 xdr_dec64(const __be32 *xdr) +{ + return (u64)ntohl(xdr[0]) << 32 | (u64)ntohl(xdr[1]); +} + +static time64_t rxrpc_s64_to_time64(s64 time_in_100ns) +{ + bool neg = false; + u64 tmp = time_in_100ns; + + if (time_in_100ns < 0) { + tmp = -time_in_100ns; + neg = true; + } + do_div(tmp, 10000000); + return neg ? 
-tmp : tmp; +} + +/* + * Parse a YFS-RxGK type XDR format token + * - the caller guarantees we have at least 4 words + * + * struct token_rxgk { + * opr_time begintime; + * opr_time endtime; + * afs_int64 level; + * afs_int64 lifetime; + * afs_int64 bytelife; + * afs_int64 enctype; + * opaque key<>; + * opaque ticket<>; + * }; + */ +static int rxrpc_preparse_xdr_yfs_rxgk(struct key_preparsed_payload *prep, + size_t datalen, + const __be32 *xdr, unsigned int toklen) +{ + struct rxrpc_key_token *token, **pptoken; + time64_t expiry; + size_t plen; + const __be32 *ticket, *key; + s64 tmp; + u32 tktlen, keylen; + + _enter(",{%x,%x,%x,%x},%x", + ntohl(xdr[0]), ntohl(xdr[1]), ntohl(xdr[2]), ntohl(xdr[3]), + toklen); + + if (6 * 2 + 2 > toklen / 4) + goto reject; + + key = xdr + (6 * 2 + 1); + keylen = ntohl(key[-1]); + _debug("keylen: %x", keylen); + keylen = round_up(keylen, 4); + if ((6 * 2 + 2) * 4 + keylen > toklen) + goto reject; + + ticket = xdr + (6 * 2 + 1 + (keylen / 4) + 1); + tktlen = ntohl(ticket[-1]); + _debug("tktlen: %x", tktlen); + tktlen = round_up(tktlen, 4); + if ((6 * 2 + 2) * 4 + keylen + tktlen != toklen) { + kleave(" = -EKEYREJECTED [%x!=%x, %x,%x]", + (6 * 2 + 2) * 4 + keylen + tktlen, toklen, + keylen, tktlen); + goto reject; + } + + plen = sizeof(*token) + sizeof(*token->rxgk) + tktlen + keylen; + prep->quotalen = datalen + plen; + + plen -= sizeof(*token); + token = kzalloc(sizeof(*token), GFP_KERNEL); + if (!token) + goto nomem; + + token->rxgk = kzalloc(sizeof(*token->rxgk) + keylen, GFP_KERNEL); + if (!token->rxgk) + goto nomem_token; + + token->security_index = RXRPC_SECURITY_YFS_RXGK; + token->rxgk->begintime = xdr_dec64(xdr + 0 * 2); + token->rxgk->endtime = xdr_dec64(xdr + 1 * 2); + token->rxgk->level = tmp = xdr_dec64(xdr + 2 * 2); + if (tmp < -1LL || tmp > RXRPC_SECURITY_ENCRYPT) + goto reject_token; + token->rxgk->lifetime = xdr_dec64(xdr + 3 * 2); + token->rxgk->bytelife = xdr_dec64(xdr + 4 * 2); + token->rxgk->enctype = tmp = xdr_dec64(xdr + 5 * 2); + if (tmp < 0 || tmp > UINT_MAX) + goto reject_token; + token->rxgk->key.len = ntohl(key[-1]); + token->rxgk->key.data = token->rxgk->_key; + token->rxgk->ticket.len = ntohl(ticket[-1]); + + if (token->rxgk->endtime != 0) { + expiry = rxrpc_s64_to_time64(token->rxgk->endtime); + if (expiry < 0) + goto expired; + if (expiry < prep->expiry) + prep->expiry = expiry; + } + + memcpy(token->rxgk->key.data, key, token->rxgk->key.len); + + /* Pad the ticket so that we can use it directly in XDR */ + token->rxgk->ticket.data = kzalloc(round_up(token->rxgk->ticket.len, 4), + GFP_KERNEL); + if (!token->rxgk->ticket.data) + goto nomem_yrxgk; + memcpy(token->rxgk->ticket.data, ticket, token->rxgk->ticket.len); + + _debug("SCIX: %u", token->security_index); + _debug("EXPY: %llx", token->rxgk->endtime); + _debug("LIFE: %llx", token->rxgk->lifetime); + _debug("BYTE: %llx", token->rxgk->bytelife); + _debug("ENC : %u", token->rxgk->enctype); + _debug("LEVL: %u", token->rxgk->level); + _debug("KLEN: %u", token->rxgk->key.len); + _debug("TLEN: %u", token->rxgk->ticket.len); + _debug("KEY0: %*phN", token->rxgk->key.len, token->rxgk->key.data); + _debug("TICK: %*phN", + min_t(u32, token->rxgk->ticket.len, 32), token->rxgk->ticket.data); + + /* count the number of tokens attached */ + prep->payload.data[1] = (void *)((unsigned long)prep->payload.data[1] + 1); + + /* attach the data */ + for (pptoken = (struct rxrpc_key_token **)&prep->payload.data[0]; + *pptoken; + pptoken = &(*pptoken)->next) + continue; + *pptoken = token; + + 
_leave(" = 0"); + return 0; + +nomem_yrxgk: + kfree(token->rxgk); +nomem_token: + kfree(token); +nomem: + return -ENOMEM; +reject_token: + kfree(token); +reject: + return -EKEYREJECTED; +expired: + kfree(token->rxgk); + kfree(token); + return -EKEYEXPIRED; +} + /* * attempt to parse the data as the XDR format * - the caller guarantees we have more than 7 words @@ -228,6 +382,9 @@ static int rxrpc_preparse_xdr(struct key_preparsed_payload *prep) case RXRPC_SECURITY_RXKAD: ret2 = rxrpc_preparse_xdr_rxkad(prep, datalen, token, toklen); break; + case RXRPC_SECURITY_YFS_RXGK: + ret2 = rxrpc_preparse_xdr_yfs_rxgk(prep, datalen, token, toklen); + break; default: ret2 = -EPROTONOSUPPORT; break; @@ -390,6 +547,10 @@ static void rxrpc_free_token_list(struct rxrpc_key_token *token) case RXRPC_SECURITY_RXKAD: kfree(token->kad); break; + case RXRPC_SECURITY_YFS_RXGK: + kfree(token->rxgk->ticket.data); + kfree(token->rxgk); + break; default: pr_err("Unknown token type %x on rxrpc key\n", token->security_index); @@ -433,6 +594,9 @@ static void rxrpc_describe(const struct key *key, struct seq_file *m) case RXRPC_SECURITY_RXKAD: seq_puts(m, "ka"); break; + case RXRPC_SECURITY_YFS_RXGK: + seq_puts(m, "ygk"); + break; default: /* we have a ticket we can't encode */ seq_printf(m, "%u", token->security_index); break; @@ -531,6 +695,8 @@ EXPORT_SYMBOL(rxrpc_get_server_data_key); * * Generate a null RxRPC key that can be used to indicate anonymous security is * required for a particular domain. + * + * Return: The new key or a negative error code. */ struct key *rxrpc_get_null_key(const char *keyname) { @@ -595,6 +761,13 @@ static long rxrpc_read(const struct key *key, toksize += RND(token->kad->ticket_len); break; + case RXRPC_SECURITY_YFS_RXGK: + toksize += 6 * 8 + 2 * 4; + if (!token->no_leak_key) + toksize += RND(token->rxgk->key.len); + toksize += RND(token->rxgk->ticket.len); + break; + default: /* we have a ticket we can't encode */ pr_err("Unsupported key token type (%u)\n", token->security_index); @@ -674,6 +847,20 @@ static long rxrpc_read(const struct key *key, ENCODE_DATA(token->kad->ticket_len, token->kad->ticket); break; + case RXRPC_SECURITY_YFS_RXGK: + ENCODE64(token->rxgk->begintime); + ENCODE64(token->rxgk->endtime); + ENCODE64(token->rxgk->level); + ENCODE64(token->rxgk->lifetime); + ENCODE64(token->rxgk->bytelife); + ENCODE64(token->rxgk->enctype); + if (token->no_leak_key) + ENCODE(0); + else + ENCODE_DATA(token->rxgk->key.len, token->rxgk->key.data); + ENCODE_DATA(token->rxgk->ticket.len, token->rxgk->ticket.data); + break; + default: pr_err("Unsupported key token type (%u)\n", token->security_index); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 2792d2304605..a74a4b43904f 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -215,9 +215,6 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) /* we want to set the don't fragment bit */ rxrpc_local_dont_fragment(local, true); - - /* We want receive timestamps. */ - sock_enable_timestamps(usk); break; default: diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c index 657cf35089a6..8fcc8139d771 100644 --- a/net/rxrpc/misc.c +++ b/net/rxrpc/misc.c @@ -46,13 +46,13 @@ unsigned int rxrpc_rx_window_size = 255; * Maximum Rx MTU size. This indicates to the sender the size of jumbo packet * made by gluing normal packets together that we're willing to handle. 
*/ -unsigned int rxrpc_rx_mtu = 5692; +unsigned int rxrpc_rx_mtu = RXRPC_JUMBO(46); /* * The maximum number of fragments in a received jumbo packet that we tell the * sender that we're willing to handle. */ -unsigned int rxrpc_rx_jumbo_max = 4; +unsigned int rxrpc_rx_jumbo_max = 46; #ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY /* diff --git a/net/rxrpc/net_ns.c b/net/rxrpc/net_ns.c index a4c135d0fbcc..9a9834145e81 100644 --- a/net/rxrpc/net_ns.c +++ b/net/rxrpc/net_ns.c @@ -105,10 +105,10 @@ static __net_exit void rxrpc_exit_net(struct net *net) struct rxrpc_net *rxnet = rxrpc_net(net); rxnet->live = false; - del_timer_sync(&rxnet->peer_keepalive_timer); + timer_delete_sync(&rxnet->peer_keepalive_timer); cancel_work_sync(&rxnet->peer_keepalive_work); /* Remove the timer again as the worker may have restarted it. */ - del_timer_sync(&rxnet->peer_keepalive_timer); + timer_delete_sync(&rxnet->peer_keepalive_timer); rxrpc_destroy_all_calls(rxnet); rxrpc_destroy_all_connections(rxnet); rxrpc_destroy_all_peers(rxnet); diff --git a/net/rxrpc/oob.c b/net/rxrpc/oob.c new file mode 100644 index 000000000000..05ca9c1faa57 --- /dev/null +++ b/net/rxrpc/oob.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Out of band message handling (e.g. challenge-response) + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/net.h> +#include <linux/gfp.h> +#include <linux/skbuff.h> +#include <linux/export.h> +#include <linux/sched/signal.h> +#include <net/sock.h> +#include <net/af_rxrpc.h> +#include "ar-internal.h" + +enum rxrpc_oob_command { + RXRPC_OOB_CMD_UNSET, + RXRPC_OOB_CMD_RESPOND, +} __mode(byte); + +struct rxrpc_oob_params { + u64 oob_id; /* ID number of message if reply */ + s32 abort_code; + enum rxrpc_oob_command command; + bool have_oob_id:1; +}; + +/* + * Post an out-of-band message for attention by the socket or kernel service + * associated with a reference call. + */ +void rxrpc_notify_socket_oob(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxrpc_sock *rx; + struct sock *sk; + + rcu_read_lock(); + + rx = rcu_dereference(call->socket); + if (rx) { + sk = &rx->sk; + spin_lock_irq(&rx->recvmsg_lock); + + if (sk->sk_state < RXRPC_CLOSE) { + skb->skb_mstamp_ns = rx->oob_id_counter++; + rxrpc_get_skb(skb, rxrpc_skb_get_post_oob); + skb_queue_tail(&rx->recvmsg_oobq, skb); + + trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial); + if (rx->app_ops) + rx->app_ops->notify_oob(sk, skb); + } + + spin_unlock_irq(&rx->recvmsg_lock); + if (!rx->app_ops && !sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk); + } + + rcu_read_unlock(); +} + +/* + * Locate the OOB message to respond to by its ID. + */ +static struct sk_buff *rxrpc_find_pending_oob(struct rxrpc_sock *rx, u64 oob_id) +{ + struct rb_node *p; + struct sk_buff *skb; + + p = rx->pending_oobq.rb_node; + while (p) { + skb = rb_entry(p, struct sk_buff, rbnode); + + if (oob_id < skb->skb_mstamp_ns) + p = p->rb_left; + else if (oob_id > skb->skb_mstamp_ns) + p = p->rb_right; + else + return skb; + } + + return NULL; +} + +/* + * Add an OOB message into the pending-response set. We always assign the next + * value from a 64-bit counter to the oob_id, so just assume we're always going + * to be on the right-hand edge of the tree and that the counter won't wrap. + * The tree is also given a ref to the message. 
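+ * + * The ID itself lives in skb->skb_mstamp_ns (assigned from rx->oob_id_counter when the message was queued for recvmsg), which is also the field that rxrpc_find_pending_oob() keys its lookup on.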
+ */ +void rxrpc_add_pending_oob(struct rxrpc_sock *rx, struct sk_buff *skb) +{ + struct rb_node **pp = &rx->pending_oobq.rb_node, *p = NULL; + + while (*pp) { + p = *pp; + pp = &(*pp)->rb_right; + } + + rb_link_node(&skb->rbnode, p, pp); + rb_insert_color(&skb->rbnode, &rx->pending_oobq); +} + +/* + * Extract control messages from the sendmsg() control buffer. + */ +static int rxrpc_sendmsg_oob_cmsg(struct msghdr *msg, struct rxrpc_oob_params *p) +{ + struct cmsghdr *cmsg; + int len; + + if (msg->msg_controllen == 0) + return -EINVAL; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + len = cmsg->cmsg_len - sizeof(struct cmsghdr); + _debug("CMSG %d, %d, %d", + cmsg->cmsg_level, cmsg->cmsg_type, len); + + if (cmsg->cmsg_level != SOL_RXRPC) + continue; + + switch (cmsg->cmsg_type) { + case RXRPC_OOB_ID: + if (len != sizeof(p->oob_id) || p->have_oob_id) + return -EINVAL; + memcpy(&p->oob_id, CMSG_DATA(cmsg), sizeof(p->oob_id)); + p->have_oob_id = true; + break; + case RXRPC_RESPOND: + if (p->command != RXRPC_OOB_CMD_UNSET) + return -EINVAL; + p->command = RXRPC_OOB_CMD_RESPOND; + break; + case RXRPC_ABORT: + if (len != sizeof(p->abort_code) || p->abort_code) + return -EINVAL; + memcpy(&p->abort_code, CMSG_DATA(cmsg), sizeof(p->abort_code)); + if (p->abort_code == 0) + return -EINVAL; + break; + case RXRPC_RESP_RXGK_APPDATA: + if (p->command != RXRPC_OOB_CMD_RESPOND) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + + switch (p->command) { + case RXRPC_OOB_CMD_RESPOND: + if (!p->have_oob_id) + return -EBADSLT; + break; + default: + return -EINVAL; + } + + return 0; +} + +/* + * Allow userspace to respond to an OOB using sendmsg(). + */ +static int rxrpc_respond_to_oob(struct rxrpc_sock *rx, + struct rxrpc_oob_params *p, + struct msghdr *msg) +{ + struct rxrpc_connection *conn; + struct rxrpc_skb_priv *sp; + struct sk_buff *skb; + int ret; + + skb = rxrpc_find_pending_oob(rx, p->oob_id); + if (skb) + rb_erase(&skb->rbnode, &rx->pending_oobq); + release_sock(&rx->sk); + if (!skb) + return -EBADSLT; + + sp = rxrpc_skb(skb); + + switch (p->command) { + case RXRPC_OOB_CMD_RESPOND: + ret = -EPROTO; + if (skb->mark != RXRPC_OOB_CHALLENGE) + break; + conn = sp->chall.conn; + ret = -EOPNOTSUPP; + if (!conn->security->sendmsg_respond_to_challenge) + break; + if (p->abort_code) { + rxrpc_abort_conn(conn, NULL, p->abort_code, -ECONNABORTED, + rxrpc_abort_response_sendmsg); + ret = 0; + } else { + ret = conn->security->sendmsg_respond_to_challenge(skb, msg); + } + break; + default: + ret = -EINVAL; + break; + } + + rxrpc_free_skb(skb, rxrpc_skb_put_oob); + return ret; +} + +/* + * Send an out-of-band message or respond to a received out-of-band message. 
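+ * (a response is selected by including an RXRPC_OOB_ID control message identifying the message being answered)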
+ * - caller gives us the socket lock + * - the socket may be either a client socket or a server socket + */ +int rxrpc_sendmsg_oob(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) +{ + struct rxrpc_oob_params p = {}; + int ret; + + _enter(""); + + ret = rxrpc_sendmsg_oob_cmsg(msg, &p); + if (ret < 0) + goto error_release_sock; + + if (p.have_oob_id) + return rxrpc_respond_to_oob(rx, &p, msg); + + release_sock(&rx->sk); + + switch (p.command) { + default: + ret = -EINVAL; + break; + } + + _leave(" = %d", ret); + return ret; + +error_release_sock: + release_sock(&rx->sk); + return ret; +} + +/** + * rxrpc_kernel_query_oob - Query the parameters of an out-of-band message + * @oob: The message to query + * @_peer: Where to return the peer record + * @_peer_appdata: The application data attached to a peer record + * + * Extract useful parameters from an out-of-band message. The source peer + * parameters are returned through the argument list and the message type is + * returned. + * + * Return: + * * %RXRPC_OOB_CHALLENGE - Challenge wanting a response. + */ +enum rxrpc_oob_type rxrpc_kernel_query_oob(struct sk_buff *oob, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(oob); + enum rxrpc_oob_type type = oob->mark; + + switch (type) { + case RXRPC_OOB_CHALLENGE: + *_peer = sp->chall.conn->peer; + *_peer_appdata = sp->chall.conn->peer->app_data; + break; + default: + WARN_ON_ONCE(1); + *_peer = NULL; + *_peer_appdata = 0; + break; + } + + return type; +} +EXPORT_SYMBOL(rxrpc_kernel_query_oob); + +/** + * rxrpc_kernel_dequeue_oob - Dequeue and return the front OOB message + * @sock: The socket to query + * @_type: Where to return the message type + * + * Dequeue the front OOB message, if there is one, and return it and + * its type. + * + * Return: The sk_buff representing the OOB message or %NULL if the queue was + * empty. + */ +struct sk_buff *rxrpc_kernel_dequeue_oob(struct socket *sock, + enum rxrpc_oob_type *_type) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct sk_buff *oob; + + oob = skb_dequeue(&rx->recvmsg_oobq); + if (oob) + *_type = oob->mark; + return oob; +} +EXPORT_SYMBOL(rxrpc_kernel_dequeue_oob); + +/** + * rxrpc_kernel_free_oob - Free an out-of-band message + * @oob: The OOB message to free + * + * Free an OOB message along with any resources it holds. + */ +void rxrpc_kernel_free_oob(struct sk_buff *oob) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(oob); + + switch (oob->mark) { + case RXRPC_OOB_CHALLENGE: + rxrpc_put_connection(sp->chall.conn, rxrpc_conn_put_oob); + break; + } + + rxrpc_free_skb(oob, rxrpc_skb_put_purge_oob); +} +EXPORT_SYMBOL(rxrpc_kernel_free_oob); + +/** + * rxrpc_kernel_query_challenge - Query the parameters of a challenge + * @challenge: The challenge to query + * @_peer: Where to return the peer record + * @_peer_appdata: The application data attached to a peer record + * @_service_id: Where to return the connection service ID + * @_security_index: Where to return the connection security index + * + * Extract useful parameters from a CHALLENGE message. 
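+ * + * A kernel service might call this from its notify_oob handler, for example (a sketch only - service_accepts() and the abort parameters here stand in for whatever policy the service applies): + * + *	rxrpc_kernel_query_challenge(challenge, &peer, &appdata, + *				     &service_id, &security_index); + *	if (!service_accepts(security_index)) + *		rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, + *					      -ECONNABORTED, why);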
+ */ +void rxrpc_kernel_query_challenge(struct sk_buff *challenge, + struct rxrpc_peer **_peer, + unsigned long *_peer_appdata, + u16 *_service_id, u8 *_security_index) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + *_peer = sp->chall.conn->peer; + *_peer_appdata = sp->chall.conn->peer->app_data; + *_service_id = sp->hdr.serviceId; + *_security_index = sp->hdr.securityIndex; +} +EXPORT_SYMBOL(rxrpc_kernel_query_challenge); + +/** + * rxrpc_kernel_reject_challenge - Allow a kernel service to reject a challenge + * @challenge: The challenge to be rejected + * @abort_code: The abort code to stick into the ABORT packet + * @error: Local error value + * @why: Indication as to why. + * + * Allow a kernel service to reject a challenge by aborting the connection if + * it's still in an abortable state. The error is returned so this function + * can be used with a return statement. + * + * Return: The %error parameter. + */ +int rxrpc_kernel_reject_challenge(struct sk_buff *challenge, u32 abort_code, + int error, enum rxrpc_abort_reason why) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + _enter("{%x},%d,%d,%u", sp->hdr.serial, abort_code, error, why); + + rxrpc_abort_conn(sp->chall.conn, NULL, abort_code, error, why); + return error; +} +EXPORT_SYMBOL(rxrpc_kernel_reject_challenge); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 5ea9601efd05..0af19bcdc80a 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -18,7 +18,7 @@ extern int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); -static ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len) +ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len) { struct sockaddr *sa = msg->msg_name; struct sock *sk = socket->sk; @@ -72,22 +72,96 @@ static void rxrpc_set_keepalive(struct rxrpc_call *call, ktime_t now) } /* + * Allocate transmission buffers for an ACK and attach them to local->kv[]. + */ +static int rxrpc_alloc_ack(struct rxrpc_call *call, size_t sack_size) +{ + struct rxrpc_wire_header *whdr; + struct rxrpc_acktrailer *trailer; + struct rxrpc_ackpacket *ack; + struct kvec *kv = call->local->kvec; + gfp_t gfp = rcu_read_lock_held() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS; + void *buf, *buf2 = NULL; + u8 *filler; + + buf = page_frag_alloc(&call->local->tx_alloc, + sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp); + if (!buf) + return -ENOMEM; + + if (sack_size) { + buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp); + if (!buf2) { + page_frag_free(buf); + return -ENOMEM; + } + } + + whdr = buf; + ack = buf + sizeof(*whdr); + filler = buf + sizeof(*whdr) + sizeof(*ack) + 1; + trailer = buf + sizeof(*whdr) + sizeof(*ack) + 1 + 3; + + kv[0].iov_base = whdr; + kv[0].iov_len = sizeof(*whdr) + sizeof(*ack); + kv[1].iov_base = buf2; + kv[1].iov_len = sack_size; + kv[2].iov_base = filler; + kv[2].iov_len = 3 + sizeof(*trailer); + return 3; /* Number of kvec[] used. */ +} + +static void rxrpc_free_ack(struct rxrpc_call *call) +{ + page_frag_free(call->local->kvec[0].iov_base); + if (call->local->kvec[1].iov_base) + page_frag_free(call->local->kvec[1].iov_base); +} + +/* + * Record the beginning of an RTT probe. 
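+ * + * A probe occupies one of a small set of slots: __ffs() picks a free slot from the available half of the call->rtt_avail bitmap, the probe's serial number and send time are recorded, and then, after a write barrier, the matching bit in the pending half is set so that the ACK-processing side can pair a response with its probe. If no slot is free, the probe is skipped and only a trace line is emitted.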
+ */ +static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial, + ktime_t now, enum rxrpc_rtt_tx_trace why) +{ + unsigned long avail = call->rtt_avail; + int rtt_slot = 9; + + if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK)) + goto no_slot; + + rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK); + if (!test_and_clear_bit(rtt_slot, &call->rtt_avail)) + goto no_slot; + + call->rtt_serial[rtt_slot] = serial; + call->rtt_sent_at[rtt_slot] = now; + smp_wmb(); /* Write data before avail bit */ + set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); + + trace_rxrpc_rtt_tx(call, why, rtt_slot, serial); + return; + +no_slot: + trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial); +} + +/* * Fill out an ACK packet. */ -static void rxrpc_fill_out_ack(struct rxrpc_call *call, - struct rxrpc_txbuf *txb, - u8 ack_reason, - rxrpc_serial_t serial) +static int rxrpc_fill_out_ack(struct rxrpc_call *call, int nr_kv, u8 ack_reason, + rxrpc_serial_t serial_to_ack, rxrpc_serial_t *_ack_serial) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; - struct rxrpc_acktrailer *trailer = txb->kvec[2].iov_base + 3; + struct kvec *kv = call->local->kvec; + struct rxrpc_wire_header *whdr = kv[0].iov_base; + struct rxrpc_acktrailer *trailer = kv[2].iov_base + 3; struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1); - unsigned int qsize, sack, wrap, to; + unsigned int qsize, sack, wrap, to, max_mtu, if_mtu; rxrpc_seq_t window, wtop; + ktime_t now = ktime_get_real(); int rsize; - u32 mtu, jmax; - u8 *filler = txb->kvec[2].iov_base; - u8 *sackp = txb->kvec[1].iov_base; + u8 *filler = kv[2].iov_base; + u8 *sackp = kv[1].iov_base; rxrpc_inc_stat(call->rxnet, stat_tx_ack_fill); @@ -95,14 +169,25 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, wtop = call->ackr_wtop; sack = call->ackr_sack_base % RXRPC_SACK_SIZE; + *_ack_serial = rxrpc_get_next_serial(call->conn); + + whdr->epoch = htonl(call->conn->proto.epoch); + whdr->cid = htonl(call->cid); + whdr->callNumber = htonl(call->call_id); + whdr->serial = htonl(*_ack_serial); whdr->seq = 0; whdr->type = RXRPC_PACKET_TYPE_ACK; - txb->flags |= RXRPC_SLOW_START_OK; + whdr->flags = call->conn->out_clientflag | RXRPC_SLOW_START_OK; + whdr->userStatus = 0; + whdr->securityIndex = call->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(call->dest_srx.srx_service); + ack->bufferSpace = 0; ack->maxSkew = 0; ack->firstPacket = htonl(window); ack->previousPacket = htonl(call->rx_highest_seq); - ack->serial = htonl(serial); + ack->serial = htonl(serial_to_ack); ack->reason = ack_reason; ack->nAcks = wtop - window; filler[0] = 0; @@ -110,15 +195,13 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, filler[2] = 0; if (ack_reason == RXRPC_ACK_PING) - txb->flags |= RXRPC_REQUEST_ACK; + whdr->flags |= RXRPC_REQUEST_ACK; if (after(wtop, window)) { - txb->len += ack->nAcks; - txb->kvec[1].iov_base = sackp; - txb->kvec[1].iov_len = ack->nAcks; + kv[1].iov_len = ack->nAcks; wrap = RXRPC_SACK_SIZE - sack; - to = min_t(unsigned int, ack->nAcks, RXRPC_SACK_SIZE); + to = umin(ack->nAcks, RXRPC_SACK_SIZE); if (sack + ack->nAcks <= RXRPC_SACK_SIZE) { memcpy(sackp, call->ackr_sack_table + sack, ack->nAcks); @@ -132,56 +215,42 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call, ack->reason = RXRPC_ACK_IDLE; } - mtu = call->peer->if_mtu; - mtu -= call->peer->hdrsize; - jmax = rxrpc_rx_jumbo_max; qsize = (window - 1) - call->rx_consumed; rsize = max_t(int, call->rx_winsize - qsize, 0); - txb->ack_rwind = rsize; - 
trailer->maxMTU = htonl(rxrpc_rx_mtu); - trailer->ifMTU = htonl(mtu); - trailer->rwind = htonl(rsize); - trailer->jumbo_max = htonl(jmax); -} - -/* - * Record the beginning of an RTT probe. - */ -static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial, - ktime_t now, enum rxrpc_rtt_tx_trace why) -{ - unsigned long avail = call->rtt_avail; - int rtt_slot = 9; - - if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK)) - goto no_slot; - - rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK); - if (!test_and_clear_bit(rtt_slot, &call->rtt_avail)) - goto no_slot; - call->rtt_serial[rtt_slot] = serial; - call->rtt_sent_at[rtt_slot] = now; - smp_wmb(); /* Write data before avail bit */ - set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); + if_mtu = call->peer->if_mtu - call->peer->hdrsize; + if (call->peer->ackr_adv_pmtud) { + max_mtu = umax(call->peer->max_data, rxrpc_rx_mtu); + } else { + if_mtu = umin(if_mtu, 1444); + max_mtu = if_mtu; + } - trace_rxrpc_rtt_tx(call, why, rtt_slot, serial); - return; + trailer->maxMTU = htonl(max_mtu); + trailer->ifMTU = htonl(if_mtu); + trailer->rwind = htonl(rsize); + trailer->jumbo_max = 0; /* Advertise pmtu discovery */ -no_slot: - trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial); + if (ack_reason == RXRPC_ACK_PING) + rxrpc_begin_rtt_probe(call, *_ack_serial, now, rxrpc_rtt_tx_ping); + if (whdr->flags & RXRPC_REQUEST_ACK) + call->rtt_last_req = now; + rxrpc_set_keepalive(call, now); + return nr_kv; } /* * Transmit an ACK packet. */ -static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static void rxrpc_send_ack_packet(struct rxrpc_call *call, int nr_kv, size_t len, + rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + struct kvec *kv = call->local->kvec; + struct rxrpc_wire_header *whdr = kv[0].iov_base; + struct rxrpc_acktrailer *trailer = kv[2].iov_base + 3; struct rxrpc_connection *conn; struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1); struct msghdr msg; - ktime_t now; int ret; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) @@ -195,33 +264,34 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t msg.msg_controllen = 0; msg.msg_flags = MSG_SPLICE_PAGES; - whdr->flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; - - txb->serial = rxrpc_get_next_serial(conn); - whdr->serial = htonl(txb->serial); - trace_rxrpc_tx_ack(call->debug_id, txb->serial, + trace_rxrpc_tx_ack(call->debug_id, serial, ntohl(ack->firstPacket), ntohl(ack->serial), ack->reason, ack->nAcks, - txb->ack_rwind); + ntohl(trailer->rwind), why); rxrpc_inc_stat(call->rxnet, stat_tx_ack_send); - iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, txb->len); - rxrpc_local_dont_fragment(conn->local, false); - ret = do_udp_sendmsg(conn->local->socket, &msg, txb->len); + iov_iter_kvec(&msg.msg_iter, WRITE, kv, nr_kv, len); + rxrpc_local_dont_fragment(conn->local, why == rxrpc_propose_ack_ping_for_mtu_probe); + + ret = do_udp_sendmsg(conn->local->socket, &msg, len); call->peer->last_tx_at = ktime_get_seconds(); if (ret < 0) { - trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, + trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_ack); + if (why == rxrpc_propose_ack_ping_for_mtu_probe && + ret == -EMSGSIZE) + rxrpc_input_probe_for_pmtud(conn, serial, true); } else { trace_rxrpc_tx_packet(call->debug_id, whdr, rxrpc_tx_point_call_ack); - now = ktime_get_real(); - if (ack->reason == 
RXRPC_ACK_PING) - rxrpc_begin_rtt_probe(call, txb->serial, now, rxrpc_rtt_tx_ping); - if (txb->flags & RXRPC_REQUEST_ACK) - call->peer->rtt_last_req = now; - rxrpc_set_keepalive(call, now); + if (why == rxrpc_propose_ack_ping_for_mtu_probe) { + call->peer->pmtud_pending = false; + call->peer->pmtud_probing = true; + call->conn->pmtud_probe = serial; + call->conn->pmtud_call = call->debug_id; + trace_rxrpc_pmtud_tx(call); + } } rxrpc_tx_backoff(call, ret); } @@ -230,31 +300,62 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t * Queue an ACK for immediate transmission. */ void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason, - rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why) + rxrpc_serial_t serial_to_ack, enum rxrpc_propose_ack_trace why) { - struct rxrpc_txbuf *txb; + struct kvec *kv = call->local->kvec; + rxrpc_serial_t ack_serial; + size_t len; + int nr_kv; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) return; rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]); - txb = rxrpc_alloc_ack_txbuf(call, call->ackr_wtop - call->ackr_window); - if (!txb) { + nr_kv = rxrpc_alloc_ack(call, call->ackr_wtop - call->ackr_window); + if (nr_kv < 0) { kleave(" = -ENOMEM"); return; } - txb->ack_why = why; + nr_kv = rxrpc_fill_out_ack(call, nr_kv, ack_reason, serial_to_ack, &ack_serial); + len = kv[0].iov_len; + len += kv[1].iov_len; + len += kv[2].iov_len; + + /* Extend a path MTU probe ACK. */ + if (why == rxrpc_propose_ack_ping_for_mtu_probe) { + size_t probe_mtu = call->peer->pmtud_trial + sizeof(struct rxrpc_wire_header); + + if (len > probe_mtu) + goto skip; + while (len < probe_mtu) { + size_t part = umin(probe_mtu - len, PAGE_SIZE); + + kv[nr_kv].iov_base = page_address(ZERO_PAGE(0)); + kv[nr_kv].iov_len = part; + len += part; + nr_kv++; + } + } - rxrpc_fill_out_ack(call, txb, ack_reason, serial); call->ackr_nr_unacked = 0; atomic_set(&call->ackr_nr_consumed, 0); clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags); - trace_rxrpc_send_ack(call, why, ack_reason, serial); - rxrpc_send_ack_packet(call, txb); - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx); + trace_rxrpc_send_ack(call, why, ack_reason, ack_serial); + rxrpc_send_ack_packet(call, nr_kv, len, ack_serial, why); +skip: + rxrpc_free_ack(call); +} + +/* + * Send an ACK probe for path MTU discovery. + */ +void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call) +{ + rxrpc_send_ACK(call, RXRPC_ACK_PING, 0, + rxrpc_propose_ack_ping_for_mtu_probe); } /* @@ -324,14 +425,21 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) /* * Prepare a (sub)packet for transmission. 
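 * * For a jumbo transmission the first subpacket is described by the main wire header; each subsequent subpacket instead gets a 4-byte struct rxrpc_jumbo_header prepended. Every subpacket bar the last carries exactly RXRPC_JUMBO_DATALEN bytes of data and has the RXRPC_JUMBO_PACKET flag set to indicate that another subpacket follows.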
*/ -static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_txbuf *txb, - rxrpc_serial_t serial) +static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, + struct rxrpc_send_data_req *req, + struct rxrpc_txbuf *txb, + struct rxrpc_wire_header *whdr, + rxrpc_serial_t serial, int subpkt) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + struct rxrpc_jumbo_header *jumbo = txb->data - sizeof(*jumbo); enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; + struct kvec *kv = &call->local->kvec[1 + subpkt]; + size_t len = txb->pkt_len; + bool last; + u8 flags; - _enter("%x,{%d}", txb->seq, txb->len); + _enter("%x,%zd", txb->seq, len); txb->serial = serial; @@ -339,6 +447,15 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t txb->seq == 1) whdr->userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE; + txb->flags &= ~RXRPC_REQUEST_ACK; + flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; + last = txb->flags & RXRPC_LAST_PACKET; + + if (subpkt < req->n - 1) { + len = RXRPC_JUMBO_DATALEN; + goto dont_set_request_ack; + } + /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. * @@ -346,113 +463,208 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t * service call, lest OpenAFS incorrectly send us an ACK with some * soft-ACKs in it and then never follow up with a proper hard ACK. */ - if (txb->flags & RXRPC_REQUEST_ACK) - why = rxrpc_reqack_already_on; - else if ((txb->flags & RXRPC_LAST_PACKET) && rxrpc_sending_to_client(txb)) + if (last && rxrpc_sending_to_client(txb)) why = rxrpc_reqack_no_srv_last; else if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) why = rxrpc_reqack_ack_lost; else if (txb->flags & RXRPC_TXBUF_RESENT) why = rxrpc_reqack_retrans; - else if (call->cong_mode == RXRPC_CALL_SLOW_START && call->cong_cwnd <= 2) + else if (call->cong_ca_state == RXRPC_CA_SLOW_START && call->cong_cwnd <= RXRPC_MIN_CWND) why = rxrpc_reqack_slow_start; else if (call->tx_winsize <= 2) why = rxrpc_reqack_small_txwin; - else if (call->peer->rtt_count < 3 && txb->seq & 1) + else if (call->rtt_count < 3) why = rxrpc_reqack_more_rtt; - else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real())) + else if (ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real())) why = rxrpc_reqack_old_rtt; + else if (!last && !after(READ_ONCE(call->send_top), txb->seq)) + why = rxrpc_reqack_app_stall; else goto dont_set_request_ack; rxrpc_inc_stat(call->rxnet, stat_why_req_ack[why]); trace_rxrpc_req_ack(call->debug_id, txb->seq, why); - if (why != rxrpc_reqack_no_srv_last) - txb->flags |= RXRPC_REQUEST_ACK; + if (why != rxrpc_reqack_no_srv_last) { + flags |= RXRPC_REQUEST_ACK; + trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, -1, serial); + call->rtt_last_req = req->now; + } dont_set_request_ack: - whdr->flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS; - whdr->serial = htonl(txb->serial); - whdr->cksum = txb->cksum; + /* There's a jumbo header prepended to the data if we need it. 
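+ * On the wire: [wire hdr][1412 data][jumbo hdr][1412 data]...[jumbo hdr][final data].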
*/ + if (subpkt < req->n - 1) + flags |= RXRPC_JUMBO_PACKET; + else + flags &= ~RXRPC_JUMBO_PACKET; + if (subpkt == 0) { + whdr->flags = flags; + whdr->cksum = txb->cksum; + kv->iov_base = txb->data; + } else { + jumbo->flags = flags; + jumbo->pad = 0; + jumbo->cksum = txb->cksum; + kv->iov_base = jumbo; + len += sizeof(*jumbo); + } - trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags, false); + trace_rxrpc_tx_data(call, txb->seq, txb->serial, flags, req->trace); + kv->iov_len = len; + return len; } /* - * Prepare a packet for transmission. + * Prepare a transmission queue object for initial transmission. Returns the + * number of microseconds since the transmission queue base timestamp. */ -static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static unsigned int rxrpc_prepare_txqueue(struct rxrpc_txqueue *tq, + struct rxrpc_send_data_req *req) { - rxrpc_serial_t serial; - - /* Each transmission of a Tx packet needs a new serial number */ - serial = rxrpc_get_next_serial(call->conn); - - rxrpc_prepare_data_subpacket(call, txb, serial); - - return txb->len; + if (!tq) + return 0; + if (tq->xmit_ts_base == KTIME_MIN) { + tq->xmit_ts_base = req->now; + return 0; + } + return ktime_to_us(ktime_sub(req->now, tq->xmit_ts_base)); } /* - * Set timeouts after transmitting a packet. + * Prepare a (jumbo) packet for transmission. */ -static void rxrpc_tstamp_data_packets(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, + struct rxrpc_send_data_req *req, + struct rxrpc_wire_header *whdr) { - ktime_t now = ktime_get_real(); - bool ack_requested = txb->flags & RXRPC_REQUEST_ACK; + struct rxrpc_txqueue *tq = req->tq; + rxrpc_serial_t serial; + unsigned int xmit_ts; + rxrpc_seq_t seq = req->seq; + size_t len = 0; + bool start_tlp = false; - call->tx_last_sent = now; - txb->last_sent = now; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit); - if (ack_requested) { - rxrpc_begin_rtt_probe(call, txb->serial, now, rxrpc_rtt_tx_data); + /* Each transmission of a Tx packet needs a new serial number */ + serial = rxrpc_get_next_serials(call->conn, req->n); + + whdr->epoch = htonl(call->conn->proto.epoch); + whdr->cid = htonl(call->cid); + whdr->callNumber = htonl(call->call_id); + whdr->seq = htonl(seq); + whdr->serial = htonl(serial); + whdr->type = RXRPC_PACKET_TYPE_DATA; + whdr->flags = 0; + whdr->userStatus = 0; + whdr->securityIndex = call->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(call->conn->service_id); + + call->tx_last_serial = serial + req->n - 1; + call->tx_last_sent = req->now; + xmit_ts = rxrpc_prepare_txqueue(tq, req); + prefetch(tq->next); + + for (int i = 0;;) { + int ix = seq & RXRPC_TXQ_MASK; + struct rxrpc_txbuf *txb = tq->bufs[seq & RXRPC_TXQ_MASK]; + + _debug("prep[%u] tq=%x q=%x", i, tq->qbase, seq); + + /* Record (re-)transmission for RACK [RFC8985 6.1]. */ + if (__test_and_clear_bit(ix, &tq->segment_lost)) + call->tx_nr_lost--; + if (req->retrans) { + __set_bit(ix, &tq->ever_retransmitted); + __set_bit(ix, &tq->segment_retransmitted); + call->tx_nr_resent++; + } else { + call->tx_nr_sent++; + start_tlp = true; + } + tq->segment_xmit_ts[ix] = xmit_ts; + tq->segment_serial[ix] = serial; + if (i + 1 == req->n) + /* Only sample the last subpacket in a jumbo. 
*/ + __set_bit(ix, &tq->rtt_samples); + len += rxrpc_prepare_data_subpacket(call, req, txb, whdr, serial, i); + serial++; + seq++; + i++; + if (i >= req->n) + break; + if (!(seq & RXRPC_TXQ_MASK)) { + tq = tq->next; + trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit_advance); + xmit_ts = rxrpc_prepare_txqueue(tq, req); + } + } - call->peer->rtt_last_req = now; - if (call->peer->rtt_count > 1) { - ktime_t delay = rxrpc_get_rto_backoff(call->peer, false); + /* Set timeouts */ + if (req->tlp_probe) { + /* Sending TLP loss probe [RFC8985 7.3]. */ + call->tlp_serial = serial - 1; + call->tlp_seq = seq - 1; + } else if (start_tlp) { + /* Schedule TLP loss probe [RFC8985 7.2]. */ + ktime_t pto; + + if (!test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) + /* The first packet may take longer to elicit a response. */ + pto = NSEC_PER_SEC; + else + pto = rxrpc_tlp_calc_pto(call, req->now); - call->ack_lost_at = ktime_add(now, delay); - trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_lost_ack); - } + call->rack_timer_mode = RXRPC_CALL_RACKTIMER_TLP_PTO; + call->rack_timo_at = ktime_add(req->now, pto); + trace_rxrpc_rack_timer(call, pto, false); + trace_rxrpc_timer_set(call, pto, rxrpc_timer_trace_rack_tlp_pto); } if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) { ktime_t delay = ms_to_ktime(READ_ONCE(call->next_rx_timo)); - call->expect_rx_by = ktime_add(now, delay); + call->expect_rx_by = ktime_add(req->now, delay); trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_expect_rx); } - rxrpc_set_keepalive(call, now); + rxrpc_set_keepalive(call, req->now); + page_frag_free(whdr); + return len; } /* - * send a packet through the transport endpoint + * Send one or more packets through the transport endpoint */ -static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; + struct rxrpc_wire_header *whdr; struct rxrpc_connection *conn = call->conn; enum rxrpc_tx_point frag; + struct rxrpc_txqueue *tq = req->tq; + struct rxrpc_txbuf *txb; struct msghdr msg; - size_t len; - int ret; + rxrpc_seq_t seq = req->seq; + size_t len = sizeof(*whdr); + bool new_call = test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags); + int ret, stat_ix; - _enter("%x,{%d}", txb->seq, txb->len); + _enter("%x,%x-%x", tq->qbase, seq, seq + req->n - 1); - len = rxrpc_prepare_data_packet(call, txb); + whdr = page_frag_alloc(&call->local->tx_alloc, sizeof(*whdr), GFP_NOFS); + if (!whdr) + return; /* Drop the packet if no memory. 
*/ - if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { - static int lose; - if ((lose++ & 7) == 7) { - ret = 0; - trace_rxrpc_tx_data(call, txb->seq, txb->serial, - txb->flags, true); - goto done; - } - } + call->local->kvec[0].iov_base = whdr; + call->local->kvec[0].iov_len = sizeof(*whdr); + + stat_ix = umin(req->n, ARRAY_SIZE(call->rxnet->stat_tx_jumbo)) - 1; + atomic_inc(&call->rxnet->stat_tx_jumbo[stat_ix]); - iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, len); + len += rxrpc_prepare_data_packet(call, req, whdr); + txb = tq->bufs[seq & RXRPC_TXQ_MASK]; + + iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, 1 + req->n, len); msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; @@ -460,16 +672,11 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = MSG_SPLICE_PAGES; - /* Track what we've attempted to transmit at least once so that the - * retransmission algorithm doesn't try to resend what we haven't sent - * yet. + /* Send the packet with the don't fragment bit set unless we think it's + * too big or if this is a retransmission. */ - if (txb->seq == call->tx_transmitted + 1) - call->tx_transmitted = txb->seq; - - /* send the packet with the don't fragment bit set if we currently - * think it's small enough */ - if (txb->len >= call->peer->maxdata) { + if (seq == call->tx_transmitted + 1 && + len >= sizeof(struct rxrpc_wire_header) + call->peer->max_data) { rxrpc_local_dont_fragment(conn->local, false); frag = rxrpc_tx_point_call_data_frag; } else { @@ -477,7 +684,25 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t frag = rxrpc_tx_point_call_data_nofrag; } -retry: + /* Track what we've attempted to transmit at least once so that the + * retransmission algorithm doesn't try to resend what we haven't sent + * yet. + */ + if (seq == call->tx_transmitted + 1) + call->tx_transmitted = seq + req->n - 1; + + if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { + static int lose; + + if ((lose++ & 7) == 7) { + ret = 0; + trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags, + rxrpc_txdata_inject_loss); + conn->peer->last_tx_at = ktime_get_seconds(); + goto done; + } + } + /* send the packet by UDP * - returns -EMSGSIZE if UDP would have to fragment the packet * to go out of the interface @@ -488,7 +713,11 @@ retry: ret = do_udp_sendmsg(conn->local->socket, &msg, len); conn->peer->last_tx_at = ktime_get_seconds(); - if (ret < 0) { + if (ret == -EMSGSIZE) { + rxrpc_inc_stat(call->rxnet, stat_tx_data_send_msgsize); + trace_rxrpc_tx_packet(call->debug_id, whdr, frag); + ret = 0; + } else if (ret < 0) { rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail); trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, frag); } else { @@ -496,28 +725,23 @@ retry: } rxrpc_tx_backoff(call, ret); - if (ret == -EMSGSIZE && frag == rxrpc_tx_point_call_data_frag) { - rxrpc_local_dont_fragment(conn->local, false); - frag = rxrpc_tx_point_call_data_frag; - goto retry; - } -done: - if (ret >= 0) { - rxrpc_tstamp_data_packets(call, txb); - } else { - /* Cancel the call if the initial transmission fails, - * particularly if that's due to network routing issues that - * aren't going away anytime soon. The layer above can arrange - * the retransmission. + if (ret < 0) { + /* Cancel the call if the initial transmission fails or if we + * hit an error due to network routing issues that aren't going + * away anytime soon. The layer above can arrange the + * retransmission.
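+ * Routing trouble shows up below as -ENETUNREACH, -EHOSTUNREACH or -ECONNREFUSED from sendmsg.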
*/ - if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) + if (new_call || + ret == -ENETUNREACH || + ret == -EHOSTUNREACH || + ret == -ECONNREFUSED) rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, RX_USER_ABORT, ret); } - _leave(" = %d [%u]", ret, call->peer->maxdata); - return ret; +done: + _leave(" = %d [%u]", ret, call->peer->max_data); } /* @@ -694,39 +918,59 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer) } /* - * Schedule an instant Tx resend. + * Send a RESPONSE message. */ -static inline void rxrpc_instant_resend(struct rxrpc_call *call, - struct rxrpc_txbuf *txb) +void rxrpc_send_response(struct rxrpc_connection *conn, struct sk_buff *response) { - if (!__rxrpc_call_is_complete(call)) - kdebug("resend"); -} + struct rxrpc_skb_priv *sp = rxrpc_skb(response); + struct scatterlist sg[16]; + struct bio_vec bvec[16]; + struct msghdr msg; + size_t len = sp->resp.len; + __be32 wserial; + u32 serial = 0; + int ret, nr_sg; -/* - * Transmit one packet. - */ -void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb) -{ - int ret; + _enter("C=%x,%x", conn->debug_id, sp->resp.challenge_serial); - ret = rxrpc_send_data_packet(call, txb); - if (ret < 0) { - switch (ret) { - case -ENETUNREACH: - case -EHOSTUNREACH: - case -ECONNREFUSED: - rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, - 0, ret); - break; - default: - _debug("need instant resend %d", ret); - rxrpc_instant_resend(call, txb); - } - } else { - ktime_t delay = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC); + sg_init_table(sg, ARRAY_SIZE(sg)); + ret = skb_to_sgvec(response, sg, 0, len); + if (ret < 0) + goto fail; + nr_sg = ret; - call->resend_at = ktime_add(ktime_get_real(), delay); - trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_resend_tx); - } + for (int i = 0; i < nr_sg; i++) + bvec_set_page(&bvec[i], sg_page(&sg[i]), sg[i].length, sg[i].offset); + + iov_iter_bvec(&msg.msg_iter, WRITE, bvec, nr_sg, len); + + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_SPLICE_PAGES; + + serial = rxrpc_get_next_serials(conn, 1); + wserial = htonl(serial); + + trace_rxrpc_tx_response(conn, serial, sp); + + ret = skb_store_bits(response, offsetof(struct rxrpc_wire_header, serial), + &wserial, sizeof(wserial)); + if (ret < 0) + goto fail; + + rxrpc_local_dont_fragment(conn->local, false); + + ret = do_udp_sendmsg(conn->local->socket, &msg, len); + if (ret < 0) + goto fail; + + conn->peer->last_tx_at = ktime_get_seconds(); + return; + +fail: + trace_rxrpc_tx_fail(conn->debug_id, serial, ret, + rxrpc_tx_point_response); + kleave(" = %d", ret); } diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 552ba84a255c..7f4729234957 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -102,6 +102,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local, */ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) { + unsigned int max_data; + /* wind down the local interface MTU */ if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu) peer->if_mtu = mtu; @@ -120,11 +122,15 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) } } - if (mtu < peer->mtu) { - spin_lock(&peer->lock); - peer->mtu = mtu; - peer->maxdata = peer->mtu - peer->hdrsize; - spin_unlock(&peer->lock); + max_data = max_t(int, mtu - peer->hdrsize, 500); + if (max_data < peer->max_data) { + if (peer->pmtud_good > 
max_data) + peer->pmtud_good = max_data; + if (peer->pmtud_bad > max_data + 1) + peer->pmtud_bad = max_data + 1; + + trace_rxrpc_pmtud_reduce(peer, 0, max_data, rxrpc_pmtud_reduce_icmp); + peer->max_data = max_data; } } @@ -161,6 +167,13 @@ void rxrpc_input_error(struct rxrpc_local *local, struct sk_buff *skb) goto out; } + if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 && + serr->ee.ee_type == ICMPV6_PKT_TOOBIG && + serr->ee.ee_code == 0)) { + rxrpc_adjust_mtu(peer, serr->ee.ee_info); + goto out; + } + rxrpc_store_error(peer, skb); out: rxrpc_put_peer(peer, rxrpc_peer_put_input_error); @@ -205,23 +218,23 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, struct sk_buff *skb, struct rxrpc_call *call; HLIST_HEAD(error_targets); - spin_lock(&peer->lock); + spin_lock_irq(&peer->lock); hlist_move_list(&peer->error_targets, &error_targets); while (!hlist_empty(&error_targets)) { call = hlist_entry(error_targets.first, struct rxrpc_call, error_link); hlist_del_init(&call->error_link); - spin_unlock(&peer->lock); + spin_unlock_irq(&peer->lock); rxrpc_see_call(call, rxrpc_call_see_distribute_error); rxrpc_set_call_completion(call, compl, 0, -err); - rxrpc_input_call_event(call, skb); + rxrpc_input_call_event(call); - spin_lock(&peer->lock); + spin_lock_irq(&peer->lock); } - spin_unlock(&peer->lock); + spin_unlock_irq(&peer->lock); } /* @@ -238,7 +251,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, bool use; int slot; - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); while (!list_empty(collector)) { peer = list_entry(collector->next, @@ -249,7 +262,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, continue; use = __rxrpc_use_local(peer->local, rxrpc_local_use_peer_keepalive); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); if (use) { keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME; @@ -269,17 +282,17 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, */ slot += cursor; slot &= mask; - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive[slot & mask]); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxrpc_unuse_local(peer->local, rxrpc_local_unuse_peer_keepalive); } rxrpc_put_peer(peer, rxrpc_peer_put_keepalive); - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); } - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); } /* @@ -309,7 +322,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) * second; the bucket at cursor + 1 goes at now + 1s and so * on... */ - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); list_splice_init(&rxnet->peer_keepalive_new, &collector); stop = cursor + ARRAY_SIZE(rxnet->peer_keepalive); @@ -321,7 +334,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) } base = now; - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxnet->peer_keepalive_base = base; rxnet->peer_keepalive_cursor = cursor; @@ -347,3 +360,84 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) _leave(""); } + +/* + * Do path MTU probing. 
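+ * + * This is in effect a binary search over datagram sizes: peer->pmtud_good is the largest size known to get through and peer->pmtud_bad the smallest size known not to. Trial sizes corresponding to whole numbers of jumbo subpackets (RXRPC_JUMBO(2), RXRPC_JUMBO(4), ...) are preferred to plain bisection, as those are the sizes the transmitter can actually exploit, and a probe is retried once before its size is written off as too big.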
+ */ +void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial, + bool sendmsg_fail) +{ + struct rxrpc_peer *peer = conn->peer; + unsigned int max_data = peer->max_data; + int good, trial, bad, jumbo; + + good = peer->pmtud_good; + trial = peer->pmtud_trial; + bad = peer->pmtud_bad; + if (good >= bad - 1) { + conn->pmtud_probe = 0; + peer->pmtud_lost = false; + return; + } + + if (!peer->pmtud_probing) + goto send_probe; + + if (sendmsg_fail || after(acked_serial, conn->pmtud_probe)) { + /* Retry a lost probe. */ + if (!peer->pmtud_lost) { + trace_rxrpc_pmtud_lost(conn, acked_serial); + conn->pmtud_probe = 0; + peer->pmtud_lost = true; + goto send_probe; + } + + /* The probed size didn't seem to get through. */ + bad = trial; + peer->pmtud_bad = bad; + if (bad <= max_data) + max_data = bad - 1; + } else { + /* It did get through. */ + good = trial; + peer->pmtud_good = good; + if (good > max_data) + max_data = good; + } + + max_data = umin(max_data, peer->ackr_max_data); + if (max_data != peer->max_data) + peer->max_data = max_data; + + jumbo = max_data + sizeof(struct rxrpc_jumbo_header); + jumbo /= RXRPC_JUMBO_SUBPKTLEN; + peer->pmtud_jumbo = jumbo; + + trace_rxrpc_pmtud_rx(conn, acked_serial); + conn->pmtud_probe = 0; + peer->pmtud_lost = false; + + if (good < RXRPC_JUMBO(2) && bad > RXRPC_JUMBO(2)) + trial = RXRPC_JUMBO(2); + else if (good < RXRPC_JUMBO(4) && bad > RXRPC_JUMBO(4)) + trial = RXRPC_JUMBO(4); + else if (good < RXRPC_JUMBO(3) && bad > RXRPC_JUMBO(3)) + trial = RXRPC_JUMBO(3); + else if (good < RXRPC_JUMBO(6) && bad > RXRPC_JUMBO(6)) + trial = RXRPC_JUMBO(6); + else if (good < RXRPC_JUMBO(5) && bad > RXRPC_JUMBO(5)) + trial = RXRPC_JUMBO(5); + else if (good < RXRPC_JUMBO(8) && bad > RXRPC_JUMBO(8)) + trial = RXRPC_JUMBO(8); + else if (good < RXRPC_JUMBO(7) && bad > RXRPC_JUMBO(7)) + trial = RXRPC_JUMBO(7); + else + trial = (good + bad) / 2; + peer->pmtud_trial = trial; + + if (good >= bad) + return; + +send_probe: + peer->pmtud_pending = true; +} diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index 49dcda67a0d5..e2f35e6c04d6 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -162,6 +162,11 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local, #endif peer->if_mtu = 1500; + if (peer->max_data < peer->if_mtu - peer->hdrsize) { + trace_rxrpc_pmtud_reduce(peer, 0, peer->if_mtu - peer->hdrsize, + rxrpc_pmtud_reduce_route); + peer->max_data = peer->if_mtu - peer->hdrsize; + } memset(&fl, 0, sizeof(fl)); switch (peer->srx.transport.family) { @@ -199,8 +204,16 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local, } peer->if_mtu = dst_mtu(dst); + peer->hdrsize += dst->header_len + dst->trailer_len; + peer->tx_seg_max = dst->dev->gso_max_segs; dst_release(dst); + peer->max_data = umin(RXRPC_JUMBO(1), peer->if_mtu - peer->hdrsize); + peer->pmtud_good = 500; + peer->pmtud_bad = peer->if_mtu - peer->hdrsize + 1; + peer->pmtud_trial = umin(peer->max_data, peer->pmtud_bad - 1); + peer->pmtud_pending = true; + _leave(" [if_mtu %u]", peer->if_mtu); } @@ -222,11 +235,8 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); - spin_lock_init(&peer->rtt_input_lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); - - rxrpc_peer_init_rtt(peer); - + peer->recent_srtt_us = UINT_MAX; peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW; trace_rxrpc_peer(peer->debug_id, 1, why); } @@ -242,9 +252,7 
@@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, unsigned long hash_key) { peer->hash_key = hash_key; - rxrpc_assess_MTU_size(local, peer); - peer->mtu = peer->if_mtu; - peer->rtt_last_req = ktime_get_real(); + switch (peer->srx.transport.family) { case AF_INET: @@ -268,7 +276,9 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer, } peer->hdrsize += sizeof(struct rxrpc_wire_header); - peer->maxdata = peer->mtu - peer->hdrsize; + peer->max_data = peer->if_mtu - peer->hdrsize; + + rxrpc_assess_MTU_size(local, peer); } /* @@ -304,6 +314,7 @@ static void rxrpc_free_peer(struct rxrpc_peer *peer) * Set up a new incoming peer. There shouldn't be any other matching peers * since we've already done a search in the list from the non-reentrant context * (the data_ready handler) that is the only place we can add new peers. + * Called with interrupts disabled. */ void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer) { @@ -348,7 +359,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, return NULL; } - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); /* Need to check that we aren't racing with someone else */ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); @@ -361,7 +372,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, &rxnet->peer_keepalive_new); } - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); if (peer) rxrpc_free_peer(candidate); @@ -411,10 +422,10 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer) ASSERT(hlist_empty(&peer->error_targets)); - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxrpc_free_peer(peer); } @@ -450,7 +461,7 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet) continue; hlist_for_each_entry(peer, &rxnet->peer_hash[i], hash_link) { - pr_err("Leaked peer %u {%u} %pISp\n", + pr_err("Leaked peer %x {%u} %pISp\n", peer->debug_id, refcount_read(&peer->ref), &peer->srx.transport); @@ -464,10 +475,12 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet) * @call: The call to query * * Get a record for the remote peer in a call. + * + * Return: The call's peer record. */ struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call) { - return call->peer; + return rxrpc_get_peer(call->peer, rxrpc_peer_get_application); } EXPORT_SYMBOL(rxrpc_kernel_get_call_peer); @@ -475,11 +488,13 @@ EXPORT_SYMBOL(rxrpc_kernel_get_call_peer); * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT * @peer: The peer to query * - * Get the call's peer smoothed RTT in uS or UINT_MAX if we have no samples. + * Get the call's peer smoothed RTT. + * + * Return: The RTT in uS or %UINT_MAX if we have no samples. */ unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer) { - return peer->rtt_count > 0 ? peer->srtt_us >> 3 : UINT_MAX; + return READ_ONCE(peer->recent_srtt_us); } EXPORT_SYMBOL(rxrpc_kernel_get_srtt); @@ -488,7 +503,10 @@ EXPORT_SYMBOL(rxrpc_kernel_get_srtt); * @peer: The peer to query * * Get a pointer to the address from a peer record. The caller is responsible - * for making sure that the address is not deallocated. + * for making sure that the address is not deallocated. A fake address will be + * substituted if %peer is NULL.
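+ * (The fake is the static rxrpc_null_addr record.)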
+ * + * Return: The rxrpc address record or a fake record. */ const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer) { @@ -501,7 +519,10 @@ EXPORT_SYMBOL(rxrpc_kernel_remote_srx); * @peer: The peer to query * * Get a pointer to the transport address from a peer record. The caller is - * responsible for making sure that the address is not deallocated. + * responsible for making sure that the address is not deallocated. A fake + * address will be substituted if %peer is NULL. + * + * Return: The transport address record or a fake record. */ const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer) { @@ -509,3 +530,33 @@ const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer) { (peer ? &peer->srx.transport : &rxrpc_null_addr.transport); } EXPORT_SYMBOL(rxrpc_kernel_remote_addr); + +/** + * rxrpc_kernel_set_peer_data - Set app-specific data on a peer. + * @peer: The peer to alter + * @app_data: The data to set + * + * Set the app-specific data on a peer. AF_RXRPC makes no effort to retain + * anything the data might refer to. + * + * Return: The previous app_data. + */ +unsigned long rxrpc_kernel_set_peer_data(struct rxrpc_peer *peer, unsigned long app_data) +{ + return xchg(&peer->app_data, app_data); +} +EXPORT_SYMBOL(rxrpc_kernel_set_peer_data); + +/** + * rxrpc_kernel_get_peer_data - Get app-specific data from a peer. + * @peer: The peer to query + * + * Retrieve the app-specific data from a peer. + * + * Return: The peer's app data. + */ +unsigned long rxrpc_kernel_get_peer_data(const struct rxrpc_peer *peer) +{ + return peer->app_data; +} +EXPORT_SYMBOL(rxrpc_kernel_get_peer_data); diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c index 263a2251e3d2..d803562ca0ac 100644 --- a/net/rxrpc/proc.c +++ b/net/rxrpc/proc.c @@ -52,7 +52,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) struct rxrpc_call *call; struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq)); enum rxrpc_call_state state; - rxrpc_seq_t acks_hard_ack; + rxrpc_seq_t tx_bottom; char lbuff[50], rbuff[50]; long timeout = 0; @@ -79,7 +79,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) if (state != RXRPC_CALL_SERVER_PREALLOC) timeout = ktime_ms_delta(READ_ONCE(call->expect_rx_by), ktime_get_real()); - acks_hard_ack = READ_ONCE(call->acks_hard_ack); + tx_bottom = READ_ONCE(call->tx_bottom); seq_printf(seq, "UDP %-47.47s %-47.47s %4x %08x %08x %s %3u" " %-8.8s %08x %08x %08x %02x %08x %02x %08x %02x %06lx\n", @@ -93,7 +93,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v) rxrpc_call_states[state], call->abort_code, call->debug_id, - acks_hard_ack, READ_ONCE(call->tx_top) - acks_hard_ack, + tx_bottom, READ_ONCE(call->tx_top) - tx_bottom, call->ackr_window, call->ackr_wtop - call->ackr_window, call->rx_serial, call->cong_cwnd, @@ -283,9 +283,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) { seq_puts(seq, - "Proto Local " - " Remote " - " Use SST MTU LastUse RTT RTO\n" + "Proto Local Remote Use SST Maxd LastUse RTT RTO\n" ); return 0; } @@ -298,16 +296,15 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v) now = ktime_get_seconds(); seq_printf(seq, - "UDP %-47.47s %-47.47s %3u" - " %3u %5u %6llus %8u %8u\n", + "UDP %-47.47s %-47.47s %3u %4u %5u %6llus %8d %8d\n", lbuff, rbuff, refcount_read(&peer->ref), peer->cong_ssthresh, - peer->mtu, + peer->max_data, now - peer->last_tx_at, - peer->srtt_us >> 3, - peer->rto_us); + READ_ONCE(peer->recent_srtt_us), +
READ_ONCE(peer->recent_rto_us)); return 0; } @@ -476,10 +473,11 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) struct rxrpc_net *rxnet = rxrpc_net(seq_file_single_net(seq)); seq_printf(seq, - "Data : send=%u sendf=%u fail=%u\n", + "Data : send=%u sendf=%u fail=%u emsz=%u\n", atomic_read(&rxnet->stat_tx_data_send), atomic_read(&rxnet->stat_tx_data_send_frag), - atomic_read(&rxnet->stat_tx_data_send_fail)); + atomic_read(&rxnet->stat_tx_data_send_fail), + atomic_read(&rxnet->stat_tx_data_send_msgsize)); seq_printf(seq, "Data-Tx : nr=%u retrans=%u uf=%u cwr=%u\n", atomic_read(&rxnet->stat_tx_data), @@ -508,7 +506,7 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_DELAY]), atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_IDLE])); seq_printf(seq, - "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u\n", + "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u z=%u\n", atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_REQUESTED]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DUPLICATE]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_OUT_OF_SEQUENCE]), @@ -517,13 +515,14 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING_RESPONSE]), atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DELAY]), - atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE])); + atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE]), + atomic_read(&rxnet->stat_rx_acks[0])); seq_printf(seq, - "Why-Req-A: acklost=%u already=%u mrtt=%u ortt=%u\n", + "Why-Req-A: acklost=%u mrtt=%u ortt=%u stall=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_ack_lost]), - atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_already_on]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_more_rtt]), - atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt])); + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt]), + atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_app_stall])); seq_printf(seq, "Why-Req-A: nolast=%u retx=%u slows=%u smtxw=%u\n", atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_no_srv_last]), @@ -531,6 +530,30 @@ int rxrpc_stats_show(struct seq_file *seq, void *v) atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_slow_start]), atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_small_txwin])); seq_printf(seq, + "Jumbo-Tx : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n", + atomic_read(&rxnet->stat_tx_jumbo[0]), + atomic_read(&rxnet->stat_tx_jumbo[1]), + atomic_read(&rxnet->stat_tx_jumbo[2]), + atomic_read(&rxnet->stat_tx_jumbo[3]), + atomic_read(&rxnet->stat_tx_jumbo[4]), + atomic_read(&rxnet->stat_tx_jumbo[5]), + atomic_read(&rxnet->stat_tx_jumbo[6]), + atomic_read(&rxnet->stat_tx_jumbo[7]), + atomic_read(&rxnet->stat_tx_jumbo[8]), + atomic_read(&rxnet->stat_tx_jumbo[9])); + seq_printf(seq, + "Jumbo-Rx : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n", + atomic_read(&rxnet->stat_rx_jumbo[0]), + atomic_read(&rxnet->stat_rx_jumbo[1]), + atomic_read(&rxnet->stat_rx_jumbo[2]), + atomic_read(&rxnet->stat_rx_jumbo[3]), + atomic_read(&rxnet->stat_rx_jumbo[4]), + atomic_read(&rxnet->stat_rx_jumbo[5]), + atomic_read(&rxnet->stat_rx_jumbo[6]), + atomic_read(&rxnet->stat_rx_jumbo[7]), + atomic_read(&rxnet->stat_rx_jumbo[8]), + atomic_read(&rxnet->stat_rx_jumbo[9])); + seq_printf(seq, "Buffers : txb=%u rxb=%u\n", atomic_read(&rxrpc_nr_txbuf), atomic_read(&rxrpc_n_rx_skbs)); @@ -567,6 +590,8 @@ int rxrpc_stats_clear(struct file *file, char *buf, size_t size) atomic_set(&rxnet->stat_tx_ack_skip, 
0); memset(&rxnet->stat_tx_acks, 0, sizeof(rxnet->stat_tx_acks)); memset(&rxnet->stat_rx_acks, 0, sizeof(rxnet->stat_rx_acks)); + memset(&rxnet->stat_tx_jumbo, 0, sizeof(rxnet->stat_tx_jumbo)); + memset(&rxnet->stat_rx_jumbo, 0, sizeof(rxnet->stat_rx_jumbo)); memset(&rxnet->stat_why_req_ack, 0, sizeof(rxnet->stat_why_req_ack)); diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h index 4fe6b4d20ada..f8bfec12bc7e 100644 --- a/net/rxrpc/protocol.h +++ b/net/rxrpc/protocol.h @@ -92,11 +92,16 @@ struct rxrpc_jumbo_header { /* * The maximum number of subpackets that can possibly fit in a UDP packet is: * - * ((max_IP - IP_hdr - UDP_hdr) / RXRPC_JUMBO_SUBPKTLEN) + 1 - * = ((65535 - 28 - 28) / 1416) + 1 - * = 46 non-terminal packets and 1 terminal packet. + * (max_UDP - wirehdr + jumbohdr) / (jumbohdr + 1412) + * = ((65535 - 28 + 4) / 1416) + * = 45 non-terminal packets and 1 terminal packet. */ -#define RXRPC_MAX_NR_JUMBO 47 +#define RXRPC_MAX_NR_JUMBO 46 + +/* Size of a jumbo packet with N subpackets, excluding UDP+IP */ +#define RXRPC_JUMBO(N) ((int)sizeof(struct rxrpc_wire_header) + \ + RXRPC_JUMBO_DATALEN + \ + ((N) - 1) * RXRPC_JUMBO_SUBPKTLEN) /*****************************************************************************/ /* @@ -176,4 +181,24 @@ struct rxkad_response { __be32 ticket_len; /* Kerberos ticket length */ } __packed; +/* + * GSSAPI security type-4 and type-6 data header. + */ +struct rxgk_header { + __be32 epoch; + __be32 cid; + __be32 call_number; + __be32 seq; + __be32 sec_index; + __be32 data_len; +} __packed; + +/* + * GSSAPI security type-4 and type-6 response packet header. + */ +struct rxgk_response { + __be64 start_time; + __be32 token_len; +} __packed; + #endif /* _LINUX_RXRPC_PACKET_H */ diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index a482f88c5fc5..86a27fb55a1c 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -36,16 +36,16 @@ void rxrpc_notify_socket(struct rxrpc_call *call) sk = &rx->sk; if (rx && sk->sk_state < RXRPC_CLOSE) { if (call->notify_rx) { - spin_lock(&call->notify_lock); + spin_lock_irq(&call->notify_lock); call->notify_rx(sk, call, call->user_call_ID); - spin_unlock(&call->notify_lock); + spin_unlock_irq(&call->notify_lock); } else { - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); if (list_empty(&call->recvmsg_link)) { rxrpc_get_call(call, rxrpc_call_get_notify_socket); list_add_tail(&call->recvmsg_link, &rx->recvmsg_q); } - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); if (!sock_flag(sk, SOCK_DEAD)) { _debug("call %ps", sk->sk_data_ready); @@ -155,6 +155,82 @@ static int rxrpc_verify_data(struct rxrpc_call *call, struct sk_buff *skb) } /* + * Transcribe a call's user ID to a control message. + */ +static int rxrpc_recvmsg_user_id(struct rxrpc_call *call, struct msghdr *msg, + int flags) +{ + if (!test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) + return 0; + + if (flags & MSG_CMSG_COMPAT) { + unsigned int id32 = call->user_call_ID; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned int), &id32); + } else { + unsigned long idl = call->user_call_ID; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, + sizeof(unsigned long), &idl); + } +} + +/* + * Deal with a CHALLENGE packet. 
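
For context, the userspace counterpart of the transcription in rxrpc_recvmsg_user_id() above might look like the sketch below. It assumes a non-compat process; a 32-bit process on a 64-bit kernel receives an unsigned int instead, per the MSG_CMSG_COMPAT branch.

	#include <string.h>
	#include <sys/socket.h>
	#include <linux/rxrpc.h>

	/* Pull the user call ID back out of the recvmsg() control buffer. */
	static unsigned long example_user_call_id(struct msghdr *msg)
	{
		struct cmsghdr *cmsg;
		unsigned long id = 0;

		for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg))
			if (cmsg->cmsg_level == SOL_RXRPC &&
			    cmsg->cmsg_type == RXRPC_USER_CALL_ID)
				memcpy(&id, CMSG_DATA(cmsg), sizeof(id));
		return id;
	}
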
+ */ +static int rxrpc_recvmsg_challenge(struct socket *sock, struct msghdr *msg, + struct sk_buff *challenge, unsigned int flags) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + struct rxrpc_connection *conn = sp->chall.conn; + + return conn->security->challenge_to_recvmsg(conn, challenge, msg); +} + +/* + * Process OOB packets. Called with the socket locked. + */ +static int rxrpc_recvmsg_oob(struct socket *sock, struct msghdr *msg, + unsigned int flags) +{ + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); + struct sk_buff *skb; + bool need_response = false; + int ret; + + skb = skb_peek(&rx->recvmsg_oobq); + if (!skb) + return -EAGAIN; + rxrpc_see_skb(skb, rxrpc_skb_see_recvmsg); + + ret = put_cmsg(msg, SOL_RXRPC, RXRPC_OOB_ID, sizeof(u64), + &skb->skb_mstamp_ns); + if (ret < 0) + return ret; + + switch ((enum rxrpc_oob_type)skb->mark) { + case RXRPC_OOB_CHALLENGE: + need_response = true; + ret = rxrpc_recvmsg_challenge(sock, msg, skb, flags); + break; + default: + WARN_ONCE(1, "recvmsg() can't process unknown OOB type %u\n", + skb->mark); + ret = -EIO; + break; + } + + if (!(flags & MSG_PEEK)) + skb_unlink(skb, &rx->recvmsg_oobq); + if (need_response) + rxrpc_add_pending_oob(rx, skb); + else + rxrpc_free_skb(skb, rxrpc_skb_put_oob); + return ret; +} + +/* * Deliver messages to a call. This keeps processing packets until the buffer * is filled and we find either more DATA (returns 0) or the end of the DATA * (returns 1). If more packets are required, it returns -EAGAIN and if the @@ -165,6 +241,7 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, size_t len, int flags, size_t *_offset) { struct rxrpc_skb_priv *sp; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct sk_buff *skb; rxrpc_seq_t seq = 0; size_t remain; @@ -207,7 +284,6 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, trace_rxrpc_recvdata(call, rxrpc_recvmsg_next, seq, sp->offset, sp->len, ret2); if (ret2 < 0) { - kdebug("verify = %d", ret2); ret = ret2; goto out; } @@ -255,6 +331,13 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, if (!(flags & MSG_PEEK)) rxrpc_rotate_rx_window(call); + + if (!rx->app_ops && + !skb_queue_empty_lockless(&rx->recvmsg_oobq)) { + trace_rxrpc_recvdata(call, rxrpc_recvmsg_oobq, seq, + rx_pkt_offset, rx_pkt_len, ret); + break; + } } out: @@ -262,6 +345,7 @@ out: call->rx_pkt_offset = rx_pkt_offset; call->rx_pkt_len = rx_pkt_len; } + done: trace_rxrpc_recvdata(call, rxrpc_recvmsg_data_return, seq, rx_pkt_offset, rx_pkt_len, ret); @@ -301,6 +385,7 @@ try_again: /* Return immediately if a client socket has no outstanding calls */ if (RB_EMPTY_ROOT(&rx->calls) && list_empty(&rx->recvmsg_q) && + skb_queue_empty_lockless(&rx->recvmsg_oobq) && rx->sk.sk_state != RXRPC_SERVER_LISTENING) { release_sock(&rx->sk); return -EAGAIN; @@ -322,7 +407,8 @@ try_again: if (ret) goto wait_error; - if (list_empty(&rx->recvmsg_q)) { + if (list_empty(&rx->recvmsg_q) && + skb_queue_empty_lockless(&rx->recvmsg_oobq)) { if (signal_pending(current)) goto wait_interrupted; trace_rxrpc_recvmsg(0, rxrpc_recvmsg_wait, 0); @@ -332,19 +418,29 @@ try_again: goto try_again; } + /* Deal with OOB messages before we consider getting normal data. */ + if (!skb_queue_empty_lockless(&rx->recvmsg_oobq)) { + ret = rxrpc_recvmsg_oob(sock, msg, flags); + release_sock(&rx->sk); + if (ret == -EAGAIN) + goto try_again; + goto error_no_call; + } + /* Find the next call and dequeue it if we're not just peeking. 
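
A consumer of the new OOB queue sees two control messages per challenge: the OOB ID and the challenge details. The sketch below parses both; the response step (a sendmsg() quoting the OOB ID, optionally carrying RXRPC_RESP_RXGK_APPDATA) is summarised in a comment rather than shown, and the assumption is that struct rxgk_challenge is visible to the application as it is to kernel services here.

	static void example_handle_oob(struct msghdr *msg)
	{
		struct cmsghdr *cmsg;
		unsigned long long oob_id = 0;
		int challenged = 0;

		for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
			if (cmsg->cmsg_level != SOL_RXRPC)
				continue;
			switch (cmsg->cmsg_type) {
			case RXRPC_OOB_ID:	/* Tags the queued OOB packet */
				memcpy(&oob_id, CMSG_DATA(cmsg), sizeof(oob_id));
				break;
			case RXRPC_CHALLENGED:	/* Security wants a response */
				challenged = 1;
				break;
			}
		}
		if (challenged) {
			/* ... sendmsg() a response, quoting oob_id ... */
		}
	}
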
If we * do dequeue it, that comes with a ref that we will need to release. * We also want to weed out calls that got requeued whilst we were * shovelling data out. */ - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); l = rx->recvmsg_q.next; call = list_entry(l, struct rxrpc_call, recvmsg_link); if (!rxrpc_call_is_complete(call) && - skb_queue_empty(&call->recvmsg_queue)) { + skb_queue_empty(&call->recvmsg_queue) && + skb_queue_empty(&rx->recvmsg_oobq)) { list_del_init(&call->recvmsg_link); - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); release_sock(&rx->sk); trace_rxrpc_recvmsg(call->debug_id, rxrpc_recvmsg_unqueue, 0); rxrpc_put_call(call, rxrpc_call_put_recvmsg); @@ -355,7 +451,7 @@ try_again: list_del_init(&call->recvmsg_link); else rxrpc_get_call(call, rxrpc_call_get_recvmsg); - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); call_debug_id = call->debug_id; trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_dequeue, 0); @@ -377,21 +473,9 @@ try_again: if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); - if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { - if (flags & MSG_CMSG_COMPAT) { - unsigned int id32 = call->user_call_ID; - - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - sizeof(unsigned int), &id32); - } else { - unsigned long idl = call->user_call_ID; - - ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, - sizeof(unsigned long), &idl); - } - if (ret < 0) - goto error_unlock_call; - } + ret = rxrpc_recvmsg_user_id(call, msg, flags); + if (ret < 0) + goto error_unlock_call; if (msg->msg_name && call->peer) { size_t len = sizeof(call->dest_srx); @@ -445,9 +529,9 @@ error_unlock_call: error_requeue_call: if (!(flags & MSG_PEEK)) { - spin_lock(&rx->recvmsg_lock); + spin_lock_irq(&rx->recvmsg_lock); list_add(&call->recvmsg_link, &rx->recvmsg_q); - spin_unlock(&rx->recvmsg_lock); + spin_unlock_irq(&rx->recvmsg_lock); trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_requeue, 0); } else { rxrpc_put_call(call, rxrpc_call_put_recvmsg); @@ -477,14 +561,14 @@ wait_error: * @_service: Where to store the actual service ID (may be upgraded) * * Allow a kernel service to receive data and pick up information about the - * state of a call. Returns 0 if got what was asked for and there's more - * available, 1 if we got what was asked for and we're at the end of the data - * and -EAGAIN if we need more data. + * state of a call. Note that *@_abort should also be initialised to %0. * - * Note that we may return -EAGAIN to drain empty packets at the end of the - * data, even if we've already copied over the requested data. + * Note that we may return %-EAGAIN to drain empty packets at the end + * of the data, even if we've already copied over the requested data. * - * *_abort should also be initialised to 0. + * Return: %0 if got what was asked for and there's more available, %1 + * if we got what was asked for and we're at the end of the data and + * %-EAGAIN if we need more data. 
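
Restated as caller-side code, that contract reads as below. This is a sketch of a kernel user's receive loop only: the iov_iter setup and the wait for rxrpc_notify_socket() are elided, @want_more is assumed false, and the trailing parameters follow the @_abort/@_service documentation above.

	static int example_drain_reply(struct socket *sock, struct rxrpc_call *call,
				       struct iov_iter *iter)
	{
		u32 abort_code = 0;	/* Must be preset to 0, as noted above */
		u16 service;
		int ret;

		for (;;) {
			size_t len = iov_iter_count(iter);

			ret = rxrpc_kernel_recv_data(sock, call, iter, &len,
						     false, &abort_code, &service);
			if (ret == 1)
				return 0;	/* End of the data: reply complete */
			if (ret == -EAGAIN)
				continue;	/* More packets needed, or empty
						 * trailers to drain: wait, retry */
			if (ret < 0)
				return ret;	/* Error or abort (see abort_code) */
			/* ret == 0: got what we asked for, more available. */
		}
	}
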
*/ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, struct iov_iter *iter, size_t *_len, diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c index cdab7b7d08a0..7474f88d7b18 100644 --- a/net/rxrpc/rtt.c +++ b/net/rxrpc/rtt.c @@ -12,22 +12,22 @@ #include "ar-internal.h" #define RXRPC_RTO_MAX (120 * USEC_PER_SEC) -#define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * MSEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */ +#define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * USEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */ #define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */ -static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) +static u32 rxrpc_rto_min_us(struct rxrpc_call *call) { return 200; } -static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) +static u32 __rxrpc_set_rto(const struct rxrpc_call *call) { - return (peer->srtt_us >> 3) + peer->rttvar_us; + return (call->srtt_us >> 3) + call->rttvar_us; } static u32 rxrpc_bound_rto(u32 rto) { - return min(rto, RXRPC_RTO_MAX); + return clamp(200000, rto + 100000, RXRPC_RTO_MAX); } /* @@ -40,10 +40,10 @@ static u32 rxrpc_bound_rto(u32 rto) * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) +static void rxrpc_rtt_estimator(struct rxrpc_call *call, long sample_rtt_us) { long m = sample_rtt_us; /* RTT */ - u32 srtt = peer->srtt_us; + u32 srtt = call->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev @@ -66,7 +66,7 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ - m -= (peer->mdev_us >> 2); /* similar update on mdev */ + m -= (call->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain @@ -78,31 +78,31 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us) if (m > 0) m >>= 3; } else { - m -= (peer->mdev_us >> 2); /* similar update on mdev */ + m -= (call->mdev_us >> 2); /* similar update on mdev */ } - peer->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ - if (peer->mdev_us > peer->mdev_max_us) { - peer->mdev_max_us = peer->mdev_us; - if (peer->mdev_max_us > peer->rttvar_us) - peer->rttvar_us = peer->mdev_max_us; + call->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ + if (call->mdev_us > call->mdev_max_us) { + call->mdev_max_us = call->mdev_us; + if (call->mdev_max_us > call->rttvar_us) + call->rttvar_us = call->mdev_max_us; } } else { /* no previous measure. */ srtt = m << 3; /* take the measured time to be rtt */ - peer->mdev_us = m << 1; /* make sure rto = 3*rtt */ - peer->rttvar_us = max(peer->mdev_us, rxrpc_rto_min_us(peer)); - peer->mdev_max_us = peer->rttvar_us; + call->mdev_us = m << 1; /* make sure rto = 3*rtt */ + call->rttvar_us = umax(call->mdev_us, rxrpc_rto_min_us(call)); + call->mdev_max_us = call->rttvar_us; } - peer->srtt_us = max(1U, srtt); + call->srtt_us = umax(srtt, 1); } /* * Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -static void rxrpc_set_rto(struct rxrpc_peer *peer) +static void rxrpc_set_rto(struct rxrpc_call *call) { u32 rto; @@ -113,7 +113,7 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer) * is invisible. 
Actually, Linux-2.4 also generates erratic * ACKs in some circumstances. */ - rto = __rxrpc_set_rto(peer); + rto = __rxrpc_set_rto(call); /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, @@ -124,61 +124,73 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer) /* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo * guarantees that rto is higher. */ - peer->rto_us = rxrpc_bound_rto(rto); + call->rto_us = rxrpc_bound_rto(rto); } -static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us) +static void rxrpc_update_rtt_min(struct rxrpc_call *call, ktime_t resp_time, long rtt_us) +{ + /* Window size 5mins in approx usec (ipv4.sysctl_tcp_min_rtt_wlen) */ + u32 wlen_us = 5ULL * NSEC_PER_SEC / 1024; + + minmax_running_min(&call->min_rtt, wlen_us, resp_time / 1024, + (u32)rtt_us ? : jiffies_to_usecs(1)); +} + +static void rxrpc_ack_update_rtt(struct rxrpc_call *call, ktime_t resp_time, long rtt_us) { if (rtt_us < 0) return; - //rxrpc_update_rtt_min(peer, rtt_us); - rxrpc_rtt_estimator(peer, rtt_us); - rxrpc_set_rto(peer); + /* Update RACK min RTT [RFC8985 6.1 Step 1]. */ + rxrpc_update_rtt_min(call, resp_time, rtt_us); + + rxrpc_rtt_estimator(call, rtt_us); + rxrpc_set_rto(call); - /* RFC6298: only reset backoff on valid RTT measurement. */ - peer->backoff = 0; + /* Only reset backoff on valid RTT measurement [RFC6298]. */ + call->backoff = 0; } /* * Add RTT information to cache. This is called in softirq mode and has - * exclusive access to the peer RTT data. + * exclusive access to the call RTT data. */ -void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, +void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why, int rtt_slot, rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial, ktime_t send_time, ktime_t resp_time) { - struct rxrpc_peer *peer = call->peer; s64 rtt_us; rtt_us = ktime_to_us(ktime_sub(resp_time, send_time)); if (rtt_us < 0) return; - spin_lock(&peer->rtt_input_lock); - rxrpc_ack_update_rtt(peer, rtt_us); - if (peer->rtt_count < 3) - peer->rtt_count++; - spin_unlock(&peer->rtt_input_lock); + rxrpc_ack_update_rtt(call, resp_time, rtt_us); + if (call->rtt_count < 3) + call->rtt_count++; + call->rtt_taken++; + + WRITE_ONCE(call->peer->recent_srtt_us, call->srtt_us / 8); + WRITE_ONCE(call->peer->recent_rto_us, call->rto_us); trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial, - peer->srtt_us >> 3, peer->rto_us); + rtt_us, call->srtt_us, call->rto_us); } /* * Get the retransmission timeout to set in nanoseconds, backing it off each * time we retransmit. 
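
Stripped of the locking and struct plumbing, rxrpc_rtt_estimator() above reduces to the classic fixed-point update; the Eifel-style reduced-gain branch for shrinking RTTs is omitted from this sketch for brevity. srtt is kept scaled by 8 and mdev by 4, so __rxrpc_set_rto()'s (srtt >> 3) + rttvar amounts to SRTT + 4*MDEV, and the first sample starts the RTO at 3*RTT.

	static void rtt_sample(u32 *srtt8, u32 *mdev4, long m /* sample, in us */)
	{
		if (*srtt8) {
			long err = m - (*srtt8 >> 3);	/* m - SRTT */

			*srtt8 += err;			/* srtt = 7/8 srtt + 1/8 m */
			if (err < 0)
				err = -err;
			*mdev4 += err - (*mdev4 >> 2);	/* mdev = 3/4 mdev + 1/4 |err| */
		} else {
			*srtt8 = m << 3;		/* First sample: SRTT = m ... */
			*mdev4 = m << 1;		/* ... and RTO = 3 * RTT */
		}
	}
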
*/ -ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) +ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans) { u64 timo_us; - u32 backoff = READ_ONCE(peer->backoff); + u32 backoff = READ_ONCE(call->backoff); - timo_us = peer->rto_us; + timo_us = call->rto_us; timo_us <<= backoff; if (retrans && timo_us * 2 <= RXRPC_RTO_MAX) - WRITE_ONCE(peer->backoff, backoff + 1); + WRITE_ONCE(call->backoff, backoff + 1); if (timo_us < 1) timo_us = 1; @@ -186,10 +198,11 @@ ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans) return ns_to_ktime(timo_us * NSEC_PER_USEC); } -void rxrpc_peer_init_rtt(struct rxrpc_peer *peer) +void rxrpc_call_init_rtt(struct rxrpc_call *call) { - peer->rto_us = RXRPC_TIMEOUT_INIT; - peer->mdev_us = RXRPC_TIMEOUT_INIT; - peer->backoff = 0; - //minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U); + call->rtt_last_req = KTIME_MIN; + call->rto_us = RXRPC_TIMEOUT_INIT; + call->mdev_us = RXRPC_TIMEOUT_INIT; + call->backoff = 0; + //minmax_reset(&call->rtt_min, rxrpc_jiffies32, ~0U); } diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c new file mode 100644 index 000000000000..1e19c605bcc8 --- /dev/null +++ b/net/rxrpc/rxgk.c @@ -0,0 +1,1371 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* GSSAPI-based RxRPC security + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/key-type.h> +#include "ar-internal.h" +#include "rxgk_common.h" + +/* + * Parse the information from a server key + */ +static int rxgk_preparse_server_key(struct key_preparsed_payload *prep) +{ + const struct krb5_enctype *krb5; + struct krb5_buffer *server_key = (void *)&prep->payload.data[2]; + unsigned int service, sec_class, kvno, enctype; + int n = 0; + + _enter("%zu", prep->datalen); + + if (sscanf(prep->orig_description, "%u:%u:%u:%u%n", + &service, &sec_class, &kvno, &enctype, &n) != 4) + return -EINVAL; + + if (prep->orig_description[n]) + return -EINVAL; + + krb5 = crypto_krb5_find_enctype(enctype); + if (!krb5) + return -ENOPKG; + + prep->payload.data[0] = (struct krb5_enctype *)krb5; + + if (prep->datalen != krb5->key_len) + return -EKEYREJECTED; + + server_key->len = prep->datalen; + server_key->data = kmemdup(prep->data, prep->datalen, GFP_KERNEL); + if (!server_key->data) + return -ENOMEM; + + _leave(" = 0"); + return 0; +} + +static void rxgk_free_server_key(union key_payload *payload) +{ + struct krb5_buffer *server_key = (void *)&payload->data[2]; + + kfree_sensitive(server_key->data); +} + +static void rxgk_free_preparse_server_key(struct key_preparsed_payload *prep) +{ + rxgk_free_server_key(&prep->payload); +} + +static void rxgk_destroy_server_key(struct key *key) +{ + rxgk_free_server_key(&key->payload); +} + +static void rxgk_describe_server_key(const struct key *key, struct seq_file *m) +{ + const struct krb5_enctype *krb5 = key->payload.data[0]; + + if (krb5) + seq_printf(m, ": %s", krb5->name); +} + +/* + * Handle rekeying the connection when we see our limits overrun or when the + * far side decided to rekey. + * + * Returns a ref on the context if successful or -ESTALE if the key is out of + * date. 
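
The shift-based backoff in rxrpc_get_rto_backoff() is easy to tabulate. A standalone sketch using the 1-second initial RTO and 120-second RXRPC_RTO_MAX from above; note that the shift only advances while doubling would stay within the cap, so the series plateaus at 64s rather than reaching 120s.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long rto_us = 1000000;	/* RXRPC_TIMEOUT_INIT */
		unsigned long long rto_max = 120000000;	/* RXRPC_RTO_MAX */
		unsigned int backoff = 0;

		for (int i = 0; i < 10; i++) {
			unsigned long long timo = rto_us << backoff;

			printf("retransmit %d: wait %llu us\n", i, timo);
			if (timo * 2 <= rto_max)	/* Same advance test */
				backoff++;
		}
		return 0;	/* 1s, 2s, 4s, 8s, 16s, 32s, 64s, 64s, ... */
	}
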
+ */ +static struct rxgk_context *rxgk_rekey(struct rxrpc_connection *conn, + const u16 *specific_key_number) +{ + struct rxgk_context *gk, *dead = NULL; + unsigned int key_number, current_key, mask = ARRAY_SIZE(conn->rxgk.keys) - 1; + bool crank = false; + + _enter("%d", specific_key_number ? *specific_key_number : -1); + + mutex_lock(&conn->security_lock); + + current_key = conn->rxgk.key_number; + if (!specific_key_number) { + key_number = current_key; + } else { + if (*specific_key_number == (u16)current_key) + key_number = current_key; + else if (*specific_key_number == (u16)(current_key - 1)) + key_number = current_key - 1; + else if (*specific_key_number == (u16)(current_key + 1)) + goto crank_window; + else + goto bad_key; + } + + gk = conn->rxgk.keys[key_number & mask]; + if (!gk) + goto generate_key; + if (!specific_key_number && + test_bit(RXGK_TK_NEEDS_REKEY, &gk->flags)) + goto crank_window; + +grab: + refcount_inc(&gk->usage); + mutex_unlock(&conn->security_lock); + rxgk_put(dead); + return gk; + +crank_window: + trace_rxrpc_rxgk_rekey(conn, current_key, + specific_key_number ? *specific_key_number : -1); + if (current_key == UINT_MAX) + goto bad_key; + if (current_key + 1 == UINT_MAX) + set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags); + + key_number = current_key + 1; + if (WARN_ON(conn->rxgk.keys[key_number & mask])) + goto bad_key; + crank = true; + +generate_key: + gk = conn->rxgk.keys[current_key & mask]; + gk = rxgk_generate_transport_key(conn, gk->key, key_number, GFP_NOFS); + if (IS_ERR(gk)) { + mutex_unlock(&conn->security_lock); + return gk; + } + + write_lock(&conn->security_use_lock); + if (crank) { + current_key++; + conn->rxgk.key_number = current_key; + dead = conn->rxgk.keys[(current_key - 2) & mask]; + conn->rxgk.keys[(current_key - 2) & mask] = NULL; + } + conn->rxgk.keys[current_key & mask] = gk; + write_unlock(&conn->security_use_lock); + goto grab; + +bad_key: + mutex_unlock(&conn->security_lock); + return ERR_PTR(-ESTALE); +} + +/* + * Get the specified keying context. + * + * Returns a ref on the context if successful or -ESTALE if the key is out of + * date. + */ +static struct rxgk_context *rxgk_get_key(struct rxrpc_connection *conn, + const u16 *specific_key_number) +{ + struct rxgk_context *gk; + unsigned int key_number, current_key, mask = ARRAY_SIZE(conn->rxgk.keys) - 1; + + _enter("{%u},%d", + conn->rxgk.key_number, specific_key_number ? *specific_key_number : -1); + + read_lock(&conn->security_use_lock); + + current_key = conn->rxgk.key_number; + if (!specific_key_number) { + key_number = current_key; + } else { + /* Only the bottom 16 bits of the key number are exposed in the + * header, so we try and keep the upper 16 bits in step. The + * whole 32 bits are used to generate the TK. 
+ */ + if (*specific_key_number == (u16)current_key) + key_number = current_key; + else if (*specific_key_number == (u16)(current_key - 1)) + key_number = current_key - 1; + else if (*specific_key_number == (u16)(current_key + 1)) + goto rekey; + else + goto bad_key; + } + + gk = conn->rxgk.keys[key_number & mask]; + if (!gk) + goto slow_path; + if (!specific_key_number && + key_number < UINT_MAX) { + if (time_after(jiffies, gk->expiry) || + gk->bytes_remaining < 0) { + set_bit(RXGK_TK_NEEDS_REKEY, &gk->flags); + goto slow_path; + } + + if (test_bit(RXGK_TK_NEEDS_REKEY, &gk->flags)) + goto slow_path; + } + + refcount_inc(&gk->usage); + read_unlock(&conn->security_use_lock); + return gk; + +rekey: + _debug("rekey"); + if (current_key == UINT_MAX) + goto bad_key; + gk = conn->rxgk.keys[current_key & mask]; + if (gk) + set_bit(RXGK_TK_NEEDS_REKEY, &gk->flags); +slow_path: + read_unlock(&conn->security_use_lock); + return rxgk_rekey(conn, specific_key_number); +bad_key: + read_unlock(&conn->security_use_lock); + return ERR_PTR(-ESTALE); +} + +/* + * initialise connection security + */ +static int rxgk_init_connection_security(struct rxrpc_connection *conn, + struct rxrpc_key_token *token) +{ + struct rxgk_context *gk; + int ret; + + _enter("{%d,%u},{%x}", + conn->debug_id, conn->rxgk.key_number, key_serial(conn->key)); + + conn->security_ix = token->security_index; + conn->security_level = token->rxgk->level; + + if (rxrpc_conn_is_client(conn)) { + conn->rxgk.start_time = ktime_get(); + do_div(conn->rxgk.start_time, 100); + } + + gk = rxgk_generate_transport_key(conn, token->rxgk, conn->rxgk.key_number, + GFP_NOFS); + if (IS_ERR(gk)) + return PTR_ERR(gk); + conn->rxgk.enctype = gk->krb5->etype; + conn->rxgk.keys[gk->key_number & 3] = gk; + + switch (conn->security_level) { + case RXRPC_SECURITY_PLAIN: + case RXRPC_SECURITY_AUTH: + case RXRPC_SECURITY_ENCRYPT: + break; + default: + ret = -EKEYREJECTED; + goto error; + } + + ret = 0; +error: + _leave(" = %d", ret); + return ret; +} + +/* + * Clean up the crypto on a call. + */ +static void rxgk_free_call_crypto(struct rxrpc_call *call) +{ +} + +/* + * Work out how much data we can put in a packet. + */ +static struct rxrpc_txbuf *rxgk_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) +{ + enum krb5_crypto_mode mode; + struct rxgk_context *gk; + struct rxrpc_txbuf *txb; + size_t shdr, alloc, limit, part, offset, gap; + + switch (call->conn->security_level) { + default: + alloc = umin(remain, RXRPC_JUMBO_DATALEN); + return rxrpc_alloc_data_txbuf(call, alloc, 1, gfp); + case RXRPC_SECURITY_AUTH: + shdr = 0; + mode = KRB5_CHECKSUM_MODE; + break; + case RXRPC_SECURITY_ENCRYPT: + shdr = sizeof(struct rxgk_header); + mode = KRB5_ENCRYPT_MODE; + break; + } + + gk = rxgk_get_key(call->conn, NULL); + if (IS_ERR(gk)) + return NULL; + + /* Work out the maximum amount of data that will fit. 
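
Only the low 16 bits of the key number travel on the wire (in the cksum field), so both rxgk_rekey() and rxgk_get_key() classify the wire value against the full 32-bit counter and its immediate neighbours. The same comparison in isolation (a sketch with illustrative names; the expiry and bytes-remaining checks are not covered here):

	static int classify_key_number(u32 current_key, u16 wire)
	{
		if (wire == (u16)current_key)
			return 0;		/* The current key */
		if (wire == (u16)(current_key - 1))
			return -1;		/* The previous key: still usable */
		if (wire == (u16)(current_key + 1))
			return 1;		/* Peer has rekeyed: crank the window */
		return -ESTALE;			/* Out of window: reject */
	}
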
*/ + alloc = RXRPC_JUMBO_DATALEN; + limit = crypto_krb5_how_much_data(gk->krb5, mode, &alloc, &offset); + + if (remain < limit - shdr) { + part = remain; + alloc = crypto_krb5_how_much_buffer(gk->krb5, mode, + shdr + part, &offset); + gap = 0; + } else { + part = limit - shdr; + gap = RXRPC_JUMBO_DATALEN - alloc; + alloc = RXRPC_JUMBO_DATALEN; + } + + rxgk_put(gk); + + txb = rxrpc_alloc_data_txbuf(call, alloc, 16, gfp); + if (!txb) + return NULL; + + txb->crypto_header = offset; + txb->sec_header = shdr; + txb->offset += offset + shdr; + txb->space = part; + + /* Clear excess space in the packet */ + if (gap) + memset(txb->data + alloc - gap, 0, gap); + return txb; +} + +/* + * Integrity mode (sign a packet - level 1 security) + */ +static int rxgk_secure_packet_integrity(const struct rxrpc_call *call, + struct rxgk_context *gk, + struct rxrpc_txbuf *txb) +{ + struct rxgk_header *hdr; + struct scatterlist sg[1]; + struct krb5_buffer metadata; + int ret = -ENOMEM; + + _enter(""); + + hdr = kzalloc(sizeof(*hdr), GFP_NOFS); + if (!hdr) + goto error_gk; + + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(txb->seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(txb->len); + metadata.len = sizeof(*hdr); + metadata.data = hdr; + + sg_init_table(sg, 1); + sg_set_buf(&sg[0], txb->data, txb->alloc_size); + + ret = crypto_krb5_get_mic(gk->krb5, gk->tx_Kc, &metadata, + sg, 1, txb->alloc_size, + txb->crypto_header, txb->sec_header + txb->len); + if (ret >= 0) { + txb->pkt_len = ret; + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; + gk->bytes_remaining -= ret; + } + kfree(hdr); +error_gk: + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * wholly encrypt a packet (level 2 security) + */ +static int rxgk_secure_packet_encrypted(const struct rxrpc_call *call, + struct rxgk_context *gk, + struct rxrpc_txbuf *txb) +{ + struct rxgk_header *hdr; + struct scatterlist sg[1]; + int ret; + + _enter("%x", txb->len); + + /* Insert the header into the buffer. */ + hdr = txb->data + txb->crypto_header; + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(txb->seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(txb->len); + + sg_init_table(sg, 1); + sg_set_buf(&sg[0], txb->data, txb->alloc_size); + + ret = crypto_krb5_encrypt(gk->krb5, gk->tx_enc, + sg, 1, txb->alloc_size, + txb->crypto_header, txb->sec_header + txb->len, + false); + if (ret >= 0) { + txb->pkt_len = ret; + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; + gk->bytes_remaining -= ret; + } + + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * checksum an RxRPC packet header + */ +static int rxgk_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) +{ + struct rxgk_context *gk; + int ret; + + _enter("{%d{%x}},{#%u},%u,", + call->debug_id, key_serial(call->conn->key), txb->seq, txb->len); + + gk = rxgk_get_key(call->conn, NULL); + if (IS_ERR(gk)) + return PTR_ERR(gk) == -ESTALE ? 
-EKEYREJECTED : PTR_ERR(gk); + + ret = key_validate(call->conn->key); + if (ret < 0) { + rxgk_put(gk); + return ret; + } + + call->security_enctype = gk->krb5->etype; + txb->cksum = htons(gk->key_number); + + switch (call->conn->security_level) { + case RXRPC_SECURITY_PLAIN: + rxgk_put(gk); + txb->pkt_len = txb->len; + return 0; + case RXRPC_SECURITY_AUTH: + return rxgk_secure_packet_integrity(call, gk, txb); + case RXRPC_SECURITY_ENCRYPT: + return rxgk_secure_packet_encrypted(call, gk, txb); + default: + rxgk_put(gk); + return -EPERM; + } +} + +/* + * Integrity mode (check the signature on a packet - level 1 security) + */ +static int rxgk_verify_packet_integrity(struct rxrpc_call *call, + struct rxgk_context *gk, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_header *hdr; + struct krb5_buffer metadata; + unsigned int offset = sp->offset, len = sp->len; + size_t data_offset = 0, data_len = len; + u32 ac; + int ret = -ENOMEM; + + _enter(""); + + crypto_krb5_where_is_the_data(gk->krb5, KRB5_CHECKSUM_MODE, + &data_offset, &data_len); + + hdr = kzalloc(sizeof(*hdr), GFP_NOFS); + if (!hdr) + goto put_gk; + + hdr->epoch = htonl(call->conn->proto.epoch); + hdr->cid = htonl(call->cid); + hdr->call_number = htonl(call->call_id); + hdr->seq = htonl(sp->hdr.seq); + hdr->sec_index = htonl(call->security_ix); + hdr->data_len = htonl(data_len); + + metadata.len = sizeof(*hdr); + metadata.data = hdr; + ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata, + skb, &offset, &len, &ac); + kfree(hdr); + if (ret == -EPROTO) { + rxrpc_abort_eproto(call, skb, ac, + rxgk_abort_1_verify_mic_eproto); + } else { + sp->offset = offset; + sp->len = len; + } + +put_gk: + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Decrypt an encrypted packet (level 2 security). + */ +static int rxgk_verify_packet_encrypted(struct rxrpc_call *call, + struct rxgk_context *gk, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_header hdr; + unsigned int offset = sp->offset, len = sp->len; + int ret; + u32 ac; + + _enter(""); + + ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac); + if (ret == -EPROTO) + rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); + if (ret < 0) + goto error; + + if (len < sizeof(hdr)) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_2_short_header); + goto error; + } + + /* Extract the header from the skb */ + ret = skb_copy_bits(skb, offset, &hdr, sizeof(hdr)); + if (ret < 0) { + ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, + rxgk_abort_2_short_encdata); + goto error; + } + offset += sizeof(hdr); + len -= sizeof(hdr); + + if (ntohl(hdr.epoch) != call->conn->proto.epoch || + ntohl(hdr.cid) != call->cid || + ntohl(hdr.call_number) != call->call_id || + ntohl(hdr.seq) != sp->hdr.seq || + ntohl(hdr.sec_index) != call->security_ix || + ntohl(hdr.data_len) > len) { + ret = rxrpc_abort_eproto(call, skb, RXGK_SEALEDINCON, + rxgk_abort_2_short_data); + goto error; + } + + sp->offset = offset; + sp->len = ntohl(hdr.data_len); + ret = 0; +error: + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Verify the security on a received packet or subpacket (if part of a + * jumbo packet). 
+ */ +static int rxgk_verify_packet(struct rxrpc_call *call, struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_context *gk; + u16 key_number = sp->hdr.cksum; + + _enter("{%d{%x}},{#%u}", + call->debug_id, key_serial(call->conn->key), sp->hdr.seq); + + gk = rxgk_get_key(call->conn, &key_number); + if (IS_ERR(gk)) { + switch (PTR_ERR(gk)) { + case -ESTALE: + return rxrpc_abort_eproto(call, skb, RXGK_BADKEYNO, + rxgk_abort_bad_key_number); + default: + return PTR_ERR(gk); + } + } + + call->security_enctype = gk->krb5->etype; + switch (call->conn->security_level) { + case RXRPC_SECURITY_PLAIN: + rxgk_put(gk); + return 0; + case RXRPC_SECURITY_AUTH: + return rxgk_verify_packet_integrity(call, gk, skb); + case RXRPC_SECURITY_ENCRYPT: + return rxgk_verify_packet_encrypted(call, gk, skb); + default: + rxgk_put(gk); + return -ENOANO; + } +} + +/* + * Allocate memory to hold a challenge or a response packet. We're not running + * in the io_thread, so we can't use ->tx_alloc. + */ +static struct page *rxgk_alloc_packet(size_t total_len) +{ + gfp_t gfp = GFP_NOFS; + int order; + + order = get_order(total_len); + if (order > 0) + gfp |= __GFP_COMP; + return alloc_pages(gfp, order); +} + +/* + * Issue a challenge. + */ +static int rxgk_issue_challenge(struct rxrpc_connection *conn) +{ + struct rxrpc_wire_header *whdr; + struct bio_vec bvec[1]; + struct msghdr msg; + struct page *page; + size_t len = sizeof(*whdr) + sizeof(conn->rxgk.nonce); + u32 serial; + int ret; + + _enter("{%d}", conn->debug_id); + + get_random_bytes(&conn->rxgk.nonce, sizeof(conn->rxgk.nonce)); + + /* We can't use conn->tx_alloc without a lock */ + page = rxgk_alloc_packet(sizeof(*whdr) + sizeof(conn->rxgk.nonce)); + if (!page) + return -ENOMEM; + + bvec_set_page(&bvec[0], page, len, 0); + iov_iter_bvec(&msg.msg_iter, WRITE, bvec, 1, len); + + msg.msg_name = &conn->peer->srx.transport; + msg.msg_namelen = conn->peer->srx.transport_len; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = MSG_SPLICE_PAGES; + + whdr = page_address(page); + whdr->epoch = htonl(conn->proto.epoch); + whdr->cid = htonl(conn->proto.cid); + whdr->callNumber = 0; + whdr->seq = 0; + whdr->type = RXRPC_PACKET_TYPE_CHALLENGE; + whdr->flags = conn->out_clientflag; + whdr->userStatus = 0; + whdr->securityIndex = conn->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(conn->service_id); + + memcpy(whdr + 1, conn->rxgk.nonce, sizeof(conn->rxgk.nonce)); + + serial = rxrpc_get_next_serials(conn, 1); + whdr->serial = htonl(serial); + + trace_rxrpc_tx_challenge(conn, serial, 0, *(u32 *)&conn->rxgk.nonce); + + ret = do_udp_sendmsg(conn->local->socket, &msg, len); + if (ret > 0) + conn->peer->last_tx_at = ktime_get_seconds(); + __free_page(page); + + if (ret < 0) { + trace_rxrpc_tx_fail(conn->debug_id, serial, ret, + rxrpc_tx_point_rxgk_challenge); + return -EAGAIN; + } + + trace_rxrpc_tx_packet(conn->debug_id, whdr, + rxrpc_tx_point_rxgk_challenge); + _leave(" = 0"); + return 0; +} + +/* + * Validate a challenge packet. 
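
On the wire, the CHALLENGE assembled by rxgk_issue_challenge() above is just the common header followed by the nonce. The struct below is an illustrative sketch of that layout (the code writes past whdr rather than declaring such a type):

	struct rxgk_challenge_pkt {
		struct rxrpc_wire_header whdr;	/* type = RXRPC_PACKET_TYPE_CHALLENGE,
						 * callNumber = seq = 0 */
		u8 nonce[20];			/* Random; echoed back in the
						 * RESPONSE authenticator */
	} __packed;
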
+ */ +static bool rxgk_validate_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + u8 nonce[20]; + + if (!conn->key) { + rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxgk_abort_chall_no_key); + return false; + } + + if (key_validate(conn->key) < 0) { + rxrpc_abort_conn(conn, skb, RXGK_EXPIRED, -EPROTO, + rxgk_abort_chall_key_expired); + return false; + } + + if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), + nonce, sizeof(nonce)) < 0) { + rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_chall_short); + return false; + } + + trace_rxrpc_rx_challenge(conn, sp->hdr.serial, 0, *(u32 *)nonce, 0); + return true; +} + +/** + * rxgk_kernel_query_challenge - Query RxGK-specific challenge parameters + * @challenge: The challenge packet to query + * + * Return: The Kerberos 5 encoding type for the challenged connection. + */ +u32 rxgk_kernel_query_challenge(struct sk_buff *challenge) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(challenge); + + return sp->chall.conn->rxgk.enctype; +} +EXPORT_SYMBOL(rxgk_kernel_query_challenge); + +/* + * Fill out the control message to pass to userspace to inform about the + * challenge. + */ +static int rxgk_challenge_to_recvmsg(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct msghdr *msg) +{ + struct rxgk_challenge chall; + + chall.base.service_id = conn->service_id; + chall.base.security_index = conn->security_ix; + chall.enctype = conn->rxgk.enctype; + + return put_cmsg(msg, SOL_RXRPC, RXRPC_CHALLENGED, sizeof(chall), &chall); +} + +/* + * Insert the requisite amount of XDR padding for the length given. + */ +static int rxgk_pad_out(struct sk_buff *response, size_t len, size_t offset) +{ + __be32 zero = 0; + size_t pad = xdr_round_up(len) - len; + int ret; + + if (!pad) + return 0; + + ret = skb_store_bits(response, offset, &zero, pad); + if (ret < 0) + return ret; + return pad; +} + +/* + * Insert the header into the response. + */ +static noinline ssize_t rxgk_insert_response_header(struct rxrpc_connection *conn, + struct rxgk_context *gk, + struct sk_buff *response, + size_t offset) +{ + struct rxrpc_skb_priv *rsp = rxrpc_skb(response); + + struct { + struct rxrpc_wire_header whdr; + __be32 start_time_msw; + __be32 start_time_lsw; + __be32 ticket_len; + } h; + int ret; + + rsp->resp.kvno = gk->key_number; + rsp->resp.version = gk->krb5->etype; + + h.whdr.epoch = htonl(conn->proto.epoch); + h.whdr.cid = htonl(conn->proto.cid); + h.whdr.callNumber = 0; + h.whdr.serial = 0; + h.whdr.seq = 0; + h.whdr.type = RXRPC_PACKET_TYPE_RESPONSE; + h.whdr.flags = conn->out_clientflag; + h.whdr.userStatus = 0; + h.whdr.securityIndex = conn->security_ix; + h.whdr.cksum = htons(gk->key_number); + h.whdr.serviceId = htons(conn->service_id); + h.start_time_msw = htonl(upper_32_bits(conn->rxgk.start_time)); + h.start_time_lsw = htonl(lower_32_bits(conn->rxgk.start_time)); + h.ticket_len = htonl(gk->key->ticket.len); + + ret = skb_store_bits(response, offset, &h, sizeof(h)); + return ret < 0 ? 
ret : sizeof(h); +} + +/* + * Construct the authenticator to go in the response packet + * + * struct RXGK_Authenticator { + * opaque nonce[20]; + * opaque appdata<>; + * RXGK_Level level; + * unsigned int epoch; + * unsigned int cid; + * unsigned int call_numbers<>; + * }; + */ +static ssize_t rxgk_construct_authenticator(struct rxrpc_connection *conn, + struct sk_buff *challenge, + const struct krb5_buffer *appdata, + struct sk_buff *response, + size_t offset) +{ + struct { + u8 nonce[20]; + __be32 appdata_len; + } a; + struct { + __be32 level; + __be32 epoch; + __be32 cid; + __be32 call_numbers_count; + __be32 call_numbers[4]; + } b; + int ret; + + ret = skb_copy_bits(challenge, sizeof(struct rxrpc_wire_header), + a.nonce, sizeof(a.nonce)); + if (ret < 0) + return -EPROTO; + + a.appdata_len = htonl(appdata->len); + + ret = skb_store_bits(response, offset, &a, sizeof(a)); + if (ret < 0) + return ret; + offset += sizeof(a); + + if (appdata->len) { + ret = skb_store_bits(response, offset, appdata->data, appdata->len); + if (ret < 0) + return ret; + offset += appdata->len; + + ret = rxgk_pad_out(response, appdata->len, offset); + if (ret < 0) + return ret; + offset += ret; + } + + b.level = htonl(conn->security_level); + b.epoch = htonl(conn->proto.epoch); + b.cid = htonl(conn->proto.cid); + b.call_numbers_count = htonl(4); + b.call_numbers[0] = htonl(conn->channels[0].call_counter); + b.call_numbers[1] = htonl(conn->channels[1].call_counter); + b.call_numbers[2] = htonl(conn->channels[2].call_counter); + b.call_numbers[3] = htonl(conn->channels[3].call_counter); + + ret = skb_store_bits(response, offset, &b, sizeof(b)); + if (ret < 0) + return ret; + return sizeof(a) + xdr_round_up(appdata->len) + sizeof(b); +} + +static ssize_t rxgk_encrypt_authenticator(struct rxrpc_connection *conn, + struct rxgk_context *gk, + struct sk_buff *response, + size_t offset, + size_t alloc_len, + size_t auth_offset, + size_t auth_len) +{ + struct scatterlist sg[16]; + int nr_sg; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(response, sg, offset, alloc_len); + if (unlikely(nr_sg < 0)) + return nr_sg; + return crypto_krb5_encrypt(gk->krb5, gk->resp_enc, sg, nr_sg, alloc_len, + auth_offset, auth_len, false); +} + +/* + * Construct the response. 
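
All the variable-length fields being assembled here are XDR opaques, so each one is padded to a 4-byte boundary with zero bytes; rxgk_pad_out() stores the pad and returns its size. The rule in isolation (sketch, using the xdr_round_up() macro defined later in rxgk_common.h):

	static inline size_t xdr_pad(size_t len)
	{
		return xdr_round_up(len) - len;	/* 0, 1, 2 or 3 zero bytes */
	}

	/* e.g. a 5-byte appdata blob is followed by 3 zero bytes, while the
	 * 20-byte nonce needs no padding at all.
	 */
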
+ * + * struct RXGK_Response { + * rxgkTime start_time; + * RXGK_Data token; + * opaque authenticator<RXGK_MAXAUTHENTICATOR> + * }; + */ +static int rxgk_construct_response(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + struct rxrpc_skb_priv *csp, *rsp; + struct rxgk_context *gk; + struct sk_buff *response; + size_t len, auth_len, authx_len, offset, auth_offset, authx_offset; + __be32 tmp; + int ret; + + gk = rxgk_get_key(conn, NULL); + if (IS_ERR(gk)) + return PTR_ERR(gk); + + auth_len = 20 + (4 + appdata->len) + 12 + (1 + 4) * 4; + authx_len = crypto_krb5_how_much_buffer(gk->krb5, KRB5_ENCRYPT_MODE, + auth_len, &auth_offset); + len = sizeof(struct rxrpc_wire_header) + + 8 + (4 + xdr_round_up(gk->key->ticket.len)) + (4 + authx_len); + + response = alloc_skb_with_frags(0, len, 0, &ret, GFP_NOFS); + if (!response) + goto error; + rxrpc_new_skb(response, rxrpc_skb_new_response_rxgk); + response->len = len; + response->data_len = len; + + ret = rxgk_insert_response_header(conn, gk, response, 0); + if (ret < 0) + goto error; + offset = ret; + + ret = skb_store_bits(response, offset, gk->key->ticket.data, gk->key->ticket.len); + if (ret < 0) + goto error; + offset += gk->key->ticket.len; + ret = rxgk_pad_out(response, gk->key->ticket.len, offset); + if (ret < 0) + goto error; + + authx_offset = offset + ret + 4; /* Leave a gap for the length. */ + + ret = rxgk_construct_authenticator(conn, challenge, appdata, response, + authx_offset + auth_offset); + if (ret < 0) + goto error; + auth_len = ret; + + ret = rxgk_encrypt_authenticator(conn, gk, response, + authx_offset, authx_len, + auth_offset, auth_len); + if (ret < 0) + goto error; + authx_len = ret; + + tmp = htonl(authx_len); + ret = skb_store_bits(response, authx_offset - 4, &tmp, 4); + if (ret < 0) + goto error; + + ret = rxgk_pad_out(response, authx_len, authx_offset + authx_len); + if (ret < 0) + goto error; + len = authx_offset + authx_len + ret; + + if (len != response->len) { + response->len = len; + response->data_len = len; + } + + csp = rxrpc_skb(challenge); + rsp = rxrpc_skb(response); + rsp->resp.len = len; + rsp->resp.challenge_serial = csp->hdr.serial; + rxrpc_post_response(conn, response); + response = NULL; + ret = 0; + +error: + rxrpc_free_skb(response, rxrpc_skb_put_response); + rxgk_put(gk); + _leave(" = %d", ret); + return ret; +} + +/* + * Respond to a challenge packet. + */ +static int rxgk_respond_to_challenge(struct rxrpc_connection *conn, + struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + + if (key_validate(conn->key) < 0) + return rxrpc_abort_conn(conn, NULL, RXGK_EXPIRED, -EPROTO, + rxgk_abort_chall_key_expired); + + return rxgk_construct_response(conn, challenge, appdata); +} + +static int rxgk_respond_to_challenge_no_appdata(struct rxrpc_connection *conn, + struct sk_buff *challenge) +{ + struct krb5_buffer appdata = {}; + + return rxgk_respond_to_challenge(conn, challenge, &appdata); +} + +/** + * rxgk_kernel_respond_to_challenge - Respond to a challenge with appdata + * @challenge: The challenge to respond to + * @appdata: The application data to include in the RESPONSE authenticator + * + * Allow a kernel application to respond to a CHALLENGE with application data + * to be included in the RxGK RESPONSE Authenticator. + * + * Return: %0 if successful and a negative error code otherwise. 
+ */ +int rxgk_kernel_respond_to_challenge(struct sk_buff *challenge, + struct krb5_buffer *appdata) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + + return rxgk_respond_to_challenge(csp->chall.conn, challenge, appdata); +} +EXPORT_SYMBOL(rxgk_kernel_respond_to_challenge); + +/* + * Parse sendmsg() control message and respond to challenge. We need to see if + * there's an appdata to fish out. + */ +static int rxgk_sendmsg_respond_to_challenge(struct sk_buff *challenge, + struct msghdr *msg) +{ + struct krb5_buffer appdata = {}; + struct cmsghdr *cmsg; + + for_each_cmsghdr(cmsg, msg) { + if (cmsg->cmsg_level != SOL_RXRPC || + cmsg->cmsg_type != RXRPC_RESP_RXGK_APPDATA) + continue; + if (appdata.data) + return -EINVAL; + appdata.data = CMSG_DATA(cmsg); + appdata.len = cmsg->cmsg_len - sizeof(struct cmsghdr); + } + + return rxgk_kernel_respond_to_challenge(challenge, &appdata); +} + +/* + * Verify the authenticator. + * + * struct RXGK_Authenticator { + * opaque nonce[20]; + * opaque appdata<>; + * RXGK_Level level; + * unsigned int epoch; + * unsigned int cid; + * unsigned int call_numbers<>; + * }; + */ +static int rxgk_do_verify_authenticator(struct rxrpc_connection *conn, + const struct krb5_enctype *krb5, + struct sk_buff *skb, + __be32 *p, __be32 *end) +{ + u32 app_len, call_count, level, epoch, cid, i; + + _enter(""); + + if (memcmp(p, conn->rxgk.nonce, 20) != 0) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_nonce); + p += 20 / sizeof(__be32); + + app_len = ntohl(*p++); + if (app_len > (end - p) * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_applen); + + p += xdr_round_up(app_len) / sizeof(__be32); + if (end - p < 4) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_applen); + + level = ntohl(*p++); + epoch = ntohl(*p++); + cid = ntohl(*p++); + call_count = ntohl(*p++); + + if (level != conn->security_level || + epoch != conn->proto.epoch || + cid != conn->proto.cid || + call_count > 4) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_param); + + if (end - p < call_count) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_call_list); + + for (i = 0; i < call_count; i++) { + u32 call_id = ntohl(*p++); + + if (call_id > INT_MAX) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_bad_callid); + + if (call_id < conn->channels[i].call_counter) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_call_ctr); + + if (call_id > conn->channels[i].call_counter) { + if (conn->channels[i].call) + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_call_state); + + conn->channels[i].call_counter = call_id; + } + } + + _leave(" = 0"); + return 0; +} + +/* + * Extract the authenticator and verify it. + */ +static int rxgk_verify_authenticator(struct rxrpc_connection *conn, + const struct krb5_enctype *krb5, + struct sk_buff *skb, + unsigned int auth_offset, unsigned int auth_len) +{ + void *auth; + __be32 *p; + int ret; + + auth = kmalloc(auth_len, GFP_NOFS); + if (!auth) + return -ENOMEM; + + ret = skb_copy_bits(skb, auth_offset, auth, auth_len); + if (ret < 0) { + ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EPROTO, + rxgk_abort_resp_short_auth); + goto error; + } + + p = auth; + ret = rxgk_do_verify_authenticator(conn, krb5, skb, p, p + auth_len); +error: + kfree(auth); + return ret; +} + +/* + * Verify a response. 
+ * + * struct RXGK_Response { + * rxgkTime start_time; + * RXGK_Data token; + * opaque authenticator<RXGK_MAXAUTHENTICATOR> + * }; + */ +static int rxgk_verify_response(struct rxrpc_connection *conn, + struct sk_buff *skb) +{ + const struct krb5_enctype *krb5; + struct rxrpc_key_token *token; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + struct rxgk_response rhdr; + struct rxgk_context *gk; + struct key *key = NULL; + unsigned int offset = sizeof(struct rxrpc_wire_header); + unsigned int len = skb->len - sizeof(struct rxrpc_wire_header); + unsigned int token_offset, token_len; + unsigned int auth_offset, auth_len; + __be32 xauth_len; + int ret, ec; + + _enter("{%d}", conn->debug_id); + + /* Parse the RXGK_Response object */ + if (sizeof(rhdr) + sizeof(__be32) > len) + goto short_packet; + + if (skb_copy_bits(skb, offset, &rhdr, sizeof(rhdr)) < 0) + goto short_packet; + offset += sizeof(rhdr); + len -= sizeof(rhdr); + + token_offset = offset; + token_len = ntohl(rhdr.token_len); + if (xdr_round_up(token_len) + sizeof(__be32) > len) + goto short_packet; + + trace_rxrpc_rx_response(conn, sp->hdr.serial, 0, sp->hdr.cksum, token_len); + + offset += xdr_round_up(token_len); + len -= xdr_round_up(token_len); + + if (skb_copy_bits(skb, offset, &xauth_len, sizeof(xauth_len)) < 0) + goto short_packet; + offset += sizeof(xauth_len); + len -= sizeof(xauth_len); + + auth_offset = offset; + auth_len = ntohl(xauth_len); + if (auth_len < len) + goto short_packet; + if (auth_len & 3) + goto inconsistent; + if (auth_len < 20 + 9 * 4) + goto auth_too_short; + + /* We need to extract and decrypt the token and instantiate a session + * key for it. This bit, however, is application-specific. If + * possible, we use a default parser, but we might end up bumping this + * to the app to deal with - which might mean a round trip to + * userspace. + */ + ret = rxgk_extract_token(conn, skb, token_offset, token_len, &key); + if (ret < 0) + goto out; + + /* We now have a key instantiated from the decrypted ticket. We can + * pass this to the application so that they can parse the ticket + * content and we can use the session key it contains to derive the + * keys we need. + * + * Note that we have to switch enctype at this point as the enctype of + * the ticket doesn't necessarily match that of the transport. + */ + token = key->payload.data[0]; + conn->security_level = token->rxgk->level; + conn->rxgk.start_time = __be64_to_cpu(rhdr.start_time); + + gk = rxgk_generate_transport_key(conn, token->rxgk, sp->hdr.cksum, GFP_NOFS); + if (IS_ERR(gk)) { + ret = PTR_ERR(gk); + goto cant_get_token; + } + + krb5 = gk->krb5; + + trace_rxrpc_rx_response(conn, sp->hdr.serial, krb5->etype, sp->hdr.cksum, token_len); + + /* Decrypt, parse and verify the authenticator. 
*/ + ret = rxgk_decrypt_skb(krb5, gk->resp_enc, skb, + &auth_offset, &auth_len, &ec); + if (ret < 0) { + rxrpc_abort_conn(conn, skb, RXGK_SEALEDINCON, ret, + rxgk_abort_resp_auth_dec); + goto out; + } + + ret = rxgk_verify_authenticator(conn, krb5, skb, auth_offset, auth_len); + if (ret < 0) + goto out; + + conn->key = key; + key = NULL; + ret = 0; +out: + key_put(key); + _leave(" = %d", ret); + return ret; + +inconsistent: + ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_xdr_align); + goto out; +auth_too_short: + ret = rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_short_auth); + goto out; +short_packet: + ret = rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_short_packet); + goto out; + +cant_get_token: + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -EINVAL: + ret = rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_internal_error); + goto out; + case -ENOPKG: + ret = rxrpc_abort_conn(conn, skb, KRB5_PROG_KEYTYPE_NOSUPP, + -EKEYREJECTED, rxgk_abort_resp_nopkg); + goto out; + } + +temporary_error: + /* Ignore the response packet if we got a temporary error such as + * ENOMEM. We just want to send the challenge again. Note that we + * also come out this way if the ticket decryption fails. + */ + goto out; +} + +/* + * clear the connection security + */ +static void rxgk_clear(struct rxrpc_connection *conn) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(conn->rxgk.keys); i++) + rxgk_put(conn->rxgk.keys[i]); +} + +/* + * Initialise the RxGK security service. + */ +static int rxgk_init(void) +{ + return 0; +} + +/* + * Clean up the RxGK security service. + */ +static void rxgk_exit(void) +{ +} + +/* + * RxRPC YFS GSSAPI-based security + */ +const struct rxrpc_security rxgk_yfs = { + .name = "yfs-rxgk", + .security_index = RXRPC_SECURITY_YFS_RXGK, + .no_key_abort = RXGK_NOTAUTH, + .init = rxgk_init, + .exit = rxgk_exit, + .preparse_server_key = rxgk_preparse_server_key, + .free_preparse_server_key = rxgk_free_preparse_server_key, + .destroy_server_key = rxgk_destroy_server_key, + .describe_server_key = rxgk_describe_server_key, + .init_connection_security = rxgk_init_connection_security, + .alloc_txbuf = rxgk_alloc_txbuf, + .secure_packet = rxgk_secure_packet, + .verify_packet = rxgk_verify_packet, + .free_call_crypto = rxgk_free_call_crypto, + .issue_challenge = rxgk_issue_challenge, + .validate_challenge = rxgk_validate_challenge, + .challenge_to_recvmsg = rxgk_challenge_to_recvmsg, + .sendmsg_respond_to_challenge = rxgk_sendmsg_respond_to_challenge, + .respond_to_challenge = rxgk_respond_to_challenge_no_appdata, + .verify_response = rxgk_verify_response, + .clear = rxgk_clear, + .default_decode_ticket = rxgk_yfs_decode_ticket, +}; diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c new file mode 100644 index 000000000000..b94b77a1c317 --- /dev/null +++ b/net/rxrpc/rxgk_app.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Application-specific bits for GSSAPI-based RxRPC security + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/net.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/key-type.h> +#include "ar-internal.h" +#include "rxgk_common.h" + +/* + * Decode a default-style YFS ticket in a response and turn it into an + * rxrpc-type key. 
+ * + * struct rxgk_key { + * afs_uint32 enctype; + * opaque key<>; + * }; + * + * struct RXGK_AuthName { + * afs_int32 kind; + * opaque data<AUTHDATAMAX>; + * opaque display<AUTHPRINTABLEMAX>; + * }; + * + * struct RXGK_Token { + * rxgk_key K0; + * RXGK_Level level; + * rxgkTime starttime; + * afs_int32 lifetime; + * afs_int32 bytelife; + * rxgkTime expirationtime; + * struct RXGK_AuthName identities<>; + * }; + */ +int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key) +{ + struct rxrpc_key_token *token; + const struct cred *cred = current_cred(); // TODO - use socket creds + struct key *key; + size_t pre_ticket_len, payload_len; + unsigned int klen, enctype; + void *payload, *ticket; + __be32 *t, *p, *q, tmp[2]; + int ret; + + _enter(""); + + /* Get the session key length */ + ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp)); + if (ret < 0) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_klen); + enctype = ntohl(tmp[0]); + klen = ntohl(tmp[1]); + + if (klen > ticket_len - 10 * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_key); + + pre_ticket_len = ((5 + 14) * sizeof(__be32) + + xdr_round_up(klen) + + sizeof(__be32)); + payload_len = pre_ticket_len + xdr_round_up(ticket_len); + + payload = kzalloc(payload_len, GFP_NOFS); + if (!payload) + return -ENOMEM; + + /* We need to fill out the XDR form for a key payload that we can pass + * to add_key(). Start by copying in the ticket so that we can parse + * it. + */ + ticket = payload + pre_ticket_len; + ret = skb_copy_bits(skb, ticket_offset, ticket, ticket_len); + if (ret < 0) { + ret = rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_tkt); + goto error; + } + + /* Fill out the form header. */ + p = payload; + p[0] = htonl(0); /* Flags */ + p[1] = htonl(1); /* len(cellname) */ + p[2] = htonl(0x20000000); /* Cellname " " */ + p[3] = htonl(1); /* #tokens */ + p[4] = htonl(15 * sizeof(__be32) + xdr_round_up(klen) + + xdr_round_up(ticket_len)); /* Token len */ + + /* Now fill in the body. Most of this we can just scrape directly from + * the ticket. + */ + t = ticket + sizeof(__be32) * 2 + xdr_round_up(klen); + q = payload + 5 * sizeof(__be32); + q[0] = htonl(RXRPC_SECURITY_YFS_RXGK); + q[1] = t[1]; /* begintime - msw */ + q[2] = t[2]; /* - lsw */ + q[3] = t[5]; /* endtime - msw */ + q[4] = t[6]; /* - lsw */ + q[5] = 0; /* level - msw */ + q[6] = t[0]; /* - lsw */ + q[7] = 0; /* lifetime - msw */ + q[8] = t[3]; /* - lsw */ + q[9] = 0; /* bytelife - msw */ + q[10] = t[4]; /* - lsw */ + q[11] = 0; /* enctype - msw */ + q[12] = htonl(enctype); /* - lsw */ + q[13] = htonl(klen); /* Key length */ + + q += 14; + + memcpy(q, ticket + sizeof(__be32) * 2, klen); + q += xdr_round_up(klen) / 4; + q[0] = htonl(ticket_len); + q++; + if (WARN_ON((unsigned long)q != (unsigned long)ticket)) { + ret = -EIO; + goto error; + } + + /* Ticket read in with skb_copy_bits above */ + q += xdr_round_up(ticket_len) / 4; + if (WARN_ON((unsigned long)q - (unsigned long)payload != payload_len)) { + ret = -EIO; + goto error; + } + + /* Now turn that into a key. 
*/ + key = key_alloc(&key_type_rxrpc, "x", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, // TODO: Use socket owner + KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA, NULL); + if (IS_ERR(key)) { + _leave(" = -ENOMEM [alloc %ld]", PTR_ERR(key)); + ret = PTR_ERR(key); + goto error; + } + + _debug("key %d", key_serial(key)); + + ret = key_instantiate_and_link(key, payload, payload_len, NULL, NULL); + if (ret < 0) + goto error_key; + + token = key->payload.data[0]; + token->no_leak_key = true; + *_key = key; + key = NULL; + ret = 0; + goto error; + +error_key: + key_put(key); +error: + kfree_sensitive(payload); + _leave(" = %d", ret); + return ret; +} + +/* + * Extract the token and set up a session key from the details. + * + * struct RXGK_TokenContainer { + * afs_int32 kvno; + * afs_int32 enctype; + * opaque encrypted_token<>; + * }; + * + * [tools.ietf.org/html/draft-wilkinson-afs3-rxgk-afs-08 sec 6.1] + */ +int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int token_offset, unsigned int token_len, + struct key **_key) +{ + const struct krb5_enctype *krb5; + const struct krb5_buffer *server_secret; + struct crypto_aead *token_enc = NULL; + struct key *server_key; + unsigned int ticket_offset, ticket_len; + u32 kvno, enctype; + int ret, ec; + + struct { + __be32 kvno; + __be32 enctype; + __be32 token_len; + } container; + + /* Decode the RXGK_TokenContainer object. This tells us which server + * key we should be using. We can then fetch the key, get the secret + * and set up the crypto to extract the token. + */ + if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0) + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); + + kvno = ntohl(container.kvno); + enctype = ntohl(container.enctype); + ticket_len = ntohl(container.token_len); + ticket_offset = token_offset + sizeof(container); + + if (xdr_round_up(ticket_len) > token_len - 3 * 4) + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); + + _debug("KVNO %u", kvno); + _debug("ENC %u", enctype); + _debug("TLEN %u", ticket_len); + + server_key = rxrpc_look_up_server_security(conn, skb, kvno, enctype); + if (IS_ERR(server_key)) + goto cant_get_server_key; + + down_read(&server_key->sem); + server_secret = (const void *)&server_key->payload.data[2]; + ret = rxgk_set_up_token_cipher(server_secret, &token_enc, enctype, &krb5, GFP_NOFS); + up_read(&server_key->sem); + key_put(server_key); + if (ret < 0) + goto cant_get_token; + + /* We can now decrypt and parse the token/ticket. This allows us to + * gain access to K0, from which we can derive the transport key and + * thence decode the authenticator. 
+ */ + ret = rxgk_decrypt_skb(krb5, token_enc, skb, + &ticket_offset, &ticket_len, &ec); + crypto_free_aead(token_enc); + token_enc = NULL; + if (ret < 0) + return rxrpc_abort_conn(conn, skb, ec, ret, + rxgk_abort_resp_tok_dec); + + ret = conn->security->default_decode_ticket(conn, skb, ticket_offset, + ticket_len, _key); + if (ret < 0) + goto cant_get_token; + + _leave(" = 0"); + return ret; + +cant_get_server_key: + ret = PTR_ERR(server_key); + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -ENOKEY: + case -EKEYREJECTED: + case -EKEYEXPIRED: + case -EKEYREVOKED: + case -EPERM: + return rxrpc_abort_conn(conn, skb, RXGK_BADKEYNO, -EKEYREJECTED, + rxgk_abort_resp_tok_nokey); + default: + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_tok_keyerr); + } + +cant_get_token: + switch (ret) { + case -ENOMEM: + goto temporary_error; + case -EINVAL: + return rxrpc_abort_conn(conn, skb, RXGK_NOTAUTH, -EKEYREJECTED, + rxgk_abort_resp_tok_internal_error); + case -ENOPKG: + return rxrpc_abort_conn(conn, skb, KRB5_PROG_KEYTYPE_NOSUPP, + -EKEYREJECTED, rxgk_abort_resp_tok_nopkg); + } + +temporary_error: + /* Ignore the response packet if we got a temporary error such as + * ENOMEM. We just want to send the challenge again. Note that we + * also come out this way if the ticket decryption fails. + */ + return ret; +} diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h new file mode 100644 index 000000000000..7370a5655985 --- /dev/null +++ b/net/rxrpc/rxgk_common.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Common bits for GSSAPI-based RxRPC security. + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <crypto/krb5.h> +#include <crypto/skcipher.h> +#include <crypto/hash.h> + +/* + * Per-key number context. This is replaced when the connection is rekeyed. + */ +struct rxgk_context { + refcount_t usage; + unsigned int key_number; /* Rekeying number (goes in the rx header) */ + unsigned long flags; +#define RXGK_TK_NEEDS_REKEY 0 /* Set if this needs rekeying */ + unsigned long expiry; /* Expiration time of this key */ + long long bytes_remaining; /* Remaining Tx lifetime of this key */ + const struct krb5_enctype *krb5; /* RxGK encryption type */ + const struct rxgk_key *key; + + /* We need up to 7 keys derived from the transport key, but we don't + * actually need the transport key. Each key is derived by + * DK(TK,constant). 
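+ * For instance, a client ends up with tx_enc = DK(TK, client-enc-packet)
+ * and rx_enc = DK(TK, server-enc-packet), whilst a server derives the
+ * mirror image; see rxgk_set_up_ciphers() for the actual usage constants.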
+ */ + struct crypto_aead *tx_enc; /* Transmission key */ + struct crypto_aead *rx_enc; /* Reception key */ + struct crypto_shash *tx_Kc; /* Transmission checksum key */ + struct crypto_shash *rx_Kc; /* Reception checksum key */ + struct crypto_aead *resp_enc; /* Response packet enc key */ +}; + +#define xdr_round_up(x) (round_up((x), sizeof(__be32))) +#define xdr_object_len(x) (4 + xdr_round_up(x)) + +/* + * rxgk_app.c + */ +int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int ticket_offset, unsigned int ticket_len, + struct key **_key); +int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, + unsigned int token_offset, unsigned int token_len, + struct key **_key); + +/* + * rxgk_kdf.c + */ +void rxgk_put(struct rxgk_context *gk); +struct rxgk_context *rxgk_generate_transport_key(struct rxrpc_connection *conn, + const struct rxgk_key *key, + unsigned int key_number, + gfp_t gfp); +int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key, + struct crypto_aead **token_key, + unsigned int enctype, + const struct krb5_enctype **_krb5, + gfp_t gfp); + +/* + * Apply decryption and checksumming functions to part of an skbuff. The + * offset and length are updated to reflect the actual content of the encrypted + * region. + */ +static inline +int rxgk_decrypt_skb(const struct krb5_enctype *krb5, + struct crypto_aead *aead, + struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len, + int *_error_code) +{ + struct scatterlist sg[16]; + size_t offset = 0, len = *_len; + int nr_sg, ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(skb, sg, *_offset, len); + if (unlikely(nr_sg < 0)) + return nr_sg; + + ret = crypto_krb5_decrypt(krb5, aead, sg, nr_sg, + &offset, &len); + switch (ret) { + case 0: + *_offset += offset; + *_len = len; + break; + case -EPROTO: + case -EBADMSG: + *_error_code = RXGK_SEALEDINCON; + break; + default: + break; + } + + return ret; +} + +/* + * Check the MIC on a region of an skbuff. The offset and length are updated + * to reflect the actual content of the secure region. + */ +static inline +int rxgk_verify_mic_skb(const struct krb5_enctype *krb5, + struct crypto_shash *shash, + const struct krb5_buffer *metadata, + struct sk_buff *skb, + unsigned int *_offset, unsigned int *_len, + u32 *_error_code) +{ + struct scatterlist sg[16]; + size_t offset = 0, len = *_len; + int nr_sg, ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + nr_sg = skb_to_sgvec(skb, sg, *_offset, len); + if (unlikely(nr_sg < 0)) + return nr_sg; + + ret = crypto_krb5_verify_mic(krb5, shash, metadata, sg, nr_sg, + &offset, &len); + switch (ret) { + case 0: + *_offset += offset; + *_len = len; + break; + case -EPROTO: + case -EBADMSG: + *_error_code = RXGK_SEALEDINCON; + break; + default: + break; + } + + return ret; +} diff --git a/net/rxrpc/rxgk_kdf.c b/net/rxrpc/rxgk_kdf.c new file mode 100644 index 000000000000..b4db5aa30e5b --- /dev/null +++ b/net/rxrpc/rxgk_kdf.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* RxGK transport key derivation. + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/key-type.h> +#include <linux/slab.h> +#include <keys/rxrpc-type.h> +#include "ar-internal.h" +#include "rxgk_common.h" + +#define round16(x) (((x) + 15) & ~15) + +/* + * Constants used to derive the keys and hmacs actually used for doing stuff. 
+ */ +#define RXGK_CLIENT_ENC_PACKET 1026U // 0x402 +#define RXGK_CLIENT_MIC_PACKET 1027U // 0x403 +#define RXGK_SERVER_ENC_PACKET 1028U // 0x404 +#define RXGK_SERVER_MIC_PACKET 1029U // 0x405 +#define RXGK_CLIENT_ENC_RESPONSE 1030U // 0x406 +#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c + +static void rxgk_free(struct rxgk_context *gk) +{ + if (gk->tx_Kc) + crypto_free_shash(gk->tx_Kc); + if (gk->rx_Kc) + crypto_free_shash(gk->rx_Kc); + if (gk->tx_enc) + crypto_free_aead(gk->tx_enc); + if (gk->rx_enc) + crypto_free_aead(gk->rx_enc); + if (gk->resp_enc) + crypto_free_aead(gk->resp_enc); + kfree(gk); +} + +void rxgk_put(struct rxgk_context *gk) +{ + if (gk && refcount_dec_and_test(&gk->usage)) + rxgk_free(gk); +} + +/* + * Transport key derivation function. + * + * TK = random-to-key(PRF+(K0, L, + * epoch || cid || start_time || key_number)) + * [tools.ietf.org/html/draft-wilkinson-afs3-rxgk-11 sec 8.3] + */ +static int rxgk_derive_transport_key(struct rxrpc_connection *conn, + struct rxgk_context *gk, + const struct rxgk_key *rxgk, + struct krb5_buffer *TK, + gfp_t gfp) +{ + const struct krb5_enctype *krb5 = gk->krb5; + struct krb5_buffer conn_info; + unsigned int L = krb5->key_bytes; + __be32 *info; + u8 *buffer; + int ret; + + _enter(""); + + conn_info.len = sizeof(__be32) * 5; + + buffer = kzalloc(round16(conn_info.len), gfp); + if (!buffer) + return -ENOMEM; + + conn_info.data = buffer; + + info = (__be32 *)conn_info.data; + info[0] = htonl(conn->proto.epoch); + info[1] = htonl(conn->proto.cid); + info[2] = htonl(conn->rxgk.start_time >> 32); + info[3] = htonl(conn->rxgk.start_time >> 0); + info[4] = htonl(gk->key_number); + + ret = crypto_krb5_calc_PRFplus(krb5, &rxgk->key, L, &conn_info, TK, gfp); + kfree_sensitive(buffer); + _leave(" = %d", ret); + return ret; +} + +/* + * Set up the ciphers for the usage keys. 
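+ * Every context gets resp_enc for securing the RESPONSE packet; an
+ * AUTH-level connection additionally gets the two MIC shashes and an
+ * ENCRYPT-level one the two packet AEADs, with the client/server usage
+ * constants swapped according to which side of the connection we are.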
+ */ +static int rxgk_set_up_ciphers(struct rxrpc_connection *conn, + struct rxgk_context *gk, + const struct rxgk_key *rxgk, + gfp_t gfp) +{ + const struct krb5_enctype *krb5 = gk->krb5; + struct crypto_shash *shash; + struct crypto_aead *aead; + struct krb5_buffer TK; + bool service = rxrpc_conn_is_service(conn); + int ret; + u8 *buffer; + + buffer = kzalloc(krb5->key_bytes, gfp); + if (!buffer) + return -ENOMEM; + + TK.len = krb5->key_bytes; + TK.data = buffer; + + ret = rxgk_derive_transport_key(conn, gk, rxgk, &TK, gfp); + if (ret < 0) + goto out; + + aead = crypto_krb5_prepare_encryption(krb5, &TK, RXGK_CLIENT_ENC_RESPONSE, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->resp_enc = aead; + + if (crypto_aead_blocksize(gk->resp_enc) != krb5->block_len || + crypto_aead_authsize(gk->resp_enc) != krb5->cksum_len) { + pr_notice("algo inconsistent with krb5 table %u!=%u or %u!=%u\n", + crypto_aead_blocksize(gk->resp_enc), krb5->block_len, + crypto_aead_authsize(gk->resp_enc), krb5->cksum_len); + ret = -EINVAL; + goto out; + } + + if (service) { + switch (conn->security_level) { + case RXRPC_SECURITY_AUTH: + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_SERVER_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->tx_Kc = shash; + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_CLIENT_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->rx_Kc = shash; + break; + case RXRPC_SECURITY_ENCRYPT: + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_SERVER_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->tx_enc = aead; + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_CLIENT_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->rx_enc = aead; + break; + } + } else { + switch (conn->security_level) { + case RXRPC_SECURITY_AUTH: + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_CLIENT_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->tx_Kc = shash; + shash = crypto_krb5_prepare_checksum( + krb5, &TK, RXGK_SERVER_MIC_PACKET, gfp); + if (IS_ERR(shash)) + goto hash_error; + gk->rx_Kc = shash; + break; + case RXRPC_SECURITY_ENCRYPT: + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_CLIENT_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->tx_enc = aead; + aead = crypto_krb5_prepare_encryption( + krb5, &TK, RXGK_SERVER_ENC_PACKET, gfp); + if (IS_ERR(aead)) + goto aead_error; + gk->rx_enc = aead; + break; + } + } + + ret = 0; +out: + kfree_sensitive(buffer); + return ret; +aead_error: + ret = PTR_ERR(aead); + goto out; +hash_error: + ret = PTR_ERR(shash); + goto out; +} + +/* + * Derive a transport key for a connection and then derive a bunch of usage + * keys from it and set up ciphers using them. + */ +struct rxgk_context *rxgk_generate_transport_key(struct rxrpc_connection *conn, + const struct rxgk_key *key, + unsigned int key_number, + gfp_t gfp) +{ + struct rxgk_context *gk; + unsigned long lifetime; + int ret = -ENOPKG; + + _enter(""); + + gk = kzalloc(sizeof(*gk), GFP_KERNEL); + if (!gk) + return ERR_PTR(-ENOMEM); + refcount_set(&gk->usage, 1); + gk->key = key; + gk->key_number = key_number; + + gk->krb5 = crypto_krb5_find_enctype(key->enctype); + if (!gk->krb5) + goto err_tk; + + ret = rxgk_set_up_ciphers(conn, gk, key, gfp); + if (ret) + goto err_tk; + + /* Set the remaining number of bytes encrypted with this key that may + * be transmitted before rekeying. Note that the spec has been + * interpreted differently on this point... 
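+	 * For example, a bytelife of 30 is taken here to mean 2^30 bytes
+	 * (1GiB) may be transmitted before a rekey, 0 and 63 both mean "no
+	 * byte limit" and larger values are treated as a literal byte count.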
+ */ + switch (key->bytelife) { + case 0: + case 63: + gk->bytes_remaining = LLONG_MAX; + break; + case 1 ... 62: + gk->bytes_remaining = 1LL << key->bytelife; + break; + default: + gk->bytes_remaining = key->bytelife; + break; + } + + /* Set the time after which rekeying must occur */ + if (key->lifetime) { + lifetime = min_t(u64, key->lifetime, INT_MAX / HZ); + lifetime *= HZ; + } else { + lifetime = MAX_JIFFY_OFFSET; + } + gk->expiry = jiffies + lifetime; + return gk; + +err_tk: + rxgk_put(gk); + _leave(" = %d", ret); + return ERR_PTR(ret); +} + +/* + * Use the server secret key to set up the ciphers that will be used to extract + * the token from a response packet. + */ +int rxgk_set_up_token_cipher(const struct krb5_buffer *server_key, + struct crypto_aead **token_aead, + unsigned int enctype, + const struct krb5_enctype **_krb5, + gfp_t gfp) +{ + const struct krb5_enctype *krb5; + struct crypto_aead *aead; + + krb5 = crypto_krb5_find_enctype(enctype); + if (!krb5) + return -ENOPKG; + + aead = crypto_krb5_prepare_encryption(krb5, server_key, RXGK_SERVER_ENC_TOKEN, gfp); + if (IS_ERR(aead)) + return PTR_ERR(aead); + + *_krb5 = krb5; + *token_aead = aead; + return 0; +} diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 48a1475e6b06..3657c0661cdc 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -148,14 +148,14 @@ error: static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp) { struct rxrpc_txbuf *txb; - size_t shdr, space; + size_t shdr, alloc, limit, part; - remain = min(remain, 65535 - sizeof(struct rxrpc_wire_header)); + remain = umin(remain, 65535 - sizeof(struct rxrpc_wire_header)); switch (call->conn->security_level) { default: - space = min_t(size_t, remain, RXRPC_JUMBO_DATALEN); - return rxrpc_alloc_data_txbuf(call, space, 1, gfp); + alloc = umin(remain, RXRPC_JUMBO_DATALEN); + return rxrpc_alloc_data_txbuf(call, alloc, 1, gfp); case RXRPC_SECURITY_AUTH: shdr = sizeof(struct rxkad_level1_hdr); break; @@ -164,15 +164,23 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem break; } - space = min_t(size_t, round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN), remain + shdr); - space = round_up(space, RXKAD_ALIGN); + limit = round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN) - shdr; + if (remain < limit) { + part = remain; + alloc = round_up(shdr + part, RXKAD_ALIGN); + } else { + part = limit; + alloc = RXRPC_JUMBO_DATALEN; + } - txb = rxrpc_alloc_data_txbuf(call, space, RXKAD_ALIGN, gfp); + txb = rxrpc_alloc_data_txbuf(call, alloc, RXKAD_ALIGN, gfp); if (!txb) return NULL; - txb->offset += shdr; - txb->space -= shdr; + txb->crypto_header = 0; + txb->sec_header = shdr; + txb->offset += shdr; + txb->space = part; return txb; } @@ -251,8 +259,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, struct rxrpc_txbuf *txb, struct skcipher_request *req) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; - struct rxkad_level1_hdr *hdr = (void *)(whdr + 1); + struct rxkad_level1_hdr *hdr = txb->data; struct rxrpc_crypt iv; struct scatterlist sg; size_t pad; @@ -263,13 +270,13 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, check = txb->seq ^ call->call_id; hdr->data_size = htonl((u32)check << 16 | txb->len); - txb->len += sizeof(struct rxkad_level1_hdr); - pad = txb->len; + txb->pkt_len = sizeof(struct rxkad_level1_hdr) + txb->len; + pad = txb->pkt_len; pad = RXKAD_ALIGN - pad; pad &= RXKAD_ALIGN - 1; if (pad) { - memset(txb->kvec[0].iov_base + txb->offset, 0, pad); - 
txb->len += pad; + memset(txb->data + txb->offset, 0, pad); + txb->pkt_len += pad; } /* start the encryption afresh */ @@ -294,11 +301,10 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, struct skcipher_request *req) { const struct rxrpc_key_token *token; - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; - struct rxkad_level2_hdr *rxkhdr = (void *)(whdr + 1); + struct rxkad_level2_hdr *rxkhdr = txb->data; struct rxrpc_crypt iv; struct scatterlist sg; - size_t pad; + size_t content, pad; u16 check; int ret; @@ -309,23 +315,20 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, rxkhdr->data_size = htonl(txb->len | (u32)check << 16); rxkhdr->checksum = 0; - txb->len += sizeof(struct rxkad_level2_hdr); - pad = txb->len; - pad = RXKAD_ALIGN - pad; - pad &= RXKAD_ALIGN - 1; - if (pad) { - memset(txb->kvec[0].iov_base + txb->offset, 0, pad); - txb->len += pad; - } + content = sizeof(struct rxkad_level2_hdr) + txb->len; + txb->pkt_len = round_up(content, RXKAD_ALIGN); + pad = txb->pkt_len - content; + if (pad) + memset(txb->data + txb->offset, 0, pad); /* encrypt from the session key */ token = call->conn->key->payload.data[0]; memcpy(&iv, token->kad->session_key, sizeof(iv)); - sg_init_one(&sg, rxkhdr, txb->len); + sg_init_one(&sg, rxkhdr, txb->pkt_len); skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sg, &sg, txb->len, iv.x); + skcipher_request_set_crypt(req, &sg, &sg, txb->pkt_len, iv.x); ret = crypto_skcipher_encrypt(req); skcipher_request_zero(req); return ret; @@ -384,19 +387,32 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) switch (call->conn->security_level) { case RXRPC_SECURITY_PLAIN: + txb->pkt_len = txb->len; ret = 0; break; case RXRPC_SECURITY_AUTH: ret = rxkad_secure_packet_auth(call, txb, req); + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; break; case RXRPC_SECURITY_ENCRYPT: ret = rxkad_secure_packet_encrypt(call, txb, req); + if (txb->alloc_size == RXRPC_JUMBO_DATALEN) + txb->jumboable = true; break; default: ret = -EPERM; break; } + /* Clear excess space in the packet */ + if (txb->pkt_len < txb->alloc_size) { + size_t gap = txb->alloc_size - txb->pkt_len; + void *p = txb->data; + + memset(p + txb->pkt_len, 0, gap); + } + skcipher_request_free(req); _leave(" = %d [set %x]", ret, y); return ret; @@ -669,6 +685,8 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) serial = rxrpc_get_next_serial(conn); whdr.serial = htonl(serial); + trace_rxrpc_tx_challenge(conn, serial, 0, conn->rxkad.nonce); + ret = kernel_sendmsg(conn->local->socket, &msg, iov, 2, len); if (ret < 0) { trace_rxrpc_tx_fail(conn->debug_id, serial, ret, @@ -684,62 +702,6 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn) } /* - * send a Kerberos security response - */ -static int rxkad_send_response(struct rxrpc_connection *conn, - struct rxrpc_host_header *hdr, - struct rxkad_response *resp, - const struct rxkad_key *s2) -{ - struct rxrpc_wire_header whdr; - struct msghdr msg; - struct kvec iov[3]; - size_t len; - u32 serial; - int ret; - - _enter(""); - - msg.msg_name = &conn->peer->srx.transport; - msg.msg_namelen = conn->peer->srx.transport_len; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_flags = 0; - - memset(&whdr, 0, sizeof(whdr)); - whdr.epoch = htonl(hdr->epoch); - whdr.cid = htonl(hdr->cid); - whdr.type = RXRPC_PACKET_TYPE_RESPONSE; - whdr.flags = 
conn->out_clientflag; - whdr.securityIndex = hdr->securityIndex; - whdr.serviceId = htons(hdr->serviceId); - - iov[0].iov_base = &whdr; - iov[0].iov_len = sizeof(whdr); - iov[1].iov_base = resp; - iov[1].iov_len = sizeof(*resp); - iov[2].iov_base = (void *)s2->ticket; - iov[2].iov_len = s2->ticket_len; - - len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len; - - serial = rxrpc_get_next_serial(conn); - whdr.serial = htonl(serial); - - rxrpc_local_dont_fragment(conn->local, false); - ret = kernel_sendmsg(conn->local->socket, &msg, iov, 3, len); - if (ret < 0) { - trace_rxrpc_tx_fail(conn->debug_id, serial, ret, - rxrpc_tx_point_rxkad_response); - return -EAGAIN; - } - - conn->peer->last_tx_at = ktime_get_seconds(); - _leave(" = 0"); - return 0; -} - -/* * calculate the response checksum */ static void rxkad_calc_response_checksum(struct rxkad_response *response) @@ -758,12 +720,21 @@ static void rxkad_calc_response_checksum(struct rxkad_response *response) * encrypt the response packet */ static int rxkad_encrypt_response(struct rxrpc_connection *conn, - struct rxkad_response *resp, + struct sk_buff *response, const struct rxkad_key *s2) { struct skcipher_request *req; struct rxrpc_crypt iv; struct scatterlist sg[1]; + size_t encsize = sizeof(((struct rxkad_response *)0)->encrypted); + int ret; + + sg_init_table(sg, ARRAY_SIZE(sg)); + ret = skb_to_sgvec(response, sg, + sizeof(struct rxrpc_wire_header) + + offsetof(struct rxkad_response, encrypted), encsize); + if (ret < 0) + return ret; req = skcipher_request_alloc(&conn->rxkad.cipher->base, GFP_NOFS); if (!req) @@ -772,89 +743,206 @@ static int rxkad_encrypt_response(struct rxrpc_connection *conn, /* continue encrypting from where we left off */ memcpy(&iv, s2->session_key, sizeof(iv)); - sg_init_table(sg, 1); - sg_set_buf(sg, &resp->encrypted, sizeof(resp->encrypted)); skcipher_request_set_sync_tfm(req, conn->rxkad.cipher); skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x); - crypto_skcipher_encrypt(req); + skcipher_request_set_crypt(req, sg, sg, encsize, iv.x); + ret = crypto_skcipher_encrypt(req); skcipher_request_free(req); - return 0; + return ret; } /* - * respond to a challenge packet + * Validate a challenge packet. 
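+ * A challenge is acceptable if we hold a valid key, the version matches
+ * RXKAD_VERSION and our security level meets the server's minimum; the
+ * nonce is stashed in the skb private data for use in the RESPONSE.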
*/ -static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, - struct sk_buff *skb) +static bool rxkad_validate_challenge(struct rxrpc_connection *conn, + struct sk_buff *skb) { - const struct rxrpc_key_token *token; struct rxkad_challenge challenge; - struct rxkad_response *resp; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); - u32 version, nonce, min_level; - int ret = -EPROTO; + u32 version, min_level; + int ret; _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); - if (!conn->key) - return rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, - rxkad_abort_chall_no_key); + if (!conn->key) { + rxrpc_abort_conn(conn, skb, RX_PROTOCOL_ERROR, -EPROTO, + rxkad_abort_chall_no_key); + return false; + } ret = key_validate(conn->key); - if (ret < 0) - return rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret, - rxkad_abort_chall_key_expired); + if (ret < 0) { + rxrpc_abort_conn(conn, skb, RXKADEXPIRED, ret, + rxkad_abort_chall_key_expired); + return false; + } if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), - &challenge, sizeof(challenge)) < 0) - return rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, - rxkad_abort_chall_short); + &challenge, sizeof(challenge)) < 0) { + rxrpc_abort_conn(conn, skb, RXKADPACKETSHORT, -EPROTO, + rxkad_abort_chall_short); + return false; + } version = ntohl(challenge.version); - nonce = ntohl(challenge.nonce); + sp->chall.rxkad_nonce = ntohl(challenge.nonce); min_level = ntohl(challenge.min_level); - trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, nonce, min_level); + trace_rxrpc_rx_challenge(conn, sp->hdr.serial, version, + sp->chall.rxkad_nonce, min_level); + + if (version != RXKAD_VERSION) { + rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, + rxkad_abort_chall_version); + return false; + } + + if (conn->security_level < min_level) { + rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES, + rxkad_abort_chall_level); + return false; + } + return true; +} + +/* + * Insert the header into the response. 
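+ * The response buffer being filled in here is laid out as:
+ *
+ *	struct rxrpc_wire_header
+ *	struct rxkad_response (the encrypted part is enciphered afterwards)
+ *	ticket (token->kad->ticket_len bytes, appended after encryption)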
+ */ +static noinline +int rxkad_insert_response_header(struct rxrpc_connection *conn, + const struct rxrpc_key_token *token, + struct sk_buff *challenge, + struct sk_buff *response, + size_t *offset) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + struct { + struct rxrpc_wire_header whdr; + struct rxkad_response resp; + } h; + int ret; + + h.whdr.epoch = htonl(conn->proto.epoch); + h.whdr.cid = htonl(conn->proto.cid); + h.whdr.callNumber = 0; + h.whdr.serial = 0; + h.whdr.seq = 0; + h.whdr.type = RXRPC_PACKET_TYPE_RESPONSE; + h.whdr.flags = conn->out_clientflag; + h.whdr.userStatus = 0; + h.whdr.securityIndex = conn->security_ix; + h.whdr.cksum = 0; + h.whdr.serviceId = htons(conn->service_id); + h.resp.version = htonl(RXKAD_VERSION); + h.resp.__pad = 0; + h.resp.encrypted.epoch = htonl(conn->proto.epoch); + h.resp.encrypted.cid = htonl(conn->proto.cid); + h.resp.encrypted.checksum = 0; + h.resp.encrypted.securityIndex = htonl(conn->security_ix); + h.resp.encrypted.call_id[0] = htonl(conn->channels[0].call_counter); + h.resp.encrypted.call_id[1] = htonl(conn->channels[1].call_counter); + h.resp.encrypted.call_id[2] = htonl(conn->channels[2].call_counter); + h.resp.encrypted.call_id[3] = htonl(conn->channels[3].call_counter); + h.resp.encrypted.inc_nonce = htonl(csp->chall.rxkad_nonce + 1); + h.resp.encrypted.level = htonl(conn->security_level); + h.resp.kvno = htonl(token->kad->kvno); + h.resp.ticket_len = htonl(token->kad->ticket_len); + + rxkad_calc_response_checksum(&h.resp); + + ret = skb_store_bits(response, *offset, &h, sizeof(h)); + *offset += sizeof(h); + return ret; +} - if (version != RXKAD_VERSION) - return rxrpc_abort_conn(conn, skb, RXKADINCONSISTENCY, -EPROTO, - rxkad_abort_chall_version); +/* + * respond to a challenge packet + */ +static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, + struct sk_buff *challenge) +{ + const struct rxrpc_key_token *token; + struct rxrpc_skb_priv *csp, *rsp; + struct sk_buff *response; + size_t len, offset = 0; + int ret = -EPROTO; - if (conn->security_level < min_level) - return rxrpc_abort_conn(conn, skb, RXKADLEVELFAIL, -EACCES, - rxkad_abort_chall_level); + _enter("{%d,%x}", conn->debug_id, key_serial(conn->key)); + + ret = key_validate(conn->key); + if (ret < 0) + return rxrpc_abort_conn(conn, challenge, RXKADEXPIRED, ret, + rxkad_abort_chall_key_expired); token = conn->key->payload.data[0]; /* build the response packet */ - resp = kzalloc(sizeof(struct rxkad_response), GFP_NOFS); - if (!resp) - return -ENOMEM; + len = sizeof(struct rxrpc_wire_header) + + sizeof(struct rxkad_response) + + token->kad->ticket_len; + + response = alloc_skb_with_frags(0, len, 0, &ret, GFP_NOFS); + if (!response) + goto error; + rxrpc_new_skb(response, rxrpc_skb_new_response_rxkad); + response->len = len; + response->data_len = len; - resp->version = htonl(RXKAD_VERSION); - resp->encrypted.epoch = htonl(conn->proto.epoch); - resp->encrypted.cid = htonl(conn->proto.cid); - resp->encrypted.securityIndex = htonl(conn->security_ix); - resp->encrypted.inc_nonce = htonl(nonce + 1); - resp->encrypted.level = htonl(conn->security_level); - resp->kvno = htonl(token->kad->kvno); - resp->ticket_len = htonl(token->kad->ticket_len); - resp->encrypted.call_id[0] = htonl(conn->channels[0].call_counter); - resp->encrypted.call_id[1] = htonl(conn->channels[1].call_counter); - resp->encrypted.call_id[2] = htonl(conn->channels[2].call_counter); - resp->encrypted.call_id[3] = htonl(conn->channels[3].call_counter); - - /* calculate the response checksum 
and then do the encryption */ - rxkad_calc_response_checksum(resp); - ret = rxkad_encrypt_response(conn, resp, token->kad); - if (ret == 0) - ret = rxkad_send_response(conn, &sp->hdr, resp, token->kad); - kfree(resp); + offset = 0; + ret = rxkad_insert_response_header(conn, token, challenge, response, + &offset); + if (ret < 0) + goto error; + + ret = rxkad_encrypt_response(conn, response, token->kad); + if (ret < 0) + goto error; + + ret = skb_store_bits(response, offset, token->kad->ticket, + token->kad->ticket_len); + if (ret < 0) + goto error; + + csp = rxrpc_skb(challenge); + rsp = rxrpc_skb(response); + rsp->resp.len = len; + rsp->resp.challenge_serial = csp->hdr.serial; + rxrpc_post_response(conn, response); + response = NULL; + ret = 0; + +error: + rxrpc_free_skb(response, rxrpc_skb_put_response); return ret; } /* + * RxKAD does automatic response only as there's nothing to manage that isn't + * already in the key. + */ +static int rxkad_sendmsg_respond_to_challenge(struct sk_buff *challenge, + struct msghdr *msg) +{ + return -EINVAL; +} + +/** + * rxkad_kernel_respond_to_challenge - Respond to a challenge with appdata + * @challenge: The challenge to respond to + * + * Allow a kernel application to respond to a CHALLENGE. + * + * Return: %0 if successful and a negative error code otherwise. + */ +int rxkad_kernel_respond_to_challenge(struct sk_buff *challenge) +{ + struct rxrpc_skb_priv *csp = rxrpc_skb(challenge); + + return rxkad_respond_to_challenge(csp->chall.conn, challenge); +} +EXPORT_SYMBOL(rxkad_kernel_respond_to_challenge); + +/* * decrypt the kerberos IV ticket in the response */ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, @@ -1262,6 +1350,8 @@ const struct rxrpc_security rxkad = { .verify_packet = rxkad_verify_packet, .free_call_crypto = rxkad_free_call_crypto, .issue_challenge = rxkad_issue_challenge, + .validate_challenge = rxkad_validate_challenge, + .sendmsg_respond_to_challenge = rxkad_sendmsg_respond_to_challenge, .respond_to_challenge = rxkad_respond_to_challenge, .verify_response = rxkad_verify_response, .clear = rxkad_clear, diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 085e7892d310..0377301156b0 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "rxperf: " fmt #include <linux/module.h> #include <linux/slab.h> +#include <crypto/krb5.h> #include <net/sock.h> #include <net/af_rxrpc.h> #define RXRPC_TRACE_ONLY_DEFINE_ENUMS @@ -136,6 +137,12 @@ static void rxperf_notify_end_reply_tx(struct sock *sock, RXPERF_CALL_SV_AWAIT_ACK); } +static const struct rxrpc_kernel_ops rxperf_rxrpc_callback_ops = { + .notify_new_call = rxperf_rx_new_call, + .discard_new_call = rxperf_rx_discard_new_call, + .user_attach_call = rxperf_rx_attach, +}; + /* * Charge the incoming call preallocation. 
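 * Note that the attach callback is now supplied through the
 * rxrpc_kernel_ops table registered below with
 * rxrpc_kernel_set_notifications() rather than being passed to
 * rxrpc_kernel_charge_accept() on each call.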
*/ @@ -161,7 +168,6 @@ static void rxperf_charge_preallocation(struct work_struct *work) if (rxrpc_kernel_charge_accept(rxperf_socket, rxperf_notify_rx, - rxperf_rx_attach, (unsigned long)call, GFP_KERNEL, call->debug_id) < 0) @@ -209,8 +215,7 @@ static int rxperf_open_socket(void) if (ret < 0) goto error_2; - rxrpc_kernel_new_call_notification(socket, rxperf_rx_new_call, - rxperf_rx_discard_new_call); + rxrpc_kernel_set_notifications(socket, &rxperf_rxrpc_callback_ops); ret = kernel_listen(socket, INT_MAX); if (ret < 0) @@ -478,6 +483,18 @@ static int rxperf_deliver_request(struct rxperf_call *call) call->unmarshal++; fallthrough; case 2: + ret = rxperf_extract_data(call, true); + if (ret < 0) + return ret; + + /* Deal with the terminal magic cookie. */ + call->iov_len = 4; + call->kvec[0].iov_len = call->iov_len; + call->kvec[0].iov_base = call->tmp; + iov_iter_kvec(&call->iter, READ, call->kvec, 1, call->iov_len); + call->unmarshal++; + fallthrough; + case 3: ret = rxperf_extract_data(call, false); if (ret < 0) return ret; @@ -503,7 +520,7 @@ static int rxperf_process_call(struct rxperf_call *call) reply_len + sizeof(rxperf_magic_cookie)); while (reply_len > 0) { - len = min_t(size_t, reply_len, PAGE_SIZE); + len = umin(reply_len, PAGE_SIZE); bvec_set_page(&bv, ZERO_PAGE(0), len, 0); iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, len); msg.msg_flags = MSG_MORE; @@ -534,9 +551,9 @@ static int rxperf_process_call(struct rxperf_call *call) } /* - * Add a key to the security keyring. + * Add an rxkad key to the security keyring. */ -static int rxperf_add_key(struct key *keyring) +static int rxperf_add_rxkad_key(struct key *keyring) { key_ref_t kref; int ret; @@ -562,6 +579,47 @@ static int rxperf_add_key(struct key *keyring) return ret; } +#ifdef CONFIG_RXGK +/* + * Add a yfs-rxgk key to the security keyring. + */ +static int rxperf_add_yfs_rxgk_key(struct key *keyring, u32 enctype) +{ + const struct krb5_enctype *krb5 = crypto_krb5_find_enctype(enctype); + key_ref_t kref; + char name[64]; + int ret; + u8 key[32]; + + if (!krb5 || krb5->key_len > sizeof(key)) + return 0; + + /* The key is just { 0, 1, 2, 3, 4, ... } */ + for (int i = 0; i < krb5->key_len; i++) + key[i] = i; + + sprintf(name, "%u:6:1:%u", RX_PERF_SERVICE, enctype); + + kref = key_create_or_update(make_key_ref(keyring, true), + "rxrpc_s", name, + key, krb5->key_len, + KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | + KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + + if (IS_ERR(kref)) { + pr_err("Can't allocate rxperf server key: %ld\n", PTR_ERR(kref)); + return PTR_ERR(kref); + } + + ret = key_link(keyring, key_ref_to_ptr(kref)); + if (ret < 0) + pr_err("Can't link rxperf server key: %d\n", ret); + key_ref_put(kref); + return ret; +} +#endif + /* * Initialise the rxperf server. 
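 * This populates the security keyring with one rxkad key and, when RxGK
 * is configured, one yfs-rxgk key per supported enctype, each being the
 * { 0, 1, 2, ... } pattern key described "<service>:6:1:<enctype>" as
 * above, before the socket is opened.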
*/ @@ -591,9 +649,29 @@ static int __init rxperf_init(void) goto error_keyring; } rxperf_sec_keyring = keyring; - ret = rxperf_add_key(keyring); + ret = rxperf_add_rxkad_key(keyring); + if (ret < 0) + goto error_key; +#ifdef CONFIG_RXGK + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES256_CTS_HMAC_SHA1_96); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES128_CTS_HMAC_SHA256_128); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_AES256_CTS_HMAC_SHA384_192); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_CAMELLIA128_CTS_CMAC); + if (ret < 0) + goto error_key; + ret = rxperf_add_yfs_rxgk_key(keyring, KRB5_ENCTYPE_CAMELLIA256_CTS_CMAC); if (ret < 0) goto error_key; +#endif ret = rxperf_open_socket(); if (ret < 0) diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c index cb8dd1d3b1d4..078d91a6b77f 100644 --- a/net/rxrpc/security.c +++ b/net/rxrpc/security.c @@ -20,6 +20,9 @@ static const struct rxrpc_security *rxrpc_security_types[] = { #ifdef CONFIG_RXKAD [RXRPC_SECURITY_RXKAD] = &rxkad, #endif +#ifdef CONFIG_RXGK + [RXRPC_SECURITY_YFS_RXGK] = &rxgk_yfs, +#endif }; int __init rxrpc_init_security(void) @@ -114,10 +117,10 @@ found: if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) { ret = conn->security->init_connection_security(conn, token); if (ret == 0) { - spin_lock(&conn->state_lock); + spin_lock_irq(&conn->state_lock); if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) conn->state = RXRPC_CONN_CLIENT; - spin_unlock(&conn->state_lock); + spin_unlock_irq(&conn->state_lock); } } mutex_unlock(&conn->security_lock); diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 6abb8eec1b2b..ebbb78b842de 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -94,9 +94,11 @@ no_wait: */ static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win) { + rxrpc_seq_t tx_bottom = READ_ONCE(call->tx_bottom); + if (_tx_win) - *_tx_win = call->tx_bottom; - return call->tx_prepared - call->tx_bottom < 256; + *_tx_win = tx_bottom; + return call->send_top - tx_bottom < 256; } /* @@ -132,13 +134,13 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx, rxrpc_seq_t tx_start, tx_win; signed long rtt, timeout; - rtt = READ_ONCE(call->peer->srtt_us) >> 3; + rtt = READ_ONCE(call->srtt_us) >> 3; rtt = usecs_to_jiffies(rtt) * 2; if (rtt < 2) rtt = 2; timeout = rtt; - tx_start = smp_load_acquire(&call->acks_hard_ack); + tx_start = READ_ONCE(call->tx_bottom); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -195,8 +197,8 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx, DECLARE_WAITQUEUE(myself, current); int ret; - _enter(",{%u,%u,%u,%u}", - call->tx_bottom, call->acks_hard_ack, call->tx_top, call->tx_winsize); + _enter(",{%u,%u,%u}", + call->tx_bottom, call->tx_top, call->tx_winsize); add_wait_queue(&call->waitq, &myself); @@ -240,37 +242,77 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call, struct rxrpc_txbuf *txb, rxrpc_notify_end_tx_t notify_end_tx) { + struct rxrpc_txqueue *sq = call->send_queue; rxrpc_seq_t seq = txb->seq; bool poke, last = txb->flags & RXRPC_LAST_PACKET; - + int ix = seq & RXRPC_TXQ_MASK; rxrpc_inc_stat(call->rxnet, stat_tx_data); - ASSERTCMP(txb->seq, ==, call->tx_prepared + 1); - - /* We have to set the timestamp before queueing as the retransmit - * algorithm can see 
the packet as soon as we queue it. - */ - txb->last_sent = ktime_get_real(); + ASSERTCMP(txb->seq, ==, call->send_top + 1); if (last) trace_rxrpc_txqueue(call, rxrpc_txqueue_queue_last); else trace_rxrpc_txqueue(call, rxrpc_txqueue_queue); + if (WARN_ON_ONCE(sq->bufs[ix])) + trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue_dup); + else + trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue); + /* Add the packet to the call's output buffer */ - spin_lock(&call->tx_lock); - poke = list_empty(&call->tx_sendmsg); - list_add_tail(&txb->call_link, &call->tx_sendmsg); - call->tx_prepared = seq; - if (last) + poke = (READ_ONCE(call->tx_bottom) == call->send_top); + sq->bufs[ix] = txb; + /* Order send_top after the queue->next pointer and txb content. */ + smp_store_release(&call->send_top, seq); + if (last) { + set_bit(RXRPC_CALL_TX_NO_MORE, &call->flags); rxrpc_notify_end_tx(rx, call, notify_end_tx); - spin_unlock(&call->tx_lock); + call->send_queue = NULL; + } if (poke) rxrpc_poke_call(call, rxrpc_call_poke_start); } /* + * Allocate a new txqueue unit and add it to the transmission queue. + */ +static int rxrpc_alloc_txqueue(struct sock *sk, struct rxrpc_call *call) +{ + struct rxrpc_txqueue *tq; + + tq = kzalloc(sizeof(*tq), sk->sk_allocation); + if (!tq) + return -ENOMEM; + + tq->xmit_ts_base = KTIME_MIN; + for (int i = 0; i < RXRPC_NR_TXQUEUE; i++) + tq->segment_xmit_ts[i] = UINT_MAX; + + if (call->send_queue) { + tq->qbase = call->send_top + 1; + call->send_queue->next = tq; + call->send_queue = tq; + } else if (WARN_ON(call->tx_queue)) { + kfree(tq); + return -ENOMEM; + } else { + /* We start at seq 1, so pretend seq 0 is hard-acked. */ + tq->nr_reported_acks = 1; + tq->segment_acked = 1UL; + tq->qbase = 0; + call->tx_qbase = 0; + call->send_queue = tq; + call->tx_qtail = tq; + call->tx_queue = tq; + } + + trace_rxrpc_tq(call, tq, call->send_top, rxrpc_tq_alloc); + return 0; +} + +/* * send data through a socket * - must be called in process context * - The caller holds the call user access mutex, but not the socket lock. @@ -288,6 +330,13 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, bool more = msg->msg_flags & MSG_MORE; int ret, copied = 0; + if (test_bit(RXRPC_CALL_TX_NO_MORE, &call->flags)) { + trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send, + call->cid, call->call_id, call->rx_consumed, + 0, -EPROTO); + return -EPROTO; + } + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); ret = rxrpc_wait_to_be_connected(call, &timeo); @@ -344,6 +393,13 @@ reload: if (!rxrpc_check_tx_space(call, NULL)) goto wait_for_space; + /* See if we need to begin/extend the Tx queue. */ + if (!call->send_queue || !((call->send_top + 1) & RXRPC_TXQ_MASK)) { + ret = rxrpc_alloc_txqueue(sk, call); + if (ret < 0) + goto maybe_error; + } + /* Work out the maximum size of a packet. Assume that * the security header is going to be in the padded * region (enc blocksize), but the trailer is not. 
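The hunks above replace the old single list of txbufs with a chain of fixed-size rxrpc_txqueue segments: a packet's slot is its sequence number masked with RXRPC_TXQ_MASK, and a new segment is allocated whenever the next sequence number would cross a segment boundary. As a rough sketch of the lookup this structure implies (tq_lookup() is a hypothetical helper for illustration only, assuming RXRPC_NR_TXQUEUE is a power of two, RXRPC_TXQ_MASK is RXRPC_NR_TXQUEUE - 1 and qbase values are segment-aligned, as the allocation code above arranges):

static struct rxrpc_txbuf *tq_lookup(struct rxrpc_txqueue *tq, rxrpc_seq_t seq)
{
	/* Walk the segment chain until we reach the one covering seq. */
	while (tq && seq - tq->qbase >= RXRPC_NR_TXQUEUE)
		tq = tq->next;

	/* Within a segment, the slot is just the masked sequence number. */
	return tq ? tq->bufs[seq & RXRPC_TXQ_MASK] : NULL;
}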
@@ -360,10 +416,10 @@ reload: /* append next segment of data to the current buffer */ if (msg_data_left(msg) > 0) { - size_t copy = min_t(size_t, txb->space, msg_data_left(msg)); + size_t copy = umin(txb->space, msg_data_left(msg)); _debug("add %zu", copy); - if (!copy_from_iter_full(txb->kvec[0].iov_base + txb->offset, + if (!copy_from_iter_full(txb->data + txb->offset, copy, &msg->msg_iter)) goto efault; _debug("added"); @@ -385,16 +441,10 @@ reload: (msg_data_left(msg) == 0 && !more)) { if (msg_data_left(msg) == 0 && !more) txb->flags |= RXRPC_LAST_PACKET; - else if (call->tx_top - call->acks_hard_ack < - call->tx_winsize) - txb->flags |= RXRPC_MORE_PACKETS; ret = call->security->secure_packet(call, txb); if (ret < 0) goto out; - - txb->kvec[0].iov_len += txb->len; - txb->len = txb->kvec[0].iov_len; rxrpc_queue_packet(rx, call, txb, notify_end_tx); txb = NULL; } @@ -557,7 +607,7 @@ static int rxrpc_sendmsg_cmsg(struct msghdr *msg, struct rxrpc_send_params *p) static struct rxrpc_call * rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, struct rxrpc_send_params *p) - __releases(&rx->sk.sk_lock.slock) + __releases(&rx->sk.sk_lock) __acquires(&call->user_mutex) { struct rxrpc_conn_parameters cp; @@ -607,7 +657,6 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, * - the socket may be either a client socket or a server socket */ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) - __releases(&rx->sk.sk_lock.slock) { struct rxrpc_call *call; bool dropped_lock = false; @@ -655,7 +704,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) } else { switch (rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_AWAIT_CONN: - case RXRPC_CALL_SERVER_SECURING: + case RXRPC_CALL_SERVER_RECV_REQUEST: if (p.command == RXRPC_CMD_SEND_ABORT) break; fallthrough; @@ -709,14 +758,21 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) if (rxrpc_call_is_complete(call)) { /* it's too late for this call */ ret = -ESHUTDOWN; - } else if (p.command == RXRPC_CMD_SEND_ABORT) { + goto out_put_unlock; + } + + switch (p.command) { + case RXRPC_CMD_SEND_ABORT: rxrpc_propose_abort(call, p.abort_code, -ECONNABORTED, rxrpc_abort_call_sendmsg); ret = 0; - } else if (p.command != RXRPC_CMD_SEND_DATA) { - ret = -EINVAL; - } else { + break; + case RXRPC_CMD_SEND_DATA: ret = rxrpc_send_data(rx, call, msg, len, NULL, &dropped_lock); + break; + default: + ret = -EINVAL; + break; } out_put_unlock: @@ -744,6 +800,8 @@ error_release_sock: * appropriate to sending data. No control data should be supplied in @msg, * nor should an address be supplied. MSG_MORE should be flagged if there's * more data to come, otherwise this data will end the transmission phase. + * + * Return: %0 if successful and a negative error code otherwise. */ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, struct msghdr *msg, size_t len, @@ -779,8 +837,9 @@ EXPORT_SYMBOL(rxrpc_kernel_send_data); * @error: Local error value * @why: Indication as to why. * - * Allow a kernel service to abort a call, if it's still in an abortable state - * and return true if the call was aborted, false if it was already complete. + * Allow a kernel service to abort a call if it's still in an abortable state. + * + * Return: %true if the call was aborted, %false if it was already complete. 
*/ bool rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, u32 abort_code, int error, enum rxrpc_abort_reason why) diff --git a/net/rxrpc/server_key.c b/net/rxrpc/server_key.c index e51940589ee5..36b05fd842a7 100644 --- a/net/rxrpc/server_key.c +++ b/net/rxrpc/server_key.c @@ -152,6 +152,8 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) * * Set the server security keyring on an rxrpc socket. This is used to provide * the encryption keys for a kernel service. + * + * Return: %0 if successful and a negative error code otherwise. */ int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring) { @@ -169,3 +171,43 @@ int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring) return ret; } EXPORT_SYMBOL(rxrpc_sock_set_security_keyring); + +/** + * rxrpc_sock_set_manage_response - Set the manage-response flag for a kernel service + * @sk: The socket to set the flag on + * @set: True to set, false to clear the flag + * + * Set the flag on an rxrpc socket to say that the caller wants to manage the + * RESPONSE packet and the user-defined data it may contain. Setting this + * means that recvmsg() will return messages with RXRPC_CHALLENGED in the + * control message buffer containing information about the challenge. + * + * The user should respond to the challenge by passing RXRPC_RESPOND or + * RXRPC_RESPOND_ABORT control messages with sendmsg() to the same call. + * Supplementary control messages, such as RXRPC_RESP_RXGK_APPDATA, may be + * included to indicate the parts the user wants to supply. + * + * The server will be passed the response data with a RXRPC_RESPONDED control + * message when it gets the first data from each call. + * + * Note that this is only honoured by security classes that need auxiliary data + * (e.g. RxGK). Those that don't offer the facility (e.g. RxKAD) respond + * without consulting userspace. + * + * Return: The previous setting. 
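+ *
+ * A kernel service that wants to handle the response itself might, for
+ * example, do (sketch only, not part of this interface's definition):
+ *
+ *	old = rxrpc_sock_set_manage_response(sock->sk, true);
+ *
+ * before it begins accepting calls; the flag is per-socket.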
+ */ +int rxrpc_sock_set_manage_response(struct sock *sk, bool set) +{ + struct rxrpc_sock *rx = rxrpc_sk(sk); + int ret; + + lock_sock(sk); + ret = !!test_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + if (set) + set_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + else + clear_bit(RXRPC_SOCK_MANAGE_RESPONSE, &rx->flags); + release_sock(sk); + return ret; +} +EXPORT_SYMBOL(rxrpc_sock_set_manage_response); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 9bf9a1f6e4cb..46a20cf4c402 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -11,6 +11,8 @@ #include "ar-internal.h" static struct ctl_table_header *rxrpc_sysctl_reg_table; +static const unsigned int rxrpc_rx_mtu_min = 500; +static const unsigned int rxrpc_jumbo_max = RXRPC_MAX_NR_JUMBO; static const unsigned int four = 4; static const unsigned int max_backlog = RXRPC_BACKLOG_MAX - 1; static const unsigned int n_65535 = 65535; @@ -115,7 +117,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = (void *)SYSCTL_ONE, + .extra1 = (void *)&rxrpc_rx_mtu_min, .extra2 = (void *)&n_65535, }, { @@ -125,7 +127,7 @@ static struct ctl_table rxrpc_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void *)SYSCTL_ONE, - .extra2 = (void *)&four, + .extra2 = (void *)&rxrpc_jumbo_max, }, }; diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index c3913d8a50d3..29767038691a 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -19,17 +19,19 @@ atomic_t rxrpc_nr_txbuf; struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, size_t data_align, gfp_t gfp) { - struct rxrpc_wire_header *whdr; struct rxrpc_txbuf *txb; - size_t total, hoff; + size_t total, doff, jsize = sizeof(struct rxrpc_jumbo_header); void *buf; - txb = kmalloc(sizeof(*txb), gfp); + txb = kzalloc(sizeof(*txb), gfp); if (!txb) return NULL; - hoff = round_up(sizeof(*whdr), data_align) - sizeof(*whdr); - total = hoff + sizeof(*whdr) + data_size; + /* We put a jumbo header in the buffer, but not a full wire header to + * avoid delayed-corruption problems with zerocopy. 
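+	 * The data pointer is thus offset into the allocation by the jumbo
+	 * header size rounded up to the alignment - e.g. assuming a 4-byte
+	 * jumbo header and 8-byte alignment, txb->data would start 8 bytes
+	 * into the buffer.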
+ */ + doff = round_up(jsize, data_align); + total = doff + data_size; data_align = umax(data_align, L1_CACHE_BYTES); mutex_lock(&call->conn->tx_data_alloc_lock); @@ -41,36 +43,15 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ return NULL; } - whdr = buf + hoff; - - INIT_LIST_HEAD(&txb->call_link); - INIT_LIST_HEAD(&txb->tx_link); refcount_set(&txb->ref, 1); - txb->last_sent = KTIME_MIN; txb->call_debug_id = call->debug_id; txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); + txb->alloc_size = data_size; txb->space = data_size; - txb->len = 0; - txb->offset = sizeof(*whdr); + txb->offset = 0; txb->flags = call->conn->out_clientflag; - txb->ack_why = 0; - txb->seq = call->tx_prepared + 1; - txb->serial = 0; - txb->cksum = 0; - txb->nr_kvec = 1; - txb->kvec[0].iov_base = whdr; - txb->kvec[0].iov_len = sizeof(*whdr); - - whdr->epoch = htonl(call->conn->proto.epoch); - whdr->cid = htonl(call->cid); - whdr->callNumber = htonl(call->call_id); - whdr->seq = htonl(txb->seq); - whdr->type = RXRPC_PACKET_TYPE_DATA; - whdr->flags = 0; - whdr->userStatus = 0; - whdr->securityIndex = call->security_ix; - whdr->_rsvd = 0; - whdr->serviceId = htons(call->dest_srx.srx_service); + txb->seq = call->send_top + 1; + txb->data = buf + doff; trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1, rxrpc_txbuf_alloc_data); @@ -79,92 +60,6 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ return txb; } -/* - * Allocate and partially initialise an ACK packet. - */ -struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_size) -{ - struct rxrpc_wire_header *whdr; - struct rxrpc_acktrailer *trailer; - struct rxrpc_ackpacket *ack; - struct rxrpc_txbuf *txb; - gfp_t gfp = rcu_read_lock_held() ? 
GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS; - void *buf, *buf2 = NULL; - u8 *filler; - - txb = kmalloc(sizeof(*txb), gfp); - if (!txb) - return NULL; - - buf = page_frag_alloc(&call->local->tx_alloc, - sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp); - if (!buf) { - kfree(txb); - return NULL; - } - - if (sack_size) { - buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp); - if (!buf2) { - page_frag_free(buf); - kfree(txb); - return NULL; - } - } - - whdr = buf; - ack = buf + sizeof(*whdr); - filler = buf + sizeof(*whdr) + sizeof(*ack) + 1; - trailer = buf + sizeof(*whdr) + sizeof(*ack) + 1 + 3; - - INIT_LIST_HEAD(&txb->call_link); - INIT_LIST_HEAD(&txb->tx_link); - refcount_set(&txb->ref, 1); - txb->call_debug_id = call->debug_id; - txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); - txb->space = 0; - txb->len = sizeof(*whdr) + sizeof(*ack) + 3 + sizeof(*trailer); - txb->offset = 0; - txb->flags = call->conn->out_clientflag; - txb->ack_rwind = 0; - txb->seq = 0; - txb->serial = 0; - txb->cksum = 0; - txb->nr_kvec = 3; - txb->kvec[0].iov_base = whdr; - txb->kvec[0].iov_len = sizeof(*whdr) + sizeof(*ack); - txb->kvec[1].iov_base = buf2; - txb->kvec[1].iov_len = sack_size; - txb->kvec[2].iov_base = filler; - txb->kvec[2].iov_len = 3 + sizeof(*trailer); - - whdr->epoch = htonl(call->conn->proto.epoch); - whdr->cid = htonl(call->cid); - whdr->callNumber = htonl(call->call_id); - whdr->seq = 0; - whdr->type = RXRPC_PACKET_TYPE_ACK; - whdr->flags = 0; - whdr->userStatus = 0; - whdr->securityIndex = call->security_ix; - whdr->_rsvd = 0; - whdr->serviceId = htons(call->dest_srx.srx_service); - - get_page(virt_to_head_page(trailer)); - - trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1, - rxrpc_txbuf_alloc_ack); - atomic_inc(&rxrpc_nr_txbuf); - return txb; -} - -void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) -{ - int r; - - __refcount_inc(&txb->ref, &r); - trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, r + 1, what); -} - void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) { int r = refcount_read(&txb->ref); @@ -174,13 +69,10 @@ void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) static void rxrpc_free_txbuf(struct rxrpc_txbuf *txb) { - int i; - trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0, rxrpc_txbuf_free); - for (i = 0; i < txb->nr_kvec; i++) - if (txb->kvec[i].iov_base) - page_frag_free(txb->kvec[i].iov_base); + if (txb->data) + page_frag_free(txb->data); kfree(txb); atomic_dec(&rxrpc_nr_txbuf); } @@ -202,37 +94,3 @@ void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) rxrpc_free_txbuf(txb); } } - -/* - * Shrink the transmit buffer. 
- */ -void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call) -{ - struct rxrpc_txbuf *txb; - rxrpc_seq_t hard_ack = smp_load_acquire(&call->acks_hard_ack); - bool wake = false; - - _enter("%x/%x/%x", call->tx_bottom, call->acks_hard_ack, call->tx_top); - - while ((txb = list_first_entry_or_null(&call->tx_buffer, - struct rxrpc_txbuf, call_link))) { - hard_ack = smp_load_acquire(&call->acks_hard_ack); - if (before(hard_ack, txb->seq)) - break; - - if (txb->seq != call->tx_bottom + 1) - rxrpc_see_txbuf(txb, rxrpc_txbuf_see_out_of_step); - ASSERTCMP(txb->seq, ==, call->tx_bottom + 1); - smp_store_release(&call->tx_bottom, call->tx_bottom + 1); - list_del_rcu(&txb->call_link); - - trace_rxrpc_txqueue(call, rxrpc_txqueue_dequeue); - - rxrpc_put_txbuf(txb, rxrpc_txbuf_put_rotated); - if (after(call->acks_hard_ack, call->tx_bottom + 128)) - wake = true; - } - - if (wake) - wake_up(&call->waitq); -} diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 8180d0c12fce..ad914d2b2e22 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -403,6 +403,18 @@ config NET_SCH_ETS If unsure, say N. +config NET_SCH_BPF + bool "BPF-based Qdisc" + depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF + help + This option allows BPF-based queueing disciplines. With BPF struct_ops, + users can implement supported operations in Qdisc_ops using BPF programs. + The queue holding skbs can be built with BPF maps or graphs. + + Say Y here if you want to use BPF-based Qdisc. + + If unsure, say N. + menuconfig NET_SCH_DEFAULT bool "Allow override default queue discipline" help @@ -784,7 +796,7 @@ config NET_ACT_SKBEDIT config NET_ACT_CSUM tristate "Checksum Updating" depends on NET_CLS_ACT && INET - select LIBCRC32C + select NET_CRC32C help Say Y here to update some common checksum after some direct packet alterations. diff --git a/net/sched/Makefile b/net/sched/Makefile index 82c3f78ca486..904d784902d1 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -62,6 +62,7 @@ obj-$(CONFIG_NET_SCH_FQ_PIE) += sch_fq_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o +obj-$(CONFIG_NET_SCH_BPF) += bpf_qdisc.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 839790043256..057e20cef375 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1461,17 +1461,29 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {}; - struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; + struct nlattr *tb[TCA_ACT_MAX_PRIO + 2]; struct tc_action *act; size_t sz = 0; int err; int i; - err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO, nla, NULL, + err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX_PRIO + 1, nla, NULL, extack); if (err < 0) return err; + /* The nested attributes are parsed as types, but they are really an + * array of actions. So we parse one more than we can handle, and return + * an error if the last one is set (as that indicates that the request + * contained more than the maximum number of actions). 
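+	 * (The same sentinel idiom works for any bounded netlink array:
+	 * size the table MAX + 2, parse MAX + 1 attribute types and treat
+	 * a non-NULL tb[MAX + 1] as an over-long request.)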
+ */ + if (tb[TCA_ACT_MAX_PRIO + 1]) { + NL_SET_ERR_MSG_FMT(extack, + "Only %d actions supported per filter", + TCA_ACT_MAX_PRIO); + return -EINVAL; + } + for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { struct tc_action_ops *a_o; diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 91c0ec729823..c1f75f272757 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -287,8 +287,7 @@ static void gate_setup_timer(struct tcf_gate *gact, u64 basetime, gact->param.tcfg_basetime = basetime; gact->param.tcfg_clockid = clockid; gact->tk_offset = tko; - hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); - gact->hitimer.function = gate_timer_func; + hrtimer_setup(&gact->hitimer, gate_timer_func, clockid, HRTIMER_MODE_ABS_SOFT); } static int tcf_gate_init(struct net *net, struct nlattr *nla, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5b3814365924..5f01f567c934 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -30,7 +30,29 @@ static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); #define MIRRED_NEST_LIMIT 4 -static DEFINE_PER_CPU(unsigned int, mirred_nest_level); + +#ifndef CONFIG_PREEMPT_RT +static u8 tcf_mirred_nest_level_inc_return(void) +{ + return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest); +} + +static void tcf_mirred_nest_level_dec(void) +{ + __this_cpu_dec(softnet_data.xmit.sched_mirred_nest); +} + +#else +static u8 tcf_mirred_nest_level_inc_return(void) +{ + return current->net_xmit.sched_mirred_nest++; +} + +static void tcf_mirred_nest_level_dec(void) +{ + current->net_xmit.sched_mirred_nest--; +} +#endif static bool tcf_mirred_is_act_redirect(int action) { @@ -423,7 +445,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, int m_eaction; u32 blockid; - nest_level = __this_cpu_inc_return(mirred_nest_level); + nest_level = tcf_mirred_nest_level_inc_return(); if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); @@ -454,7 +476,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, retval); dec_nest_level: - __this_cpu_dec(mirred_nest_level); + tcf_mirred_nest_level_dec(); return retval; } diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index af7c99845948..2cef4b08befb 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -68,7 +68,7 @@ geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = { [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 }, [TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY, - .len = 128 }, + .len = 127 }, }; static const struct nla_policy @@ -571,8 +571,8 @@ static void tunnel_key_release(struct tc_action *a) static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { + const u8 *src = ip_tunnel_info_opts(info); int len = info->options_len; - u8 *src = (u8 *)(info + 1); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE); @@ -580,7 +580,7 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, return -EMSGSIZE; while (len > 0) { - struct geneve_opt *opt = (struct geneve_opt *)src; + const struct geneve_opt *opt = (const struct geneve_opt *)src; if (nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS, opt->opt_class) || @@ -603,7 +603,7 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb, 
const struct ip_tunnel_info *info) { - struct vxlan_metadata *md = (struct vxlan_metadata *)(info + 1); + const struct vxlan_metadata *md = ip_tunnel_info_opts(info); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_VXLAN); @@ -622,7 +622,7 @@ static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb, static int tunnel_key_erspan_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { - struct erspan_metadata *md = (struct erspan_metadata *)(info + 1); + const struct erspan_metadata *md = ip_tunnel_info_opts(info); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN); diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c new file mode 100644 index 000000000000..7ea8b54b2ab1 --- /dev/null +++ b/net/sched/bpf_qdisc.c @@ -0,0 +1,475 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/types.h> +#include <linux/bpf_verifier.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/filter.h> +#include <net/pkt_sched.h> +#include <net/pkt_cls.h> + +#define QDISC_OP_IDX(op) (offsetof(struct Qdisc_ops, op) / sizeof(void (*)(void))) +#define QDISC_MOFF_IDX(moff) (moff / sizeof(void (*)(void))) + +static struct bpf_struct_ops bpf_Qdisc_ops; + +struct bpf_sched_data { + struct qdisc_watchdog watchdog; +}; + +struct bpf_sk_buff_ptr { + struct sk_buff *skb; +}; + +static int bpf_qdisc_init(struct btf *btf) +{ + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_qdisc_ids, struct, Qdisc) +BTF_ID_LIST_SINGLE(bpf_sk_buff_ids, struct, sk_buff) +BTF_ID_LIST_SINGLE(bpf_sk_buff_ptr_ids, struct, bpf_sk_buff_ptr) + +static bool bpf_qdisc_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + struct btf *btf = prog->aux->attach_btf; + u32 arg; + + arg = btf_ctx_arg_idx(btf, prog->aux->attach_func_proto, off); + if (prog->aux->attach_st_ops_member_off == offsetof(struct Qdisc_ops, enqueue)) { + if (arg == 2 && type == BPF_READ) { + info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED; + info->btf = btf; + info->btf_id = bpf_sk_buff_ptr_ids[0]; + return true; + } + } + + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +static int bpf_qdisc_qdisc_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, size_t *end) +{ + switch (off) { + case offsetof(struct Qdisc, limit): + *end = offsetofend(struct Qdisc, limit); + break; + case offsetof(struct Qdisc, q) + offsetof(struct qdisc_skb_head, qlen): + *end = offsetof(struct Qdisc, q) + offsetofend(struct qdisc_skb_head, qlen); + break; + case offsetof(struct Qdisc, qstats) ... offsetofend(struct Qdisc, qstats) - 1: + *end = offsetofend(struct Qdisc, qstats); + break; + default: + return -EACCES; + } + + return 0; +} + +static int bpf_qdisc_sk_buff_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, size_t *end) +{ + switch (off) { + case offsetof(struct sk_buff, tstamp): + *end = offsetofend(struct sk_buff, tstamp); + break; + case offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, data[0]) ... 
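The bpf_qdisc access checkers above whitelist writable byte ranges by comparing an (offset, size) pair against offsetof()/offsetofend() of the permitted members. The same idea as a standalone program, on a made-up struct and using the GCC case-range extension just as the kernel code does (a sketch, not the kernel API):

#include <stddef.h>
#include <stdio.h>

#define offsetofend(T, m) (offsetof(T, m) + sizeof(((T *)0)->m))

struct widget {
    int refcnt;    /* not writable by untrusted code */
    int limit;     /* writable */
    char priv[16]; /* writable */
};

/* Return 0 if a write of @size bytes at @off stays inside an allowed member. */
static int widget_write_ok(size_t off, size_t size)
{
    size_t end;

    switch (off) {
    case offsetof(struct widget, limit):
        end = offsetofend(struct widget, limit);
        break;
    case offsetof(struct widget, priv) ... offsetofend(struct widget, priv) - 1:
        end = offsetofend(struct widget, priv);
        break;
    default:
        return -1;
    }
    return off + size <= end ? 0 : -1;
}

int main(void)
{
    printf("%d\n", widget_write_ok(offsetof(struct widget, limit), 4));     /*  0 */
    printf("%d\n", widget_write_ok(offsetof(struct widget, refcnt), 4));    /* -1 */
    printf("%d\n", widget_write_ok(offsetof(struct widget, priv) + 12, 8)); /* -1 */
    return 0;
}

The third probe starts inside priv but spills past its end, which is the "off + size > end" case the verifier hook rejects.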
+ offsetof(struct sk_buff, cb) + offsetof(struct qdisc_skb_cb, + data[QDISC_CB_PRIV_LEN - 1]): + *end = offsetof(struct sk_buff, cb) + + offsetofend(struct qdisc_skb_cb, data[QDISC_CB_PRIV_LEN - 1]); + break; + default: + return -EACCES; + } + + return 0; +} + +static int bpf_qdisc_btf_struct_access(struct bpf_verifier_log *log, + const struct bpf_reg_state *reg, + int off, int size) +{ + const struct btf_type *t, *skbt, *qdisct; + size_t end; + int err; + + skbt = btf_type_by_id(reg->btf, bpf_sk_buff_ids[0]); + qdisct = btf_type_by_id(reg->btf, bpf_qdisc_ids[0]); + t = btf_type_by_id(reg->btf, reg->btf_id); + + if (t == skbt) { + err = bpf_qdisc_sk_buff_access(log, reg, off, &end); + } else if (t == qdisct) { + err = bpf_qdisc_qdisc_access(log, reg, off, &end); + } else { + bpf_log(log, "only read is supported\n"); + return -EACCES; + } + + if (err) { + bpf_log(log, "no write support to %s at off %d\n", + btf_name_by_offset(reg->btf, t->name_off), off); + return -EACCES; + } + + if (off + size > end) { + bpf_log(log, + "write access at off %d with size %d beyond the member of %s ended at %zu\n", + off, size, btf_name_by_offset(reg->btf, t->name_off), end); + return -EACCES; + } + + return 0; +} + +BTF_ID_LIST(bpf_qdisc_init_prologue_ids) +BTF_ID(func, bpf_qdisc_init_prologue) + +static int bpf_qdisc_gen_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, init)) + return 0; + + /* r6 = r1; // r6 will be "u64 *ctx". r1 is "u64 *ctx". + * r2 = r1[16]; // r2 will be "struct netlink_ext_ack *extack" + * r1 = r1[0]; // r1 will be "struct Qdisc *sch" + * r0 = bpf_qdisc_init_prologue(r1, r2); + * if r0 == 0 goto pc+1; + * BPF_EXIT; + * r1 = r6; // r1 will be "u64 *ctx". + */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 16); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_init_prologue_ids[0]); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1); + *insn++ = BPF_EXIT_INSN(); + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + +BTF_ID_LIST(bpf_qdisc_reset_destroy_epilogue_ids) +BTF_ID(func, bpf_qdisc_reset_destroy_epilogue) + +static int bpf_qdisc_gen_epilogue(struct bpf_insn *insn_buf, const struct bpf_prog *prog, + s16 ctx_stack_off) +{ + struct bpf_insn *insn = insn_buf; + + if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, reset) && + prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, destroy)) + return 0; + + /* r1 = stack[ctx_stack_off]; // r1 will be "u64 *ctx" + * r1 = r1[0]; // r1 will be "struct Qdisc *sch" + * r0 = bpf_qdisc_reset_destroy_epilogue(r1); + * BPF_EXIT; + */ + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_FP, ctx_stack_off); + *insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, 0); + *insn++ = BPF_CALL_KFUNC(0, bpf_qdisc_reset_destroy_epilogue_ids[0]); + *insn++ = BPF_EXIT_INSN(); + + return insn - insn_buf; +} + +__bpf_kfunc_start_defs(); + +/* bpf_skb_get_hash - Get the flow hash of an skb. + * @skb: The skb to get the flow hash from. + */ +__bpf_kfunc u32 bpf_skb_get_hash(struct sk_buff *skb) +{ + return skb_get_hash(skb); +} + +/* bpf_kfree_skb - Release an skb's reference and drop it immediately. + * @skb: The skb whose reference is to be released and dropped. 
+ */ +__bpf_kfunc void bpf_kfree_skb(struct sk_buff *skb) +{ + kfree_skb(skb); +} + +/* bpf_qdisc_skb_drop - Drop an skb by adding it to a deferred free list. + * @skb: The skb whose reference is to be released and dropped. + * @to_free_list: The list of skbs to be dropped. + */ +__bpf_kfunc void bpf_qdisc_skb_drop(struct sk_buff *skb, + struct bpf_sk_buff_ptr *to_free_list) +{ + __qdisc_drop(skb, (struct sk_buff **)to_free_list); +} + +/* bpf_qdisc_watchdog_schedule - Schedule a qdisc to a later time using a timer. + * @sch: The qdisc to be scheduled. + * @expire: The expiry time of the timer. + * @delta_ns: The slack range of the timer. + */ +__bpf_kfunc void bpf_qdisc_watchdog_schedule(struct Qdisc *sch, u64 expire, u64 delta_ns) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_schedule_range_ns(&q->watchdog, expire, delta_ns); +} + +/* bpf_qdisc_init_prologue - Hidden kfunc called in prologue of .init. */ +__bpf_kfunc int bpf_qdisc_init_prologue(struct Qdisc *sch, + struct netlink_ext_ack *extack) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct Qdisc *p; + + qdisc_watchdog_init(&q->watchdog, sch); + + if (sch->parent != TC_H_ROOT) { + /* If qdisc_lookup() returns NULL, it means .init is called by + * qdisc_create_dflt() in mq/mqprio_init and the parent qdisc + * has not been added to qdisc_hash yet. + */ + p = qdisc_lookup(dev, TC_H_MAJ(sch->parent)); + if (p && !(p->flags & TCQ_F_MQROOT)) { + NL_SET_ERR_MSG(extack, "BPF qdisc only supported on root or mq"); + return -EINVAL; + } + } + + return 0; +} + +/* bpf_qdisc_reset_destroy_epilogue - Hidden kfunc called in epilogue of .reset + * and .destroy + */ +__bpf_kfunc void bpf_qdisc_reset_destroy_epilogue(struct Qdisc *sch) +{ + struct bpf_sched_data *q = qdisc_priv(sch); + + qdisc_watchdog_cancel(&q->watchdog); +} + +/* bpf_qdisc_bstats_update - Update Qdisc basic statistics + * @sch: The qdisc from which an skb is dequeued. + * @skb: The skb to be dequeued. 
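bpf_qdisc_watchdog_schedule() above arms a wakeup at an absolute time with an allowed slack window. As a loose userspace analogy (not the kernel API), Linux timerfd can schedule an absolute CLOCK_MONOTONIC expiry; timerfd has no slack parameter, so this sketch omits it:

#include <stdint.h>
#include <stdio.h>
#include <sys/timerfd.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
    struct itimerspec its = { 0 };
    struct timespec now;
    uint64_t expirations;
    int tfd;

    tfd = timerfd_create(CLOCK_MONOTONIC, 0);
    if (tfd < 0)
        return 1;

    /* Arm an absolute expiry 100 ms from now, like scheduling a watchdog. */
    clock_gettime(CLOCK_MONOTONIC, &now);
    its.it_value = now;
    its.it_value.tv_nsec += 100 * 1000 * 1000;
    if (its.it_value.tv_nsec >= 1000000000) {
        its.it_value.tv_sec++;
        its.it_value.tv_nsec -= 1000000000;
    }
    timerfd_settime(tfd, TFD_TIMER_ABSTIME, &its, NULL);

    /* Block until the deadline passes (the "watchdog fires"). */
    if (read(tfd, &expirations, sizeof(expirations)) != sizeof(expirations))
        return 1;
    puts("watchdog fired");
    close(tfd);
    return 0;
}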
+ */ +__bpf_kfunc void bpf_qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb) +{ + bstats_update(&sch->bstats, skb); +} + +__bpf_kfunc_end_defs(); + +BTF_KFUNCS_START(qdisc_kfunc_ids) +BTF_ID_FLAGS(func, bpf_skb_get_hash, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_kfree_skb, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_qdisc_skb_drop, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_watchdog_schedule, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_init_prologue, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_reset_destroy_epilogue, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_qdisc_bstats_update, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(qdisc_kfunc_ids) + +BTF_SET_START(qdisc_common_kfunc_set) +BTF_ID(func, bpf_skb_get_hash) +BTF_ID(func, bpf_kfree_skb) +BTF_ID(func, bpf_dynptr_from_skb) +BTF_SET_END(qdisc_common_kfunc_set) + +BTF_SET_START(qdisc_enqueue_kfunc_set) +BTF_ID(func, bpf_qdisc_skb_drop) +BTF_ID(func, bpf_qdisc_watchdog_schedule) +BTF_SET_END(qdisc_enqueue_kfunc_set) + +BTF_SET_START(qdisc_dequeue_kfunc_set) +BTF_ID(func, bpf_qdisc_watchdog_schedule) +BTF_ID(func, bpf_qdisc_bstats_update) +BTF_SET_END(qdisc_dequeue_kfunc_set) + +enum qdisc_ops_kf_flags { + QDISC_OPS_KF_COMMON = 0, + QDISC_OPS_KF_ENQUEUE = 1 << 0, + QDISC_OPS_KF_DEQUEUE = 1 << 1, +}; + +static const u32 qdisc_ops_context_flags[] = { + [QDISC_OP_IDX(enqueue)] = QDISC_OPS_KF_ENQUEUE, + [QDISC_OP_IDX(dequeue)] = QDISC_OPS_KF_DEQUEUE, + [QDISC_OP_IDX(init)] = QDISC_OPS_KF_COMMON, + [QDISC_OP_IDX(reset)] = QDISC_OPS_KF_COMMON, + [QDISC_OP_IDX(destroy)] = QDISC_OPS_KF_COMMON, +}; + +static int bpf_qdisc_kfunc_filter(const struct bpf_prog *prog, u32 kfunc_id) +{ + u32 moff, flags; + + if (!btf_id_set8_contains(&qdisc_kfunc_ids, kfunc_id)) + return 0; + + if (prog->aux->st_ops != &bpf_Qdisc_ops) + return -EACCES; + + moff = prog->aux->attach_st_ops_member_off; + flags = qdisc_ops_context_flags[QDISC_MOFF_IDX(moff)]; + + if ((flags & QDISC_OPS_KF_ENQUEUE) && + btf_id_set_contains(&qdisc_enqueue_kfunc_set, kfunc_id)) + return 0; + + if ((flags & QDISC_OPS_KF_DEQUEUE) && + btf_id_set_contains(&qdisc_dequeue_kfunc_set, kfunc_id)) + return 0; + + if (btf_id_set_contains(&qdisc_common_kfunc_set, kfunc_id)) + return 0; + + return -EACCES; +} + +static const struct btf_kfunc_id_set bpf_qdisc_kfunc_set = { + .owner = THIS_MODULE, + .set = &qdisc_kfunc_ids, + .filter = bpf_qdisc_kfunc_filter, +}; + +static const struct bpf_verifier_ops bpf_qdisc_verifier_ops = { + .get_func_proto = bpf_base_func_proto, + .is_valid_access = bpf_qdisc_is_valid_access, + .btf_struct_access = bpf_qdisc_btf_struct_access, + .gen_prologue = bpf_qdisc_gen_prologue, + .gen_epilogue = bpf_qdisc_gen_epilogue, +}; + +static int bpf_qdisc_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct Qdisc_ops *uqdisc_ops; + struct Qdisc_ops *qdisc_ops; + u32 moff; + + uqdisc_ops = (const struct Qdisc_ops *)udata; + qdisc_ops = (struct Qdisc_ops *)kdata; + + moff = __btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct Qdisc_ops, priv_size): + if (uqdisc_ops->priv_size) + return -EINVAL; + qdisc_ops->priv_size = sizeof(struct bpf_sched_data); + return 1; + case offsetof(struct Qdisc_ops, peek): + qdisc_ops->peek = qdisc_peek_dequeued; + return 0; + case offsetof(struct Qdisc_ops, id): + if (bpf_obj_name_cpy(qdisc_ops->id, uqdisc_ops->id, + sizeof(qdisc_ops->id)) <= 0) + return -EINVAL; + return 1; + } + + return 0; +} + +static 
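bpf_qdisc_kfunc_filter() above grants each attach point its own kfunc allow-list: a per-op flag word (qdisc_ops_context_flags[]) plus membership in the enqueue, dequeue, or common BTF sets. A table-driven sketch of that policy check in plain C (the names are illustrative, not kernel symbols):

#include <stdio.h>
#include <string.h>

enum op { OP_ENQUEUE, OP_DEQUEUE, OP_INIT, OP_MAX };

#define KF_COMMON  (1u << 0)
#define KF_ENQUEUE (1u << 1)
#define KF_DEQUEUE (1u << 2)

static const struct { const char *name; unsigned int where; } kfuncs[] = {
    { "skb_drop",          KF_ENQUEUE },
    { "watchdog_schedule", KF_ENQUEUE | KF_DEQUEUE },
    { "bstats_update",     KF_DEQUEUE },
    { "skb_get_hash",      KF_COMMON },
};

/* Context flags per attach point, like qdisc_ops_context_flags[]. */
static const unsigned int op_flags[OP_MAX] = {
    [OP_ENQUEUE] = KF_ENQUEUE | KF_COMMON,
    [OP_DEQUEUE] = KF_DEQUEUE | KF_COMMON,
    [OP_INIT]    = KF_COMMON,
};

static int kfunc_allowed(enum op attach, const char *kf)
{
    for (unsigned int i = 0; i < sizeof(kfuncs) / sizeof(kfuncs[0]); i++)
        if (!strcmp(kfuncs[i].name, kf))
            return (kfuncs[i].where & op_flags[attach]) != 0;
    return 0;
}

int main(void)
{
    printf("%d\n", kfunc_allowed(OP_ENQUEUE, "skb_drop"));  /* 1 */
    printf("%d\n", kfunc_allowed(OP_DEQUEUE, "skb_drop"));  /* 0 */
    printf("%d\n", kfunc_allowed(OP_INIT, "skb_get_hash")); /* 1 */
    return 0;
}

The design point is that the drop helper only makes sense where a to_free list exists (enqueue), while stats updates only make sense on dequeue; the filter enforces that at load time rather than at run time.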
int bpf_qdisc_reg(void *kdata, struct bpf_link *link) +{ + return register_qdisc(kdata); +} + +static void bpf_qdisc_unreg(void *kdata, struct bpf_link *link) +{ + return unregister_qdisc(kdata); +} + +static int bpf_qdisc_validate(void *kdata) +{ + struct Qdisc_ops *ops = (struct Qdisc_ops *)kdata; + + if (!ops->enqueue || !ops->dequeue || !ops->init || + !ops->reset || !ops->destroy) + return -EINVAL; + + return 0; +} + +static int Qdisc_ops__enqueue(struct sk_buff *skb__ref, struct Qdisc *sch, + struct sk_buff **to_free) +{ + return 0; +} + +static struct sk_buff *Qdisc_ops__dequeue(struct Qdisc *sch) +{ + return NULL; +} + +static int Qdisc_ops__init(struct Qdisc *sch, struct nlattr *arg, + struct netlink_ext_ack *extack) +{ + return 0; +} + +static void Qdisc_ops__reset(struct Qdisc *sch) +{ +} + +static void Qdisc_ops__destroy(struct Qdisc *sch) +{ +} + +static struct Qdisc_ops __bpf_ops_qdisc_ops = { + .enqueue = Qdisc_ops__enqueue, + .dequeue = Qdisc_ops__dequeue, + .init = Qdisc_ops__init, + .reset = Qdisc_ops__reset, + .destroy = Qdisc_ops__destroy, +}; + +static struct bpf_struct_ops bpf_Qdisc_ops = { + .verifier_ops = &bpf_qdisc_verifier_ops, + .reg = bpf_qdisc_reg, + .unreg = bpf_qdisc_unreg, + .validate = bpf_qdisc_validate, + .init_member = bpf_qdisc_init_member, + .init = bpf_qdisc_init, + .name = "Qdisc_ops", + .cfi_stubs = &__bpf_ops_qdisc_ops, + .owner = THIS_MODULE, +}; + +BTF_ID_LIST(bpf_sk_buff_dtor_ids) +BTF_ID(func, bpf_kfree_skb) + +static int __init bpf_qdisc_kfunc_init(void) +{ + int ret; + const struct btf_id_dtor_kfunc skb_kfunc_dtors[] = { + { + .btf_id = bpf_sk_buff_ids[0], + .kfunc_btf_id = bpf_sk_buff_dtor_ids[0] + }, + }; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_qdisc_kfunc_set); + ret = ret ?: register_btf_id_dtor_kfuncs(skb_kfunc_dtors, + ARRAY_SIZE(skb_kfunc_dtors), + THIS_MODULE); + ret = ret ?: register_bpf_struct_ops(&bpf_Qdisc_ops, Qdisc_ops); + + return ret; +} +late_initcall(bpf_qdisc_kfunc_init); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 7578e27260c9..ecec0a1e1c1a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -97,7 +97,7 @@ tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp, err = xa_alloc_cyclic(&tcf_exts_miss_cookies_xa, &n->miss_cookie_base, n, xa_limit_32b, &next, GFP_KERNEL); - if (err) + if (err < 0) goto err_xa_alloc; exts->miss_cookie_node = n; @@ -390,6 +390,7 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, tp->protocol = protocol; tp->prio = prio; tp->chain = chain; + tp->usesw = !tp->ops->reoffload; spin_lock_init(&tp->lock); refcount_set(&tp->refcnt, 1); @@ -410,39 +411,31 @@ static void tcf_proto_get(struct tcf_proto *tp) refcount_inc(&tp->refcnt); } -static void tcf_maintain_bypass(struct tcf_block *block) +static void tcf_proto_count_usesw(struct tcf_proto *tp, bool add) { - int filtercnt = atomic_read(&block->filtercnt); - int skipswcnt = atomic_read(&block->skipswcnt); - bool bypass_wanted = filtercnt > 0 && filtercnt == skipswcnt; - - if (bypass_wanted != block->bypass_wanted) { #ifdef CONFIG_NET_CLS_ACT - if (bypass_wanted) - static_branch_inc(&tcf_bypass_check_needed_key); - else - static_branch_dec(&tcf_bypass_check_needed_key); -#endif - block->bypass_wanted = bypass_wanted; + struct tcf_block *block = tp->chain->block; + bool counted = false; + + if (!add) { + if (tp->usesw && tp->counted) { + if (!atomic_dec_return(&block->useswcnt)) + static_branch_dec(&tcf_sw_enabled_key); + tp->counted = false; + } + 
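bpf_qdisc_init_member() and bpf_qdisc_validate() above force framework-owned members (priv_size, peek) and reject an ops table that leaves mandatory callbacks unset. The shape of that contract, reduced to a userspace vtable (a sketch; the field and function names are invented):

#include <stdio.h>

struct sched_ops {
    int (*enqueue)(int pkt);
    int (*dequeue)(void);
    int (*peek)(void);      /* always forced to a stock implementation */
    unsigned int priv_size; /* always forced by the framework */
};

static int stock_peek(void) { return -1; }

/* Copy user-supplied ops, forcing framework-owned members, like init_member(). */
static int sched_ops_register(struct sched_ops *dst, const struct sched_ops *user)
{
    if (!user->enqueue || !user->dequeue) /* validate(): mandatory ops */
        return -1;
    if (user->priv_size)                  /* user may not pick priv_size */
        return -1;

    *dst = *user;
    dst->peek = stock_peek;
    dst->priv_size = 64;
    return 0;
}

static int my_enqueue(int pkt) { (void)pkt; return 0; }
static int my_dequeue(void)    { return 0; }

int main(void)
{
    struct sched_ops user = { .enqueue = my_enqueue, .dequeue = my_dequeue };
    struct sched_ops reg;

    printf("register: %d, priv_size: %u\n",
           sched_ops_register(&reg, &user), reg.priv_size);
    return 0;
}

The stub table (__bpf_ops_qdisc_ops) plays a similar role for CFI: every slot has a well-typed default even before a BPF implementation is attached.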
return; } -} - -static void tcf_block_filter_cnt_update(struct tcf_block *block, bool *counted, bool add) -{ - lockdep_assert_not_held(&block->cb_lock); - down_write(&block->cb_lock); - if (*counted != add) { - if (add) { - atomic_inc(&block->filtercnt); - *counted = true; - } else { - atomic_dec(&block->filtercnt); - *counted = false; - } + spin_lock(&tp->lock); + if (tp->usesw && !tp->counted) { + counted = true; + tp->counted = true; } - tcf_maintain_bypass(block); - up_write(&block->cb_lock); + spin_unlock(&tp->lock); + + if (counted && atomic_inc_return(&block->useswcnt) == 1) + static_branch_inc(&tcf_sw_enabled_key); +#endif } static void tcf_chain_put(struct tcf_chain *chain); @@ -451,7 +444,7 @@ static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held, bool sig_destroy, struct netlink_ext_ack *extack) { tp->ops->destroy(tp, rtnl_held, extack); - tcf_block_filter_cnt_update(tp->chain->block, &tp->counted, false); + tcf_proto_count_usesw(tp, false); if (sig_destroy) tcf_proto_signal_destroyed(tp->chain, tp); tcf_chain_put(tp->chain); @@ -2064,6 +2057,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); + int ret = -EMSGSIZE; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) @@ -2108,11 +2102,45 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, return skb->len; +cls_op_not_supp: + ret = -EOPNOTSUPP; out_nlmsg_trim: nla_put_failure: -cls_op_not_supp: nlmsg_trim(skb, b); - return -1; + return ret; +} + +static struct sk_buff *tfilter_notify_prep(struct net *net, + struct sk_buff *oskb, + struct nlmsghdr *n, + struct tcf_proto *tp, + struct tcf_block *block, + struct Qdisc *q, u32 parent, + void *fh, int event, + u32 portid, bool rtnl_held, + struct netlink_ext_ack *extack) +{ + unsigned int size = oskb ? 
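tcf_proto_count_usesw() above flips the tcf_sw_enabled_key static branch only on the 0-to-1 and 1-to-0 transitions of a per-block counter. Static branches have no userspace twin, so this C11 sketch gates a plain atomic flag with the same transition logic:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int useswcnt;
static atomic_bool sw_enabled; /* stands in for tcf_sw_enabled_key */

static void usesw_get(void)
{
    if (atomic_fetch_add(&useswcnt, 1) == 0)
        atomic_store(&sw_enabled, true);  /* 0 -> 1: enable the slow path */
}

static void usesw_put(void)
{
    if (atomic_fetch_sub(&useswcnt, 1) == 1)
        atomic_store(&sw_enabled, false); /* 1 -> 0: disable the slow path */
}

int main(void)
{
    usesw_get();
    usesw_get();
    usesw_put();
    printf("enabled=%d cnt=%d\n", atomic_load(&sw_enabled),
           atomic_load(&useswcnt)); /* enabled=1 cnt=1 */
    usesw_put();
    printf("enabled=%d cnt=%d\n", atomic_load(&sw_enabled),
           atomic_load(&useswcnt)); /* enabled=0 cnt=0 */
    return 0;
}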
max(NLMSG_GOODSIZE, oskb->len) : NLMSG_GOODSIZE; + struct sk_buff *skb; + int ret; + +retry: + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOBUFS); + + ret = tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, + n->nlmsg_seq, n->nlmsg_flags, event, false, + rtnl_held, extack); + if (ret <= 0) { + kfree_skb(skb); + if (ret == -EMSGSIZE) { + size += NLMSG_GOODSIZE; + goto retry; + } + return ERR_PTR(-EINVAL); + } + return skb; } static int tfilter_notify(struct net *net, struct sk_buff *oskb, @@ -2128,16 +2156,10 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (!unicast && !rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) return 0; - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) - return -ENOBUFS; - - if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, - n->nlmsg_seq, n->nlmsg_flags, event, - false, rtnl_held, extack) <= 0) { - kfree_skb(skb); - return -EINVAL; - } + skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh, event, + portid, rtnl_held, extack); + if (IS_ERR(skb)) + return PTR_ERR(skb); if (unicast) err = rtnl_unicast(skb, net, portid); @@ -2160,16 +2182,11 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) return tp->ops->delete(tp, fh, last, rtnl_held, extack); - skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); - if (!skb) - return -ENOBUFS; - - if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, - n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER, - false, rtnl_held, extack) <= 0) { + skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh, + RTM_DELTFILTER, portid, rtnl_held, extack); + if (IS_ERR(skb)) { NL_SET_ERR_MSG(extack, "Failed to build del event notification"); - kfree_skb(skb); - return -EINVAL; + return PTR_ERR(skb); } err = tp->ops->delete(tp, fh, last, rtnl_held, extack); @@ -2409,7 +2426,7 @@ replay: tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, false, rtnl_held, extack); tfilter_put(tp, fh); - tcf_block_filter_cnt_update(block, &tp->counted, true); + tcf_proto_count_usesw(tp, true); /* q pointer is NULL for shared blocks */ if (q) q->flags &= ~TCQ_F_CAN_BYPASS; @@ -3532,8 +3549,6 @@ static void tcf_block_offload_inc(struct tcf_block *block, u32 *flags) if (*flags & TCA_CLS_FLAGS_IN_HW) return; *flags |= TCA_CLS_FLAGS_IN_HW; - if (tc_skip_sw(*flags)) - atomic_inc(&block->skipswcnt); atomic_inc(&block->offloadcnt); } @@ -3542,8 +3557,6 @@ static void tcf_block_offload_dec(struct tcf_block *block, u32 *flags) if (!(*flags & TCA_CLS_FLAGS_IN_HW)) return; *flags &= ~TCA_CLS_FLAGS_IN_HW; - if (tc_skip_sw(*flags)) - atomic_dec(&block->skipswcnt); atomic_dec(&block->offloadcnt); } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 1941ebec23ff..7fbe42f0e5c2 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -509,6 +509,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(prog->gen_flags)) prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, prog->gen_flags); + if (oldprog) { idr_replace(&head->handle_idr, prog, handle); list_replace_rcu(&oldprog->link, &prog->link); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1008ec8a464c..099ff6a3e1f5 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -766,7 +766,7 @@ geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = { [TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE] = 
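tfilter_notify_prep() above retries the fill with a larger skb whenever tcf_fill_node() reports -EMSGSIZE, growing by NLMSG_GOODSIZE each round. The same grow-and-retry loop with snprintf (a sketch; the fixed-increment growth deliberately mirrors the kernel's, even though snprintf could report the exact size needed):

#include <stdio.h>
#include <stdlib.h>

#define GOODSIZE 16 /* stands in for NLMSG_GOODSIZE */

/* Format a message, growing the buffer by GOODSIZE whenever it is too small. */
static char *notify_prep(const char *payload)
{
    size_t size = GOODSIZE;

    for (;;) {
        char *buf = malloc(size);
        int n;

        if (!buf)
            return NULL;
        n = snprintf(buf, size, "event: %s", payload);
        if (n >= 0 && (size_t)n < size)
            return buf;    /* fits */
        free(buf);         /* too small: grow and retry */
        size += GOODSIZE;
    }
}

int main(void)
{
    char *msg = notify_prep("a fairly long notification payload");

    puts(msg ? msg : "alloc failure");
    free(msg);
    return 0;
}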
{ .type = NLA_U8 }, [TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY, - .len = 128 }, + .len = 127 }, }; static const struct nla_policy @@ -2503,6 +2503,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(fnew->flags)) fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, fnew->flags); + spin_lock(&tp->lock); /* tp was deleted concurrently. -EAGAIN will cause caller to lookup diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 9f1e62ca508d..f03bf5da39ee 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -228,6 +228,8 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(new->flags)) new->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, new->flags); + *arg = head; rcu_assign_pointer(tp->root, new); return 0; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index d3a03c57545b..2a1c00048fd6 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -951,6 +951,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(new->flags)) new->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, new->flags); + u32_replace_knode(tp, tp_c, new); tcf_unbind_filter(tp, &n->res); tcf_exts_get_net(&n->exts); @@ -1164,6 +1166,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(n->flags)) n->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, n->flags); + ins = &ht->ht[TC_U32_HASH(handle)]; for (pins = rtnl_dereference(*ins); pins; ins = &pins->next, pins = rtnl_dereference(*ins)) diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 8996c73c9779..3f2e707a11d1 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -460,7 +460,7 @@ META_COLLECTOR(int_sk_fwd_alloc) *err = -1; return; } - dst->value = sk_forward_alloc_get(sk); + dst->value = READ_ONCE(sk->sk_forward_alloc); } META_COLLECTOR(int_sk_sndbuf) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 300430b8c4d2..c5e3673aadbe 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -25,7 +25,9 @@ #include <linux/hrtimer.h> #include <linux/slab.h> #include <linux/hashtable.h> +#include <linux/bpf.h> +#include <net/netdev_lock.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> @@ -206,7 +208,7 @@ static struct Qdisc_ops *qdisc_lookup_default(const char *name) for (q = qdisc_base; q; q = q->next) { if (!strcmp(name, q->id)) { - if (!try_module_get(q->owner)) + if (!bpf_try_module_get(q, q->owner)) q = NULL; break; } @@ -236,7 +238,7 @@ int qdisc_set_default(const char *name) if (ops) { /* Set new default */ - module_put(default_qdisc_ops->owner); + bpf_module_put(default_qdisc_ops, default_qdisc_ops->owner); default_qdisc_ops = ops; } write_unlock(&qdisc_mod_lock); @@ -358,7 +360,7 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) read_lock(&qdisc_mod_lock); for (q = qdisc_base; q; q = q->next) { if (nla_strcmp(kind, q->id) == 0) { - if (!try_module_get(q->owner)) + if (!bpf_try_module_get(q, q->owner)) q = NULL; break; } @@ -619,8 +621,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, clockid_t clockid) { - hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED); - wd->timer.function = qdisc_watchdog; + hrtimer_setup(&wd->timer, qdisc_watchdog, clockid, HRTIMER_MODE_ABS_PINNED); wd->qdisc = qdisc; } EXPORT_SYMBOL(qdisc_watchdog_init_clockid); @@ -1267,36 +1268,8 @@ 
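The em_meta change above reads sk->sk_forward_alloc with READ_ONCE() because the reader holds no socket lock while other CPUs update the field. The closest portable analogue is a relaxed atomic load, which rules out compiler tearing or refetching (a sketch; build with -pthread):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static _Atomic long forward_alloc;

static void *writer(void *arg)
{
    for (long i = 1; i <= 100000; i++)
        atomic_store_explicit(&forward_alloc, i, memory_order_relaxed);
    return arg;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, writer, NULL);
    /* Lockless read of a concurrently written field, like READ_ONCE(). */
    long v = atomic_load_explicit(&forward_alloc, memory_order_relaxed);
    printf("snapshot: %ld\n", v);
    pthread_join(t, NULL);
    return 0;
}

The snapshot may be any value the writer has stored so far, which is the point: the read races but is well defined, exactly what READ_ONCE() buys the meta collector.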
static struct Qdisc *qdisc_create(struct net_device *dev, struct qdisc_size_table *stab; ops = qdisc_lookup_ops(kind); -#ifdef CONFIG_MODULES - if (ops == NULL && kind != NULL) { - char name[IFNAMSIZ]; - if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) { - /* We dropped the RTNL semaphore in order to - * perform the module load. So, even if we - * succeeded in loading the module we have to - * tell the caller to replay the request. We - * indicate this using -EAGAIN. - * We replay the request because the device may - * go away in the mean time. - */ - rtnl_unlock(); - request_module(NET_SCH_ALIAS_PREFIX "%s", name); - rtnl_lock(); - ops = qdisc_lookup_ops(kind); - if (ops != NULL) { - /* We will try again qdisc_lookup_ops, - * so don't keep a reference. - */ - module_put(ops->owner); - err = -EAGAIN; - goto err_out; - } - } - } -#endif - - err = -ENOENT; if (!ops) { + err = -ENOENT; NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown"); goto err_out; } @@ -1398,7 +1371,7 @@ err_out3: netdev_put(dev, &sch->dev_tracker); qdisc_free(sch); err_out2: - module_put(ops->owner); + bpf_module_put(ops, ops->owner); err_out: *errp = err; return NULL; @@ -1505,27 +1478,18 @@ const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { * Delete/get qdisc. */ -static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, - struct netlink_ext_ack *extack) +static int __tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack, + struct net_device *dev, + struct nlattr *tca[TCA_MAX + 1], + struct tcmsg *tcm) { struct net *net = sock_net(skb->sk); - struct tcmsg *tcm = nlmsg_data(n); - struct nlattr *tca[TCA_MAX + 1]; - struct net_device *dev; - u32 clid; struct Qdisc *q = NULL; struct Qdisc *p = NULL; + u32 clid; int err; - err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, - rtm_tca_policy, extack); - if (err < 0) - return err; - - dev = __dev_get_by_index(net, tcm->tcm_ifindex); - if (!dev) - return -ENODEV; - clid = tcm->tcm_parent; if (clid) { if (clid != TC_H_ROOT) { @@ -1560,7 +1524,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, } if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { - NL_SET_ERR_MSG(extack, "Invalid qdisc name"); + NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc"); return -EINVAL; } @@ -1582,6 +1546,31 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, return 0; } +static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct tcmsg *tcm = nlmsg_data(n); + struct nlattr *tca[TCA_MAX + 1]; + struct net_device *dev; + int err; + + err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, + rtm_tca_policy, extack); + if (err < 0) + return err; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return -ENODEV; + + netdev_lock_ops(dev); + err = __tc_get_qdisc(skb, n, extack, dev, tca, tcm); + netdev_unlock_ops(dev); + + return err; +} + static bool req_create_or_replace(struct nlmsghdr *n) { return (n->nlmsg_flags & NLM_F_CREATE && @@ -1601,35 +1590,18 @@ static bool req_change(struct nlmsghdr *n) !(n->nlmsg_flags & NLM_F_EXCL)); } -/* - * Create/change qdisc. 
- */ -static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, - struct netlink_ext_ack *extack) +static int __tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack, + struct net_device *dev, + struct nlattr *tca[TCA_MAX + 1], + struct tcmsg *tcm) { - struct net *net = sock_net(skb->sk); - struct tcmsg *tcm; - struct nlattr *tca[TCA_MAX + 1]; - struct net_device *dev; + struct Qdisc *q = NULL; + struct Qdisc *p = NULL; u32 clid; - struct Qdisc *q, *p; int err; -replay: - /* Reinit, just in case something touches this. */ - err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, - rtm_tca_policy, extack); - if (err < 0) - return err; - - tcm = nlmsg_data(n); clid = tcm->tcm_parent; - q = p = NULL; - - dev = __dev_get_by_index(net, tcm->tcm_ifindex); - if (!dev) - return -ENODEV; - if (clid) { if (clid != TC_H_ROOT) { @@ -1664,13 +1636,17 @@ replay: q = qdisc_lookup(dev, tcm->tcm_handle); if (!q) goto create_n_graft; + if (q->parent != tcm->tcm_parent) { + NL_SET_ERR_MSG(extack, "Cannot move an existing qdisc to a different parent"); + return -EINVAL; + } if (n->nlmsg_flags & NLM_F_EXCL) { NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override"); return -EEXIST; } if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { - NL_SET_ERR_MSG(extack, "Invalid qdisc name"); + NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc"); return -EINVAL; } if (q->flags & TCQ_F_INGRESS) { @@ -1746,12 +1722,12 @@ replay: return -EEXIST; } if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { - NL_SET_ERR_MSG(extack, "Invalid qdisc name"); + NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc"); return -EINVAL; } err = qdisc_change(q, tca, extack); if (err == 0) - qdisc_notify(net, skb, n, clid, NULL, q, extack); + qdisc_notify(sock_net(skb->sk), skb, n, clid, NULL, q, extack); return err; create_n_graft: @@ -1783,11 +1759,8 @@ create_n_graft2: tcm->tcm_parent, tcm->tcm_handle, tca, &err, extack); } - if (q == NULL) { - if (err == -EAGAIN) - goto replay; + if (!q) return err; - } graft: err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack); @@ -1800,6 +1773,58 @@ graft: return 0; } +static void request_qdisc_module(struct nlattr *kind) +{ + struct Qdisc_ops *ops; + char name[IFNAMSIZ]; + + if (!kind) + return; + + ops = qdisc_lookup_ops(kind); + if (ops) { + bpf_module_put(ops, ops->owner); + return; + } + + if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) { + rtnl_unlock(); + request_module(NET_SCH_ALIAS_PREFIX "%s", name); + rtnl_lock(); + } +} + +/* + * Create/change qdisc. 
+ */ +static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tca[TCA_MAX + 1]; + struct net_device *dev; + struct tcmsg *tcm; + int err; + + err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, + rtm_tca_policy, extack); + if (err < 0) + return err; + + request_qdisc_module(tca[TCA_KIND]); + + tcm = nlmsg_data(n); + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return -ENODEV; + + netdev_lock_ops(dev); + err = __tc_modify_qdisc(skb, n, extack, dev, tca, tcm); + netdev_unlock_ops(dev); + + return err; +} + static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, struct netlink_callback *cb, int *q_idx_p, int s_q_idx, bool recur, @@ -1884,17 +1909,23 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) s_q_idx = 0; q_idx = 0; + netdev_lock_ops(dev); if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc), skb, cb, &q_idx, s_q_idx, - true, tca[TCA_DUMP_INVISIBLE]) < 0) + true, tca[TCA_DUMP_INVISIBLE]) < 0) { + netdev_unlock_ops(dev); goto done; + } dev_queue = dev_ingress_queue(dev); if (dev_queue && tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping), skb, cb, &q_idx, s_q_idx, false, - tca[TCA_DUMP_INVISIBLE]) < 0) + tca[TCA_DUMP_INVISIBLE]) < 0) { + netdev_unlock_ops(dev); goto done; + } + netdev_unlock_ops(dev); cont: idx++; @@ -2131,15 +2162,15 @@ static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, #endif -static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, - struct netlink_ext_ack *extack) +static int __tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack, + struct net_device *dev, + struct nlattr *tca[TCA_MAX + 1], + struct tcmsg *tcm) { struct net *net = sock_net(skb->sk); - struct tcmsg *tcm = nlmsg_data(n); - struct nlattr *tca[TCA_MAX + 1]; - struct net_device *dev; - struct Qdisc *q = NULL; const struct Qdisc_class_ops *cops; + struct Qdisc *q = NULL; unsigned long cl = 0; unsigned long new_cl; u32 portid; @@ -2147,15 +2178,6 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, u32 qid; int err; - err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, - rtm_tca_policy, extack); - if (err < 0) - return err; - - dev = __dev_get_by_index(net, tcm->tcm_ifindex); - if (!dev) - return -ENODEV; - /* parent == TC_H_UNSPEC - unspecified parent. parent == TC_H_ROOT - class is root, which has no parent. 
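The tc_get_qdisc()/tc_modify_qdisc()/tc_ctl_tclass() wrappers above parse the netlink message first and only then take netdev_lock_ops() around the double-underscore worker. The same parse-unlocked/operate-locked split with a per-device mutex (names illustrative):

#include <pthread.h>
#include <stdio.h>

struct device {
    pthread_mutex_t ops_lock;
    int qdisc_handle;
};

static int __modify_qdisc(struct device *dev, int handle)
{
    /* Caller holds dev->ops_lock, like __tc_modify_qdisc(). */
    dev->qdisc_handle = handle;
    return 0;
}

static int modify_qdisc(struct device *dev, const char *req)
{
    int handle, err;

    if (sscanf(req, "handle=%d", &handle) != 1) /* parse before locking */
        return -1;

    pthread_mutex_lock(&dev->ops_lock);         /* netdev_lock_ops() */
    err = __modify_qdisc(dev, handle);
    pthread_mutex_unlock(&dev->ops_lock);       /* netdev_unlock_ops() */
    return err;
}

int main(void)
{
    struct device dev = { PTHREAD_MUTEX_INITIALIZER, 0 };

    modify_qdisc(&dev, "handle=8001");
    printf("qdisc handle: %d\n", dev.qdisc_handle);
    return 0;
}

Keeping the parse outside the lock shortens the critical section and keeps error paths from needing an unlock, the same motivation as the refactor above.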
@@ -2250,6 +2272,12 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, return -EOPNOTSUPP; } + /* Prevent creation of traffic classes with classid TC_H_ROOT */ + if (clid == TC_H_ROOT) { + NL_SET_ERR_MSG(extack, "Cannot create traffic class with classid TC_H_ROOT"); + return -EINVAL; + } + new_cl = cl; err = -EOPNOTSUPP; if (cops->change) @@ -2264,6 +2292,31 @@ out: return err; } +static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct tcmsg *tcm = nlmsg_data(n); + struct nlattr *tca[TCA_MAX + 1]; + struct net_device *dev; + int err; + + err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, + rtm_tca_policy, extack); + if (err < 0) + return err; + + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return -ENODEV; + + netdev_lock_ops(dev); + err = __tc_ctl_tclass(skb, n, extack, dev, tca, tcm); + netdev_unlock_ops(dev); + + return err; +} + struct qdisc_dump_args { struct qdisc_walker w; struct sk_buff *skb; @@ -2340,20 +2393,12 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, return 0; } -static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) +static int __tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb, + struct tcmsg *tcm, struct net_device *dev) { - struct tcmsg *tcm = nlmsg_data(cb->nlh); - struct net *net = sock_net(skb->sk); struct netdev_queue *dev_queue; - struct net_device *dev; int t, s_t; - if (nlmsg_len(cb->nlh) < sizeof(*tcm)) - return 0; - dev = dev_get_by_index(net, tcm->tcm_ifindex); - if (!dev) - return 0; - s_t = cb->args[0]; t = 0; @@ -2370,10 +2415,32 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) done: cb->args[0] = t; - dev_put(dev); return skb->len; } +static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct tcmsg *tcm = nlmsg_data(cb->nlh); + struct net *net = sock_net(skb->sk); + struct net_device *dev; + int err; + + if (nlmsg_len(cb->nlh) < sizeof(*tcm)) + return 0; + + dev = dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) + return 0; + + netdev_lock_ops(dev); + err = __tc_dump_tclass(skb, cb, tcm, dev); + netdev_unlock_ops(dev); + + dev_put(dev); + + return err; +} + #ifdef CONFIG_PROC_FS static int psched_show(struct seq_file *seq, void *v) { diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 2c2e2a67f3b2..48dd8c88903f 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -484,13 +484,14 @@ static bool cobalt_queue_empty(struct cobalt_vars *vars, /* Call this with a freshly dequeued packet for possible congestion marking. * Returns true as an instruction to drop the packet, false for delivery. 
*/ -static bool cobalt_should_drop(struct cobalt_vars *vars, - struct cobalt_params *p, - ktime_t now, - struct sk_buff *skb, - u32 bulk_flows) -{ - bool next_due, over_target, drop = false; +static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars, + struct cobalt_params *p, + ktime_t now, + struct sk_buff *skb, + u32 bulk_flows) +{ + enum skb_drop_reason reason = SKB_NOT_DROPPED_YET; + bool next_due, over_target; ktime_t schedule; u64 sojourn; @@ -533,7 +534,8 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, if (next_due && vars->dropping) { /* Use ECN mark if possible, otherwise drop */ - drop = !(vars->ecn_marked = INET_ECN_set_ce(skb)); + if (!(vars->ecn_marked = INET_ECN_set_ce(skb))) + reason = SKB_DROP_REASON_QDISC_CONGESTED; vars->count++; if (!vars->count) @@ -556,16 +558,17 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, } /* Simple BLUE implementation. Lack of ECN is deliberate. */ - if (vars->p_drop) - drop |= (get_random_u32() < vars->p_drop); + if (vars->p_drop && reason == SKB_NOT_DROPPED_YET && + get_random_u32() < vars->p_drop) + reason = SKB_DROP_REASON_CAKE_FLOOD; /* Overload the drop_next field as an activity timeout */ if (!vars->count) vars->drop_next = ktime_add_ns(now, p->interval); - else if (ktime_to_ns(schedule) > 0 && !drop) + else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET) vars->drop_next = now; - return drop; + return reason; } static bool cake_update_flowkeys(struct flow_keys *keys, @@ -1585,12 +1588,11 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) flow->dropped++; b->tin_dropped++; - sch->qstats.drops++; if (q->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, skb, now, true); - __qdisc_drop(skb, to_free); + qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); sch->q.qlen--; qdisc_tree_reduce_backlog(sch, 1, len); @@ -1965,13 +1967,14 @@ static void cake_clear_tin(struct Qdisc *sch, u16 tin) q->cur_tin = tin; for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++) while (!!(skb = cake_dequeue_one(sch))) - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE); } static struct sk_buff *cake_dequeue(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[q->cur_tin]; + enum skb_drop_reason reason; ktime_t now = ktime_get(); struct cake_flow *flow; struct list_head *head; @@ -2153,12 +2156,12 @@ retry: goto begin; } + reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, + (b->bulk_flow_count * + !!(q->rate_flags & + CAKE_FLAG_INGRESS))); /* Last packet in queue may be marked, shouldn't be dropped */ - if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, - (b->bulk_flow_count * - !!(q->rate_flags & - CAKE_FLAG_INGRESS))) || - !flow->head) + if (reason == SKB_NOT_DROPPED_YET || !flow->head) break; /* drop this packet, get another one */ @@ -2172,7 +2175,7 @@ retry: b->tin_dropped++; qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); - kfree_skb(skb); + kfree_skb_reason(skb, reason); if (q->rate_flags & CAKE_FLAG_INGRESS) goto retry; } diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 3e8d4fe4d91e..c93761040c6e 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -52,7 +52,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } @@ -65,10 +65,7 @@ static struct sk_buff 
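cobalt_should_drop() above is converted from returning bool to returning enum skb_drop_reason, with SKB_NOT_DROPPED_YET as the keep-the-packet sentinel, so callers can tell AQM congestion drops from BLUE flood-protection drops. The refactor pattern in miniature (a sketch with invented names, not the CAKE algorithm itself):

#include <stdio.h>

enum drop_reason {
    NOT_DROPPED_YET, /* sentinel: deliver the packet */
    DROP_CONGESTED,  /* AQM decided to drop (CoDel-style) */
    DROP_FLOOD,      /* BLUE-style flood protection */
};

struct aqm_state { int over_target; int p_drop_pct; };

/* Old shape: bool should_drop(...). New shape: report *why* it dropped. */
static enum drop_reason should_drop(const struct aqm_state *s, int rnd_pct)
{
    enum drop_reason reason = NOT_DROPPED_YET;

    if (s->over_target)
        reason = DROP_CONGESTED;
    if (s->p_drop_pct && reason == NOT_DROPPED_YET && rnd_pct < s->p_drop_pct)
        reason = DROP_FLOOD;
    return reason;
}

int main(void)
{
    struct aqm_state s = { .over_target = 0, .p_drop_pct = 50 };

    printf("%d\n", should_drop(&s, 10)); /* 2: flood drop */
    printf("%d\n", should_drop(&s, 90)); /* 0: deliver */
    s.over_target = 1;
    printf("%d\n", should_drop(&s, 90)); /* 1: congestion */
    return 0;
}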
*codel_qdisc_dequeue(struct Qdisc *sch) &q->stats, qdisc_pkt_len, codel_get_enqueue_time, drop_func, dequeue_func); - /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, - * or HTB crashes. Defer it for next round. - */ - if (q->stats.drop_count && sch->q.qlen) { + if (q->stats.drop_count) { qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len); q->stats.drop_count = 0; q->stats.drop_len = 0; @@ -89,7 +86,8 @@ static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, } q = qdisc_priv(sch); q->drop_overlimit++; - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, + SKB_DROP_REASON_QDISC_OVERLIMIT); } static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { @@ -146,7 +144,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt, qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); + struct sk_buff *skb = qdisc_dequeue_internal(sch, true); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index c69b999fae17..9b6d79bd8737 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -35,6 +35,11 @@ struct drr_sched { struct Qdisc_class_hash clhash; }; +static bool cl_is_active(struct drr_class *cl) +{ + return !list_empty(&cl->alist); +} + static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid) { struct drr_sched *q = qdisc_priv(sch); @@ -105,6 +110,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return -ENOBUFS; gnet_stats_basic_sync_init(&cl->bstats); + INIT_LIST_HEAD(&cl->alist); cl->common.classid = classid; cl->quantum = quantum; cl->qdisc = qdisc_create_dflt(sch->dev_queue, @@ -229,7 +235,7 @@ static void drr_qlen_notify(struct Qdisc *csh, unsigned long arg) { struct drr_class *cl = (struct drr_class *)arg; - list_del(&cl->alist); + list_del_init(&cl->alist); } static int drr_dump_class(struct Qdisc *sch, unsigned long arg, @@ -336,7 +342,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct drr_sched *q = qdisc_priv(sch); struct drr_class *cl; int err = 0; - bool first; cl = drr_classify(skb, sch, &err); if (cl == NULL) { @@ -346,7 +351,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { @@ -356,7 +360,7 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - if (first) { + if (!cl_is_active(cl)) { list_add_tail(&cl->alist, &q->active); cl->deficit = cl->quantum; } @@ -390,7 +394,7 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch) if (unlikely(skb == NULL)) goto out; if (cl->qdisc->q.qlen == 0) - list_del(&cl->alist); + list_del_init(&cl->alist); bstats_update(&cl->bstats, skb); qdisc_bstats_update(sch, skb); @@ -431,7 +435,7 @@ static void drr_reset_qdisc(struct Qdisc *sch) for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { if (cl->qdisc->q.qlen) - list_del(&cl->alist); + list_del_init(&cl->alist); qdisc_reset(cl->qdisc); } } diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index f80bc05d4c5a..2c069f0181c6 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -74,6 +74,11 @@ static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = { [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 }, }; +static bool cl_is_active(struct 
ets_class *cl) +{ + return !list_empty(&cl->alist); +} + static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr, unsigned int *quantum, struct netlink_ext_ack *extack) @@ -91,6 +96,8 @@ ets_class_from_arg(struct Qdisc *sch, unsigned long arg) { struct ets_sched *q = qdisc_priv(sch); + if (arg == 0 || arg > q->nbands) + return NULL; return &q->classes[arg - 1]; } @@ -291,7 +298,7 @@ static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg) * to remove them. */ if (!ets_class_is_strict(q, cl) && sch->q.qlen) - list_del(&cl->alist); + list_del_init(&cl->alist); } static int ets_class_dump(struct Qdisc *sch, unsigned long arg, @@ -414,7 +421,6 @@ static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct ets_sched *q = qdisc_priv(sch); struct ets_class *cl; int err = 0; - bool first; cl = ets_classify(skb, sch, &err); if (!cl) { @@ -424,7 +430,6 @@ static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { if (net_xmit_drop_count(err)) { @@ -434,7 +439,7 @@ static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - if (first && !ets_class_is_strict(q, cl)) { + if (!cl_is_active(cl) && !ets_class_is_strict(q, cl)) { list_add_tail(&cl->alist, &q->active); cl->deficit = cl->quantum; } @@ -486,7 +491,7 @@ static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch) if (unlikely(!skb)) goto out; if (cl->qdisc->q.qlen == 0) - list_del(&cl->alist); + list_del_init(&cl->alist); return ets_qdisc_dequeue_skb(sch, skb); } @@ -655,7 +660,7 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, } for (i = q->nbands; i < oldbands; i++) { if (i >= q->nstrict && q->classes[i].qdisc->q.qlen) - list_del(&q->classes[i].alist); + list_del_init(&q->classes[i].alist); qdisc_tree_flush_backlog(q->classes[i].qdisc); } WRITE_ONCE(q->nstrict, nstrict); @@ -711,7 +716,7 @@ static void ets_qdisc_reset(struct Qdisc *sch) for (band = q->nstrict; band < q->nbands; band++) { if (q->classes[band].qdisc->q.qlen) - list_del(&q->classes[band].alist); + list_del_init(&q->classes[band].alist); } for (band = 0; band < q->nbands; band++) qdisc_reset(q->classes[band].qdisc); diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index b50b2c2cc09b..e6bfd39ff339 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -40,6 +40,9 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, { unsigned int prev_backlog; + if (unlikely(READ_ONCE(sch->limit) == 0)) + return qdisc_drop(skb, sch, to_free); + if (likely(sch->q.qlen < READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index a5e87f9ea986..902ff5470607 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -537,6 +537,8 @@ static bool fq_packet_beyond_horizon(const struct sk_buff *skb, return unlikely((s64)skb->tstamp > (s64)(now + q->horizon)); } +#define FQDR(reason) SKB_DROP_REASON_FQ_##reason + static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -548,7 +550,8 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX); if (unlikely(q->band_pkt_count[band] >= sch->limit)) { q->stat_band_drops[band]++; - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, + FQDR(BAND_LIMIT)); } now = ktime_get_ns(); @@ -558,8 +561,9 @@ 
static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* Check if packet timestamp is too far in the future. */ if (fq_packet_beyond_horizon(skb, q, now)) { if (q->horizon_drop) { - q->stat_horizon_drops++; - return qdisc_drop(skb, sch, to_free); + q->stat_horizon_drops++; + return qdisc_drop_reason(skb, sch, to_free, + FQDR(HORIZON_LIMIT)); } q->stat_horizon_caps++; skb->tstamp = now + q->horizon; @@ -572,7 +576,8 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (f != &q->internal) { if (unlikely(f->qlen >= q->flow_plimit)) { q->stat_flows_plimit++; - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, + FQDR(FLOW_LIMIT)); } if (fq_flow_is_detached(f)) { @@ -597,6 +602,7 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, return NET_XMIT_SUCCESS; } +#undef FQDR static void fq_check_throttled(struct fq_sched_data *q, u64 now) { @@ -1130,7 +1136,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); } while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = fq_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); if (!skb) break; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 4f908c11ba95..2a0f3a513bfa 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -168,6 +168,7 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets, skb = dequeue_head(flow); len += qdisc_pkt_len(skb); mem += get_codel_cb(skb)->mem_usage; + tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_OVERLIMIT); __qdisc_drop(skb, to_free); } while (++i < max_packets && len < threshold); @@ -274,7 +275,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } @@ -314,10 +315,8 @@ begin: } qdisc_bstats_update(sch, skb); flow->deficit -= qdisc_pkt_len(skb); - /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0, - * or HTB crashes. Defer it for next round. 
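Several *_change() handlers in this series now drain excess packets through qdisc_dequeue_internal() when a new, lower limit arrives, so per-flow accounting stays consistent with the normal dequeue path. The drain-on-shrink pattern on a toy ring queue (illustrative only):

#include <stdio.h>

#define QCAP 8

struct queue { int buf[QCAP]; int head, len, limit; };

static int q_dequeue(struct queue *q)
{
    int v = q->buf[q->head];

    q->head = (q->head + 1) % QCAP;
    q->len--;
    return v;
}

/* Lowering the limit drains excess entries through the normal dequeue
 * path, like the while (sch->q.qlen > sch->limit) loops above. */
static void q_set_limit(struct queue *q, int limit)
{
    q->limit = limit;
    while (q->len > q->limit)
        printf("dropped %d\n", q_dequeue(q));
}

int main(void)
{
    struct queue q = { .limit = QCAP };

    for (int i = 0; i < 5; i++)
        q.buf[q.len++] = i;
    q_set_limit(&q, 2); /* drops 0, 1, 2 */
    printf("remaining: %d\n", q.len);
    return 0;
}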
- */ - if (q->cstats.drop_count && sch->q.qlen) { + + if (q->cstats.drop_count) { qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); q->cstats.drop_count = 0; @@ -442,7 +441,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, while (sch->q.qlen > sch->limit || q->memory_usage > q->memory_limit) { - struct sk_buff *skb = fq_codel_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); q->cstats.drop_len += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index c38f33ff80bd..df7fac95ab15 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -130,6 +130,7 @@ static inline void flow_queue_add(struct fq_pie_flow *flow, static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; struct fq_pie_sched_data *q = qdisc_priv(sch); struct fq_pie_flow *sel_flow; int ret; @@ -161,6 +162,8 @@ static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->overmemory++; } + reason = SKB_DROP_REASON_QDISC_CONGESTED; + if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars, sel_flow->backlog, skb->len)) { enqueue = true; @@ -198,8 +201,7 @@ static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, out: q->stats.dropped++; sel_flow->vars.accu_prob = 0; - __qdisc_drop(skb, to_free); - qdisc_qstats_drop(sch); + qdisc_drop_reason(skb, sch, to_free, reason); return NET_XMIT_CN; } @@ -364,7 +366,7 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, /* Drop excess packets if new limit is lower */ while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = fq_pie_qdisc_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); len_dropped += qdisc_pkt_len(skb); num_dropped += 1; @@ -553,7 +555,7 @@ static void fq_pie_destroy(struct Qdisc *sch) tcf_block_put(q->block); q->p_params.tupdate = 0; - del_timer_sync(&q->adapt_timer); + timer_delete_sync(&q->adapt_timer); kvfree(q->flows); } diff --git a/net/sched/sch_frag.c b/net/sched/sch_frag.c index ce63414185fd..d1d87dce7f3f 100644 --- a/net/sched/sch_frag.c +++ b/net/sched/sch_frag.c @@ -16,14 +16,18 @@ struct sch_frag_data { unsigned int l2_len; u8 l2_data[VLAN_ETH_HLEN]; int (*xmit)(struct sk_buff *skb); + local_lock_t bh_lock; }; -static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage); +static DEFINE_PER_CPU(struct sch_frag_data, sch_frag_data_storage) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; static int sch_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) { struct sch_frag_data *data = this_cpu_ptr(&sch_frag_data_storage); + lockdep_assert_held(&data->bh_lock); if (skb_cow_head(skb, data->l2_len) < 0) { kfree_skb(skb); return -ENOMEM; @@ -95,6 +99,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb, struct rtable sch_frag_rt = { 0 }; unsigned long orig_dst; + local_lock_nested_bh(&sch_frag_data_storage.bh_lock); sch_frag_prepare_frag(skb, xmit); dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, DST_OBSOLETE_NONE, DST_NOCOUNT); @@ -105,11 +110,13 @@ static int sch_fragment(struct net *net, struct sk_buff *skb, IPCB(skb)->frag_max_size = mru; ret = ip_do_fragment(net, skb->sk, skb, sch_frag_xmit); + local_unlock_nested_bh(&sch_frag_data_storage.bh_lock); refdst_drop(orig_dst); } else if (skb_protocol(skb, true) == htons(ETH_P_IPV6)) { unsigned long orig_dst; struct rt6_info sch_frag_rt; + 
local_lock_nested_bh(&sch_frag_data_storage.bh_lock); sch_frag_prepare_frag(skb, xmit); memset(&sch_frag_rt, 0, sizeof(sch_frag_rt)); dst_init(&sch_frag_rt.dst, &sch_frag_dst_ops, NULL, @@ -122,6 +129,7 @@ static int sch_fragment(struct net *net, struct sk_buff *skb, ret = ipv6_stub->ipv6_fragment(net, skb->sk, skb, sch_frag_xmit); + local_unlock_nested_bh(&sch_frag_data_storage.bh_lock); refdst_drop(orig_dst); } else { net_warn_ratelimited("Fail frag %s: eth=%x, MRU=%d, MTU=%d\n", diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 38ec18f73de4..08e0e3aff976 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -24,6 +24,7 @@ #include <linux/if_vlan.h> #include <linux/skb_array.h> #include <linux/if_macvlan.h> +#include <linux/bpf.h> #include <net/sch_generic.h> #include <net/pkt_sched.h> #include <net/dst.h> @@ -551,28 +552,23 @@ static void dev_watchdog(struct timer_list *t) netdev_put(dev, &dev->watchdog_dev_tracker); } -void __netdev_watchdog_up(struct net_device *dev) +void netdev_watchdog_up(struct net_device *dev) { - if (dev->netdev_ops->ndo_tx_timeout) { - if (dev->watchdog_timeo <= 0) - dev->watchdog_timeo = 5*HZ; - if (!mod_timer(&dev->watchdog_timer, - round_jiffies(jiffies + dev->watchdog_timeo))) - netdev_hold(dev, &dev->watchdog_dev_tracker, - GFP_ATOMIC); - } -} -EXPORT_SYMBOL_GPL(__netdev_watchdog_up); - -static void dev_watchdog_up(struct net_device *dev) -{ - __netdev_watchdog_up(dev); + if (!dev->netdev_ops->ndo_tx_timeout) + return; + if (dev->watchdog_timeo <= 0) + dev->watchdog_timeo = 5*HZ; + if (!mod_timer(&dev->watchdog_timer, + round_jiffies(jiffies + dev->watchdog_timeo))) + netdev_hold(dev, &dev->watchdog_dev_tracker, + GFP_ATOMIC); } +EXPORT_SYMBOL_GPL(netdev_watchdog_up); -static void dev_watchdog_down(struct net_device *dev) +static void netdev_watchdog_down(struct net_device *dev) { netif_tx_lock_bh(dev); - if (del_timer(&dev->watchdog_timer)) + if (timer_delete(&dev->watchdog_timer)) netdev_put(dev, &dev->watchdog_dev_tracker); netif_tx_unlock_bh(dev); } @@ -591,7 +587,7 @@ void netif_carrier_on(struct net_device *dev) atomic_inc(&dev->carrier_up_count); linkwatch_fire_event(dev); if (netif_running(dev)) - __netdev_watchdog_up(dev); + netdev_watchdog_up(dev); } } EXPORT_SYMBOL(netif_carrier_on); @@ -911,8 +907,8 @@ static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch, bands[prio] = q; } - return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len, - GFP_KERNEL); + return skb_array_resize_multiple_bh(bands, PFIFO_FAST_BANDS, new_len, + GFP_KERNEL); } struct Qdisc_ops pfifo_fast_ops __read_mostly = { @@ -1006,14 +1002,14 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, { struct Qdisc *sch; - if (!try_module_get(ops->owner)) { + if (!bpf_try_module_get(ops, ops->owner)) { NL_SET_ERR_MSG(extack, "Failed to increase module reference counter"); return NULL; } sch = qdisc_alloc(dev_queue, ops, extack); if (IS_ERR(sch)) { - module_put(ops->owner); + bpf_module_put(ops, ops->owner); return NULL; } sch->parent = parentid; @@ -1083,7 +1079,7 @@ static void __qdisc_destroy(struct Qdisc *qdisc) ops->destroy(qdisc); lockdep_unregister_key(&qdisc->root_lock_key); - module_put(ops->owner); + bpf_module_put(ops, ops->owner); netdev_put(dev, &qdisc->dev_tracker); trace_qdisc_destroy(qdisc); @@ -1267,7 +1263,7 @@ void dev_activate(struct net_device *dev) if (need_watchdog) { netif_trans_update(dev); - dev_watchdog_up(dev); + netdev_watchdog_up(dev); } } EXPORT_SYMBOL(dev_activate); @@ -1282,15 +1278,17 @@ 
static void qdisc_deactivate(struct Qdisc *qdisc) static void dev_deactivate_queue(struct net_device *dev, struct netdev_queue *dev_queue, - void *_qdisc_default) + void *_sync_needed) { - struct Qdisc *qdisc_default = _qdisc_default; + bool *sync_needed = _sync_needed; struct Qdisc *qdisc; qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { + if (qdisc->enqueue) + *sync_needed = true; qdisc_deactivate(qdisc); - rcu_assign_pointer(dev_queue->qdisc, qdisc_default); + rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc); } } @@ -1357,24 +1355,22 @@ static bool some_qdisc_is_busy(struct net_device *dev) */ void dev_deactivate_many(struct list_head *head) { + bool sync_needed = false; struct net_device *dev; list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, - &noop_qdisc); + &sync_needed); if (dev_ingress_queue(dev)) dev_deactivate_queue(dev, dev_ingress_queue(dev), - &noop_qdisc); + &sync_needed); - dev_watchdog_down(dev); + netdev_watchdog_down(dev); } - /* Wait for outstanding qdisc-less dev_queue_xmit calls or - * outstanding qdisc enqueuing calls. - * This is avoided if all devices are in dismantle phase : - * Caller will call synchronize_net() for us - */ - synchronize_net(); + /* Wait for outstanding qdisc enqueuing calls. */ + if (sync_needed) + synchronize_net(); list_for_each_entry(dev, head, close_list) { netdev_for_each_tx_queue(dev, dev_reset_queue, NULL); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 7d2151c62c4a..532fde548b88 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -251,10 +251,10 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch, q->stats.pdrop++; drop: - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); congestion_drop: - qdisc_drop(skb, sch, to_free); + qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_CONGESTED); return NET_XMIT_CN; } @@ -913,7 +913,8 @@ static void gred_destroy(struct Qdisc *sch) for (i = 0; i < table->DPs; i++) gred_destroy_vq(table->tab[i]); - gred_offload(sch, TC_GRED_DESTROY); + if (table->opt) + gred_offload(sch, TC_GRED_DESTROY); kfree(table->opt); } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index c287bf8423b4..5a7745170e84 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -175,6 +175,11 @@ struct hfsc_sched { #define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */ +static bool cl_in_el_or_vttree(struct hfsc_class *cl) +{ + return ((cl->cl_flags & HFSC_FSC) && cl->cl_nactive) || + ((cl->cl_flags & HFSC_RSC) && !RB_EMPTY_NODE(&cl->el_node)); +} /* * eligible tree holds backlogged classes being sorted by their eligible times. @@ -203,7 +208,10 @@ eltree_insert(struct hfsc_class *cl) static inline void eltree_remove(struct hfsc_class *cl) { - rb_erase(&cl->el_node, &cl->sched->eligible); + if (!RB_EMPTY_NODE(&cl->el_node)) { + rb_erase(&cl->el_node, &cl->sched->eligible); + RB_CLEAR_NODE(&cl->el_node); + } } static inline void @@ -958,6 +966,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl != NULL) { int old_flags; + int len = 0; if (parentid) { if (cl->cl_parent && @@ -988,9 +997,13 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (usc != NULL) hfsc_change_usc(cl, usc, cur_time); + if (cl->qdisc->q.qlen != 0) + len = qdisc_peek_len(cl->qdisc); + /* Check queue length again since some qdisc implementations + * (e.g., netem/codel) might empty the queue during the peek + * operation. 
+ */ if (cl->qdisc->q.qlen != 0) { - int len = qdisc_peek_len(cl->qdisc); - if (cl->cl_flags & HFSC_RSC) { if (old_flags & HFSC_RSC) update_ed(cl, len); @@ -1032,6 +1045,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; + RB_CLEAR_NODE(&cl->el_node); + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); @@ -1220,7 +1235,8 @@ hfsc_qlen_notify(struct Qdisc *sch, unsigned long arg) /* vttree is now handled in update_vf() so that update_vf(cl, 0, 0) * needs to be called explicitly to remove a class from vttree. */ - update_vf(cl, 0, 0); + if (cl->cl_nactive) + update_vf(cl, 0, 0); if (cl->cl_flags & HFSC_RSC) eltree_remove(cl); } @@ -1560,7 +1576,10 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) return err; } - if (first) { + sch->qstats.backlog += len; + sch->q.qlen++; + + if (first && !cl_in_el_or_vttree(cl)) { if (cl->cl_flags & HFSC_RSC) init_ed(cl, len); if (cl->cl_flags & HFSC_FSC) @@ -1575,9 +1594,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) } - sch->qstats.backlog += len; - sch->q.qlen++; - return NET_XMIT_SUCCESS; } @@ -1632,10 +1648,16 @@ hfsc_dequeue(struct Qdisc *sch) if (cl->qdisc->q.qlen != 0) { /* update ed */ next_len = qdisc_peek_len(cl->qdisc); - if (realtime) - update_ed(cl, next_len); - else - update_d(cl, next_len); + /* Check queue length again since some qdisc implementations + * (e.g., netem/codel) might empty the queue during the peek + * operation. + */ + if (cl->qdisc->q.qlen != 0) { + if (realtime) + update_ed(cl, next_len); + else + update_d(cl, next_len); + } } else { /* the class becomes passive */ eltree_remove(cl); diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 44d9efe1a96a..5aa434b46707 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -564,7 +564,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt, qlen = sch->q.qlen; prev_backlog = sch->qstats.backlog; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = hhf_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); rtnl_kfree_skbs(skb, skb); } diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index c31bc5489bdd..14bf71f57057 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -348,7 +348,8 @@ static void htb_add_to_wait_tree(struct htb_sched *q, */ static inline void htb_next_rb_node(struct rb_node **n) { - *n = rb_next(*n); + if (*n) + *n = rb_next(*n); } /** @@ -609,8 +610,8 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) */ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) { - WARN_ON(!cl->prio_activity); - + if (!cl->prio_activity) + return; htb_deactivate_prios(q, cl); cl->prio_activity = 0; } @@ -1738,8 +1739,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, if (cl->parent) cl->parent->children--; - if (cl->prio_activity) - htb_deactivate(q, cl); + htb_deactivate(q, cl); if (cl->cmode != HTB_CAN_SEND) htb_safe_rb_erase(&cl->pq_node, @@ -1947,8 +1947,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* turn parent into inner node */ qdisc_purge_queue(parent->leaf.q); parent_qdisc = parent->leaf.q; - if (parent->prio_activity) - htb_deactivate(q, parent); + htb_deactivate(q, parent); /* remove from evt list because of level change */ if (parent->cmode != HTB_CAN_SEND) { diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 71ec9986ed37..fdd79d3ccd8c 100644 --- 
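Both hfsc paths patched here guard against the same trap, called out in the new comments: qdisc_peek_len() runs the child qdisc's ->peek(), and children such as netem or codel may drop packets during the peek, possibly emptying the queue and invalidating the peeked length. Condensed shape of the guard, as it appears in the hunks:

	if (cl->qdisc->q.qlen != 0) {
		len = qdisc_peek_len(cl->qdisc);
		/* the peek itself may have emptied the child queue */
		if (cl->qdisc->q.qlen != 0)
			update_ed(cl, len);	/* only now trust 'len' */
	}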
a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -749,9 +749,9 @@ deliver: if (err != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(err)) qdisc_qstats_drop(sch); - qdisc_tree_reduce_backlog(sch, 1, pkt_len); sch->qstats.backlog -= pkt_len; sch->q.qlen--; + qdisc_tree_reduce_backlog(sch, 1, pkt_len); } goto tfifo_dequeue; } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index b3dcb845b327..ff49a6c97033 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -85,6 +85,7 @@ EXPORT_SYMBOL_GPL(pie_drop_early); static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; struct pie_sched_data *q = qdisc_priv(sch); bool enqueue = false; @@ -93,6 +94,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, goto out; } + reason = SKB_DROP_REASON_QDISC_CONGESTED; + if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog, skb->len)) { enqueue = true; @@ -121,7 +124,7 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, out: q->stats.dropped++; q->vars.accu_prob = 0; - return qdisc_drop(skb, sch, to_free); + return qdisc_drop_reason(skb, sch, to_free, reason); } static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { @@ -192,7 +195,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, /* Drop excess packets if new limit is lower */ qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); + struct sk_buff *skb = qdisc_dequeue_internal(sch, true); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); @@ -542,7 +545,7 @@ static void pie_destroy(struct Qdisc *sch) struct pie_sched_data *q = qdisc_priv(sch); q->params.tupdate = 0; - del_timer_sync(&q->adapt_timer); + timer_delete_sync(&q->adapt_timer); } static struct Qdisc_ops pie_qdisc_ops __read_mostly = { diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 6a07cdbdb9e1..bf1282cb22eb 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -202,6 +202,11 @@ struct qfq_sched { */ enum update_reason {enqueue, requeue}; +static bool cl_is_active(struct qfq_class *cl) +{ + return !list_empty(&cl->alist); +} + static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid) { struct qfq_sched *q = qdisc_priv(sch); @@ -347,7 +352,7 @@ static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl) struct qfq_aggregate *agg = cl->agg; - list_del(&cl->alist); /* remove from RR queue of the aggregate */ + list_del_init(&cl->alist); /* remove from RR queue of the aggregate */ if (list_empty(&agg->active)) /* agg is now inactive */ qfq_deactivate_agg(q, agg); } @@ -447,7 +452,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (q->wsum + delta_w > QFQ_MAX_WSUM) { NL_SET_ERR_MSG_FMT_MOD(extack, - "total weight out of range (%d + %u)\n", + "total weight out of range (%d + %u)", delta_w, q->wsum); return -EINVAL; } @@ -474,6 +479,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, gnet_stats_basic_sync_init(&cl->bstats); cl->common.classid = classid; cl->deficit = lmax; + INIT_LIST_HEAD(&cl->alist); cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, NULL); @@ -982,7 +988,7 @@ static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg, cl->deficit -= (int) len; if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */ - list_del(&cl->alist); + list_del_init(&cl->alist); else if (cl->deficit < 
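The qfq changes above apply the linked-list counterpart of the same idempotency idea: alist is set up with INIT_LIST_HEAD(), removed with list_del_init() so the node stays self-linked, and list_empty() doubles as the activity test (cl_is_active()). A standalone sketch:

#include <linux/list.h>

struct entry {
	struct list_head alist;		/* INIT_LIST_HEAD() at creation */
};

/* list_del_init() leaves the node pointing at itself, so list_empty()
 * serves as an "is linked?" predicate and a repeated removal becomes
 * a harmless no-op instead of list corruption. */
static void entry_deactivate(struct entry *e)
{
	if (list_empty(&e->alist))
		return;
	list_del_init(&e->alist);
}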
qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) { cl->deficit += agg->lmax; list_move_tail(&cl->alist, &agg->active); @@ -1214,7 +1220,6 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct qfq_class *cl; struct qfq_aggregate *agg; int err = 0; - bool first; cl = qfq_classify(skb, sch, &err); if (cl == NULL) { @@ -1236,7 +1241,6 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, } gso_segs = skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1; - first = !cl->qdisc->q.qlen; err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); @@ -1252,8 +1256,8 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, ++sch->q.qlen; agg = cl->agg; - /* if the queue was not empty, then done here */ - if (!first) { + /* if the class is active, then done here */ + if (cl_is_active(cl)) { if (unlikely(skb == cl->qdisc->ops->peek(cl->qdisc)) && list_first_entry(&agg->active, struct qfq_class, alist) == cl && cl->deficit < len) @@ -1415,6 +1419,8 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg) struct qfq_sched *q = qdisc_priv(sch); struct qfq_class *cl = (struct qfq_class *)arg; + if (list_empty(&cl->alist)) + return; qfq_deactivate_class(q, cl); } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 6029bc29b51e..1ba3e0bba54f 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -70,6 +70,7 @@ static int red_use_nodrop(struct red_sched_data *q) static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_CONGESTED; struct red_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; unsigned int len; @@ -107,6 +108,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch, break; case RED_HARD_MARK: + reason = SKB_DROP_REASON_QDISC_OVERLIMIT; qdisc_qstats_overlimit(sch); if (red_use_harddrop(q) || !red_use_ecn(q)) { q->stats.forced_drop++; @@ -143,7 +145,7 @@ congestion_drop: if (!skb) return NET_XMIT_CN | ret; - qdisc_drop(skb, sch, to_free); + qdisc_drop_reason(skb, sch, to_free, reason); return NET_XMIT_CN; } @@ -216,7 +218,7 @@ static void red_destroy(struct Qdisc *sch) tcf_qevent_destroy(&q->qe_mark, sch); tcf_qevent_destroy(&q->qe_early_drop, sch); - del_timer_sync(&q->adapt_timer); + timer_delete_sync(&q->adapt_timer); red_offload(sch, false); qdisc_put(q->qdisc); } @@ -295,7 +297,7 @@ static int __red_change(struct Qdisc *sch, struct nlattr **tb, max_P); red_set_vars(&q->vars); - del_timer(&q->adapt_timer); + timer_delete(&q->adapt_timer); if (ctl->flags & TC_RED_ADAPTATIVE) mod_timer(&q->adapt_timer, jiffies + HZ/2); diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index b717e15a3a17..d2835f1168e1 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -280,6 +280,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT; struct sfb_sched_data *q = qdisc_priv(sch); unsigned int len = qdisc_pkt_len(skb); struct Qdisc *child = q->qdisc; @@ -380,6 +381,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, } r = get_random_u16() & SFB_MAX_PROB; + reason = SKB_DROP_REASON_QDISC_CONGESTED; if (unlikely(r < p_min)) { if (unlikely(p_min > SFB_MAX_PROB / 2)) { @@ -414,7 +416,7 @@ enqueue: return ret; drop: - qdisc_drop(skb, sch, to_free); + qdisc_drop_reason(skb, sch, to_free, reason); return NET_XMIT_CN; other_drop: if (ret & 
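The red and sfb hunks encode the drop-reason convention used across these qdisc patches: SKB_DROP_REASON_QDISC_OVERLIMIT for drops forced by a hard queue limit, SKB_DROP_REASON_QDISC_CONGESTED for probabilistic early drops made by the AQM. A toy enqueue path showing the selection (aqm_should_early_drop() is a hypothetical stand-in for the RED/SFB probability check):

static int toy_enqueue(struct sk_buff *skb, struct Qdisc *sch,
		       struct sk_buff **to_free)
{
	/* default: any drop from here is due to the hard limit */
	enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT;

	if (sch->q.qlen >= sch->limit)
		return qdisc_drop_reason(skb, sch, to_free, reason);

	/* from here on, a drop is a congestion signal instead */
	reason = SKB_DROP_REASON_QDISC_CONGESTED;
	if (aqm_should_early_drop(sch, skb))	/* hypothetical helper */
		return qdisc_drop_reason(skb, sch, to_free, reason);

	return qdisc_enqueue_tail(skb, sch);
}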
__NET_XMIT_BYPASS) diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index a4b8296a2fa1..b912ad99aa15 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -631,6 +631,15 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, struct red_parms *p = NULL; struct sk_buff *to_free = NULL; struct sk_buff *tail = NULL; + unsigned int maxflows; + unsigned int quantum; + unsigned int divisor; + int perturb_period; + u8 headdrop; + u8 maxdepth; + int limit; + u8 flags; + if (opt->nla_len < nla_attr_size(sizeof(*ctl))) return -EINVAL; @@ -652,35 +661,64 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, if (!p) return -ENOMEM; } + sch_tree_lock(sch); + + limit = q->limit; + divisor = q->divisor; + headdrop = q->headdrop; + maxdepth = q->maxdepth; + maxflows = q->maxflows; + perturb_period = q->perturb_period; + quantum = q->quantum; + flags = q->flags; + + /* update and validate configuration */ if (ctl->quantum) - q->quantum = ctl->quantum; - WRITE_ONCE(q->perturb_period, ctl->perturb_period * HZ); + quantum = ctl->quantum; + perturb_period = ctl->perturb_period * HZ; if (ctl->flows) - q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); + maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); if (ctl->divisor) { - q->divisor = ctl->divisor; - q->maxflows = min_t(u32, q->maxflows, q->divisor); + divisor = ctl->divisor; + maxflows = min_t(u32, maxflows, divisor); } if (ctl_v1) { if (ctl_v1->depth) - q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); + maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); if (p) { - swap(q->red_parms, p); - red_set_parms(q->red_parms, + red_set_parms(p, ctl_v1->qth_min, ctl_v1->qth_max, ctl_v1->Wlog, ctl_v1->Plog, ctl_v1->Scell_log, NULL, ctl_v1->max_P); } - q->flags = ctl_v1->flags; - q->headdrop = ctl_v1->headdrop; + flags = ctl_v1->flags; + headdrop = ctl_v1->headdrop; } if (ctl->limit) { - q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows); - q->maxflows = min_t(u32, q->maxflows, q->limit); + limit = min_t(u32, ctl->limit, maxdepth * maxflows); + maxflows = min_t(u32, maxflows, limit); } + if (limit == 1) { + sch_tree_unlock(sch); + kfree(p); + NL_SET_ERR_MSG_MOD(extack, "invalid limit"); + return -EINVAL; + } + + /* commit configuration */ + q->limit = limit; + q->divisor = divisor; + q->headdrop = headdrop; + q->maxdepth = maxdepth; + q->maxflows = maxflows; + WRITE_ONCE(q->perturb_period, perturb_period); + q->quantum = quantum; + q->flags = flags; + if (p) + swap(q->red_parms, p); qlen = sch->q.qlen; while (sch->q.qlen > q->limit) { @@ -692,7 +730,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt, rtnl_kfree_skbs(to_free, tail); qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); - del_timer(&q->perturb_timer); + timer_delete(&q->perturb_timer); if (q->perturb_period) { mod_timer(&q->perturb_timer, jiffies + q->perturb_period); get_random_bytes(&q->perturbation, sizeof(q->perturbation)); @@ -718,7 +756,7 @@ static void sfq_destroy(struct Qdisc *sch) tcf_block_put(q->block); WRITE_ONCE(q->perturb_period, 0); - del_timer_sync(&q->perturb_timer); + timer_delete_sync(&q->perturb_timer); sfq_free(q->ht); sfq_free(q->slots); kfree(q->red_parms); diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c index 20ff7386b74b..f485f62ab721 100644 --- a/net/sched/sch_skbprio.c +++ b/net/sched/sch_skbprio.c @@ -123,8 +123,6 @@ static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* Check to update highest and lowest priorities. 
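sfq_change() above is restructured into read–modify–validate–commit: the live values are copied into locals under sch_tree_lock(), the netlink request is applied to the locals, the combination is validated (limit == 1 is rejected), and only a fully consistent set is written back, so a failed change can no longer leave q half-updated. Condensed flow:

	sch_tree_lock(sch);
	limit    = q->limit;			/* snapshot live config */
	maxdepth = q->maxdepth;
	maxflows = q->maxflows;

	if (ctl->limit) {			/* apply request to locals */
		limit    = min_t(u32, ctl->limit, maxdepth * maxflows);
		maxflows = min_t(u32, maxflows, limit);
	}
	if (limit == 1) {			/* validate before commit */
		sch_tree_unlock(sch);
		NL_SET_ERR_MSG_MOD(extack, "invalid limit");
		return -EINVAL;
	}

	q->limit    = limit;			/* commit validated set */
	q->maxflows = maxflows;
	/* excess packets are then dropped, followed by sch_tree_unlock() */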
*/ if (skb_queue_empty(lp_qdisc)) { if (q->lowest_prio == q->highest_prio) { - /* The incoming packet is the only packet in queue. */ - BUG_ON(sch->q.qlen != 1); q->lowest_prio = prio; q->highest_prio = prio; } else { @@ -156,7 +154,6 @@ static struct sk_buff *skbprio_dequeue(struct Qdisc *sch) /* Update highest priority field. */ if (skb_queue_empty(hpq)) { if (q->lowest_prio == q->highest_prio) { - BUG_ON(sch->q.qlen); q->highest_prio = 0; q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1; } else { diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index a68e17891b0b..14021b812329 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1932,8 +1932,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (!TXTIME_ASSIST_IS_ENABLED(q->flags) && !FULL_OFFLOAD_IS_ENABLED(q->flags) && !hrtimer_active(&q->advance_timer)) { - hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); - q->advance_timer.function = advance_sched; + hrtimer_setup(&q->advance_timer, advance_sched, q->clockid, HRTIMER_MODE_ABS); } err = taprio_get_start_time(sch, new_admin, &start); @@ -2056,8 +2055,7 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, spin_lock_init(&q->current_entry_lock); - hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS); - q->advance_timer.function = advance_sched; + hrtimer_setup(&q->advance_timer, advance_sched, CLOCK_TAI, HRTIMER_MODE_ABS); q->root = sch; diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 5da599ff84a9..24d5a35ce894 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -10,7 +10,7 @@ menuconfig IP_SCTP select CRYPTO select CRYPTO_HMAC select CRYPTO_SHA1 - select LIBCRC32C + select NET_CRC32C select NET_UDP_TUNNEL help Stream Control Transmission Protocol diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 0b0794f164cf..5793d71852b8 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -362,7 +362,7 @@ void sctp_association_free(struct sctp_association *asoc) * on our state. */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) { - if (del_timer(&asoc->timers[i])) + if (timer_delete(&asoc->timers[i])) sctp_association_put(asoc); } @@ -736,24 +736,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, return peer; } -/* Delete a transport address from an association. */ -void sctp_assoc_del_peer(struct sctp_association *asoc, - const union sctp_addr *addr) -{ - struct list_head *pos; - struct list_head *temp; - struct sctp_transport *transport; - - list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { - transport = list_entry(pos, struct sctp_transport, transports); - if (sctp_cmp_addr_exact(addr, &transport->ipaddr)) { - /* Do book keeping for removing the peer and free it. */ - sctp_assoc_rm_peer(asoc, transport); - break; - } - } -} - /* Lookup a transport by address. */ struct sctp_transport *sctp_assoc_lookup_paddr( const struct sctp_association *asoc, @@ -1521,7 +1503,7 @@ void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len) /* Stop the SACK timer. 
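The taprio hunks are part of the tree-wide hrtimer_setup() conversion: one call both initializes the timer and installs its callback, removing the window in which an initialized hrtimer exists without a handler:

	/* before: two steps, ->function briefly unset */
	hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS);
	q->advance_timer.function = advance_sched;

	/* after: callback supplied atomically with the init */
	hrtimer_setup(&q->advance_timer, advance_sched, q->clockid,
		      HRTIMER_MODE_ABS);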
*/ timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK]; - if (del_timer(timer)) + if (timer_delete(timer)) sctp_association_put(asoc); } } diff --git a/net/sctp/input.c b/net/sctp/input.c index a8a254a5008e..0c0d2757f6f8 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -446,7 +446,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk, pr_debug("%s: unrecognized next header type " "encountered!\n", __func__); - if (del_timer(&t->proto_unreach_timer)) + if (timer_delete(&t->proto_unreach_timer)) sctp_transport_put(t); sctp_do_sm(net, SCTP_EVENT_T_OTHER, diff --git a/net/sctp/offload.c b/net/sctp/offload.c index 502095173d88..e6f863c031b4 100644 --- a/net/sctp/offload.c +++ b/net/sctp/offload.c @@ -111,7 +111,6 @@ int __init sctp_offload_init(void) if (ret) goto ipv4; - crc32c_csum_stub = &sctp_csum_ops; return ret; ipv4: diff --git a/net/sctp/output.c b/net/sctp/output.c index a63df055ac57..23e96305cad7 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -312,7 +312,7 @@ static enum sctp_xmit sctp_packet_bundle_sack(struct sctp_packet *pkt, SCTP_MIB_OUTCTRLCHUNKS); asoc->stats.octrlchunks++; asoc->peer.sack_needed = 0; - if (del_timer(timer)) + if (timer_delete(timer)) sctp_association_put(asoc); } } diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 0dc6b8ab9963..f6b8c13dafa4 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1630,8 +1630,7 @@ static void sctp_check_transmitted(struct sctp_outq *q, * as the receiver acknowledged any data. */ if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING && - del_timer(&asoc->timers - [SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD])) + timer_delete(&asoc->timers[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD])) sctp_association_put(asoc); /* Mark the destination transport address as @@ -1688,7 +1687,7 @@ static void sctp_check_transmitted(struct sctp_outq *q, * address. 
*/ if (!transport->flight_size) { - if (del_timer(&transport->T3_rtx_timer)) + if (timer_delete(&transport->T3_rtx_timer)) sctp_transport_put(transport); } else if (restart_timer) { if (!mod_timer(&transport->T3_rtx_timer, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 8b9a1b96695e..8c3b80c4d40b 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -43,6 +43,7 @@ #include <net/addrconf.h> #include <net/inet_common.h> #include <net/inet_ecn.h> +#include <net/inet_sock.h> #include <net/udp_tunnel.h> #include <net/inet_dscp.h> @@ -184,12 +185,9 @@ static void sctp_v4_copy_ip_options(struct sock *sk, struct sock *newsk) rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { - newopt = sock_kmalloc(newsk, sizeof(*inet_opt) + + newopt = sock_kmemdup(newsk, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen, GFP_ATOMIC); - if (newopt) - memcpy(newopt, inet_opt, sizeof(*inet_opt) + - inet_opt->opt.optlen); - else + if (!newopt) pr_err("%s: Failed to copy ip options\n", __func__); } RCU_INIT_POINTER(newinet->inet_opt, newopt); @@ -427,16 +425,19 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct dst_entry *dst = NULL; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; - u8 tos = READ_ONCE(inet_sk(sk)->tos); + dscp_t dscp; if (t->dscp & SCTP_DSCP_SET_MASK) - tos = t->dscp & SCTP_DSCP_VAL_MASK; + dscp = inet_dsfield_to_dscp(t->dscp); + else + dscp = inet_sk_dscp(inet_sk(sk)); + memset(&_fl, 0x0, sizeof(_fl)); fl4->daddr = daddr->v4.sin_addr.s_addr; fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { - fl4->flowi4_tos = tos & INET_DSCP_MASK; + fl4->flowi4_tos = inet_dscp_to_dsfield(dscp); fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); @@ -694,7 +695,7 @@ static void sctp_free_addr_wq(struct net *net) struct sctp_sockaddr_entry *temp; spin_lock_bh(&net->sctp.addr_wq_lock); - del_timer(&net->sctp.addr_wq_timer); + timer_delete(&net->sctp.addr_wq_timer); list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) { list_del(&addrw->list); kfree(addrw); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index f80208edd6a5..3ead591c72fd 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -115,14 +115,6 @@ static void sctp_control_set_owner_w(struct sctp_chunk *chunk) skb->destructor = sctp_control_release_owner; } -/* What was the inbound interface for this chunk? 
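sctp_v4_copy_ip_options() above now uses sock_kmemdup(), which folds the sock_kmalloc() + memcpy() pair into a single call that charges the duplicate to the socket's option memory. The equivalence, sketched with len standing for sizeof(*inet_opt) + inet_opt->opt.optlen:

	/* open-coded form being replaced */
	newopt = sock_kmalloc(newsk, len, GFP_ATOMIC);
	if (newopt)
		memcpy(newopt, inet_opt, len);

	/* single-call replacement */
	newopt = sock_kmemdup(newsk, inet_opt, len, GFP_ATOMIC);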
*/ -int sctp_chunk_iif(const struct sctp_chunk *chunk) -{ - struct sk_buff *skb = chunk->skb; - - return SCTP_INPUT_CB(skb)->af->skb_iif(skb); -} - /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 2: The ECN capable field is reserved for future use of diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 23d6633966b1..3aa5da5e3bbd 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -734,7 +734,7 @@ static void sctp_cmd_hb_timers_stop(struct sctp_cmd_seq *cmds, list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { - if (del_timer(&t->hb_timer)) + if (timer_delete(&t->hb_timer)) sctp_transport_put(t); } } @@ -747,7 +747,7 @@ static void sctp_cmd_t3_rtx_timers_stop(struct sctp_cmd_seq *cmds, list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { - if (del_timer(&t->T3_rtx_timer)) + if (timer_delete(&t->T3_rtx_timer)) sctp_transport_put(t); } } @@ -1557,7 +1557,7 @@ static int sctp_cmd_interpreter(enum sctp_event_type event_type, case SCTP_CMD_TIMER_STOP: timer = &asoc->timers[cmd->obj.to]; - if (del_timer(timer)) + if (timer_delete(timer)) sctp_association_put(asoc); break; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 36ee34f483d7..1e5739858c20 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -72,8 +72,9 @@ /* Forward declarations for internal helper functions. */ static bool sctp_writeable(const struct sock *sk); static void sctp_wfree(struct sk_buff *skb); -static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len); +static int sctp_wait_for_sndbuf(struct sctp_association *asoc, + struct sctp_transport *transport, + long *timeo_p, size_t msg_len); static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p); static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p); static int sctp_wait_for_accept(struct sock *sk, long timeo); @@ -1828,7 +1829,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, if (sctp_wspace(asoc) <= 0 || !sk_wmem_schedule(sk, msg_len)) { timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); - err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); + err = sctp_wait_for_sndbuf(asoc, transport, &timeo, msg_len); if (err) goto err; if (unlikely(sinfo->sinfo_stream >= asoc->stream.outcnt)) { @@ -5626,7 +5627,8 @@ static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optv } /* Helper routine to branch off an association to a new socket. */ -int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) +static int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, + struct socket **sockp) { struct sctp_association *asoc = sctp_id2assoc(sk, id); struct sctp_sock *sp = sctp_sk(sk); @@ -5674,7 +5676,6 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp) return err; } -EXPORT_SYMBOL(sctp_do_peeloff); static int sctp_getsockopt_peeloff_common(struct sock *sk, sctp_peeloff_arg_t *peeloff, struct file **newfile, unsigned flags) @@ -8320,7 +8321,7 @@ static int sctp_hash(struct sock *sk) static void sctp_unhash(struct sock *sk) { - /* STUB */ + sock_rps_delete_flow(sk); } /* Check if port is acceptable. Possibly find first available port. 
@@ -9099,7 +9100,8 @@ static void __sctp_write_space(struct sctp_association *asoc) wq = rcu_dereference(sk->sk_wq); if (wq) { if (waitqueue_active(&wq->wait)) - wake_up_interruptible(&wq->wait); + wake_up_interruptible_poll(&wq->wait, EPOLLOUT | + EPOLLWRNORM | EPOLLWRBAND); /* Note that we try to include the Async I/O support * here by modeling from the current TCP/UDP code. @@ -9214,8 +9216,9 @@ void sctp_sock_rfree(struct sk_buff *skb) /* Helper function to wait for space in the sndbuf. */ -static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, - size_t msg_len) +static int sctp_wait_for_sndbuf(struct sctp_association *asoc, + struct sctp_transport *transport, + long *timeo_p, size_t msg_len) { struct sock *sk = asoc->base.sk; long current_timeo = *timeo_p; @@ -9225,7 +9228,9 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc, *timeo_p, msg_len); - /* Increment the association's refcnt. */ + /* Increment the transport and association's refcnt. */ + if (transport) + sctp_transport_hold(transport); sctp_association_hold(asoc); /* Wait on the association specific sndbuf space. */ @@ -9234,7 +9239,7 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, TASK_INTERRUPTIBLE); if (asoc->base.dead) goto do_dead; - if (!*timeo_p) + if ((!*timeo_p) || (transport && transport->dead)) goto do_nonblock; if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING) goto do_error; @@ -9259,7 +9264,9 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, out: finish_wait(&asoc->wait, &wait); - /* Release the association's refcnt. */ + /* Release the transport and association's refcnt. */ + if (transport) + sctp_transport_put(transport); sctp_association_put(asoc); return err; diff --git a/net/sctp/stream.c b/net/sctp/stream.c index c241cc552e8d..f205556c5b24 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -576,7 +576,7 @@ struct sctp_chunk *sctp_process_strreset_outreq( struct sctp_transport *t; t = asoc->strreset_chunk->transport; - if (del_timer(&t->reconf_timer)) + if (timer_delete(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); @@ -735,7 +735,7 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( * value SHOULD be the smallest TSN not acknowledged by the * receiver of the request plus 2^31. 
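The sctp_wait_for_sndbuf() change closes a use-after-free: while a sender sleeps waiting for buffer space, the transport it queued data on can be torn down. The fix pins the transport for the duration of the sleep and re-checks the new ->dead flag (set in sctp_transport_free(), later in this diff) at every wakeup. Skeleton of the pattern, condensed from the hunks (space_available() is a hypothetical stand-in for the real wspace checks; 'wait' is a wait_queue_entry set up beforehand):

	if (transport)
		sctp_transport_hold(transport);	/* pin across the sleep */
	for (;;) {
		prepare_to_wait_exclusive(&asoc->wait, &wait,
					  TASK_INTERRUPTIBLE);
		if (!*timeo_p || (transport && transport->dead))
			break;			/* timed out or torn down */
		if (space_available(asoc, msg_len))
			break;
		*timeo_p = schedule_timeout(*timeo_p);
	}
	finish_wait(&asoc->wait, &wait);
	if (transport)
		sctp_transport_put(transport);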
*/ - init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1 << 31); + init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1U << 31); sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, init_tsn, GFP_ATOMIC); @@ -825,7 +825,7 @@ struct sctp_chunk *sctp_process_strreset_addstrm_out( struct sctp_transport *t; t = asoc->strreset_chunk->transport; - if (del_timer(&t->reconf_timer)) + if (timer_delete(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); @@ -1076,7 +1076,7 @@ struct sctp_chunk *sctp_process_strreset_resp( /* remove everything for this reconf request */ if (!asoc->strreset_outstanding) { t = asoc->strreset_chunk->transport; - if (del_timer(&t->reconf_timer)) + if (timer_delete(&t->reconf_timer)) sctp_transport_put(t); sctp_chunk_put(asoc->strreset_chunk); diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 8e1e97be4df7..ee3eac338a9d 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -525,6 +525,8 @@ static int proc_sctp_do_auth(const struct ctl_table *ctl, int write, return ret; } +static DEFINE_MUTEX(sctp_sysctl_mutex); + static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -549,6 +551,7 @@ static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, if (new_value > max || new_value < min) return -EINVAL; + mutex_lock(&sctp_sysctl_mutex); net->sctp.udp_port = new_value; sctp_udp_sock_stop(net); if (new_value) { @@ -561,6 +564,7 @@ static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, lock_sock(sk); sctp_sk(sk)->udp_port = htons(net->sctp.udp_port); release_sock(sk); + mutex_unlock(&sctp_sysctl_mutex); } return ret; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 2abe45af98e7..6946c1462793 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -117,8 +117,10 @@ fail: */ void sctp_transport_free(struct sctp_transport *transport) { + transport->dead = 1; + /* Try to delete the heartbeat timer. */ - if (del_timer(&transport->hb_timer)) + if (timer_delete(&transport->hb_timer)) sctp_transport_put(transport); /* Delete the T3_rtx timer if it's active. @@ -126,17 +128,17 @@ void sctp_transport_free(struct sctp_transport *transport) * structure hang around in memory since we know * the transport is going away. */ - if (del_timer(&transport->T3_rtx_timer)) + if (timer_delete(&transport->T3_rtx_timer)) sctp_transport_put(transport); - if (del_timer(&transport->reconf_timer)) + if (timer_delete(&transport->reconf_timer)) sctp_transport_put(transport); - if (del_timer(&transport->probe_timer)) + if (timer_delete(&transport->probe_timer)) sctp_transport_put(transport); /* Delete the ICMP proto unreachable timer if it's active. 
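proc_sctp_do_udp_port() above gains sctp_sysctl_mutex because two concurrent writers of net.sctp.udp_port could interleave stopping and restarting the UDP encapsulation socket. The serialized section, sketched from the hunk (the restart branch is elided in the diff; sctp_udp_sock_start() is shown here as an assumption, as the plausible counterpart of the stop call):

static DEFINE_MUTEX(sctp_sysctl_mutex);

	mutex_lock(&sctp_sysctl_mutex);
	net->sctp.udp_port = new_value;
	sctp_udp_sock_stop(net);		/* close old tunnel socket */
	if (new_value)
		ret = sctp_udp_sock_start(net);	/* assumed restart call */
	/* ...then propagate the port to the per-net ctl_sock... */
	mutex_unlock(&sctp_sysctl_mutex);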
*/ - if (del_timer(&transport->proto_unreach_timer)) + if (timer_delete(&transport->proto_unreach_timer)) sctp_transport_put(transport); sctp_transport_put(transport); @@ -829,7 +831,7 @@ void sctp_transport_reset(struct sctp_transport *t) void sctp_transport_immediate_rtx(struct sctp_transport *t) { /* Stop pending T3_rtx_timer */ - if (del_timer(&t->T3_rtx_timer)) + if (timer_delete(&t->T3_rtx_timer)) sctp_transport_put(t); sctp_retransmit(&t->asoc->outqueue, t, SCTP_RTXR_T3_RTX); diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c index 15463062fe7b..7101a48bce54 100644 --- a/net/shaper/shaper.c +++ b/net/shaper/shaper.c @@ -40,7 +40,7 @@ static void net_shaper_lock(struct net_shaper_binding *binding) { switch (binding->type) { case NET_SHAPER_BINDING_TYPE_NETDEV: - mutex_lock(&binding->netdev->lock); + netdev_lock(binding->netdev); break; } } @@ -49,7 +49,7 @@ static void net_shaper_unlock(struct net_shaper_binding *binding) { switch (binding->type) { case NET_SHAPER_BINDING_TYPE_NETDEV: - mutex_unlock(&binding->netdev->lock); + netdev_unlock(binding->netdev); break; } } @@ -1398,7 +1398,7 @@ void net_shaper_set_real_num_tx_queues(struct net_device *dev, /* Only drivers implementing shapers support ensure * the lock is acquired in advance. */ - lockdep_assert_held(&dev->lock); + netdev_assert_locked(dev); /* Take action only when decreasing the tx queue number. */ for (i = txq; i < dev->real_num_tx_queues; ++i) { diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6cc7b846cff1..3760131f1484 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -362,6 +362,9 @@ static void smc_destruct(struct sock *sk) return; } +static struct lock_class_key smc_key; +static struct lock_class_key smc_slock_key; + void smc_sk_init(struct net *net, struct sock *sk, int protocol) { struct smc_sock *smc = smc_sk(sk); @@ -375,6 +378,8 @@ void smc_sk_init(struct net *net, struct sock *sk, int protocol) INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); INIT_LIST_HEAD(&smc->accept_q); + sock_lock_init_class_and_name(sk, "slock-AF_SMC", &smc_slock_key, + "sk_lock-AF_SMC", &smc_key); spin_lock_init(&smc->accept_q_lock); spin_lock_init(&smc->conn.send_lock); sk->sk_prot->hash(sk); @@ -1117,7 +1122,10 @@ static int smc_find_proposal_devices(struct smc_sock *smc, ini->check_smcrv2 = true; ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr; if (!(ini->smcr_version & SMC_V2) || - smc->clcsock->sk->sk_family != AF_INET || +#if IS_ENABLED(CONFIG_IPV6) + (smc->clcsock->sk->sk_family == AF_INET6 && + !ipv6_addr_v4mapped(&smc->clcsock->sk->sk_v6_rcv_saddr)) || +#endif !smc_clc_ueid_count() || smc_find_rdma_device(smc, ini)) ini->smcr_version &= ~SMC_V2; @@ -2738,7 +2746,7 @@ int smc_accept(struct socket *sock, struct socket *new_sock, release_sock(clcsk); } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) { lock_sock(nsk); - smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available); + smc_rx_wait(smc_sk(nsk), &timeo, 0, smc_rx_data_available); release_sock(nsk); } } @@ -3334,10 +3342,7 @@ int smc_create_clcsk(struct net *net, struct sock *sk, int family) * which need net ref. 
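The af_smc hunk above gives SMC sockets dedicated lockdep classes via sock_lock_init_class_and_name(). An SMC socket lock routinely nests with the lock of the internal TCP clcsock; with both sharing the default class, lockdep would presumably flag a false self-deadlock, so the classes are split:

static struct lock_class_key smc_key;
static struct lock_class_key smc_slock_key;

	/* give sk->sk_lock and its spinlock SMC-specific classes */
	sock_lock_init_class_and_name(sk, "slock-AF_SMC", &smc_slock_key,
				      "sk_lock-AF_SMC", &smc_key);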
*/ sk = smc->clcsock->sk; - __netns_tracker_free(net, &sk->ns_tracker, false); - sk->sk_net_refcnt = 1; - get_net_track(net, &sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sk); return 0; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 3b125d348b4a..ac07b963aede 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -795,9 +795,14 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, if (lgr->smc_version == SMC_V2) { lnk->smcibdev = ini->smcrv2.ib_dev_v2; lnk->ibport = ini->smcrv2.ib_port_v2; + lnk->wr_rx_sge_cnt = lnk->smcibdev->ibdev->attrs.max_recv_sge < 2 ? 1 : 2; + lnk->wr_rx_buflen = smc_link_shared_v2_rxbuf(lnk) ? + SMC_WR_BUF_SIZE : SMC_WR_BUF_V2_SIZE; } else { lnk->smcibdev = ini->ib_dev; lnk->ibport = ini->ib_port; + lnk->wr_rx_sge_cnt = 1; + lnk->wr_rx_buflen = SMC_WR_BUF_SIZE; } get_device(&lnk->smcibdev->ibdev->dev); atomic_inc(&lnk->smcibdev->lnk_cnt); @@ -2150,7 +2155,7 @@ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) { size = min_t(int, PAGE_SIZE - offset, buf_size); sg_set_page(sg, vmalloc_to_page(buf), size, offset); - buf += size / sizeof(*buf); + buf += size; buf_size -= size; offset = 0; } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 69b54ecd6503..48a1b1dcb576 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -122,10 +122,14 @@ struct smc_link { } ____cacheline_aligned_in_smp; struct completion tx_ref_comp; - struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */ + u8 *wr_rx_bufs; /* WR recv payload buffers */ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */ /* above three vectors have wr_rx_cnt elements and use the same index */ + int wr_rx_sge_cnt; /* rx sge, V1 is 1, V2 is either 2 or 1 */ + int wr_rx_buflen; /* buffer len for the first sge, len for the + * second sge is lgr shared if rx sge is 2. + */ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ u64 wr_rx_id; /* seq # of last recv WR */ @@ -506,6 +510,11 @@ static inline bool smc_link_active(struct smc_link *lnk) return lnk->state == SMC_LNK_ACTIVE; } +static inline bool smc_link_shared_v2_rxbuf(struct smc_link *lnk) +{ + return lnk->wr_rx_sge_cnt > 1; +} + static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw) { sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x", diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 9c563cdbea90..53828833a3f7 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -662,7 +662,6 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) /* create a queue pair within the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { - int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 
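The SMC-R v2 receive path now adapts to the RNIC's scatter/gather limits instead of assuming two receive SGEs per WR. Links on devices with max_recv_sge < 2 post a single full-size private buffer per WR; otherwise each WR uses a short first SGE plus the link-group-shared v2 spillover buffer. The per-link decision, condensed from the hunks:

	if (lgr->smc_version == SMC_V2) {
		lnk->wr_rx_sge_cnt =
			lnk->smcibdev->ibdev->attrs.max_recv_sge < 2 ? 1 : 2;
		/* 2 SGEs: small private buffer + shared spillover;
		 * 1 SGE: the private buffer must hold a whole v2 message */
		lnk->wr_rx_buflen = smc_link_shared_v2_rxbuf(lnk) ?
				    SMC_WR_BUF_SIZE : SMC_WR_BUF_V2_SIZE;
	} else {
		lnk->wr_rx_sge_cnt = 1;
		lnk->wr_rx_buflen = SMC_WR_BUF_SIZE;
	}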
2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, @@ -676,7 +675,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .max_send_wr = SMC_WR_BUF_CNT * 3, .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, - .max_recv_sge = sges_per_buf, + .max_recv_sge = lnk->wr_rx_sge_cnt, .max_inline_data = 0, }, .sq_sig_type = IB_SIGNAL_REQ_WR, diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 018ce8133b02..f865c58c3aa7 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -997,13 +997,14 @@ static int smc_llc_cli_conf_link(struct smc_link *link, } static void smc_llc_save_add_link_rkeys(struct smc_link *link, - struct smc_link *link_new) + struct smc_link *link_new, + u8 *llc_msg) { struct smc_llc_msg_add_link_v2_ext *ext; struct smc_link_group *lgr = link->lgr; int max, i; - ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)lgr->wr_rx_buf_v2 + + ext = (struct smc_llc_msg_add_link_v2_ext *)(llc_msg + SMC_WR_TX_SIZE); max = min_t(u8, ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); down_write(&lgr->rmbs_lock); @@ -1098,7 +1099,9 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) if (rc) goto out_clear_lnk; if (lgr->smc_version == SMC_V2) { - smc_llc_save_add_link_rkeys(link, lnk_new); + u8 *llc_msg = smc_link_shared_v2_rxbuf(link) ? + (u8 *)lgr->wr_rx_buf_v2 : (u8 *)llc; + smc_llc_save_add_link_rkeys(link, lnk_new, llc_msg); } else { rc = smc_llc_cli_rkey_exchange(link, lnk_new); if (rc) { @@ -1498,7 +1501,9 @@ int smc_llc_srv_add_link(struct smc_link *link, if (rc) goto out_err; if (lgr->smc_version == SMC_V2) { - smc_llc_save_add_link_rkeys(link, link_new); + u8 *llc_msg = smc_link_shared_v2_rxbuf(link) ? + (u8 *)lgr->wr_rx_buf_v2 : (u8 *)add_llc; + smc_llc_save_add_link_rkeys(link, link_new, llc_msg); } else { rc = smc_llc_srv_rkey_exchange(link, link_new); if (rc) @@ -1807,8 +1812,12 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) if (lgr->smc_version == SMC_V2) { struct smc_llc_msg_delete_rkey_v2 *llcv2; - memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc)); - llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2; + if (smc_link_shared_v2_rxbuf(link)) { + memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc)); + llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2; + } else { + llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)llc; + } llcv2->num_inval_rkeys = 0; max = min_t(u8, llcv2->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 716808f374a8..b391c2ef463f 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -1079,14 +1079,16 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct net_device *base_ndev; struct net *net; - ndev = pnet_find_base_ndev(ndev); + base_ndev = pnet_find_base_ndev(ndev); net = dev_net(ndev); - if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, + if (smc_pnetid_by_dev_port(base_ndev->dev.parent, base_ndev->dev_port, ndev_pnetid) && + smc_pnet_find_ndev_pnetid_by_table(base_ndev, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { - smc_pnet_find_rdma_dev(ndev, ini); + smc_pnet_find_rdma_dev(base_ndev, ini); return; /* pnetid could not be determined */ } _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net); diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index f0cbe77a80b4..e7f1134453ef 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -197,7 +197,7 @@ static int 
smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, partial[i].offset = offset; partial[i].len = size; partial[i].private = (unsigned long)priv[i]; - buf += size / sizeof(*buf); + buf += size; left -= size; offset = 0; } @@ -238,22 +238,23 @@ out: return -ENOMEM; } -static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn) +static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn, size_t peeked) { - return atomic_read(&conn->bytes_to_rcv) && + return smc_rx_data_available(conn, peeked) && !atomic_read(&conn->splice_pending); } /* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted * @smc smc socket * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout + * @peeked number of bytes already peeked * @fcrit add'l criterion to evaluate as function pointer * Returns: * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown. * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted). */ -int smc_rx_wait(struct smc_sock *smc, long *timeo, - int (*fcrit)(struct smc_connection *conn)) +int smc_rx_wait(struct smc_sock *smc, long *timeo, size_t peeked, + int (*fcrit)(struct smc_connection *conn, size_t baseline)) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; @@ -262,7 +263,7 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, struct sock *sk = &smc->sk; int rc; - if (fcrit(conn)) + if (fcrit(conn, peeked)) return 1; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); add_wait_queue(sk_sleep(sk), &wait); @@ -271,7 +272,7 @@ int smc_rx_wait(struct smc_sock *smc, long *timeo, cflags->peer_conn_abort || READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN || conn->killed || - fcrit(conn), + fcrit(conn, peeked), &wait); remove_wait_queue(sk_sleep(sk), &wait); sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); @@ -322,11 +323,11 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, return -EAGAIN; } -static bool smc_rx_recvmsg_data_available(struct smc_sock *smc) +static bool smc_rx_recvmsg_data_available(struct smc_sock *smc, size_t peeked) { struct smc_connection *conn = &smc->conn; - if (smc_rx_data_available(conn)) + if (smc_rx_data_available(conn, peeked)) return true; else if (conn->urg_state == SMC_URG_VALID) /* we received a single urgent Byte - skip */ @@ -344,10 +345,10 @@ static bool smc_rx_recvmsg_data_available(struct smc_sock *smc) int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, struct pipe_inode_info *pipe, size_t len, int flags) { - size_t copylen, read_done = 0, read_remaining = len; + size_t copylen, read_done = 0, read_remaining = len, peeked_bytes = 0; size_t chunk_len, chunk_off, chunk_len_sum; struct smc_connection *conn = &smc->conn; - int (*func)(struct smc_connection *conn); + int (*func)(struct smc_connection *conn, size_t baseline); union smc_host_cursor cons; int readable, chunk; char *rcvbuf_base; @@ -384,14 +385,14 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, if (conn->killed) break; - if (smc_rx_recvmsg_data_available(smc)) + if (smc_rx_recvmsg_data_available(smc, peeked_bytes)) goto copy; if (sk->sk_shutdown & RCV_SHUTDOWN) { /* smc_cdc_msg_recv_action() could have run after * above smc_rx_recvmsg_data_available() */ - if (smc_rx_recvmsg_data_available(smc)) + if (smc_rx_recvmsg_data_available(smc, peeked_bytes)) goto copy; break; } @@ -425,26 +426,28 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, } } - if (!smc_rx_data_available(conn)) { - smc_rx_wait(smc, &timeo, 
smc_rx_data_available); + if (!smc_rx_data_available(conn, peeked_bytes)) { + smc_rx_wait(smc, &timeo, peeked_bytes, smc_rx_data_available); continue; } copy: /* initialize variables for 1st iteration of subsequent loop */ /* could be just 1 byte, even after waiting on data above */ - readable = atomic_read(&conn->bytes_to_rcv); + readable = smc_rx_data_available(conn, peeked_bytes); splbytes = atomic_read(&conn->splice_pending); if (!readable || (msg && splbytes)) { if (splbytes) func = smc_rx_data_available_and_no_splice_pend; else func = smc_rx_data_available; - smc_rx_wait(smc, &timeo, func); + smc_rx_wait(smc, &timeo, peeked_bytes, func); continue; } smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn); + if ((flags & MSG_PEEK) && peeked_bytes) + smc_curs_add(conn->rmb_desc->len, &cons, peeked_bytes); /* subsequent splice() calls pick up where previous left */ if (splbytes) smc_curs_add(conn->rmb_desc->len, &cons, splbytes); @@ -480,6 +483,8 @@ copy: } read_remaining -= chunk_len; read_done += chunk_len; + if (flags & MSG_PEEK) + peeked_bytes += chunk_len; if (chunk_len_sum == copylen) break; /* either on 1st or 2nd iteration */ diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h index db823c97d824..994f5e42d1ba 100644 --- a/net/smc/smc_rx.h +++ b/net/smc/smc_rx.h @@ -21,11 +21,11 @@ void smc_rx_init(struct smc_sock *smc); int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, struct pipe_inode_info *pipe, size_t len, int flags); -int smc_rx_wait(struct smc_sock *smc, long *timeo, - int (*fcrit)(struct smc_connection *conn)); -static inline int smc_rx_data_available(struct smc_connection *conn) +int smc_rx_wait(struct smc_sock *smc, long *timeo, size_t peeked, + int (*fcrit)(struct smc_connection *conn, size_t baseline)); +static inline int smc_rx_data_available(struct smc_connection *conn, size_t peeked) { - return atomic_read(&conn->bytes_to_rcv); + return atomic_read(&conn->bytes_to_rcv) - peeked; } #endif /* SMC_RX_H */ diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 994c0cd4fddb..b04a21b8c511 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -439,7 +439,7 @@ static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) return; /* short message */ temp_wr_id = wc->wr_id; index = do_div(temp_wr_id, link->wr_rx_cnt); - wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index]; + wr_rx = (struct smc_wr_rx_hdr *)(link->wr_rx_bufs + index * link->wr_rx_buflen); hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) { if (handler->type == wr_rx->type) handler->handler(wc, wr_rx); @@ -555,7 +555,6 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) static void smc_wr_init_sge(struct smc_link *lnk) { - int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE); u32 i; @@ -608,13 +607,14 @@ static void smc_wr_init_sge(struct smc_link *lnk) * the larger spillover buffer, allowing easy data mapping. */ for (i = 0; i < lnk->wr_rx_cnt; i++) { - int x = i * sges_per_buf; + int x = i * lnk->wr_rx_sge_cnt; lnk->wr_rx_sges[x].addr = - lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE; - lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE; + lnk->wr_rx_dma_addr + i * lnk->wr_rx_buflen; + lnk->wr_rx_sges[x].length = smc_link_shared_v2_rxbuf(lnk) ? 
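The MSG_PEEK fix threads a peeked-byte count through smc_rx: availability is computed relative to what the current call has already peeked, and a local copy of the consumer cursor is advanced by the same amount, so a peeking reader neither blocks on data it has already seen nor returns the same bytes twice within one call. Core of the accounting, from the hunks:

	/* bytes_to_rcv is not decremented by MSG_PEEK, so discount
	 * what this call has already consumed */
	static inline int smc_rx_data_available(struct smc_connection *conn,
						size_t peeked)
	{
		return atomic_read(&conn->bytes_to_rcv) - peeked;
	}

	/* ...and advance a private cursor instead of the shared one */
	smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
	if ((flags & MSG_PEEK) && peeked_bytes)
		smc_curs_add(conn->rmb_desc->len, &cons, peeked_bytes);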
+ SMC_WR_TX_SIZE : lnk->wr_rx_buflen; lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey; - if (lnk->lgr->smc_version == SMC_V2) { + if (lnk->lgr->smc_version == SMC_V2 && smc_link_shared_v2_rxbuf(lnk)) { lnk->wr_rx_sges[x + 1].addr = lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE; lnk->wr_rx_sges[x + 1].length = @@ -624,7 +624,7 @@ static void smc_wr_init_sge(struct smc_link *lnk) } lnk->wr_rx_ibs[i].next = NULL; lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x]; - lnk->wr_rx_ibs[i].num_sge = sges_per_buf; + lnk->wr_rx_ibs[i].num_sge = lnk->wr_rx_sge_cnt; } lnk->wr_reg.wr.next = NULL; lnk->wr_reg.wr.num_sge = 0; @@ -655,7 +655,7 @@ void smc_wr_free_link(struct smc_link *lnk) if (lnk->wr_rx_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, - SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + lnk->wr_rx_buflen * lnk->wr_rx_cnt, DMA_FROM_DEVICE); lnk->wr_rx_dma_addr = 0; } @@ -740,13 +740,11 @@ int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr) int smc_wr_alloc_link_mem(struct smc_link *link) { - int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1; - /* allocate link related memory */ link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) goto no_mem; - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, + link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, link->wr_rx_buflen, GFP_KERNEL); if (!link->wr_rx_bufs) goto no_mem_wr_tx_bufs; @@ -774,7 +772,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, - sizeof(link->wr_rx_sges[0]) * sges_per_buf, + sizeof(link->wr_rx_sges[0]) * link->wr_rx_sge_cnt, GFP_KERNEL); if (!link->wr_rx_sges) goto no_mem_wr_tx_sges; @@ -872,7 +870,7 @@ int smc_wr_create_link(struct smc_link *lnk) smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0); lnk->wr_rx_id = 0; lnk->wr_rx_dma_addr = ib_dma_map_single( - ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + ibdev, lnk->wr_rx_bufs, lnk->wr_rx_buflen * lnk->wr_rx_cnt, DMA_FROM_DEVICE); if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) { lnk->wr_rx_dma_addr = 0; @@ -880,13 +878,15 @@ int smc_wr_create_link(struct smc_link *lnk) goto out; } if (lnk->lgr->smc_version == SMC_V2) { - lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev, - lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) { - lnk->wr_rx_v2_dma_addr = 0; - rc = -EIO; - goto dma_unmap; + if (smc_link_shared_v2_rxbuf(lnk)) { + lnk->wr_rx_v2_dma_addr = + ib_dma_map_single(ibdev, lnk->lgr->wr_rx_buf_v2, + SMC_WR_BUF_V2_SIZE, DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) { + lnk->wr_rx_v2_dma_addr = 0; + rc = -EIO; + goto dma_unmap; + } } lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev, lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE, @@ -935,7 +935,7 @@ dma_unmap: lnk->wr_tx_v2_dma_addr = 0; } ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, - SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, + lnk->wr_rx_buflen * lnk->wr_rx_cnt, DMA_FROM_DEVICE); lnk->wr_rx_dma_addr = 0; out: diff --git a/net/socket.c b/net/socket.c index 9a117248f18f..9a0e720f0859 100644 --- a/net/socket.c +++ b/net/socket.c @@ -110,6 +110,8 @@ #include <linux/ptp_clock_kernel.h> #include <trace/events/sock.h> +#include "core/dev.h" + #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sysctl_net_busy_read __read_mostly; unsigned int sysctl_net_busy_poll __read_mostly; @@ -477,6 +479,11 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) 
sock->file = file; file->private_data = sock; stream_open(SOCK_INODE(sock), file); + /* + * Disable permission and pre-content events, but enable legacy + * inotify events for legacy users. + */ + file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); return file; } EXPORT_SYMBOL(sock_alloc_file); @@ -673,17 +680,8 @@ void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags) { u8 flags = *tx_flags; - if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) { - flags |= SKBTX_HW_TSTAMP; - - /* PTP hardware clocks can provide a free running cycle counter - * as a time base for virtual clocks. Tell driver to use the - * free running cycle counter for timestamp if socket is bound - * to virtual clock. - */ - if (tsflags & SOF_TIMESTAMPING_BIND_PHC) - flags |= SKBTX_HW_TSTAMP_USE_CYCLES; - } + if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) + flags |= SKBTX_HW_TSTAMP_NOBPF; if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) flags |= SKBTX_SW_TSTAMP; @@ -691,6 +689,9 @@ void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags) if (tsflags & SOF_TIMESTAMPING_TX_SCHED) flags |= SKBTX_SCHED_TSTAMP; + if (tsflags & SOF_TIMESTAMPING_TX_COMPLETION) + flags |= SKBTX_COMPLETION_TSTAMP; + *tx_flags = flags; } EXPORT_SYMBOL(__sock_tx_timestamp); @@ -774,34 +775,6 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, } EXPORT_SYMBOL(kernel_sendmsg); -/** - * kernel_sendmsg_locked - send a message through @sock (kernel-space) - * @sk: sock - * @msg: message header - * @vec: output s/g array - * @num: output s/g array length - * @size: total message data size - * - * Builds the message data with @vec and sends it through @sock. - * Returns the number of bytes sent, or an error code. - * Caller must hold @sk. - */ - -int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, - struct kvec *vec, size_t num, size_t size) -{ - struct socket *sock = sk->sk_socket; - const struct proto_ops *ops = READ_ONCE(sock->ops); - - if (!ops->sendmsg_locked) - return sock_no_sendmsg_locked(sk, msg, size); - - iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size); - - return ops->sendmsg_locked(sk, msg, msg_data_left(msg)); -} -EXPORT_SYMBOL(kernel_sendmsg_locked); - static bool skb_is_err_queue(const struct sk_buff *skb) { /* pkt_type of skbs enqueued on the error queue are set to @@ -1008,12 +981,23 @@ static void sock_recv_mark(struct msghdr *msg, struct sock *sk, } } +static void sock_recv_priority(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb) +{ + if (sock_flag(sk, SOCK_RCVPRIORITY) && skb) { + __u32 priority = skb->priority; + + put_cmsg(msg, SOL_SOCKET, SO_PRIORITY, sizeof(__u32), &priority); + } +} + void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { sock_recv_timestamp(msg, sk, skb); sock_recv_drops(msg, sk, skb); sock_recv_mark(msg, sk, skb); + sock_recv_priority(msg, sk, skb); } EXPORT_SYMBOL_GPL(__sock_recv_cmsgs); @@ -1155,12 +1139,10 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) */ static DEFINE_MUTEX(br_ioctl_mutex); -static int (*br_ioctl_hook)(struct net *net, struct net_bridge *br, - unsigned int cmd, struct ifreq *ifr, +static int (*br_ioctl_hook)(struct net *net, unsigned int cmd, void __user *uarg); -void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br, - unsigned int cmd, struct ifreq *ifr, +void brioctl_set(int (*hook)(struct net *net, unsigned int cmd, void __user *uarg)) { mutex_lock(&br_ioctl_mutex); @@ -1169,8 +1151,7 @@ void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br, } EXPORT_SYMBOL(brioctl_set); -int 
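sock_recv_priority() above delivers the skb priority as an SO_PRIORITY control message whenever the SOCK_RCVPRIORITY flag is set on the socket. A userspace sketch of consuming it, assuming the companion SO_RCVPRIORITY socket option from the same series is what enables the flag:

#include <stdio.h>
#include <sys/socket.h>

static void print_prio(int fd)
{
	char data[64], cbuf[CMSG_SPACE(sizeof(unsigned int))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	int one = 1;

	/* assumed option name; it sets SOCK_RCVPRIORITY in the kernel */
	setsockopt(fd, SOL_SOCKET, SO_RCVPRIORITY, &one, sizeof(one));
	if (recvmsg(fd, &msg, 0) < 0)
		return;
	for (struct cmsghdr *c = CMSG_FIRSTHDR(&msg); c;
	     c = CMSG_NXTHDR(&msg, c))
		if (c->cmsg_level == SOL_SOCKET && c->cmsg_type == SO_PRIORITY)
			printf("priority %u\n",
			       *(unsigned int *)CMSG_DATA(c));
}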
br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd, - struct ifreq *ifr, void __user *uarg) +int br_ioctl_call(struct net *net, unsigned int cmd, void __user *uarg) { int err = -ENOPKG; @@ -1179,7 +1160,7 @@ int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd, mutex_lock(&br_ioctl_mutex); if (br_ioctl_hook) - err = br_ioctl_hook(net, br, cmd, ifr, uarg); + err = br_ioctl_hook(net, cmd, uarg); mutex_unlock(&br_ioctl_mutex); return err; @@ -1279,7 +1260,9 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) case SIOCSIFBR: case SIOCBRADDBR: case SIOCBRDELBR: - err = br_ioctl_call(net, NULL, cmd, NULL, argp); + case SIOCBRADDIF: + case SIOCBRDELIF: + err = br_ioctl_call(net, cmd, argp); break; case SIOCGIFVLAN: case SIOCSIFVLAN: @@ -3439,6 +3422,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCGPGRP: case SIOCBRADDBR: case SIOCBRDELBR: + case SIOCBRADDIF: + case SIOCBRDELIF: case SIOCGIFVLAN: case SIOCSIFVLAN: case SIOCGSKNS: @@ -3478,8 +3463,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCGIFPFLAGS: case SIOCGIFTXQLEN: case SIOCSIFTXQLEN: - case SIOCBRADDIF: - case SIOCBRDELIF: case SIOCGIFNAME: case SIOCSIFNAME: case SIOCGMIIPHY: diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 8299ceb3e373..d946bfb424c7 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -347,7 +347,10 @@ static int strp_read_sock(struct strparser *strp) struct socket *sock = strp->sk->sk_socket; read_descriptor_t desc; - if (unlikely(!sock || !sock->ops || !sock->ops->read_sock)) + if (unlikely(!sock || !sock->ops)) + return -EBUSY; + + if (unlikely(!strp->cb.read_sock && !sock->ops->read_sock)) return -EBUSY; desc.arg.data = strp; @@ -355,7 +358,10 @@ static int strp_read_sock(struct strparser *strp) desc.count = 1; /* give more than one skb per call */ /* sk should be locked here, so okay to do read_sock */ - sock->ops->read_sock(strp->sk, &desc, strp_recv); + if (strp->cb.read_sock) + strp->cb.read_sock(strp, &desc, strp_recv); + else + sock->ops->read_sock(strp->sk, &desc, strp_recv); desc.error = strp->cb.read_sock_done(strp, desc.error); @@ -468,6 +474,7 @@ int strp_init(struct strparser *strp, struct sock *sk, strp->cb.unlock = cb->unlock ? : strp_sock_unlock; strp->cb.rcv_msg = cb->rcv_msg; strp->cb.parse_msg = cb->parse_msg; + strp->cb.read_sock = cb->read_sock; strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done; strp->cb.abort_parser = cb->abort_parser ? 
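strparser gains an optional per-parser read_sock() callback above: when registered, strp_read_sock() pulls data through it instead of the socket's generic ->read_sock, letting a user feed the parser from its own queue. The dispatch from the hunk, plus a hypothetical registration (my_* names are illustrative):

	/* inside strp_read_sock(): prefer the private callback */
	if (strp->cb.read_sock)
		strp->cb.read_sock(strp, &desc, strp_recv);
	else
		sock->ops->read_sock(strp->sk, &desc, strp_recv);

	/* user side: leaving .read_sock NULL keeps the old behavior */
	static const struct strp_callbacks cb = {
		.rcv_msg   = my_rcv_msg,
		.parse_msg = my_parse_msg,
		.read_sock = my_read_sock,
	};
	err = strp_init(&my_strp, sk, &cb);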
: strp_abort_strp; @@ -478,19 +485,6 @@ int strp_init(struct strparser *strp, struct sock *sk, } EXPORT_SYMBOL_GPL(strp_init); -/* Sock process lock held (lock_sock) */ -void __strp_unpause(struct strparser *strp) -{ - strp->paused = 0; - - if (strp->need_bytes) { - if (strp_peek_len(strp) < strp->need_bytes) - return; - } - strp_read_sock(strp); -} -EXPORT_SYMBOL_GPL(__strp_unpause); - void strp_unpause(struct strparser *strp) { strp->paused = 0; diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 04534ea537c8..5a827afd8e3b 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -489,7 +489,7 @@ static unsigned long rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - return number_cred_unused * sysctl_vfs_cache_pressure / 100; + return number_cred_unused; } static void diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile index ad1736d93b76..452f67deebc6 100644 --- a/net/sunrpc/auth_gss/Makefile +++ b/net/sunrpc/auth_gss/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o -auth_rpcgss-y := auth_gss.o gss_generic_token.o \ +auth_rpcgss-y := auth_gss.o \ gss_mech_switch.o svcauth_gss.o \ gss_rpc_upcall.o gss_rpc_xdr.o trace.o diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 369310909fc9..0fa244f16876 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1545,6 +1545,7 @@ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr) struct kvec iov; struct xdr_buf verf_buf; int status; + u32 seqno; /* Credential */ @@ -1556,15 +1557,16 @@ static int gss_marshal(struct rpc_task *task, struct xdr_stream *xdr) cred_len = p++; spin_lock(&ctx->gc_seq_lock); - req->rq_seqno = (ctx->gc_seq < MAXSEQ) ? ctx->gc_seq++ : MAXSEQ; + seqno = (ctx->gc_seq < MAXSEQ) ? 
ctx->gc_seq++ : MAXSEQ; + xprt_rqst_add_seqno(req, seqno); spin_unlock(&ctx->gc_seq_lock); - if (req->rq_seqno == MAXSEQ) + if (*req->rq_seqnos == MAXSEQ) goto expired; trace_rpcgss_seqno(task); *p++ = cpu_to_be32(RPC_GSS_VERSION); *p++ = cpu_to_be32(ctx->gc_proc); - *p++ = cpu_to_be32(req->rq_seqno); + *p++ = cpu_to_be32(*req->rq_seqnos); *p++ = cpu_to_be32(gss_cred->gc_service); p = xdr_encode_netobj(p, &ctx->gc_wire_ctx); *cred_len = cpu_to_be32((p - (cred_len + 1)) << 2); @@ -1678,17 +1680,31 @@ gss_refresh_null(struct rpc_task *task) return 0; } +static u32 +gss_validate_seqno_mic(struct gss_cl_ctx *ctx, u32 seqno, __be32 *seq, __be32 *p, u32 len) +{ + struct kvec iov; + struct xdr_buf verf_buf; + struct xdr_netobj mic; + + *seq = cpu_to_be32(seqno); + iov.iov_base = seq; + iov.iov_len = 4; + xdr_buf_from_iov(&iov, &verf_buf); + mic.data = (u8 *)p; + mic.len = len; + return gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); +} + static int gss_validate(struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); __be32 *p, *seq = NULL; - struct kvec iov; - struct xdr_buf verf_buf; - struct xdr_netobj mic; u32 len, maj_stat; int status; + int i = 1; /* don't recheck the first item */ p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) @@ -1705,13 +1721,10 @@ gss_validate(struct rpc_task *task, struct xdr_stream *xdr) seq = kmalloc(4, GFP_KERNEL); if (!seq) goto validate_failed; - *seq = cpu_to_be32(task->tk_rqstp->rq_seqno); - iov.iov_base = seq; - iov.iov_len = 4; - xdr_buf_from_iov(&iov, &verf_buf); - mic.data = (u8 *)p; - mic.len = len; - maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); + maj_stat = gss_validate_seqno_mic(ctx, task->tk_rqstp->rq_seqnos[0], seq, p, len); + /* RFC 2203 5.3.3.1 - compute the checksum of each sequence number in the cache */ + while (unlikely(maj_stat == GSS_S_BAD_SIG && i < task->tk_rqstp->rq_seqno_count)) + maj_stat = gss_validate_seqno_mic(ctx, task->tk_rqstp->rq_seqnos[i++], seq, p, len); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat) @@ -1750,7 +1763,7 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, if (!p) goto wrap_failed; integ_len = p++; - *p = cpu_to_be32(rqstp->rq_seqno); + *p = cpu_to_be32(*rqstp->rq_seqnos); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; @@ -1847,7 +1860,7 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, if (!p) goto wrap_failed; opaque_len = p++; - *p = cpu_to_be32(rqstp->rq_seqno); + *p = cpu_to_be32(*rqstp->rq_seqnos); if (rpcauth_wrap_req_encode(task, xdr)) goto wrap_failed; @@ -2001,7 +2014,7 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, offset = rcv_buf->len - xdr_stream_remaining(xdr); if (xdr_stream_decode_u32(xdr, &seqno)) goto unwrap_failed; - if (seqno != rqstp->rq_seqno) + if (seqno != *rqstp->rq_seqnos) goto bad_seqno; if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len)) goto unwrap_failed; @@ -2045,7 +2058,7 @@ unwrap_failed: trace_rpcgss_unwrap_failed(task); goto out; bad_seqno: - trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, seqno); + trace_rpcgss_bad_seqno(task, *rqstp->rq_seqnos, seqno); goto out; bad_mic: trace_rpcgss_verify_mic(task, maj_stat); @@ -2077,7 +2090,7 @@ gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred, if (maj_stat != GSS_S_COMPLETE) goto bad_unwrap; /* gss_unwrap decrypted the sequence number */ - if (be32_to_cpup(p++) != rqstp->rq_seqno) + if
(be32_to_cpup(p++) != *rqstp->rq_seqnos) goto bad_seqno; /* gss_unwrap redacts the opaque blob from the head iovec. @@ -2093,7 +2106,7 @@ unwrap_failed: trace_rpcgss_unwrap_failed(task); return -EIO; bad_seqno: - trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(--p)); + trace_rpcgss_bad_seqno(task, *rqstp->rq_seqnos, be32_to_cpup(--p)); return -EIO; bad_unwrap: trace_rpcgss_unwrap(task, maj_stat); @@ -2118,14 +2131,14 @@ gss_xmit_need_reencode(struct rpc_task *task) if (!ctx) goto out; - if (gss_seq_is_newer(req->rq_seqno, READ_ONCE(ctx->gc_seq))) + if (gss_seq_is_newer(*req->rq_seqnos, READ_ONCE(ctx->gc_seq))) goto out_ctx; seq_xmit = READ_ONCE(ctx->gc_seq_xmit); - while (gss_seq_is_newer(req->rq_seqno, seq_xmit)) { + while (gss_seq_is_newer(*req->rq_seqnos, seq_xmit)) { u32 tmp = seq_xmit; - seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, req->rq_seqno); + seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, *req->rq_seqnos); if (seq_xmit == tmp) { ret = false; goto out_ctx; @@ -2134,7 +2147,7 @@ gss_xmit_need_reencode(struct rpc_task *task) win = ctx->gc_win; if (win > 0) - ret = !gss_seq_is_newer(req->rq_seqno, seq_xmit - win); + ret = !gss_seq_is_newer(*req->rq_seqnos, seq_xmit - win); out_ctx: gss_put_ctx(ctx); diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c deleted file mode 100644 index 4a4082bb22ad..000000000000 --- a/net/sunrpc/auth_gss/gss_generic_token.c +++ /dev/null @@ -1,231 +0,0 @@ -/* - * linux/net/sunrpc/gss_generic_token.c - * - * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c - * - * Copyright (c) 2000 The Regents of the University of Michigan. - * All rights reserved. - * - * Andy Adamson <andros@umich.edu> - */ - -/* - * Copyright 1993 by OpenVision Technologies, Inc. - * - * Permission to use, copy, modify, distribute, and sell this software - * and its documentation for any purpose is hereby granted without fee, - * provided that the above copyright notice appears in all copies and - * that both that copyright notice and this permission notice appear in - * supporting documentation, and that the name of OpenVision not be used - * in advertising or publicity pertaining to distribution of the software - * without specific, written prior permission. OpenVision makes no - * representations about the suitability of this software for any - * purpose. It is provided "as is" without express or implied warranty. - * - * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, - * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO - * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF - * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR - * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR - * PERFORMANCE OF THIS SOFTWARE. - */ - -#include <linux/types.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/sunrpc/sched.h> -#include <linux/sunrpc/gss_asn1.h> - - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_AUTH -#endif - - -/* TWRITE_STR from gssapiP_generic.h */ -#define TWRITE_STR(ptr, str, len) \ - memcpy((ptr), (char *) (str), (len)); \ - (ptr) += (len); - -/* XXXX this code currently makes the assumption that a mech oid will - never be longer than 127 bytes. This assumption is not inherent in - the interfaces, so the code can be fixed if the OSI namespace - balloons unexpectedly. 
*/ - -/* Each token looks like this: - -0x60 tag for APPLICATION 0, SEQUENCE - (constructed, definite-length) - <length> possible multiple bytes, need to parse/generate - 0x06 tag for OBJECT IDENTIFIER - <moid_length> compile-time constant string (assume 1 byte) - <moid_bytes> compile-time constant string - <inner_bytes> the ANY containing the application token - bytes 0,1 are the token type - bytes 2,n are the token data - -For the purposes of this abstraction, the token "header" consists of -the sequence tag and length octets, the mech OID DER encoding, and the -first two inner bytes, which indicate the token type. The token -"body" consists of everything else. - -*/ - -static int -der_length_size( int length) -{ - if (length < (1<<7)) - return 1; - else if (length < (1<<8)) - return 2; -#if (SIZEOF_INT == 2) - else - return 3; -#else - else if (length < (1<<16)) - return 3; - else if (length < (1<<24)) - return 4; - else - return 5; -#endif -} - -static void -der_write_length(unsigned char **buf, int length) -{ - if (length < (1<<7)) { - *(*buf)++ = (unsigned char) length; - } else { - *(*buf)++ = (unsigned char) (der_length_size(length)+127); -#if (SIZEOF_INT > 2) - if (length >= (1<<24)) - *(*buf)++ = (unsigned char) (length>>24); - if (length >= (1<<16)) - *(*buf)++ = (unsigned char) ((length>>16)&0xff); -#endif - if (length >= (1<<8)) - *(*buf)++ = (unsigned char) ((length>>8)&0xff); - *(*buf)++ = (unsigned char) (length&0xff); - } -} - -/* returns decoded length, or < 0 on failure. Advances buf and - decrements bufsize */ - -static int -der_read_length(unsigned char **buf, int *bufsize) -{ - unsigned char sf; - int ret; - - if (*bufsize < 1) - return -1; - sf = *(*buf)++; - (*bufsize)--; - if (sf & 0x80) { - if ((sf &= 0x7f) > ((*bufsize)-1)) - return -1; - if (sf > SIZEOF_INT) - return -1; - ret = 0; - for (; sf; sf--) { - ret = (ret<<8) + (*(*buf)++); - (*bufsize)--; - } - } else { - ret = sf; - } - - return ret; -} - -/* returns the length of a token, given the mech oid and the body size */ - -int -g_token_size(struct xdr_netobj *mech, unsigned int body_size) -{ - /* set body_size to sequence contents size */ - body_size += 2 + (int) mech->len; /* NEED overflow check */ - return 1 + der_length_size(body_size) + body_size; -} - -EXPORT_SYMBOL_GPL(g_token_size); - -/* fills in a buffer with the token header. The buffer is assumed to - be the right size. buf is advanced past the token header */ - -void -g_make_token_header(struct xdr_netobj *mech, int body_size, unsigned char **buf) -{ - *(*buf)++ = 0x60; - der_write_length(buf, 2 + mech->len + body_size); - *(*buf)++ = 0x06; - *(*buf)++ = (unsigned char) mech->len; - TWRITE_STR(*buf, mech->data, ((int) mech->len)); -} - -EXPORT_SYMBOL_GPL(g_make_token_header); - -/* - * Given a buffer containing a token, reads and verifies the token, - * leaving buf advanced past the token header, and setting body_size - * to the number of remaining bytes. Returns 0 on success, - * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the - * mechanism in the token does not match the mech argument. buf and - * *body_size are left unmodified on error. 
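For readers who have not met it before, the ASN.1 DER definite-length form that the deleted helpers implement is simple: lengths below 128 occupy one octet; anything larger gets a count octet (0x80 | n) followed by n big-endian value octets. A minimal user-space sketch of the writer, assuming (like the deleted code) that lengths fit in three value octets:

#include <stdio.h>

/* Illustrative stand-in for the deleted der_write_length(); returns the
 * number of octets written into buf.
 */
static int write_der_length(unsigned char *buf, int length)
{
	int n = 0;

	if (length < 0x80) {
		buf[n++] = (unsigned char)length;	/* short form */
	} else {
		int bytes = (length < 0x100) ? 1 :
			    (length < 0x10000) ? 2 : 3;

		buf[n++] = 0x80 | bytes;		/* long form: count octet */
		while (bytes--)
			buf[n++] = (length >> (8 * bytes)) & 0xff;
	}
	return n;
}

int main(void)
{
	unsigned char buf[4];
	int i, n = write_der_length(buf, 300);

	for (i = 0; i < n; i++)
		printf("%02x ", buf[i]);	/* 300 encodes as: 82 01 2c */
	printf("\n");
	return 0;
}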
- */ -u32 -g_verify_token_header(struct xdr_netobj *mech, int *body_size, - unsigned char **buf_in, int toksize) -{ - unsigned char *buf = *buf_in; - int seqsize; - struct xdr_netobj toid; - int ret = 0; - - if ((toksize-=1) < 0) - return G_BAD_TOK_HEADER; - if (*buf++ != 0x60) - return G_BAD_TOK_HEADER; - - if ((seqsize = der_read_length(&buf, &toksize)) < 0) - return G_BAD_TOK_HEADER; - - if (seqsize != toksize) - return G_BAD_TOK_HEADER; - - if ((toksize-=1) < 0) - return G_BAD_TOK_HEADER; - if (*buf++ != 0x06) - return G_BAD_TOK_HEADER; - - if ((toksize-=1) < 0) - return G_BAD_TOK_HEADER; - toid.len = *buf++; - - if ((toksize-=toid.len) < 0) - return G_BAD_TOK_HEADER; - toid.data = buf; - buf+=toid.len; - - if (! g_OID_equal(&toid, mech)) - ret = G_WRONG_MECH; - - /* G_WRONG_MECH is not returned immediately because it's more important - to return G_BAD_TOK_HEADER if the token header is in fact bad */ - - if ((toksize-=2) < 0) - return G_BAD_TOK_HEADER; - - if (ret) - return ret; - - *buf_in = buf; - *body_size = toksize; - - return ret; -} - -EXPORT_SYMBOL_GPL(g_verify_token_header); diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index d2b02710ab07..8f2d65c1e831 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -138,60 +138,6 @@ out: return ret; } -/** - * krb5_decrypt - simple decryption of an RPCSEC GSS payload - * @tfm: initialized cipher transform - * @iv: pointer to an IV - * @in: ciphertext to decrypt - * @out: OUT: plaintext - * @length: length of input and output buffers, in bytes - * - * @iv may be NULL to force the use of an all-zero IV. - * The buffer containing the IV must be as large as the - * cipher's ivsize. - * - * Return values: - * %0: @in successfully decrypted into @out - * negative errno: @in not decrypted - */ -u32 -krb5_decrypt( - struct crypto_sync_skcipher *tfm, - void * iv, - void * in, - void * out, - int length) -{ - u32 ret = -EINVAL; - struct scatterlist sg[1]; - u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0}; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - - if (length % crypto_sync_skcipher_blocksize(tfm) != 0) - goto out; - - if (crypto_sync_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) { - dprintk("RPC: gss_k5decrypt: tfm iv size too large %d\n", - crypto_sync_skcipher_ivsize(tfm)); - goto out; - } - if (iv) - memcpy(local_iv, iv, crypto_sync_skcipher_ivsize(tfm)); - - memcpy(out, in, length); - sg_init_one(sg, out, length); - - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, length, local_iv); - - ret = crypto_skcipher_decrypt(req); - skcipher_request_zero(req); -out: - dprintk("RPC: gss_k5decrypt returns %d\n",ret); - return ret; -} - static int checksummer(struct scatterlist *sg, void *data) { @@ -202,96 +148,6 @@ checksummer(struct scatterlist *sg, void *data) return crypto_ahash_update(req); } -/* - * checksum the plaintext data and hdrlen bytes of the token header - * The checksum is performed over the first 8 bytes of the - * gss token header and then over the data body - */ -u32 -make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, - struct xdr_buf *body, int body_offset, u8 *cksumkey, - unsigned int usage, struct xdr_netobj *cksumout) -{ - struct crypto_ahash *tfm; - struct ahash_request *req; - struct scatterlist sg[1]; - int err = -1; - u8 *checksumdata; - unsigned int checksumlen; - - if (cksumout->len < kctx->gk5e->cksumlength) { - dprintk("%s: checksum 
buffer length, %u, too small for %s\n", - __func__, cksumout->len, kctx->gk5e->name); - return GSS_S_FAILURE; - } - - checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_KERNEL); - if (checksumdata == NULL) - return GSS_S_FAILURE; - - tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - goto out_free_cksum; - - req = ahash_request_alloc(tfm, GFP_KERNEL); - if (!req) - goto out_free_ahash; - - ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); - - checksumlen = crypto_ahash_digestsize(tfm); - - if (cksumkey != NULL) { - err = crypto_ahash_setkey(tfm, cksumkey, - kctx->gk5e->keylength); - if (err) - goto out; - } - - err = crypto_ahash_init(req); - if (err) - goto out; - sg_init_one(sg, header, hdrlen); - ahash_request_set_crypt(req, sg, NULL, hdrlen); - err = crypto_ahash_update(req); - if (err) - goto out; - err = xdr_process_buf(body, body_offset, body->len - body_offset, - checksummer, req); - if (err) - goto out; - ahash_request_set_crypt(req, NULL, checksumdata, 0); - err = crypto_ahash_final(req); - if (err) - goto out; - - switch (kctx->gk5e->ctype) { - case CKSUMTYPE_RSA_MD5: - err = krb5_encrypt(kctx->seq, NULL, checksumdata, - checksumdata, checksumlen); - if (err) - goto out; - memcpy(cksumout->data, - checksumdata + checksumlen - kctx->gk5e->cksumlength, - kctx->gk5e->cksumlength); - break; - case CKSUMTYPE_HMAC_SHA1_DES3: - memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength); - break; - default: - BUG(); - break; - } - cksumout->len = kctx->gk5e->cksumlength; -out: - ahash_request_free(req); -out_free_ahash: - crypto_free_ahash(tfm); -out_free_cksum: - kfree(checksumdata); - return err ? GSS_S_FAILURE : 0; -} - /** * gss_krb5_checksum - Compute the MAC for a GSS Wrap or MIC token * @tfm: an initialized hash transform @@ -442,35 +298,6 @@ encryptor(struct scatterlist *sg, void *data) return 0; } -int -gss_encrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf, - int offset, struct page **pages) -{ - int ret; - struct encryptor_desc desc; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - - BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0); - - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - - memset(desc.iv, 0, sizeof(desc.iv)); - desc.req = req; - desc.pos = offset; - desc.outbuf = buf; - desc.pages = pages; - desc.fragno = 0; - desc.fraglen = 0; - - sg_init_table(desc.infrags, 4); - sg_init_table(desc.outfrags, 4); - - ret = xdr_process_buf(buf, offset, buf->len - offset, encryptor, &desc); - skcipher_request_zero(req); - return ret; -} - struct decryptor_desc { u8 iv[GSS_KRB5_MAX_BLOCKSIZE]; struct skcipher_request *req; @@ -525,32 +352,6 @@ decryptor(struct scatterlist *sg, void *data) return 0; } -int -gss_decrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf, - int offset) -{ - int ret; - struct decryptor_desc desc; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - - /* XXXJBF: */ - BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0); - - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - - memset(desc.iv, 0, sizeof(desc.iv)); - desc.req = req; - desc.fragno = 0; - desc.fraglen = 0; - - sg_init_table(desc.frags, 4); - - ret = xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc); - skcipher_request_zero(req); - return ret; -} - /* * This function makes the assumption that it was ultimately called * from gss_wrap(). 
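The dead wrappers removed above (make_checksum(), gss_encrypt_xdr_buf(), gss_decrypt_xdr_buf()) were all built on the same xdr_process_buf() actor pattern: the core walks the xdr_buf's head kvec, page array, and tail kvec, invoking the actor once per scatterlist fragment and stopping on the first non-zero return; checksummer(), encryptor() and decryptor() all follow this shape. A hedged sketch with a hypothetical byte-counting actor:

struct count_desc {
	unsigned int total;
};

static int count_actor(struct scatterlist *sg, void *data)
{
	struct count_desc *desc = data;

	desc->total += sg->length;	/* tally this fragment */
	return 0;			/* non-zero would abort the walk */
}

static unsigned int count_xdr_bytes(struct xdr_buf *buf)
{
	struct count_desc desc = { 0 };

	if (xdr_process_buf(buf, 0, buf->len, count_actor, &desc))
		return 0;
	return desc.total;
}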
diff --git a/net/sunrpc/auth_gss/gss_krb5_internal.h b/net/sunrpc/auth_gss/gss_krb5_internal.h index 3afd4065bf3d..8769e9e705bf 100644 --- a/net/sunrpc/auth_gss/gss_krb5_internal.h +++ b/net/sunrpc/auth_gss/gss_krb5_internal.h @@ -155,10 +155,6 @@ static inline int krb5_derive_key(struct krb5_ctx *kctx, void krb5_make_confounder(u8 *p, int conflen); -u32 make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, - struct xdr_buf *body, int body_offset, u8 *cksumkey, - unsigned int usage, struct xdr_netobj *cksumout); - u32 gss_krb5_checksum(struct crypto_ahash *tfm, char *header, int hdrlen, const struct xdr_buf *body, int body_offset, struct xdr_netobj *cksumout); @@ -166,19 +162,9 @@ u32 gss_krb5_checksum(struct crypto_ahash *tfm, char *header, int hdrlen, u32 krb5_encrypt(struct crypto_sync_skcipher *key, void *iv, void *in, void *out, int length); -u32 krb5_decrypt(struct crypto_sync_skcipher *key, void *iv, void *in, - void *out, int length); - int xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen); -int gss_encrypt_xdr_buf(struct crypto_sync_skcipher *tfm, - struct xdr_buf *outbuf, int offset, - struct page **pages); - -int gss_decrypt_xdr_buf(struct crypto_sync_skcipher *tfm, - struct xdr_buf *inbuf, int offset); - u32 gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, struct page **pages); diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index fae632da1058..c84d0cf61980 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -13,7 +13,6 @@ #include <linux/module.h> #include <linux/oid_registry.h> #include <linux/sunrpc/msg_prot.h> -#include <linux/sunrpc/gss_asn1.h> #include <linux/sunrpc/auth_gss.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/gss_err.h> diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 059f6ef1ad18..131090f31e6a 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -135,6 +135,8 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, hlist_add_head_rcu(&new->cache_list, head); detail->entries++; + if (detail->nextcheck > new->expiry_time) + detail->nextcheck = new->expiry_time + 1; cache_get(new); spin_unlock(&detail->hash_lock); @@ -281,21 +283,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h return rv; } -/* - * This is the generic cache management routine for all - * the authentication caches. - * It checks the currency of a cache item and will (later) - * initiate an upcall to fill it if needed. - * - * - * Returns 0 if the cache_head can be used, or cache_puts it and returns - * -EAGAIN if upcall is pending and request has been queued - * -ETIMEDOUT if upcall failed or request could not be queue or - * upcall completed but item is still invalid (implying that - * the cache item has been replaced with a newer one). - * -ENOENT if cache entry was negative - */ -int cache_check(struct cache_detail *detail, +int cache_check_rcu(struct cache_detail *detail, struct cache_head *h, struct cache_req *rqstp) { int rv; @@ -336,6 +324,31 @@ int cache_check(struct cache_detail *detail, rv = -ETIMEDOUT; } } + + return rv; +} +EXPORT_SYMBOL_GPL(cache_check_rcu); + +/* + * This is the generic cache management routine for all + * the authentication caches. + * It checks the currency of a cache item and will (later) + * initiate an upcall to fill it if needed. 
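The cache_check()/cache_check_rcu() split visible here changes the reference-counting contract: cache_check() still does a cache_put() on failure, while cache_check_rcu() never touches the refcount, which is what lets the c_show() hunk further down drop its cache_get_rcu()/cache_put() dance. A minimal caller sketch (hypothetical helper name):

static bool example_entry_usable(struct cache_detail *cd,
				 struct cache_head *h,
				 struct cache_req *rqstp)
{
	/* Caller must already hold a reference, or sit inside an RCU
	 * read-side section that keeps @h stable; no put happens here.
	 */
	return cache_check_rcu(cd, h, rqstp) == 0;
}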
+ * + * + * Returns 0 if the cache_head can be used, or cache_puts it and returns + * -EAGAIN if upcall is pending and request has been queued + * -ETIMEDOUT if upcall failed or request could not be queue or + * upcall completed but item is still invalid (implying that + * the cache item has been replaced with a newer one). + * -ENOENT if cache entry was negative + */ +int cache_check(struct cache_detail *detail, + struct cache_head *h, struct cache_req *rqstp) +{ + int rv; + + rv = cache_check_rcu(detail, h, rqstp); if (rv) cache_put(h, detail); return rv; @@ -451,24 +464,21 @@ static int cache_clean(void) } } + spin_lock(¤t_detail->hash_lock); + /* find a non-empty bucket in the table */ - while (current_detail && - current_index < current_detail->hash_size && + while (current_index < current_detail->hash_size && hlist_empty(¤t_detail->hash_table[current_index])) current_index++; /* find a cleanable entry in the bucket and clean it, or set to next bucket */ - - if (current_detail && current_index < current_detail->hash_size) { + if (current_index < current_detail->hash_size) { struct cache_head *ch = NULL; struct cache_detail *d; struct hlist_head *head; struct hlist_node *tmp; - spin_lock(¤t_detail->hash_lock); - /* Ok, now to clean this strand */ - head = ¤t_detail->hash_table[current_index]; hlist_for_each_entry_safe(ch, tmp, head, cache_list) { if (current_detail->nextcheck > ch->expiry_time) @@ -489,8 +499,10 @@ static int cache_clean(void) spin_unlock(&cache_list_lock); if (ch) sunrpc_end_cache_remove_entry(ch, d); - } else + } else { + spin_unlock(¤t_detail->hash_lock); spin_unlock(&cache_list_lock); + } return rv; } @@ -1427,17 +1439,11 @@ static int c_show(struct seq_file *m, void *p) seq_printf(m, "# expiry=%lld refcnt=%d flags=%lx\n", convert_to_wallclock(cp->expiry_time), kref_read(&cp->ref), cp->flags); - if (!cache_get_rcu(cp)) - return 0; - if (cache_check(cd, cp, NULL)) - /* cache_check does a cache_put on failure */ + if (cache_check_rcu(cd, cp, NULL)) + seq_puts(m, "# "); + else if (cache_is_expired(cd, cp)) seq_puts(m, "# "); - else { - if (cache_is_expired(cd, cp)) - seq_puts(m, "# "); - cache_put(cp, cd); - } return cd->cache_show(m, cd, cp); } @@ -1669,12 +1675,14 @@ static void remove_cache_proc_entries(struct cache_detail *cd) } } -#ifdef CONFIG_PROC_FS static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) { struct proc_dir_entry *p; struct sunrpc_net *sn; + if (!IS_ENABLED(CONFIG_PROC_FS)) + return 0; + sn = net_generic(net, sunrpc_net_id); cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc); if (cd->procfs == NULL) @@ -1702,12 +1710,6 @@ out_nomem: remove_cache_proc_entries(cd); return -ENOMEM; } -#else /* CONFIG_PROC_FS */ -static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) -{ - return 0; -} -#endif void __init cache_initialize(void) { diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 0090162ee8c3..21426c3049d3 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -270,9 +270,6 @@ static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt, old = rcu_dereference_protected(clnt->cl_xprt, lockdep_is_held(&clnt->cl_lock)); - if (!xprt_bound(xprt)) - clnt->cl_autobind = 1; - clnt->cl_timeout = timeout; rcu_assign_pointer(clnt->cl_xprt, xprt); spin_unlock(&clnt->cl_lock); @@ -512,6 +509,8 @@ static struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args, clnt->cl_discrtry = 1; if (!(args->flags & RPC_CLNT_CREATE_QUIET)) clnt->cl_chatty = 1; + if (args->flags & 
RPC_CLNT_CREATE_NETUNREACH_FATAL) + clnt->cl_netunreach_fatal = 1; return clnt; } @@ -662,6 +661,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, new->cl_noretranstimeo = clnt->cl_noretranstimeo; new->cl_discrtry = clnt->cl_discrtry; new->cl_chatty = clnt->cl_chatty; + new->cl_netunreach_fatal = clnt->cl_netunreach_fatal; new->cl_principal = clnt->cl_principal; new->cl_max_connect = clnt->cl_max_connect; return new; @@ -958,12 +958,17 @@ void rpc_shutdown_client(struct rpc_clnt *clnt) trace_rpc_clnt_shutdown(clnt); + clnt->cl_shutdown = 1; while (!list_empty(&clnt->cl_tasks)) { rpc_killall_tasks(clnt); wait_event_timeout(destroy_wait, list_empty(&clnt->cl_tasks), 1*HZ); } + /* wait for tasks still in workqueue or waitqueue */ + wait_event_timeout(destroy_wait, + atomic_read(&clnt->cl_task_count) == 0, 1 * HZ); + rpc_release_client(clnt); } EXPORT_SYMBOL_GPL(rpc_shutdown_client); @@ -1139,6 +1144,7 @@ void rpc_task_release_client(struct rpc_task *task) list_del(&task->tk_task); spin_unlock(&clnt->cl_lock); task->tk_client = NULL; + atomic_dec(&clnt->cl_task_count); rpc_release_client(clnt); } @@ -1189,10 +1195,9 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) task->tk_flags |= RPC_TASK_TIMEOUT; if (clnt->cl_noretranstimeo) task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT; - /* Add to the client's list of all tasks */ - spin_lock(&clnt->cl_lock); - list_add_tail(&task->tk_task, &clnt->cl_tasks); - spin_unlock(&clnt->cl_lock); + if (clnt->cl_netunreach_fatal) + task->tk_flags |= RPC_TASK_NETUNREACH_FATAL; + atomic_inc(&clnt->cl_task_count); } static void @@ -1787,9 +1792,14 @@ call_reserveresult(struct rpc_task *task) if (status >= 0) { if (task->tk_rqstp) { task->tk_action = call_refresh; + + /* Add to the client's list of all tasks */ + spin_lock(&task->tk_client->cl_lock); + if (list_empty(&task->tk_task)) + list_add_tail(&task->tk_task, &task->tk_client->cl_tasks); + spin_unlock(&task->tk_client->cl_lock); return; } - rpc_call_rpcerror(task, -EIO); return; } @@ -1854,13 +1864,13 @@ call_refreshresult(struct rpc_task *task) fallthrough; case -EAGAIN: status = -EACCES; - fallthrough; - case -EKEYEXPIRED: if (!task->tk_cred_retry) break; task->tk_cred_retry--; trace_rpc_retry_refresh_status(task); return; + case -EKEYEXPIRED: + break; case -ENOMEM: rpc_delay(task, HZ >> 4); return; @@ -2094,14 +2104,17 @@ call_bind_status(struct rpc_task *task) case -EPROTONOSUPPORT: trace_rpcb_bind_version_err(task); goto retry_timeout; + case -ENETDOWN: + case -ENETUNREACH: + if (task->tk_flags & RPC_TASK_NETUNREACH_FATAL) + break; + fallthrough; case -ECONNREFUSED: /* connection problems */ case -ECONNRESET: case -ECONNABORTED: case -ENOTCONN: case -EHOSTDOWN: - case -ENETDOWN: case -EHOSTUNREACH: - case -ENETUNREACH: case -EPIPE: trace_rpcb_unreachable_err(task); if (!RPC_IS_SOFTCONN(task)) { @@ -2183,19 +2196,22 @@ call_connect_status(struct rpc_task *task) task->tk_status = 0; switch (status) { + case -ENETDOWN: + case -ENETUNREACH: + if (task->tk_flags & RPC_TASK_NETUNREACH_FATAL) + break; + fallthrough; case -ECONNREFUSED: case -ECONNRESET: /* A positive refusal suggests a rebind is needed. 
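Together with the call_bind_status(), call_connect_status() and call_status() hunks, the new RPC_CLNT_CREATE_NETUNREACH_FATAL flag lets a caller turn ENETDOWN/ENETUNREACH into permanent task errors instead of retry fodder; tasks inherit RPC_TASK_NETUNREACH_FATAL through rpc_task_set_client(). A hypothetical caller sketch:

static struct rpc_clnt *example_create_fail_fast(struct rpc_create_args *args)
{
	/* opt in: unreachable-network errors now fail the task at once */
	args->flags |= RPC_CLNT_CREATE_NETUNREACH_FATAL;
	return rpc_create(args);
}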
*/ - if (RPC_IS_SOFTCONN(task)) - break; if (clnt->cl_autobind) { rpc_force_rebind(clnt); + if (RPC_IS_SOFTCONN(task)) + break; goto out_retry; } fallthrough; case -ECONNABORTED: - case -ENETDOWN: - case -ENETUNREACH: case -EHOSTUNREACH: case -EPIPE: case -EPROTO: @@ -2447,10 +2463,13 @@ call_status(struct rpc_task *task) trace_rpc_call_status(task); task->tk_status = 0; switch(status) { - case -EHOSTDOWN: case -ENETDOWN: - case -EHOSTUNREACH: case -ENETUNREACH: + if (task->tk_flags & RPC_TASK_NETUNREACH_FATAL) + goto out_exit; + fallthrough; + case -EHOSTDOWN: + case -EHOSTUNREACH: case -EPERM: if (RPC_IS_SOFTCONN(task)) goto out_exit; @@ -2752,8 +2771,13 @@ out_verifier: case -EPROTONOSUPPORT: goto out_err; case -EACCES: - /* Re-encode with a fresh cred */ - fallthrough; + /* possible RPCSEC_GSS out-of-sequence event (RFC2203), + * reset recv state and keep waiting, don't retransmit + */ + task->tk_rqstp->rq_reply_bytes_recvd = 0; + task->tk_status = xprt_request_enqueue_receive(task); + task->tk_action = call_transmit_status; + return -EBADMSG; default: goto out_garbage; } @@ -3319,8 +3343,11 @@ bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_has_addr); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -static void rpc_show_header(void) +static void rpc_show_header(struct rpc_clnt *clnt) { + printk(KERN_INFO "clnt[%pISpc] RPC tasks[%d]\n", + (struct sockaddr *)&clnt->cl_xprt->addr, + atomic_read(&clnt->cl_task_count)); printk(KERN_INFO "-pid- flgs status -client- --rqstp- " "-timeout ---ops--\n"); } @@ -3352,7 +3379,7 @@ void rpc_show_tasks(struct net *net) spin_lock(&clnt->cl_lock); list_for_each_entry(task, &clnt->cl_tasks, tk_task) { if (!header) { - rpc_show_header(); + rpc_show_header(clnt); header++; } rpc_show_task(clnt, task); diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index a176d5a0b0ee..32417db340de 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -74,6 +74,9 @@ tasks_stop(struct seq_file *f, void *v) { struct rpc_clnt *clnt = f->private; spin_unlock(&clnt->cl_lock); + seq_printf(f, "clnt[%pISpc] RPC tasks[%d]\n", + (struct sockaddr *)&clnt->cl_xprt->addr, + atomic_read(&clnt->cl_task_count)); } static const struct seq_operations tasks_seq_operations = { @@ -179,6 +182,18 @@ xprt_info_show(struct seq_file *f, void *v) seq_printf(f, "addr: %s\n", xprt->address_strings[RPC_DISPLAY_ADDR]); seq_printf(f, "port: %s\n", xprt->address_strings[RPC_DISPLAY_PORT]); seq_printf(f, "state: 0x%lx\n", xprt->state); + seq_printf(f, "netns: %u\n", xprt->xprt_net->ns.inum); + + if (xprt->ops->get_srcaddr) { + int ret, buflen; + char buf[INET6_ADDRSTRLEN]; + + buflen = ARRAY_SIZE(buf); + ret = xprt->ops->get_srcaddr(xprt, buf, buflen); + if (ret < 0) + ret = sprintf(buf, "<closed>"); + seq_printf(f, "saddr: %.*s\n", ret, buf); + } return 0; } diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 7ce3721c06ca..98f78cd55905 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -630,8 +630,8 @@ static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry) static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent, const char *name) { - struct qstr q = QSTR_INIT(name, strlen(name)); - struct dentry *dentry = d_hash_and_lookup(parent, &q); + struct qstr q = QSTR(name); + struct dentry *dentry = try_lookup_noperm(&q, parent); if (!dentry) { dentry = d_alloc(parent, &q); if (!dentry) @@ -658,7 +658,7 @@ static void __rpc_depopulate(struct dentry *parent, for (i = start; i < eof; i++) { name.name = 
files[i].name; name.len = strlen(files[i].name); - dentry = d_hash_and_lookup(parent, &name); + dentry = try_lookup_noperm(&name, parent); if (dentry == NULL) continue; @@ -1190,8 +1190,7 @@ static const struct rpc_filelist files[] = { struct dentry *rpc_d_lookup_sb(const struct super_block *sb, const unsigned char *dir_name) { - struct qstr dir = QSTR_INIT(dir_name, strlen(dir_name)); - return d_hash_and_lookup(sb->s_root, &dir); + return try_lookup_noperm(&QSTR(dir_name), sb->s_root); } EXPORT_SYMBOL_GPL(rpc_d_lookup_sb); @@ -1300,11 +1299,9 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) struct dentry *gssd_dentry; struct dentry *clnt_dentry = NULL; struct dentry *pipe_dentry = NULL; - struct qstr q = QSTR_INIT(files[RPCAUTH_gssd].name, - strlen(files[RPCAUTH_gssd].name)); /* We should never get this far if "gssd" doesn't exist */ - gssd_dentry = d_hash_and_lookup(root, &q); + gssd_dentry = try_lookup_noperm(&QSTR(files[RPCAUTH_gssd].name), root); if (!gssd_dentry) return ERR_PTR(-ENOENT); @@ -1314,9 +1311,8 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) goto out; } - q.name = gssd_dummy_clnt_dir[0].name; - q.len = strlen(gssd_dummy_clnt_dir[0].name); - clnt_dentry = d_hash_and_lookup(gssd_dentry, &q); + clnt_dentry = try_lookup_noperm(&QSTR(gssd_dummy_clnt_dir[0].name), + gssd_dentry); if (!clnt_dentry) { __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); pipe_dentry = ERR_PTR(-ENOENT); diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 102c3818bc54..53bcca365fb1 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -820,9 +820,10 @@ static void rpcb_getport_done(struct rpc_task *child, void *data) } trace_rpcb_setport(child, map->r_status, map->r_port); - xprt->ops->set_port(xprt, map->r_port); - if (map->r_port) + if (map->r_port) { + xprt->ops->set_port(xprt, map->r_port); xprt_set_bound(xprt); + } } /* diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index cef623ea1506..73bc39281ef5 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -276,6 +276,8 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { + if (unlikely(current->flags & PF_EXITING)) + return -EINTR; schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; @@ -864,8 +866,6 @@ void rpc_signal_task(struct rpc_task *task) if (!rpc_task_set_rpc_status(task, -ERESTARTSYS)) return; trace_rpc_task_signalled(task, task->tk_action); - set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); - smp_mb__after_atomic(); queue = READ_ONCE(task->tk_waitqueue); if (queue) rpc_wake_up_queued_task(queue, task); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 79879b7d39cb..939b6239df8a 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -636,24 +636,22 @@ svc_destroy(struct svc_serv **servp) EXPORT_SYMBOL_GPL(svc_destroy); static bool -svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) +svc_init_buffer(struct svc_rqst *rqstp, const struct svc_serv *serv, int node) { - unsigned long pages, ret; - - /* bc_xprt uses fore channel allocated buffers */ - if (svc_is_backchannel(rqstp)) - return true; - - pages = size / PAGE_SIZE + 1; /* extra page as we hold both request and reply. 
- * We assume one is at most one page - */ - WARN_ON_ONCE(pages > RPCSVC_MAXPAGES); - if (pages > RPCSVC_MAXPAGES) - pages = RPCSVC_MAXPAGES; - - ret = alloc_pages_bulk_array_node(GFP_KERNEL, node, pages, - rqstp->rq_pages); - return ret == pages; + unsigned long ret; + + rqstp->rq_maxpages = svc_serv_maxpages(serv); + + /* rq_pages' last entry is NULL for historical reasons. */ + rqstp->rq_pages = kcalloc_node(rqstp->rq_maxpages + 1, + sizeof(struct page *), + GFP_KERNEL, node); + if (!rqstp->rq_pages) + return false; + + ret = alloc_pages_bulk_node(GFP_KERNEL, node, rqstp->rq_maxpages, + rqstp->rq_pages); + return ret == rqstp->rq_maxpages; } /* @@ -662,17 +660,19 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) static void svc_release_buffer(struct svc_rqst *rqstp) { - unsigned int i; + unsigned long i; - for (i = 0; i < ARRAY_SIZE(rqstp->rq_pages); i++) + for (i = 0; i < rqstp->rq_maxpages; i++) if (rqstp->rq_pages[i]) put_page(rqstp->rq_pages[i]); + kfree(rqstp->rq_pages); } static void svc_rqst_free(struct svc_rqst *rqstp) { folio_batch_release(&rqstp->rq_fbatch); + kfree(rqstp->rq_bvec); svc_release_buffer(rqstp); if (rqstp->rq_scratch_page) put_page(rqstp->rq_scratch_page); @@ -708,7 +708,13 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) if (!rqstp->rq_resp) goto out_enomem; - if (!svc_init_buffer(rqstp, serv->sv_max_mesg, node)) + if (!svc_init_buffer(rqstp, serv, node)) + goto out_enomem; + + rqstp->rq_bvec = kcalloc_node(rqstp->rq_maxpages, + sizeof(struct bio_vec), + GFP_KERNEL, node); + if (!rqstp->rq_bvec) goto out_enomem; rqstp->rq_err = -EAGAIN; /* No error yet */ @@ -900,7 +906,7 @@ EXPORT_SYMBOL_GPL(svc_set_num_threads); bool svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) { struct page **begin = rqstp->rq_pages; - struct page **end = &rqstp->rq_pages[RPCSVC_MAXPAGES]; + struct page **end = &rqstp->rq_pages[rqstp->rq_maxpages]; if (unlikely(rqstp->rq_next_page < begin || rqstp->rq_next_page > end)) { trace_svc_replace_page_err(rqstp); @@ -1714,46 +1720,6 @@ int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset, EXPORT_SYMBOL_GPL(svc_encode_result_payload); /** - * svc_fill_write_vector - Construct data argument for VFS write call - * @rqstp: svc_rqst to operate on - * @payload: xdr_buf containing only the write data payload - * - * Fills in rqstp::rq_vec, and returns the number of elements. - */ -unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, - struct xdr_buf *payload) -{ - struct page **pages = payload->pages; - struct kvec *first = payload->head; - struct kvec *vec = rqstp->rq_vec; - size_t total = payload->len; - unsigned int i; - - /* Some types of transport can present the write payload - * entirely in rq_arg.pages. In this case, @first is empty. 
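svc_fill_write_vector() can go because write payloads no longer travel as kvec arrays: the payload xdr_buf is converted straight into the request's bio_vec array (now sized by rq_maxpages) with the freshly exported xdr_buf_to_bvec(), as the svc_udp_sendto() hunk below shows. A sketch of the replacement pattern, with a hypothetical wrapper:

static void example_payload_to_iter(struct svc_rqst *rqstp,
				    struct xdr_buf *payload,
				    struct msghdr *msg)
{
	unsigned int count;

	count = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
	iov_iter_bvec(&msg->msg_iter, ITER_SOURCE, rqstp->rq_bvec,
		      count, payload->len);
}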
- */ - i = 0; - if (first->iov_len) { - vec[i].iov_base = first->iov_base; - vec[i].iov_len = min_t(size_t, total, first->iov_len); - total -= vec[i].iov_len; - ++i; - } - - while (total) { - vec[i].iov_base = page_address(*pages); - vec[i].iov_len = min_t(size_t, total, PAGE_SIZE); - total -= vec[i].iov_len; - ++i; - ++pages; - } - - WARN_ON_ONCE(i > ARRAY_SIZE(rqstp->rq_vec)); - return i; -} -EXPORT_SYMBOL_GPL(svc_fill_write_vector); - -/** * svc_fill_symlink_pathname - Construct pathname argument for VFS symlink call * @rqstp: svc_rqst to operate on * @first: buffer containing first section of pathname diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 43c57124de52..cb14d6ddac6c 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -488,6 +488,7 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) pool = svc_pool_for_cpu(xprt->xpt_server); percpu_counter_inc(&pool->sp_sockets_queued); + xprt->xpt_qtime = ktime_get(); lwq_enqueue(&xprt->xpt_ready, &pool->sp_xprts); svc_pool_wake_idle_thread(pool); @@ -606,7 +607,8 @@ int svc_port_is_privileged(struct sockaddr *sin) } /* - * Make sure that we don't have too many active connections. If we have, + * Make sure that we don't have too many connections that have not yet + * demonstrated that they have access to the NFS server. If we have, * something must be dropped. It's not clear what will happen if we allow * "too many" connections, but when dealing with network-facing software, * we have to code defensively. Here we do that by imposing hard limits. @@ -618,34 +620,26 @@ int svc_port_is_privileged(struct sockaddr *sin) * The only somewhat efficient mechanism would be if drop old * connections from the same IP first. But right now we don't even * record the client IP in svc_sock. - * - * single-threaded services that expect a lot of clients will probably - * need to set sv_maxconn to override the default value which is based - * on the number of threads */ static void svc_check_conn_limits(struct svc_serv *serv) { - unsigned int limit = serv->sv_maxconn ? serv->sv_maxconn : - (serv->sv_nrthreads+3) * 20; - - if (serv->sv_tmpcnt > limit) { - struct svc_xprt *xprt = NULL; + if (serv->sv_tmpcnt > XPT_MAX_TMP_CONN) { + struct svc_xprt *xprt = NULL, *xprti; spin_lock_bh(&serv->sv_lock); if (!list_empty(&serv->sv_tempsocks)) { - /* Try to help the admin */ - net_notice_ratelimited("%s: too many open connections, consider increasing the %s\n", - serv->sv_name, serv->sv_maxconn ? - "max number of connections" : - "number of threads"); /* * Always select the oldest connection. It's not fair, - * but so is life + * but nor is life. 
*/ - xprt = list_entry(serv->sv_tempsocks.prev, - struct svc_xprt, - xpt_list); - set_bit(XPT_CLOSE, &xprt->xpt_flags); - svc_xprt_get(xprt); + list_for_each_entry_reverse(xprti, &serv->sv_tempsocks, + xpt_list) { + if (!test_bit(XPT_PEER_VALID, &xprti->xpt_flags)) { + xprt = xprti; + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_get(xprt); + break; + } + } } spin_unlock_bh(&serv->sv_lock); @@ -658,21 +652,12 @@ static void svc_check_conn_limits(struct svc_serv *serv) static bool svc_alloc_arg(struct svc_rqst *rqstp) { - struct svc_serv *serv = rqstp->rq_server; struct xdr_buf *arg = &rqstp->rq_arg; unsigned long pages, filled, ret; - pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT; - if (pages > RPCSVC_MAXPAGES) { - pr_warn_once("svc: warning: pages=%lu > RPCSVC_MAXPAGES=%lu\n", - pages, RPCSVC_MAXPAGES); - /* use as many pages as possible */ - pages = RPCSVC_MAXPAGES; - } - + pages = rqstp->rq_maxpages; for (filled = 0; filled < pages; filled = ret) { - ret = alloc_pages_bulk_array(GFP_KERNEL, pages, - rqstp->rq_pages); + ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages); if (ret > filled) /* Made progress, don't sleep yet */ continue; @@ -1039,7 +1024,8 @@ static void svc_delete_xprt(struct svc_xprt *xprt) spin_lock_bh(&serv->sv_lock); list_del_init(&xprt->xpt_list); - if (test_bit(XPT_TEMP, &xprt->xpt_flags)) + if (test_bit(XPT_TEMP, &xprt->xpt_flags) && + !test_bit(XPT_PEER_VALID, &xprt->xpt_flags)) serv->sv_tmpcnt--; spin_unlock_bh(&serv->sv_lock); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 95397677673b..e1c85123b445 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -713,8 +713,7 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) if (svc_xprt_is_dead(xprt)) goto out_notconn; - count = xdr_buf_to_bvec(rqstp->rq_bvec, - ARRAY_SIZE(rqstp->rq_bvec), xdr); + count = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, xdr); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, count, rqstp->rq_res.len); @@ -1083,9 +1082,6 @@ static void svc_tcp_fragment_received(struct svc_sock *svsk) /* If we have more data, signal svc_xprt_enqueue() to try again */ svsk->sk_tcplen = 0; svsk->sk_marker = xdr_zero; - - smp_wmb(); - tcp_set_rcvlowat(svsk->sk_sk, 1); } /** @@ -1175,17 +1171,10 @@ err_incomplete: goto err_delete; if (len == want) svc_tcp_fragment_received(svsk); - else { - /* Avoid more ->sk_data_ready() calls until the rest - * of the message has arrived. This reduces service - * thread wake-ups on large incoming messages. 
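The rewritten svc_check_conn_limits() above only evicts temporary sockets that have not set XPT_PEER_VALID, and the svc_delete_xprt() hunk stops counting a socket against sv_tmpcnt once the bit is set. Somewhere else in the series a transport must therefore be promoted out of the unproven pool when its peer turns out to be a real RPC client; a hedged sketch of that shape (helper name, placement, and locking assumed, not shown in this hunk):

static inline void example_mark_peer_valid(struct svc_xprt *xprt)
{
	struct svc_serv *serv = xprt->xpt_server;

	if (test_bit(XPT_TEMP, &xprt->xpt_flags) &&
	    !test_and_set_bit(XPT_PEER_VALID, &xprt->xpt_flags)) {
		spin_lock_bh(&serv->sv_lock);
		serv->sv_tmpcnt--;	/* no longer an unproven connection */
		spin_unlock_bh(&serv->sv_lock);
	}
}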
*/ - tcp_set_rcvlowat(svsk->sk_sk, - svc_sock_reclen(svsk) - svsk->sk_tcplen); - + else trace_svcsock_tcp_recv_short(&svsk->sk_xprt, svc_sock_reclen(svsk), svsk->sk_tcplen - sizeof(rpc_fraghdr)); - } goto err_noclose; error: if (len != -EAGAIN) @@ -1229,8 +1218,8 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, memcpy(buf, &marker, sizeof(marker)); bvec_set_virt(rqstp->rq_bvec, buf, sizeof(marker)); - count = xdr_buf_to_bvec(rqstp->rq_bvec + 1, - ARRAY_SIZE(rqstp->rq_bvec) - 1, &rqstp->rq_res); + count = xdr_buf_to_bvec(rqstp->rq_bvec + 1, rqstp->rq_maxpages, + &rqstp->rq_res); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, 1 + count, sizeof(marker) + rqstp->rq_res.len); @@ -1350,7 +1339,8 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) svsk->sk_marker = xdr_zero; svsk->sk_tcplen = 0; svsk->sk_datalen = 0; - memset(&svsk->sk_pages[0], 0, sizeof(svsk->sk_pages)); + memset(&svsk->sk_pages[0], 0, + svsk->sk_maxpages * sizeof(struct page *)); tcp_sock_set_nodelay(sk); @@ -1389,10 +1379,13 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); + unsigned long pages; - svsk = kzalloc(sizeof(*svsk), GFP_KERNEL); + pages = svc_serv_maxpages(serv); + svsk = kzalloc(struct_size(svsk, sk_pages, pages), GFP_KERNEL); if (!svsk) return ERR_PTR(-ENOMEM); + svsk->sk_maxpages = pages; inet = sock->sk; @@ -1551,11 +1544,8 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, newlen = error; if (protocol == IPPROTO_TCP) { - __netns_tracker_free(net, &sock->sk->ns_tracker, false); - sock->sk->sk_net_refcnt = 1; - get_net_track(net, &sock->sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); - if ((error = kernel_listen(sock, 64)) < 0) + sk_net_refcnt_upgrade(sock->sk); + if ((error = kernel_listen(sock, SOMAXCONN)) < 0) goto bummer; } diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 5c8ecdaaa985..09434e1143c5 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -59,6 +59,16 @@ static struct kobject *rpc_sysfs_object_alloc(const char *name, return NULL; } +static inline struct rpc_clnt * +rpc_sysfs_client_kobj_get_clnt(struct kobject *kobj) +{ + struct rpc_sysfs_client *c = container_of(kobj, + struct rpc_sysfs_client, kobject); + struct rpc_clnt *ret = c->clnt; + + return refcount_inc_not_zero(&ret->cl_count) ? 
ret : NULL; +} + static inline struct rpc_xprt * rpc_sysfs_xprt_kobj_get_xprt(struct kobject *kobj) { @@ -86,6 +96,51 @@ rpc_sysfs_xprt_switch_kobj_get_xprt(struct kobject *kobj) return xprt_switch_get(x->xprt_switch); } +static ssize_t rpc_sysfs_clnt_version_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj); + ssize_t ret; + + if (!clnt) + return sprintf(buf, "<closed>\n"); + + ret = sprintf(buf, "%u", clnt->cl_vers); + refcount_dec(&clnt->cl_count); + return ret; +} + +static ssize_t rpc_sysfs_clnt_program_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj); + ssize_t ret; + + if (!clnt) + return sprintf(buf, "<closed>\n"); + + ret = sprintf(buf, "%s", clnt->cl_program->name); + refcount_dec(&clnt->cl_count); + return ret; +} + +static ssize_t rpc_sysfs_clnt_max_connect_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_clnt *clnt = rpc_sysfs_client_kobj_get_clnt(kobj); + ssize_t ret; + + if (!clnt) + return sprintf(buf, "<closed>\n"); + + ret = sprintf(buf, "%u\n", clnt->cl_max_connect); + refcount_dec(&clnt->cl_count); + return ret; +} + static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -129,6 +184,31 @@ static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj, return ret; } +static const char *xprtsec_strings[] = { + [RPC_XPRTSEC_NONE] = "none", + [RPC_XPRTSEC_TLS_ANON] = "tls-anon", + [RPC_XPRTSEC_TLS_X509] = "tls-x509", +}; + +static ssize_t rpc_sysfs_xprt_xprtsec_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + ssize_t ret; + + if (!xprt) { + ret = sprintf(buf, "<closed>\n"); + goto out; + } + + ret = sprintf(buf, "%s\n", xprtsec_strings[xprt->xprtsec.policy]); + xprt_put(xprt); +out: + return ret; + +} + static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -206,6 +286,14 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj, return ret; } +static ssize_t rpc_sysfs_xprt_del_xprt_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "# delete this xprt\n"); +} + + static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -225,6 +313,55 @@ static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj, return ret; } +static ssize_t rpc_sysfs_xprt_switch_add_xprt_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "# add one xprt to this xprt_switch\n"); +} + +static ssize_t rpc_sysfs_xprt_switch_add_xprt_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rpc_xprt_switch *xprt_switch = + rpc_sysfs_xprt_switch_kobj_get_xprt(kobj); + struct xprt_create xprt_create_args; + struct rpc_xprt *xprt, *new; + + if (!xprt_switch) + return 0; + + xprt = rpc_xprt_switch_get_main_xprt(xprt_switch); + if (!xprt) + goto out; + + xprt_create_args.ident = xprt->xprt_class->ident; + xprt_create_args.net = xprt->xprt_net; + xprt_create_args.dstaddr = (struct sockaddr *)&xprt->addr; + xprt_create_args.addrlen = xprt->addrlen; + xprt_create_args.servername = xprt->servername; + xprt_create_args.bc_xprt = xprt->bc_xprt; + xprt_create_args.xprtsec = xprt->xprtsec; + 
xprt_create_args.connect_timeout = xprt->connect_timeout; + xprt_create_args.reconnect_timeout = xprt->max_reconnect_timeout; + + new = xprt_create_transport(&xprt_create_args); + if (IS_ERR_OR_NULL(new)) { + count = PTR_ERR(new); + goto out_put_xprt; + } + + rpc_xprt_switch_add_xprt(xprt_switch, new); + xprt_put(new); + +out_put_xprt: + xprt_put(xprt); +out: + xprt_switch_put(xprt_switch); + return count; +} + static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -335,6 +472,40 @@ out_put: return count; } +static ssize_t rpc_sysfs_xprt_del_xprt(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + struct rpc_xprt_switch *xps = rpc_sysfs_xprt_kobj_get_xprt_switch(kobj); + + if (!xprt || !xps) { + count = 0; + goto out; + } + + if (xprt->main) { + count = -EINVAL; + goto release_tasks; + } + + if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { + count = -EINTR; + goto out_put; + } + + xprt_set_offline_locked(xprt, xps); + xprt_delete_locked(xprt, xps); + +release_tasks: + xprt_release_write(xprt, NULL); +out_put: + xprt_put(xprt); + xprt_switch_put(xps); +out: + return count; +} + int rpc_sysfs_init(void) { rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj); @@ -398,23 +569,48 @@ static const void *rpc_sysfs_xprt_namespace(const struct kobject *kobj) kobject)->xprt->xprt_net; } +static struct kobj_attribute rpc_sysfs_clnt_version = __ATTR(rpc_version, + 0444, rpc_sysfs_clnt_version_show, NULL); + +static struct kobj_attribute rpc_sysfs_clnt_program = __ATTR(program, + 0444, rpc_sysfs_clnt_program_show, NULL); + +static struct kobj_attribute rpc_sysfs_clnt_max_connect = __ATTR(max_connect, + 0444, rpc_sysfs_clnt_max_connect_show, NULL); + +static struct attribute *rpc_sysfs_rpc_clnt_attrs[] = { + &rpc_sysfs_clnt_version.attr, + &rpc_sysfs_clnt_program.attr, + &rpc_sysfs_clnt_max_connect.attr, + NULL, +}; +ATTRIBUTE_GROUPS(rpc_sysfs_rpc_clnt); + static struct kobj_attribute rpc_sysfs_xprt_dstaddr = __ATTR(dstaddr, 0644, rpc_sysfs_xprt_dstaddr_show, rpc_sysfs_xprt_dstaddr_store); static struct kobj_attribute rpc_sysfs_xprt_srcaddr = __ATTR(srcaddr, 0644, rpc_sysfs_xprt_srcaddr_show, NULL); +static struct kobj_attribute rpc_sysfs_xprt_xprtsec = __ATTR(xprtsec, + 0644, rpc_sysfs_xprt_xprtsec_show, NULL); + static struct kobj_attribute rpc_sysfs_xprt_info = __ATTR(xprt_info, 0444, rpc_sysfs_xprt_info_show, NULL); static struct kobj_attribute rpc_sysfs_xprt_change_state = __ATTR(xprt_state, 0644, rpc_sysfs_xprt_state_show, rpc_sysfs_xprt_state_change); +static struct kobj_attribute rpc_sysfs_xprt_del = __ATTR(del_xprt, + 0644, rpc_sysfs_xprt_del_xprt_show, rpc_sysfs_xprt_del_xprt); + static struct attribute *rpc_sysfs_xprt_attrs[] = { &rpc_sysfs_xprt_dstaddr.attr, &rpc_sysfs_xprt_srcaddr.attr, + &rpc_sysfs_xprt_xprtsec.attr, &rpc_sysfs_xprt_info.attr, &rpc_sysfs_xprt_change_state.attr, + &rpc_sysfs_xprt_del.attr, NULL, }; ATTRIBUTE_GROUPS(rpc_sysfs_xprt); @@ -422,14 +618,20 @@ ATTRIBUTE_GROUPS(rpc_sysfs_xprt); static struct kobj_attribute rpc_sysfs_xprt_switch_info = __ATTR(xprt_switch_info, 0444, rpc_sysfs_xprt_switch_info_show, NULL); +static struct kobj_attribute rpc_sysfs_xprt_switch_add_xprt = + __ATTR(add_xprt, 0644, rpc_sysfs_xprt_switch_add_xprt_show, + rpc_sysfs_xprt_switch_add_xprt_store); + static struct attribute *rpc_sysfs_xprt_switch_attrs[] = { &rpc_sysfs_xprt_switch_info.attr, + 
&rpc_sysfs_xprt_switch_add_xprt.attr, NULL, }; ATTRIBUTE_GROUPS(rpc_sysfs_xprt_switch); static const struct kobj_type rpc_sysfs_client_type = { .release = rpc_sysfs_client_release, + .default_groups = rpc_sysfs_rpc_clnt_groups, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_client_namespace, }; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 62e07c330a66..2ea00e354ba6 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -213,6 +213,7 @@ bvec_overflow: pr_warn_once("%s: bio_vec array overflow\n", __func__); return count - 1; } +EXPORT_SYMBOL_GPL(xdr_buf_to_bvec); /** * xdr_inline_pages - Prepare receive buffer for a large reply @@ -1097,6 +1098,12 @@ out_overflow: * Checks that we have enough buffer space to encode 'nbytes' more * bytes of data. If so, update the total xdr_buf length, and * adjust the length of the current kvec. + * + * The returned pointer is valid only until the next call to + * xdr_reserve_space() or xdr_commit_encode() on @xdr. The current + * implementation of this API guarantees that space reserved for a + * four-byte data item remains valid until @xdr is destroyed, but + * that might not always be true in the future. */ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) { diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 09f245cda526..d5e0cdcad9e0 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1167,7 +1167,7 @@ xprt_request_enqueue_receive(struct rpc_task *task) spin_unlock(&xprt->queue_lock); /* Turn off autodisconnect */ - del_timer_sync(&xprt->timer); + timer_delete_sync(&xprt->timer); return 0; } @@ -1365,7 +1365,7 @@ xprt_request_enqueue_transmit(struct rpc_task *task) INIT_LIST_HEAD(&req->rq_xmit2); goto out; } - } else if (!req->rq_seqno) { + } else if (req->rq_seqno_count == 0) { list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { if (pos->rq_task->tk_owner != task->tk_owner) continue; @@ -1898,6 +1898,7 @@ xprt_request_init(struct rpc_task *task) req->rq_snd_buf.bvec = NULL; req->rq_rcv_buf.bvec = NULL; req->rq_release_snd_buf = NULL; + req->rq_seqno_count = 0; xprt_init_majortimeo(task, req, task->tk_client->cl_timeout); trace_xprt_reserve(req); @@ -2138,7 +2139,7 @@ static void xprt_destroy(struct rpc_xprt *xprt) * can only run *before* del_time_sync(), never after. */ spin_lock(&xprt->transport_lock); - del_timer_sync(&xprt->timer); + timer_delete_sync(&xprt->timer); spin_unlock(&xprt->transport_lock); /* diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 720d3ba742ec..4c5e08b0aa64 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -92,6 +92,27 @@ void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps, xprt_put(xprt); } +/** + * rpc_xprt_switch_get_main_xprt - Get the 'main' xprt for an xprt switch. + * @xps: pointer to struct rpc_xprt_switch. 
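The lifetime note added to xdr_reserve_space() in the xdr.c hunk above targets the common reserve-then-backfill idiom: a returned pointer stays writable only until the next reserve or commit, except that the current implementation keeps a four-byte reservation valid for the stream's lifetime. A minimal sketch of the blessed form of the idiom (encoding steps are illustrative):

static int example_encode_with_backfill(struct xdr_stream *xdr)
{
	__be32 *lenp;

	lenp = xdr_reserve_space(xdr, XDR_UNIT);	/* 4-byte placeholder */
	if (!lenp)
		return -EMSGSIZE;
	if (xdr_stream_encode_u32(xdr, 42) < 0)		/* body encoding */
		return -EMSGSIZE;
	*lenp = cpu_to_be32(XDR_UNIT);			/* backfill the length */
	return 0;
}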
+ */ +struct rpc_xprt *rpc_xprt_switch_get_main_xprt(struct rpc_xprt_switch *xps) +{ + struct rpc_xprt_iter xpi; + struct rpc_xprt *xprt; + + xprt_iter_init_listall(&xpi, xps); + + xprt = xprt_iter_get_next(&xpi); + while (xprt && !xprt->main) { + xprt_put(xprt); + xprt = xprt_iter_get_next(&xpi); + } + + xprt_iter_destroy(&xpi); + return xprt; +} + static DEFINE_IDA(rpc_xprtswitch_ids); void xprt_multipath_cleanup_ids(void) @@ -603,23 +624,6 @@ struct rpc_xprt *xprt_iter_get_helper(struct rpc_xprt_iter *xpi, } /** - * xprt_iter_get_xprt - Returns the rpc_xprt pointed to by the cursor - * @xpi: pointer to rpc_xprt_iter - * - * Returns a reference to the struct rpc_xprt that is currently - * pointed to by the cursor. - */ -struct rpc_xprt *xprt_iter_get_xprt(struct rpc_xprt_iter *xpi) -{ - struct rpc_xprt *xprt; - - rcu_read_lock(); - xprt = xprt_iter_get_helper(xpi, xprt_iter_ops(xpi)->xpi_xprt); - rcu_read_unlock(); - return xprt; -} - -/** * xprt_iter_get_next - Returns the next rpc_xprt following the cursor * @xpi: pointer to rpc_xprt_iter * diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 292022f0976e..e7e4a39ca6c6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -120,12 +120,16 @@ svc_rdma_recv_ctxt_alloc(struct svcxprt_rdma *rdma) { int node = ibdev_to_node(rdma->sc_cm_id->device); struct svc_rdma_recv_ctxt *ctxt; + unsigned long pages; dma_addr_t addr; void *buffer; - ctxt = kzalloc_node(sizeof(*ctxt), GFP_KERNEL, node); + pages = svc_serv_maxpages(rdma->sc_xprt.xpt_server); + ctxt = kzalloc_node(struct_size(ctxt, rc_pages, pages), + GFP_KERNEL, node); if (!ctxt) goto fail0; + ctxt->rc_maxpages = pages; buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); if (!buffer) goto fail1; @@ -497,7 +501,7 @@ static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt) * a computation, perform a simple range check. This is an * arbitrary but sensible limit (ie, not architectural). 
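svc_rdma_recv_ctxt_alloc() above shows the allocation idiom this series uses to retire fixed RPCSVC_MAXPAGES arrays: the per-server page limit is fetched once and the trailing page array is folded into the context allocation with struct_size(). The shape, with illustrative names:

struct example_ctxt {
	unsigned long	ec_maxpages;
	struct page	*ec_pages[];	/* trailing flexible array */
};

static struct example_ctxt *example_ctxt_alloc(unsigned long pages, int node)
{
	struct example_ctxt *ctxt;

	/* one allocation covers the struct and its page array */
	ctxt = kzalloc_node(struct_size(ctxt, ec_pages, pages),
			    GFP_KERNEL, node);
	if (!ctxt)
		return NULL;
	ctxt->ec_maxpages = pages;	/* bound for later range checks */
	return ctxt;
}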
*/ - if (unlikely(segcount > RPCSVC_MAXPAGES)) + if (unlikely(segcount > rctxt->rc_maxpages)) return false; p = xdr_inline_decode(&rctxt->rc_stream, diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 40797114d50a..661b3fe2779f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -765,7 +765,7 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp, } len -= seg_len; - if (len && ((head->rc_curpage + 1) > ARRAY_SIZE(rqstp->rq_pages))) + if (len && ((head->rc_curpage + 1) > rqstp->rq_maxpages)) goto out_overrun; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 96154a2367a1..914cd263c2f1 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -118,6 +118,7 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) { int node = ibdev_to_node(rdma->sc_cm_id->device); struct svc_rdma_send_ctxt *ctxt; + unsigned long pages; dma_addr_t addr; void *buffer; int i; @@ -126,13 +127,19 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) GFP_KERNEL, node); if (!ctxt) goto fail0; + pages = svc_serv_maxpages(rdma->sc_xprt.xpt_server); + ctxt->sc_pages = kcalloc_node(pages, sizeof(struct page *), + GFP_KERNEL, node); + if (!ctxt->sc_pages) + goto fail1; + ctxt->sc_maxpages = pages; buffer = kmalloc_node(rdma->sc_max_req_size, GFP_KERNEL, node); if (!buffer) - goto fail1; + goto fail2; addr = ib_dma_map_single(rdma->sc_pd->device, buffer, rdma->sc_max_req_size, DMA_TO_DEVICE); if (ib_dma_mapping_error(rdma->sc_pd->device, addr)) - goto fail2; + goto fail3; svc_rdma_send_cid_init(rdma, &ctxt->sc_cid); @@ -151,8 +158,10 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->sc_sges[i].lkey = rdma->sc_pd->local_dma_lkey; return ctxt; -fail2: +fail3: kfree(buffer); +fail2: + kfree(ctxt->sc_pages); fail1: kfree(ctxt); fail0: @@ -176,6 +185,7 @@ void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) rdma->sc_max_req_size, DMA_TO_DEVICE); kfree(ctxt->sc_xprt_buf); + kfree(ctxt->sc_pages); kfree(ctxt); } } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index c3fbf0779d4a..3d7f1413df02 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -406,12 +406,12 @@ static void svc_rdma_xprt_done(struct rpcrdma_notification *rn) */ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) { + unsigned int ctxts, rq_depth, maxpayload; struct svcxprt_rdma *listen_rdma; struct svcxprt_rdma *newxprt = NULL; struct rdma_conn_param conn_param; struct rpcrdma_connect_private pmsg; struct ib_qp_init_attr qp_attr; - unsigned int ctxts, rq_depth; struct ib_device *dev; int ret = 0; RPC_IFDEBUG(struct sockaddr *sap); @@ -462,12 +462,14 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_bc_requests = 2; } - /* Arbitrarily estimate the number of rw_ctxs needed for - * this transport. This is enough rw_ctxs to make forward - * progress even if the client is using one rkey per page - * in each Read chunk. + /* Arbitrary estimate of the needed number of rdma_rw contexts. 
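With illustrative numbers: if sc_max_requests is 64 and rdma_rw_mr_factor() reports that one MR covers a maximum-sized payload, the accept path below reserves 64 * 3 * 1 = 192 rdma_rw contexts, so the reservation now scales with the device and the negotiated payload size rather than the old flat 3 * RPCSVC_MAXPAGES.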
*/ - ctxts = 3 * RPCSVC_MAXPAGES; + maxpayload = min(xprt->xpt_server->sv_max_payload, + RPCSVC_MAXPAYLOAD_RDMA); + ctxts = newxprt->sc_max_requests * 3 * + rdma_rw_mr_factor(dev, newxprt->sc_port_num, + maxpayload >> PAGE_SHIFT); + newxprt->sc_sq_depth = rq_depth + ctxts; if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) newxprt->sc_sq_depth = dev->attrs.max_qp_wr; @@ -575,6 +577,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) ib_destroy_qp(newxprt->sc_qp); rdma_destroy_id(newxprt->sc_cm_id); + rpcrdma_rn_unregister(dev, &newxprt->sc_rn); /* This call to put will destroy the transport */ svc_xprt_put(&newxprt->sc_xprt); return NULL; @@ -621,7 +624,8 @@ static void __svc_rdma_free(struct work_struct *work) /* Destroy the CM ID */ rdma_destroy_id(rdma->sc_cm_id); - rpcrdma_rn_unregister(device, &rdma->sc_rn); + if (!test_bit(XPT_LISTENER, &rdma->sc_xprt.xpt_flags)) + rpcrdma_rn_unregister(device, &rdma->sc_rn); kfree(rdma); } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c60936d8cef7..04ff66758fc3 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1941,12 +1941,8 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt, goto out; } - if (protocol == IPPROTO_TCP) { - __netns_tracker_free(xprt->xprt_net, &sock->sk->ns_tracker, false); - sock->sk->sk_net_refcnt = 1; - get_net_track(xprt->xprt_net, &sock->sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(xprt->xprt_net, 1); - } + if (protocol == IPPROTO_TCP) + sk_net_refcnt_upgrade(sock->sk); filp = sock_alloc_file(sock, O_NONBLOCK, NULL); if (IS_ERR(filp)) @@ -2581,7 +2577,15 @@ static void xs_tls_handshake_done(void *data, int status, key_serial_t peerid) struct sock_xprt *lower_transport = container_of(lower_xprt, struct sock_xprt, xprt); - lower_transport->xprt_err = status ? -EACCES : 0; + switch (status) { + case 0: + case -EACCES: + case -ETIMEDOUT: + lower_transport->xprt_err = status; + break; + default: + lower_transport->xprt_err = -EACCES; + } complete(&lower_transport->handshake_done); xprt_put(lower_xprt); } @@ -2722,20 +2726,14 @@ static void xs_tcp_tls_setup_socket(struct work_struct *work) if (status) goto out_close; xprt_release_write(lower_xprt, NULL); - trace_rpc_socket_connect(upper_xprt, upper_transport->sock, 0); - if (!xprt_test_and_set_connected(upper_xprt)) { - upper_xprt->connect_cookie++; - clear_bit(XPRT_SOCK_CONNECTING, &upper_transport->sock_state); - xprt_clear_connecting(upper_xprt); - - upper_xprt->stat.connect_count++; - upper_xprt->stat.connect_time += (long)jiffies - - upper_xprt->stat.connect_start; - xs_run_error_worker(upper_transport, XPRT_SOCK_WAKE_PENDING); - } rpc_shutdown_client(lower_clnt); + /* Check for ingress data that arrived before the socket's + * ->data_ready callback was set up. 
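The transport hunk above stops sizing the send queue from the compile-time RPCSVC_MAXPAGES and instead scales it by the negotiated request count and the device's MR factor, then clamps to the device limit. A hedged sketch of that arithmetic, with the device-specific inputs passed in as plain parameters (the real code queries rdma_rw_mr_factor() and dev->attrs.max_qp_wr):

#include <linux/minmax.h>

static unsigned int demo_sq_depth(unsigned int rq_depth,
				  unsigned int max_requests,
				  unsigned int mr_factor,
				  unsigned int max_qp_wr)
{
	/* Three rdma_rw contexts per in-flight request, each of which
	 * may need mr_factor work requests to cover the maximum payload.
	 */
	unsigned int ctxts = max_requests * 3 * mr_factor;

	/* Never ask for a deeper send queue than the device supports. */
	return min(rq_depth + ctxts, max_qp_wr);
}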
+ */ + xs_poll_check_readable(upper_transport); + out_unlock: current_restore_flags(pflags, PF_MEMALLOC); upper_transport->clnt = NULL; diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 6488ead9e464..4d5fbacef496 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -472,7 +472,7 @@ bool switchdev_port_obj_act_is_deferred(struct net_device *dev, EXPORT_SYMBOL_GPL(switchdev_port_obj_act_is_deferred); static ATOMIC_NOTIFIER_HEAD(switchdev_notif_chain); -static BLOCKING_NOTIFIER_HEAD(switchdev_blocking_notif_chain); +static RAW_NOTIFIER_HEAD(switchdev_blocking_notif_chain); /** * register_switchdev_notifier - Register notifier @@ -518,17 +518,27 @@ EXPORT_SYMBOL_GPL(call_switchdev_notifiers); int register_switchdev_blocking_notifier(struct notifier_block *nb) { - struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain; + struct raw_notifier_head *chain = &switchdev_blocking_notif_chain; + int err; + + rtnl_lock(); + err = raw_notifier_chain_register(chain, nb); + rtnl_unlock(); - return blocking_notifier_chain_register(chain, nb); + return err; } EXPORT_SYMBOL_GPL(register_switchdev_blocking_notifier); int unregister_switchdev_blocking_notifier(struct notifier_block *nb) { - struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain; + struct raw_notifier_head *chain = &switchdev_blocking_notif_chain; + int err; - return blocking_notifier_chain_unregister(chain, nb); + rtnl_lock(); + err = raw_notifier_chain_unregister(chain, nb); + rtnl_unlock(); + + return err; } EXPORT_SYMBOL_GPL(unregister_switchdev_blocking_notifier); @@ -536,10 +546,11 @@ int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev, struct switchdev_notifier_info *info, struct netlink_ext_ack *extack) { + ASSERT_RTNL(); info->dev = dev; info->extack = extack; - return blocking_notifier_call_chain(&switchdev_blocking_notif_chain, - val, info); + return raw_notifier_call_chain(&switchdev_blocking_notif_chain, + val, info); } EXPORT_SYMBOL_GPL(call_switchdev_blocking_notifiers); diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 43c3f1c971b8..ea5bb131ebd0 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -425,7 +425,7 @@ static void tipc_aead_free(struct rcu_head *rp) } free_percpu(aead->tfm_entry); kfree_sensitive(aead->key); - kfree(aead); + kfree_sensitive(aead); } static int tipc_aead_users(struct tipc_aead __rcu *aead) @@ -817,12 +817,20 @@ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, goto exit; } + /* Get net to avoid freed tipc_crypto when delete namespace */ + if (!maybe_get_net(aead->crypto->net)) { + tipc_bearer_put(b); + rc = -ENODEV; + goto exit; + } + /* Now, do encrypt */ rc = crypto_aead_encrypt(req); if (rc == -EINPROGRESS || rc == -EBUSY) return rc; tipc_bearer_put(b); + put_net(aead->crypto->net); exit: kfree(ctx); @@ -860,6 +868,7 @@ static void tipc_aead_encrypt_done(void *data, int err) kfree(tx_ctx); tipc_bearer_put(b); tipc_aead_put(aead); + put_net(net); } /** @@ -2293,8 +2302,8 @@ static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr) keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME))); /* Verify the supplied size values */ - if (unlikely(size != keylen + sizeof(struct tipc_aead_key) || - keylen > TIPC_AEAD_KEY_SIZE_MAX)) { + if (unlikely(keylen > TIPC_AEAD_KEY_SIZE_MAX || + size != keylen + sizeof(struct tipc_aead_key))) { pr_debug("%s: invalid MSG_CRYPTO key size\n", rx->name); goto exit; } diff --git a/net/tipc/link.c b/net/tipc/link.c 
index 5c2088a469ce..3ee44d731700 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1046,6 +1046,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) { if (imp == TIPC_SYSTEM_IMPORTANCE) { pr_warn("%s<%s>, link overflow", link_rst_msg, l->name); + __skb_queue_purge(list); return -ENOBUFS; } rc = link_schedule_user(l, hdr); @@ -1951,7 +1952,6 @@ void tipc_link_create_dummy_tnl_msg(struct tipc_link *l, void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, int mtyp, struct sk_buff_head *xmitq) { - struct sk_buff_head *fdefq = &tnl->failover_deferdq; struct sk_buff *skb, *tnlskb; struct tipc_msg *hdr, tnlhdr; struct sk_buff_head *queue = &l->transmq; @@ -2078,6 +2078,8 @@ tnl: tipc_link_xmit(tnl, &tnlq, xmitq); if (mtyp == FAILOVER_MSG) { + struct sk_buff_head *fdefq = &tnl->failover_deferdq; + tnl->drop_point = l->rcv_nxt; tnl->failover_reasm_skb = l->reasm_buf; l->reasm_buf = NULL; @@ -2226,7 +2228,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, break; if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME) break; - strncpy(if_name, data, TIPC_MAX_IF_NAME); + strscpy(if_name, data, TIPC_MAX_IF_NAME); /* Update own tolerance if peer indicates a non-zero value */ if (tipc_in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) { diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index e2f19627e43d..b45c5b91bc7a 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -716,7 +716,8 @@ void tipc_mon_reinit_self(struct net *net) if (!mon) continue; write_lock_bh(&mon->lock); - mon->self->addr = tipc_own_addr(net); + if (mon->self) + mon->self->addr = tipc_own_addr(net); write_unlock_bh(&mon->lock); } } diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index d1180370fdf4..e74940eab3a4 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -949,8 +949,8 @@ void tipc_nametbl_stop(struct net *net) } spin_unlock_bh(&tn->nametbl_lock); - synchronize_net(); - kfree(nt); + /* TODO: clear tn->nametbl, implement proper RCU rules ? 
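The name_table teardown at this point trades a blocking synchronize_net() plus kfree() for kfree_rcu(), which needs only an rcu_head embedded in the object (added to struct name_table in the following hunk). A minimal sketch of the pattern with a hypothetical type:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_table {
	struct rcu_head rcu;	/* callback head for deferred freeing */
	/* ... payload ... */
};

static void demo_table_free(struct demo_table *t)
{
	/* Readers that looked @t up under rcu_read_lock() keep a valid
	 * pointer until their critical sections end; the memory is only
	 * reclaimed after a grace period, without blocking this caller.
	 */
	kfree_rcu(t, rcu);
}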
*/ + kfree_rcu(nt, rcu); } static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg, diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 3bcd9ef8cee3..7ff6eeebaae6 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -90,6 +90,7 @@ struct publication { /** * struct name_table - table containing all existing port name publications + * @rcu: RCU callback head used for deferred freeing * @services: name sequence hash lists * @node_scope: all local publications with node scope * - used by name_distr during re-init of name table @@ -102,6 +103,7 @@ struct publication { * @snd_nxt: next sequence number to be used */ struct name_table { + struct rcu_head rcu; struct hlist_head services[TIPC_NAMETBL_SIZE]; struct list_head node_scope; struct list_head cluster_scope; diff --git a/net/tipc/node.c b/net/tipc/node.c index 500320e5ca47..cb43f2016a70 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -638,7 +638,7 @@ static void tipc_node_delete(struct tipc_node *node) trace_tipc_node_delete(node, true, " "); tipc_node_delete_from_list(node); - del_timer_sync(&node->timer); + timer_delete_sync(&node->timer); tipc_node_put(node); } @@ -1581,7 +1581,7 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, tipc_node_read_lock(node); link = node->links[bearer_id].link; if (link) { - strncpy(linkname, tipc_link_name(link), len); + strscpy(linkname, tipc_link_name(link), len); err = 0; } tipc_node_read_unlock(node); diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 05d49ad81290..621addab2834 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -177,7 +177,7 @@ void tipc_sub_unsubscribe(struct tipc_subscription *sub) { tipc_nametbl_unsubscribe(sub); if (sub->evt.s.timeout != TIPC_WAIT_FOREVER) - del_timer_sync(&sub->timer); + timer_delete_sync(&sub->timer); list_del(&sub->sub_list); tipc_sub_put(sub); } diff --git a/net/tls/tls.h b/net/tls/tls.h index e5e47452308a..774859b63f0d 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -145,7 +145,8 @@ void tls_err_abort(struct sock *sk, int err); int init_prot_info(struct tls_prot_info *prot, const struct tls_crypto_info *crypto_info, const struct tls_cipher_desc *cipher_desc); -int tls_set_sw_offload(struct sock *sk, int tx); +int tls_set_sw_offload(struct sock *sk, int tx, + struct tls_crypto_info *new_crypto_info); void tls_update_rx_zc_capable(struct tls_context *tls_ctx); void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); void tls_sw_strparser_done(struct tls_context *tls_ctx); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index dc063c2c7950..f672a62a9a52 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -157,7 +157,7 @@ static void delete_all_records(struct tls_offload_context_tx *offload_ctx) offload_ctx->retransmit_hint = NULL; } -static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq) +static void tls_tcp_clean_acked(struct sock *sk, u32 acked_seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_record_info *info, *temp; @@ -204,7 +204,7 @@ void tls_device_sk_destruct(struct sock *sk) destroy_record(ctx->open_record); delete_all_records(ctx); crypto_free_aead(ctx->aead_send); - clean_acked_data_disable(inet_csk(sk)); + clean_acked_data_disable(tcp_sk(sk)); } tls_device_queue_ctx_destruction(tls_ctx); @@ -1126,7 +1126,7 @@ int tls_set_device_offload(struct sock *sk) start_marker_record->num_frags = 0; list_add_tail(&start_marker_record->list, &offload_ctx->records_list); - clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked); 
+ clean_acked_data_enable(tcp_sk(sk), &tls_tcp_clean_acked); ctx->push_pending_record = tls_device_push_pending_record; /* TLS offload is greatly simplified if we don't send @@ -1172,7 +1172,7 @@ int tls_set_device_offload(struct sock *sk) release_lock: up_read(&device_offload_lock); - clean_acked_data_disable(inet_csk(sk)); + clean_acked_data_disable(tcp_sk(sk)); crypto_free_aead(offload_ctx->aead_send); free_offload_ctx: kfree(offload_ctx); @@ -1227,7 +1227,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx) context->resync_nh_reset = 1; ctx->priv_ctx_rx = context; - rc = tls_set_sw_offload(sk, 0); + rc = tls_set_sw_offload(sk, 0, NULL); if (rc) goto release_ctx; diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index f9e3d3d90dcf..03d508a45aae 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -37,17 +37,6 @@ #include "tls.h" -static void chain_to_walk(struct scatterlist *sg, struct scatter_walk *walk) -{ - struct scatterlist *src = walk->sg; - int diff = walk->offset - src->offset; - - sg_set_page(sg, sg_page(src), - src->length - diff, walk->offset); - - scatterwalk_crypto_chain(sg, sg_next(src), 2); -} - static int tls_enc_record(struct aead_request *aead_req, struct crypto_aead *aead, char *aad, char *iv, __be64 rcd_sn, @@ -69,16 +58,13 @@ static int tls_enc_record(struct aead_request *aead_req, buf_size = TLS_HEADER_SIZE + cipher_desc->iv; len = min_t(int, *in_len, buf_size); - scatterwalk_copychunks(buf, in, len, 0); - scatterwalk_copychunks(buf, out, len, 1); + memcpy_from_scatterwalk(buf, in, len); + memcpy_to_scatterwalk(out, buf, len); *in_len -= len; if (!*in_len) return 0; - scatterwalk_pagedone(in, 0, 1); - scatterwalk_pagedone(out, 1, 1); - len = buf[4] | (buf[3] << 8); len -= cipher_desc->iv; @@ -90,8 +76,8 @@ static int tls_enc_record(struct aead_request *aead_req, sg_init_table(sg_out, ARRAY_SIZE(sg_out)); sg_set_buf(sg_in, aad, TLS_AAD_SPACE_SIZE); sg_set_buf(sg_out, aad, TLS_AAD_SPACE_SIZE); - chain_to_walk(sg_in + 1, in); - chain_to_walk(sg_out + 1, out); + scatterwalk_get_sglist(in, sg_in + 1); + scatterwalk_get_sglist(out, sg_out + 1); *in_len -= len; if (*in_len < 0) { @@ -110,10 +96,8 @@ static int tls_enc_record(struct aead_request *aead_req, } if (*in_len) { - scatterwalk_copychunks(NULL, in, len, 2); - scatterwalk_pagedone(in, 0, 1); - scatterwalk_copychunks(NULL, out, len, 2); - scatterwalk_pagedone(out, 1, 1); + scatterwalk_skip(in, len); + scatterwalk_skip(out, len); } len -= cipher_desc->tag; @@ -162,9 +146,6 @@ static int tls_enc_records(struct aead_request *aead_req, } while (rc == 0 && len); - scatterwalk_done(&in, 0, 0); - scatterwalk_done(&out, 1, 0); - return rc; } diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 6b4b9f2749a6..a3ccb3135e51 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -423,9 +423,10 @@ static __poll_t tls_sk_poll(struct file *file, struct socket *sock, ctx = tls_sw_ctx_rx(tls_ctx); psock = sk_psock_get(sk); - if (skb_queue_empty_lockless(&ctx->rx_list) && - !tls_strp_msg_ready(ctx) && - sk_psock_queue_empty(psock)) + if ((skb_queue_empty_lockless(&ctx->rx_list) && + !tls_strp_msg_ready(ctx) && + sk_psock_queue_empty(psock)) || + READ_ONCE(ctx->key_update_pending)) mask &= ~(EPOLLIN | EPOLLRDNORM); if (psock) @@ -612,11 +613,13 @@ static int validate_crypto_info(const struct tls_crypto_info *crypto_info, static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, unsigned int optlen, int tx) { - struct tls_crypto_info 
*crypto_info; - struct tls_crypto_info *alt_crypto_info; + struct tls_crypto_info *crypto_info, *alt_crypto_info; + struct tls_crypto_info *old_crypto_info = NULL; struct tls_context *ctx = tls_get_ctx(sk); const struct tls_cipher_desc *cipher_desc; union tls_crypto_context *crypto_ctx; + union tls_crypto_context tmp = {}; + bool update = false; int rc = 0; int conf; @@ -633,9 +636,21 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, crypto_info = &crypto_ctx->info; - /* Currently we don't support set crypto info more than one time */ - if (TLS_CRYPTO_INFO_READY(crypto_info)) - return -EBUSY; + if (TLS_CRYPTO_INFO_READY(crypto_info)) { + /* Currently we only support setting crypto info more + * than one time for TLS 1.3 + */ + if (crypto_info->version != TLS_1_3_VERSION) { + TLS_INC_STATS(sock_net(sk), tx ? LINUX_MIB_TLSTXREKEYERROR + : LINUX_MIB_TLSRXREKEYERROR); + return -EBUSY; + } + + update = true; + old_crypto_info = crypto_info; + crypto_info = &tmp.info; + crypto_ctx = &tmp; + } rc = copy_from_sockptr(crypto_info, optval, sizeof(*crypto_info)); if (rc) { @@ -643,7 +658,14 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, goto err_crypto_info; } - rc = validate_crypto_info(crypto_info, alt_crypto_info); + if (update) { + /* Ensure that TLS version and ciphers are not modified */ + if (crypto_info->version != old_crypto_info->version || + crypto_info->cipher_type != old_crypto_info->cipher_type) + rc = -EINVAL; + } else { + rc = validate_crypto_info(crypto_info, alt_crypto_info); + } if (rc) goto err_crypto_info; @@ -673,11 +695,17 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXDEVICE); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE); } else { - rc = tls_set_sw_offload(sk, 1); + rc = tls_set_sw_offload(sk, 1, + update ? crypto_info : NULL); if (rc) goto err_crypto_info; - TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXSW); - TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW); + + if (update) { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXREKEYOK); + } else { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXSW); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW); + } conf = TLS_SW; } } else { @@ -687,14 +715,21 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICE); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE); } else { - rc = tls_set_sw_offload(sk, 0); + rc = tls_set_sw_offload(sk, 0, + update ? crypto_info : NULL); if (rc) goto err_crypto_info; - TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXSW); - TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW); + + if (update) { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXREKEYOK); + } else { + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXSW); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW); + } conf = TLS_SW; } - tls_sw_strparser_arm(sk, ctx); + if (!update) + tls_sw_strparser_arm(sk, ctx); } if (tx) @@ -702,6 +737,10 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, else ctx->rx_conf = conf; update_sk_prot(sk, ctx); + + if (update) + return 0; + if (tx) { ctx->sk_write_space = sk->sk_write_space; sk->sk_write_space = tls_write_space; @@ -713,6 +752,10 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, return 0; err_crypto_info: + if (update) { + TLS_INC_STATS(sock_net(sk), tx ? 
LINUX_MIB_TLSTXREKEYERROR + : LINUX_MIB_TLSRXREKEYERROR); + } memzero_explicit(crypto_ctx, sizeof(*crypto_ctx)); return rc; } @@ -809,6 +852,11 @@ static int tls_setsockopt(struct sock *sk, int level, int optname, return do_tls_setsockopt(sk, optname, optval, optlen); } +static int tls_disconnect(struct sock *sk, int flags) +{ + return -EOPNOTSUPP; +} + struct tls_context *tls_ctx_create(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -904,6 +952,7 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_BASE][TLS_BASE] = *base; prot[TLS_BASE][TLS_BASE].setsockopt = tls_setsockopt; prot[TLS_BASE][TLS_BASE].getsockopt = tls_getsockopt; + prot[TLS_BASE][TLS_BASE].disconnect = tls_disconnect; prot[TLS_BASE][TLS_BASE].close = tls_sk_proto_close; prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; @@ -1014,7 +1063,7 @@ static u16 tls_user_config(struct tls_context *ctx, bool tx) return 0; } -static int tls_get_info(struct sock *sk, struct sk_buff *skb) +static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin) { u16 version, cipher_type; struct tls_context *ctx; @@ -1072,7 +1121,7 @@ nla_failure: return err; } -static size_t tls_get_info_size(const struct sock *sk) +static size_t tls_get_info_size(const struct sock *sk, bool net_admin) { size_t size = 0; diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c index 68982728f620..367666aa07b8 100644 --- a/net/tls/tls_proc.c +++ b/net/tls/tls_proc.c @@ -22,6 +22,11 @@ static const struct snmp_mib tls_mib_list[] = { SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC), SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIB_TLSDECRYPTRETRY), SNMP_MIB_ITEM("TlsRxNoPadViolation", LINUX_MIB_TLSRXNOPADVIOL), + SNMP_MIB_ITEM("TlsRxRekeyOk", LINUX_MIB_TLSRXREKEYOK), + SNMP_MIB_ITEM("TlsRxRekeyError", LINUX_MIB_TLSRXREKEYERROR), + SNMP_MIB_ITEM("TlsTxRekeyOk", LINUX_MIB_TLSTXREKEYOK), + SNMP_MIB_ITEM("TlsTxRekeyError", LINUX_MIB_TLSTXREKEYERROR), + SNMP_MIB_ITEM("TlsRxRekeyReceived", LINUX_MIB_TLSRXREKEYRECEIVED), SNMP_MIB_SENTINEL }; diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index 77e33e1e340e..65b0da6fdf6a 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -396,7 +396,6 @@ static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort) return 0; shinfo = skb_shinfo(strp->anchor); - shinfo->frag_list = NULL; /* If we don't know the length go max plus page for cipher overhead */ need_spc = strp->stm.full_len ?: TLS_MAX_PAYLOAD_SIZE + PAGE_SIZE; @@ -412,6 +411,8 @@ static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort) page, 0, 0); } + shinfo->frag_list = NULL; + strp->copy_mode = 1; strp->stm.offset = 0; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 7bcc9b4408a2..fc88e34b7f33 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -908,6 +908,13 @@ more_data: &msg_redir, send, flags); lock_sock(sk); if (err < 0) { + /* Regardless of whether the data represented by + * msg_redir is sent successfully, we have already + * uncharged it via sk_msg_return_zero(). The + * msg->sg.size represents the remaining unprocessed + * data, which needs to be uncharged here. 
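The TLS rekey hunks in this file gate readiness on a pending-rekey flag: poll and the recv path read it locklessly, and it is written when a KeyUpdate handshake record is seen. A sketch of that gating, assuming a hypothetical rx context that carries only the flag (the real struct tls_sw_context_rx holds much more):

#include <linux/compiler.h>
#include <linux/poll.h>

struct demo_rx_ctx {
	bool key_update_pending;
};

static __poll_t demo_poll_mask(struct demo_rx_ctx *ctx, __poll_t mask)
{
	/* Until userspace installs the fresh key, readable events are
	 * suppressed so applications don't spin on records that cannot
	 * be decrypted with the old key.
	 */
	if (READ_ONCE(ctx->key_update_pending))
		mask &= ~(EPOLLIN | EPOLLRDNORM);

	return mask;
}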
+ */ + sk_mem_uncharge(sk, msg->sg.size); *copied -= sk_msg_free_nocharge(sk, &msg_redir); msg->sg.size = 0; } @@ -1120,9 +1127,13 @@ alloc_encrypted: num_async++; else if (ret == -ENOMEM) goto wait_for_memory; - else if (ctx->open_rec && ret == -ENOSPC) + else if (ctx->open_rec && ret == -ENOSPC) { + if (msg_pl->cork_bytes) { + ret = 0; + goto send_end; + } goto rollback_iter; - else if (ret != -EAGAIN) + } else if (ret != -EAGAIN) goto send_end; } continue; @@ -1314,6 +1325,10 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, int ret = 0; long timeo; + /* a rekey is pending, let userspace deal with it */ + if (unlikely(ctx->key_update_pending)) + return -EKEYEXPIRED; + timeo = sock_rcvtimeo(sk, nonblock); while (!tls_strp_msg_ready(ctx)) { @@ -1720,6 +1735,36 @@ tls_decrypt_device(struct sock *sk, struct msghdr *msg, return 1; } +static int tls_check_pending_rekey(struct sock *sk, struct tls_context *ctx, + struct sk_buff *skb) +{ + const struct strp_msg *rxm = strp_msg(skb); + const struct tls_msg *tlm = tls_msg(skb); + char hs_type; + int err; + + if (likely(tlm->control != TLS_RECORD_TYPE_HANDSHAKE)) + return 0; + + if (rxm->full_len < 1) + return 0; + + err = skb_copy_bits(skb, rxm->offset, &hs_type, 1); + if (err < 0) { + DEBUG_NET_WARN_ON_ONCE(1); + return err; + } + + if (hs_type == TLS_HANDSHAKE_KEYUPDATE) { + struct tls_sw_context_rx *rx_ctx = ctx->priv_ctx_rx; + + WRITE_ONCE(rx_ctx->key_update_pending, true); + TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXREKEYRECEIVED); + } + + return 0; +} + static int tls_rx_one_record(struct sock *sk, struct msghdr *msg, struct tls_decrypt_arg *darg) { @@ -1739,7 +1784,7 @@ static int tls_rx_one_record(struct sock *sk, struct msghdr *msg, rxm->full_len -= prot->overhead_size; tls_advance_record_sn(sk, prot, &tls_ctx->rx); - return 0; + return tls_check_pending_rekey(sk, tls_ctx, darg->skb); } int decrypt_skb(struct sock *sk, struct scatterlist *sgout) @@ -2684,12 +2729,22 @@ int init_prot_info(struct tls_prot_info *prot, return 0; } -int tls_set_sw_offload(struct sock *sk, int tx) +static void tls_finish_key_update(struct sock *sk, struct tls_context *tls_ctx) { + struct tls_sw_context_rx *ctx = tls_ctx->priv_ctx_rx; + + WRITE_ONCE(ctx->key_update_pending, false); + /* wake-up pre-existing poll() */ + ctx->saved_data_ready(sk); +} + +int tls_set_sw_offload(struct sock *sk, int tx, + struct tls_crypto_info *new_crypto_info) +{ + struct tls_crypto_info *crypto_info, *src_crypto_info; struct tls_sw_context_tx *sw_ctx_tx = NULL; struct tls_sw_context_rx *sw_ctx_rx = NULL; const struct tls_cipher_desc *cipher_desc; - struct tls_crypto_info *crypto_info; char *iv, *rec_seq, *key, *salt; struct cipher_context *cctx; struct tls_prot_info *prot; @@ -2701,44 +2756,47 @@ int tls_set_sw_offload(struct sock *sk, int tx) ctx = tls_get_ctx(sk); prot = &ctx->prot_info; - if (tx) { - ctx->priv_ctx_tx = init_ctx_tx(ctx, sk); - if (!ctx->priv_ctx_tx) - return -ENOMEM; + /* new_crypto_info != NULL means rekey */ + if (!new_crypto_info) { + if (tx) { + ctx->priv_ctx_tx = init_ctx_tx(ctx, sk); + if (!ctx->priv_ctx_tx) + return -ENOMEM; + } else { + ctx->priv_ctx_rx = init_ctx_rx(ctx); + if (!ctx->priv_ctx_rx) + return -ENOMEM; + } + } + if (tx) { sw_ctx_tx = ctx->priv_ctx_tx; crypto_info = &ctx->crypto_send.info; cctx = &ctx->tx; aead = &sw_ctx_tx->aead_send; } else { - ctx->priv_ctx_rx = init_ctx_rx(ctx); - if (!ctx->priv_ctx_rx) - return -ENOMEM; - sw_ctx_rx = ctx->priv_ctx_rx; crypto_info = &ctx->crypto_recv.info; cctx = &ctx->rx; aead 
= &sw_ctx_rx->aead_recv; } - cipher_desc = get_cipher_desc(crypto_info->cipher_type); + src_crypto_info = new_crypto_info ?: crypto_info; + + cipher_desc = get_cipher_desc(src_crypto_info->cipher_type); if (!cipher_desc) { rc = -EINVAL; goto free_priv; } - rc = init_prot_info(prot, crypto_info, cipher_desc); + rc = init_prot_info(prot, src_crypto_info, cipher_desc); if (rc) goto free_priv; - iv = crypto_info_iv(crypto_info, cipher_desc); - key = crypto_info_key(crypto_info, cipher_desc); - salt = crypto_info_salt(crypto_info, cipher_desc); - rec_seq = crypto_info_rec_seq(crypto_info, cipher_desc); - - memcpy(cctx->iv, salt, cipher_desc->salt); - memcpy(cctx->iv + cipher_desc->salt, iv, cipher_desc->iv); - memcpy(cctx->rec_seq, rec_seq, cipher_desc->rec_seq); + iv = crypto_info_iv(src_crypto_info, cipher_desc); + key = crypto_info_key(src_crypto_info, cipher_desc); + salt = crypto_info_salt(src_crypto_info, cipher_desc); + rec_seq = crypto_info_rec_seq(src_crypto_info, cipher_desc); if (!*aead) { *aead = crypto_alloc_aead(cipher_desc->cipher_name, 0, 0); @@ -2751,20 +2809,30 @@ int tls_set_sw_offload(struct sock *sk, int tx) ctx->push_pending_record = tls_sw_push_pending_record; + /* setkey is the last operation that could fail during a + * rekey. if it succeeds, we can start modifying the + * context. + */ rc = crypto_aead_setkey(*aead, key, cipher_desc->key); - if (rc) - goto free_aead; + if (rc) { + if (new_crypto_info) + goto out; + else + goto free_aead; + } - rc = crypto_aead_setauthsize(*aead, prot->tag_size); - if (rc) - goto free_aead; + if (!new_crypto_info) { + rc = crypto_aead_setauthsize(*aead, prot->tag_size); + if (rc) + goto free_aead; + } - if (sw_ctx_rx) { + if (!tx && !new_crypto_info) { tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv); tls_update_rx_zc_capable(ctx); sw_ctx_rx->async_capable = - crypto_info->version != TLS_1_3_VERSION && + src_crypto_info->version != TLS_1_3_VERSION && !!(tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC); rc = tls_strp_init(&sw_ctx_rx->strp, sk); @@ -2772,18 +2840,33 @@ int tls_set_sw_offload(struct sock *sk, int tx) goto free_aead; } + memcpy(cctx->iv, salt, cipher_desc->salt); + memcpy(cctx->iv + cipher_desc->salt, iv, cipher_desc->iv); + memcpy(cctx->rec_seq, rec_seq, cipher_desc->rec_seq); + + if (new_crypto_info) { + unsafe_memcpy(crypto_info, new_crypto_info, + cipher_desc->crypto_info, + /* size was checked in do_tls_setsockopt_conf */); + memzero_explicit(new_crypto_info, cipher_desc->crypto_info); + if (!tx) + tls_finish_key_update(sk, ctx); + } + goto out; free_aead: crypto_free_aead(*aead); *aead = NULL; free_priv: - if (tx) { - kfree(ctx->priv_ctx_tx); - ctx->priv_ctx_tx = NULL; - } else { - kfree(ctx->priv_ctx_rx); - ctx->priv_ctx_rx = NULL; + if (!new_crypto_info) { + if (tx) { + kfree(ctx->priv_ctx_tx); + ctx->priv_ctx_tx = NULL; + } else { + kfree(ctx->priv_ctx_rx); + ctx->priv_ctx_rx = NULL; + } } out: return rc; diff --git a/net/unix/Kconfig b/net/unix/Kconfig index 8b5d04210d7c..6f1783c1659b 100644 --- a/net/unix/Kconfig +++ b/net/unix/Kconfig @@ -17,9 +17,11 @@ config UNIX Say Y unless you know what you are doing. config AF_UNIX_OOB - bool + bool "UNIX: out-of-bound messages" depends on UNIX default y + help + Support for MSG_OOB in UNIX domain sockets. If unsure, say Y. 
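The tls_set_sw_offload() rework above leans on the ordering rule spelled out in its new comment: crypto_aead_setkey() is the last operation that can fail, so the live fields (IV, record sequence, crypto_info) are written only after it succeeds, leaving the old state intact if a rekey fails. A reduced sketch of that shape, with illustrative parameters:

#include <crypto/aead.h>
#include <linux/string.h>

static int demo_rekey(struct crypto_aead *aead,
		      const u8 *key, unsigned int keylen,
		      u8 *live_iv, const u8 *new_iv, unsigned int ivlen)
{
	int rc;

	/* Fallible step first: on failure the live IV and sequence
	 * state are untouched and the old key remains usable.
	 */
	rc = crypto_aead_setkey(aead, key, keylen);
	if (rc)
		return rc;

	/* Commit phase: nothing below can fail. */
	memcpy(live_iv, new_iv, ivlen);
	return 0;
}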
config UNIX_DIAG tristate "UNIX: socket monitoring interface" diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6b1762300443..2e2e9997a68e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -77,46 +77,40 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/signal.h> -#include <linux/sched/signal.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/stat.h> +#include <linux/bpf-cgroup.h> +#include <linux/btf_ids.h> #include <linux/dcache.h> -#include <linux/namei.h> -#include <linux/socket.h> -#include <linux/un.h> +#include <linux/errno.h> #include <linux/fcntl.h> +#include <linux/file.h> #include <linux/filter.h> -#include <linux/termios.h> -#include <linux/sockios.h> -#include <linux/net.h> -#include <linux/in.h> #include <linux/fs.h> -#include <linux/slab.h> -#include <linux/uaccess.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <net/net_namespace.h> -#include <net/sock.h> -#include <net/tcp_states.h> -#include <net/af_unix.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <net/scm.h> +#include <linux/fs_struct.h> #include <linux/init.h> -#include <linux/poll.h> -#include <linux/rtnetlink.h> +#include <linux/kernel.h> #include <linux/mount.h> -#include <net/checksum.h> +#include <linux/namei.h> +#include <linux/net.h> +#include <linux/pidfs.h> +#include <linux/poll.h> +#include <linux/proc_fs.h> +#include <linux/sched/signal.h> #include <linux/security.h> +#include <linux/seq_file.h> +#include <linux/skbuff.h> +#include <linux/slab.h> +#include <linux/socket.h> #include <linux/splice.h> -#include <linux/freezer.h> -#include <linux/file.h> -#include <linux/btf_ids.h> -#include <linux/bpf-cgroup.h> +#include <linux/string.h> +#include <linux/uaccess.h> +#include <net/af_unix.h> +#include <net/net_namespace.h> +#include <net/scm.h> +#include <net/tcp_states.h> +#include <uapi/linux/sockios.h> +#include <uapi/linux/termios.h> + +#include "af_unix.h" static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; @@ -286,14 +280,9 @@ static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) } #endif /* CONFIG_SECURITY_NETWORK */ -static inline int unix_our_peer(struct sock *sk, struct sock *osk) -{ - return unix_peer(osk) == sk; -} - static inline int unix_may_send(struct sock *sk, struct sock *osk) { - return unix_peer(osk) == NULL || unix_our_peer(sk, osk); + return !unix_peer(osk) || unix_peer(osk) == sk; } static inline int unix_recvq_full_lockless(const struct sock *sk) @@ -627,7 +616,9 @@ static void unix_write_space(struct sock *sk) static void unix_dgram_disconnected(struct sock *sk, struct sock *other) { if (!skb_queue_empty(&sk->sk_receive_queue)) { - skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge_reason(&sk->sk_receive_queue, + SKB_DROP_REASON_UNIX_DISCONNECT); + wake_up_interruptible_all(&unix_sk(sk)->peer_wait); /* If one link of bidirectional dgram pipe is disconnected, @@ -645,7 +636,7 @@ static void unix_sock_destructor(struct sock *sk) { struct unix_sock *u = unix_sk(sk); - skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE); DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); @@ -655,6 +646,9 @@ static void unix_sock_destructor(struct sock *sk) return; } + if (sk->sk_peer_pid) + pidfs_put_pid(sk->sk_peer_pid); + if (u->addr) 
unix_release_addr(u->addr); @@ -720,8 +714,8 @@ static void unix_release_sock(struct sock *sk, int embrion) if (state == TCP_LISTEN) unix_release_sock(skb->sk, 1); - /* passed fds are erased in the kfree_skb hook */ - kfree_skb(skb); + /* passed fds are erased in the kfree_skb hook */ + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); } if (path.dentry) @@ -746,13 +740,48 @@ static void unix_release_sock(struct sock *sk, int embrion) unix_gc(); /* Garbage collect fds */ } -static void init_peercred(struct sock *sk) +struct unix_peercred { + struct pid *peer_pid; + const struct cred *peer_cred; +}; + +static inline int prepare_peercred(struct unix_peercred *peercred) +{ + struct pid *pid; + int err; + + pid = task_tgid(current); + err = pidfs_register_pid(pid); + if (likely(!err)) { + peercred->peer_pid = get_pid(pid); + peercred->peer_cred = get_current_cred(); + } + return err; +} + +static void drop_peercred(struct unix_peercred *peercred) { - sk->sk_peer_pid = get_pid(task_tgid(current)); - sk->sk_peer_cred = get_current_cred(); + const struct cred *cred = NULL; + struct pid *pid = NULL; + + might_sleep(); + + swap(peercred->peer_pid, pid); + swap(peercred->peer_cred, cred); + + pidfs_put_pid(pid); + put_pid(pid); + put_cred(cred); } -static void update_peercred(struct sock *sk) +static inline void init_peercred(struct sock *sk, + const struct unix_peercred *peercred) +{ + sk->sk_peer_pid = peercred->peer_pid; + sk->sk_peer_cred = peercred->peer_cred; +} + +static void update_peercred(struct sock *sk, struct unix_peercred *peercred) { const struct cred *old_cred; struct pid *old_pid; @@ -760,11 +789,11 @@ static void update_peercred(struct sock *sk) spin_lock(&sk->sk_peer_lock); old_pid = sk->sk_peer_pid; old_cred = sk->sk_peer_cred; - init_peercred(sk); + init_peercred(sk, peercred); spin_unlock(&sk->sk_peer_lock); - put_pid(old_pid); - put_cred(old_cred); + peercred->peer_pid = old_pid; + peercred->peer_cred = old_cred; } static void copy_peercred(struct sock *sk, struct sock *peersk) @@ -773,15 +802,22 @@ static void copy_peercred(struct sock *sk, struct sock *peersk) spin_lock(&sk->sk_peer_lock); sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); + pidfs_get_pid(sk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); } +static bool unix_may_passcred(const struct sock *sk) +{ + return sk->sk_scm_credentials || sk->sk_scm_pidfd; +} + static int unix_listen(struct socket *sock, int backlog) { int err; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); + struct unix_peercred peercred = {}; err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) @@ -789,6 +825,9 @@ static int unix_listen(struct socket *sock, int backlog) err = -EINVAL; if (!READ_ONCE(u->addr)) goto out; /* No listens on an unbound socket */ + err = prepare_peercred(&peercred); + if (err) + goto out; unix_state_lock(sk); if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) goto out_unlock; @@ -798,11 +837,12 @@ static int unix_listen(struct socket *sock, int backlog) WRITE_ONCE(sk->sk_state, TCP_LISTEN); /* set credentials so connect can copy them */ - update_peercred(sk); + update_peercred(sk, &peercred); err = 0; out_unlock: unix_state_unlock(sk); + drop_peercred(&peercred); out: return err; } @@ -962,13 +1002,6 @@ static void unix_close(struct sock *sk, long timeout) */ } -static void unix_unhash(struct sock *sk) -{ - /* Nothing to do here, unix socket does not need a ->unhash(). - * This is merely for sockmap. 
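The peer-credential rework earlier in this hunk splits the old init_peercred() into prepare (may fail, runs unlocked), publish (under sk_peer_lock), and drop (sleepable, after unlocking). A sketch of the publish/drop split, assuming a hypothetical demo_peercred holder whose references were taken in a prepare step:

#include <linux/cred.h>
#include <linux/pid.h>
#include <net/sock.h>

struct demo_peercred {
	struct pid *pid;
	const struct cred *cred;
};

static void demo_update_peercred(struct sock *sk, struct demo_peercred *pc)
{
	struct pid *old_pid;
	const struct cred *old_cred;

	/* Publish the pre-prepared references under the spinlock... */
	spin_lock(&sk->sk_peer_lock);
	old_pid = sk->sk_peer_pid;
	old_cred = sk->sk_peer_cred;
	sk->sk_peer_pid = pc->pid;
	sk->sk_peer_cred = pc->cred;
	spin_unlock(&sk->sk_peer_lock);

	/* ...and release the old ones outside it, where put_pid() and
	 * put_cred() are free to do heavier work.
	 */
	put_pid(old_pid);
	put_cred(old_cred);
}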
- */ -} - static bool unix_bpf_bypass_getsockopt(int level, int optname) { if (level == SOL_SOCKET) { @@ -999,7 +1032,6 @@ struct proto unix_stream_proto = { .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, - .unhash = unix_unhash, .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = unix_stream_bpf_update_proto, @@ -1030,6 +1062,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sock_init_data(sock, sk); + sk->sk_scm_rights = 1; sk->sk_hash = unix_unbound_hash(sk); sk->sk_allocation = GFP_KERNEL_ACCOUNT; sk->sk_write_space = unix_write_space; @@ -1113,7 +1146,7 @@ static int unix_release(struct socket *sock) } static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, - int type) + int type, int flags) { struct inode *inode; struct path path; @@ -1121,13 +1154,39 @@ static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, int err; unix_mkname_bsd(sunaddr, addr_len); - err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); - if (err) - goto fail; - err = path_permission(&path, MAY_WRITE); - if (err) - goto path_put; + if (flags & SOCK_COREDUMP) { + const struct cred *cred; + struct cred *kcred; + struct path root; + + kcred = prepare_kernel_cred(&init_task); + if (!kcred) { + err = -ENOMEM; + goto fail; + } + + task_lock(&init_task); + get_fs_root(init_task.fs, &root); + task_unlock(&init_task); + + cred = override_creds(kcred); + err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path, + LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS | + LOOKUP_NO_MAGICLINKS, &path); + put_cred(revert_creds(cred)); + path_put(&root); + if (err) + goto fail; + } else { + err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); + if (err) + goto fail; + + err = path_permission(&path, MAY_WRITE); + if (err) + goto path_put; + } err = -ECONNREFUSED; inode = d_backing_inode(path.dentry); @@ -1177,12 +1236,12 @@ static struct sock *unix_find_abstract(struct net *net, static struct sock *unix_find_other(struct net *net, struct sockaddr_un *sunaddr, - int addr_len, int type) + int addr_len, int type, int flags) { struct sock *sk; if (sunaddr->sun_path[0]) - sk = unix_find_bsd(sunaddr, addr_len, type); + sk = unix_find_bsd(sunaddr, addr_len, type, flags); else sk = unix_find_abstract(net, sunaddr, addr_len, type); @@ -1431,16 +1490,14 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, if (err) goto out; - if ((test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) && - !READ_ONCE(unix_sk(sk)->addr)) { + if (unix_may_passcred(sk) && !READ_ONCE(unix_sk(sk)->addr)) { err = unix_autobind(sk); if (err) goto out; } restart: - other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); + other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type, 0); if (IS_ERR(other)) { err = PTR_ERR(other); goto out; @@ -1511,7 +1568,6 @@ out: } static long unix_wait_for_peer(struct sock *other, long timeo) - __releases(&unix_sk(other)->lock) { struct unix_sock *u = unix_sk(other); int sched; @@ -1538,6 +1594,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; struct unix_sock *u = unix_sk(sk), *newu, *otheru; + struct unix_peercred peercred = {}; struct net *net = sock_net(sk); struct sk_buff *skb = NULL; unsigned char state; @@ -1552,9 +1609,7 @@ static int 
unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (err) goto out; - if ((test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) && - !READ_ONCE(u->addr)) { + if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) { err = unix_autobind(sk); if (err) goto out; @@ -1563,32 +1618,34 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); /* First of all allocate resources. - If we will make it after state is locked, - we will have to recheck all again in any case. + * If we will make it after state is locked, + * we will have to recheck all again in any case. */ /* create new sock for complete connection */ newsk = unix_create1(net, NULL, 0, sock->type); if (IS_ERR(newsk)) { err = PTR_ERR(newsk); - newsk = NULL; goto out; } - err = -ENOMEM; + err = prepare_peercred(&peercred); + if (err) + goto out; /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); - if (skb == NULL) - goto out; + if (!skb) { + err = -ENOMEM; + goto out_free_sk; + } restart: /* Find listening sock. */ - other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); + other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, flags); if (IS_ERR(other)) { err = PTR_ERR(other); - other = NULL; - goto out; + goto out_free_skb; } unix_state_lock(other); @@ -1600,23 +1657,25 @@ restart: goto restart; } - err = -ECONNREFUSED; - if (other->sk_state != TCP_LISTEN) - goto out_unlock; - if (other->sk_shutdown & RCV_SHUTDOWN) + if (other->sk_state != TCP_LISTEN || + other->sk_shutdown & RCV_SHUTDOWN) { + err = -ECONNREFUSED; goto out_unlock; + } if (unix_recvq_full_lockless(other)) { - err = -EAGAIN; - if (!timeo) + if (!timeo) { + err = -EAGAIN; goto out_unlock; + } timeo = unix_wait_for_peer(other, timeo); + sock_put(other); err = sock_intr_errno(timeo); if (signal_pending(current)) - goto out; - sock_put(other); + goto out_free_skb; + goto restart; } @@ -1646,10 +1705,12 @@ restart: /* The way is open! Fastly set all the necessary fields... 
*/ sock_hold(sk); - unix_peer(newsk) = sk; - newsk->sk_state = TCP_ESTABLISHED; - newsk->sk_type = sk->sk_type; - init_peercred(newsk); + unix_peer(newsk) = sk; + newsk->sk_state = TCP_ESTABLISHED; + newsk->sk_type = sk->sk_type; + newsk->sk_scm_recv_flags = other->sk_scm_recv_flags; + init_peercred(newsk, &peercred); + newu = unix_sk(newsk); newu->listener = other; RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); @@ -1701,29 +1762,40 @@ restart: return 0; out_unlock: - if (other) - unix_state_unlock(other); - + unix_state_unlock(other); + sock_put(other); +out_free_skb: + consume_skb(skb); +out_free_sk: + unix_release_sock(newsk, 0); out: - kfree_skb(skb); - if (newsk) - unix_release_sock(newsk, 0); - if (other) - sock_put(other); + drop_peercred(&peercred); return err; } static int unix_socketpair(struct socket *socka, struct socket *sockb) { + struct unix_peercred ska_peercred = {}, skb_peercred = {}; struct sock *ska = socka->sk, *skb = sockb->sk; + int err; + + err = prepare_peercred(&ska_peercred); + if (err) + return err; + + err = prepare_peercred(&skb_peercred); + if (err) { + drop_peercred(&ska_peercred); + return err; + } /* Join our sockets back to back */ sock_hold(ska); sock_hold(skb); unix_peer(ska) = skb; unix_peer(skb) = ska; - init_peercred(ska); - init_peercred(skb); + init_peercred(ska, &ska_peercred); + init_peercred(skb, &skb_peercred); ska->sk_state = TCP_ESTABLISHED; skb->sk_state = TCP_ESTABLISHED; @@ -1732,17 +1804,6 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb) return 0; } -static void unix_sock_inherit_flags(const struct socket *old, - struct socket *new) -{ - if (test_bit(SOCK_PASSCRED, &old->flags)) - set_bit(SOCK_PASSCRED, &new->flags); - if (test_bit(SOCK_PASSPIDFD, &old->flags)) - set_bit(SOCK_PASSPIDFD, &new->flags); - if (test_bit(SOCK_PASSSEC, &old->flags)) - set_bit(SOCK_PASSSEC, &new->flags); -} - static int unix_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { @@ -1779,7 +1840,6 @@ static int unix_accept(struct socket *sock, struct socket *newsock, unix_state_lock(tsk); unix_update_edges(unix_sk(tsk)); newsock->state = SS_CONNECTED; - unix_sock_inherit_flags(sock, newsock); sock_graft(tsk, newsock); unix_state_unlock(tsk); return 0; @@ -1888,7 +1948,7 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen { int err = 0; - UNIXCB(skb).pid = get_pid(scm->pid); + UNIXCB(skb).pid = get_pid(scm->pid); UNIXCB(skb).uid = scm->creds.uid; UNIXCB(skb).gid = scm->creds.gid; UNIXCB(skb).fp = NULL; @@ -1900,28 +1960,19 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen return err; } -static bool unix_passcred_enabled(const struct socket *sock, - const struct sock *other) -{ - return test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags) || - !other->sk_socket || - test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || - test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); -} - /* * Some apps rely on write() giving SCM_CREDENTIALS * We include credentials if source or destination socket * asserted SOCK_PASSCRED. 
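unix_socketpair() above now prepares both sides' credentials before joining the sockets, rolling back the first preparation if the second fails. A minimal sketch of that prepare-both-then-commit shape with hypothetical names; note the real prepare step can fail (e.g. in pidfs_register_pid()), which is simplified away here:

#include <linux/cred.h>
#include <linux/pid.h>
#include <linux/sched/signal.h>

struct demo_cred_pair {
	struct pid *pid;
	const struct cred *cred;
};

static int demo_prepare(struct demo_cred_pair *pc)
{
	pc->pid = get_pid(task_tgid(current));
	pc->cred = get_current_cred();
	return 0;	/* simplification: the real helper can fail */
}

static void demo_drop(struct demo_cred_pair *pc)
{
	put_pid(pc->pid);
	put_cred(pc->cred);
}

static int demo_prepare_pair(struct demo_cred_pair *a, struct demo_cred_pair *b)
{
	int err;

	err = demo_prepare(a);
	if (err)
		return err;

	err = demo_prepare(b);
	if (err)
		demo_drop(a);	/* roll back side A before failing */

	return err;
}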
*/ -static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, - const struct sock *other) +static void unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk, + const struct sock *other) { if (UNIXCB(skb).pid) return; - if (unix_passcred_enabled(sock, other)) { - UNIXCB(skb).pid = get_pid(task_tgid(current)); + + if (unix_may_passcred(sk) || unix_may_passcred(other)) { + UNIXCB(skb).pid = get_pid(task_tgid(current)); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } } @@ -1964,7 +2015,6 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb) static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { - DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); struct sock *sk = sock->sk, *other = NULL; struct unix_sock *u = unix_sk(sk); struct scm_cookie scm; @@ -1980,12 +2030,13 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, wait_for_unix_gc(scm.fp); - err = -EOPNOTSUPP; - if (msg->msg_flags&MSG_OOB) + if (msg->msg_flags & MSG_OOB) { + err = -EOPNOTSUPP; goto out; + } if (msg->msg_namelen) { - err = unix_validate_addr(sunaddr, msg->msg_namelen); + err = unix_validate_addr(msg->msg_name, msg->msg_namelen); if (err) goto out; @@ -1995,25 +2046,18 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, NULL); if (err) goto out; - } else { - sunaddr = NULL; - err = -ENOTCONN; - other = unix_peer_get(sk); - if (!other) - goto out; } - if ((test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) && - !READ_ONCE(u->addr)) { + if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) { err = unix_autobind(sk); if (err) goto out; } - err = -EMSGSIZE; - if (len > READ_ONCE(sk->sk_sndbuf) - 32) + if (len > READ_ONCE(sk->sk_sndbuf) - 32) { + err = -EMSGSIZE; goto out; + } if (len > SKB_MAX_ALLOC) { data_len = min_t(size_t, @@ -2027,7 +2071,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, skb = sock_alloc_send_pskb(sk, len - data_len, data_len, msg->msg_flags & MSG_DONTWAIT, &err, PAGE_ALLOC_COSTLY_ORDER); - if (skb == NULL) + if (!skb) goto out; err = unix_scm_to_skb(&scm, skb, true); @@ -2043,17 +2087,18 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); -restart: - if (!other) { - err = -ECONNRESET; - if (sunaddr == NULL) - goto out_free; - - other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen, - sk->sk_type); + if (msg->msg_namelen) { +lookup: + other = unix_find_other(sock_net(sk), msg->msg_name, + msg->msg_namelen, sk->sk_type, 0); if (IS_ERR(other)) { err = PTR_ERR(other); - other = NULL; + goto out_free; + } + } else { + other = unix_peer_get(sk); + if (!other) { + err = -ENOTCONN; goto out_free; } } @@ -2061,36 +2106,37 @@ restart: if (sk_filter(other, skb) < 0) { /* Toss the packet but do not return any error to the sender */ err = len; - goto out_free; + goto out_sock_put; } +restart: sk_locked = 0; unix_state_lock(other); restart_locked: - err = -EPERM; - if (!unix_may_send(sk, other)) + + if (!unix_may_send(sk, other)) { + err = -EPERM; goto out_unlock; + } if (unlikely(sock_flag(other, SOCK_DEAD))) { - /* - * Check with 1003.1g - what should - * datagram error - */ - unix_state_unlock(other); - sock_put(other); + /* Check with 1003.1g - what should datagram error */ - if (!sk_locked) - unix_state_lock(sk); + unix_state_unlock(other); - err = 0; if (sk->sk_type == SOCK_SEQPACKET) { /* We are here only when racing with unix_release_sock() 
* is clearing @other. Never change state to TCP_CLOSE * unlike SOCK_DGRAM wants. */ - unix_state_unlock(sk); err = -EPIPE; - } else if (unix_peer(sk) == other) { + goto out_sock_put; + } + + if (!sk_locked) + unix_state_lock(sk); + + if (unix_peer(sk) == other) { unix_peer(sk) = NULL; unix_dgram_peer_wake_disconnect_wakeup(sk, other); @@ -2100,19 +2146,29 @@ restart_locked: unix_dgram_disconnected(sk, other); sock_put(other); err = -ECONNREFUSED; - } else { - unix_state_unlock(sk); + goto out_sock_put; } - other = NULL; - if (err) - goto out_free; - goto restart; + unix_state_unlock(sk); + + if (!msg->msg_namelen) { + err = -ECONNRESET; + goto out_sock_put; + } + + sock_put(other); + goto lookup; } - err = -EPIPE; - if (other->sk_shutdown & RCV_SHUTDOWN) + if (other->sk_shutdown & RCV_SHUTDOWN) { + err = -EPIPE; + goto out_unlock; + } + + if (UNIXCB(skb).fp && !other->sk_scm_rights) { + err = -EPERM; goto out_unlock; + } if (sk->sk_type != SOCK_SEQPACKET) { err = security_unix_may_send(sk->sk_socket, other->sk_socket); @@ -2132,7 +2188,7 @@ restart_locked: err = sock_intr_errno(timeo); if (signal_pending(current)) - goto out_free; + goto out_sock_put; goto restart; } @@ -2160,7 +2216,8 @@ restart_locked: if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); - maybe_add_creds(skb, sock, other); + + unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); @@ -2173,11 +2230,11 @@ out_unlock: if (sk_locked) unix_state_unlock(sk); unix_state_unlock(other); +out_sock_put: + sock_put(other); out_free: - kfree_skb(skb); + consume_skb(skb); out: - if (other) - sock_put(other); scm_destroy(&scm); return err; } @@ -2188,41 +2245,42 @@ out: #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) #if IS_ENABLED(CONFIG_AF_UNIX_OOB) -static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, +static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, struct scm_cookie *scm, bool fds_sent) { struct unix_sock *ousk = unix_sk(other); struct sk_buff *skb; - int err = 0; + int err; - skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); + skb = sock_alloc_send_skb(sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) return err; err = unix_scm_to_skb(scm, skb, !fds_sent); - if (err < 0) { - kfree_skb(skb); - return err; - } + if (err < 0) + goto out; + skb_put(skb, 1); err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); - if (err) { - kfree_skb(skb); - return err; - } + if (err) + goto out; unix_state_lock(other); if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) { - unix_state_unlock(other); - kfree_skb(skb); - return -EPIPE; + err = -EPIPE; + goto out_unlock; } - maybe_add_creds(skb, sock, other); + if (UNIXCB(skb).fp && !other->sk_scm_rights) { + err = -EPERM; + goto out_unlock; + } + + unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); spin_lock(&other->sk_receive_queue.lock); @@ -2234,6 +2292,11 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other unix_state_unlock(other); other->sk_data_ready(other); + return 0; +out_unlock: + unix_state_unlock(other); +out: + consume_skb(skb); return err; } #endif @@ -2242,13 +2305,11 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; + struct sk_buff *skb = NULL; struct sock *other = NULL; - int err, size; - struct sk_buff *skb; - int sent = 0; struct scm_cookie scm; bool 
fds_sent = false; - int data_len; + int err, sent = 0; err = scm_send(sock, msg, &scm, false); if (err < 0) @@ -2256,8 +2317,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, wait_for_unix_gc(scm.fp); - err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) { + err = -EOPNOTSUPP; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (len) len--; @@ -2270,17 +2331,19 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out_err; } else { - err = -ENOTCONN; other = unix_peer(sk); - if (!other) + if (!other) { + err = -ENOTCONN; goto out_err; + } } if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) - goto pipe_err; + goto out_pipe; while (sent < len) { - size = len - sent; + int size = len - sent; + int data_len; if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { skb = sock_alloc_send_pskb(sk, 0, 0, @@ -2306,20 +2369,18 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, /* Only send the fds in the first buffer */ err = unix_scm_to_skb(&scm, skb, !fds_sent); - if (err < 0) { - kfree_skb(skb); - goto out_err; - } + if (err < 0) + goto out_free; + fds_sent = true; if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { skb->ip_summed = CHECKSUM_UNNECESSARY; err = skb_splice_from_iter(skb, &msg->msg_iter, size, sk->sk_allocation); - if (err < 0) { - kfree_skb(skb); - goto out_err; - } + if (err < 0) + goto out_free; + size = err; refcount_add(size, &sk->sk_wmem_alloc); } else { @@ -2327,19 +2388,23 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, skb->data_len = data_len; skb->len = size; err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size); - if (err) { - kfree_skb(skb); - goto out_err; - } + if (err) + goto out_free; } unix_state_lock(other); if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) - goto pipe_err_free; + goto out_pipe_unlock; - maybe_add_creds(skb, sock, other); + if (UNIXCB(skb).fp && !other->sk_scm_rights) { + unix_state_unlock(other); + err = -EPERM; + goto out_free; + } + + unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); @@ -2349,7 +2414,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (msg->msg_flags & MSG_OOB) { - err = queue_oob(sock, msg, other, &scm, fds_sent); + err = queue_oob(sk, msg, other, &scm, fds_sent); if (err) goto out_err; sent++; @@ -2360,13 +2425,14 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, return sent; -pipe_err_free: +out_pipe_unlock: unix_state_unlock(other); - kfree_skb(skb); -pipe_err: - if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL)) +out_pipe: + if (!sent && !(msg->msg_flags & MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); err = -EPIPE; +out_free: + consume_skb(skb); out_err: scm_destroy(&scm); return sent ? 
: err; @@ -2699,7 +2765,7 @@ unlock: spin_unlock(&sk->sk_receive_queue.lock); consume_skb(read_skb); - kfree_skb(unread_skb); + kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB); return skb; } @@ -2728,7 +2794,7 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) if (sock_flag(sk, SOCK_DEAD)) { unix_state_unlock(sk); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); return -ECONNRESET; } @@ -2742,7 +2808,7 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) unix_state_unlock(sk); if (drop) { - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); return -EAGAIN; } } @@ -2870,8 +2936,7 @@ unlock: /* Never glue messages from different writers */ if (!unix_skb_scm_eq(skb, &scm)) break; - } else if (test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) { + } else if (unix_may_passcred(sk)) { /* Copy credentials */ scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); unix_set_secdata(&scm, skb); diff --git a/net/unix/af_unix.h b/net/unix/af_unix.h new file mode 100644 index 000000000000..59db179df9bb --- /dev/null +++ b/net/unix/af_unix.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __AF_UNIX_H +#define __AF_UNIX_H + +#include <linux/uidgid.h> + +#define UNIX_HASH_MOD (256 - 1) +#define UNIX_HASH_SIZE (256 * 2) +#define UNIX_HASH_BITS 8 + +struct sock *unix_peer_get(struct sock *sk); + +struct unix_skb_parms { + struct pid *pid; /* skb credentials */ + kuid_t uid; + kgid_t gid; + struct scm_fp_list *fp; /* Passed files */ +#ifdef CONFIG_SECURITY_NETWORK + u32 secid; /* Security ID */ +#endif + u32 consumed; +} __randomize_layout; + +#define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) + +/* GC for SCM_RIGHTS */ +extern unsigned int unix_tot_inflight; +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); +void unix_del_edges(struct scm_fp_list *fpl); +void unix_update_edges(struct unix_sock *receiver); +int unix_prepare_fpl(struct scm_fp_list *fpl); +void unix_destroy_fpl(struct scm_fp_list *fpl); +void unix_gc(void); +void wait_for_unix_gc(struct scm_fp_list *fpl); + +/* SOCK_DIAG */ +long unix_inq_len(struct sock *sk); +long unix_outq_len(struct sock *sk); + +/* sysctl */ +#ifdef CONFIG_SYSCTL +int unix_sysctl_register(struct net *net); +void unix_sysctl_unregister(struct net *net); +#else +static inline int unix_sysctl_register(struct net *net) +{ + return 0; +} + +static inline void unix_sysctl_unregister(struct net *net) +{ +} +#endif + +/* BPF SOCKMAP */ +int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, int flags); +int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, int flags); + +#ifdef CONFIG_BPF_SYSCALL +extern struct proto unix_dgram_proto; +extern struct proto unix_stream_proto; + +int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); +int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); +void __init unix_bpf_build_proto(void); +#else +static inline void __init unix_bpf_build_proto(void) +{ +} +#endif + +#endif diff --git a/net/unix/diag.c b/net/unix/diag.c index 9138af8b465e..79b182d0e62a 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -1,15 +1,17 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <linux/types.h> -#include <linux/spinlock.h> -#include <linux/sock_diag.h> -#include <linux/unix_diag.h> -#include <linux/skbuff.h> + +#include 
diff --git a/net/unix/diag.c b/net/unix/diag.c
index 9138af8b465e..79b182d0e62a 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -1,15 +1,17 @@
 // SPDX-License-Identifier: GPL-2.0-only
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <linux/sock_diag.h>
-#include <linux/unix_diag.h>
-#include <linux/skbuff.h>
+
+#include <linux/dcache.h>
 #include <linux/module.h>
-#include <linux/uidgid.h>
-#include <net/netlink.h>
+#include <linux/skbuff.h>
+#include <linux/sock_diag.h>
+#include <linux/types.h>
+#include <linux/user_namespace.h>
 #include <net/af_unix.h>
+#include <net/netlink.h>
 #include <net/tcp_states.h>
-#include <net/sock.h>
+#include <uapi/linux/unix_diag.h>
+
+#include "af_unix.h"
 
 static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb)
 {
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 0068e758be4d..01e2b9452c75 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -63,24 +63,33 @@
  * wrt receive and holding up unrelated socket operations.
  */
 
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/socket.h>
-#include <linux/un.h>
-#include <linux/net.h>
 #include <linux/fs.h>
+#include <linux/list.h>
 #include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/file.h>
-#include <linux/proc_fs.h>
-#include <linux/mutex.h>
-#include <linux/wait.h>
-
-#include <net/sock.h>
+#include <linux/socket.h>
+#include <linux/workqueue.h>
 #include <net/af_unix.h>
 #include <net/scm.h>
 #include <net/tcp_states.h>
 
+#include "af_unix.h"
+
+struct unix_vertex {
+	struct list_head edges;
+	struct list_head entry;
+	struct list_head scc_entry;
+	unsigned long out_degree;
+	unsigned long index;
+	unsigned long scc_index;
+};
+
+struct unix_edge {
+	struct unix_sock *predecessor;
+	struct unix_sock *successor;
+	struct list_head vertex_entry;
+	struct list_head stack_entry;
+};
+
 struct unix_sock *unix_get_socket(struct file *filp)
 {
 	struct inode *inode = file_inode(filp);
@@ -573,7 +582,7 @@ static void __unix_gc(struct work_struct *work)
 			UNIXCB(skb).fp->dead = true;
 	}
 
-	__skb_queue_purge(&hitlist);
+	__skb_queue_purge_reason(&hitlist, SKB_DROP_REASON_SOCKET_CLOSE);
 skip_gc:
 	WRITE_ONCE(gc_in_progress, false);
 }
diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c
index 357b3e5f3847..e02ed6e3955c 100644
--- a/net/unix/sysctl_net_unix.c
+++ b/net/unix/sysctl_net_unix.c
@@ -5,11 +5,13 @@
  *	Authors:	Mike Shaver.
  */
 
-#include <linux/mm.h>
 #include <linux/slab.h>
+#include <linux/string.h>
 #include <linux/sysctl.h>
-
 #include <net/af_unix.h>
+#include <net/net_namespace.h>
+
+#include "af_unix.h"
 
 static struct ctl_table unix_table[] = {
 	{
diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c
index bca2d86ba97d..e0d30d6d22ac 100644
--- a/net/unix/unix_bpf.c
+++ b/net/unix/unix_bpf.c
@@ -1,11 +1,12 @@
 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2021 Cong Wang <cong.wang@bytedance.com> */
 
-#include <linux/skmsg.h>
 #include <linux/bpf.h>
-#include <net/sock.h>
+#include <linux/skmsg.h>
 #include <net/af_unix.h>
 
+#include "af_unix.h"
+
 #define unix_sk_has_data(__sk, __psock)					\
 		({	!skb_queue_empty(&__sk->sk_receive_queue) ||	\
 			!skb_queue_empty(&__psock->ingress_skb) ||	\
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index fa9d1b49599b..2e7a3034e965 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -337,7 +337,10 @@ EXPORT_SYMBOL_GPL(vsock_find_connected_socket);
 
 void vsock_remove_sock(struct vsock_sock *vsk)
 {
-	vsock_remove_bound(vsk);
+	/* Transport reassignment must not remove the binding. */
+	if (sock_flag(sk_vsock(vsk), SOCK_DEAD))
+		vsock_remove_bound(vsk);
+
 	vsock_remove_connected(vsk);
 }
 EXPORT_SYMBOL_GPL(vsock_remove_sock);
@@ -821,6 +824,13 @@ static void __vsock_release(struct sock *sk, int level)
 	 */
 	lock_sock_nested(sk, level);
 
+	/* Indicate to vsock_remove_sock() that the socket is being released and
+	 * can be removed from the bound_table. This is unlike the transport
+	 * reassignment case, where the socket must remain bound even though
+	 * vsock_remove_sock() is called from the transport release() callback.
+	 */
+	sock_set_flag(sk, SOCK_DEAD);
+
 	if (vsk->transport)
 		vsk->transport->release(vsk);
 	else if (sock_type_connectible(sk->sk_type))
@@ -1003,6 +1013,39 @@ out:
 	return err;
 }
 
+void vsock_linger(struct sock *sk)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	ssize_t (*unsent)(struct vsock_sock *vsk);
+	struct vsock_sock *vsk = vsock_sk(sk);
+	long timeout;
+
+	if (!sock_flag(sk, SOCK_LINGER))
+		return;
+
+	timeout = sk->sk_lingertime;
+	if (!timeout)
+		return;
+
+	/* Transports must implement `unsent_bytes` if they want to support
+	 * SOCK_LINGER through `vsock_linger()` since we use it to check when
+	 * the socket can be closed.
+	 */
+	unsent = vsk->transport->unsent_bytes;
+	if (!unsent)
+		return;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+
+	do {
+		if (sk_wait_event(sk, &timeout, unsent(vsk) == 0, &wait))
+			break;
+	} while (!signal_pending(current) && timeout);
+
+	remove_wait_queue(sk_sleep(sk), &wait);
+}
+EXPORT_SYMBOL_GPL(vsock_linger);
+
 static int vsock_shutdown(struct socket *sock, int mode)
 {
 	int err;
@@ -1179,6 +1222,9 @@ static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor)
 {
 	struct vsock_sock *vsk = vsock_sk(sk);
 
+	if (WARN_ON_ONCE(!vsk->transport))
+		return -ENODEV;
+
 	return vsk->transport->read_skb(vsk, read_actor);
 }
 
@@ -1519,6 +1565,11 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr,
 		if (err < 0)
 			goto out;
 
+		/* sk_err might have been set as a result of an earlier
+		 * (failed) connect attempt.
+		 */
+		sk->sk_err = 0;
+
 		/* Mark sock as connecting and set the error code to in
 		 * progress in case this is a non-blocking connect.
 		 */
@@ -1533,7 +1584,11 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr,
 		timeout = vsk->connect_timeout;
 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
-		while (sk->sk_state != TCP_ESTABLISHED && sk->sk_err == 0) {
+		/* If the socket is already closing or it is in an error state, there
+		 * is no point in waiting.
+		 */
+		while (sk->sk_state != TCP_ESTABLISHED &&
+		       sk->sk_state != TCP_CLOSING && sk->sk_err == 0) {
 			if (flags & O_NONBLOCK) {
 				/* If we're not going to block, we schedule a timeout
 				 * function to generate a timeout on the connection
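vsock_linger() above waits for the transport to drain rather than for SOCK_DONE: it polls unsent_bytes() until the count reaches zero, a signal arrives, or the linger timeout runs out. A rough userspace model of that loop, with invented callback and timing names (sk_wait_event() sleeps on the socket's wait queue instead of ticking):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Rough model of vsock_linger(): wait until unsent() hits zero or the
 * timeout expires. All names here are invented for illustration. */
static bool linger_wait(long (*unsent)(void *), void *sock, long timeout_ms)
{
	struct timespec tick = { .tv_sec = 0, .tv_nsec = 1000 * 1000 };

	if (!unsent)			/* transport can't report: skip */
		return false;

	while (timeout_ms > 0) {
		if (unsent(sock) == 0)
			return true;	/* drained before the deadline */
		nanosleep(&tick, NULL);	/* sk_wait_event() sleeps instead */
		timeout_ms--;
	}
	return false;			/* gave up, close proceeds anyway */
}

static long fake_unsent(void *p) { return --*(long *)p; }

int main(void)
{
	long pending = 5;

	printf("drained=%d\n", linger_wait(fake_unsent, &pending, 100));
	return 0;
}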
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 56c232cf5b0f..31342ab502b4 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -13,12 +13,12 @@
 #include <linux/hyperv.h>
 #include <net/sock.h>
 #include <net/af_vsock.h>
-#include <asm/hyperv-tlfs.h>
+#include <hyperv/hvhdk.h>
 
 /* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some
  * stricter requirements on the hv_sock ring buffer size of six 4K pages.
- * hyperv-tlfs defines HV_HYP_PAGE_SIZE as 4K. Newer hosts don't have this
- * limitation; but, keep the defaults the same for compat.
+ * HV_HYP_PAGE_SIZE is defined as 4K. Newer hosts don't have this limitation,
+ * but keep the defaults the same for compatibility.
  */
 #define RINGBUFFER_HVS_RCV_SIZE (HV_HYP_PAGE_SIZE * 6)
 #define RINGBUFFER_HVS_SND_SIZE (HV_HYP_PAGE_SIZE * 6)
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index b58c3818f284..f0e48e6911fc 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -670,6 +670,13 @@ static int virtio_vsock_vqs_init(struct virtio_vsock *vsock)
 	};
 	int ret;
 
+	mutex_lock(&vsock->rx_lock);
+	vsock->rx_buf_nr = 0;
+	vsock->rx_buf_max_nr = 0;
+	mutex_unlock(&vsock->rx_lock);
+
+	atomic_set(&vsock->queued_replies, 0);
+
 	ret = virtio_find_vqs(vdev, VSOCK_VQ_MAX, vsock->vqs, vqs_info, NULL);
 	if (ret < 0)
 		return ret;
@@ -779,9 +786,6 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
 
 	vsock->vdev = vdev;
 
-	vsock->rx_buf_nr = 0;
-	vsock->rx_buf_max_nr = 0;
-	atomic_set(&vsock->queued_replies, 0);
 
 	mutex_init(&vsock->tx_lock);
 	mutex_init(&vsock->rx_lock);
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index 7f7de6d88096..1b5d9896edae 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -87,7 +87,7 @@ static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
 
 		uarg = msg_zerocopy_realloc(sk_vsock(vsk),
 					    iter->count,
-					    NULL);
+					    NULL, false);
 		if (!uarg)
 			return -1;
 
@@ -107,8 +107,7 @@ static int virtio_transport_fill_skb(struct sk_buff *skb,
 {
 	if (zcopy)
 		return __zerocopy_sg_from_iter(info->msg, NULL, skb,
-					       &info->msg->msg_iter,
-					       len);
+					       &info->msg->msg_iter, len, NULL);
 
 	return memcpy_from_msg(skb_put(skb, len), info->msg, len);
 }
@@ -441,18 +440,20 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
 
 static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, u32 len)
 {
-	if (vvs->rx_bytes + len > vvs->buf_alloc)
+	if (vvs->buf_used + len > vvs->buf_alloc)
 		return false;
 
 	vvs->rx_bytes += len;
+	vvs->buf_used += len;
 	return true;
 }
 
 static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
-					u32 len)
+					u32 bytes_read, u32 bytes_dequeued)
 {
-	vvs->rx_bytes -= len;
-	vvs->fwd_cnt += len;
+	vvs->rx_bytes -= bytes_read;
+	vvs->buf_used -= bytes_dequeued;
+	vvs->fwd_cnt += bytes_dequeued;
 }
 
 void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *skb)
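The new bytes_read/bytes_dequeued split above exists because a partially read packet keeps occupying receive-buffer space: rx_bytes shrinks as data is copied out, but buf_used (and the fwd_cnt credit) only moves once the whole skb is dequeued. A toy model of the two counters — the field names loosely mirror the patch, everything else is invented:

#include <stdio.h>

/* Toy model of the rx_bytes/buf_used split: reading part of a packet
 * reduces readable bytes, but buffer space is only returned once the
 * whole packet is dequeued. */
struct credit {
	unsigned int rx_bytes;	/* bytes still readable */
	unsigned int buf_used;	/* buffer space still occupied */
};

static void dec_rx(struct credit *c, unsigned int read, unsigned int dequeued)
{
	c->rx_bytes -= read;
	c->buf_used -= dequeued;
}

int main(void)
{
	struct credit c = { .rx_bytes = 100, .buf_used = 100 };

	dec_rx(&c, 40, 0);	/* partial read: packet still queued */
	printf("readable=%u occupied=%u\n", c.rx_bytes, c.buf_used);

	dec_rx(&c, 60, 100);	/* rest read, 100-byte packet dequeued */
	printf("readable=%u occupied=%u\n", c.rx_bytes, c.buf_used);
	return 0;
}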
@@ -581,11 +582,11 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 				   size_t len)
 {
 	struct virtio_vsock_sock *vvs = vsk->trans;
-	size_t bytes, total = 0;
 	struct sk_buff *skb;
 	u32 fwd_cnt_delta;
 	bool low_rx_bytes;
 	int err = -EFAULT;
+	size_t total = 0;
 	u32 free_space;
 
 	spin_lock_bh(&vvs->rx_lock);
@@ -597,6 +598,8 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 	}
 
 	while (total < len && !skb_queue_empty(&vvs->rx_queue)) {
+		size_t bytes, dequeued = 0;
+
 		skb = skb_peek(&vvs->rx_queue);
 
 		bytes = min_t(size_t, len - total,
@@ -620,12 +623,12 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
 		VIRTIO_VSOCK_SKB_CB(skb)->offset += bytes;
 
 		if (skb->len == VIRTIO_VSOCK_SKB_CB(skb)->offset) {
-			u32 pkt_len = le32_to_cpu(virtio_vsock_hdr(skb)->len);
-
-			virtio_transport_dec_rx_pkt(vvs, pkt_len);
+			dequeued = le32_to_cpu(virtio_vsock_hdr(skb)->len);
 			__skb_unlink(skb, &vvs->rx_queue);
 			consume_skb(skb);
 		}
+
+		virtio_transport_dec_rx_pkt(vvs, bytes, dequeued);
 	}
 
 	fwd_cnt_delta = vvs->fwd_cnt - vvs->last_fwd_cnt;
@@ -781,7 +784,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
 				msg->msg_flags |= MSG_EOR;
 		}
 
-		virtio_transport_dec_rx_pkt(vvs, pkt_len);
+		virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len);
 		kfree_skb(skb);
 	}
@@ -1192,23 +1195,6 @@ static void virtio_transport_remove_sock(struct vsock_sock *vsk)
 	vsock_remove_sock(vsk);
 }
 
-static void virtio_transport_wait_close(struct sock *sk, long timeout)
-{
-	if (timeout) {
-		DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
-		add_wait_queue(sk_sleep(sk), &wait);
-
-		do {
-			if (sk_wait_event(sk, &timeout,
-					  sock_flag(sk, SOCK_DONE), &wait))
-				break;
-		} while (!signal_pending(current) && timeout);
-
-		remove_wait_queue(sk_sleep(sk), &wait);
-	}
-}
-
 static void virtio_transport_cancel_close_work(struct vsock_sock *vsk,
 					       bool cancel_timeout)
 {
@@ -1278,8 +1264,8 @@ static bool virtio_transport_close(struct vsock_sock *vsk)
 	if ((sk->sk_shutdown & SHUTDOWN_MASK) != SHUTDOWN_MASK)
 		(void)virtio_transport_shutdown(vsk, SHUTDOWN_MASK);
 
-	if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING))
-		virtio_transport_wait_close(sk, sk->sk_lingertime);
+	if (!(current->flags & PF_EXITING))
+		vsock_linger(sk);
 
 	if (sock_flag(sk, SOCK_DONE)) {
 		return true;
@@ -1735,6 +1721,7 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_actor)
 	struct sock *sk = sk_vsock(vsk);
 	struct virtio_vsock_hdr *hdr;
 	struct sk_buff *skb;
+	u32 pkt_len;
 	int off = 0;
 	int err;
 
@@ -1752,7 +1739,8 @@ int virtio_transport_read_skb(struct vsock_sock *vsk, skb_read_actor_t recv_actor)
 		if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM)
 			vvs->msg_count--;
 
-		virtio_transport_dec_rx_pkt(vvs, le32_to_cpu(hdr->len));
+		pkt_len = le32_to_cpu(hdr->len);
+		virtio_transport_dec_rx_pkt(vvs, pkt_len, pkt_len);
 		spin_unlock_bh(&vvs->rx_lock);
 
 		virtio_transport_send_credit_update(vsk);
diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c
index f201d9eca1df..07b96d56f3a5 100644
--- a/net/vmw_vsock/vsock_bpf.c
+++ b/net/vmw_vsock/vsock_bpf.c
@@ -87,7 +87,7 @@ static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg,
 	lock_sock(sk);
 	vsk = vsock_sk(sk);
 
-	if (!vsk->transport) {
+	if (WARN_ON_ONCE(!vsk->transport)) {
 		copied = -ENODEV;
 		goto out;
 	}
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 40b6375a5de4..193734b7f9dc 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -6,7 +6,7 @@
  *
  * Copyright 2009	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014	Intel Mobile Communications GmbH
- * Copyright 2018-2024	Intel Corporation
+ * Copyright 2018-2025	Intel Corporation
  */
 
 #include <linux/export.h>
@@ -55,6 +55,51 @@ void cfg80211_chandef_create(struct cfg80211_chan_def *chandef,
 }
 EXPORT_SYMBOL(cfg80211_chandef_create);
 
+static u32 cfg80211_get_start_freq(const struct cfg80211_chan_def *chandef,
+				   u32 cf)
+{
+	u32 start_freq, center_freq, bandwidth;
+
+	center_freq = MHZ_TO_KHZ((cf == 1) ?
+				 chandef->center_freq1 : chandef->center_freq2);
+	bandwidth = MHZ_TO_KHZ(cfg80211_chandef_get_width(chandef));
+
+	if (bandwidth <= MHZ_TO_KHZ(20))
+		start_freq = center_freq;
+	else
+		start_freq = center_freq - bandwidth / 2 + MHZ_TO_KHZ(10);
+
+	return start_freq;
+}
+
+static u32 cfg80211_get_end_freq(const struct cfg80211_chan_def *chandef,
+				 u32 cf)
+{
+	u32 end_freq, center_freq, bandwidth;
+
+	center_freq = MHZ_TO_KHZ((cf == 1) ?
+				 chandef->center_freq1 : chandef->center_freq2);
+	bandwidth = MHZ_TO_KHZ(cfg80211_chandef_get_width(chandef));
+
+	if (bandwidth <= MHZ_TO_KHZ(20))
+		end_freq = center_freq;
+	else
+		end_freq = center_freq + bandwidth / 2 - MHZ_TO_KHZ(10);
+
+	return end_freq;
+}
+
+#define for_each_subchan(chandef, freq, cf)				\
+	for (u32 punctured = chandef->punctured,			\
+	     cf = 1, freq = cfg80211_get_start_freq(chandef, cf);	\
+	     freq <= cfg80211_get_end_freq(chandef, cf);		\
+	     freq += MHZ_TO_KHZ(20),					\
+	     ((cf == 1 && chandef->center_freq2 != 0 &&			\
+	       freq > cfg80211_get_end_freq(chandef, cf)) ?		\
+	      (cf++, freq = cfg80211_get_start_freq(chandef, cf),	\
+	       punctured = 0) : (punctured >>= 1)))			\
+		if (!(punctured & 1))
+
 struct cfg80211_per_bw_puncturing_values {
 	u8 len;
 	const u16 *valid_values;
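for_each_subchan() above walks each segment of the chandef in 20 MHz steps, shifting the puncturing bitmap right by one bit per step and skipping any subchannel whose bit is set. An unrolled single-segment equivalent in plain C — the helper and its names are mine, not cfg80211 API:

#include <stdio.h>

/* Unrolled, single-segment version of the for_each_subchan() idea:
 * step across the bandwidth in 20 MHz units and skip any subchannel
 * whose bit is set in the puncturing bitmap. */
static void walk_subchans(unsigned int center_mhz, unsigned int bw_mhz,
			  unsigned int punctured)
{
	unsigned int start = (bw_mhz <= 20) ? center_mhz
					    : center_mhz - bw_mhz / 2 + 10;
	unsigned int end   = (bw_mhz <= 20) ? center_mhz
					    : center_mhz + bw_mhz / 2 - 10;

	for (unsigned int freq = start; freq <= end;
	     freq += 20, punctured >>= 1) {
		if (punctured & 1)
			continue;	/* punctured: no activity here */
		printf("subchannel at %u MHz\n", freq);
	}
}

int main(void)
{
	walk_subchans(5250, 80, 0x2);	/* 80 MHz, second 20 MHz punctured */
	return 0;
}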
@@ -258,11 +303,6 @@ int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width)
 }
 EXPORT_SYMBOL(nl80211_chan_width_to_mhz);
 
-static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c)
-{
-	return nl80211_chan_width_to_mhz(c->width);
-}
-
 static bool cfg80211_valid_center_freq(u32 center,
 				       enum nl80211_chan_width width)
 {
@@ -582,29 +622,11 @@ cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1,
 }
 EXPORT_SYMBOL(cfg80211_chandef_compatible);
 
-static void cfg80211_set_chans_dfs_state(struct wiphy *wiphy, u32 center_freq,
-					 u32 bandwidth,
-					 enum nl80211_dfs_state dfs_state)
-{
-	struct ieee80211_channel *c;
-	u32 freq;
-
-	for (freq = center_freq - bandwidth/2 + 10;
-	     freq <= center_freq + bandwidth/2 - 10;
-	     freq += 20) {
-		c = ieee80211_get_channel(wiphy, freq);
-		if (!c || !(c->flags & IEEE80211_CHAN_RADAR))
-			continue;
-
-		c->dfs_state = dfs_state;
-		c->dfs_state_entered = jiffies;
-	}
-}
-
 void cfg80211_set_dfs_state(struct wiphy *wiphy,
 			    const struct cfg80211_chan_def *chandef,
 			    enum nl80211_dfs_state dfs_state)
 {
+	struct ieee80211_channel *c;
 	int width;
 
 	if (WARN_ON(!cfg80211_chandef_valid(chandef)))
@@ -614,41 +636,14 @@ void cfg80211_set_dfs_state(struct wiphy *wiphy,
 	if (width < 0)
 		return;
 
-	cfg80211_set_chans_dfs_state(wiphy, chandef->center_freq1,
-				     width, dfs_state);
-
-	if (!chandef->center_freq2)
-		return;
-	cfg80211_set_chans_dfs_state(wiphy, chandef->center_freq2,
-				     width, dfs_state);
-}
-
-static u32 cfg80211_get_start_freq(u32 center_freq,
-				   u32 bandwidth)
-{
-	u32 start_freq;
-
-	bandwidth = MHZ_TO_KHZ(bandwidth);
-	if (bandwidth <= MHZ_TO_KHZ(20))
-		start_freq = center_freq;
-	else
-		start_freq = center_freq - bandwidth / 2 + MHZ_TO_KHZ(10);
-
-	return start_freq;
-}
-
-static u32 cfg80211_get_end_freq(u32 center_freq,
-				 u32 bandwidth)
-{
-	u32 end_freq;
-
-	bandwidth = MHZ_TO_KHZ(bandwidth);
-	if (bandwidth <= MHZ_TO_KHZ(20))
-		end_freq = center_freq;
-	else
-		end_freq = center_freq + bandwidth / 2 - MHZ_TO_KHZ(10);
+	for_each_subchan(chandef, freq, cf) {
+		c = ieee80211_get_channel_khz(wiphy, freq);
+		if (!c || !(c->flags & IEEE80211_CHAN_RADAR))
+			continue;
 
-	return end_freq;
+		c->dfs_state = dfs_state;
+		c->dfs_state_entered = jiffies;
+	}
 }
 
 static bool
@@ -725,17 +720,12 @@ static bool cfg80211_dfs_permissive_chan(struct wiphy *wiphy,
 }
 
 static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy,
-					    u32 center_freq,
-					    u32 bandwidth,
-					    enum nl80211_iftype iftype)
+					   const struct cfg80211_chan_def *chandef,
+					   enum nl80211_iftype iftype)
 {
 	struct ieee80211_channel *c;
-	u32 freq, start_freq, end_freq;
-
-	start_freq = cfg80211_get_start_freq(center_freq, bandwidth);
-	end_freq = cfg80211_get_end_freq(center_freq, bandwidth);
 
-	for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) {
+	for_each_subchan(chandef, freq, cf) {
 		c = ieee80211_get_channel_khz(wiphy, freq);
 		if (!c)
 			return -EINVAL;
@@ -768,25 +758,9 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy,
 		if (width < 0)
 			return -EINVAL;
 
-		ret = cfg80211_get_chans_dfs_required(wiphy,
-						      ieee80211_chandef_to_khz(chandef),
-						      width, iftype);
-		if (ret < 0)
-			return ret;
-		else if (ret > 0)
-			return BIT(chandef->width);
-
-		if (!chandef->center_freq2)
-			return 0;
-
-		ret = cfg80211_get_chans_dfs_required(wiphy,
-						      MHZ_TO_KHZ(chandef->center_freq2),
-						      width, iftype);
-		if (ret < 0)
-			return ret;
-		else if (ret > 0)
-			return BIT(chandef->width);
+		ret = cfg80211_get_chans_dfs_required(wiphy, chandef, iftype);
+		return (ret > 0) ? BIT(chandef->width) : ret;
 
 		break;
 	case NL80211_IFTYPE_STATION:
 	case NL80211_IFTYPE_OCB:
@@ -806,16 +780,18 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy,
 }
 EXPORT_SYMBOL(cfg80211_chandef_dfs_required);
 
-static int cfg80211_get_chans_dfs_usable(struct wiphy *wiphy,
-					 u32 center_freq,
-					 u32 bandwidth)
+bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy,
+				 const struct cfg80211_chan_def *chandef)
 {
 	struct ieee80211_channel *c;
-	u32 freq, start_freq, end_freq;
-	int count = 0;
+	int width, count = 0;
+
+	if (WARN_ON(!cfg80211_chandef_valid(chandef)))
+		return false;
 
-	start_freq = cfg80211_get_start_freq(center_freq, bandwidth);
-	end_freq = cfg80211_get_end_freq(center_freq, bandwidth);
+	width = cfg80211_chandef_get_width(chandef);
+	if (width < 0)
+		return false;
 
 	/*
 	 * Check entire range of channels for the bandwidth.
@@ -823,61 +799,24 @@ static int cfg80211_get_chans_dfs_usable(struct wiphy *wiphy,
 	 * DFS_AVAILABLE). Return number of usable channels
 	 * (require CAC). Allow DFS and non-DFS channel mix.
	 */
-	for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) {
+	for_each_subchan(chandef, freq, cf) {
 		c = ieee80211_get_channel_khz(wiphy, freq);
 		if (!c)
-			return -EINVAL;
+			return false;
 
 		if (c->flags & IEEE80211_CHAN_DISABLED)
-			return -EINVAL;
+			return false;
 
 		if (c->flags & IEEE80211_CHAN_RADAR) {
 			if (c->dfs_state == NL80211_DFS_UNAVAILABLE)
-				return -EINVAL;
+				return false;
 
 			if (c->dfs_state == NL80211_DFS_USABLE)
 				count++;
 		}
 	}
 
-	return count;
-}
-
-bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy,
-				 const struct cfg80211_chan_def *chandef)
-{
-	int width;
-	int r1, r2 = 0;
-
-	if (WARN_ON(!cfg80211_chandef_valid(chandef)))
-		return false;
-
-	width = cfg80211_chandef_get_width(chandef);
-	if (width < 0)
-		return false;
-
-	r1 = cfg80211_get_chans_dfs_usable(wiphy,
-					   MHZ_TO_KHZ(chandef->center_freq1),
-					   width);
-
-	if (r1 < 0)
-		return false;
-
-	switch (chandef->width) {
-	case NL80211_CHAN_WIDTH_80P80:
-		WARN_ON(!chandef->center_freq2);
-		r2 = cfg80211_get_chans_dfs_usable(wiphy,
-						   MHZ_TO_KHZ(chandef->center_freq2),
-						   width);
-		if (r2 < 0)
-			return false;
-		break;
-	default:
-		WARN_ON(chandef->center_freq2);
-		break;
-	}
-
-	return (r1 + r2 > 0);
+	return count > 0;
 }
 EXPORT_SYMBOL(cfg80211_chandef_dfs_usable);
 
@@ -1039,10 +978,10 @@ bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy,
 		if (!reg_dfs_domain_same(wiphy, &rdev->wiphy))
 			continue;
 
-		wiphy_lock(&rdev->wiphy);
+		guard(wiphy)(&rdev->wiphy);
+
 		found = cfg80211_is_wiphy_oper_chan(&rdev->wiphy, chan) ||
 			cfg80211_offchan_chain_is_active(rdev, chan);
-		wiphy_unlock(&rdev->wiphy);
 
 		if (found)
 			return true;
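The guard(wiphy)(...) conversions in the hunk above — repeated throughout the rest of this series — rely on scope-based cleanup: the lock is dropped automatically on every return path, which is what lets the goto-unlock ladders collapse into direct returns. A standalone sketch of the same idea using the GCC/Clang cleanup attribute over a pthread mutex (build with -pthread; the macro name is invented, not the kernel's cleanup.h API):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Runs when the guarded pointer goes out of scope. */
static void unlock_guard(pthread_mutex_t **m)
{
	pthread_mutex_unlock(*m);
}

/* Lock now, unlock automatically at end of the enclosing scope. */
#define GUARD(m)							\
	pthread_mutex_t *scope_lock					\
		__attribute__((cleanup(unlock_guard))) = (m);		\
	pthread_mutex_lock(m)

static int do_work(int fail)
{
	GUARD(&lock);

	if (fail)
		return -1;	/* unlock happens here, too */
	return 0;
}

int main(void)
{
	printf("%d %d\n", do_work(0), do_work(1));
	return 0;
}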
@@ -1051,26 +990,29 @@ bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy,
 	return false;
 }
 
-static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy,
-					     u32 center_freq,
-					     u32 bandwidth)
+static bool cfg80211_chandef_dfs_available(struct wiphy *wiphy,
+					   const struct cfg80211_chan_def *chandef)
 {
 	struct ieee80211_channel *c;
-	u32 freq, start_freq, end_freq;
+	int width;
 	bool dfs_offload;
 
+	if (WARN_ON(!cfg80211_chandef_valid(chandef)))
+		return false;
+
+	width = cfg80211_chandef_get_width(chandef);
+	if (width < 0)
+		return false;
+
 	dfs_offload = wiphy_ext_feature_isset(wiphy,
 					      NL80211_EXT_FEATURE_DFS_OFFLOAD);
 
-	start_freq = cfg80211_get_start_freq(center_freq, bandwidth);
-	end_freq = cfg80211_get_end_freq(center_freq, bandwidth);
-
 	/*
 	 * Check entire range of channels for the bandwidth.
 	 * If any channel in between is disabled or has not
 	 * had gone through CAC return false
	 */
-	for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) {
+	for_each_subchan(chandef, freq, cf) {
 		c = ieee80211_get_channel_khz(wiphy, freq);
 		if (!c)
 			return false;
@@ -1087,124 +1029,54 @@ static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy,
 	return true;
 }
 
-static bool cfg80211_chandef_dfs_available(struct wiphy *wiphy,
-					   const struct cfg80211_chan_def *chandef)
+unsigned int
+cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy,
+			      const struct cfg80211_chan_def *chandef)
 {
+	struct ieee80211_channel *c;
 	int width;
-	int r;
+	unsigned int t1 = 0, t2 = 0;
 
 	if (WARN_ON(!cfg80211_chandef_valid(chandef)))
-		return false;
+		return 0;
 
 	width = cfg80211_chandef_get_width(chandef);
 	if (width < 0)
-		return false;
-
-	r = cfg80211_get_chans_dfs_available(wiphy,
-					     MHZ_TO_KHZ(chandef->center_freq1),
-					     width);
-
-	/* If any of channels unavailable for cf1 just return */
-	if (!r)
-		return r;
-
-	switch (chandef->width) {
-	case NL80211_CHAN_WIDTH_80P80:
-		WARN_ON(!chandef->center_freq2);
-		r = cfg80211_get_chans_dfs_available(wiphy,
-						     MHZ_TO_KHZ(chandef->center_freq2),
-						     width);
-		break;
-	default:
-		WARN_ON(chandef->center_freq2);
-		break;
-	}
-
-	return r;
-}
-
-static unsigned int cfg80211_get_chans_dfs_cac_time(struct wiphy *wiphy,
-						    u32 center_freq,
-						    u32 bandwidth)
-{
-	struct ieee80211_channel *c;
-	u32 start_freq, end_freq, freq;
-	unsigned int dfs_cac_ms = 0;
-
-	start_freq = cfg80211_get_start_freq(center_freq, bandwidth);
-	end_freq = cfg80211_get_end_freq(center_freq, bandwidth);
+		return 0;
 
-	for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) {
+	for_each_subchan(chandef, freq, cf) {
 		c = ieee80211_get_channel_khz(wiphy, freq);
-		if (!c)
-			return 0;
-
-		if (c->flags & IEEE80211_CHAN_DISABLED)
-			return 0;
+		if (!c || (c->flags & IEEE80211_CHAN_DISABLED)) {
+			if (cf == 1)
+				t1 = INT_MAX;
+			else
+				t2 = INT_MAX;
+			continue;
+		}
 
 		if (!(c->flags & IEEE80211_CHAN_RADAR))
 			continue;
 
-		if (c->dfs_cac_ms > dfs_cac_ms)
-			dfs_cac_ms = c->dfs_cac_ms;
-	}
+		if (cf == 1 && c->dfs_cac_ms > t1)
+			t1 = c->dfs_cac_ms;
 
-	return dfs_cac_ms;
-}
-
-unsigned int
-cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy,
-			      const struct cfg80211_chan_def *chandef)
-{
-	int width;
-	unsigned int t1 = 0, t2 = 0;
-
-	if (WARN_ON(!cfg80211_chandef_valid(chandef)))
-		return 0;
+		if (cf == 2 && c->dfs_cac_ms > t2)
+			t2 = c->dfs_cac_ms;
+	}
 
-	width = cfg80211_chandef_get_width(chandef);
-	if (width < 0)
+	if (t1 == INT_MAX && t2 == INT_MAX)
 		return 0;
 
-	t1 = cfg80211_get_chans_dfs_cac_time(wiphy,
-					     MHZ_TO_KHZ(chandef->center_freq1),
-					     width);
+	if (t1 == INT_MAX)
+		return t2;
 
-	if (!chandef->center_freq2)
+	if (t2 == INT_MAX)
 		return t1;
 
-	t2 = cfg80211_get_chans_dfs_cac_time(wiphy,
-					     MHZ_TO_KHZ(chandef->center_freq2),
-					     width);
-
 	return max(t1, t2);
 }
 EXPORT_SYMBOL(cfg80211_chandef_dfs_cac_time);
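In the rewritten cfg80211_chandef_dfs_cac_time() above, INT_MAX marks a segment that contains a missing or disabled channel, so the final selection only considers segments that are actually usable. The selection logic, condensed into a standalone function with a small test:

#include <limits.h>
#include <stdio.h>

/* Condensed two-segment CAC time selection: INT_MAX flags a segment
 * with a disabled/missing channel; such segments are ignored. */
static unsigned int pick_cac_ms(unsigned int t1, unsigned int t2)
{
	if (t1 == INT_MAX && t2 == INT_MAX)
		return 0;		/* nothing usable at all */
	if (t1 == INT_MAX)
		return t2;
	if (t2 == INT_MAX)
		return t1;
	return t1 > t2 ? t1 : t2;	/* both usable: worst case wins */
}

int main(void)
{
	printf("%u\n", pick_cac_ms(60000, INT_MAX));	/* -> 60000 */
	printf("%u\n", pick_cac_ms(60000, 600000));	/* -> 600000 */
	return 0;
}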
-static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy,
-					u32 center_freq, u32 bandwidth,
-					u32 prohibited_flags,
-					u32 permitting_flags)
-{
-	struct ieee80211_channel *c;
-	u32 freq, start_freq, end_freq;
-
-	start_freq = cfg80211_get_start_freq(center_freq, bandwidth);
-	end_freq = cfg80211_get_end_freq(center_freq, bandwidth);
-
-	for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) {
-		c = ieee80211_get_channel_khz(wiphy, freq);
-		if (!c)
-			return false;
-		if (c->flags & permitting_flags)
-			continue;
-		if (c->flags & prohibited_flags)
-			return false;
-	}
-
-	return true;
-}
-
 /* check if the operating channels are valid and supported */
 static bool cfg80211_edmg_usable(struct wiphy *wiphy, u8 edmg_channels,
 				 enum ieee80211_edmg_bw_config edmg_bw_config,
@@ -1270,6 +1142,7 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy,
 	bool ext_nss_cap, support_80_80 = false, support_320 = false;
 	const struct ieee80211_sband_iftype_data *iftd;
 	struct ieee80211_supported_band *sband;
+	struct ieee80211_channel *c;
 	int i;
 
 	if (WARN_ON(!cfg80211_chandef_valid(chandef)))
@@ -1420,19 +1293,17 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy,
 	if (width < 20)
 		prohibited_flags |= IEEE80211_CHAN_NO_OFDM;
 
+	for_each_subchan(chandef, freq, cf) {
+		c = ieee80211_get_channel_khz(wiphy, freq);
+		if (!c)
+			return false;
+		if (c->flags & permitting_flags)
+			continue;
+		if (c->flags & prohibited_flags)
+			return false;
+	}
 
-	if (!cfg80211_secondary_chans_ok(wiphy,
-					 ieee80211_chandef_to_khz(chandef),
-					 width, prohibited_flags,
-					 permitting_flags))
-		return false;
-
-	if (!chandef->center_freq2)
-		return true;
-
-	return cfg80211_secondary_chans_ok(wiphy,
-					   MHZ_TO_KHZ(chandef->center_freq2),
-					   width, prohibited_flags,
-					   permitting_flags);
+	return true;
 }
 
 bool cfg80211_chandef_usable(struct wiphy *wiphy,
@@ -1621,6 +1492,12 @@ bool cfg80211_reg_check_beaconing(struct wiphy *wiphy,
 	if (cfg->reg_power == IEEE80211_REG_VLP_AP)
 		permitting_flags |= IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP;
 
+	if ((cfg->iftype == NL80211_IFTYPE_P2P_GO ||
+	     cfg->iftype == NL80211_IFTYPE_AP) &&
+	    (chandef->width == NL80211_CHAN_WIDTH_20_NOHT ||
+	     chandef->width == NL80211_CHAN_WIDTH_20))
+		permitting_flags |= IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY;
+
 	return _cfg80211_reg_can_beacon(wiphy, chandef, cfg->iftype,
 					check_no_ir ? IEEE80211_CHAN_NO_IR : 0,
 					permitting_flags);
diff --git a/net/wireless/core.c b/net/wireless/core.c
index afbdc549fb4a..dcce326fdb8c 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -5,7 +5,7 @@
 * Copyright 2006-2010		Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014	Intel Mobile Communications GmbH
 * Copyright 2015-2017	Intel Deutschland GmbH
- * Copyright (C) 2018-2024 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
 */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -143,10 +143,7 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
 	if (result)
 		return result;
 
-	if (!IS_ERR_OR_NULL(rdev->wiphy.debugfsdir))
-		debugfs_rename(rdev->wiphy.debugfsdir->d_parent,
-			       rdev->wiphy.debugfsdir,
-			       rdev->wiphy.debugfsdir->d_parent, newname);
+	debugfs_change_name(rdev->wiphy.debugfsdir, "%s", newname);
 
 	nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY);
 
@@ -165,11 +162,11 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
 	list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
 		if (!wdev->netdev)
 			continue;
-		wdev->netdev->netns_local = false;
+		wdev->netdev->netns_immutable = false;
 		err = dev_change_net_namespace(wdev->netdev, net, "wlan%d");
 		if (err)
 			break;
-		wdev->netdev->netns_local = true;
+		wdev->netdev->netns_immutable = true;
 	}
 
 	if (err) {
@@ -181,17 +178,18 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
 				     list) {
 			if (!wdev->netdev)
 				continue;
-			wdev->netdev->netns_local = false;
+			wdev->netdev->netns_immutable = false;
 			err = dev_change_net_namespace(wdev->netdev, net,
 						       "wlan%d");
 			WARN_ON(err);
-			wdev->netdev->netns_local = true;
+			wdev->netdev->netns_immutable = true;
 		}
 
 		return err;
 	}
 
-	wiphy_lock(&rdev->wiphy);
+	guard(wiphy)(&rdev->wiphy);
+
 	list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
 		if (!wdev->netdev)
 			continue;
@@ -212,7 +210,6 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
 			continue;
 		nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
 	}
-	wiphy_unlock(&rdev->wiphy);
 
 	return 0;
 }
@@ -221,9 +218,9 @@ static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data)
 {
 	struct cfg80211_registered_device *rdev = data;
 
-	wiphy_lock(&rdev->wiphy);
+	guard(wiphy)(&rdev->wiphy);
+
 	rdev_rfkill_poll(rdev);
-	wiphy_unlock(&rdev->wiphy);
 }
 
 void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
@@ -283,7 +280,7 @@ void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy)
 
 		/* otherwise, check iftype */
 
-		wiphy_lock(wiphy);
+		guard(wiphy)(wiphy);
 
 		switch (wdev->iftype) {
 		case NL80211_IFTYPE_P2P_DEVICE:
@@ -295,8 +292,6 @@ void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy)
 		default:
 			break;
 		}
-
-		wiphy_unlock(wiphy);
 	}
 }
 EXPORT_SYMBOL_GPL(cfg80211_shutdown_all_interfaces);
@@ -331,9 +326,9 @@ static void cfg80211_event_work(struct work_struct *work)
 	rdev = container_of(work, struct cfg80211_registered_device,
 			    event_work);
 
-	wiphy_lock(&rdev->wiphy);
+	guard(wiphy)(&rdev->wiphy);
+
 	cfg80211_process_rdev_events(rdev);
-	wiphy_unlock(&rdev->wiphy);
 }
 
 void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev)
@@ -347,10 +342,10 @@ void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev)
 			if (wdev->netdev)
 				dev_close(wdev->netdev);
 
-			wiphy_lock(&rdev->wiphy);
+			guard(wiphy)(&rdev->wiphy);
+
 			cfg80211_leave(rdev, wdev);
 			cfg80211_remove_virtual_intf(rdev, wdev);
-			wiphy_unlock(&rdev->wiphy);
 		}
 	}
 }
@@ -423,9 +418,9 @@ static void cfg80211_wiphy_work(struct work_struct *work)
 
 	trace_wiphy_work_worker_start(&rdev->wiphy);
 
-	wiphy_lock(&rdev->wiphy);
+	guard(wiphy)(&rdev->wiphy);
 	if (rdev->suspended)
-		goto out;
+		return;
 
 	spin_lock_irq(&rdev->wiphy_work_lock);
 	wk = list_first_entry_or_null(&rdev->wiphy_work_list,
@@ -441,8 +436,6 @@ static void cfg80211_wiphy_work(struct work_struct *work)
 	} else {
 		spin_unlock_irq(&rdev->wiphy_work_lock);
 	}
-out:
-	wiphy_unlock(&rdev->wiphy);
 }
 
 /* exported functions */
@@ -553,6 +546,9 @@ use_default_name:
 	INIT_WORK(&rdev->mgmt_registrations_update_wk,
 		  cfg80211_mgmt_registrations_update_wk);
 	spin_lock_init(&rdev->mgmt_registrations_lock);
+	INIT_WORK(&rdev->wiphy_work, cfg80211_wiphy_work);
+	INIT_LIST_HEAD(&rdev->wiphy_work_list);
+	spin_lock_init(&rdev->wiphy_work_lock);
 
 #ifdef CONFIG_CFG80211_DEFAULT_PS
 	rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT;
@@ -570,9 +566,6 @@ use_default_name:
 		return NULL;
 	}
 
-	INIT_WORK(&rdev->wiphy_work, cfg80211_wiphy_work);
-	INIT_LIST_HEAD(&rdev->wiphy_work_list);
-	spin_lock_init(&rdev->wiphy_work_lock);
 	INIT_WORK(&rdev->rfkill_block, cfg80211_rfkill_block_work);
 	INIT_WORK(&rdev->conn_work, cfg80211_conn_work);
 	INIT_WORK(&rdev->event_work, cfg80211_event_work);
@@ -800,6 +793,7 @@ int wiphy_register(struct wiphy *wiphy)
 			       BIT(NL80211_CHAN_WIDTH_80) |
 			       BIT(NL80211_CHAN_WIDTH_80P80) |
 			       BIT(NL80211_CHAN_WIDTH_160) |
+			       BIT(NL80211_CHAN_WIDTH_320) |
 			       BIT(NL80211_CHAN_WIDTH_5) |
 			       BIT(NL80211_CHAN_WIDTH_10))))
 		return -EINVAL;
@@ -1198,6 +1192,13 @@ void cfg80211_dev_free(struct cfg80211_registered_device *rdev)
 {
 	struct cfg80211_internal_bss *scan, *tmp;
 	struct cfg80211_beacon_registration *reg, *treg;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdev->wiphy_work_lock, flags);
+	WARN_ON(!list_empty(&rdev->wiphy_work_list));
+	spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags);
+	cancel_work_sync(&rdev->wiphy_work);
+
 	rfkill_destroy(rdev->wiphy.rfkill);
 	list_for_each_entry_safe(reg, treg, &rdev->beacon_registrations, list) {
 		list_del(&reg->list);
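cfg80211_switch_netns() in the hunks above keeps its rollback pattern: if moving one of the devices to the new namespace fails, the ones already moved are walked again and moved back. A simplified standalone model of that unwind — the kernel re-walks the list forward, this sketch undoes in reverse, which has the same effect here:

#include <stdio.h>

/* Sketch of the rollback pattern in cfg80211_switch_netns(): if moving
 * one of N devices fails, move the already switched ones back. */
static int move_dev(int i, int fail_at)
{
	return (i == fail_at) ? -1 : 0;
}

static int switch_all(int n, int fail_at)
{
	int i, err = 0;

	for (i = 0; i < n; i++) {
		err = move_dev(i, fail_at);
		if (err)
			break;
	}
	if (err) {
		while (i--)			/* undo what succeeded */
			move_dev(i, -1);	/* move back, must not fail */
		return err;
	}
	return 0;
}

int main(void)
{
	printf("%d %d\n", switch_all(4, -1), switch_all(4, 2));
	return 0;
}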
@@ -1520,15 +1521,15 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
 		SET_NETDEV_DEVTYPE(dev, &wiphy_type);
 		wdev->netdev = dev;
 		/* can only change netns with wiphy */
-		dev->netns_local = true;
+		dev->netns_immutable = true;
 
 		cfg80211_init_wdev(wdev);
 		break;
 	case NETDEV_REGISTER:
 		if (!wdev->registered) {
-			wiphy_lock(&rdev->wiphy);
+			guard(wiphy)(&rdev->wiphy);
+
 			cfg80211_register_wdev(rdev, wdev);
-			wiphy_unlock(&rdev->wiphy);
 		}
 		break;
 	case NETDEV_UNREGISTER:
@@ -1537,16 +1538,16 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
 		 * so check wdev->registered.
		 */
 		if (wdev->registered && !wdev->registering) {
-			wiphy_lock(&rdev->wiphy);
+			guard(wiphy)(&rdev->wiphy);
+
 			_cfg80211_unregister_wdev(wdev, false);
-			wiphy_unlock(&rdev->wiphy);
 		}
 		break;
 	case NETDEV_GOING_DOWN:
-		wiphy_lock(&rdev->wiphy);
-		cfg80211_leave(rdev, wdev);
-		cfg80211_remove_links(wdev);
-		wiphy_unlock(&rdev->wiphy);
+		scoped_guard(wiphy, &rdev->wiphy) {
+			cfg80211_leave(rdev, wdev);
+			cfg80211_remove_links(wdev);
+		}
 		/* since we just did cfg80211_leave() nothing to do there */
 		cancel_work_sync(&wdev->disconnect_wk);
 		cancel_work_sync(&wdev->pmsr_free_wk);
@@ -1721,7 +1722,7 @@ void wiphy_delayed_work_queue(struct wiphy *wiphy,
 	trace_wiphy_delayed_work_queue(wiphy, &dwork->work, delay);
 
 	if (!delay) {
-		del_timer(&dwork->timer);
+		timer_delete(&dwork->timer);
 		wiphy_work_queue(wiphy, &dwork->work);
 		return;
 	}
@@ -1736,7 +1737,7 @@ void wiphy_delayed_work_cancel(struct wiphy *wiphy,
 {
 	lockdep_assert_held(&wiphy->mtx);
 
-	del_timer_sync(&dwork->timer);
+	timer_delete_sync(&dwork->timer);
 	wiphy_work_cancel(wiphy, &dwork->work);
 }
 EXPORT_SYMBOL_GPL(wiphy_delayed_work_cancel);
@@ -1746,7 +1747,7 @@ void wiphy_delayed_work_flush(struct wiphy *wiphy,
 {
 	lockdep_assert_held(&wiphy->mtx);
 
-	del_timer_sync(&dwork->timer);
+	timer_delete_sync(&dwork->timer);
 	wiphy_work_flush(wiphy, &dwork->work);
 }
 EXPORT_SYMBOL_GPL(wiphy_delayed_work_flush);
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 4c45f994a8c0..c56a35040caa 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -3,7 +3,7 @@
 * Wireless configuration interface internals.
 *
 * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
- * Copyright (C) 2018-2024 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
 */
 #ifndef __NET_WIRELESS_CORE_H
 #define __NET_WIRELESS_CORE_H
@@ -180,7 +180,6 @@ struct cfg80211_internal_bss {
 	struct list_head list;
 	struct list_head hidden_list;
 	struct rb_node rbn;
-	u64 ts_boottime;
 	unsigned long ts;
 	unsigned long refcount;
 	atomic_t hold;
@@ -567,6 +566,10 @@ int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev,
 				 struct wireless_dev *wdev);
 void cfg80211_wdev_release_link_bsses(struct wireless_dev *wdev, u16 link_mask);
 
+int cfg80211_assoc_ml_reconf(struct cfg80211_registered_device *rdev,
+			     struct net_device *dev,
+			     struct cfg80211_ml_reconf_req *req);
+
 /**
 * struct cfg80211_colocated_ap - colocated AP information
 *
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index a5eb92d93074..05d44a443518 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -4,7 +4,7 @@
 *
 * Copyright (c) 2009, Jouni Malinen <j@w1.fi>
 * Copyright (c) 2015	Intel Deutschland GmbH
- * Copyright (C) 2019-2020, 2022-2024 Intel Corporation
+ * Copyright (C) 2019-2020, 2022-2025 Intel Corporation
 */
 
 #include <linux/kernel.h>
@@ -352,6 +352,13 @@ cfg80211_mlme_check_mlo_compat(const struct ieee80211_multi_link_elem *mle_a,
 		return -EINVAL;
 	}
 
+	if (ieee80211_mle_get_ext_mld_capa_op((const u8 *)mle_a) !=
+	    ieee80211_mle_get_ext_mld_capa_op((const u8 *)mle_b)) {
+		NL_SET_ERR_MSG(extack,
+			       "extended link MLD capabilities/ops mismatch");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -627,10 +634,10 @@ void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk)
 	rdev = container_of(wk, struct cfg80211_registered_device,
 			    mgmt_registrations_update_wk);
 
-	wiphy_lock(&rdev->wiphy);
+	guard(wiphy)(&rdev->wiphy);
+
 	list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list)
 		cfg80211_mgmt_registrations_update(wdev);
-	wiphy_unlock(&rdev->wiphy);
 }
 
 int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
@@ -1193,10 +1200,10 @@ cfg80211_background_cac_event(struct cfg80211_registered_device *rdev,
 			      const struct cfg80211_chan_def *chandef,
 			      enum nl80211_radar_event event)
 {
-	wiphy_lock(&rdev->wiphy);
+	guard(wiphy)(&rdev->wiphy);
+
 	__cfg80211_background_cac_event(rdev, rdev->background_radar_wdev,
 					chandef, event);
-	wiphy_unlock(&rdev->wiphy);
 }
 
 void cfg80211_background_cac_done_wk(struct work_struct *work)
@@ -1287,3 +1294,83 @@ void cfg80211_stop_background_radar_detection(struct wireless_dev *wdev)
 					&rdev->background_radar_chandef,
 					NL80211_RADAR_CAC_ABORTED);
 }
+
+int cfg80211_assoc_ml_reconf(struct cfg80211_registered_device *rdev,
+			     struct net_device *dev,
+			     struct cfg80211_ml_reconf_req *req)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	int err;
+
+	lockdep_assert_wiphy(wdev->wiphy);
+
+	err = rdev_assoc_ml_reconf(rdev, dev, req);
+	if (!err) {
+		int link_id;
+
+		for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS;
+		     link_id++) {
+			if (!req->add_links[link_id].bss)
+				continue;
+
+			cfg80211_ref_bss(&rdev->wiphy, req->add_links[link_id].bss);
+			cfg80211_hold_bss(bss_from_pub(req->add_links[link_id].bss));
+		}
+	}
+
+	return err;
+}
+
+void cfg80211_mlo_reconf_add_done(struct net_device *dev,
+				  struct cfg80211_mlo_reconf_done_data *data)
+{
+	struct wireless_dev *wdev = dev->ieee80211_ptr;
+	struct wiphy *wiphy = wdev->wiphy;
+	int link_id;
+
+	lockdep_assert_wiphy(wiphy);
+
+	trace_cfg80211_mlo_reconf_add_done(dev, data->added_links,
+					   data->buf, data->len);
+
+	if (WARN_ON(!wdev->valid_links))
+		return;
+
+	if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
+		    wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
+		return;
+
+	/* validate that a BSS is given for each added link */
+	for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) {
+		struct cfg80211_bss *bss = data->links[link_id].bss;
+
+		if (!(data->added_links & BIT(link_id)))
+			continue;
+
+		if (WARN_ON(!bss))
+			return;
+	}
+
+	for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) {
+		struct cfg80211_bss *bss = data->links[link_id].bss;
+
+		if (!bss)
+			continue;
+
+		if (data->added_links & BIT(link_id)) {
+			wdev->links[link_id].client.current_bss =
+				bss_from_pub(bss);
+
+			memcpy(wdev->links[link_id].addr,
+			       data->links[link_id].addr,
+			       ETH_ALEN);
+		} else {
+			cfg80211_unhold_bss(bss_from_pub(bss));
+			cfg80211_put_bss(wiphy, bss);
+		}
+	}
+
+	wdev->valid_links |= data->added_links;
+	nl80211_mlo_reconf_add_done(dev, data);
+}
+EXPORT_SYMBOL(cfg80211_mlo_reconf_add_done);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index dd84fc54fb9b..fd5f79266471 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -5,7 +5,7 @@
 * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014	Intel Mobile Communications GmbH
 * Copyright 2015-2017	Intel Deutschland GmbH
- * Copyright (C) 2018-2024 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
 */
 
 #include <linux/if.h>
@@ -294,6 +294,21 @@ static int validate_he_capa(const struct nlattr *attr,
 	return 0;
 }
 
+static int validate_supported_selectors(const struct nlattr *attr,
+					struct netlink_ext_ack *extack)
+{
+	const u8 *supported_selectors = nla_data(attr);
+	u8 supported_selectors_len = nla_len(attr);
+
+	/* The top bit must not be set as it is not part of the selector */
+	for (int i = 0; i < supported_selectors_len; i++) {
+		if (supported_selectors[i] & 0x80)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
 /* policy for the attributes */
 static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR];
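validate_supported_selectors() above is wired into the policy table below via NLA_POLICY_VALIDATE_FN and rejects any selector byte with the top bit set. The check itself, as a standalone function with a small test harness:

#include <stdio.h>

/* Standalone equivalent of validate_supported_selectors(): every byte
 * must have the top bit clear to be a valid selector. */
static int validate_selectors(const unsigned char *sel, int len)
{
	for (int i = 0; i < len; i++) {
		if (sel[i] & 0x80)
			return -22;	/* -EINVAL */
	}
	return 0;
}

int main(void)
{
	unsigned char good[] = { 0x7f, 0x01 };
	unsigned char bad[]  = { 0x7f, 0x80 };

	printf("%d %d\n", validate_selectors(good, 2),
	       validate_selectors(bad, 2));
	return 0;
}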
@@ -454,6 +469,8 @@ nl80211_mbssid_config_policy[NL80211_MBSSID_CONFIG_ATTR_MAX + 1] = {
 	[NL80211_MBSSID_CONFIG_ATTR_INDEX] = { .type = NLA_U8 },
 	[NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX] = { .type = NLA_U32 },
 	[NL80211_MBSSID_CONFIG_ATTR_EMA] = { .type = NLA_FLAG },
+	[NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID] =
+		NLA_POLICY_MAX(NLA_U8, IEEE80211_MLD_MAX_NUM_LINKS),
 };
 
 static const struct nla_policy
@@ -818,6 +835,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_MLD_ADDR] = NLA_POLICY_EXACT_LEN(ETH_ALEN),
 	[NL80211_ATTR_MLO_SUPPORT] = { .type = NLA_FLAG },
 	[NL80211_ATTR_MAX_NUM_AKM_SUITES] = { .type = NLA_REJECT },
+	[NL80211_ATTR_EML_CAPABILITY] = { .type = NLA_U16 },
 	[NL80211_ATTR_PUNCT_BITMAP] =
 		NLA_POLICY_FULL_RANGE(NLA_U32, &nl80211_punct_bitmap_range),
 
@@ -830,6 +848,12 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_MLO_TTLM_ULINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8),
 	[NL80211_ATTR_ASSOC_SPP_AMSDU] = { .type = NLA_FLAG },
 	[NL80211_ATTR_VIF_RADIO_MASK] = { .type = NLA_U32 },
+	[NL80211_ATTR_SUPPORTED_SELECTORS] =
+		NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_supported_selectors,
+				       NL80211_MAX_SUPP_SELECTORS),
+	[NL80211_ATTR_MLO_RECONF_REM_LINKS] = { .type = NLA_U16 },
+	[NL80211_ATTR_EPCS] = { .type = NLA_FLAG },
+	[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS] = { .type = NLA_U16 },
 };
 
 /* policy for the key attributes */
@@ -1214,6 +1238,10 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
 		if ((chan->flags & IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP) &&
 		    nla_put_flag(msg, NL80211_FREQUENCY_ATTR_ALLOW_6GHZ_VLP_AP))
 			goto nla_put_failure;
+		if ((chan->flags & IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY) &&
+		    nla_put_flag(msg,
+				 NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY))
+			goto nla_put_failure;
 	}
 
 	if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER,
@@ -2748,6 +2776,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
 		CMD(update_ft_ies, UPDATE_FT_IES);
 		if (rdev->wiphy.sar_capa)
 			CMD(set_sar_specs, SET_SAR_SPECS);
+		CMD(assoc_ml_reconf, ASSOC_MLO_RECONF);
 	}
 #undef CMD
 
@@ -3626,7 +3655,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 	} else
 		wdev = netdev->ieee80211_ptr;
 
-	wiphy_lock(&rdev->wiphy);
+	guard(wiphy)(&rdev->wiphy);
 
 	/*
 	 * end workaround code, by now the rdev is available
@@ -3639,32 +3668,24 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		rtnl_unlock();
 
 	if (result)
-		goto out;
+		return result;
 
 	if (info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS]) {
 		struct ieee80211_txq_params txq_params;
 		struct nlattr *tb[NL80211_TXQ_ATTR_MAX + 1];
 
-		if (!rdev->ops->set_txq_params) {
-			result = -EOPNOTSUPP;
-			goto out;
-		}
+		if (!rdev->ops->set_txq_params)
+			return -EOPNOTSUPP;
 
-		if (!netdev) {
-			result = -EINVAL;
-			goto out;
-		}
+		if (!netdev)
+			return -EINVAL;
 
 		if (netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
-		    netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) {
-			result = -EINVAL;
-			goto out;
-		}
+		    netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+			return -EINVAL;
 
-		if (!netif_running(netdev)) {
-			result = -ENETDOWN;
-			goto out;
-		}
+		if (!netif_running(netdev))
+			return -ENETDOWN;
 
 		nla_for_each_nested(nl_txq_params,
 				    info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS],
@@ -3675,10 +3696,11 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 						     txq_params_policy,
 						     info->extack);
 			if (result)
-				goto out;
+				return result;
+
 			result = parse_txq_params(tb, &txq_params);
 			if (result)
-				goto out;
+				return result;
 
 			txq_params.link_id =
 				nl80211_link_id_or_invalid(info->attrs);
@@ -3694,7 +3716,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 				result = rdev_set_txq_params(rdev, netdev,
 							     &txq_params);
 			if (result)
-				goto out;
+				return result;
 		}
 	}
 
@@ -3711,7 +3733,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		}
 
 		if (result)
-			goto out;
+			return result;
 	}
 
 	if (info->attrs[NL80211_ATTR_WIPHY_TX_POWER_SETTING]) {
@@ -3722,19 +3744,15 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		if (!(rdev->wiphy.features & NL80211_FEATURE_VIF_TXPOWER))
 			txp_wdev = NULL;
 
-		if (!rdev->ops->set_tx_power) {
-			result = -EOPNOTSUPP;
-			goto out;
-		}
+		if (!rdev->ops->set_tx_power)
+			return -EOPNOTSUPP;
 
 		idx = NL80211_ATTR_WIPHY_TX_POWER_SETTING;
 		type = nla_get_u32(info->attrs[idx]);
 
 		if (!info->attrs[NL80211_ATTR_WIPHY_TX_POWER_LEVEL] &&
-		    (type != NL80211_TX_POWER_AUTOMATIC)) {
-			result = -EINVAL;
-			goto out;
-		}
+		    (type != NL80211_TX_POWER_AUTOMATIC))
+			return -EINVAL;
 
 		if (type != NL80211_TX_POWER_AUTOMATIC) {
 			idx = NL80211_ATTR_WIPHY_TX_POWER_LEVEL;
@@ -3743,7 +3761,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 
 		result = rdev_set_tx_power(rdev, txp_wdev, type, mbm);
 		if (result)
-			goto out;
+			return result;
 	}
 
 	if (info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX] &&
@@ -3752,10 +3770,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 
 		if ((!rdev->wiphy.available_antennas_tx &&
 		     !rdev->wiphy.available_antennas_rx) ||
-		    !rdev->ops->set_antenna) {
-			result = -EOPNOTSUPP;
-			goto out;
-		}
+		    !rdev->ops->set_antenna)
+			return -EOPNOTSUPP;
 
 		tx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX]);
 		rx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_RX]);
@@ -3763,17 +3779,15 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		/* reject antenna configurations which don't match the
 		 * available antenna masks, except for the "all" mask */
 		if ((~tx_ant && (tx_ant & ~rdev->wiphy.available_antennas_tx)) ||
-		    (~rx_ant && (rx_ant & ~rdev->wiphy.available_antennas_rx))) {
-			result = -EINVAL;
-			goto out;
-		}
+		    (~rx_ant && (rx_ant & ~rdev->wiphy.available_antennas_rx)))
+			return -EINVAL;
 
 		tx_ant = tx_ant & rdev->wiphy.available_antennas_tx;
 		rx_ant = rx_ant & rdev->wiphy.available_antennas_rx;
 
 		result = rdev_set_antenna(rdev, tx_ant, rx_ant);
 		if (result)
-			goto out;
+			return result;
 	}
 
 	changed = 0;
@@ -3795,10 +3809,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 	if (info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]) {
 		frag_threshold = nla_get_u32(
 			info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]);
-		if (frag_threshold < 256) {
-			result = -EINVAL;
-			goto out;
-		}
+		if (frag_threshold < 256)
+			return -EINVAL;
 
 		if (frag_threshold != (u32) -1) {
 			/*
@@ -3819,10 +3831,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	if (info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]) {
-		if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) {
-			result = -EINVAL;
-			goto out;
-		}
+		if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK])
+			return -EINVAL;
 
 		coverage_class = nla_get_u8(
 			info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]);
@@ -3830,20 +3840,17 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) {
-		if (!(rdev->wiphy.features & NL80211_FEATURE_ACKTO_ESTIMATION)) {
-			result = -EOPNOTSUPP;
-			goto out;
-		}
+		if (!(rdev->wiphy.features & NL80211_FEATURE_ACKTO_ESTIMATION))
+			return -EOPNOTSUPP;
 
 		changed |= WIPHY_PARAM_DYN_ACK;
 	}
 
 	if (info->attrs[NL80211_ATTR_TXQ_LIMIT]) {
 		if (!wiphy_ext_feature_isset(&rdev->wiphy,
-					     NL80211_EXT_FEATURE_TXQS)) {
-			result = -EOPNOTSUPP;
-			goto out;
-		}
+					     NL80211_EXT_FEATURE_TXQS))
+			return -EOPNOTSUPP;
+
 		txq_limit = nla_get_u32(
 			info->attrs[NL80211_ATTR_TXQ_LIMIT]);
 		changed |= WIPHY_PARAM_TXQ_LIMIT;
@@ -3851,10 +3858,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 
 	if (info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]) {
 		if (!wiphy_ext_feature_isset(&rdev->wiphy,
-					     NL80211_EXT_FEATURE_TXQS)) {
-			result = -EOPNOTSUPP;
-			goto out;
-		}
+					     NL80211_EXT_FEATURE_TXQS))
+			return -EOPNOTSUPP;
+
 		txq_memory_limit = nla_get_u32(
 			info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]);
 		changed |= WIPHY_PARAM_TXQ_MEMORY_LIMIT;
@@ -3862,10 +3868,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 
 	if (info->attrs[NL80211_ATTR_TXQ_QUANTUM]) {
 		if (!wiphy_ext_feature_isset(&rdev->wiphy,
-					     NL80211_EXT_FEATURE_TXQS)) {
-			result = -EOPNOTSUPP;
-			goto out;
-		}
+					     NL80211_EXT_FEATURE_TXQS))
+			return -EOPNOTSUPP;
+
 		txq_quantum = nla_get_u32(
 			info->attrs[NL80211_ATTR_TXQ_QUANTUM]);
 		changed |= WIPHY_PARAM_TXQ_QUANTUM;
@@ -3877,10 +3882,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 		u8 old_coverage_class;
 		u32 old_txq_limit, old_txq_memory_limit, old_txq_quantum;
 
-		if (!rdev->ops->set_wiphy_params) {
-			result = -EOPNOTSUPP;
-			goto out;
-		}
+		if (!rdev->ops->set_wiphy_params)
+			return -EOPNOTSUPP;
 
 		old_retry_short = rdev->wiphy.retry_short;
 		old_retry_long = rdev->wiphy.retry_long;
@@ -3918,15 +3921,11 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
 			rdev->wiphy.txq_limit = old_txq_limit;
 			rdev->wiphy.txq_memory_limit = old_txq_memory_limit;
 			rdev->wiphy.txq_quantum = old_txq_quantum;
-			goto out;
+			return result;
 		}
 	}
 
-	result = 0;
-
-out:
-	wiphy_unlock(&rdev->wiphy);
-	return result;
+	return 0;
 }
 
 int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef)
@@ -4010,10 +4009,10 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
 			goto nla_put_failure;
 	}
 
-	if (rdev->ops->get_tx_power) {
+	if (rdev->ops->get_tx_power && !wdev->valid_links) {
 		int dbm, ret;
 
-		ret = rdev_get_tx_power(rdev, wdev, &dbm);
+		ret = rdev_get_tx_power(rdev, wdev, 0, &dbm);
 		if (ret == 0 &&
 		    nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL,
 				DBM_TO_MBM(dbm)))
@@ -4082,6 +4081,15 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
 			if (ret == 0 && nl80211_send_chandef(msg, &chandef))
 				goto nla_put_failure;
 
+			if (rdev->ops->get_tx_power) {
+				int dbm, ret;
+
+				ret = rdev_get_tx_power(rdev, wdev, link_id, &dbm);
+				if (ret == 0 &&
+				    nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL,
+						DBM_TO_MBM(dbm)))
+					goto nla_put_failure;
+			}
 			nla_nest_end(msg, link);
 		}
 
@@ -4144,22 +4152,22 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
 
 		if_idx = 0;
 
-		wiphy_lock(&rdev->wiphy);
+		guard(wiphy)(&rdev->wiphy);
+
 		list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
 			if (if_idx < if_start) {
 				if_idx++;
 				continue;
 			}
+
 			if (nl80211_send_iface(skb, NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq, NLM_F_MULTI,
 					       rdev, wdev,
-					       NL80211_CMD_NEW_INTERFACE) < 0) {
-				wiphy_unlock(&rdev->wiphy);
+					       NL80211_CMD_NEW_INTERFACE) < 0)
 				goto out;
-			}
+
 			if_idx++;
 		}
-		wiphy_unlock(&rdev->wiphy);
 
 		if_start = 0;
 		wp_idx++;
@@ -4221,6 +4229,11 @@ static int parse_monitor_flags(struct nlattr *nla, u32 *mntrflags)
 		if (flags[flag])
 			*mntrflags |= (1<<flag);
 
+	/* cooked monitor mode is incompatible with other modes */
+	if (*mntrflags & MONITOR_FLAG_COOK_FRAMES &&
+	    *mntrflags != MONITOR_FLAG_COOK_FRAMES)
+		return -EOPNOTSUPP;
+
 	*mntrflags |= MONITOR_FLAG_CHANGED;
 
 	return 0;
@@ -4246,6 +4259,10 @@ static int nl80211_parse_mon_options(struct cfg80211_registered_device *rdev,
 			change = true;
 	}
 
+	/* MONITOR_FLAG_COOK_FRAMES is deprecated, refuse cooperation */
+	if (params->flags & MONITOR_FLAG_COOK_FRAMES)
+		return -EOPNOTSUPP;
+
 	if (params->flags & MONITOR_FLAG_ACTIVE &&
 	    !(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR))
 		return -EOPNOTSUPP;
@@ -4517,16 +4534,13 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
-	int ret;
 
 	/* to avoid failing a new interface creation due to pending removal */
 	cfg80211_destroy_ifaces(rdev);
 
-	wiphy_lock(&rdev->wiphy);
-	ret = _nl80211_new_interface(skb, info);
-	wiphy_unlock(&rdev->wiphy);
+	guard(wiphy)(&rdev->wiphy);
 
-	return ret;
+	return _nl80211_new_interface(skb, info);
 }
 
 static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info)
@@ -5512,11 +5526,13 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev,
 
 static int nl80211_parse_mbssid_config(struct wiphy *wiphy,
 				       struct net_device *dev,
+				       unsigned int link_id,
 				       struct nlattr *attrs,
 				       struct cfg80211_mbssid_config *config,
 				       u8 num_elems)
 {
 	struct nlattr *tb[NL80211_MBSSID_CONFIG_ATTR_MAX + 1];
+	int tx_link_id = -1;
 
 	if (!wiphy->mbssid_max_interfaces)
 		return -EOPNOTSUPP;
@@ -5540,6 +5556,9 @@ static int nl80211_parse_mbssid_config(struct wiphy *wiphy,
 	    (!config->index && !num_elems))
 		return -EINVAL;
 
+	if (tb[NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID])
+		tx_link_id = nla_get_u8(tb[NL80211_MBSSID_CONFIG_ATTR_TX_LINK_ID]);
+
 	if (tb[NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX]) {
 		u32 tx_ifindex =
 			nla_get_u32(tb[NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX]);
@@ -5561,10 +5580,25 @@ static int nl80211_parse_mbssid_config(struct wiphy *wiphy,
 			}
 
 			config->tx_wdev = tx_netdev->ieee80211_ptr;
+			/* Caller should call dev_put(config->tx_wdev) from this point */
+
+			if (config->tx_wdev->valid_links) {
+				if (tx_link_id == -1 ||
+				    !(config->tx_wdev->valid_links & BIT(tx_link_id)))
+					return -ENOLINK;
+
+				config->tx_link_id = tx_link_id;
+			}
 		} else {
+			if (tx_link_id >= 0 && tx_link_id != link_id)
+				return -EINVAL;
+
 			config->tx_wdev = dev->ieee80211_ptr;
 		}
 	} else if (!config->index) {
+		if (tx_link_id >= 0 && tx_link_id != link_id)
+			return -EINVAL;
+
 		config->tx_wdev = dev->ieee80211_ptr;
 	} else {
 		return -EINVAL;
@@ -6314,7 +6348,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	if (info->attrs[NL80211_ATTR_MBSSID_CONFIG]) {
-		err = nl80211_parse_mbssid_config(&rdev->wiphy, dev,
+		err = nl80211_parse_mbssid_config(&rdev->wiphy, dev, link_id,
 						  info->attrs[NL80211_ATTR_MBSSID_CONFIG],
 						  &params->mbssid_config,
 						  params->beacon.mbssid_ies ?
@@ -6750,9 +6784,6 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 	PUT_SINFO_U64(RX_BYTES64, rx_bytes);
 	PUT_SINFO_U64(TX_BYTES64, tx_bytes);
-	PUT_SINFO(LLID, llid, u16);
-	PUT_SINFO(PLID, plid, u16);
-	PUT_SINFO(PLINK_STATE, plink_state, u8);
 	PUT_SINFO_U64(RX_DURATION, rx_duration);
 	PUT_SINFO_U64(TX_DURATION, tx_duration);
 
@@ -6796,13 +6827,18 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 	PUT_SINFO(TX_RETRIES, tx_retries, u32);
 	PUT_SINFO(TX_FAILED, tx_failed, u32);
 	PUT_SINFO(EXPECTED_THROUGHPUT, expected_throughput, u32);
-	PUT_SINFO(AIRTIME_LINK_METRIC, airtime_link_metric, u32);
 	PUT_SINFO(BEACON_LOSS, beacon_loss_count, u32);
+
+	PUT_SINFO(LLID, llid, u16);
+	PUT_SINFO(PLID, plid, u16);
+	PUT_SINFO(PLINK_STATE, plink_state, u8);
+	PUT_SINFO(AIRTIME_LINK_METRIC, airtime_link_metric, u32);
 	PUT_SINFO(LOCAL_PM, local_pm, u32);
 	PUT_SINFO(PEER_PM, peer_pm, u32);
 	PUT_SINFO(NONPEER_PM, nonpeer_pm, u32);
 	PUT_SINFO(CONNECTED_TO_GATE, connected_to_gate, u8);
 	PUT_SINFO(CONNECTED_TO_AS, connected_to_as, u8);
+	PUT_SINFO_U64(T_OFFSET, t_offset);
 
 	if (sinfo->filled & BIT_ULL(NL80211_STA_INFO_BSS_PARAM)) {
 		bss_param = nla_nest_start_noflag(msg,
@@ -6830,7 +6866,6 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,
 			    &sinfo->sta_flags))
 		goto nla_put_failure;
 
-	PUT_SINFO_U64(T_OFFSET, t_offset);
 	PUT_SINFO_U64(RX_DROP_MISC, rx_dropped_misc);
 	PUT_SINFO_U64(BEACON_RX, rx_beacon);
 	PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8);
@@ -7106,6 +7141,11 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
 			return -EINVAL;
 	}
 
+	/* Accept EMLSR capabilities only for AP client before association */
+	if (statype != CFG80211_STA_AP_CLIENT_UNASSOC &&
+	    params->eml_cap_present)
+		return -EINVAL;
+
 	switch (statype) {
 	case CFG80211_STA_AP_MLME_CLIENT:
 		/* Use this only for authorizing/unauthorizing a station */
@@ -7461,6 +7501,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
 		params.link_sta_params.he_6ghz_capa =
 			nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);
 
+	if (info->attrs[NL80211_ATTR_EML_CAPABILITY]) {
+		params.eml_cap_present = true;
+		params.eml_cap =
+			nla_get_u16(info->attrs[NL80211_ATTR_EML_CAPABILITY]);
+	}
+
 	if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT])
 		params.airtime_weight =
 			nla_get_u16(info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]);
@@ -7619,6 +7665,12 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
 		}
 	}
 
+	if (info->attrs[NL80211_ATTR_EML_CAPABILITY]) {
+		params.eml_cap_present = true;
+		params.eml_cap =
+			nla_get_u16(info->attrs[NL80211_ATTR_EML_CAPABILITY]);
+	}
+
 	if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY])
 		params.link_sta_params.he_6ghz_capa =
 			nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);
@@ -10098,7 +10150,7 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
 	struct cfg80211_chan_def chandef;
 	enum nl80211_dfs_regions dfs_region;
 	unsigned int cac_time_ms;
-	int err = -EINVAL;
+	int err;
 
 	flush_delayed_work(&rdev->dfs_update_channels_wk);
 
@@ -10113,35 +10165,29 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
 		return -EINVAL;
 	}
 
-	wiphy_lock(wiphy);
+	guard(wiphy)(wiphy);
 
 	dfs_region = reg_get_dfs_region(wiphy);
 	if (dfs_region == NL80211_DFS_UNSET)
-		goto unlock;
+		return -EINVAL;
 
 	err = nl80211_parse_chandef(rdev, info, &chandef);
 	if (err)
-		goto unlock;
+		return err;
 
 	err = cfg80211_chandef_dfs_required(wiphy, &chandef, wdev->iftype);
 	if (err < 0)
-		goto unlock;
+		return err;
 
-	if (err == 0) {
-		err = -EINVAL;
-		goto unlock;
-	}
+	if (err == 0)
+		return -EINVAL;
 
-	if (!cfg80211_chandef_dfs_usable(wiphy, &chandef)) {
-		err = -EINVAL;
-		goto unlock;
-	}
+	if (!cfg80211_chandef_dfs_usable(wiphy, &chandef))
+		return -EINVAL;
 
-	if (nla_get_flag(info->attrs[NL80211_ATTR_RADAR_BACKGROUND])) {
-		err = cfg80211_start_background_radar_detection(rdev, wdev,
-								&chandef);
-		goto unlock;
-	}
+	if (nla_get_flag(info->attrs[NL80211_ATTR_RADAR_BACKGROUND]))
+		return cfg80211_start_background_radar_detection(rdev, wdev,
								 &chandef);
 
 	if (cfg80211_beaconing_iface_active(wdev)) {
 		/* During MLO other link(s) can beacon, only the current link
@@ -10151,26 +10197,19 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
 		    !wdev->links[link_id].ap.beacon_interval) {
 			/* nothing */
 		} else {
-			err = -EBUSY;
-			goto unlock;
+			return -EBUSY;
 		}
 	}
 
-	if (wdev->links[link_id].cac_started) {
-		err = -EBUSY;
-		goto unlock;
-	}
+	if (wdev->links[link_id].cac_started)
+		return -EBUSY;
 
 	/* CAC start is offloaded to HW and can't be started manually */
-	if (wiphy_ext_feature_isset(wiphy, NL80211_EXT_FEATURE_DFS_OFFLOAD)) {
-		err = -EOPNOTSUPP;
-		goto unlock;
-	}
+	if (wiphy_ext_feature_isset(wiphy, NL80211_EXT_FEATURE_DFS_OFFLOAD))
+		return -EOPNOTSUPP;
 
-	if (!rdev->ops->start_radar_detection) {
-		err = -EOPNOTSUPP;
-		goto unlock;
-	}
+	if (!rdev->ops->start_radar_detection)
+		return -EOPNOTSUPP;
 
 	cac_time_ms = cfg80211_chandef_dfs_cac_time(&rdev->wiphy, &chandef);
 	if (WARN_ON(!cac_time_ms))
@@ -10178,29 +10217,28 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
 
 	err = rdev_start_radar_detection(rdev, dev, &chandef, cac_time_ms,
 					 link_id);
-	if (!err) {
-		switch (wdev->iftype) {
-		case NL80211_IFTYPE_AP:
-		case NL80211_IFTYPE_P2P_GO:
-			wdev->links[0].ap.chandef = chandef;
-			break;
-		case NL80211_IFTYPE_ADHOC:
-			wdev->u.ibss.chandef = chandef;
-			break;
-		case NL80211_IFTYPE_MESH_POINT:
-			wdev->u.mesh.chandef = chandef;
-			break;
-		default:
-			break;
-		}
-		wdev->links[link_id].cac_started = true;
-		wdev->links[link_id].cac_start_time = jiffies;
-		wdev->links[link_id].cac_time_ms = cac_time_ms;
+	if (err)
+		return err;
+
+	switch (wdev->iftype) {
+	case NL80211_IFTYPE_AP:
+	case NL80211_IFTYPE_P2P_GO:
+		wdev->links[link_id].ap.chandef = chandef;
+		break;
+	case NL80211_IFTYPE_ADHOC:
+		wdev->u.ibss.chandef = chandef;
+		break;
+	case NL80211_IFTYPE_MESH_POINT:
+		wdev->u.mesh.chandef = chandef;
+		break;
+	default:
+		break;
 	}
-unlock:
-	wiphy_unlock(wiphy);
 
-	return err;
+	wdev->links[link_id].cac_started = true;
+	wdev->links[link_id].cac_start_time = jiffies;
+	wdev->links[link_id].cac_time_ms = cac_time_ms;
+
+	return 0;
 }
 
 static int nl80211_notify_radar_detection(struct sk_buff *skb,
@@ -10533,9 +10571,9 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb,
 			     intbss->parent_bssid)))
 		goto nla_put_failure;
 
-	if (intbss->ts_boottime &&
+	if (res->ts_boottime &&
 	    nla_put_u64_64bit(msg, NL80211_BSS_LAST_SEEN_BOOTTIME,
-			      intbss->ts_boottime, NL80211_BSS_PAD))
+			      res->ts_boottime, NL80211_BSS_PAD))
 		goto nla_put_failure;
 
 	if (!nl80211_put_signal(msg, intbss->pub.chains,
@@ -10897,6 +10935,13 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
 		req.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
 	}
 
+	if (info->attrs[NL80211_ATTR_SUPPORTED_SELECTORS]) {
+		req.supported_selectors =
+			nla_data(info->attrs[NL80211_ATTR_SUPPORTED_SELECTORS]);
+		req.supported_selectors_len =
+			nla_len(info->attrs[NL80211_ATTR_SUPPORTED_SELECTORS]);
+	}
+
 	auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]);
 	if (!nl80211_valid_auth_type(rdev, auth_type, NL80211_CMD_AUTHENTICATE))
 		return -EINVAL;
@@ -11127,12 +11172,84 @@ static struct cfg80211_bss *nl80211_assoc_bss(struct cfg80211_registered_device
 	return bss;
 }
 
+static int nl80211_process_links(struct cfg80211_registered_device *rdev,
+				 struct cfg80211_assoc_link *links,
+				 int assoc_link_id,
+				 const u8 *ssid, int ssid_len,
+				 struct genl_info *info)
+{
+	unsigned int attrsize = NUM_NL80211_ATTR * sizeof(struct nlattr *);
+	struct nlattr **attrs __free(kfree) = kzalloc(attrsize, GFP_KERNEL);
+	struct nlattr *link;
+	unsigned int link_id;
+	int rem, err;
+
+	if (!attrs)
+		return -ENOMEM;
+
+	nla_for_each_nested(link, info->attrs[NL80211_ATTR_MLO_LINKS], rem) {
+		memset(attrs, 0, attrsize);
+
+		nla_parse_nested(attrs, NL80211_ATTR_MAX, link, NULL, NULL);
+
+		if (!attrs[NL80211_ATTR_MLO_LINK_ID]) {
+			NL_SET_BAD_ATTR(info->extack, link);
+			return -EINVAL;
+		}
+
+		link_id = nla_get_u8(attrs[NL80211_ATTR_MLO_LINK_ID]);
+		/* cannot use the same link ID again */
+		if (links[link_id].bss) {
+			NL_SET_BAD_ATTR(info->extack, link);
+			return -EINVAL;
+		}
+
+		links[link_id].bss =
+			nl80211_assoc_bss(rdev, ssid, ssid_len, attrs,
+					  assoc_link_id, link_id);
+		if (IS_ERR(links[link_id].bss)) {
+			err = PTR_ERR(links[link_id].bss);
+			links[link_id].bss = NULL;
+			NL_SET_ERR_MSG_ATTR(info->extack, link,
+					    "Error fetching BSS for link");
+			return err;
+		}
+
+		if (attrs[NL80211_ATTR_IE]) {
+			links[link_id].elems = nla_data(attrs[NL80211_ATTR_IE]);
+			links[link_id].elems_len =
+				nla_len(attrs[NL80211_ATTR_IE]);
+
+			if (cfg80211_find_elem(WLAN_EID_FRAGMENT,
+					       links[link_id].elems,
+					       links[link_id].elems_len)) {
+				NL_SET_ERR_MSG_ATTR(info->extack,
+						    attrs[NL80211_ATTR_IE],
+						    "cannot deal with fragmentation");
+				return -EINVAL;
+			}
+
+			if (cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE,
+						   links[link_id].elems,
+						   links[link_id].elems_len)) {
+				NL_SET_ERR_MSG_ATTR(info->extack,
						    attrs[NL80211_ATTR_IE],
+						    "cannot deal with non-inheritance");
+				return -EINVAL;
+			}
+		}
+
+		links[link_id].disabled =
+			nla_get_flag(attrs[NL80211_ATTR_MLO_LINK_DISABLED]);
+	}
+
+	return 0;
+}
+
 static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 {
 	struct cfg80211_registered_device *rdev = info->user_ptr[0];
 	struct net_device *dev = info->user_ptr[1];
 	struct cfg80211_assoc_request req = {};
-	struct nlattr **attrs = NULL;
 	const u8 *ap_addr, *ssid;
 	unsigned int link_id;
 	int err, ssid_len;
req.ap_mld_addr; - attrs = kzalloc(attrsize, GFP_KERNEL); - if (!attrs) - return -ENOMEM; - - nla_for_each_nested(link, - info->attrs[NL80211_ATTR_MLO_LINKS], - rem) { - memset(attrs, 0, attrsize); - - nla_parse_nested(attrs, NL80211_ATTR_MAX, - link, NULL, NULL); - - if (!attrs[NL80211_ATTR_MLO_LINK_ID]) { - err = -EINVAL; - NL_SET_BAD_ATTR(info->extack, link); - goto free; - } - - link_id = nla_get_u8(attrs[NL80211_ATTR_MLO_LINK_ID]); - /* cannot use the same link ID again */ - if (req.links[link_id].bss) { - err = -EINVAL; - NL_SET_BAD_ATTR(info->extack, link); - goto free; - } - req.links[link_id].bss = - nl80211_assoc_bss(rdev, ssid, ssid_len, attrs, - req.link_id, link_id); - if (IS_ERR(req.links[link_id].bss)) { - err = PTR_ERR(req.links[link_id].bss); - req.links[link_id].bss = NULL; - NL_SET_ERR_MSG_ATTR(info->extack, - link, "Error fetching BSS for link"); - goto free; - } - - if (attrs[NL80211_ATTR_IE]) { - req.links[link_id].elems = - nla_data(attrs[NL80211_ATTR_IE]); - req.links[link_id].elems_len = - nla_len(attrs[NL80211_ATTR_IE]); - - if (cfg80211_find_elem(WLAN_EID_FRAGMENT, - req.links[link_id].elems, - req.links[link_id].elems_len)) { - NL_SET_ERR_MSG_ATTR(info->extack, - attrs[NL80211_ATTR_IE], - "cannot deal with fragmentation"); - err = -EINVAL; - goto free; - } - - if (cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, - req.links[link_id].elems, - req.links[link_id].elems_len)) { - NL_SET_ERR_MSG_ATTR(info->extack, - attrs[NL80211_ATTR_IE], - "cannot deal with non-inheritance"); - err = -EINVAL; - goto free; - } - } - - req.links[link_id].disabled = - nla_get_flag(attrs[NL80211_ATTR_MLO_LINK_DISABLED]); - } + err = nl80211_process_links(rdev, req.links, req.link_id, + ssid, ssid_len, info); + if (err) + goto free; if (!req.links[req.link_id].bss) { err = -EINVAL; @@ -11368,8 +11426,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) goto free; } - kfree(attrs); - attrs = NULL; + if (info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS]) + req.ext_mld_capa_ops = + nla_get_u16(info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS]); } else { if (req.link_id >= 0) return -EINVAL; @@ -11379,6 +11438,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(req.bss)) return PTR_ERR(req.bss); ap_addr = req.bss->bssid; + + if (info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS]) + return -EINVAL; } err = nl80211_crypto_settings(rdev, info, &req.crypto, 1); @@ -11429,7 +11491,6 @@ free: for (link_id = 0; link_id < ARRAY_SIZE(req.links); link_id++) cfg80211_put_bss(&rdev->wiphy, req.links[link_id].bss); cfg80211_put_bss(&rdev->wiphy, req.bss); - kfree(attrs); return err; } @@ -16481,6 +16542,91 @@ nl80211_set_ttlm(struct sk_buff *skb, struct genl_info *info) return rdev_set_ttlm(rdev, dev, &params); } +static int nl80211_assoc_ml_reconf(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_ml_reconf_req req = {}; + unsigned int link_id; + u16 add_links; + int err; + + if (!wdev->valid_links) + return -EINVAL; + + if (dev->ieee80211_ptr->conn_owner_nlportid && + dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid) + return -EPERM; + + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; + + add_links = 0; + if (info->attrs[NL80211_ATTR_MLO_LINKS]) { + err = 
nl80211_process_links(rdev, req.add_links, + /* mark as MLO, but not assoc */ + IEEE80211_MLD_MAX_NUM_LINKS, + NULL, 0, info); + if (err) + return err; + + for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; + link_id++) { + if (!req.add_links[link_id].bss) + continue; + add_links |= BIT(link_id); + } + } + + if (info->attrs[NL80211_ATTR_MLO_RECONF_REM_LINKS]) + req.rem_links = + nla_get_u16(info->attrs[NL80211_ATTR_MLO_RECONF_REM_LINKS]); + + /* Validate that existing links are not added, that removed links are + * valid, and that no link is both added and removed + */ + if ((add_links & req.rem_links) || !(add_links | req.rem_links) || + (wdev->valid_links & add_links) || + ((wdev->valid_links & req.rem_links) != req.rem_links)) { + err = -EINVAL; + goto out; + } + + if (info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS]) + req.ext_mld_capa_ops = + nla_get_u16(info->attrs[NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS]); + + err = cfg80211_assoc_ml_reconf(rdev, dev, &req); + +out: + for (link_id = 0; link_id < ARRAY_SIZE(req.add_links); link_id++) + cfg80211_put_bss(&rdev->wiphy, req.add_links[link_id].bss); + + return err; +} + +static int +nl80211_epcs_cfg(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + bool val; + + if (wdev->iftype != NL80211_IFTYPE_STATION && + wdev->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; + + if (!wdev->connected) + return -ENOLINK; + + val = nla_get_flag(info->attrs[NL80211_ATTR_EPCS]); + + return rdev_set_epcs(rdev, dev, val); +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -17673,6 +17819,18 @@ static const struct genl_small_ops nl80211_small_ops[] = { .flags = GENL_UNS_ADMIN_PERM, .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), }, + { + .cmd = NL80211_CMD_ASSOC_MLO_RECONF, + .doit = nl80211_assoc_ml_reconf, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + }, + { + .cmd = NL80211_CMD_EPCS_CFG, + .doit = nl80211_epcs_cfg, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + }, }; static struct genl_family nl80211_fam __ro_after_init = { @@ -18448,10 +18606,9 @@ void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, peer_addr)) goto nla_put_failure; - if ((td_bitmap_len > 0) && td_bitmap) - if (nla_put(msg, NL80211_ATTR_TD_BITMAP, - td_bitmap_len, td_bitmap)) - goto nla_put_failure; + if (td_bitmap_len > 0 && td_bitmap && + nla_put(msg, NL80211_ATTR_TD_BITMAP, td_bitmap_len, td_bitmap)) + goto nla_put_failure; genlmsg_end(msg, hdr); @@ -18569,6 +18726,23 @@ void cfg80211_links_removed(struct net_device *dev, u16 link_mask) } EXPORT_SYMBOL(cfg80211_links_removed); +void nl80211_mlo_reconf_add_done(struct net_device *dev, + struct cfg80211_mlo_reconf_done_data *data) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct nl80211_mlme_event event = { + .cmd = NL80211_CMD_ASSOC_MLO_RECONF, + .buf = data->buf, + .buf_len = data->len, + .uapsd_queues = -1, + }; + + nl80211_send_mlme_event(rdev, dev, &event, GFP_KERNEL); +} +EXPORT_SYMBOL(nl80211_mlo_reconf_add_done); + void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 
*bssid, gfp_t gfp) @@ -20401,6 +20575,39 @@ void cfg80211_schedule_channels_check(struct wireless_dev *wdev) } EXPORT_SYMBOL(cfg80211_schedule_channels_check); +void cfg80211_epcs_changed(struct net_device *netdev, bool enabled) +{ + struct wireless_dev *wdev = netdev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_epcs_changed(wdev, enabled); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_EPCS_CFG); + if (!hdr) { + nlmsg_free(msg); + return; + } + + if (enabled && nla_put_flag(msg, NL80211_ATTR_EPCS)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, + NL80211_MCGRP_MLME, GFP_KERNEL); + return; + + nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_epcs_changed); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index ffaab9a92e5b..5e25782af1e0 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -124,4 +124,7 @@ void cfg80211_free_coalesce(struct cfg80211_coalesce *coalesce); /* peer measurement */ int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info); +void nl80211_mlo_reconf_add_done(struct net_device *dev, + struct cfg80211_mlo_reconf_done_data *data); + #endif /* __NET_WIRELESS_NL80211_H */ diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index 0396fa19bdf1..a117f5093ca2 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -630,9 +630,9 @@ void cfg80211_pmsr_free_wk(struct work_struct *work) struct wireless_dev *wdev = container_of(work, struct wireless_dev, pmsr_free_wk); - wiphy_lock(wdev->wiphy); + guard(wiphy)(wdev->wiphy); + cfg80211_pmsr_process_abort(wdev); - wiphy_unlock(wdev->wiphy); } void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev) diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index adb6105bbb7d..9f4783c2354c 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016-2017 Intel Deutschland GmbH - * Copyright (C) 2018, 2021-2024 Intel Corporation + * Copyright (C) 2018, 2021-2025 Intel Corporation */ #ifndef __CFG80211_RDEV_OPS #define __CFG80211_RDEV_OPS @@ -600,11 +600,12 @@ static inline int rdev_set_tx_power(struct cfg80211_registered_device *rdev, } static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, int *dbm) + struct wireless_dev *wdev, unsigned int link_id, + int *dbm) { int ret; - trace_rdev_get_tx_power(&rdev->wiphy, wdev); - ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, dbm); + trace_rdev_get_tx_power(&rdev->wiphy, wdev, link_id); + ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, link_id, dbm); trace_rdev_return_int_int(&rdev->wiphy, ret, *dbm); return ret; } @@ -1546,4 +1547,36 @@ rdev_get_radio_mask(struct cfg80211_registered_device *rdev, return rdev->ops->get_radio_mask(wiphy, dev); } + +static inline int +rdev_assoc_ml_reconf(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_ml_reconf_req *req) +{ + struct wiphy *wiphy = &rdev->wiphy; + int ret = -EOPNOTSUPP; + + trace_rdev_assoc_ml_reconf(wiphy, dev, req); + if (rdev->ops->assoc_ml_reconf) + ret = rdev->ops->assoc_ml_reconf(wiphy, dev, req); + trace_rdev_return_int(wiphy, ret); + + return ret; +} + +static inline 
int +rdev_set_epcs(struct cfg80211_registered_device *rdev, + struct net_device *dev, bool val) +{ + struct wiphy *wiphy = &rdev->wiphy; + int ret = -EOPNOTSUPP; + + trace_rdev_set_epcs(wiphy, dev, val); + if (rdev->ops->set_epcs) + ret = rdev->ops->set_epcs(wiphy, dev, val); + trace_rdev_return_int(wiphy, ret); + + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 1df65a5a44f7..c1752b31734f 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -5,7 +5,7 @@ * Copyright 2008-2011 Luis R. Rodriguez <mcgrof@qca.qualcomm.com> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2024 Intel Corporation + * Copyright (C) 2018 - 2025 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -407,7 +407,8 @@ static bool is_an_alpha2(const char *alpha2) { if (!alpha2) return false; - return isalpha(alpha2[0]) && isalpha(alpha2[1]); + return isascii(alpha2[0]) && isalpha(alpha2[0]) && + isascii(alpha2[1]) && isalpha(alpha2[1]); } static bool alpha2_equal(const char *alpha2_x, const char *alpha2_y) @@ -1602,6 +1603,8 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_PSD; if (rd_flags & NL80211_RRF_ALLOW_6GHZ_VLP_AP) channel_flags |= IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP; + if (rd_flags & NL80211_RRF_ALLOW_20MHZ_ACTIVITY) + channel_flags |= IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY; return channel_flags; } @@ -2465,11 +2468,11 @@ static void reg_leave_invalid_chans(struct wiphy *wiphy) struct wireless_dev *wdev; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - wiphy_lock(wiphy); + guard(wiphy)(wiphy); + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) if (!reg_wdev_chan_valid(wiphy, wdev)) cfg80211_leave(rdev, wdev); - wiphy_unlock(wiphy); } static void reg_check_chans_work(struct work_struct *work) @@ -2649,13 +2652,11 @@ void wiphy_apply_custom_regulatory(struct wiphy *wiphy, return; rtnl_lock(); - wiphy_lock(wiphy); - - tmp = get_wiphy_regdom(wiphy); - rcu_assign_pointer(wiphy->regd, new_regd); - rcu_free_regdom(tmp); - - wiphy_unlock(wiphy); + scoped_guard(wiphy, wiphy) { + tmp = get_wiphy_regdom(wiphy); + rcu_assign_pointer(wiphy->regd, new_regd); + rcu_free_regdom(tmp); + } rtnl_unlock(); } EXPORT_SYMBOL(wiphy_apply_custom_regulatory); @@ -2825,9 +2826,9 @@ reg_process_hint_driver(struct wiphy *wiphy, tmp = get_wiphy_regdom(wiphy); ASSERT_RTNL(); - wiphy_lock(wiphy); - rcu_assign_pointer(wiphy->regd, regd); - wiphy_unlock(wiphy); + scoped_guard(wiphy, wiphy) { + rcu_assign_pointer(wiphy->regd, regd); + } rcu_free_regdom(tmp); } @@ -3205,9 +3206,9 @@ static void reg_process_self_managed_hints(void) ASSERT_RTNL(); for_each_rdev(rdev) { - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + reg_process_self_managed_hint(&rdev->wiphy); - wiphy_unlock(&rdev->wiphy); } reg_check_channels(); @@ -3600,14 +3601,12 @@ static bool is_wiphy_all_set_reg_flag(enum ieee80211_regulatory_flags flag) struct wireless_dev *wdev; for_each_rdev(rdev) { - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { - if (!(wdev->wiphy->regulatory_flags & flag)) { - wiphy_unlock(&rdev->wiphy); + if (!(wdev->wiphy->regulatory_flags & flag)) return false; - } } - wiphy_unlock(&rdev->wiphy); } return true; @@ -3883,19 +3882,18 @@ static int reg_set_rd_driver(const struct 
ieee80211_regdomain *rd, if (!driver_request->intersect) { ASSERT_RTNL(); - wiphy_lock(request_wiphy); - if (request_wiphy->regd) - tmp = get_wiphy_regdom(request_wiphy); - - regd = reg_copy_regd(rd); - if (IS_ERR(regd)) { - wiphy_unlock(request_wiphy); - return PTR_ERR(regd); + scoped_guard(wiphy, request_wiphy) { + if (request_wiphy->regd) + tmp = get_wiphy_regdom(request_wiphy); + + regd = reg_copy_regd(rd); + if (IS_ERR(regd)) + return PTR_ERR(regd); + + rcu_assign_pointer(request_wiphy->regd, regd); + rcu_free_regdom(tmp); } - rcu_assign_pointer(request_wiphy->regd, regd); - rcu_free_regdom(tmp); - wiphy_unlock(request_wiphy); reset_regdomains(false, rd); return 0; } diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 1c6fd45aa809..e8a4fe44ec2d 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -5,7 +5,7 @@ * Copyright 2008 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2016 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ #include <linux/kernel.h> #include <linux/slab.h> @@ -272,12 +272,19 @@ cfg80211_gen_new_ie(const u8 *ie, size_t ielen, { const struct element *non_inherit_elem, *parent, *sub; u8 *pos = new_ie; - u8 id, ext_id; + const u8 *mbssid_index_ie; + u8 id, ext_id, bssid_index = 255; unsigned int match_len; non_inherit_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, subie, subie_len); + mbssid_index_ie = cfg80211_find_ie(WLAN_EID_MULTI_BSSID_IDX, subie, + subie_len); + if (mbssid_index_ie && mbssid_index_ie[1] > 0 && + mbssid_index_ie[2] > 0 && mbssid_index_ie[2] <= 46) + bssid_index = mbssid_index_ie[2]; + /* We copy the elements one by one from the parent to the generated * elements. * If they are not inherited (included in subie or in the non @@ -316,6 +323,24 @@ cfg80211_gen_new_ie(const u8 *ie, size_t ielen, continue; } + /* For ML probe response, match the MLE in the frame body with + * MLD id being 'bssid_index' + */ + if (parent->id == WLAN_EID_EXTENSION && parent->datalen > 1 && + parent->data[0] == WLAN_EID_EXT_EHT_MULTI_LINK && + bssid_index == ieee80211_mle_get_mld_id(parent->data + 1)) { + if (!cfg80211_copy_elem_with_frags(parent, + ie, ielen, + &pos, new_ie, + new_ie_len)) + return 0; + + /* Continue here to prevent processing the MLE in + * sub-element, which AP MLD should not carry + */ + continue; + } + /* Already copied if an earlier element had the same type */ if (cfg80211_find_elem_match(id, ie, (u8 *)parent - ie, &ext_id, match_len, 0)) @@ -704,7 +729,7 @@ cfg80211_parse_colocated_ap_iter(void *_data, u8 type, bss_params))) return RNR_ITER_CONTINUE; - entry = kzalloc(sizeof(*entry) + IEEE80211_MAX_SSID_LEN, GFP_ATOMIC); + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return RNR_ITER_ERROR; @@ -713,6 +738,17 @@ cfg80211_parse_colocated_ap_iter(void *_data, u8 type, if (!cfg80211_parse_ap_info(entry, tbtt_info, tbtt_info_len, data->ssid_elem, data->s_ssid_tmp)) { + struct cfg80211_colocated_ap *tmp; + + /* Don't add duplicate BSSIDs on the same channel. 
*/ + list_for_each_entry(tmp, &data->ap_list, list) { + if (ether_addr_equal(tmp->bssid, entry->bssid) && + tmp->center_freq == entry->center_freq) { + kfree(entry); + return RNR_ITER_CONTINUE; + } + } + data->n_coloc++; list_add_tail(&entry->list, &data->ap_list); } else { @@ -763,12 +799,11 @@ static void cfg80211_scan_req_add_chan(struct cfg80211_scan_request *request, } } + request->n_channels++; request->channels[n_channels] = chan; if (add_to_6ghz) request->scan_6ghz_params[request->n_6ghz_params].channel_idx = n_channels; - - request->n_channels++; } static bool cfg80211_find_ssid_match(struct cfg80211_colocated_ap *ap, @@ -858,9 +893,7 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) if (ret) continue; - entry = kzalloc(sizeof(*entry) + IEEE80211_MAX_SSID_LEN, - GFP_ATOMIC); - + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; @@ -1238,7 +1271,8 @@ void cfg80211_sched_scan_results_wk(struct work_struct *work) rdev = container_of(work, struct cfg80211_registered_device, sched_scan_res_wk); - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + list_for_each_entry_safe(req, tmp, &rdev->sched_scan_req_list, list) { if (req->report_results) { req->report_results = false; @@ -1253,7 +1287,6 @@ void cfg80211_sched_scan_results_wk(struct work_struct *work) NL80211_CMD_SCHED_SCAN_RESULTS); } } - wiphy_unlock(&rdev->wiphy); } void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid) @@ -1288,9 +1321,9 @@ EXPORT_SYMBOL(cfg80211_sched_scan_stopped_locked); void cfg80211_sched_scan_stopped(struct wiphy *wiphy, u64 reqid) { - wiphy_lock(wiphy); + guard(wiphy)(wiphy); + cfg80211_sched_scan_stopped_locked(wiphy, reqid); - wiphy_unlock(wiphy); } EXPORT_SYMBOL(cfg80211_sched_scan_stopped); @@ -1332,7 +1365,7 @@ void cfg80211_bss_age(struct cfg80211_registered_device *rdev, unsigned long age_secs) { struct cfg80211_internal_bss *bss; - unsigned long age_jiffies = msecs_to_jiffies(age_secs * MSEC_PER_SEC); + unsigned long age_jiffies = secs_to_jiffies(age_secs); spin_lock_bh(&rdev->bss_lock); list_for_each_entry(bss, &rdev->bss_list, list) @@ -1901,7 +1934,7 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, known->pub.signal = new->pub.signal; known->pub.capability = new->pub.capability; known->ts = new->ts; - known->ts_boottime = new->ts_boottime; + known->pub.ts_boottime = new->pub.ts_boottime; known->parent_tsf = new->parent_tsf; known->pub.chains = new->pub.chains; memcpy(known->pub.chain_signal, new->pub.chain_signal, @@ -2258,7 +2291,7 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, tmp.pub.signal = 0; tmp.pub.beacon_interval = data->beacon_interval; tmp.pub.capability = data->capability; - tmp.ts_boottime = drv_data->boottime_ns; + tmp.pub.ts_boottime = drv_data->boottime_ns; tmp.parent_tsf = drv_data->parent_tsf; ether_addr_copy(tmp.parent_bssid, drv_data->parent_bssid); tmp.pub.chains = drv_data->chains; @@ -2648,7 +2681,7 @@ cfg80211_defrag_mle(const struct element *mle, const u8 *ie, size_t ielen, /* Required length for first defragmentation */ buf_len = mle->datalen - 1; for_each_element(elem, mle->data + mle->datalen, - ielen - sizeof(*mle) + mle->datalen) { + ie + ielen - mle->data - mle->datalen) { if (elem->id != WLAN_EID_FRAGMENT) break; @@ -3217,6 +3250,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, const u8 *ie; size_t ielen; u64 tsf; + size_t s1g_optional_len; if (WARN_ON(!mgmt)) return NULL; @@ -3231,12 +3265,11 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, if 
(ieee80211_is_s1g_beacon(mgmt->frame_control)) { ext = (void *) mgmt; - if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) - min_hdr_len = offsetof(struct ieee80211_ext, - u.s1g_short_beacon.variable); - else - min_hdr_len = offsetof(struct ieee80211_ext, - u.s1g_beacon.variable); + s1g_optional_len = + ieee80211_s1g_optional_len(ext->frame_control); + min_hdr_len = + offsetof(struct ieee80211_ext, u.s1g_beacon.variable) + + s1g_optional_len; } else { /* same for beacons */ min_hdr_len = offsetof(struct ieee80211_mgmt, @@ -3252,11 +3285,7 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, const struct ieee80211_s1g_bcn_compat_ie *compat; const struct element *elem; - if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) - ie = ext->u.s1g_short_beacon.variable; - else - ie = ext->u.s1g_beacon.variable; - + ie = ext->u.s1g_beacon.variable + s1g_optional_len; elem = cfg80211_find_elem(WLAN_EID_S1G_BCN_COMPAT, ie, ielen); if (!elem) return NULL; @@ -3565,10 +3594,8 @@ int cfg80211_wext_siwscan(struct net_device *dev, /* translate "Scan for SSID" request */ if (wreq) { if (wrqu->data.flags & IW_SCAN_THIS_ESSID) { - if (wreq->essid_len > IEEE80211_MAX_SSID_LEN) { - err = -EINVAL; - goto out; - } + if (wreq->essid_len > IEEE80211_MAX_SSID_LEN) + return -EINVAL; memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len); creq->ssids[0].ssid_len = wreq->essid_len; } @@ -3584,20 +3611,20 @@ int cfg80211_wext_siwscan(struct net_device *dev, eth_broadcast_addr(creq->bssid); - wiphy_lock(&rdev->wiphy); - - rdev->scan_req = creq; - err = rdev_scan(rdev, creq); - if (err) { - rdev->scan_req = NULL; - /* creq will be freed below */ - } else { - nl80211_send_scan_start(rdev, dev->ieee80211_ptr); - /* creq now owned by driver */ - creq = NULL; - dev_hold(dev); + scoped_guard(wiphy, &rdev->wiphy) { + rdev->scan_req = creq; + err = rdev_scan(rdev, creq); + if (err) { + rdev->scan_req = NULL; + /* creq will be freed below */ + } else { + nl80211_send_scan_start(rdev, dev->ieee80211_ptr); + /* creq now owned by driver */ + creq = NULL; + dev_hold(dev); + } } - wiphy_unlock(&rdev->wiphy); + out: kfree(creq); return err; diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 268171600087..cf998500a965 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -252,7 +252,7 @@ void cfg80211_conn_work(struct work_struct *work) u8 bssid_buf[ETH_ALEN], *bssid = NULL; enum nl80211_timeout_reason treason; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) @@ -280,8 +280,6 @@ void cfg80211_conn_work(struct work_struct *work) __cfg80211_connect_result(wdev->netdev, &cr, false); } } - - wiphy_unlock(&rdev->wiphy); } static void cfg80211_step_auth_next(struct cfg80211_conn *conn, @@ -693,13 +691,13 @@ static bool cfg80211_is_all_idle(void) * as chan dfs state, etc. 
*/ for_each_rdev(rdev) { - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (wdev->conn || wdev->connected || cfg80211_beaconing_iface_active(wdev)) is_all_idle = false; } - wiphy_unlock(&rdev->wiphy); } return is_all_idle; @@ -1583,7 +1581,7 @@ void cfg80211_autodisconnect_wk(struct work_struct *work) container_of(work, struct wireless_dev, disconnect_wk); struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - wiphy_lock(wdev->wiphy); + guard(wiphy)(wdev->wiphy); if (wdev->conn_owner_nlportid) { switch (wdev->iftype) { @@ -1619,6 +1617,4 @@ void cfg80211_autodisconnect_wk(struct work_struct *work) break; } } - - wiphy_unlock(wdev->wiphy); } diff --git a/net/wireless/tests/scan.c b/net/wireless/tests/scan.c index e12f620b5f42..b1a9c1466d6c 100644 --- a/net/wireless/tests/scan.c +++ b/net/wireless/tests/scan.c @@ -810,6 +810,8 @@ static void test_cfg80211_parse_colocated_ap(struct kunit *test) skb_put_data(input, "123", 3); ies = kunit_kzalloc(test, struct_size(ies, data, input->len), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, ies); + ies->len = input->len; memcpy(ies->data, input->data, input->len); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index d5c9bb614fa6..4ed9fada4ec0 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -2,7 +2,7 @@ /* * Portions of this file * Copyright(c) 2016-2017 Intel Deutschland GmbH - * Copyright (C) 2018, 2020-2024 Intel Corporation + * Copyright (C) 2018, 2020-2025 Intel Corporation */ #undef TRACE_SYSTEM #define TRACE_SYSTEM cfg80211 @@ -1378,6 +1378,7 @@ TRACE_EVENT(rdev_assoc, __dynamic_array(u8, fils_kek, req->fils_kek_len) __dynamic_array(u8, fils_nonces, req->fils_nonces ? 2 * FILS_NONCE_LEN : 0) + __field(u16, ext_mld_capa_ops) ), TP_fast_assign( WIPHY_ASSIGN; @@ -1404,6 +1405,7 @@ TRACE_EVENT(rdev_assoc, if (req->fils_nonces) memcpy(__get_dynamic_array(fils_nonces), req->fils_nonces, 2 * FILS_NONCE_LEN); + __entry->ext_mld_capa_ops = req->ext_mld_capa_ops; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", bssid: %pM" ", previous bssid: %pM, use mfp: %s, flags: 0x%x", @@ -1690,9 +1692,28 @@ TRACE_EVENT(rdev_set_wiphy_params, WIPHY_PR_ARG, __entry->changed) ); -DEFINE_EVENT(wiphy_wdev_evt, rdev_get_tx_power, - TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), - TP_ARGS(wiphy, wdev) +DECLARE_EVENT_CLASS(wiphy_wdev_link_evt, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id), + TP_ARGS(wiphy, wdev, link_id), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(unsigned int, link_id) + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->link_id = link_id; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %u", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id) +); + +DEFINE_EVENT(wiphy_wdev_link_evt, rdev_get_tx_power, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + unsigned int link_id), + TP_ARGS(wiphy, wdev, link_id) ); TRACE_EVENT(rdev_set_tx_power, @@ -2192,25 +2213,6 @@ TRACE_EVENT(rdev_set_noack_map, TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", noack_map: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->noack_map) ); - -DECLARE_EVENT_CLASS(wiphy_wdev_link_evt, - TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, - unsigned int link_id), - TP_ARGS(wiphy, wdev, link_id), - TP_STRUCT__entry( - WIPHY_ENTRY - WDEV_ENTRY - __field(unsigned int, link_id) - ), - TP_fast_assign( - WIPHY_ASSIGN; - WDEV_ASSIGN; - __entry->link_id = link_id; - ), - TP_printk(WIPHY_PR_FMT ", " 
WDEV_PR_FMT ", link_id: %u", - WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id) -); - DEFINE_EVENT(wiphy_wdev_link_evt, rdev_get_channel, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, unsigned int link_id), @@ -3049,6 +3051,24 @@ TRACE_EVENT(rdev_set_ttlm, WIPHY_PR_ARG, NETDEV_PR_ARG) ); +TRACE_EVENT(rdev_set_epcs, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + bool val), + TP_ARGS(wiphy, netdev, val), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __field(bool, val) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + __entry->val = val; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", config=%u", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->val) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ @@ -4100,10 +4120,71 @@ TRACE_EVENT(cfg80211_links_removed, NETDEV_ASSIGN; __entry->link_mask = link_mask; ), - TP_printk(NETDEV_PR_FMT ", link_mask:%u", NETDEV_PR_ARG, + TP_printk(NETDEV_PR_FMT ", link_mask:0x%x", NETDEV_PR_ARG, __entry->link_mask) ); +TRACE_EVENT(cfg80211_mlo_reconf_add_done, + TP_PROTO(struct net_device *netdev, u16 link_mask, + const u8 *buf, size_t len), + TP_ARGS(netdev, link_mask, buf, len), + TP_STRUCT__entry( + NETDEV_ENTRY + __field(u16, link_mask) + __dynamic_array(u8, buf, len) + ), + TP_fast_assign( + NETDEV_ASSIGN; + __entry->link_mask = link_mask; + memcpy(__get_dynamic_array(buf), buf, len); + ), + TP_printk(NETDEV_PR_FMT ", link_mask:0x%x", + NETDEV_PR_ARG, __entry->link_mask) +); + +TRACE_EVENT(rdev_assoc_ml_reconf, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_ml_reconf_req *req), + TP_ARGS(wiphy, netdev, req), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __field(u16, add_links) + __field(u16, rem_links) + __field(u16, ext_mld_capa_ops) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + u32 i; + + __entry->add_links = 0; + __entry->rem_links = req->rem_links; + for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) + if (req->add_links[i].bss) + __entry->add_links |= BIT(i); + __entry->ext_mld_capa_ops = req->ext_mld_capa_ops; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", add_links=0x%x, rem_links=0x%x", + WIPHY_PR_ARG, NETDEV_PR_ARG, + __entry->add_links, __entry->rem_links) +); + +TRACE_EVENT(cfg80211_epcs_changed, + TP_PROTO(struct wireless_dev *wdev, bool enabled), + TP_ARGS(wdev, enabled), + TP_STRUCT__entry( + WDEV_ENTRY + __field(u32, enabled) + ), + TP_fast_assign( + WDEV_ASSIGN; + __entry->enabled = enabled; + ), + TP_printk(WDEV_PR_FMT ", enabled=%u", + WDEV_PR_ARG, __entry->enabled) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/wireless/util.c b/net/wireless/util.c index 65c8e47246b7..ed868c0f7ca8 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -5,7 +5,7 @@ * Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2023, 2025 Intel Corporation */ #include <linux/export.h> #include <linux/bitops.h> @@ -2572,7 +2572,6 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; - int ret; wdev = dev->ieee80211_ptr; if (!wdev) @@ -2584,11 +2583,9 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr, memset(sinfo, 0, 
sizeof(*sinfo)); - wiphy_lock(&rdev->wiphy); - ret = rdev_get_station(rdev, dev, mac_addr, sinfo); - wiphy_unlock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); - return ret; + return rdev_get_station(rdev, dev, mac_addr, sinfo); } EXPORT_SYMBOL(cfg80211_get_station); @@ -2911,7 +2908,7 @@ bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, u32 freq, width; freq = ieee80211_chandef_to_khz(chandef); - width = nl80211_chan_width_to_mhz(chandef->width); + width = cfg80211_chandef_get_width(chandef); if (!ieee80211_radio_freq_range_valid(radio, freq, width)) return false; diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 90d5c0592667..a74b1afc594e 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -39,7 +39,6 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, struct cfg80211_registered_device *rdev; struct vif_params vifparams; enum nl80211_iftype type; - int ret; rdev = wiphy_to_rdev(wdev->wiphy); @@ -62,11 +61,9 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, memset(&vifparams, 0, sizeof(vifparams)); - wiphy_lock(wdev->wiphy); - ret = cfg80211_change_iface(rdev, dev, type, &vifparams); - wiphy_unlock(wdev->wiphy); + guard(wiphy)(wdev->wiphy); - return ret; + return cfg80211_change_iface(rdev, dev, type, &vifparams); } int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, @@ -258,23 +255,17 @@ int cfg80211_wext_siwrts(struct net_device *dev, u32 orts = wdev->wiphy->rts_threshold; int err; - wiphy_lock(&rdev->wiphy); - if (rts->disabled || !rts->fixed) { + guard(wiphy)(&rdev->wiphy); + if (rts->disabled || !rts->fixed) wdev->wiphy->rts_threshold = (u32) -1; - } else if (rts->value < 0) { - err = -EINVAL; - goto out; - } else { + else if (rts->value < 0) + return -EINVAL; + else wdev->wiphy->rts_threshold = rts->value; - } err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_RTS_THRESHOLD); - if (err) wdev->wiphy->rts_threshold = orts; - -out: - wiphy_unlock(&rdev->wiphy); return err; } @@ -302,12 +293,12 @@ int cfg80211_wext_siwfrag(struct net_device *dev, u32 ofrag = wdev->wiphy->frag_threshold; int err; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + if (frag->disabled || !frag->fixed) { wdev->wiphy->frag_threshold = (u32) -1; } else if (frag->value < 256) { - err = -EINVAL; - goto out; + return -EINVAL; } else { /* Fragment length must be even, so strip LSB. 
*/ wdev->wiphy->frag_threshold = frag->value & ~0x1; @@ -316,9 +307,6 @@ static int cfg80211_wext_siwfrag(struct net_device *dev, err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_FRAG_THRESHOLD); if (err) wdev->wiphy->frag_threshold = ofrag; -out: - wiphy_unlock(&rdev->wiphy); - return err; } @@ -352,7 +340,8 @@ static int cfg80211_wext_siwretry(struct net_device *dev, (retry->flags & IW_RETRY_TYPE) != IW_RETRY_LIMIT) return -EINVAL; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + if (retry->flags & IW_RETRY_LONG) { wdev->wiphy->retry_long = retry->value; changed |= WIPHY_PARAM_RETRY_LONG; @@ -371,7 +360,6 @@ static int cfg80211_wext_siwretry(struct net_device *dev, wdev->wiphy->retry_short = oshort; wdev->wiphy->retry_long = olong; } - wiphy_unlock(&rdev->wiphy); return err; } @@ -578,9 +566,9 @@ static int cfg80211_wext_siwencode(struct net_device *dev, struct iw_point *erq = &wrqu->encoding; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - int idx, err; - bool remove = false; struct key_params params; + bool remove = false; + int idx; if (wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_ADHOC) @@ -592,11 +580,9 @@ static int cfg80211_wext_siwencode(struct net_device *dev, !rdev->ops->set_default_key) return -EOPNOTSUPP; - wiphy_lock(&rdev->wiphy); - if (wdev->valid_links) { - err = -EOPNOTSUPP; - goto out; - } + guard(wiphy)(&rdev->wiphy); + if (wdev->valid_links) + return -EOPNOTSUPP; idx = erq->flags & IW_ENCODE_INDEX; if (idx == 0) { @@ -604,8 +590,7 @@ static int cfg80211_wext_siwencode(struct net_device *dev, if (idx < 0) idx = 0; } else if (idx < 1 || idx > 4) { - err = -EINVAL; - goto out; + return -EINVAL; } else { idx--; } @@ -614,7 +599,8 @@ static int cfg80211_wext_siwencode(struct net_device *dev, remove = true; else if (erq->length == 0) { /* No key data - just set the default TX key index */ - err = 0; + int err = 0; + if (wdev->connected || (wdev->iftype == NL80211_IFTYPE_ADHOC && wdev->u.ibss.current_bss)) @@ -622,28 +608,22 @@ static int cfg80211_wext_siwencode(struct net_device *dev, true); if (!err) wdev->wext.default_key = idx; - goto out; + return err; } memset(&params, 0, sizeof(params)); params.key = keybuf; params.key_len = erq->length; - if (erq->length == 5) { + if (erq->length == 5) params.cipher = WLAN_CIPHER_SUITE_WEP40; - } else if (erq->length == 13) { + else if (erq->length == 13) params.cipher = WLAN_CIPHER_SUITE_WEP104; - } else if (!remove) { - err = -EINVAL; - goto out; - } - - err = cfg80211_set_encryption(rdev, dev, false, NULL, remove, - wdev->wext.default_key == -1, - idx, &params); -out: - wiphy_unlock(&rdev->wiphy); + else if (!remove) + return -EINVAL; - return err; + return cfg80211_set_encryption(rdev, dev, false, NULL, remove, + wdev->wext.default_key == -1, + idx, &params); } static int cfg80211_wext_siwencodeext(struct net_device *dev, @@ -659,7 +639,6 @@ static int cfg80211_wext_siwencodeext(struct net_device *dev, bool remove = false; struct key_params params; u32 cipher; - int ret; if (wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_ADHOC) @@ -734,16 +713,13 @@ static int cfg80211_wext_siwencodeext(struct net_device *dev, params.seq_len = 6; } - wiphy_lock(wdev->wiphy); - ret = cfg80211_set_encryption( - rdev, dev, - !(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY), - addr, remove, - ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY, - idx, &params); - wiphy_unlock(wdev->wiphy); + guard(wiphy)(wdev->wiphy); - return 
cfg80211_set_encryption(rdev, dev, + !(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY), + addr, remove, + ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY, + idx, &params); } static int cfg80211_wext_giwencode(struct net_device *dev, @@ -794,61 +770,41 @@ static int cfg80211_wext_siwfreq(struct net_device *dev, struct cfg80211_chan_def chandef = { .width = NL80211_CHAN_WIDTH_20_NOHT, }; - int freq, ret; + int freq; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); switch (wdev->iftype) { case NL80211_IFTYPE_STATION: - ret = cfg80211_mgd_wext_siwfreq(dev, info, wextfreq, extra); - break; + return cfg80211_mgd_wext_siwfreq(dev, info, wextfreq, extra); case NL80211_IFTYPE_ADHOC: - ret = cfg80211_ibss_wext_siwfreq(dev, info, wextfreq, extra); - break; + return cfg80211_ibss_wext_siwfreq(dev, info, wextfreq, extra); case NL80211_IFTYPE_MONITOR: freq = cfg80211_wext_freq(wextfreq); - if (freq < 0) { - ret = freq; - break; - } - if (freq == 0) { - ret = -EINVAL; - break; - } + if (freq < 0) + return freq; + if (freq == 0) + return -EINVAL; + chandef.center_freq1 = freq; chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); - if (!chandef.chan) { - ret = -EINVAL; - break; - } - ret = cfg80211_set_monitor_channel(rdev, dev, &chandef); - break; + if (!chandef.chan) + return -EINVAL; + return cfg80211_set_monitor_channel(rdev, dev, &chandef); case NL80211_IFTYPE_MESH_POINT: freq = cfg80211_wext_freq(wextfreq); - if (freq < 0) { - ret = freq; - break; - } - if (freq == 0) { - ret = -EINVAL; - break; - } + if (freq < 0) + return freq; + if (freq == 0) + return -EINVAL; chandef.center_freq1 = freq; chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq); - if (!chandef.chan) { - ret = -EINVAL; - break; - } - ret = cfg80211_set_mesh_channel(rdev, wdev, &chandef); - break; + if (!chandef.chan) + return -EINVAL; + return cfg80211_set_mesh_channel(rdev, wdev, &chandef); default: - ret = -EOPNOTSUPP; - break; + return -EOPNOTSUPP; } - - wiphy_unlock(&rdev->wiphy); - - return ret; } static int cfg80211_wext_giwfreq(struct net_device *dev, @@ -861,35 +817,26 @@ static int cfg80211_wext_giwfreq(struct net_device *dev, struct cfg80211_chan_def chandef = {}; int ret; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + switch (wdev->iftype) { case NL80211_IFTYPE_STATION: - ret = cfg80211_mgd_wext_giwfreq(dev, info, freq, extra); - break; + return cfg80211_mgd_wext_giwfreq(dev, info, freq, extra); case NL80211_IFTYPE_ADHOC: - ret = cfg80211_ibss_wext_giwfreq(dev, info, freq, extra); - break; + return cfg80211_ibss_wext_giwfreq(dev, info, freq, extra); case NL80211_IFTYPE_MONITOR: - if (!rdev->ops->get_channel) { - ret = -EINVAL; - break; - } + if (!rdev->ops->get_channel) + return -EINVAL; ret = rdev_get_channel(rdev, wdev, 0, &chandef); if (ret) - break; + return ret; freq->m = chandef.chan->center_freq; freq->e = 6; - ret = 0; - break; + return ret; default: - ret = -EINVAL; - break; + return -EINVAL; } - - wiphy_unlock(&rdev->wiphy); - - return ret; } static int cfg80211_wext_siwtxpower(struct net_device *dev, @@ -900,7 +847,6 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev, struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); enum nl80211_tx_power_setting type; int dbm = 0; - int ret; if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM) return -EINVAL; @@ -942,11 +888,9 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev, return 0; } - wiphy_lock(&rdev->wiphy); - ret = rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm)); - wiphy_unlock(&rdev->wiphy); + 
guard(wiphy)(&rdev->wiphy); - return ret; + return rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm)); } static int cfg80211_wext_giwtxpower(struct net_device *dev, @@ -965,9 +909,9 @@ static int cfg80211_wext_giwtxpower(struct net_device *dev, if (!rdev->ops->get_tx_power) return -EOPNOTSUPP; - wiphy_lock(&rdev->wiphy); - err = rdev_get_tx_power(rdev, wdev, &val); - wiphy_unlock(&rdev->wiphy); + scoped_guard(wiphy, &rdev->wiphy) { + err = rdev_get_tx_power(rdev, wdev, 0, &val); + } if (err) return err; @@ -1209,9 +1153,9 @@ static int cfg80211_wext_siwpower(struct net_device *dev, timeout = wrq->value / 1000; } - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + err = rdev_set_power_mgmt(rdev, dev, ps, timeout); - wiphy_unlock(&rdev->wiphy); if (err) return err; @@ -1244,8 +1188,8 @@ static int cfg80211_wext_siwrate(struct net_device *dev, struct cfg80211_bitrate_mask mask; u32 fixed, maxrate; struct ieee80211_supported_band *sband; - int band, ridx, ret; bool match = false; + int band, ridx; if (!rdev->ops->set_bitrate_mask) return -EOPNOTSUPP; @@ -1283,14 +1227,12 @@ static int cfg80211_wext_siwrate(struct net_device *dev, if (!match) return -EINVAL; - wiphy_lock(&rdev->wiphy); - if (dev->ieee80211_ptr->valid_links) - ret = -EOPNOTSUPP; - else - ret = rdev_set_bitrate_mask(rdev, dev, 0, NULL, &mask); - wiphy_unlock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); - return ret; + if (dev->ieee80211_ptr->valid_links) + return -EOPNOTSUPP; + + return rdev_set_bitrate_mask(rdev, dev, 0, NULL, &mask); } static int cfg80211_wext_giwrate(struct net_device *dev, @@ -1319,9 +1261,9 @@ static int cfg80211_wext_giwrate(struct net_device *dev, if (err) return err; - wiphy_lock(&rdev->wiphy); - err = rdev_get_station(rdev, dev, addr, &sinfo); - wiphy_unlock(&rdev->wiphy); + scoped_guard(wiphy, &rdev->wiphy) { + err = rdev_get_station(rdev, dev, addr, &sinfo); + } if (err) return err; @@ -1420,23 +1362,17 @@ static int cfg80211_wext_siwap(struct net_device *dev, struct sockaddr *ap_addr = &wrqu->ap_addr; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - int ret; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - ret = cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra); - break; + return cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra); case NL80211_IFTYPE_STATION: - ret = cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra); - break; + return cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra); default: - ret = -EOPNOTSUPP; - break; + return -EOPNOTSUPP; } - wiphy_unlock(&rdev->wiphy); - - return ret; } static int cfg80211_wext_giwap(struct net_device *dev, @@ -1446,23 +1382,17 @@ static int cfg80211_wext_giwap(struct net_device *dev, struct sockaddr *ap_addr = &wrqu->ap_addr; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - int ret; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - ret = cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra); - break; + return cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra); case NL80211_IFTYPE_STATION: - ret = cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra); - break; + return cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra); default: - ret = -EOPNOTSUPP; - break; + return -EOPNOTSUPP; } - wiphy_unlock(&rdev->wiphy); - - return ret; } static int cfg80211_wext_siwessid(struct net_device *dev, 
@@ -1472,23 +1402,17 @@ static int cfg80211_wext_siwessid(struct net_device *dev, struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - int ret; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - ret = cfg80211_ibss_wext_siwessid(dev, info, data, ssid); - break; + return cfg80211_ibss_wext_siwessid(dev, info, data, ssid); case NL80211_IFTYPE_STATION: - ret = cfg80211_mgd_wext_siwessid(dev, info, data, ssid); - break; + return cfg80211_mgd_wext_siwessid(dev, info, data, ssid); default: - ret = -EOPNOTSUPP; - break; + return -EOPNOTSUPP; } - wiphy_unlock(&rdev->wiphy); - - return ret; } static int cfg80211_wext_giwessid(struct net_device *dev, @@ -1498,26 +1422,20 @@ static int cfg80211_wext_giwessid(struct net_device *dev, struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - int ret; data->flags = 0; data->length = 0; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: - ret = cfg80211_ibss_wext_giwessid(dev, info, data, ssid); - break; + return cfg80211_ibss_wext_giwessid(dev, info, data, ssid); case NL80211_IFTYPE_STATION: - ret = cfg80211_mgd_wext_giwessid(dev, info, data, ssid); - break; + return cfg80211_mgd_wext_giwessid(dev, info, data, ssid); default: - ret = -EOPNOTSUPP; - break; + return -EOPNOTSUPP; } - wiphy_unlock(&rdev->wiphy); - - return ret; } static int cfg80211_wext_siwpmksa(struct net_device *dev, @@ -1528,7 +1446,6 @@ static int cfg80211_wext_siwpmksa(struct net_device *dev, struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_pmksa cfg_pmksa; struct iw_pmksa *pmksa = (struct iw_pmksa *)extra; - int ret; memset(&cfg_pmksa, 0, sizeof(struct cfg80211_pmksa)); @@ -1538,39 +1455,27 @@ static int cfg80211_wext_siwpmksa(struct net_device *dev, cfg_pmksa.bssid = pmksa->bssid.sa_data; cfg_pmksa.pmkid = pmksa->pmkid; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + switch (pmksa->cmd) { case IW_PMKSA_ADD: - if (!rdev->ops->set_pmksa) { - ret = -EOPNOTSUPP; - break; - } + if (!rdev->ops->set_pmksa) + return -EOPNOTSUPP; - ret = rdev_set_pmksa(rdev, dev, &cfg_pmksa); - break; + return rdev_set_pmksa(rdev, dev, &cfg_pmksa); case IW_PMKSA_REMOVE: - if (!rdev->ops->del_pmksa) { - ret = -EOPNOTSUPP; - break; - } + if (!rdev->ops->del_pmksa) + return -EOPNOTSUPP; - ret = rdev_del_pmksa(rdev, dev, &cfg_pmksa); - break; + return rdev_del_pmksa(rdev, dev, &cfg_pmksa); case IW_PMKSA_FLUSH: - if (!rdev->ops->flush_pmksa) { - ret = -EOPNOTSUPP; - break; - } + if (!rdev->ops->flush_pmksa) + return -EOPNOTSUPP; - ret = rdev_flush_pmksa(rdev, dev); - break; + return rdev_flush_pmksa(rdev, dev); default: - ret = -EOPNOTSUPP; - break; + return -EOPNOTSUPP; } - wiphy_unlock(&rdev->wiphy); - - return ret; } static const iw_handler cfg80211_handlers[] = { diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 3bb04b05c5ce..bea70eb6f034 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -640,10 +640,8 @@ EXPORT_SYMBOL(wireless_send_event); #ifdef CONFIG_CFG80211_WEXT static void wireless_warn_cfg80211_wext(void) { - char name[sizeof(current->comm)]; - pr_warn_once("warning: `%s' uses wireless extensions which will stop working for Wi-Fi 7 hardware; use nl80211\n", - get_task_comm(name, current)); + 
current->comm); } #endif diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index 8edd9ada69d0..573b6b15a446 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -302,8 +302,8 @@ int cfg80211_wext_siwgenie(struct net_device *dev, struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + int ie_len = data->length; u8 *ie = extra; - int ie_len = data->length, err; if (wdev->iftype != NL80211_IFTYPE_STATION) return -EOPNOTSUPP; @@ -311,39 +311,31 @@ int cfg80211_wext_siwgenie(struct net_device *dev, if (!ie_len) ie = NULL; - wiphy_lock(wdev->wiphy); + guard(wiphy)(wdev->wiphy); /* no change */ - err = 0; if (wdev->wext.ie_len == ie_len && memcmp(wdev->wext.ie, ie, ie_len) == 0) - goto out; + return 0; if (ie_len) { ie = kmemdup(extra, ie_len, GFP_KERNEL); - if (!ie) { - err = -ENOMEM; - goto out; - } - } else + if (!ie) + return -ENOMEM; + } else { ie = NULL; + } kfree(wdev->wext.ie); wdev->wext.ie = ie; wdev->wext.ie_len = ie_len; - if (wdev->conn) { - err = cfg80211_disconnect(rdev, dev, - WLAN_REASON_DEAUTH_LEAVING, false); - if (err) - goto out; - } + if (wdev->conn) + return cfg80211_disconnect(rdev, dev, + WLAN_REASON_DEAUTH_LEAVING, false); /* userspace better not think we'll reconnect */ - err = 0; - out: - wiphy_unlock(wdev->wiphy); - return err; + return 0; } int cfg80211_wext_siwmlme(struct net_device *dev, @@ -353,7 +345,6 @@ int cfg80211_wext_siwmlme(struct net_device *dev, struct wireless_dev *wdev = dev->ieee80211_ptr; struct iw_mlme *mlme = (struct iw_mlme *)extra; struct cfg80211_registered_device *rdev; - int err; if (!wdev) return -EOPNOTSUPP; @@ -366,17 +357,13 @@ int cfg80211_wext_siwmlme(struct net_device *dev, if (mlme->addr.sa_family != ARPHRD_ETHER) return -EINVAL; - wiphy_lock(&rdev->wiphy); + guard(wiphy)(&rdev->wiphy); + switch (mlme->cmd) { case IW_MLME_DEAUTH: case IW_MLME_DISASSOC: - err = cfg80211_disconnect(rdev, dev, mlme->reason_code, true); - break; + return cfg80211_disconnect(rdev, dev, mlme->reason_code, true); default: - err = -EOPNOTSUPP; - break; + return -EOPNOTSUPP; } - wiphy_unlock(&rdev->wiphy); - - return err; } diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c index 5460b9146dd8..37b190499405 100644 --- a/net/x25/x25_link.c +++ b/net/x25/x25_link.c @@ -55,7 +55,7 @@ static void x25_t20timer_expiry(struct timer_list *t) static inline void x25_stop_t20timer(struct x25_neigh *nb) { - del_timer(&nb->t20timer); + timer_delete(&nb->t20timer); } /* diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c index 9376365cdcc9..e4c5ad5b070f 100644 --- a/net/x25/x25_timer.c +++ b/net/x25/x25_timer.c @@ -41,7 +41,7 @@ void x25_start_heartbeat(struct sock *sk) void x25_stop_heartbeat(struct sock *sk) { - del_timer(&sk->sk_timer); + timer_delete(&sk->sk_timer); } void x25_start_t2timer(struct sock *sk) @@ -74,7 +74,7 @@ void x25_start_t23timer(struct sock *sk) void x25_stop_timer(struct sock *sk) { - del_timer(&x25_sk(sk)->timer); + timer_delete(&x25_sk(sk)->timer); } unsigned long x25_display_timer(struct sock *sk) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 89d2bef96469..72c000c0ae5f 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -25,6 +25,7 @@ #include <linux/vmalloc.h> #include <net/xdp_sock_drv.h> #include <net/busy_poll.h> +#include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/xdp.h> @@ -337,13 +338,14 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) u32 len = 
xdp_get_buff_len(xdp); int err; - spin_lock_bh(&xs->rx_lock); err = xsk_rcv_check(xs, xdp, len); if (!err) { + spin_lock_bh(&xs->pool->rx_lock); err = __xsk_rcv(xs, xdp, len); xsk_flush(xs); + spin_unlock_bh(&xs->pool->rx_lock); } - spin_unlock_bh(&xs->rx_lock); + return err; } @@ -742,6 +744,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, goto free_err; } } + + if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) + skb->skb_mstamp_ns = meta->request.launch_time; } } @@ -802,8 +807,11 @@ static int __xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ - if (xsk_cq_reserve_addr_locked(xs->pool, desc.addr)) + err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr); + if (err) { + err = -EAGAIN; goto out; + } skb = xsk_build_skb(xs, &desc); if (IS_ERR(skb)) { @@ -875,7 +883,7 @@ static bool xsk_no_wakeup(struct sock *sk) #ifdef CONFIG_NET_RX_BUSY_POLL /* Prefer busy-polling, skip the wakeup. */ return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && - READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID; + napi_id_valid(READ_ONCE(sk->sk_napi_id)); #else return false; #endif @@ -1178,6 +1186,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_release; } + netdev_lock_ops(dev); + if (!xs->rx && !xs->tx) { err = -EINVAL; goto out_unlock; @@ -1294,7 +1304,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xs->queue_id = qid; xp_add_xsk(xs->pool, xs); - if (xs->zc && qid < dev->real_num_rx_queues) { + if (qid < dev->real_num_rx_queues) { struct netdev_rx_queue *rxq; rxq = __netif_get_rx_queue(dev, qid); @@ -1312,6 +1322,7 @@ out_unlock: smp_wmb(); WRITE_ONCE(xs->state, XSK_BOUND); } + netdev_unlock_ops(dev); out_release: mutex_unlock(&xs->mutex); rtnl_unlock(); @@ -1724,7 +1735,6 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, xs = xdp_sk(sk); xs->state = XSK_READY; mutex_init(&xs->mutex); - spin_lock_init(&xs->rx_lock); INIT_LIST_HEAD(&xs->map_list); spin_lock_init(&xs->map_list_lock); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 1f7975b49657..aa9788f20d0d 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/netdevice.h> +#include <net/netdev_lock.h> #include <net/xsk_buff_pool.h> #include <net/xdp_sock.h> #include <net/xdp_sock_drv.h> @@ -87,6 +89,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, pool->addrs = umem->addrs; pool->tx_metadata_len = umem->tx_metadata_len; pool->tx_sw_csum = umem->flags & XDP_UMEM_TX_SW_CSUM; + spin_lock_init(&pool->rx_lock); INIT_LIST_HEAD(&pool->free_list); INIT_LIST_HEAD(&pool->xskb_list); INIT_LIST_HEAD(&pool->xsk_tx_list); @@ -105,7 +108,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, if (pool->unaligned) pool->free_heads[i] = xskb; else - xp_init_xskb_addr(xskb, pool, i * pool->chunk_size); + xp_init_xskb_addr(xskb, pool, (u64)i * pool->chunk_size); } return pool; @@ -219,6 +222,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool, bpf.xsk.pool = pool; bpf.xsk.queue_id = queue_id; + netdev_ops_assert_locked(netdev); err = netdev->netdev_ops->ndo_bpf(netdev, &bpf); if (err) goto err_unreg_pool; @@ -263,13 +267,17 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, void xp_clear_dev(struct xsk_buff_pool *pool) { + struct net_device *netdev = pool->netdev; + if (!pool->netdev) return; + 
netdev_lock_ops(netdev); xp_disable_drv_zc(pool); xsk_clear_pool_at_qid(pool->netdev, pool->queue_id); - dev_put(pool->netdev); pool->netdev = NULL; + netdev_unlock_ops(netdev); + dev_put(netdev); } static void xp_release_deferred(struct work_struct *work) @@ -699,18 +707,56 @@ void xp_free(struct xdp_buff_xsk *xskb) } EXPORT_SYMBOL(xp_free); -void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +static u64 __xp_raw_get_addr(const struct xsk_buff_pool *pool, u64 addr) +{ + return pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; +} + +static void *__xp_raw_get_data(const struct xsk_buff_pool *pool, u64 addr) { - addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; return pool->addrs + addr; } + +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +{ + return __xp_raw_get_data(pool, __xp_raw_get_addr(pool, addr)); +} EXPORT_SYMBOL(xp_raw_get_data); -dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +static dma_addr_t __xp_raw_get_dma(const struct xsk_buff_pool *pool, u64 addr) { - addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; return (pool->dma_pages[addr >> PAGE_SHIFT] & ~XSK_NEXT_PG_CONTIG_MASK) + (addr & ~PAGE_MASK); } + +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +{ + return __xp_raw_get_dma(pool, __xp_raw_get_addr(pool, addr)); +} EXPORT_SYMBOL(xp_raw_get_dma); + +/** + * xp_raw_get_ctx - get &xdp_desc context + * @pool: XSk buff pool desc address belongs to + * @addr: desc address (from userspace) + * + * Helper for getting desc's DMA address and metadata pointer, if present. + * Saves one call on hotpath, double calculation of the actual address, + * and inline checks for metadata presence and sanity. + * + * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata + * pointer, if it is present and valid (initialized to %NULL otherwise). + */ +struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) +{ + struct xdp_desc_ctx ret; + + addr = __xp_raw_get_addr(pool, addr); + + ret.dma = __xp_raw_get_dma(pool, addr); + ret.meta = __xsk_buff_get_metadata(pool, __xp_raw_get_data(pool, addr)); + + return ret; +} +EXPORT_SYMBOL(xp_raw_get_ctx); diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index d7b16f2c23e9..f0157702718f 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -135,6 +135,22 @@ config NET_KEY_MIGRATE If unsure, say N. +config XFRM_IPTFS + tristate "IPsec IP-TFS/AGGFRAG (RFC 9347) encapsulation support" + depends on XFRM + help + Information on the IP-TFS/AGGFRAG encapsulation can be found + in RFC 9347. This feature supports demand driven (i.e., + non-constant send rate) IP-TFS to take advantage of the + AGGFRAG ESP payload encapsulation. This payload type + supports aggregation and fragmentation of the inner IP + packet stream which in turn yields higher small-packet + bandwidth as well as reducing MTU/PMTU issues. Congestion + control is unimplemented as the send rate is demand driven + rather than constant. + + If unsure, say N.
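A quick aside on using the new option: it is a plain tristate, so a kernel .config fragment along the following lines builds IP-TFS as a module (the neighboring CONFIG_XFRM/CONFIG_INET_ESP settings are assumptions for a typical IPsec build, not part of this hunk):

CONFIG_XFRM=y
CONFIG_INET_ESP=m
CONFIG_XFRM_IPTFS=m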
+ config XFRM_ESPINTCP bool diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 512e0b2f8514..5a1787587cb3 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -21,5 +21,6 @@ obj-$(CONFIG_XFRM_USER) += xfrm_user.o obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o +obj-$(CONFIG_XFRM_IPTFS) += xfrm_iptfs.o obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o obj-$(CONFIG_DEBUG_INFO_BTF) += xfrm_state_bpf.o diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index fe82e2d07300..fc7a603b04f1 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -171,8 +171,10 @@ int espintcp_queue_out(struct sock *sk, struct sk_buff *skb) struct espintcp_ctx *ctx = espintcp_getctx(sk); if (skb_queue_len(&ctx->out_queue) >= - READ_ONCE(net_hotdata.max_backlog)) + READ_ONCE(net_hotdata.max_backlog)) { + kfree_skb(skb); return -ENOBUFS; + } __skb_queue_tail(&ctx->out_queue, skb); diff --git a/net/xfrm/trace_iptfs.h b/net/xfrm/trace_iptfs.h new file mode 100644 index 000000000000..74391ba24445 --- /dev/null +++ b/net/xfrm/trace_iptfs.h @@ -0,0 +1,218 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* xfrm_trace_iptfs.h + * + * August 12 2023, Christian Hopps <chopps@labn.net> + * + * Copyright (c) 2023, LabN Consulting, L.L.C. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM iptfs + +#if !defined(_TRACE_IPTFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_IPTFS_H + +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/tracepoint.h> +#include <net/ip.h> + +struct xfrm_iptfs_data; + +TRACE_EVENT(iptfs_egress_recv, + TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u16 blkoff), + TP_ARGS(skb, xtfs, blkoff), + TP_STRUCT__entry(__field(struct sk_buff *, skb) + __field(void *, head) + __field(void *, head_pg_addr) + __field(void *, pg0addr) + __field(u32, skb_len) + __field(u32, data_len) + __field(u32, headroom) + __field(u32, tailroom) + __field(u32, tail) + __field(u32, end) + __field(u32, pg0off) + __field(u8, head_frag) + __field(u8, frag_list) + __field(u8, nr_frags) + __field(u16, blkoff)), + TP_fast_assign(__entry->skb = skb; + __entry->head = skb->head; + __entry->skb_len = skb->len; + __entry->data_len = skb->data_len; + __entry->headroom = skb_headroom(skb); + __entry->tailroom = skb_tailroom(skb); + __entry->tail = (u32)skb->tail; + __entry->end = (u32)skb->end; + __entry->head_frag = skb->head_frag; + __entry->frag_list = (bool)skb_shinfo(skb)->frag_list; + __entry->nr_frags = skb_shinfo(skb)->nr_frags; + __entry->blkoff = blkoff; + __entry->head_pg_addr = page_address(virt_to_head_page(skb->head)); + __entry->pg0addr = (__entry->nr_frags + ? page_address(netmem_to_page(skb_shinfo(skb)->frags[0].netmem)) + : NULL); + __entry->pg0off = (__entry->nr_frags + ? 
skb_shinfo(skb)->frags[0].offset + : 0); + ), + TP_printk("EGRESS: skb=%p len=%u data_len=%u headroom=%u head_frag=%u frag_list=%u nr_frags=%u blkoff=%u\n\t\ttailroom=%u tail=%u end=%u head=%p hdpgaddr=%p pg0->addr=%p pg0->data=%p pg0->off=%u", + __entry->skb, __entry->skb_len, __entry->data_len, __entry->headroom, + __entry->head_frag, __entry->frag_list, __entry->nr_frags, __entry->blkoff, + __entry->tailroom, __entry->tail, __entry->end, __entry->head, + __entry->head_pg_addr, __entry->pg0addr, __entry->pg0addr + __entry->pg0off, + __entry->pg0off) + ) + +DECLARE_EVENT_CLASS(iptfs_ingress_preq_event, + TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, + u32 pmtu, u8 was_gso), + TP_ARGS(skb, xtfs, pmtu, was_gso), + TP_STRUCT__entry(__field(struct sk_buff *, skb) + __field(u32, skb_len) + __field(u32, data_len) + __field(u32, pmtu) + __field(u32, queue_size) + __field(u32, proto_seq) + __field(u8, proto) + __field(u8, was_gso) + ), + TP_fast_assign(__entry->skb = skb; + __entry->skb_len = skb->len; + __entry->data_len = skb->data_len; + __entry->queue_size = + xtfs->cfg.max_queue_size - xtfs->queue_size; + __entry->proto = __trace_ip_proto(ip_hdr(skb)); + __entry->proto_seq = __trace_ip_proto_seq(ip_hdr(skb)); + __entry->pmtu = pmtu; + __entry->was_gso = was_gso; + ), + TP_printk("INGRPREQ: skb=%p len=%u data_len=%u qsize=%u proto=%u proto_seq=%u pmtu=%u was_gso=%u", + __entry->skb, __entry->skb_len, __entry->data_len, + __entry->queue_size, __entry->proto, __entry->proto_seq, + __entry->pmtu, __entry->was_gso)); + +DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_enqueue, + TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), + TP_ARGS(skb, xtfs, pmtu, was_gso)); + +DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_no_queue_space, + TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), + TP_ARGS(skb, xtfs, pmtu, was_gso)); + +DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_too_big, + TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso), + TP_ARGS(skb, xtfs, pmtu, was_gso)); + +DECLARE_EVENT_CLASS(iptfs_ingress_postq_event, + TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, struct iphdr *iph), + TP_ARGS(skb, mtu, blkoff, iph), + TP_STRUCT__entry(__field(struct sk_buff *, skb) + __field(u32, skb_len) + __field(u32, data_len) + __field(u32, mtu) + __field(u32, proto_seq) + __field(u16, blkoff) + __field(u8, proto)), + TP_fast_assign(__entry->skb = skb; + __entry->skb_len = skb->len; + __entry->data_len = skb->data_len; + __entry->mtu = mtu; + __entry->blkoff = blkoff; + __entry->proto = iph ? __trace_ip_proto(iph) : 0; + __entry->proto_seq = iph ? 
__trace_ip_proto_seq(iph) : 0; + ), + TP_printk("INGRPSTQ: skb=%p len=%u data_len=%u mtu=%u blkoff=%u proto=%u proto_seq=%u", + __entry->skb, __entry->skb_len, __entry->data_len, __entry->mtu, + __entry->blkoff, __entry->proto, __entry->proto_seq)); + +DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_dequeue, + TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, + struct iphdr *iph), + TP_ARGS(skb, mtu, blkoff, iph)); + +DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_fragmenting, + TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, + struct iphdr *iph), + TP_ARGS(skb, mtu, blkoff, iph)); + +DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_final_fragment, + TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, + struct iphdr *iph), + TP_ARGS(skb, mtu, blkoff, iph)); + +DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_toobig, + TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, + struct iphdr *iph), + TP_ARGS(skb, mtu, blkoff, iph)); + +TRACE_EVENT(iptfs_ingress_nth_peek, + TP_PROTO(struct sk_buff *skb, u32 remaining), + TP_ARGS(skb, remaining), + TP_STRUCT__entry(__field(struct sk_buff *, skb) + __field(u32, skb_len) + __field(u32, remaining)), + TP_fast_assign(__entry->skb = skb; + __entry->skb_len = skb->len; + __entry->remaining = remaining; + ), + TP_printk("INGRPSTQ: NTHPEEK: skb=%p len=%u remaining=%u", + __entry->skb, __entry->skb_len, __entry->remaining)); + +TRACE_EVENT(iptfs_ingress_nth_add, TP_PROTO(struct sk_buff *skb, u8 share_ok), + TP_ARGS(skb, share_ok), + TP_STRUCT__entry(__field(struct sk_buff *, skb) + __field(u32, skb_len) + __field(u32, data_len) + __field(u8, share_ok) + __field(u8, head_frag) + __field(u8, pp_recycle) + __field(u8, cloned) + __field(u8, shared) + __field(u8, nr_frags) + __field(u8, frag_list) + ), + TP_fast_assign(__entry->skb = skb; + __entry->skb_len = skb->len; + __entry->data_len = skb->data_len; + __entry->share_ok = share_ok; + __entry->head_frag = skb->head_frag; + __entry->pp_recycle = skb->pp_recycle; + __entry->cloned = skb_cloned(skb); + __entry->shared = skb_shared(skb); + __entry->nr_frags = skb_shinfo(skb)->nr_frags; + __entry->frag_list = (bool)skb_shinfo(skb)->frag_list; + ), + TP_printk("INGRPSTQ: NTHADD: skb=%p len=%u data_len=%u share_ok=%u head_frag=%u pp_recycle=%u cloned=%u shared=%u nr_frags=%u frag_list=%u", + __entry->skb, __entry->skb_len, __entry->data_len, __entry->share_ok, + __entry->head_frag, __entry->pp_recycle, __entry->cloned, __entry->shared, + __entry->nr_frags, __entry->frag_list)); + +DECLARE_EVENT_CLASS(iptfs_timer_event, + TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val), + TP_ARGS(xtfs, time_val), + TP_STRUCT__entry(__field(u64, time_val) + __field(u64, set_time)), + TP_fast_assign(__entry->time_val = time_val; + __entry->set_time = xtfs->iptfs_settime; + ), + TP_printk("TIMER: set_time=%llu time_val=%llu", + __entry->set_time, __entry->time_val)); + +DEFINE_EVENT(iptfs_timer_event, iptfs_timer_start, + TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val), + TP_ARGS(xtfs, time_val)); + +DEFINE_EVENT(iptfs_timer_event, iptfs_timer_expire, + TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val), + TP_ARGS(xtfs, time_val)); + +#endif /* _TRACE_IPTFS_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../net/xfrm +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace_iptfs +#include <trace/define_trace.h> diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index e6da7e8495c9..749011e031c0 100644 --- a/net/xfrm/xfrm_algo.c +++ 
b/net/xfrm/xfrm_algo.c @@ -5,13 +5,13 @@ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> */ +#include <crypto/acompress.h> #include <crypto/aead.h> #include <crypto/hash.h> #include <crypto/skcipher.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/pfkeyv2.h> -#include <linux/crypto.h> #include <linux/scatterlist.h> #include <net/xfrm.h> #if IS_ENABLED(CONFIG_INET_ESP) || IS_ENABLED(CONFIG_INET6_ESP) @@ -669,7 +669,7 @@ static const struct xfrm_algo_list xfrm_ealg_list = { }; static const struct xfrm_algo_list xfrm_calg_list = { - .find = crypto_has_comp, + .find = crypto_has_acomp, .algs = calg_list, .entries = ARRAY_SIZE(calg_list), }; @@ -828,8 +828,7 @@ void xfrm_probe_algs(void) } for (i = 0; i < calg_entries(); i++) { - status = crypto_has_comp(calg_list[i].name, 0, - CRYPTO_ALG_ASYNC); + status = crypto_has_acomp(calg_list[i].name, 0, 0); if (calg_list[i].available != status) calg_list[i].available = status; } diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c index 5b9ee63e30b6..b8d2e6930041 100644 --- a/net/xfrm/xfrm_compat.c +++ b/net/xfrm/xfrm_compat.c @@ -284,9 +284,15 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src) case XFRMA_SA_DIR: case XFRMA_NAT_KEEPALIVE_INTERVAL: case XFRMA_SA_PCPU: + case XFRMA_IPTFS_DROP_TIME: + case XFRMA_IPTFS_REORDER_WINDOW: + case XFRMA_IPTFS_DONT_FRAG: + case XFRMA_IPTFS_INIT_DELAY: + case XFRMA_IPTFS_MAX_QSIZE: + case XFRMA_IPTFS_PKT_SIZE: return xfrm_nla_cpy(dst, src, nla_len(src)); default: - BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU); + BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_PKT_SIZE); pr_warn_once("unsupported nla_type %d\n", src->nla_type); return -EOPNOTSUPP; } @@ -441,7 +447,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla, int err; if (type > XFRMA_MAX) { - BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU); + BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_PKT_SIZE); NL_SET_ERR_MSG(extack, "Bad attribute"); return -EOPNOTSUPP; } diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index b33c4591e09a..81fd486b5e56 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -42,7 +42,8 @@ static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb, skb->transport_header = skb->network_header + hsize; skb_reset_mac_len(skb); - pskb_pull(skb, skb->mac_len + x->props.header_len); + pskb_pull(skb, + skb->mac_len + x->props.header_len - x->props.enc_hdr_len); } static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb, @@ -68,6 +69,7 @@ static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb, static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb) { switch (x->outer_mode.encap) { + case XFRM_MODE_IPTFS: case XFRM_MODE_TUNNEL: if (x->outer_mode.family == AF_INET) return __xfrm_mode_tunnel_prep(x, skb, @@ -143,10 +145,6 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur return NULL; } - /* This skb was already validated on the upper/virtual dev */ - if ((x->xso.dev != dev) && (x->xso.real_dev == dev)) - return skb; - local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); err = !skb_queue_empty(&sd->xfrm_backlog); @@ -157,8 +155,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur return skb; } - if (skb_is_gso(skb) && (unlikely(x->xso.dev != dev) || - unlikely(xmit_xfrm_check_overflow(skb)))) { + if (skb_is_gso(skb) && unlikely(xmit_xfrm_check_overflow(skb))) { struct sk_buff *segs; /* Packet got rerouted, fixup 
features and segment it. */ @@ -242,11 +239,6 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, xfrm_address_t *daddr; bool is_packet_offload; - if (!x->type_offload) { - NL_SET_ERR_MSG(extack, "Type doesn't support offload"); - return -EINVAL; - } - if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND | XFRM_OFFLOAD_PACKET)) { NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); @@ -259,6 +251,11 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return -EINVAL; } + if (xuo->flags & XFRM_OFFLOAD_INBOUND && x->if_id) { + NL_SET_ERR_MSG(extack, "XFRM if_id is not supported in RX path"); + return -EINVAL; + } + is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET; /* We don't yet support TFC padding. */ @@ -308,9 +305,15 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return -EINVAL; } + xfrm_set_type_offload(x); + if (!x->type_offload) { + NL_SET_ERR_MSG(extack, "Type doesn't support offload"); + dev_put(dev); + return -EINVAL; + } + xso->dev = dev; netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC); - xso->real_dev = dev; if (xuo->flags & XFRM_OFFLOAD_INBOUND) xso->dir = XFRM_DEV_OFFLOAD_IN; @@ -322,14 +325,14 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, else xso->type = XFRM_DEV_OFFLOAD_CRYPTO; - err = dev->xfrmdev_ops->xdo_dev_state_add(x, extack); + err = dev->xfrmdev_ops->xdo_dev_state_add(dev, x, extack); if (err) { xso->dev = NULL; xso->dir = 0; - xso->real_dev = NULL; netdev_put(dev, &xso->dev_tracker); xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; + xfrm_unset_type_offload(x); /* User explicitly requested packet offload mode and configured * policy in addition to the XFRM state. So be civil to users, * and return an error instead of taking fallback path. @@ -373,7 +376,6 @@ int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, xdo->dev = dev; netdev_tracker_alloc(dev, &xdo->dev_tracker, GFP_ATOMIC); - xdo->real_dev = dev; xdo->type = XFRM_DEV_OFFLOAD_PACKET; switch (dir) { case XFRM_POLICY_IN: @@ -395,7 +397,6 @@ int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, err = dev->xfrmdev_ops->xdo_dev_policy_add(xp, extack); if (err) { xdo->dev = NULL; - xdo->real_dev = NULL; xdo->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; xdo->dir = 0; netdev_put(dev, &xdo->dev_tracker); @@ -413,14 +414,12 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) struct dst_entry *dst = skb_dst(skb); struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct net_device *dev = x->xso.dev; + bool check_tunnel_size; - if (!x->type_offload || - (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED && x->encap)) + if (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED) return false; - if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET || - ((!dev || (dev == xfrm_dst_path(dst)->dev)) && - !xdst->child->xfrm)) { + if ((dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) { mtu = xfrm_state_mtu(x, xdst->child_mtu_cached); if (skb->len <= mtu) goto ok; @@ -432,8 +431,29 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) return false; ok: - if (dev && dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_offload_ok) - return x->xso.dev->xfrmdev_ops->xdo_dev_offload_ok(skb, x); + check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET && + x->props.mode == XFRM_MODE_TUNNEL; + switch (x->props.family) { + case AF_INET: + /* Check for IPv4 options */ + if (ip_hdr(skb)->ihl != 5) + return false; + if (check_tunnel_size && xfrm4_tunnel_check_size(skb)) + return false; + break; + case AF_INET6: + /* Check for IPv6 
extensions */ + if (ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr)) + return false; + if (check_tunnel_size && xfrm6_tunnel_check_size(skb)) + return false; + break; + default: + break; + } + + if (dev->xfrmdev_ops->xdo_dev_offload_ok) + return dev->xfrmdev_ops->xdo_dev_offload_ok(skb, x); return true; } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 841a60a6fbfe..7e6a71b9d6a3 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -48,7 +48,7 @@ static DEFINE_SPINLOCK(xfrm_input_afinfo_lock); static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[2][AF_INET6 + 1]; static struct gro_cells gro_cells; -static struct net_device xfrm_napi_dev; +static struct net_device *xfrm_napi_dev; static DEFINE_PER_CPU(struct xfrm_trans_tasklet, xfrm_trans_tasklet); @@ -446,6 +446,9 @@ static int xfrm_inner_mode_input(struct xfrm_state *x, WARN_ON_ONCE(1); break; default: + if (x->mode_cbs && x->mode_cbs->input) + return x->mode_cbs->input(x, skb); + WARN_ON_ONCE(1); break; } @@ -453,6 +456,10 @@ static int xfrm_inner_mode_input(struct xfrm_state *x, return -EOPNOTSUPP; } +/* NOTE: encap_type - In addition to the normal (non-negative) values for + * encap_type, a negative value of -1 or -2 can be used to resume/restart this + * function after a previous invocation early terminated for async operation. + */ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { const struct xfrm_state_afinfo *afinfo; @@ -489,6 +496,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) family = x->props.family; + /* An encap_type of -2 indicates reconstructed inner packet */ + if (encap_type == -2) + goto resume_decapped; + /* An encap_type of -1 indicates async resumption. */ if (encap_type == -1) { async = 1; @@ -679,11 +690,14 @@ resume: XFRM_MODE_SKB_CB(skb)->protocol = nexthdr; - if (xfrm_inner_mode_input(x, skb)) { + err = xfrm_inner_mode_input(x, skb); + if (err == -EINPROGRESS) + return 0; + else if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); goto drop; } - +resume_decapped: if (x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL) { decaps = 1; break; @@ -811,8 +825,11 @@ void __init xfrm_input_init(void) int err; int i; - init_dummy_netdev(&xfrm_napi_dev); - err = gro_cells_init(&gro_cells, &xfrm_napi_dev); + xfrm_napi_dev = alloc_netdev_dummy(0); + if (!xfrm_napi_dev) + panic("Failed to allocate XFRM dummy netdev\n"); + + err = gro_cells_init(&gro_cells, xfrm_napi_dev); if (err) gro_cells.cells = NULL; diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 98f1e2b67c76..cb1e12740c87 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -242,10 +242,9 @@ static void xfrmi_dev_free(struct net_device *dev) gro_cells_destroy(&xi->gro_cells); } -static int xfrmi_create(struct net_device *dev) +static int xfrmi_create(struct net *net, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); - struct net *net = dev_net(dev); struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; @@ -506,7 +505,7 @@ xmit: skb_dst_set(skb, dst); skb->dev = tdev; - err = dst_output(xi->net, skb->sk, skb); + err = dst_output(xi->net, skb_to_full_sk(skb), skb); if (net_xmit_eval(err) == 0) { dev_sw_netstats_tx_add(dev, 1, length); } else { @@ -814,15 +813,17 @@ static void xfrmi_netlink_parms(struct nlattr *data[], parms->collect_md = true; } -static int xfrmi_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct 
netlink_ext_ack *extack) +static int xfrmi_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, + struct netlink_ext_ack *extack) { - struct net *net = dev_net(dev); + struct nlattr **data = params->data; struct xfrm_if_parms p = {}; struct xfrm_if *xi; + struct net *net; int err; + net = params->link_net ? : dev_net(dev); xfrmi_netlink_parms(data, &p); if (p.collect_md) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); @@ -851,7 +852,7 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev, xi->net = net; xi->dev = dev; - err = xfrmi_create(dev); + err = xfrmi_create(net, dev); return err; } @@ -951,32 +952,28 @@ static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .get_link_net = xfrmi_get_link_net, }; -static void __net_exit xfrmi_exit_batch_rtnl(struct list_head *net_exit_list, - struct list_head *dev_to_kill) +static void __net_exit xfrmi_exit_rtnl(struct net *net, + struct list_head *dev_to_kill) { - struct net *net; + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); + struct xfrm_if __rcu **xip; + struct xfrm_if *xi; + int i; - ASSERT_RTNL(); - list_for_each_entry(net, net_exit_list, exit_list) { - struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - struct xfrm_if __rcu **xip; - struct xfrm_if *xi; - int i; - - for (i = 0; i < XFRMI_HASH_SIZE; i++) { - for (xip = &xfrmn->xfrmi[i]; - (xi = rtnl_dereference(*xip)) != NULL; - xip = &xi->next) - unregister_netdevice_queue(xi->dev, dev_to_kill); - } - xi = rtnl_dereference(xfrmn->collect_md_xfrmi); - if (xi) + for (i = 0; i < XFRMI_HASH_SIZE; i++) { + for (xip = &xfrmn->xfrmi[i]; + (xi = rtnl_net_dereference(net, *xip)) != NULL; + xip = &xi->next) unregister_netdevice_queue(xi->dev, dev_to_kill); } + + xi = rtnl_net_dereference(net, xfrmn->collect_md_xfrmi); + if (xi) + unregister_netdevice_queue(xi->dev, dev_to_kill); } static struct pernet_operations xfrmi_net_ops = { - .exit_batch_rtnl = xfrmi_exit_batch_rtnl, + .exit_rtnl = xfrmi_exit_rtnl, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index 9c0fa0e1786a..907c3ccb440d 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -3,7 +3,7 @@ * IP Payload Compression Protocol (IPComp) - RFC3173. * * Copyright (c) 2003 James Morris <jmorris@intercode.com.au> - * Copyright (c) 2003-2008 Herbert Xu <herbert@gondor.apana.org.au> + * Copyright (c) 2003-2025 Herbert Xu <herbert@gondor.apana.org.au> * * Todo: * - Tunable compression parameters. @@ -11,303 +11,301 @@ * - Adaptive compression. 
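The ipcomp rewrite below replaces the per-CPU crypto_comp tfms and scratch buffers with the request-based crypto_acomp API. A minimal sketch of that request lifecycle follows (illustrative names, error paths trimmed; the real code in the hunks below builds its scatterlists over skb frags instead of flat buffers):

static void example_done(void *data, int err)
{
	/* Async completion; the real code resumes skb processing here
	 * (see ipcomp_input_done()/ipcomp_output_done() below).
	 */
}

static int example_compress(struct crypto_acomp *tfm, u8 *src,
			    unsigned int slen, u8 *dst, unsigned int dlen)
{
	struct scatterlist ssg, dsg;
	struct acomp_req *req;
	int err;

	req = acomp_request_alloc(tfm);	/* tfm from crypto_alloc_acomp() */
	if (!req)
		return -ENOMEM;

	sg_init_one(&ssg, src, slen);
	sg_init_one(&dsg, dst, dlen);
	acomp_request_set_params(req, &ssg, &dsg, slen, dlen);
	acomp_request_set_callback(req, 0, example_done, NULL);

	err = crypto_acomp_compress(req);
	if (err == -EINPROGRESS)	/* completes in example_done() */
		return err;

	acomp_request_free(req);
	return err;
}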
*/ -#include <linux/crypto.h> +#include <crypto/acompress.h> #include <linux/err.h> -#include <linux/list.h> #include <linux/module.h> -#include <linux/mutex.h> -#include <linux/percpu.h> +#include <linux/skbuff_ref.h> #include <linux/slab.h> -#include <linux/smp.h> -#include <linux/vmalloc.h> -#include <net/ip.h> #include <net/ipcomp.h> #include <net/xfrm.h> -struct ipcomp_tfms { - struct list_head list; - struct crypto_comp * __percpu *tfms; - int users; +#define IPCOMP_SCRATCH_SIZE 65400 + +struct ipcomp_skb_cb { + struct xfrm_skb_cb xfrm; + struct acomp_req *req; }; -static DEFINE_MUTEX(ipcomp_resource_mutex); -static void * __percpu *ipcomp_scratches; -static int ipcomp_scratch_users; -static LIST_HEAD(ipcomp_tfms_list); +struct ipcomp_data { + u16 threshold; + struct crypto_acomp *tfm; +}; -static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) +struct ipcomp_req_extra { + struct xfrm_state *x; + struct scatterlist sg[]; +}; + +static inline struct ipcomp_skb_cb *ipcomp_cb(struct sk_buff *skb) { - struct ipcomp_data *ipcd = x->data; - const int plen = skb->len; - int dlen = IPCOMP_SCRATCH_SIZE; - const u8 *start = skb->data; - u8 *scratch = *this_cpu_ptr(ipcomp_scratches); - struct crypto_comp *tfm = *this_cpu_ptr(ipcd->tfms); - int err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen); - int len; + struct ipcomp_skb_cb *cb = (void *)skb->cb; - if (err) - return err; + BUILD_BUG_ON(sizeof(*cb) > sizeof(skb->cb)); + return cb; +} - if (dlen < (plen + sizeof(struct ip_comp_hdr))) - return -EINVAL; +static int ipcomp_post_acomp(struct sk_buff *skb, int err, int hlen) +{ + struct acomp_req *req = ipcomp_cb(skb)->req; + struct ipcomp_req_extra *extra; + struct scatterlist *dsg; + int len, dlen; - len = dlen - plen; - if (len > skb_tailroom(skb)) - len = skb_tailroom(skb); + if (unlikely(err)) + goto out_free_req; - __skb_put(skb, len); + extra = acomp_request_extra(req); + dsg = extra->sg; + dlen = req->dlen; - len += plen; - skb_copy_to_linear_data(skb, scratch, len); + pskb_trim_unique(skb, 0); + __skb_put(skb, hlen); - while ((scratch += len, dlen -= len) > 0) { + /* Only update truesize on input. 
*/ + if (!hlen) + skb->truesize += dlen; + skb->data_len = dlen; + skb->len += dlen; + + do { skb_frag_t *frag; struct page *page; - if (WARN_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) - return -EMSGSIZE; - frag = skb_shinfo(skb)->frags + skb_shinfo(skb)->nr_frags; - page = alloc_page(GFP_ATOMIC); - - if (!page) - return -ENOMEM; + page = sg_page(dsg); + dsg = sg_next(dsg); len = PAGE_SIZE; if (dlen < len) len = dlen; skb_frag_fill_page_desc(frag, page, 0, len); - memcpy(skb_frag_address(frag), scratch, len); - - skb->truesize += len; - skb->data_len += len; - skb->len += len; skb_shinfo(skb)->nr_frags++; - } + } while ((dlen -= len)); - return 0; + for (; dsg; dsg = sg_next(dsg)) + __free_page(sg_page(dsg)); + +out_free_req: + acomp_request_free(req); + return err; } -int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) +static int ipcomp_input_done2(struct sk_buff *skb, int err) { - int nexthdr; - int err = -ENOMEM; - struct ip_comp_hdr *ipch; - - if (skb_linearize_cow(skb)) - goto out; - - skb->ip_summed = CHECKSUM_NONE; + struct ip_comp_hdr *ipch = ip_comp_hdr(skb); + const int plen = skb->len; - /* Remove ipcomp header and decompress original payload */ - ipch = (void *)skb->data; - nexthdr = ipch->nexthdr; + skb_reset_transport_header(skb); - skb->transport_header = skb->network_header + sizeof(*ipch); - __skb_pull(skb, sizeof(*ipch)); - err = ipcomp_decompress(x, skb); - if (err) - goto out; + return ipcomp_post_acomp(skb, err, 0) ?: + skb->len < (plen + sizeof(struct ip_comp_hdr)) ? -EINVAL : + ipch->nexthdr; +} - err = nexthdr; +static void ipcomp_input_done(void *data, int err) +{ + struct sk_buff *skb = data; -out: - return err; + xfrm_input_resume(skb, ipcomp_input_done2(skb, err)); } -EXPORT_SYMBOL_GPL(ipcomp_input); -static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) +static struct acomp_req *ipcomp_setup_req(struct xfrm_state *x, + struct sk_buff *skb, int minhead, + int dlen) { + const int dnfrags = min(MAX_SKB_FRAGS, 16); struct ipcomp_data *ipcd = x->data; + struct ipcomp_req_extra *extra; + struct scatterlist *sg, *dsg; const int plen = skb->len; - int dlen = IPCOMP_SCRATCH_SIZE; - u8 *start = skb->data; - struct crypto_comp *tfm; - u8 *scratch; + struct crypto_acomp *tfm; + struct acomp_req *req; + int nfrags; + int total; int err; + int i; - local_bh_disable(); - scratch = *this_cpu_ptr(ipcomp_scratches); - tfm = *this_cpu_ptr(ipcd->tfms); - err = crypto_comp_compress(tfm, start, plen, scratch, &dlen); - if (err) - goto out; - - if ((dlen + sizeof(struct ip_comp_hdr)) >= plen) { - err = -EMSGSIZE; - goto out; - } + ipcomp_cb(skb)->req = NULL; - memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen); - local_bh_enable(); + do { + struct sk_buff *trailer; - pskb_trim(skb, dlen + sizeof(struct ip_comp_hdr)); - return 0; + if (skb->len > PAGE_SIZE) { + if (skb_linearize_cow(skb)) + return ERR_PTR(-ENOMEM); + nfrags = 1; + break; + } -out: - local_bh_enable(); - return err; -} + if (!skb_cloned(skb) && skb_headlen(skb) >= minhead) { + if (!skb_is_nonlinear(skb)) { + nfrags = 1; + break; + } else if (!skb_has_frag_list(skb)) { + nfrags = skb_shinfo(skb)->nr_frags; + nfrags++; + break; + } + } -int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) -{ - int err; - struct ip_comp_hdr *ipch; - struct ipcomp_data *ipcd = x->data; + nfrags = skb_cow_data(skb, skb_headlen(skb) < minhead ?
+ minhead - skb_headlen(skb) : 0, + &trailer); + if (nfrags < 0) + return ERR_PTR(nfrags); + } while (0); + + tfm = ipcd->tfm; + req = acomp_request_alloc_extra( + tfm, sizeof(*extra) + sizeof(*sg) * (nfrags + dnfrags), + GFP_ATOMIC); + ipcomp_cb(skb)->req = req; + if (!req) + return ERR_PTR(-ENOMEM); + + extra = acomp_request_extra(req); + extra->x = x; + + dsg = extra->sg; + sg = dsg + dnfrags; + sg_init_table(sg, nfrags); + err = skb_to_sgvec(skb, sg, 0, plen); + if (unlikely(err < 0)) + return ERR_PTR(err); + + sg_init_table(dsg, dnfrags); + total = 0; + for (i = 0; i < dnfrags && total < dlen; i++) { + struct page *page; - if (skb->len < ipcd->threshold) { - /* Don't bother compressing */ - goto out_ok; + page = alloc_page(GFP_ATOMIC); + if (!page) + break; + sg_set_page(dsg + i, page, PAGE_SIZE, 0); + total += PAGE_SIZE; } + if (!i) + return ERR_PTR(-ENOMEM); + sg_mark_end(dsg + i - 1); + dlen = min(dlen, total); - if (skb_linearize_cow(skb)) - goto out_ok; - - err = ipcomp_compress(x, skb); - - if (err) { - goto out_ok; - } + acomp_request_set_params(req, sg, dsg, plen, dlen); - /* Install ipcomp header, convert into ipcomp datagram. */ - ipch = ip_comp_hdr(skb); - ipch->nexthdr = *skb_mac_header(skb); - ipch->flags = 0; - ipch->cpi = htons((u16 )ntohl(x->id.spi)); - *skb_mac_header(skb) = IPPROTO_COMP; -out_ok: - skb_push(skb, -skb_network_offset(skb)); - return 0; + return req; } -EXPORT_SYMBOL_GPL(ipcomp_output); -static void ipcomp_free_scratches(void) +static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) { - int i; - void * __percpu *scratches; - - if (--ipcomp_scratch_users) - return; + struct acomp_req *req; + int err; - scratches = ipcomp_scratches; - if (!scratches) - return; + req = ipcomp_setup_req(x, skb, 0, IPCOMP_SCRATCH_SIZE); + err = PTR_ERR(req); + if (IS_ERR(req)) + goto out; - for_each_possible_cpu(i) - vfree(*per_cpu_ptr(scratches, i)); + acomp_request_set_callback(req, 0, ipcomp_input_done, skb); + err = crypto_acomp_decompress(req); + if (err == -EINPROGRESS) + return err; - free_percpu(scratches); - ipcomp_scratches = NULL; +out: + return ipcomp_input_done2(skb, err); } -static void * __percpu *ipcomp_alloc_scratches(void) +int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) { - void * __percpu *scratches; - int i; - - if (ipcomp_scratch_users++) - return ipcomp_scratches; - - scratches = alloc_percpu(void *); - if (!scratches) - return NULL; + struct ip_comp_hdr *ipch __maybe_unused; - ipcomp_scratches = scratches; + if (!pskb_may_pull(skb, sizeof(*ipch))) + return -EINVAL; - for_each_possible_cpu(i) { - void *scratch; + skb->ip_summed = CHECKSUM_NONE; - scratch = vmalloc_node(IPCOMP_SCRATCH_SIZE, cpu_to_node(i)); - if (!scratch) - return NULL; - *per_cpu_ptr(scratches, i) = scratch; - } + /* Remove ipcomp header and decompress original payload */ + __skb_pull(skb, sizeof(*ipch)); - return scratches; + return ipcomp_decompress(x, skb); } +EXPORT_SYMBOL_GPL(ipcomp_input); -static void ipcomp_free_tfms(struct crypto_comp * __percpu *tfms) +static int ipcomp_output_push(struct sk_buff *skb) { - struct ipcomp_tfms *pos; - int cpu; - - list_for_each_entry(pos, &ipcomp_tfms_list, list) { - if (pos->tfms == tfms) - break; - } - - WARN_ON(list_entry_is_head(pos, &ipcomp_tfms_list, list)); - - if (--pos->users) - return; + skb_push(skb, -skb_network_offset(skb)); + return 0; +} - list_del(&pos->list); - kfree(pos); +static int ipcomp_output_done2(struct xfrm_state *x, struct sk_buff *skb, + int err) +{ + struct ip_comp_hdr *ipch; - if 
(!tfms) - return; + err = ipcomp_post_acomp(skb, err, sizeof(*ipch)); + if (err) + goto out_ok; - for_each_possible_cpu(cpu) { - struct crypto_comp *tfm = *per_cpu_ptr(tfms, cpu); - crypto_free_comp(tfm); - } - free_percpu(tfms); + /* Install ipcomp header, convert into ipcomp datagram. */ + ipch = ip_comp_hdr(skb); + ipch->nexthdr = *skb_mac_header(skb); + ipch->flags = 0; + ipch->cpi = htons((u16 )ntohl(x->id.spi)); + *skb_mac_header(skb) = IPPROTO_COMP; +out_ok: + return ipcomp_output_push(skb); } -static struct crypto_comp * __percpu *ipcomp_alloc_tfms(const char *alg_name) +static void ipcomp_output_done(void *data, int err) { - struct ipcomp_tfms *pos; - struct crypto_comp * __percpu *tfms; - int cpu; + struct ipcomp_req_extra *extra; + struct sk_buff *skb = data; + struct acomp_req *req; + req = ipcomp_cb(skb)->req; + extra = acomp_request_extra(req); - list_for_each_entry(pos, &ipcomp_tfms_list, list) { - struct crypto_comp *tfm; + xfrm_output_resume(skb_to_full_sk(skb), skb, + ipcomp_output_done2(extra->x, skb, err)); +} - /* This can be any valid CPU ID so we don't need locking. */ - tfm = this_cpu_read(*pos->tfms); +static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ip_comp_hdr *ipch __maybe_unused; + struct acomp_req *req; + int err; - if (!strcmp(crypto_comp_name(tfm), alg_name)) { - pos->users++; - return pos->tfms; - } - } + req = ipcomp_setup_req(x, skb, sizeof(*ipch), + skb->len - sizeof(*ipch)); + err = PTR_ERR(req); + if (IS_ERR(req)) + goto out; - pos = kmalloc(sizeof(*pos), GFP_KERNEL); - if (!pos) - return NULL; + acomp_request_set_callback(req, 0, ipcomp_output_done, skb); + err = crypto_acomp_compress(req); + if (err == -EINPROGRESS) + return err; - pos->users = 1; - INIT_LIST_HEAD(&pos->list); - list_add(&pos->list, &ipcomp_tfms_list); +out: + return ipcomp_output_done2(x, skb, err); +} - pos->tfms = tfms = alloc_percpu(struct crypto_comp *); - if (!tfms) - goto error; +int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipcomp_data *ipcd = x->data; - for_each_possible_cpu(cpu) { - struct crypto_comp *tfm = crypto_alloc_comp(alg_name, 0, - CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - goto error; - *per_cpu_ptr(tfms, cpu) = tfm; + if (skb->len < ipcd->threshold) { + /* Don't bother compressing */ + return ipcomp_output_push(skb); } - return tfms; - -error: - ipcomp_free_tfms(tfms); - return NULL; + return ipcomp_compress(x, skb); } +EXPORT_SYMBOL_GPL(ipcomp_output); static void ipcomp_free_data(struct ipcomp_data *ipcd) { - if (ipcd->tfms) - ipcomp_free_tfms(ipcd->tfms); - ipcomp_free_scratches(); + crypto_free_acomp(ipcd->tfm); } void ipcomp_destroy(struct xfrm_state *x) @@ -316,9 +314,7 @@ void ipcomp_destroy(struct xfrm_state *x) if (!ipcd) return; xfrm_state_delete_tunnel(x); - mutex_lock(&ipcomp_resource_mutex); ipcomp_free_data(ipcd); - mutex_unlock(&ipcomp_resource_mutex); kfree(ipcd); } EXPORT_SYMBOL_GPL(ipcomp_destroy); @@ -345,14 +341,9 @@ int ipcomp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) if (!ipcd) goto out; - mutex_lock(&ipcomp_resource_mutex); - if (!ipcomp_alloc_scratches()) - goto error; - - ipcd->tfms = ipcomp_alloc_tfms(x->calg->alg_name); - if (!ipcd->tfms) + ipcd->tfm = crypto_alloc_acomp(x->calg->alg_name, 0, 0); + if (IS_ERR(ipcd->tfm)) goto error; - mutex_unlock(&ipcomp_resource_mutex); calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0); BUG_ON(!calg_desc); @@ -364,7 +355,6 @@ out: error: ipcomp_free_data(ipcd); - mutex_unlock(&ipcomp_resource_mutex); kfree(ipcd); goto 
out; } diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c new file mode 100644 index 000000000000..3b6d7284fc70 --- /dev/null +++ b/net/xfrm/xfrm_iptfs.c @@ -0,0 +1,2762 @@ +// SPDX-License-Identifier: GPL-2.0 +/* xfrm_iptfs: IPTFS encapsulation support + * + * April 21 2022, Christian Hopps <chopps@labn.net> + * + * Copyright (c) 2022, LabN Consulting, L.L.C. + * + */ + +#include <linux/kernel.h> +#include <linux/icmpv6.h> +#include <linux/skbuff_ref.h> +#include <net/gro.h> +#include <net/icmp.h> +#include <net/ip6_route.h> +#include <net/inet_ecn.h> +#include <net/xfrm.h> + +#include <crypto/aead.h> + +#include "xfrm_inout.h" +#include "trace_iptfs.h" + +/* IPTFS encap (header) values. */ +#define IPTFS_SUBTYPE_BASIC 0 +#define IPTFS_SUBTYPE_CC 1 + +/* ----------------------------------------------- */ +/* IP-TFS default SA values (tunnel egress/dir-in) */ +/* ----------------------------------------------- */ + +/** + * define IPTFS_DEFAULT_DROP_TIME_USECS - default drop time + * + * The default IPTFS drop time in microseconds. The drop time is the amount of + * time before a missing out-of-order IPTFS tunnel packet is considered lost. + * See also the reorder window. + * + * Default 1s. + */ +#define IPTFS_DEFAULT_DROP_TIME_USECS 1000000 + +/** + * define IPTFS_DEFAULT_REORDER_WINDOW - default reorder window size + * + * The default IPTFS reorder window size. The reorder window size dictates the + * maximum number of IPTFS tunnel packets in a sequence that may arrive out of + * order. + * + * Default 3. (tcp folks suggested) + */ +#define IPTFS_DEFAULT_REORDER_WINDOW 3 + +/* ------------------------------------------------ */ +/* IPTFS default SA values (tunnel ingress/dir-out) */ +/* ------------------------------------------------ */ + +/** + * define IPTFS_DEFAULT_INIT_DELAY_USECS - default initial output delay + * + * The initial output delay is the amount of time prior to servicing the output + * queue after queueing the first packet on said queue. This applies anytime the + * output queue was previously empty. + * + * Default 0. + */ +#define IPTFS_DEFAULT_INIT_DELAY_USECS 0 + +/** + * define IPTFS_DEFAULT_MAX_QUEUE_SIZE - default max output queue size. + * + * The default IPTFS max output queue size in octets. The output queue is where + * received packets destined for output over an IPTFS tunnel are stored prior to + * being output in aggregated/fragmented form over the IPTFS tunnel. + * + * Default 10M. + */ +#define IPTFS_DEFAULT_MAX_QUEUE_SIZE (1024 * 10240) + +/* Assumed: skb->head is cache aligned. + * + * L2 Header resv: Arrange for cacheline to start at skb->data - 16 to keep the + * to-be-pushed L2 header in the same cacheline as resulting `skb->data` (i.e., + * the L3 header). If cacheline size is > 64 then skb->data + pushed L2 will all + * be in a single cacheline if we simply reserve 64 bytes. + * + * L3 Header resv: For L3+L2 headers (i.e., skb->data points at the IPTFS payload) + * we want `skb->data` to be cacheline aligned and all pushed L2L3 headers will + * be in their own cacheline[s]. 128 works for cachelines up to 128 bytes, for + * any larger cacheline sizes the pushed headers will simply share the cacheline + * with the start of the IPTFS payload (skb->data). + */ +#define XFRM_IPTFS_MIN_L3HEADROOM 128 +#define XFRM_IPTFS_MIN_L2HEADROOM (L1_CACHE_BYTES > 64 ?
64 : 64 + 16) + +/* Min to try to share outer iptfs skb data vs copying into new skb */ +#define IPTFS_PKT_SHARE_MIN 129 + +#define NSECS_IN_USEC 1000 + +#define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT + +/** + * struct xfrm_iptfs_config - configuration for the IPTFS tunnel. + * @pkt_size: size of the outer IP packet. 0 to use interface and MTU discovery, + * otherwise the user specified value. + * @max_queue_size: The maximum number of octets allowed to be queued to be sent + * over the IPTFS SA. The queue size is measured as the size of all the + * packets enqueued. + * @reorder_win_size: the number of slots in the reorder window, thus the number of + * packets that may arrive out of order. + * @dont_frag: true to inhibit fragmenting across IPTFS outer packets. + */ +struct xfrm_iptfs_config { + u32 pkt_size; /* outer_packet_size or 0 */ + u32 max_queue_size; /* octets */ + u16 reorder_win_size; + u8 dont_frag : 1; +}; + +struct skb_wseq { + struct sk_buff *skb; + u64 drop_time; +}; + +/** + * struct xfrm_iptfs_data - mode specific xfrm state. + * @cfg: IPTFS tunnel config. + * @x: owning SA (xfrm_state). + * @queue: queued user packets to send. + * @queue_size: number of octets on queue (sum of packet sizes). + * @ecn_queue_size: octets above which to ECN mark. + * @init_delay_ns: nanoseconds to wait to send initial IPTFS packet. + * @iptfs_timer: output timer. + * @iptfs_settime: time the output timer was set. + * @payload_mtu: max payload size. + * @w_seq_set: true after first seq received. + * @w_wantseq: waiting for this seq number as next to process (in order). + * @w_saved: the saved buf array (reorder window). + * @w_savedlen: the saved len (not size). + * @drop_lock: lock to protect reorder queue. + * @drop_timer: timer for considering next packet lost. + * @drop_time_ns: timer interval in nanoseconds. + * @ra_newskb: new pkt being reassembled. + * @ra_wantseq: expected next sequence for reassembly. + * @ra_runt: last pkt bytes from very end of last skb. + * @ra_runtlen: size of ra_runt.
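To make the reorder-window fields concrete, the per-packet decision they drive looks roughly like this (an illustrative sketch only; the exact window-boundary handling lives in the input path further below):

static int reorder_decision(u64 seq, u64 wantseq, u16 win_size)
{
	if (seq < wantseq)
		return -1;	/* too old: drop/count as error */
	if (seq == wantseq)
		return 0;	/* in order: process immediately */
	if (seq - wantseq <= win_size)
		return 1;	/* out of order: park in a w_saved[] slot */
	return 2;		/* beyond window: drain w_saved[], advance */
}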
+ */ +struct xfrm_iptfs_data { + struct xfrm_iptfs_config cfg; + + /* Ingress User Input */ + struct xfrm_state *x; /* owning state */ + struct sk_buff_head queue; /* output queue */ + + u32 queue_size; /* octets */ + u32 ecn_queue_size; /* octets above which ECN mark */ + u64 init_delay_ns; /* nanoseconds */ + struct hrtimer iptfs_timer; /* output timer */ + time64_t iptfs_settime; /* time timer was set */ + u32 payload_mtu; /* max payload size */ + + /* Tunnel input reordering */ + bool w_seq_set; /* true after first seq received */ + u64 w_wantseq; /* expected next sequence */ + struct skb_wseq *w_saved; /* the saved buf array */ + u32 w_savedlen; /* the saved len (not size) */ + spinlock_t drop_lock; + struct hrtimer drop_timer; + u64 drop_time_ns; + + /* Tunnel input reassembly */ + struct sk_buff *ra_newskb; /* new pkt being reassembled */ + u64 ra_wantseq; /* expected next sequence */ + u8 ra_runt[6]; /* last pkt bytes from last skb */ + u8 ra_runtlen; /* count of ra_runt */ +}; + +static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu); +static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me); +static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me); + +/* ================= */ +/* Utility Functions */ +/* ================= */ + +#ifdef TRACEPOINTS_ENABLED +static u32 __trace_ip_proto(struct iphdr *iph) +{ + if (iph->version == 4) + return iph->protocol; + return ((struct ipv6hdr *)iph)->nexthdr; +} + +static u32 __trace_ip_proto_seq(struct iphdr *iph) +{ + void *nexthdr; + u32 protocol = 0; + + if (iph->version == 4) { + nexthdr = (void *)(iph + 1); + protocol = iph->protocol; + } else if (iph->version == 6) { + nexthdr = (void *)(((struct ipv6hdr *)(iph)) + 1); + protocol = ((struct ipv6hdr *)(iph))->nexthdr; + } + switch (protocol) { + case IPPROTO_ICMP: + return ntohs(((struct icmphdr *)nexthdr)->un.echo.sequence); + case IPPROTO_ICMPV6: + return ntohs(((struct icmp6hdr *)nexthdr)->icmp6_sequence); + case IPPROTO_TCP: + return ntohl(((struct tcphdr *)nexthdr)->seq); + case IPPROTO_UDP: + return ntohs(((struct udphdr *)nexthdr)->source); + default: + return 0; + } +} +#endif /*TRACEPOINTS_ENABLED*/ + +static u64 __esp_seq(struct sk_buff *skb) +{ + u64 seq = ntohl(XFRM_SKB_CB(skb)->seq.input.low); + + return seq | (u64)ntohl(XFRM_SKB_CB(skb)->seq.input.hi) << 32; +} + +/* ======================= */ +/* IPTFS SK_BUFF Functions */ +/* ======================= */ + +/** + * iptfs_alloc_skb() - Allocate a new `skb`. + * @tpl: the skb to copy required meta-data from. + * @len: the linear length of the head data, zero is fine. + * @l3resv: true if skb reserve needs to support pushing L3 headers + * + * A new `skb` is allocated and required meta-data is copied from `tpl`, the + * head data is sized to `len` + reserved space set according to the @l3resv + * boolean. + * + * When @l3resv is false, resv is XFRM_IPTFS_MIN_L2HEADROOM which arranges for + * `skb->data - 16` which is a good guess for good cache alignment (placing the + * to-be-pushed L2 header at the start of a cacheline). + * + * Otherwise, @l3resv is true and resv is set to the correct reserved space for + * dst->dev plus the calculated L3 overhead for the xfrm dst or + * XFRM_IPTFS_MIN_L3HEADROOM whichever is larger. This is then cache aligned so + * that all the headers will commonly fall in a cacheline when possible. + * + * l3resv=true is used on tunnel ingress (tx), because we need to reserve for + * the new IPTFS packet (i.e., L2+L3 headers).
On tunnel egress (rx) the data + * being copied into the skb includes the user L3 headers already so we only + * need to reserve for L2. + * + * Return: the new skb or NULL. + */ +static struct sk_buff *iptfs_alloc_skb(struct sk_buff *tpl, u32 len, bool l3resv) +{ + struct sk_buff *skb; + u32 resv; + + if (!l3resv) { + resv = XFRM_IPTFS_MIN_L2HEADROOM; + } else { + struct dst_entry *dst = skb_dst(tpl); + + resv = LL_RESERVED_SPACE(dst->dev) + dst->header_len; + resv = max(resv, XFRM_IPTFS_MIN_L3HEADROOM); + resv = L1_CACHE_ALIGN(resv); + } + + skb = alloc_skb(len + resv, GFP_ATOMIC | __GFP_NOWARN); + if (!skb) + return NULL; + + skb_reserve(skb, resv); + + if (!l3resv) { + /* xfrm_input resume needs dev and xfrm ext from tunnel pkt */ + skb->dev = tpl->dev; + __skb_ext_copy(skb, tpl); + } + + /* dropped by xfrm_input, used by xfrm_output */ + skb_dst_copy(skb, tpl); + + return skb; +} + +/** + * iptfs_skb_head_to_frag() - initialize a skb_frag_t based on skb head data + * @skb: skb with the head data + * @frag: frag to initialize + */ +static void iptfs_skb_head_to_frag(const struct sk_buff *skb, skb_frag_t *frag) +{ + struct page *page = virt_to_head_page(skb->data); + unsigned char *addr = (unsigned char *)page_address(page); + + skb_frag_fill_page_desc(frag, page, skb->data - addr, skb_headlen(skb)); +} + +/** + * struct iptfs_skb_frag_walk - use to track a walk through fragments + * @fragi: current fragment index + * @past: length of data in fragments before @fragi + * @total: length of data in all fragments + * @nr_frags: number of fragments present in array + * @initial_offset: the value passed in to iptfs_skb_prepare_frag_walk() + * @frags: the page fragments inc. room for head page + * @pp_recycle: copy of skb->pp_recycle + */ +struct iptfs_skb_frag_walk { + u32 fragi; + u32 past; + u32 total; + u32 nr_frags; + u32 initial_offset; + skb_frag_t frags[MAX_SKB_FRAGS + 1]; + bool pp_recycle; +}; + +/** + * iptfs_skb_prepare_frag_walk() - initialize a frag walk over an skb. + * @skb: the skb to walk. + * @initial_offset: start the walk @initial_offset into the skb. + * @walk: the walk to initialize + * + * Future calls to iptfs_skb_add_frags() will expect the @offset value to be at + * least @initial_offset large.
+ */ +static void iptfs_skb_prepare_frag_walk(struct sk_buff *skb, u32 initial_offset, + struct iptfs_skb_frag_walk *walk) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + skb_frag_t *frag, *from; + u32 i; + + walk->initial_offset = initial_offset; + walk->fragi = 0; + walk->past = 0; + walk->total = 0; + walk->nr_frags = 0; + walk->pp_recycle = skb->pp_recycle; + + if (skb->head_frag) { + if (initial_offset >= skb_headlen(skb)) { + initial_offset -= skb_headlen(skb); + } else { + frag = &walk->frags[walk->nr_frags++]; + iptfs_skb_head_to_frag(skb, frag); + frag->offset += initial_offset; + frag->len -= initial_offset; + walk->total += frag->len; + initial_offset = 0; + } + } else { + initial_offset -= skb_headlen(skb); + } + + for (i = 0; i < shinfo->nr_frags; i++) { + from = &shinfo->frags[i]; + if (initial_offset >= from->len) { + initial_offset -= from->len; + continue; + } + frag = &walk->frags[walk->nr_frags++]; + *frag = *from; + if (initial_offset) { + frag->offset += initial_offset; + frag->len -= initial_offset; + initial_offset = 0; + } + walk->total += frag->len; + } +} + +static u32 iptfs_skb_reset_frag_walk(struct iptfs_skb_frag_walk *walk, + u32 offset) +{ + /* Adjust offset to refer to internal walk values */ + offset -= walk->initial_offset; + + /* Get to the correct fragment for offset */ + while (offset < walk->past) { + walk->past -= walk->frags[--walk->fragi].len; + if (offset >= walk->past) + break; + } + while (offset >= walk->past + walk->frags[walk->fragi].len) + walk->past += walk->frags[walk->fragi++].len; + + /* offset now relative to this current frag */ + offset -= walk->past; + return offset; +} + +/** + * iptfs_skb_can_add_frags() - check if ok to add frags from walk to skb + * @skb: skb to check for adding frags to + * @walk: the walk that will be used as source for frags. + * @offset: offset from beginning of original skb to start from. + * @len: amount of data to add frag references to in @skb. + * + * Return: true if ok to add frags. + */ +static bool iptfs_skb_can_add_frags(const struct sk_buff *skb, + struct iptfs_skb_frag_walk *walk, + u32 offset, u32 len) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + u32 fragi, nr_frags, fraglen; + + if (skb_has_frag_list(skb) || skb->pp_recycle != walk->pp_recycle) + return false; + + /* Make offset relative to current frag after setting that */ + offset = iptfs_skb_reset_frag_walk(walk, offset); + + /* Verify we have array space for the fragments we need to add */ + fragi = walk->fragi; + nr_frags = shinfo->nr_frags; + while (len && fragi < walk->nr_frags) { + skb_frag_t *frag = &walk->frags[fragi]; + + fraglen = frag->len; + if (offset) { + fraglen -= offset; + offset = 0; + } + if (++nr_frags > MAX_SKB_FRAGS) + return false; + if (len <= fraglen) + return true; + len -= fraglen; + fragi++; + } + /* We may not copy all @len but what we have will fit. */ + return true; +} + +/** + * iptfs_skb_add_frags() - add a range of fragment references into an skb + * @skb: skb to add references into + * @walk: the walk to add referenced fragments from. + * @offset: offset from beginning of original skb to start from. + * @len: amount of data to add frag references to in @skb. + * + * iptfs_skb_can_add_frags() should be called before this function to verify + * that the destination @skb is compatible with the walk and has space in the + * array for the to be added frag references. + * + * Return: The number of bytes not added to @skb b/c we reached the end of the + * walk before adding all of @len. 
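Taken together, the three walk helpers are intended to be used in a prepare/check/add sequence; a condensed sketch of a hypothetical caller (error handling omitted):

static void example_share_frags(struct sk_buff *dst, struct sk_buff *src,
				u32 src_off, u32 off, u32 len)
{
	struct iptfs_skb_frag_walk walk;

	iptfs_skb_prepare_frag_walk(src, src_off, &walk);
	if (!iptfs_skb_can_add_frags(dst, &walk, off, len))
		return;		/* caller would copy the bytes instead */

	/* Returns the number of bytes not added (walk exhausted early). */
	iptfs_skb_add_frags(dst, &walk, off, len);
}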
+ */ +static int iptfs_skb_add_frags(struct sk_buff *skb, + struct iptfs_skb_frag_walk *walk, u32 offset, + u32 len) +{ + struct skb_shared_info *shinfo = skb_shinfo(skb); + u32 fraglen; + + if (!walk->nr_frags || offset >= walk->total + walk->initial_offset) + return len; + + /* make offset relative to current frag after setting that */ + offset = iptfs_skb_reset_frag_walk(walk, offset); + + while (len && walk->fragi < walk->nr_frags) { + skb_frag_t *frag = &walk->frags[walk->fragi]; + skb_frag_t *tofrag = &shinfo->frags[shinfo->nr_frags]; + + *tofrag = *frag; + if (offset) { + tofrag->offset += offset; + tofrag->len -= offset; + offset = 0; + } + __skb_frag_ref(tofrag); + shinfo->nr_frags++; + + /* see if we are done */ + fraglen = tofrag->len; + if (len < fraglen) { + tofrag->len = len; + skb->len += len; + skb->data_len += len; + return 0; + } + /* advance to next source fragment */ + len -= fraglen; /* careful, use dst bv_len */ + skb->len += fraglen; /* careful, " " " */ + skb->data_len += fraglen; /* careful, " " " */ + walk->past += frag->len; /* careful, use src bv_len */ + walk->fragi++; + } + return len; +} + +/* ================================== */ +/* IPTFS Trace Event Definitions */ +/* ================================== */ + +#define CREATE_TRACE_POINTS +#include "trace_iptfs.h" + +/* ================================== */ +/* IPTFS Receiving (egress) Functions */ +/* ================================== */ + +/** + * iptfs_pskb_add_frags() - Create and add frags into a new sk_buff. + * @tpl: template to create new skb from. + * @walk: The source for fragments to add. + * @off: The offset into @walk to add frags from, also used with @st and + * @copy_len. + * @len: The length of data to add covering frags from @walk into @skb. + * @st: The sequence state to copy from into the new head skb. + * @copy_len: Copy @copy_len bytes from @st at offset @off into the new skb + * linear space. + * + * Create a new sk_buff `skb` using the template @tpl. Copy @copy_len bytes from + * @st into the new skb linear space, and then add shared fragments from the + * frag walk for the remaining @len of data (i.e., @len - @copy_len bytes). + * + * Return: The newly allocated sk_buff `skb` or NULL if an error occurs. + */ +static struct sk_buff * +iptfs_pskb_add_frags(struct sk_buff *tpl, struct iptfs_skb_frag_walk *walk, + u32 off, u32 len, struct skb_seq_state *st, u32 copy_len) +{ + struct sk_buff *skb; + + skb = iptfs_alloc_skb(tpl, copy_len, false); + if (!skb) + return NULL; + + /* this should not normally be happening */ + if (!iptfs_skb_can_add_frags(skb, walk, off + copy_len, + len - copy_len)) { + kfree_skb(skb); + return NULL; + } + + if (copy_len && + skb_copy_seq_read(st, off, skb_put(skb, copy_len), copy_len)) { + XFRM_INC_STATS(dev_net(st->root_skb->dev), + LINUX_MIB_XFRMINERROR); + kfree_skb(skb); + return NULL; + } + + iptfs_skb_add_frags(skb, walk, off + copy_len, len - copy_len); + return skb; +} + +/** + * iptfs_pskb_extract_seq() - Create and load data into a new sk_buff. + * @skblen: the total data size for `skb`. + * @st: The source for the rest of the data to copy into `skb`. + * @off: The offset into @st to copy data from. + * @len: The length of data to copy from @st into `skb`. This must be <= + * @skblen. + * + * Create a new sk_buff `skb` with @skblen of packet data space. Then, using + * the skb seq read functions, copy @len bytes from @st into `skb` starting + * from @off.
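The @st argument used throughout is the kernel's sequential skb reader; a minimal sketch of the prepare/copy/abort bracketing these helpers rely on (hypothetical caller):

static int example_seq_copy(struct sk_buff *skb, u32 off, u8 *buf, int len)
{
	struct skb_seq_state st;
	int err;

	skb_prepare_seq_read(skb, 0, skb->len, &st);	/* window: whole skb */
	err = skb_copy_seq_read(&st, off, buf, len);	/* 0 on success */
	skb_abort_seq_read(&st);			/* release read state */
	return err;
}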
+ * + * It is an error for @len to be greater than the amount of data left in @st. + * + * Return: The newly allocated sk_buff `skb` or NULL if an error occurs. + */ +static struct sk_buff * +iptfs_pskb_extract_seq(u32 skblen, struct skb_seq_state *st, u32 off, int len) +{ + struct sk_buff *skb = iptfs_alloc_skb(st->root_skb, skblen, false); + + if (!skb) + return NULL; + if (skb_copy_seq_read(st, off, skb_put(skb, len), len)) { + XFRM_INC_STATS(dev_net(st->root_skb->dev), LINUX_MIB_XFRMINERROR); + kfree_skb(skb); + return NULL; + } + return skb; +} + +/** + * iptfs_input_save_runt() - save data in xtfs runt space. + * @xtfs: xtfs state + * @seq: the current sequence + * @buf: packet data + * @len: length of packet data + * + * Save the small (`len`) start of a fragmented packet in `buf` in the xtfs data + * runt space. + */ +static void iptfs_input_save_runt(struct xfrm_iptfs_data *xtfs, u64 seq, + u8 *buf, int len) +{ + memcpy(xtfs->ra_runt, buf, len); + + xtfs->ra_runtlen = len; + xtfs->ra_wantseq = seq + 1; +} + +/** + * __iptfs_iphlen() - return the v4/v6 header length using packet data. + * @data: pointer at octet with version nibble + * + * The version data has been checked to be valid (i.e., either 4 or 6). + * + * Return: the IP header size based on the IP version. + */ +static u32 __iptfs_iphlen(u8 *data) +{ + struct iphdr *iph = (struct iphdr *)data; + + if (iph->version == 0x4) + return sizeof(*iph); + return sizeof(struct ipv6hdr); +} + +/** + * __iptfs_iplen() - return the v4/v6 length using packet data. + * @data: pointer to ip (v4/v6) packet header + * + * Grab the IPv4 or IPv6 length value in the start of the inner packet header + * pointed to by `data`. Assumes data len is enough for the length field only. + * + * The version data has been checked to be valid (i.e., either 4 or 6). + * + * Return: the length value. + */ +static u32 __iptfs_iplen(u8 *data) +{ + struct iphdr *iph = (struct iphdr *)data; + + if (iph->version == 0x4) + return ntohs(iph->tot_len); + return ntohs(((struct ipv6hdr *)iph)->payload_len) + + sizeof(struct ipv6hdr); +} + +/** + * iptfs_complete_inner_skb() - finish preparing the inner packet for gro recv. + * @x: xfrm state + * @skb: the inner packet + * + * Finish the standard xfrm processing on the inner packet prior to sending back + * through gro_cells_receive. We do this separately b/c we are building a list + * of packets in the hopes that one day a list will be taken by + * xfrm_input. + */ +static void iptfs_complete_inner_skb(struct xfrm_state *x, struct sk_buff *skb) +{ + skb_reset_network_header(skb); + + /* The packet is going back through gro_cells_receive, so no need to + * set this. + */ + skb_reset_transport_header(skb); + + /* Packet already has checksum value set. */ + skb->ip_summed = CHECKSUM_NONE; + + /* Our skb will contain the header data copied from the outer packet + * that contained the start of this inner packet. This is true + * when we allocate a new skb as well as when we reuse the existing skb.
+	 */
+	if (ip_hdr(skb)->version == 0x4) {
+		struct iphdr *iph = ip_hdr(skb);
+
+		if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+			ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, iph);
+		if (!(x->props.flags & XFRM_STATE_NOECN))
+			if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
+				IP_ECN_set_ce(iph);
+
+		skb->protocol = htons(ETH_P_IP);
+	} else {
+		struct ipv6hdr *iph = ipv6_hdr(skb);
+
+		if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+			ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, iph);
+		if (!(x->props.flags & XFRM_STATE_NOECN))
+			if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
+				IP6_ECN_set_ce(skb, iph);
+
+		skb->protocol = htons(ETH_P_IPV6);
+	}
+}
+
+static void __iptfs_reassem_done(struct xfrm_iptfs_data *xtfs, bool free)
+{
+	assert_spin_locked(&xtfs->drop_lock);
+
+	/* We don't care if it works; locking takes care of things */
+	hrtimer_try_to_cancel(&xtfs->drop_timer);
+	if (free)
+		kfree_skb(xtfs->ra_newskb);
+	xtfs->ra_newskb = NULL;
+}
+
+/**
+ * iptfs_reassem_abort() - In-progress packet is aborted; free the state.
+ * @xtfs: xtfs state
+ */
+static void iptfs_reassem_abort(struct xfrm_iptfs_data *xtfs)
+{
+	__iptfs_reassem_done(xtfs, true);
+}
+
+/**
+ * iptfs_reassem_done() - In-progress packet is complete, clear the state.
+ * @xtfs: xtfs state
+ */
+static void iptfs_reassem_done(struct xfrm_iptfs_data *xtfs)
+{
+	__iptfs_reassem_done(xtfs, false);
+}
+
+/**
+ * iptfs_reassem_cont() - Continue the reassembly of an inner packet.
+ * @xtfs: xtfs state
+ * @seq: sequence of current packet
+ * @st: seq read state for current packet
+ * @skb: current packet
+ * @data: offset into sequential packet data
+ * @blkoff: packet blkoff value
+ * @list: list of skbs to enqueue completed packet on
+ *
+ * Process an IPTFS payload that has a non-zero `blkoff`, or when we are
+ * expecting the continuation b/c we have a runt or in-progress packet.
+ *
+ * Return: the new data offset to continue processing from.
+ */
+static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq,
+			      struct skb_seq_state *st, struct sk_buff *skb,
+			      u32 data, u32 blkoff, struct list_head *list)
+{
+	struct iptfs_skb_frag_walk _fragwalk;
+	struct iptfs_skb_frag_walk *fragwalk = NULL;
+	struct sk_buff *newskb = xtfs->ra_newskb;
+	u32 remaining = skb->len - data;
+	u32 runtlen = xtfs->ra_runtlen;
+	u32 copylen, fraglen, ipremain, iphlen, iphremain, rrem;
+
+	/* Handle packet fragment we aren't expecting */
+	if (!runtlen && !xtfs->ra_newskb)
+		return data + min(blkoff, remaining);
+
+	/* Important to remember that input to this function is an ordered
+	 * packet stream (unless the user disabled the reorder window). Thus,
+	 * if we are waiting for and expecting the next packet so we can
+	 * continue assembly, a newer sequence number indicates older ones are
+	 * not coming (or if they do, they should be ignored). Technically we
+	 * can receive older ones when the reorder window is disabled; however,
+	 * the user should have disabled fragmentation in this case, and
+	 * regardless we don't deal with it.
+	 *
+	 * blkoff could be zero if the stream is messed up (or it's an all-pad
+	 * insertion); be careful to handle that case in each of the checks
+	 * below.
+	 */
+
+	/* Too old case: This can happen when the reorder window is disabled so
+	 * ordering isn't actually guaranteed.
+	 */
+	if (seq < xtfs->ra_wantseq)
+		return data + remaining;
+
+	/* Too new case: We missed what we wanted; clean up.
*/ + if (seq > xtfs->ra_wantseq) { + XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + + if (blkoff == 0) { + if ((*skb->data & 0xF0) != 0) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + /* Handle all pad case, advance expected sequence number. + * (RFC 9347 S2.2.3) + */ + xtfs->ra_wantseq++; + /* will end parsing */ + return data + remaining; + } + + if (runtlen) { + /* Regardless of what happens we're done with the runt */ + xtfs->ra_runtlen = 0; + + /* The start of this inner packet was at the very end of the last + * iptfs payload which didn't include enough for the ip header + * length field. We must have *at least* that now. + */ + rrem = sizeof(xtfs->ra_runt) - runtlen; + if (remaining < rrem || blkoff < rrem) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + + /* fill in the runt data */ + if (skb_copy_seq_read(st, data, &xtfs->ra_runt[runtlen], + rrem)) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINBUFFERERROR); + goto abandon; + } + + /* We have enough data to get the ip length value now, + * allocate an in progress skb + */ + ipremain = __iptfs_iplen(xtfs->ra_runt); + if (ipremain < sizeof(xtfs->ra_runt)) { + /* length has to be at least runtsize large */ + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + + /* For the runt case we don't attempt sharing currently. NOTE: + * Currently, this IPTFS implementation will not create runts. + */ + + newskb = iptfs_alloc_skb(skb, ipremain, false); + if (!newskb) { + XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINERROR); + goto abandon; + } + xtfs->ra_newskb = newskb; + + /* Copy the runt data into the buffer, but leave data + * pointers the same as normal non-runt case. The extra `rrem` + * recopied bytes are basically cacheline free. Allows using + * same logic below to complete. + */ + memcpy(skb_put(newskb, runtlen), xtfs->ra_runt, + sizeof(xtfs->ra_runt)); + } + + /* Continue reassembling the packet */ + ipremain = __iptfs_iplen(newskb->data); + iphlen = __iptfs_iphlen(newskb->data); + + ipremain -= newskb->len; + if (blkoff < ipremain) { + /* Corrupt data, we don't have enough to complete the packet */ + XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + + /* We want the IP header in linear space */ + if (newskb->len < iphlen) { + iphremain = iphlen - newskb->len; + if (blkoff < iphremain) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINIPTFSERROR); + goto abandon; + } + fraglen = min(blkoff, remaining); + copylen = min(fraglen, iphremain); + if (skb_copy_seq_read(st, data, skb_put(newskb, copylen), + copylen)) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINBUFFERERROR); + goto abandon; + } + /* this is a silly condition that might occur anyway */ + if (copylen < iphremain) { + xtfs->ra_wantseq++; + return data + fraglen; + } + /* update data and things derived from it */ + data += copylen; + blkoff -= copylen; + remaining -= copylen; + ipremain -= copylen; + } + + fraglen = min(blkoff, remaining); + copylen = min(fraglen, ipremain); + + /* If we may have the opportunity to share prepare a fragwalk. */ + if (!skb_has_frag_list(skb) && !skb_has_frag_list(newskb) && + (skb->head_frag || skb->len == skb->data_len) && + skb->pp_recycle == newskb->pp_recycle) { + fragwalk = &_fragwalk; + iptfs_skb_prepare_frag_walk(skb, data, fragwalk); + } + + /* Try share then copy. 
*/ + if (fragwalk && + iptfs_skb_can_add_frags(newskb, fragwalk, data, copylen)) { + iptfs_skb_add_frags(newskb, fragwalk, data, copylen); + } else { + /* copy fragment data into newskb */ + if (skb_copy_seq_read(st, data, skb_put(newskb, copylen), + copylen)) { + XFRM_INC_STATS(xs_net(xtfs->x), + LINUX_MIB_XFRMINBUFFERERROR); + goto abandon; + } + } + + if (copylen < ipremain) { + xtfs->ra_wantseq++; + } else { + /* We are done with packet reassembly! */ + iptfs_reassem_done(xtfs); + iptfs_complete_inner_skb(xtfs->x, newskb); + list_add_tail(&newskb->list, list); + } + + /* will continue on to new data block or end */ + return data + fraglen; + +abandon: + if (xtfs->ra_newskb) { + iptfs_reassem_abort(xtfs); + } else { + xtfs->ra_runtlen = 0; + xtfs->ra_wantseq = 0; + } + /* skip past fragment, maybe to end */ + return data + min(blkoff, remaining); +} + +static bool __input_process_payload(struct xfrm_state *x, u32 data, + struct skb_seq_state *skbseq, + struct list_head *sublist) +{ + u8 hbytes[sizeof(struct ipv6hdr)]; + struct iptfs_skb_frag_walk _fragwalk; + struct iptfs_skb_frag_walk *fragwalk = NULL; + struct sk_buff *defer, *first_skb, *next, *skb; + const unsigned char *old_mac; + struct xfrm_iptfs_data *xtfs; + struct iphdr *iph; + struct net *net; + u32 first_iplen, iphlen, iplen, remaining, tail; + u32 capturelen; + u64 seq; + + xtfs = x->mode_data; + net = xs_net(x); + skb = skbseq->root_skb; + first_skb = NULL; + defer = NULL; + + seq = __esp_seq(skb); + + /* Save the old mac header if set */ + old_mac = skb_mac_header_was_set(skb) ? skb_mac_header(skb) : NULL; + + /* New packets */ + + tail = skb->len; + while (data < tail) { + __be16 protocol = 0; + + /* Gather information on the next data block. + * `data` points to the start of the data block. + */ + remaining = tail - data; + + /* try and copy enough bytes to read length from ipv4/ipv6 */ + iphlen = min_t(u32, remaining, 6); + if (skb_copy_seq_read(skbseq, data, hbytes, iphlen)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto done; + } + + iph = (struct iphdr *)hbytes; + if (iph->version == 0x4) { + /* must have at least tot_len field present */ + if (remaining < 4) { + /* save the bytes we have, advance data and exit */ + iptfs_input_save_runt(xtfs, seq, hbytes, + remaining); + data += remaining; + break; + } + + iplen = be16_to_cpu(iph->tot_len); + iphlen = iph->ihl << 2; + protocol = cpu_to_be16(ETH_P_IP); + XFRM_MODE_SKB_CB(skbseq->root_skb)->tos = iph->tos; + } else if (iph->version == 0x6) { + /* must have at least payload_len field present */ + if (remaining < 6) { + /* save the bytes we have, advance data and exit */ + iptfs_input_save_runt(xtfs, seq, hbytes, + remaining); + data += remaining; + break; + } + + iplen = be16_to_cpu(((struct ipv6hdr *)hbytes)->payload_len); + iplen += sizeof(struct ipv6hdr); + iphlen = sizeof(struct ipv6hdr); + protocol = cpu_to_be16(ETH_P_IPV6); + XFRM_MODE_SKB_CB(skbseq->root_skb)->tos = + ipv6_get_dsfield((struct ipv6hdr *)iph); + } else if (iph->version == 0x0) { + /* pad */ + data = tail; + break; + } else { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto done; + } + + if (unlikely(skbseq->stepped_offset)) { + /* We need to reset our seq read, it can't backup at + * this point. 
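+		 *
+		 * (skb_seq_read() only walks forward through the linear data,
+		 * frags, and frag_list; once it has stepped into the frag
+		 * data it cannot seek backwards, so restart the walk at
+		 * `data`.)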
+ */ + struct sk_buff *save = skbseq->root_skb; + + skb_abort_seq_read(skbseq); + skb_prepare_seq_read(save, data, tail, skbseq); + } + + if (first_skb) { + skb = NULL; + } else { + first_skb = skb; + first_iplen = iplen; + fragwalk = NULL; + + /* We are going to skip over `data` bytes to reach the + * start of the IP header of `iphlen` len for `iplen` + * inner packet. + */ + + if (skb_has_frag_list(skb)) { + defer = skb; + skb = NULL; + } else if (data + iphlen <= skb_headlen(skb) && + /* make sure our header is 32-bit aligned? */ + /* ((uintptr_t)(skb->data + data) & 0x3) == 0 && */ + skb_tailroom(skb) + tail - data >= iplen) { + /* Reuse the received skb. + * + * We have enough headlen to pull past any + * initial fragment data, leaving at least the + * IP header in the linear buffer space. + * + * For linear buffer space we only require that + * linear buffer space is large enough to + * eventually hold the entire reassembled + * packet (by including tailroom in the check). + * + * For non-linear tailroom is 0 and so we only + * re-use if the entire packet is present + * already. + * + * NOTE: there are many more options for + * sharing, KISS for now. Also, this can produce + * skb's with the IP header unaligned to 32 + * bits. If that ends up being a problem then a + * check should be added to the conditional + * above that the header lies on a 32-bit + * boundary as well. + */ + skb_pull(skb, data); + + /* our range just changed */ + data = 0; + tail = skb->len; + remaining = skb->len; + + skb->protocol = protocol; + skb_mac_header_rebuild(skb); + if (skb->mac_len) + eth_hdr(skb)->h_proto = skb->protocol; + + /* all pointers could be changed now reset walk */ + skb_abort_seq_read(skbseq); + skb_prepare_seq_read(skb, data, tail, skbseq); + } else if (skb->head_frag && + /* We have the IP header right now */ + remaining >= iphlen) { + fragwalk = &_fragwalk; + iptfs_skb_prepare_frag_walk(skb, data, fragwalk); + defer = skb; + skb = NULL; + } else { + /* We couldn't reuse the input skb so allocate a + * new one. + */ + defer = skb; + skb = NULL; + } + + /* Don't trim `first_skb` until the end as we are + * walking that data now. + */ + } + + capturelen = min(iplen, remaining); + if (!skb) { + if (!fragwalk || + /* Large enough to be worth sharing */ + iplen < IPTFS_PKT_SHARE_MIN || + /* Have IP header + some data to share. 
*/ + capturelen <= iphlen || + /* Try creating skb and adding frags */ + !(skb = iptfs_pskb_add_frags(first_skb, fragwalk, + data, capturelen, + skbseq, iphlen))) { + skb = iptfs_pskb_extract_seq(iplen, skbseq, data, capturelen); + } + if (!skb) { + /* skip to next packet or done */ + data += capturelen; + continue; + } + + skb->protocol = protocol; + if (old_mac) { + /* rebuild the mac header */ + skb_set_mac_header(skb, -first_skb->mac_len); + memcpy(skb_mac_header(skb), old_mac, first_skb->mac_len); + eth_hdr(skb)->h_proto = skb->protocol; + } + } + + data += capturelen; + + if (skb->len < iplen) { + /* Start reassembly */ + spin_lock(&xtfs->drop_lock); + + xtfs->ra_newskb = skb; + xtfs->ra_wantseq = seq + 1; + if (!hrtimer_is_queued(&xtfs->drop_timer)) { + /* softirq blocked lest the timer fire and interrupt us */ + hrtimer_start(&xtfs->drop_timer, + xtfs->drop_time_ns, + IPTFS_HRTIMER_MODE); + } + + spin_unlock(&xtfs->drop_lock); + + break; + } + + iptfs_complete_inner_skb(x, skb); + list_add_tail(&skb->list, sublist); + } + + if (data != tail) + /* this should not happen from the above code */ + XFRM_INC_STATS(net, LINUX_MIB_XFRMINIPTFSERROR); + + if (first_skb && first_iplen && !defer && first_skb != xtfs->ra_newskb) { + /* first_skb is queued b/c !defer and not partial */ + if (pskb_trim(first_skb, first_iplen)) { + /* error trimming */ + list_del(&first_skb->list); + defer = first_skb; + } + first_skb->ip_summed = CHECKSUM_NONE; + } + + /* Send the packets! */ + list_for_each_entry_safe(skb, next, sublist, list) { + skb_list_del_init(skb); + if (xfrm_input(skb, 0, 0, -2)) + kfree_skb(skb); + } +done: + skb = skbseq->root_skb; + skb_abort_seq_read(skbseq); + + if (defer) { + consume_skb(defer); + } else if (!first_skb) { + /* skb is the original passed in skb, but we didn't get far + * enough to process it as the first_skb, if we had it would + * either be save in ra_newskb, trimmed and sent on as an skb or + * placed in defer to be freed. + */ + kfree_skb(skb); + } + return true; +} + +/** + * iptfs_input_ordered() - handle next in order IPTFS payload. + * @x: xfrm state + * @skb: current packet + * + * Process the IPTFS payload in `skb` and consume it afterwards. 
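+ *
+ * For orientation, an AGGFRAG payload (RFC 9347) processed here looks
+ * roughly like this (sketch, not to scale):
+ *
+ *	[ basic or CC hdr | tail of prev inner pkt | inner pkt | ... | pad ]
+ *	                  ^
+ *	                  block_offset bytes to the first new inner packet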
+ */
+static void iptfs_input_ordered(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ip_iptfs_cc_hdr iptcch;
+	struct skb_seq_state skbseq;
+	struct list_head sublist; /* rename this; it's just a list */
+	struct xfrm_iptfs_data *xtfs;
+	struct ip_iptfs_hdr *ipth;
+	struct net *net;
+	u32 blkoff, data, remaining;
+	bool consumed = false;
+	u64 seq;
+
+	xtfs = x->mode_data;
+	net = xs_net(x);
+
+	seq = __esp_seq(skb);
+
+	/* Large enough to hold both types of header */
+	ipth = (struct ip_iptfs_hdr *)&iptcch;
+
+	skb_prepare_seq_read(skb, 0, skb->len, &skbseq);
+
+	/* Get the IPTFS header and validate it */
+
+	if (skb_copy_seq_read(&skbseq, 0, ipth, sizeof(*ipth))) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
+		goto done;
+	}
+	data = sizeof(*ipth);
+
+	trace_iptfs_egress_recv(skb, xtfs, be16_to_cpu(ipth->block_offset));
+
+	/* Set data past the basic header */
+	if (ipth->subtype == IPTFS_SUBTYPE_CC) {
+		/* Copy the rest of the CC header */
+		remaining = sizeof(iptcch) - sizeof(*ipth);
+		if (skb_copy_seq_read(&skbseq, data, ipth + 1, remaining)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
+			goto done;
+		}
+		data += remaining;
+	} else if (ipth->subtype != IPTFS_SUBTYPE_BASIC) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
+		goto done;
+	}
+
+	if (ipth->flags != 0) {
+		XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
+		goto done;
+	}
+
+	INIT_LIST_HEAD(&sublist);
+
+	/* Handle fragment at start of payload, and/or waiting reassembly. */
+
+	blkoff = ntohs(ipth->block_offset);
+	/* racy pre-check before locking; re-checked under the lock below */
+	if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) {
+		spin_lock(&xtfs->drop_lock);
+
+		/* check again after lock */
+		if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) {
+			data = iptfs_reassem_cont(xtfs, seq, &skbseq, skb, data,
+						  blkoff, &sublist);
+		}
+
+		spin_unlock(&xtfs->drop_lock);
+	}
+
+	/* New packets */
+	consumed = __input_process_payload(x, data, &skbseq, &sublist);
+done:
+	if (!consumed) {
+		skb = skbseq.root_skb;
+		skb_abort_seq_read(&skbseq);
+		kfree_skb(skb);
+	}
+}
+
+/* ------------------------------- */
+/* Input (Egress) Re-ordering Code */
+/* ------------------------------- */
+
+static void __vec_shift(struct xfrm_iptfs_data *xtfs, u32 shift)
+{
+	u32 savedlen = xtfs->w_savedlen;
+
+	if (shift > savedlen)
+		shift = savedlen;
+	if (shift != savedlen)
+		memcpy(xtfs->w_saved, xtfs->w_saved + shift,
+		       (savedlen - shift) * sizeof(*xtfs->w_saved));
+	memset(xtfs->w_saved + savedlen - shift, 0,
+	       shift * sizeof(*xtfs->w_saved));
+	xtfs->w_savedlen -= shift;
+}
+
+static void __reorder_past(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb,
+			   struct list_head *freelist)
+{
+	list_add_tail(&inskb->list, freelist);
+}
+
+static u32 __reorder_drop(struct xfrm_iptfs_data *xtfs, struct list_head *list)
+{
+	struct skb_wseq *s, *se;
+	const u32 savedlen = xtfs->w_savedlen;
+	time64_t now = ktime_get_raw_fast_ns();
+	u32 count = 0;
+	u32 scount = 0;
+
+	if (xtfs->w_saved[0].drop_time > now)
+		goto set_timer;
+
+	++xtfs->w_wantseq;
+
+	/* Keep flushing packets until we reach a drop time greater than now. */
+	s = xtfs->w_saved;
+	se = s + savedlen;
+	do {
+		/* Walking past empty slots until we reach a packet */
+		for (; s < se && !s->skb; s++) {
+			if (s->drop_time > now)
+				goto outerdone;
+		}
+		/* Sending packets until we hit another empty slot.
*/ + for (; s < se && s->skb; scount++, s++) + list_add_tail(&s->skb->list, list); + } while (s < se); +outerdone: + + count = s - xtfs->w_saved; + if (count) { + xtfs->w_wantseq += count; + + /* Shift handled slots plus final empty slot into slot 0. */ + __vec_shift(xtfs, count); + } + + if (xtfs->w_savedlen) { +set_timer: + /* Drifting is OK */ + hrtimer_start(&xtfs->drop_timer, + xtfs->w_saved[0].drop_time - now, + IPTFS_HRTIMER_MODE); + } + return scount; +} + +static void __reorder_this(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb, + struct list_head *list) +{ + struct skb_wseq *s, *se; + const u32 savedlen = xtfs->w_savedlen; + u32 count = 0; + + /* Got what we wanted. */ + list_add_tail(&inskb->list, list); + ++xtfs->w_wantseq; + if (!savedlen) + return; + + /* Flush remaining consecutive packets. */ + + /* Keep sending until we hit another missed pkt. */ + for (s = xtfs->w_saved, se = s + savedlen; s < se && s->skb; s++) + list_add_tail(&s->skb->list, list); + count = s - xtfs->w_saved; + if (count) + xtfs->w_wantseq += count; + + /* Shift handled slots plus final empty slot into slot 0. */ + __vec_shift(xtfs, count + 1); +} + +/* Set the slot's drop time and all the empty slots below it until reaching a + * filled slot which will already be set. + */ +static void iptfs_set_window_drop_times(struct xfrm_iptfs_data *xtfs, int index) +{ + const u32 savedlen = xtfs->w_savedlen; + struct skb_wseq *s = xtfs->w_saved; + time64_t drop_time; + + assert_spin_locked(&xtfs->drop_lock); + + if (savedlen > index + 1) { + /* we are below another, our drop time and the timer are already set */ + return; + } + /* we are the most future so get a new drop time. */ + drop_time = ktime_get_raw_fast_ns(); + drop_time += xtfs->drop_time_ns; + + /* Walk back through the array setting drop times as we go */ + s[index].drop_time = drop_time; + while (index-- > 0 && !s[index].skb) + s[index].drop_time = drop_time; + + /* If we walked all the way back, schedule the drop timer if needed */ + if (index == -1 && !hrtimer_is_queued(&xtfs->drop_timer)) + hrtimer_start(&xtfs->drop_timer, xtfs->drop_time_ns, + IPTFS_HRTIMER_MODE); +} + +static void __reorder_future_fits(struct xfrm_iptfs_data *xtfs, + struct sk_buff *inskb, + struct list_head *freelist) +{ + const u64 inseq = __esp_seq(inskb); + const u64 wantseq = xtfs->w_wantseq; + const u64 distance = inseq - wantseq; + const u32 savedlen = xtfs->w_savedlen; + const u32 index = distance - 1; + + /* Handle future sequence number received which fits in the window. + * + * We know we don't have the seq we want so we won't be able to flush + * anything. 
+	 */
+
+	/* slot count is 4, saved size is 3, savedlen is 2
+	 *
+	 * "window boundary" is based on the fixed window size
+	 * distance is also slot number
+	 * index is an array index (i.e., - 1 of slot)
+	 * : : - implicit NULL after array len
+	 *
+	 *          +--------- used length (savedlen == 2)
+	 *          |   +----- array size (nslots - 1 == 3)
+	 *          |   |   + window boundary (nslots == 4)
+	 *          V   V | V
+	 *                |
+	 *   0   1   2   3 |  slot number
+	 *  ---  0   1   2 |  array index
+	 *      [-] [b] : :|  array
+	 *
+	 *  "2" "3" "4" *5*|  seq numbers
+	 *
+	 * We receive seq number 5
+	 * distance == 3 [inseq(5) - w_wantseq(2)]
+	 * index == 2 [distance(3) - 1]
+	 */
+
+	if (xtfs->w_saved[index].skb) {
+		/* a dup of a future */
+		list_add_tail(&inskb->list, freelist);
+		return;
+	}
+
+	xtfs->w_saved[index].skb = inskb;
+	xtfs->w_savedlen = max(savedlen, index + 1);
+	iptfs_set_window_drop_times(xtfs, index);
+}
+
+static void __reorder_future_shifts(struct xfrm_iptfs_data *xtfs,
+				    struct sk_buff *inskb,
+				    struct list_head *list)
+{
+	const u32 nslots = xtfs->cfg.reorder_win_size + 1;
+	const u64 inseq = __esp_seq(inskb);
+	u32 savedlen = xtfs->w_savedlen;
+	u64 wantseq = xtfs->w_wantseq;
+	struct skb_wseq *wnext;
+	struct sk_buff *slot0;
+	u32 beyond, shifting, slot;
+	u64 distance;
+
+	/* Handle future sequence number received.
+	 *
+	 * IMPORTANT: we are at least advancing w_wantseq (i.e., wantseq) by 1
+	 * b/c we are beyond the window boundary.
+	 *
+	 * We know we don't have the wantseq so that counts as a drop.
+	 */
+
+	/* example: slot count is 4, array size is 3, savedlen is 2, slot 0 is
+	 * the missing sequence number.
+	 *
+	 * the final slot at savedlen (index savedlen - 1) is always occupied.
+	 *
+	 * beyond is "beyond array size" not savedlen.
+	 *
+	 *          +--------- array length (savedlen == 2)
+	 *          |   +----- array size (nslots - 1 == 3)
+	 *          |   |   +- window boundary (nslots == 4)
+	 *          V   V   |
+	 *                  |
+	 *   0   1   2   3  |  slot number
+	 *  ---  0   1   2  |  array index
+	 *      [b] [c] : : |  array
+	 *                  |
+	 *  "2" "3" "4" "5" | *6*  seq numbers
+	 *
+	 * We receive seq number 6
+	 * distance == 4 [inseq(6) - w_wantseq(2)]
+	 * newslot == distance
+	 * index == 3 [distance(4) - 1]
+	 * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))]
+	 * shifting == 1 [min(savedlen(2), beyond(1))]
+	 * slot0_skb == [b], and should match w_wantseq
+	 *
+	 *                  +--- window boundary (nslots == 4)
+	 *   0   1   2   3  |  4  slot number
+	 *  ---  0   1   2  |  3  array index
+	 *      [b]  :   : :|     array
+	 *
+	 *  "2" "3" "4" "5" *6*   seq numbers
+	 *
+	 * We receive seq number 6
+	 * distance == 4 [inseq(6) - w_wantseq(2)]
+	 * newslot == distance
+	 * index == 3 [distance(4) - 1]
+	 * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))]
+	 * shifting == 1 [min(savedlen(1), beyond(1))]
+	 * slot0_skb == [b] and should match w_wantseq
+	 *
+	 *                  +-- window boundary (nslots == 4)
+	 *   0   1   2   3  |  4   5   6  slot number
+	 *  ---  0   1   2  |  3   4   5  array index
+	 *      [-] [c] : : |             array
+	 *
+	 *  "2" "3" "4" "5" "6" "7" *8*   seq numbers
+	 *
+	 * savedlen = 2, beyond = 3
+	 * iter 1: slot0 == NULL, missed++, lastdrop = 2 (2+1-1), slot0 = [-]
+	 * iter 2: slot0 == NULL, missed++, lastdrop = 3 (2+2-1), slot0 = [c]
+	 * 2 < 3, extra = 1 (3-2), missed += extra, lastdrop = 4 (2+2+1-1)
+	 *
+	 * We receive seq number 8
+	 * distance == 6 [inseq(8) - w_wantseq(2)]
+	 * newslot == distance
+	 * index == 5 [distance(6) - 1]
+	 * beyond == 3 [newslot(6) - lastslot((nslots(4) - 1))]
+	 * shifting == 2 [min(savedlen(2), beyond(3))]
+	 *
+	 * slot0_skb == NULL changed from [b] when "savedlen < beyond" is true.
+	 */
+
+	/* Now send any packets that are being shifted out of saved, and account
+	 * for missing packets that are exiting the window as we shift it.
+	 */
+
+	distance = inseq - wantseq;
+	beyond = distance - (nslots - 1);
+
+	/* If savedlen > beyond we are shifting some, else all. */
+	shifting = min(savedlen, beyond);
+
+	/* slot0 tracks the buf most recently shifted out of the saved array */
+	slot0 = NULL;
+	wnext = xtfs->w_saved;
+	for (slot = 1; slot <= shifting; slot++, wnext++) {
+		/* handle what was in slot0 before we occupy it */
+		if (slot0)
+			list_add_tail(&slot0->list, list);
+		slot0 = wnext->skb;
+		wnext->skb = NULL;
+	}
+
+	/* slot0 is now either NULL (in which case it's what we are now waiting
+	 * for), or a buf (in which case we need to handle it like we received
+	 * it); however, we may be advancing past that buffer as well.
+	 */
+
+	/* Handle case where we need to shift more than we had saved, slot0 will
+	 * be NULL iff savedlen is 0, otherwise slot0 will always be
+	 * non-NULL b/c we shifted the final element, which is always set if
+	 * there is any saved, into slot0.
+	 */
+	if (savedlen < beyond) {
+		if (savedlen != 0)
+			list_add_tail(&slot0->list, list);
+		slot0 = NULL;
+		/* slot0 has had an empty slot pushed into it */
+	}
+
+	/* Remove the entries */
+	__vec_shift(xtfs, beyond);
+
+	/* Advance want seq */
+	xtfs->w_wantseq += beyond;
+
+	/* Process drops here when implementing congestion control */
+
+	/* We've shifted; plug the packet in at the end. */
+	xtfs->w_savedlen = nslots - 1;
+	xtfs->w_saved[xtfs->w_savedlen - 1].skb = inskb;
+	iptfs_set_window_drop_times(xtfs, xtfs->w_savedlen - 1);
+
+	/* if we don't have a slot0 then we must wait for it */
+	if (!slot0)
+		return;
+
+	/* If slot0, seq must match new want seq */
+
+	/* slot0 is valid, treat like we received expected. */
+	__reorder_this(xtfs, slot0, list);
+}
+
+/* Receive a new packet into the reorder window. Return a list of ordered
+ * packets from the window.
+ */
+static void iptfs_input_reorder(struct xfrm_iptfs_data *xtfs,
+				struct sk_buff *inskb, struct list_head *list,
+				struct list_head *freelist)
+{
+	const u32 nslots = xtfs->cfg.reorder_win_size + 1;
+	u64 inseq = __esp_seq(inskb);
+	u64 wantseq;
+
+	assert_spin_locked(&xtfs->drop_lock);
+
+	if (unlikely(!xtfs->w_seq_set)) {
+		xtfs->w_seq_set = true;
+		xtfs->w_wantseq = inseq;
+	}
+	wantseq = xtfs->w_wantseq;
+
+	if (likely(inseq == wantseq))
+		__reorder_this(xtfs, inskb, list);
+	else if (inseq < wantseq)
+		__reorder_past(xtfs, inskb, freelist);
+	else if ((inseq - wantseq) < nslots)
+		__reorder_future_fits(xtfs, inskb, freelist);
+	else
+		__reorder_future_shifts(xtfs, inskb, list);
+}
+
+/**
+ * iptfs_drop_timer() - Handle drop timer expiry.
+ * @me: the timer
+ *
+ * This is similar to our input function.
+ *
+ * The drop timer is set when we start an in-progress reassembly, and also when
+ * we save a future packet in the window saved array.
+ *
+ * NOTE: packets in the save window always have newer drop times the further
+ * into the future they are; i.e., for slots (S0, S1, ... Sn), if `Dn` is the
+ * drop time for slot `Sn`, then D(n-1) <= D(n).
+ *
+ * So, regardless of why the timer is firing, we can always discard any
+ * in-progress fragment; either it's the reassembly timer, or slot 0 is going
+ * to be dropped as S0 must have the earliest drop time, and slot 0 holds the
+ * continuation fragment of the in-progress packet.
+ *
+ * Returns HRTIMER_NORESTART.
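+ *
+ * Context: hrtimer (softirq) callback; takes and releases &xtfs->drop_lock
+ * internally.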
+ */ +static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me) +{ + struct sk_buff *skb, *next; + struct list_head list; + struct xfrm_iptfs_data *xtfs; + struct xfrm_state *x; + u32 count; + + xtfs = container_of(me, typeof(*xtfs), drop_timer); + x = xtfs->x; + + INIT_LIST_HEAD(&list); + + spin_lock(&xtfs->drop_lock); + + /* Drop any in progress packet */ + skb = xtfs->ra_newskb; + xtfs->ra_newskb = NULL; + + /* Now drop as many packets as we should from the reordering window + * saved array + */ + count = xtfs->w_savedlen ? __reorder_drop(xtfs, &list) : 0; + + spin_unlock(&xtfs->drop_lock); + + if (skb) + kfree_skb_reason(skb, SKB_DROP_REASON_FRAG_REASM_TIMEOUT); + + if (count) { + list_for_each_entry_safe(skb, next, &list, list) { + skb_list_del_init(skb); + iptfs_input_ordered(x, skb); + } + } + + return HRTIMER_NORESTART; +} + +/** + * iptfs_input() - handle receipt of iptfs payload + * @x: xfrm state + * @skb: the packet + * + * We have an IPTFS payload order it if needed, then process newly in order + * packets. + * + * Return: -EINPROGRESS to inform xfrm_input to stop processing the skb. + */ +static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct list_head freelist, list; + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct sk_buff *next; + + /* Fast path for no reorder window. */ + if (xtfs->cfg.reorder_win_size == 0) { + iptfs_input_ordered(x, skb); + goto done; + } + + /* Fetch list of in-order packets from the reordering window as well as + * a list of buffers we need to now free. + */ + INIT_LIST_HEAD(&list); + INIT_LIST_HEAD(&freelist); + + spin_lock(&xtfs->drop_lock); + iptfs_input_reorder(xtfs, skb, &list, &freelist); + spin_unlock(&xtfs->drop_lock); + + list_for_each_entry_safe(skb, next, &list, list) { + skb_list_del_init(skb); + iptfs_input_ordered(x, skb); + } + + list_for_each_entry_safe(skb, next, &freelist, list) { + skb_list_del_init(skb); + kfree_skb(skb); + } +done: + /* We always have dealt with the input SKB, either we are re-using it, + * or we have freed it. Return EINPROGRESS so that xfrm_input stops + * processing it. + */ + return -EINPROGRESS; +} + +/* ================================= */ +/* IPTFS Sending (ingress) Functions */ +/* ================================= */ + +/* ------------------------- */ +/* Enqueue to send functions */ +/* ------------------------- */ + +/** + * iptfs_enqueue() - enqueue packet if ok to send. + * @xtfs: xtfs state + * @skb: the packet + * + * Return: true if packet enqueued. 
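+ *
+ * Context: caller must hold &xtfs->x->lock (asserted below).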
+ */
+static bool iptfs_enqueue(struct xfrm_iptfs_data *xtfs, struct sk_buff *skb)
+{
+	u64 newsz = xtfs->queue_size + skb->len;
+	struct iphdr *iph;
+
+	assert_spin_locked(&xtfs->x->lock);
+
+	if (newsz > xtfs->cfg.max_queue_size)
+		return false;
+
+	/* Set ECN CE if we are above our ECN queue threshold */
+	if (newsz > xtfs->ecn_queue_size) {
+		iph = ip_hdr(skb);
+		if (iph->version == 4)
+			IP_ECN_set_ce(iph);
+		else if (iph->version == 6)
+			IP6_ECN_set_ce(skb, ipv6_hdr(skb));
+	}
+
+	__skb_queue_tail(&xtfs->queue, skb);
+	xtfs->queue_size += skb->len;
+	return true;
+}
+
+static int iptfs_get_cur_pmtu(struct xfrm_state *x, struct xfrm_iptfs_data *xtfs,
+			      struct sk_buff *skb)
+{
+	struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb);
+	u32 payload_mtu = xtfs->payload_mtu;
+	u32 pmtu = __iptfs_get_inner_mtu(x, xdst->child_mtu_cached);
+
+	if (payload_mtu && payload_mtu < pmtu)
+		pmtu = payload_mtu;
+
+	return pmtu;
+}
+
+static int iptfs_is_too_big(struct sock *sk, struct sk_buff *skb, u32 pmtu)
+{
+	if (skb->len <= pmtu)
+		return 0;
+
+	/* We only send ICMP too big if the user has configured us as
+	 * dont-fragment.
+	 */
+	if (skb->dev)
+		XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMOUTERROR);
+
+	if (sk)
+		xfrm_local_error(skb, pmtu);
+	else if (ip_hdr(skb)->version == 4)
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(pmtu));
+	else
+		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, pmtu);
+
+	return 1;
+}
+
+/* IPv4/IPv6 packet ingress to IPTFS tunnel, arrange to send in IPTFS payload
+ * (i.e., aggregating or fragmenting as appropriate).
+ * This is set in dst->output for an SA.
+ */
+static int iptfs_output_collect(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct xfrm_state *x = dst->xfrm;
+	struct xfrm_iptfs_data *xtfs = x->mode_data;
+	struct sk_buff *segs, *nskb;
+	u32 pmtu = 0;
+	bool ok = true;
+	bool was_gso;
+
+	/* We have hooked into dst_entry->output which means we have skipped the
+	 * protocol specific netfilter (see xfrm4_output, xfrm6_output). When
+	 * our timer runs we will end up calling xfrm_output directly on the
+	 * encapsulated traffic.
+	 *
+	 * For both cases this is the NF_INET_POST_ROUTING hook which allows
+	 * changing the skb->dst entry, which then may not be xfrm based
+	 * anymore, in which case a REROUTED flag is set and dst_output is
+	 * called.
+	 *
+	 * For IPv6 we are also skipping fragmentation handling for local
+	 * sockets, which may or may not be good depending on our tunnel DF
+	 * setting. Normally with fragmentation supported we want to skip this
+	 * fragmentation.
+	 */
+
+	if (xtfs->cfg.dont_frag)
+		pmtu = iptfs_get_cur_pmtu(x, xtfs, skb);
+
+	/* Break apart GSO skbs. If the queue is nearing full then we want the
+	 * accounting and queuing to be based on the individual packets not on
+	 * the aggregate GSO buffer.
+	 */
+	was_gso = skb_is_gso(skb);
+	if (!was_gso) {
+		segs = skb;
+	} else {
+		segs = skb_gso_segment(skb, 0);
+		if (IS_ERR_OR_NULL(segs)) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+			kfree_skb(skb);
+			if (IS_ERR(segs))
+				return PTR_ERR(segs);
+			return -EINVAL;
+		}
+		consume_skb(skb);
+		skb = NULL;
+	}
+
+	/* We can be running on multiple cores and from the network softirq or
+	 * from user context depending on where the packet is coming from.
+	 */
+	spin_lock_bh(&x->lock);
+
+	skb_list_walk_safe(segs, skb, nskb) {
+		skb_mark_not_on_list(skb);
+
+		/* Once we drop due to no queue space we continue to drop the
+		 * rest of the packets from that GRO.
+ */ + if (!ok) { +nospace: + trace_iptfs_no_queue_space(skb, xtfs, pmtu, was_gso); + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOQSPACE); + kfree_skb_reason(skb, SKB_DROP_REASON_FULL_RING); + continue; + } + + /* If the user indicated no iptfs fragmenting check before + * enqueue. + */ + if (xtfs->cfg.dont_frag && iptfs_is_too_big(sk, skb, pmtu)) { + trace_iptfs_too_big(skb, xtfs, pmtu, was_gso); + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); + continue; + } + + /* Enqueue to send in tunnel */ + ok = iptfs_enqueue(xtfs, skb); + if (!ok) + goto nospace; + + trace_iptfs_enqueue(skb, xtfs, pmtu, was_gso); + } + + /* Start a delay timer if we don't have one yet */ + if (!hrtimer_is_queued(&xtfs->iptfs_timer)) { + hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns, IPTFS_HRTIMER_MODE); + xtfs->iptfs_settime = ktime_get_raw_fast_ns(); + trace_iptfs_timer_start(xtfs, xtfs->init_delay_ns); + } + + spin_unlock_bh(&x->lock); + return 0; +} + +/* -------------------------- */ +/* Dequeue and send functions */ +/* -------------------------- */ + +static void iptfs_output_prepare_skb(struct sk_buff *skb, u32 blkoff) +{ + struct ip_iptfs_hdr *h; + size_t hsz = sizeof(*h); + + /* now reset values to be pointing at the rest of the packets */ + h = skb_push(skb, hsz); + memset(h, 0, hsz); + if (blkoff) + h->block_offset = htons(blkoff); + + /* network_header current points at the inner IP packet + * move it to the iptfs header + */ + skb->transport_header = skb->network_header; + skb->network_header -= hsz; + + IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; +} + +/** + * iptfs_copy_create_frag() - create an inner fragment skb. + * @st: The source packet data. + * @offset: offset in @st of the new fragment data. + * @copy_len: the amount of data to copy from @st. + * + * Create a new skb holding a single IPTFS inner packet fragment. @copy_len must + * not be greater than the max fragment size. + * + * Return: the new fragment skb or an ERR_PTR(). + */ +static struct sk_buff *iptfs_copy_create_frag(struct skb_seq_state *st, u32 offset, u32 copy_len) +{ + struct sk_buff *src = st->root_skb; + struct sk_buff *skb; + int err; + + skb = iptfs_alloc_skb(src, copy_len, true); + if (!skb) + return ERR_PTR(-ENOMEM); + + /* Now copy `copy_len` data from src */ + err = skb_copy_seq_read(st, offset, skb_put(skb, copy_len), copy_len); + if (err) { + kfree_skb(skb); + return ERR_PTR(err); + } + + return skb; +} + +/** + * iptfs_copy_create_frags() - create and send N-1 fragments of a larger skb. + * @skbp: the source packet skb (IN), skb holding the last fragment in + * the fragment stream (OUT). + * @xtfs: IPTFS SA state. + * @mtu: the max IPTFS fragment size. + * + * This function is responsible for fragmenting a larger inner packet into a + * sequence of IPTFS payload packets. The last fragment is returned rather than + * being sent so that the caller can append more inner packets (aggregation) if + * there is room. + * + * Return: 0 on success or a negative error code on failure + */ +static int iptfs_copy_create_frags(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs, u32 mtu) +{ + struct skb_seq_state skbseq; + struct list_head sublist; + struct sk_buff *skb = *skbp; + struct sk_buff *nskb = *skbp; + u32 copy_len, offset; + u32 to_copy = skb->len - mtu; + u32 blkoff = 0; + int err = 0; + + INIT_LIST_HEAD(&sublist); + + skb_prepare_seq_read(skb, 0, skb->len, &skbseq); + + /* A trimmed `skb` will be sent as the first fragment, later. 
*/
+	offset = mtu;
+	to_copy = skb->len - offset;
+	while (to_copy) {
+		/* Send all but last fragment to allow agg. append */
+		trace_iptfs_first_fragmenting(nskb, mtu, to_copy, NULL);
+		list_add_tail(&nskb->list, &sublist);
+
+		/* FUTURE: if the packet has an odd/non-aligning length we could
+		 * send less data in the penultimate fragment so that the last
+		 * fragment then ends on an aligned boundary.
+		 */
+		copy_len = min(to_copy, mtu);
+		nskb = iptfs_copy_create_frag(&skbseq, offset, copy_len);
+		if (IS_ERR(nskb)) {
+			XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMOUTERROR);
+			skb_abort_seq_read(&skbseq);
+			err = PTR_ERR(nskb);
+			nskb = NULL;
+			break;
+		}
+		iptfs_output_prepare_skb(nskb, to_copy);
+		offset += copy_len;
+		to_copy -= copy_len;
+		blkoff = to_copy;
+	}
+	skb_abort_seq_read(&skbseq);
+
+	/* return last fragment that will be unsent (or NULL) */
+	*skbp = nskb;
+	if (nskb)
+		trace_iptfs_first_final_fragment(nskb, mtu, blkoff, NULL);
+
+	/* trim the original skb to MTU */
+	if (!err)
+		err = pskb_trim(skb, mtu);
+
+	if (err) {
+		/* Free all frags. Don't bother sending a partial packet we will
+		 * never complete.
+		 */
+		kfree_skb(nskb);
+		list_for_each_entry_safe(skb, nskb, &sublist, list) {
+			skb_list_del_init(skb);
+			kfree_skb(skb);
+		}
+		return err;
+	}
+
+	/* prepare the initial fragment with an iptfs header */
+	iptfs_output_prepare_skb(skb, 0);
+
+	/* Send all but last fragment, if we fail to send a fragment then free
+	 * the rest -- no point in sending a packet that can't be reassembled.
+	 */
+	list_for_each_entry_safe(skb, nskb, &sublist, list) {
+		skb_list_del_init(skb);
+		if (!err)
+			err = xfrm_output(NULL, skb);
+		else
+			kfree_skb(skb);
+	}
+	if (err)
+		kfree_skb(*skbp);
+	return err;
+}
+
+/**
+ * iptfs_first_skb() - handle the first dequeued inner packet for output
+ * @skbp: the source packet skb (IN), skb holding the last fragment in
+ *        the fragment stream (OUT).
+ * @xtfs: IPTFS SA state.
+ * @mtu: the max IPTFS fragment size.
+ *
+ * This function is responsible for fragmenting a larger inner packet into a
+ * sequence of IPTFS payload packets.
+ *
+ * The last fragment is returned rather than being sent so that the caller can
+ * append more inner packets (aggregation) if there is room.
+ *
+ * Return: 0 on success or a negative error code on failure
+ */
+static int iptfs_first_skb(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs, u32 mtu)
+{
+	struct sk_buff *skb = *skbp;
+	int err;
+
+	/* Classic ESP skips the don't-fragment ICMP error if DF is clear on
+	 * the inner packet or ignore_df is set. Otherwise it will send an ICMP
+	 * or local error if the inner packet won't fit its MTU.
+	 *
+	 * With IPTFS we do not care about the inner packet DF bit. If the
+	 * tunnel is configured to "don't fragment" we error back if things
+	 * don't fit in our max packet size. Otherwise we iptfs-fragment as
+	 * normal.
+	 */
+
+	/* The opportunity for HW offload has ended */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		err = skb_checksum_help(skb);
+		if (err)
+			return err;
+	}
+
+	/* We've split gso up before queuing */
+
+	trace_iptfs_first_dequeue(skb, mtu, 0, ip_hdr(skb));
+
+	/* Consider the buffer Tx'd and no longer owned */
+	skb_orphan(skb);
+
+	/* Simple case -- it fits. `mtu` accounted for all the overhead
+	 * including the basic IPTFS header.
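+	 *
+	 * For example (hypothetical numbers): with mtu == 1400, a 1300 byte
+	 * inner packet is sent as a single payload, while a 2000 byte inner
+	 * packet becomes a 1400 byte initial fragment plus a 600 byte
+	 * trailing fragment via iptfs_copy_create_frags(), each then getting
+	 * its own IPTFS header.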
+ */ + if (skb->len <= mtu) { + iptfs_output_prepare_skb(skb, 0); + return 0; + } + + return iptfs_copy_create_frags(skbp, xtfs, mtu); +} + +static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp, struct sk_buff *child) +{ + u32 fllen = 0; + + /* It might be possible to account for a frag list in addition to page + * fragment if it's a valid state to be in. The page fragments size + * should be kept as data_len so only the frag_list size is removed, + * this must be done above as well. + */ + *nextp = skb_shinfo(child)->frag_list; + while (*nextp) { + fllen += (*nextp)->len; + nextp = &(*nextp)->next; + } + skb_frag_list_init(child); + child->len -= fllen; + child->data_len -= fllen; + + return nextp; +} + +static void iptfs_consume_frags(struct sk_buff *to, struct sk_buff *from) +{ + struct skb_shared_info *fromi = skb_shinfo(from); + struct skb_shared_info *toi = skb_shinfo(to); + unsigned int new_truesize; + + /* If we have data in a head page, grab it */ + if (!skb_headlen(from)) { + new_truesize = SKB_TRUESIZE(skb_end_offset(from)); + } else { + iptfs_skb_head_to_frag(from, &toi->frags[toi->nr_frags]); + skb_frag_ref(to, toi->nr_frags++); + new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); + } + + /* Move any other page fragments rather than copy */ + memcpy(&toi->frags[toi->nr_frags], fromi->frags, + sizeof(fromi->frags[0]) * fromi->nr_frags); + toi->nr_frags += fromi->nr_frags; + fromi->nr_frags = 0; + from->data_len = 0; + from->len = 0; + to->truesize += from->truesize - new_truesize; + from->truesize = new_truesize; + + /* We are done with this SKB */ + consume_skb(from); +} + +static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list) +{ + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct sk_buff *skb, *skb2, **nextp; + struct skb_shared_info *shi, *shi2; + + /* If we are fragmenting due to a large inner packet we will output all + * the outer IPTFS packets required to contain the fragments of the + * single large inner packet. These outer packets need to be sent + * consecutively (ESP seq-wise). Since this output function is always + * running from a timer we do not need a lock to provide this guarantee. + * We will output our packets consecutively before the timer is allowed + * to run again on some other CPU. + */ + + while ((skb = __skb_dequeue(list))) { + u32 mtu = iptfs_get_cur_pmtu(x, xtfs, skb); + bool share_ok = true; + int remaining; + + /* protocol comes to us cleared sometimes */ + skb->protocol = x->outer_mode.family == AF_INET ? htons(ETH_P_IP) : + htons(ETH_P_IPV6); + + if (skb->len > mtu && xtfs->cfg.dont_frag) { + /* We handle this case before enqueueing so we are only + * here b/c MTU changed after we enqueued before we + * dequeued, just drop these. + */ + XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTERROR); + + trace_iptfs_first_toobig(skb, mtu, 0, ip_hdr(skb)); + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); + continue; + } + + /* Convert first inner packet into an outer IPTFS packet, + * dealing with any fragmentation into multiple outer packets + * if necessary. + */ + if (iptfs_first_skb(&skb, xtfs, mtu)) + continue; + + /* If fragmentation was required the returned skb is the last + * IPTFS fragment in the chain, and it's IPTFS header blkoff has + * been set just past the end of the fragment data. + * + * In either case the space remaining to send more inner packet + * data is `mtu` - (skb->len - sizeof iptfs header). 
This is b/c + * the `mtu` value has the basic IPTFS header len accounted for, + * and we added that header to the skb so it is a part of + * skb->len, thus we subtract it from the skb length. + */ + remaining = mtu - (skb->len - sizeof(struct ip_iptfs_hdr)); + + /* Re-home (un-nest) nested fragment lists. We need to do this + * b/c we will simply be appending any following aggregated + * inner packets using the frag list. + */ + shi = skb_shinfo(skb); + nextp = &shi->frag_list; + while (*nextp) { + if (skb_has_frag_list(*nextp)) + nextp = iptfs_rehome_fraglist(&(*nextp)->next, *nextp); + else + nextp = &(*nextp)->next; + } + + if (shi->frag_list || skb_cloned(skb) || skb_shared(skb)) + share_ok = false; + + /* See if we have enough space to simply append. + * + * NOTE: Maybe do not append if we will be mis-aligned, + * SW-based endpoints will probably have to copy in this + * case. + */ + while ((skb2 = skb_peek(list))) { + trace_iptfs_ingress_nth_peek(skb2, remaining); + if (skb2->len > remaining) + break; + + __skb_unlink(skb2, list); + + /* Consider the buffer Tx'd and no longer owned */ + skb_orphan(skb); + + /* If we don't have a cksum in the packet we need to add + * one before encapsulation. + */ + if (skb2->ip_summed == CHECKSUM_PARTIAL) { + if (skb_checksum_help(skb2)) { + XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb2); + continue; + } + } + + /* skb->pp_recycle is passed to __skb_flag_unref for all + * frag pages so we can only share pages with skb's who + * match ourselves. + */ + shi2 = skb_shinfo(skb2); + if (share_ok && + (shi2->frag_list || + (!skb2->head_frag && skb_headlen(skb)) || + skb->pp_recycle != skb2->pp_recycle || + skb_zcopy(skb2) || + (shi->nr_frags + shi2->nr_frags + 1 > MAX_SKB_FRAGS))) + share_ok = false; + + /* Do accounting */ + skb->data_len += skb2->len; + skb->len += skb2->len; + remaining -= skb2->len; + + trace_iptfs_ingress_nth_add(skb2, share_ok); + + if (share_ok) { + iptfs_consume_frags(skb, skb2); + } else { + /* Append to the frag_list */ + *nextp = skb2; + nextp = &skb2->next; + if (skb_has_frag_list(skb2)) + nextp = iptfs_rehome_fraglist(nextp, + skb2); + skb->truesize += skb2->truesize; + } + } + + xfrm_output(NULL, skb); + } +} + +static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me) +{ + struct sk_buff_head list; + struct xfrm_iptfs_data *xtfs; + struct xfrm_state *x; + time64_t settime; + + xtfs = container_of(me, typeof(*xtfs), iptfs_timer); + x = xtfs->x; + + /* Process all the queued packets + * + * softirq execution order: timer > tasklet > hrtimer + * + * Network rx will have run before us giving one last chance to queue + * ingress packets for us to process and transmit. + */ + + spin_lock(&x->lock); + __skb_queue_head_init(&list); + skb_queue_splice_init(&xtfs->queue, &list); + xtfs->queue_size = 0; + settime = xtfs->iptfs_settime; + spin_unlock(&x->lock); + + /* After the above unlock, packets can begin queuing again, and the + * timer can be set again, from another CPU either in softirq or user + * context (not from this one since we are running at softirq level + * already). + */ + + trace_iptfs_timer_expire(xtfs, (unsigned long long)(ktime_get_raw_fast_ns() - settime)); + + iptfs_output_queued(x, &list); + + return HRTIMER_NORESTART; +} + +/** + * iptfs_encap_add_ipv4() - add outer encaps + * @x: xfrm state + * @skb: the packet + * + * This was originally taken from xfrm4_tunnel_encap_add. 
The reason for the + * copy is that IP-TFS/AGGFRAG can have different functionality for how to set + * the TOS/DSCP bits. Sets the protocol to a different value and doesn't do + * anything with inner headers as they aren't pointing into a normal IP + * singleton inner packet. + * + * Return: 0 on success or a negative error code on failure + */ +static int iptfs_encap_add_ipv4(struct xfrm_state *x, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct iphdr *top_iph; + + skb_reset_inner_network_header(skb); + skb_reset_inner_transport_header(skb); + + skb_set_network_header(skb, -(x->props.header_len - x->props.enc_hdr_len)); + skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol); + skb->transport_header = skb->network_header + sizeof(*top_iph); + + top_iph = ip_hdr(skb); + top_iph->ihl = 5; + top_iph->version = 4; + top_iph->protocol = IPPROTO_AGGFRAG; + + /* As we have 0, fractional, 1 or N inner packets there's no obviously + * correct DSCP mapping to inherit. ECN should be cleared per RFC9347 + * 3.1. + */ + top_iph->tos = 0; + + top_iph->frag_off = htons(IP_DF); + top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst)); + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + ip_select_ident(dev_net(dst->dev), skb, NULL); + + return 0; +} + +#if IS_ENABLED(CONFIG_IPV6) +/** + * iptfs_encap_add_ipv6() - add outer encaps + * @x: xfrm state + * @skb: the packet + * + * This was originally taken from xfrm6_tunnel_encap_add. The reason for the + * copy is that IP-TFS/AGGFRAG can have different functionality for how to set + * the flow label and TOS/DSCP bits. It also sets the protocol to a different + * value and doesn't do anything with inner headers as they aren't pointing into + * a normal IP singleton inner packet. + * + * Return: 0 on success or a negative error code on failure + */ +static int iptfs_encap_add_ipv6(struct xfrm_state *x, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct ipv6hdr *top_iph; + int dsfield; + + skb_reset_inner_network_header(skb); + skb_reset_inner_transport_header(skb); + + skb_set_network_header(skb, -x->props.header_len + x->props.enc_hdr_len); + skb->mac_header = skb->network_header + offsetof(struct ipv6hdr, nexthdr); + skb->transport_header = skb->network_header + sizeof(*top_iph); + + top_iph = ipv6_hdr(skb); + top_iph->version = 6; + top_iph->priority = 0; + memset(top_iph->flow_lbl, 0, sizeof(top_iph->flow_lbl)); + top_iph->nexthdr = IPPROTO_AGGFRAG; + + /* As we have 0, fractional, 1 or N inner packets there's no obviously + * correct DSCP mapping to inherit. ECN should be cleared per RFC9347 + * 3.1. 
+ */
+	dsfield = 0;
+	ipv6_change_dsfield(top_iph, 0, dsfield);
+
+	top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst));
+	top_iph->saddr = *(struct in6_addr *)&x->props.saddr;
+	top_iph->daddr = *(struct in6_addr *)&x->id.daddr;
+
+	return 0;
+}
+#endif
+
+/**
+ * iptfs_prepare_output() - prepare the skb for output
+ * @x: xfrm state
+ * @skb: the packet
+ *
+ * Return: Error value, if 0 then skb values should be as follows:
+ *   - transport_header should point at ESP header
+ *   - network_header should point at Outer IP header
+ *   - mac_header should point at protocol/nexthdr of the outer IP
+ */
+static int iptfs_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	if (x->outer_mode.family == AF_INET)
+		return iptfs_encap_add_ipv4(x, skb);
+	if (x->outer_mode.family == AF_INET6) {
+#if IS_ENABLED(CONFIG_IPV6)
+		return iptfs_encap_add_ipv6(x, skb);
+#else
+		return -EAFNOSUPPORT;
+#endif
+	}
+	return -EOPNOTSUPP;
+}
+
+/* ========================== */
+/* State Management Functions */
+/* ========================== */
+
+/**
+ * __iptfs_get_inner_mtu() - return inner MTU with no fragmentation.
+ * @x: xfrm state.
+ * @outer_mtu: the outer mtu
+ *
+ * Return: Correct MTU taking into account the encap overhead.
+ */
+static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu)
+{
+	struct crypto_aead *aead;
+	u32 blksize;
+
+	aead = x->data;
+	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+	return ((outer_mtu - x->props.header_len - crypto_aead_authsize(aead)) &
+		~(blksize - 1)) - 2;
+}
+
+/**
+ * iptfs_get_inner_mtu() - return the inner MTU for an IPTFS xfrm.
+ * @x: xfrm state.
+ * @outer_mtu: Outer MTU for the encapsulated packet.
+ *
+ * Return: Correct MTU taking into account the encap overhead.
+ */
+static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu)
+{
+	struct xfrm_iptfs_data *xtfs = x->mode_data;
+
+	/* If not dont-frag we have no MTU limit */
+	if (!xtfs->cfg.dont_frag)
+		return x->outer_mode.family == AF_INET ? IP_MAX_MTU :
+							 IP6_MAX_MTU;
+	return __iptfs_get_inner_mtu(x, outer_mtu);
+}
+
+/**
+ * iptfs_user_init() - initialize the SA with IPTFS options from netlink.
+ * @net: the net data + * @x: xfrm state + * @attrs: netlink attributes + * @extack: extack return data + * + * Return: 0 on success or a negative error code on failure + */ +static int iptfs_user_init(struct net *net, struct xfrm_state *x, + struct nlattr **attrs, + struct netlink_ext_ack *extack) +{ + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct xfrm_iptfs_config *xc; + u64 q; + + xc = &xtfs->cfg; + xc->max_queue_size = IPTFS_DEFAULT_MAX_QUEUE_SIZE; + xc->reorder_win_size = IPTFS_DEFAULT_REORDER_WINDOW; + xtfs->drop_time_ns = IPTFS_DEFAULT_DROP_TIME_USECS * NSECS_IN_USEC; + xtfs->init_delay_ns = IPTFS_DEFAULT_INIT_DELAY_USECS * NSECS_IN_USEC; + + if (attrs[XFRMA_IPTFS_DONT_FRAG]) + xc->dont_frag = true; + if (attrs[XFRMA_IPTFS_REORDER_WINDOW]) + xc->reorder_win_size = + nla_get_u16(attrs[XFRMA_IPTFS_REORDER_WINDOW]); + /* saved array is for saving 1..N seq nums from wantseq */ + if (xc->reorder_win_size) { + xtfs->w_saved = kcalloc(xc->reorder_win_size, + sizeof(*xtfs->w_saved), GFP_KERNEL); + if (!xtfs->w_saved) { + NL_SET_ERR_MSG(extack, "Cannot alloc reorder window"); + return -ENOMEM; + } + } + if (attrs[XFRMA_IPTFS_PKT_SIZE]) { + xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]); + if (!xc->pkt_size) { + xtfs->payload_mtu = 0; + } else if (xc->pkt_size > x->props.header_len) { + xtfs->payload_mtu = xc->pkt_size - x->props.header_len; + } else { + NL_SET_ERR_MSG(extack, + "Packet size must be 0 or greater than IPTFS/ESP header length"); + return -EINVAL; + } + } + if (attrs[XFRMA_IPTFS_MAX_QSIZE]) + xc->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]); + if (attrs[XFRMA_IPTFS_DROP_TIME]) + xtfs->drop_time_ns = + (u64)nla_get_u32(attrs[XFRMA_IPTFS_DROP_TIME]) * + NSECS_IN_USEC; + if (attrs[XFRMA_IPTFS_INIT_DELAY]) + xtfs->init_delay_ns = + (u64)nla_get_u32(attrs[XFRMA_IPTFS_INIT_DELAY]) * NSECS_IN_USEC; + + q = (u64)xc->max_queue_size * 95; + do_div(q, 100); + xtfs->ecn_queue_size = (u32)q; + + return 0; +} + +static unsigned int iptfs_sa_len(const struct xfrm_state *x) +{ + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct xfrm_iptfs_config *xc = &xtfs->cfg; + unsigned int l = 0; + + if (x->dir == XFRM_SA_DIR_IN) { + l += nla_total_size(sizeof(u32)); /* drop time usec */ + l += nla_total_size(sizeof(xc->reorder_win_size)); + } else { + if (xc->dont_frag) + l += nla_total_size(0); /* dont-frag flag */ + l += nla_total_size(sizeof(u32)); /* init delay usec */ + l += nla_total_size(sizeof(xc->max_queue_size)); + l += nla_total_size(sizeof(xc->pkt_size)); + } + + return l; +} + +static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb) +{ + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct xfrm_iptfs_config *xc = &xtfs->cfg; + int ret = 0; + u64 q; + + if (x->dir == XFRM_SA_DIR_IN) { + q = xtfs->drop_time_ns; + do_div(q, NSECS_IN_USEC); + ret = nla_put_u32(skb, XFRMA_IPTFS_DROP_TIME, q); + if (ret) + return ret; + + ret = nla_put_u16(skb, XFRMA_IPTFS_REORDER_WINDOW, + xc->reorder_win_size); + } else { + if (xc->dont_frag) { + ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG); + if (ret) + return ret; + } + + q = xtfs->init_delay_ns; + do_div(q, NSECS_IN_USEC); + ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, q); + if (ret) + return ret; + + ret = nla_put_u32(skb, XFRMA_IPTFS_MAX_QSIZE, xc->max_queue_size); + if (ret) + return ret; + + ret = nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size); + } + + return ret; +} + +static void __iptfs_init_state(struct xfrm_state *x, + struct xfrm_iptfs_data *xtfs) +{ + 
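/* Set up the Tx queue and delay timer, and the Rx reassembly drop
+	 * timer; tunable values are filled in later by iptfs_user_init().
+	 */
+	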
__skb_queue_head_init(&xtfs->queue); + hrtimer_setup(&xtfs->iptfs_timer, iptfs_delay_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); + + spin_lock_init(&xtfs->drop_lock); + hrtimer_setup(&xtfs->drop_timer, iptfs_drop_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE); + + /* Modify type (esp) adjustment values */ + + if (x->props.family == AF_INET) + x->props.header_len += sizeof(struct iphdr) + sizeof(struct ip_iptfs_hdr); + else if (x->props.family == AF_INET6) + x->props.header_len += sizeof(struct ipv6hdr) + sizeof(struct ip_iptfs_hdr); + x->props.enc_hdr_len = sizeof(struct ip_iptfs_hdr); + + /* Always keep a module reference when x->mode_data is set */ + __module_get(x->mode_cbs->owner); + + x->mode_data = xtfs; + xtfs->x = x; +} + +static int iptfs_clone_state(struct xfrm_state *x, struct xfrm_state *orig) +{ + struct xfrm_iptfs_data *xtfs; + + xtfs = kmemdup(orig->mode_data, sizeof(*xtfs), GFP_KERNEL); + if (!xtfs) + return -ENOMEM; + + x->mode_data = xtfs; + xtfs->x = x; + + xtfs->ra_newskb = NULL; + if (xtfs->cfg.reorder_win_size) { + xtfs->w_saved = kcalloc(xtfs->cfg.reorder_win_size, + sizeof(*xtfs->w_saved), GFP_KERNEL); + if (!xtfs->w_saved) { + kfree_sensitive(xtfs); + return -ENOMEM; + } + } + + return 0; +} + +static int iptfs_init_state(struct xfrm_state *x) +{ + struct xfrm_iptfs_data *xtfs; + + if (x->mode_data) { + /* We have arrived here from xfrm_state_clone() */ + xtfs = x->mode_data; + } else { + xtfs = kzalloc(sizeof(*xtfs), GFP_KERNEL); + if (!xtfs) + return -ENOMEM; + } + + __iptfs_init_state(x, xtfs); + + return 0; +} + +static void iptfs_destroy_state(struct xfrm_state *x) +{ + struct xfrm_iptfs_data *xtfs = x->mode_data; + struct sk_buff_head list; + struct skb_wseq *s, *se; + struct sk_buff *skb; + + if (!xtfs) + return; + + spin_lock_bh(&xtfs->x->lock); + hrtimer_cancel(&xtfs->iptfs_timer); + __skb_queue_head_init(&list); + skb_queue_splice_init(&xtfs->queue, &list); + spin_unlock_bh(&xtfs->x->lock); + + while ((skb = __skb_dequeue(&list))) + kfree_skb(skb); + + spin_lock_bh(&xtfs->drop_lock); + hrtimer_cancel(&xtfs->drop_timer); + spin_unlock_bh(&xtfs->drop_lock); + + if (xtfs->ra_newskb) + kfree_skb(xtfs->ra_newskb); + + for (s = xtfs->w_saved, se = s + xtfs->w_savedlen; s < se; s++) { + if (s->skb) + kfree_skb(s->skb); + } + + kfree_sensitive(xtfs->w_saved); + kfree_sensitive(xtfs); + + module_put(x->mode_cbs->owner); +} + +static const struct xfrm_mode_cbs iptfs_mode_cbs = { + .owner = THIS_MODULE, + .init_state = iptfs_init_state, + .clone_state = iptfs_clone_state, + .destroy_state = iptfs_destroy_state, + .user_init = iptfs_user_init, + .copy_to_user = iptfs_copy_to_user, + .sa_len = iptfs_sa_len, + .get_inner_mtu = iptfs_get_inner_mtu, + .input = iptfs_input, + .output = iptfs_output_collect, + .prepare_output = iptfs_prepare_output, +}; + +static int __init xfrm_iptfs_init(void) +{ + int err; + + pr_info("xfrm_iptfs: IPsec IP-TFS tunnel mode module\n"); + + err = xfrm_register_mode_cbs(XFRM_MODE_IPTFS, &iptfs_mode_cbs); + if (err < 0) + pr_info("%s: can't register IP-TFS\n", __func__); + + return err; +} + +static void __exit xfrm_iptfs_fini(void) +{ + xfrm_unregister_mode_cbs(XFRM_MODE_IPTFS); +} + +module_init(xfrm_iptfs_init); +module_exit(xfrm_iptfs_fini); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IP-TFS support for xfrm ipsec tunnels"); diff --git a/net/xfrm/xfrm_nat_keepalive.c b/net/xfrm/xfrm_nat_keepalive.c index 82f0a301683f..ebf95d48e86c 100644 --- a/net/xfrm/xfrm_nat_keepalive.c +++ b/net/xfrm/xfrm_nat_keepalive.c @@ -9,9 +9,13 @@ #include 
<net/ip6_checksum.h> #include <net/xfrm.h> -static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv4); +static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv4) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; #if IS_ENABLED(CONFIG_IPV6) -static DEFINE_PER_CPU(struct sock *, nat_keepalive_sk_ipv6); +static DEFINE_PER_CPU(struct sock_bh_locked, nat_keepalive_sk_ipv6) = { + .bh_lock = INIT_LOCAL_LOCK(bh_lock), +}; #endif struct nat_keepalive { @@ -56,10 +60,12 @@ static int nat_keepalive_send_ipv4(struct sk_buff *skb, skb_dst_set(skb, &rt->dst); - sk = *this_cpu_ptr(&nat_keepalive_sk_ipv4); + local_lock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock); + sk = this_cpu_read(nat_keepalive_sk_ipv4.sock); sock_net_set(sk, net); err = ip_build_and_send_pkt(skb, sk, fl4.saddr, fl4.daddr, NULL, tos); sock_net_set(sk, &init_net); + local_unlock_nested_bh(&nat_keepalive_sk_ipv4.bh_lock); return err; } @@ -89,15 +95,19 @@ static int nat_keepalive_send_ipv6(struct sk_buff *skb, fl6.fl6_sport = ka->encap_sport; fl6.fl6_dport = ka->encap_dport; - sk = *this_cpu_ptr(&nat_keepalive_sk_ipv6); + local_lock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock); + sk = this_cpu_read(nat_keepalive_sk_ipv6.sock); sock_net_set(sk, net); dst = ipv6_stub->ipv6_dst_lookup_flow(net, sk, &fl6, NULL); - if (IS_ERR(dst)) + if (IS_ERR(dst)) { + local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock); return PTR_ERR(dst); + } skb_dst_set(skb, dst); err = ipv6_stub->ip6_xmit(sk, skb, &fl6, skb->mark, NULL, 0, 0); sock_net_set(sk, &init_net); + local_unlock_nested_bh(&nat_keepalive_sk_ipv6.bh_lock); return err; } #endif @@ -202,7 +212,7 @@ static void nat_keepalive_work(struct work_struct *work) (ctx.next_run - ctx.now) * HZ); } -static int nat_keepalive_sk_init(struct sock * __percpu *socks, +static int nat_keepalive_sk_init(struct sock_bh_locked __percpu *socks, unsigned short family) { struct sock *sk; @@ -214,22 +224,22 @@ static int nat_keepalive_sk_init(struct sock * __percpu *socks, if (err < 0) goto err; - *per_cpu_ptr(socks, i) = sk; + per_cpu_ptr(socks, i)->sock = sk; } return 0; err: for_each_possible_cpu(i) - inet_ctl_sock_destroy(*per_cpu_ptr(socks, i)); + inet_ctl_sock_destroy(per_cpu_ptr(socks, i)->sock); return err; } -static void nat_keepalive_sk_fini(struct sock * __percpu *socks) +static void nat_keepalive_sk_fini(struct sock_bh_locked __percpu *socks) { int i; for_each_possible_cpu(i) - inet_ctl_sock_destroy(*per_cpu_ptr(socks, i)); + inet_ctl_sock_destroy(per_cpu_ptr(socks, i)->sock); } void xfrm_nat_keepalive_state_updated(struct xfrm_state *x) diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index e5722c95b8bb..9077730ff7d0 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -472,6 +472,8 @@ static int xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb) WARN_ON_ONCE(1); break; default: + if (x->mode_cbs && x->mode_cbs->prepare_output) + return x->mode_cbs->prepare_output(x, skb); WARN_ON_ONCE(1); break; } @@ -610,6 +612,40 @@ out: } EXPORT_SYMBOL_GPL(xfrm_output_resume); +static int xfrm_dev_direct_output(struct sock *sk, struct xfrm_state *x, + struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct net *net = xs_net(x); + int err; + + dst = skb_dst_pop(skb); + if (!dst) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb); + return -EHOSTUNREACH; + } + skb_dst_set(skb, dst); + nf_reset_ct(skb); + + err = skb_dst(skb)->ops->local_out(net, sk, skb); + if (unlikely(err != 1)) { + kfree_skb(skb); + return err; + } + + /* In transport mode, 
network destination is + * directly reachable, while in tunnel mode, + * inner packet network may not be. In packet + * offload type, HW is responsible for hard + * header packet mangling so directly xmit skb + * to netdevice. + */ + skb->dev = x->xso.dev; + __skb_push(skb, skb->dev->hard_header_len); + return dev_queue_xmit(skb); +} + static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { return xfrm_output_resume(sk, skb, 1); @@ -675,6 +711,10 @@ static void xfrm_get_inner_ipproto(struct sk_buff *skb, struct xfrm_state *x) return; } + if (x->outer_mode.encap == XFRM_MODE_IPTFS) { + xo->inner_ipproto = IPPROTO_AGGFRAG; + return; + } /* non-Tunnel Mode */ if (!skb->encapsulation) @@ -729,6 +769,13 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) return -EHOSTUNREACH; } + /* Exclusive direct xmit for tunnel mode, as + * some filtering or matching rules may apply + * in transport mode. + */ + if (x->props.mode == XFRM_MODE_TUNNEL) + return xfrm_dev_direct_output(sk, x, skb); + return xfrm_output_resume(sk, skb, 0); } @@ -752,7 +799,7 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) skb->encapsulation = 1; if (skb_is_gso(skb)) { - if (skb->inner_protocol) + if (skb->inner_protocol && x->props.mode == XFRM_MODE_TUNNEL) return xfrm_output_gso(net, sk, skb); skb_shinfo(skb)->gso_type |= SKB_GSO_ESP; @@ -780,7 +827,7 @@ out: } EXPORT_SYMBOL_GPL(xfrm_output); -static int xfrm4_tunnel_check_size(struct sk_buff *skb) +int xfrm4_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; @@ -796,7 +843,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) !skb_gso_validate_network_len(skb, ip_skb_dst_mtu(skb->sk, skb)))) { skb->protocol = htons(ETH_P_IP); - if (skb->sk) + if (skb->sk && sk_fullsock(skb->sk)) xfrm_local_error(skb, mtu); else icmp_send(skb, ICMP_DEST_UNREACH, @@ -806,6 +853,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) out: return ret; } +EXPORT_SYMBOL_GPL(xfrm4_tunnel_check_size); static int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) { @@ -828,10 +876,11 @@ static int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) } #if IS_ENABLED(CONFIG_IPV6) -static int xfrm6_tunnel_check_size(struct sk_buff *skb) +int xfrm6_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; struct dst_entry *dst = skb_dst(skb); + struct sock *sk = skb_to_full_sk(skb); if (skb->ignore_df) goto out; @@ -846,9 +895,9 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) skb->dev = dst->dev; skb->protocol = htons(ETH_P_IPV6); - if (xfrm6_local_dontfrag(skb->sk)) + if (xfrm6_local_dontfrag(sk)) ipv6_stub->xfrm6_local_rxpmtu(skb, mtu); - else if (skb->sk) + else if (sk) xfrm_local_error(skb, mtu); else icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); @@ -857,6 +906,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) out: return ret; } +EXPORT_SYMBOL_GPL(xfrm6_tunnel_check_size); #endif static int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 4408c11c0835..d4134a18c658 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -462,7 +462,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy) { BUG_ON(!policy->walk.dead); - if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer)) + if (timer_delete(&policy->timer) || timer_delete(&policy->polq.hold_timer)) BUG(); xfrm_dev_policy_free(policy); @@ -487,11 +487,11 @@ static void xfrm_policy_kill(struct xfrm_policy *policy) atomic_inc(&policy->genid); 
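The del_timer() to timer_delete() conversions in xfrm_policy_destroy() above and xfrm_policy_kill() below are a mechanical rename to the current timer API, but the return value still carries meaning: it reports whether the timer was pending, and a pending timer owns a policy reference that must be dropped exactly once. A toy model of that pattern, with a plain counter standing in for xfrm_pol_put() (all names here are illustrative, not kernel API):

#include <assert.h>
#include <stdbool.h>

struct toy_policy {
    int refcnt;
    bool timer_pending;     /* stand-in for a queued timer_list */
};

/* models timer_delete(): returns true iff the timer was pending */
static bool toy_timer_delete(struct toy_policy *p)
{
    bool was_pending = p->timer_pending;

    p->timer_pending = false;
    return was_pending;
}

static void toy_policy_kill(struct toy_policy *p)
{
    /* release the reference the pending timer held, but only if
     * we actually raced it out of the queue */
    if (toy_timer_delete(p))
        p->refcnt--;
    assert(p->refcnt >= 0);
}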
- if (del_timer(&policy->polq.hold_timer)) + if (timer_delete(&policy->polq.hold_timer)) xfrm_pol_put(policy); skb_queue_purge(&policy->polq.hold_queue); - if (del_timer(&policy->timer)) + if (timer_delete(&policy->timer)) xfrm_pol_put(policy); /* XXX: Flush state cache */ @@ -1469,7 +1469,7 @@ static void xfrm_policy_requeue(struct xfrm_policy *old, spin_lock_bh(&pq->hold_queue.lock); skb_queue_splice_init(&pq->hold_queue, &list); - if (del_timer(&pq->hold_timer)) + if (timer_delete(&pq->hold_timer)) xfrm_pol_put(old); spin_unlock_bh(&pq->hold_queue.lock); @@ -1581,6 +1581,9 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) struct xfrm_policy *delpol; struct hlist_head *chain; + /* Sanitize mark before store */ + policy->mark.v &= policy->mark.m; + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); if (chain) @@ -2497,6 +2500,7 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i]; if (tmpl->mode == XFRM_MODE_TUNNEL || + tmpl->mode == XFRM_MODE_IPTFS || tmpl->mode == XFRM_MODE_BEET) { remote = &tmpl->id.daddr; local = &tmpl->saddr; @@ -2748,13 +2752,17 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst1->input = dst_discard; - rcu_read_lock(); - afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); - if (likely(afinfo)) - dst1->output = afinfo->output; - else - dst1->output = dst_discard_out; - rcu_read_unlock(); + if (xfrm[i]->mode_cbs && xfrm[i]->mode_cbs->output) { + dst1->output = xfrm[i]->mode_cbs->output; + } else { + rcu_read_lock(); + afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); + if (likely(afinfo)) + dst1->output = afinfo->output; + else + dst1->output = dst_discard_out; + rcu_read_unlock(); + } xdst_prev = xdst; @@ -2959,7 +2967,7 @@ static void xfrm_policy_queue_process(struct timer_list *t) skb_dst_drop(skb); skb_dst_set(skb, dst); - dst_output(net, skb->sk, skb); + dst_output(net, skb_to_full_sk(skb), skb); } out: @@ -2999,7 +3007,7 @@ static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *s sched_next = jiffies + pq->timeout; - if (del_timer(&pq->hold_timer)) { + if (timer_delete(&pq->hold_timer)) { if (time_before(pq->hold_timer.expires, sched_next)) sched_next = pq->hold_timer.expires; xfrm_pol_put(pol); @@ -3289,8 +3297,9 @@ no_transform: ok: xfrm_pols_put(pols, drop_pols); - if (dst && dst->xfrm && - dst->xfrm->props.mode == XFRM_MODE_TUNNEL) + if (dst->xfrm && + (dst->xfrm->props.mode == XFRM_MODE_TUNNEL || + dst->xfrm->props.mode == XFRM_MODE_IPTFS)) dst->flags |= DST_XFRM_TUNNEL; return dst; @@ -4519,6 +4528,7 @@ static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tm switch (t->mode) { case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: + case XFRM_MODE_IPTFS: if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr, m->old_family) && xfrm_addr_equal(&t->saddr, &m->old_saddr, @@ -4561,7 +4571,8 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol, continue; n++; if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL && - pol->xfrm_vec[i].mode != XFRM_MODE_BEET) + pol->xfrm_vec[i].mode != XFRM_MODE_BEET && + pol->xfrm_vec[i].mode != XFRM_MODE_IPTFS) continue; /* update endpoints */ memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr, @@ -4622,7 +4633,7 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap, u32 if_id, - 
struct netlink_ext_ack *extack) + struct netlink_ext_ack *extack, struct xfrm_user_offload *xuo) { int i, err, nx_cur = 0, nx_new = 0; struct xfrm_policy *pol = NULL; @@ -4655,7 +4666,7 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, if ((x = xfrm_migrate_state_find(mp, net, if_id))) { x_cur[nx_cur] = x; nx_cur++; - xc = xfrm_state_migrate(x, mp, encap); + xc = xfrm_state_migrate(x, mp, encap, net, xuo, extack); if (xc) { x_new[nx_new] = xc; nx_new++; diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c index eeb984be03a7..8e07dd614b0b 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c @@ -43,6 +43,8 @@ static const struct snmp_mib xfrm_mib_list[] = { SNMP_MIB_ITEM("XfrmAcquireError", LINUX_MIB_XFRMACQUIREERROR), SNMP_MIB_ITEM("XfrmOutStateDirError", LINUX_MIB_XFRMOUTSTATEDIRERROR), SNMP_MIB_ITEM("XfrmInStateDirError", LINUX_MIB_XFRMINSTATEDIRERROR), + SNMP_MIB_ITEM("XfrmInIptfsError", LINUX_MIB_XFRMINIPTFSERROR), + SNMP_MIB_ITEM("XfrmOutNoQueueSpace", LINUX_MIB_XFRMOUTNOQSPACE), SNMP_MIB_SENTINEL }; diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index bc56c6305725..dbdf8a39dffe 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -714,10 +714,12 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff oseq += skb_shinfo(skb)->gso_segs; } - if (unlikely(xo->seq.low < replay_esn->oseq)) { - XFRM_SKB_CB(skb)->seq.output.hi = ++oseq_hi; - xo->seq.hi = oseq_hi; - replay_esn->oseq_hi = oseq_hi; + if (unlikely(oseq < replay_esn->oseq)) { + replay_esn->oseq_hi = ++oseq_hi; + if (xo->seq.low < replay_esn->oseq) { + XFRM_SKB_CB(skb)->seq.output.hi = oseq_hi; + xo->seq.hi = oseq_hi; + } if (replay_esn->oseq_hi == 0) { replay_esn->oseq--; replay_esn->oseq_hi--; @@ -729,6 +731,7 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff } replay_esn->oseq = oseq; + xfrm_dev_state_advance_esn(x); if (xfrm_aevent_is_on(net)) xfrm_replay_notify(x, XFRM_REPLAY_UPDATE); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 67ca7ac955a3..203b585c2ae2 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -34,6 +34,8 @@ #define xfrm_state_deref_prot(table, net) \ rcu_dereference_protected((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock)) +#define xfrm_state_deref_check(table, net) \ + rcu_dereference_check((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock)) static void xfrm_state_gc_task(struct work_struct *work); @@ -62,6 +64,8 @@ static inline unsigned int xfrm_dst_hash(struct net *net, u32 reqid, unsigned short family) { + lockdep_assert_held(&net->xfrm.xfrm_state_lock); + return __xfrm_dst_hash(daddr, saddr, reqid, family, net->xfrm.state_hmask); } @@ -70,6 +74,8 @@ static inline unsigned int xfrm_src_hash(struct net *net, const xfrm_address_t *saddr, unsigned short family) { + lockdep_assert_held(&net->xfrm.xfrm_state_lock); + return __xfrm_src_hash(daddr, saddr, family, net->xfrm.state_hmask); } @@ -77,11 +83,15 @@ static inline unsigned int xfrm_spi_hash(struct net *net, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { + lockdep_assert_held(&net->xfrm.xfrm_state_lock); + return __xfrm_spi_hash(daddr, spi, proto, family, net->xfrm.state_hmask); } static unsigned int xfrm_seq_hash(struct net *net, u32 seq) { + lockdep_assert_held(&net->xfrm.xfrm_state_lock); + return __xfrm_seq_hash(seq, net->xfrm.state_hmask); } @@ -414,18 +424,18 @@ void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, } 
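The xfrm_replay.c hunk above is worth a second look: for GSO bursts the output sequence number advances by gso_segs at once, so the wrap test must be applied to the accumulated oseq, not just to the first segment's xo->seq.low; otherwise a burst that crosses the 32-bit boundary mid-batch fails to carry into oseq_hi. A compact model of that carry (names illustrative, assuming the same lo/hi split as replay_esn):

#include <stdint.h>

struct esn_ctr {
    uint32_t oseq;          /* low 32 bits of the 64-bit ESN */
    uint32_t oseq_hi;       /* high 32 bits */
};

/* advance the output ESN by `segs` packets, as one GSO burst would */
static void esn_advance(struct esn_ctr *c, uint32_t segs)
{
    uint32_t old = c->oseq;

    c->oseq += segs;
    if (c->oseq < old)      /* low half wrapped: carry into the high half */
        c->oseq_hi++;
}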
EXPORT_SYMBOL(xfrm_unregister_type_offload); -static const struct xfrm_type_offload * -xfrm_get_type_offload(u8 proto, unsigned short family, bool try_load) +void xfrm_set_type_offload(struct xfrm_state *x) { const struct xfrm_type_offload *type = NULL; struct xfrm_state_afinfo *afinfo; + bool try_load = true; retry: - afinfo = xfrm_state_get_afinfo(family); + afinfo = xfrm_state_get_afinfo(x->props.family); if (unlikely(afinfo == NULL)) - return NULL; + goto out; - switch (proto) { + switch (x->id.proto) { case IPPROTO_ESP: type = afinfo->type_offload_esp; break; @@ -439,18 +449,16 @@ retry: rcu_read_unlock(); if (!type && try_load) { - request_module("xfrm-offload-%d-%d", family, proto); + request_module("xfrm-offload-%d-%d", x->props.family, + x->id.proto); try_load = false; goto retry; } - return type; -} - -static void xfrm_put_type_offload(const struct xfrm_type_offload *type) -{ - module_put(type->owner); +out: + x->type_offload = type; } +EXPORT_SYMBOL(xfrm_set_type_offload); static const struct xfrm_mode xfrm4_mode_map[XFRM_MODE_MAX] = { [XFRM_MODE_BEET] = { @@ -467,6 +475,11 @@ static const struct xfrm_mode xfrm4_mode_map[XFRM_MODE_MAX] = { .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET, }, + [XFRM_MODE_IPTFS] = { + .encap = XFRM_MODE_IPTFS, + .flags = XFRM_MODE_FLAG_TUNNEL, + .family = AF_INET, + }, }; static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = { @@ -488,6 +501,11 @@ static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = { .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET6, }, + [XFRM_MODE_IPTFS] = { + .encap = XFRM_MODE_IPTFS, + .flags = XFRM_MODE_FLAG_TUNNEL, + .family = AF_INET6, + }, }; static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) @@ -515,6 +533,60 @@ static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) return NULL; } +static const struct xfrm_mode_cbs __rcu *xfrm_mode_cbs_map[XFRM_MODE_MAX]; +static DEFINE_SPINLOCK(xfrm_mode_cbs_map_lock); + +int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs) +{ + if (mode >= XFRM_MODE_MAX) + return -EINVAL; + + spin_lock_bh(&xfrm_mode_cbs_map_lock); + rcu_assign_pointer(xfrm_mode_cbs_map[mode], mode_cbs); + spin_unlock_bh(&xfrm_mode_cbs_map_lock); + + return 0; +} +EXPORT_SYMBOL(xfrm_register_mode_cbs); + +void xfrm_unregister_mode_cbs(u8 mode) +{ + if (mode >= XFRM_MODE_MAX) + return; + + spin_lock_bh(&xfrm_mode_cbs_map_lock); + RCU_INIT_POINTER(xfrm_mode_cbs_map[mode], NULL); + spin_unlock_bh(&xfrm_mode_cbs_map_lock); + synchronize_rcu(); +} +EXPORT_SYMBOL(xfrm_unregister_mode_cbs); + +static const struct xfrm_mode_cbs *xfrm_get_mode_cbs(u8 mode) +{ + const struct xfrm_mode_cbs *cbs; + bool try_load = true; + + if (mode >= XFRM_MODE_MAX) + return NULL; + +retry: + rcu_read_lock(); + + cbs = rcu_dereference(xfrm_mode_cbs_map[mode]); + if (cbs && !try_module_get(cbs->owner)) + cbs = NULL; + + rcu_read_unlock(); + + if (mode == XFRM_MODE_IPTFS && !cbs && try_load) { + request_module("xfrm-iptfs"); + try_load = false; + goto retry; + } + + return cbs; +} + void xfrm_state_free(struct xfrm_state *x) { kmem_cache_free(xfrm_state_cache, x); @@ -523,18 +595,18 @@ EXPORT_SYMBOL(xfrm_state_free); static void ___xfrm_state_destroy(struct xfrm_state *x) { + if (x->mode_cbs && x->mode_cbs->destroy_state) + x->mode_cbs->destroy_state(x); hrtimer_cancel(&x->mtimer); - del_timer_sync(&x->rtimer); - kfree(x->aead); - kfree(x->aalg); - kfree(x->ealg); + timer_delete_sync(&x->rtimer); + kfree_sensitive(x->aead); + kfree_sensitive(x->aalg); + 
kfree_sensitive(x->ealg); kfree(x->calg); kfree(x->encap); kfree(x->coaddr); kfree(x->replay_esn); kfree(x->preplay_esn); - if (x->type_offload) - xfrm_put_type_offload(x->type_offload); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); @@ -670,8 +742,8 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) INIT_HLIST_NODE(&x->bysrc); INIT_HLIST_NODE(&x->byspi); INIT_HLIST_NODE(&x->byseq); - hrtimer_init(&x->mtimer, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT); - x->mtimer.function = xfrm_timer_handler; + hrtimer_setup(&x->mtimer, xfrm_timer_handler, CLOCK_BOOTTIME, + HRTIMER_MODE_ABS_SOFT); timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0); x->curlft.add_time = ktime_get_real_seconds(); x->lft.soft_byte_limit = XFRM_INF; @@ -682,6 +754,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net) x->replay_maxdiff = 0; x->pcpu_num = UINT_MAX; spin_lock_init(&x->lock); + x->mode_data = NULL; } return x; } @@ -694,7 +767,7 @@ void xfrm_dev_state_delete(struct xfrm_state *x) struct net_device *dev = READ_ONCE(xso->dev); if (dev) { - dev->xfrmdev_ops->xdo_dev_state_delete(x); + dev->xfrmdev_ops->xdo_dev_state_delete(dev, x); spin_lock_bh(&xfrm_state_dev_gc_lock); hlist_add_head(&x->dev_gclist, &xfrm_state_dev_gc_list); spin_unlock_bh(&xfrm_state_dev_gc_lock); @@ -707,6 +780,8 @@ void xfrm_dev_state_free(struct xfrm_state *x) struct xfrm_dev_offload *xso = &x->xso; struct net_device *dev = READ_ONCE(xso->dev); + xfrm_unset_type_offload(x); + if (dev && dev->xfrmdev_ops) { spin_lock_bh(&xfrm_state_dev_gc_lock); if (!hlist_unhashed(&x->dev_gclist)) @@ -714,7 +789,7 @@ void xfrm_dev_state_free(struct xfrm_state *x) spin_unlock_bh(&xfrm_state_dev_gc_lock); if (dev->xfrmdev_ops->xdo_dev_state_free) - dev->xfrmdev_ops->xdo_dev_state_free(x); + dev->xfrmdev_ops->xdo_dev_state_free(dev, x); WRITE_ONCE(xso->dev, NULL); xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; netdev_put(dev, &xso->dev_tracker); @@ -763,9 +838,6 @@ int __xfrm_state_delete(struct xfrm_state *x) xfrm_nat_keepalive_state_updated(x); spin_unlock(&net->xfrm.xfrm_state_lock); - if (x->encap_sk) - sock_put(rcu_dereference_raw(x->encap_sk)); - xfrm_dev_state_delete(x); /* All xfrm_state objects are created by xfrm_state_alloc. 
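The destructor changes above also switch the algorithm buffers from kfree() to kfree_sensitive(), so AEAD, auth, and encryption keys are wiped before the memory returns to the allocator. A rough userspace analogue of the idea (the real kfree_sensitive() derives the allocation size itself and uses memzero_explicit(), which the compiler may not elide; the memset() below is only a sketch):

#include <stdlib.h>
#include <string.h>

static void free_sensitive(void *p, size_t len)
{
    if (!p)
        return;
    memset(p, 0, len);      /* keys never linger in freed memory */
    free(p);
}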
@@ -1041,16 +1113,38 @@ xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, x->props.family = tmpl->encap_family; } -static struct xfrm_state *__xfrm_state_lookup_all(struct net *net, u32 mark, +struct xfrm_hash_state_ptrs { + const struct hlist_head *bydst; + const struct hlist_head *bysrc; + const struct hlist_head *byspi; + unsigned int hmask; +}; + +static void xfrm_hash_ptrs_get(const struct net *net, struct xfrm_hash_state_ptrs *ptrs) +{ + unsigned int sequence; + + do { + sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); + + ptrs->bydst = xfrm_state_deref_check(net->xfrm.state_bydst, net); + ptrs->bysrc = xfrm_state_deref_check(net->xfrm.state_bysrc, net); + ptrs->byspi = xfrm_state_deref_check(net->xfrm.state_byspi, net); + ptrs->hmask = net->xfrm.state_hmask; + } while (read_seqcount_retry(&net->xfrm.xfrm_state_hash_generation, sequence)); +} + +static struct xfrm_state *__xfrm_state_lookup_all(const struct xfrm_hash_state_ptrs *state_ptrs, + u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family, struct xfrm_dev_offload *xdo) { - unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); + unsigned int h = __xfrm_spi_hash(daddr, spi, proto, family, state_ptrs->hmask); struct xfrm_state *x; - hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) { + hlist_for_each_entry_rcu(x, state_ptrs->byspi + h, byspi) { #ifdef CONFIG_XFRM_OFFLOAD if (xdo->type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) @@ -1084,15 +1178,16 @@ static struct xfrm_state *__xfrm_state_lookup_all(struct net *net, u32 mark, return NULL; } -static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, +static struct xfrm_state *__xfrm_state_lookup(const struct xfrm_hash_state_ptrs *state_ptrs, + u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { - unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); + unsigned int h = __xfrm_spi_hash(daddr, spi, proto, family, state_ptrs->hmask); struct xfrm_state *x; - hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) { + hlist_for_each_entry_rcu(x, state_ptrs->byspi + h, byspi) { if (x->props.family != family || x->id.spi != spi || x->id.proto != proto || @@ -1114,11 +1209,11 @@ struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, __be32 spi, u8 proto, unsigned short family) { + struct xfrm_hash_state_ptrs state_ptrs; struct hlist_head *state_cache_input; struct xfrm_state *x = NULL; - int cpu = get_cpu(); - state_cache_input = per_cpu_ptr(net->xfrm.state_cache_input, cpu); + state_cache_input = raw_cpu_ptr(net->xfrm.state_cache_input); rcu_read_lock(); hlist_for_each_entry_rcu(x, state_cache_input, state_cache_input) { @@ -1135,7 +1230,9 @@ struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, goto out; } - x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); + xfrm_hash_ptrs_get(net, &state_ptrs); + + x = __xfrm_state_lookup(&state_ptrs, mark, daddr, spi, proto, family); if (x && x->km.state == XFRM_STATE_VALID) { spin_lock_bh(&net->xfrm.xfrm_state_lock); @@ -1150,20 +1247,20 @@ struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, out: rcu_read_unlock(); - put_cpu(); return x; } EXPORT_SYMBOL(xfrm_input_state_lookup); -static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, +static struct xfrm_state *__xfrm_state_lookup_byaddr(const struct xfrm_hash_state_ptrs *state_ptrs, + u32 mark, const xfrm_address_t *daddr, const 
xfrm_address_t *saddr, u8 proto, unsigned short family) { - unsigned int h = xfrm_src_hash(net, daddr, saddr, family); + unsigned int h = __xfrm_src_hash(daddr, saddr, family, state_ptrs->hmask); struct xfrm_state *x; - hlist_for_each_entry_rcu(x, net->xfrm.state_bysrc + h, bysrc) { + hlist_for_each_entry_rcu(x, state_ptrs->bysrc + h, bysrc) { if (x->props.family != family || x->id.proto != proto || !xfrm_addr_equal(&x->id.daddr, daddr, family) || @@ -1183,14 +1280,17 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, static inline struct xfrm_state * __xfrm_state_locate(struct xfrm_state *x, int use_spi, int family) { + struct xfrm_hash_state_ptrs state_ptrs; struct net *net = xs_net(x); u32 mark = x->mark.v & x->mark.m; + xfrm_hash_ptrs_get(net, &state_ptrs); + if (use_spi) - return __xfrm_state_lookup(net, mark, &x->id.daddr, + return __xfrm_state_lookup(&state_ptrs, mark, &x->id.daddr, x->id.spi, x->id.proto, family); else - return __xfrm_state_lookup_byaddr(net, mark, + return __xfrm_state_lookup_byaddr(&state_ptrs, mark, &x->id.daddr, &x->props.saddr, x->id.proto, family); @@ -1264,6 +1364,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, u32 if_id) { static xfrm_address_t saddr_wildcard = { }; + struct xfrm_hash_state_ptrs state_ptrs; struct net *net = xp_net(pol); unsigned int h, h_wildcard; struct xfrm_state *x, *x0, *to_put; @@ -1328,8 +1429,10 @@ cached: else if (acquire_in_progress) /* XXX: acquire_in_progress should not happen */ WARN_ON(1); - h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); - hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { + xfrm_hash_ptrs_get(net, &state_ptrs); + + h = __xfrm_dst_hash(daddr, saddr, tmpl->reqid, encap_family, state_ptrs.hmask); + hlist_for_each_entry_rcu(x, state_ptrs.bydst + h, bydst) { #ifdef CONFIG_XFRM_OFFLOAD if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) @@ -1362,8 +1465,9 @@ cached: if (best || acquire_in_progress) goto found; - h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family); - hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h_wildcard, bydst) { + h_wildcard = __xfrm_dst_hash(daddr, &saddr_wildcard, tmpl->reqid, + encap_family, state_ptrs.hmask); + hlist_for_each_entry_rcu(x, state_ptrs.bydst + h_wildcard, bydst) { #ifdef CONFIG_XFRM_OFFLOAD if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) @@ -1401,7 +1505,7 @@ found: if (!x && !error && !acquire_in_progress) { if (tmpl->id.spi && - (x0 = __xfrm_state_lookup_all(net, mark, daddr, + (x0 = __xfrm_state_lookup_all(&state_ptrs, mark, daddr, tmpl->id.spi, tmpl->id.proto, encap_family, &pol->xdo)) != NULL) { @@ -1444,19 +1548,19 @@ found: if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { struct xfrm_dev_offload *xdo = &pol->xdo; struct xfrm_dev_offload *xso = &x->xso; + struct net_device *dev = xdo->dev; xso->type = XFRM_DEV_OFFLOAD_PACKET; xso->dir = xdo->dir; - xso->dev = xdo->dev; - xso->real_dev = xdo->real_dev; + xso->dev = dev; xso->flags = XFRM_DEV_OFFLOAD_FLAG_ACQ; - netdev_hold(xso->dev, &xso->dev_tracker, GFP_ATOMIC); - error = xso->dev->xfrmdev_ops->xdo_dev_state_add(x, NULL); + netdev_hold(dev, &xso->dev_tracker, GFP_ATOMIC); + error = dev->xfrmdev_ops->xdo_dev_state_add(dev, x, + NULL); if (error) { xso->dir = 0; - netdev_put(xso->dev, &xso->dev_tracker); + netdev_put(dev, &xso->dev_tracker); xso->dev = NULL; - xso->real_dev = NULL; 
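The lookup rework running through these hunks replaces per-probe hash computations with one xfrm_hash_ptrs_get() snapshot: the bydst/bysrc/byspi pointers and the mask are read together under the hash-generation seqcount, and the reader retries if a concurrent resize slipped in. A stripped-down model of that read side (a writer would bump the counter to odd before swapping tables and back to even after; memory ordering is elided here):

#include <stdatomic.h>

struct hash_snapshot {
    const void *bydst, *bysrc, *byspi;
    unsigned int hmask;
};

static _Atomic unsigned int hash_gen;   /* even = stable, odd = resize */
static struct hash_snapshot live_tables;

static struct hash_snapshot snapshot_tables(void)
{
    struct hash_snapshot s;
    unsigned int seq;

    do {
        seq = atomic_load(&hash_gen);   /* read_seqcount_begin() */
        s = live_tables;
    } while ((seq & 1) || atomic_load(&hash_gen) != seq);

    return s;
}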
xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; x->km.state = XFRM_STATE_DEAD; to_put = x; @@ -1614,6 +1718,9 @@ static void __xfrm_state_insert(struct xfrm_state *x) list_add(&x->km.all, &net->xfrm.state_all); + /* Sanitize mark before store */ + x->mark.v &= x->mark.m; + h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family); XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, @@ -1851,8 +1958,9 @@ static inline int clone_security(struct xfrm_state *x, struct xfrm_sec_ctx *secu return 0; } -static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, - struct xfrm_encap_tmpl *encap) +static struct xfrm_state *xfrm_state_clone_and_setup(struct xfrm_state *orig, + struct xfrm_encap_tmpl *encap, + struct xfrm_migrate *m) { struct net *net = xs_net(orig); struct xfrm_state *x = xfrm_state_alloc(net); @@ -1945,6 +2053,17 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, x->new_mapping_sport = 0; x->dir = orig->dir; + x->mode_cbs = orig->mode_cbs; + if (x->mode_cbs && x->mode_cbs->clone_state) { + if (x->mode_cbs->clone_state(x, orig)) + goto error; + } + + + x->props.family = m->new_family; + memcpy(&x->id.daddr, &m->new_daddr, sizeof(x->id.daddr)); + memcpy(&x->props.saddr, &m->new_saddr, sizeof(x->props.saddr)); + return x; error: @@ -2007,21 +2126,23 @@ EXPORT_SYMBOL(xfrm_migrate_state_find); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, - struct xfrm_encap_tmpl *encap) + struct xfrm_encap_tmpl *encap, + struct net *net, + struct xfrm_user_offload *xuo, + struct netlink_ext_ack *extack) { struct xfrm_state *xc; - xc = xfrm_state_clone(x, encap); + xc = xfrm_state_clone_and_setup(x, encap, m); if (!xc) return NULL; - xc->props.family = m->new_family; - if (xfrm_init_state(xc) < 0) goto error; - memcpy(&xc->id.daddr, &m->new_daddr, sizeof(xc->id.daddr)); - memcpy(&xc->props.saddr, &m->new_saddr, sizeof(xc->props.saddr)); + /* configure the hardware if offload is requested */ + if (xuo && xfrm_dev_state_add(net, xc, xuo, extack)) + goto error; /* add state */ if (xfrm_addr_equal(&x->id.daddr, &m->new_daddr, m->new_family)) { @@ -2180,10 +2301,13 @@ struct xfrm_state * xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { + struct xfrm_hash_state_ptrs state_ptrs; struct xfrm_state *x; rcu_read_lock(); - x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); + xfrm_hash_ptrs_get(net, &state_ptrs); + + x = __xfrm_state_lookup(&state_ptrs, mark, daddr, spi, proto, family); rcu_read_unlock(); return x; } @@ -2194,11 +2318,15 @@ xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u8 proto, unsigned short family) { + struct xfrm_hash_state_ptrs state_ptrs; struct xfrm_state *x; - spin_lock_bh(&net->xfrm.xfrm_state_lock); - x = __xfrm_state_lookup_byaddr(net, mark, daddr, saddr, proto, family); - spin_unlock_bh(&net->xfrm.xfrm_state_lock); + rcu_read_lock(); + + xfrm_hash_ptrs_get(net, &state_ptrs); + + x = __xfrm_state_lookup_byaddr(&state_ptrs, mark, daddr, saddr, proto, family); + rcu_read_unlock(); return x; } EXPORT_SYMBOL(xfrm_state_lookup_byaddr); @@ -2271,6 +2399,7 @@ static int __xfrm6_state_sort_cmp(const void *p) #endif case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: + case XFRM_MODE_IPTFS: return 4; } return 5; @@ -2297,6 +2426,7 @@ static int __xfrm6_tmpl_sort_cmp(const void *p) #endif case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: + case XFRM_MODE_IPTFS: return 
3; } return 4; @@ -2986,6 +3116,9 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) case XFRM_MODE_TUNNEL: break; default: + if (x->mode_cbs && x->mode_cbs->get_inner_mtu) + return x->mode_cbs->get_inner_mtu(x, mtu); + WARN_ON_ONCE(1); break; } @@ -2995,8 +3128,7 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) } EXPORT_SYMBOL_GPL(xfrm_state_mtu); -int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, - struct netlink_ext_ack *extack) +int __xfrm_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { const struct xfrm_mode *inner_mode; const struct xfrm_mode *outer_mode; @@ -3051,8 +3183,6 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, goto error; } - x->type_offload = xfrm_get_type_offload(x->id.proto, family, offload); - err = x->type->init_state(x, extack); if (err) goto error; @@ -3065,12 +3195,6 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, } x->outer_mode = *outer_mode; - if (init_replay) { - err = xfrm_init_replay(x, extack); - if (err) - goto error; - } - if (x->nat_keepalive_interval) { if (x->dir != XFRM_SA_DIR_OUT) { NL_SET_ERR_MSG(extack, "NAT keepalive is only supported for outbound SAs"); @@ -3086,6 +3210,12 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, } } + x->mode_cbs = xfrm_get_mode_cbs(x->props.mode); + if (x->mode_cbs) { + if (x->mode_cbs->init_state) + err = x->mode_cbs->init_state(x); + module_put(x->mode_cbs->owner); + } error: return err; } @@ -3096,11 +3226,16 @@ int xfrm_init_state(struct xfrm_state *x) { int err; - err = __xfrm_init_state(x, true, false, NULL); - if (!err) - x->km.state = XFRM_STATE_VALID; + err = __xfrm_init_state(x, NULL); + if (err) + return err; - return err; + err = xfrm_init_replay(x, NULL); + if (err) + return err; + + x->km.state = XFRM_STATE_VALID; + return 0; } EXPORT_SYMBOL(xfrm_init_state); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index b2876e09328b..59f258daf830 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -178,6 +178,28 @@ static inline int verify_replay(struct xfrm_usersa_info *p, "Replay seq and seq_hi should be 0 for output SA"); return -EINVAL; } + + if (!(p->flags & XFRM_STATE_ESN)) { + if (rs->oseq_hi) { + NL_SET_ERR_MSG( + extack, + "Replay oseq_hi should be 0 in non-ESN mode for output SA"); + return -EINVAL; + } + if (rs->oseq == U32_MAX) { + NL_SET_ERR_MSG( + extack, + "Replay oseq should be less than 0xFFFFFFFF in non-ESN mode for output SA"); + return -EINVAL; + } + } else { + if (rs->oseq == U32_MAX && rs->oseq_hi == U32_MAX) { + NL_SET_ERR_MSG( + extack, + "Replay oseq and oseq_hi should be less than 0xFFFFFFFF for output SA"); + return -EINVAL; + } + } if (rs->bmp_len) { NL_SET_ERR_MSG(extack, "Replay bmp_len should 0 for output SA"); return -EINVAL; @@ -190,6 +212,28 @@ static inline int verify_replay(struct xfrm_usersa_info *p, "Replay oseq and oseq_hi should be 0 for input SA"); return -EINVAL; } + if (!(p->flags & XFRM_STATE_ESN)) { + if (rs->seq_hi) { + NL_SET_ERR_MSG( + extack, + "Replay seq_hi should be 0 in non-ESN mode for input SA"); + return -EINVAL; + } + + if (rs->seq == U32_MAX) { + NL_SET_ERR_MSG( + extack, + "Replay seq should be less than 0xFFFFFFFF in non-ESN mode for input SA"); + return -EINVAL; + } + } else { + if (rs->seq == U32_MAX && rs->seq_hi == U32_MAX) { + NL_SET_ERR_MSG( + extack, + "Replay seq and seq_hi should be less than 0xFFFFFFFF for input SA"); + return -EINVAL; + } + } } return 0; @@ -301,6 +345,16 @@ static 
int verify_newsa_info(struct xfrm_usersa_info *p, NL_SET_ERR_MSG(extack, "TFC padding can only be used in tunnel mode"); goto out; } + if ((attrs[XFRMA_IPTFS_DROP_TIME] || + attrs[XFRMA_IPTFS_REORDER_WINDOW] || + attrs[XFRMA_IPTFS_DONT_FRAG] || + attrs[XFRMA_IPTFS_INIT_DELAY] || + attrs[XFRMA_IPTFS_MAX_QSIZE] || + attrs[XFRMA_IPTFS_PKT_SIZE]) && + p->mode != XFRM_MODE_IPTFS) { + NL_SET_ERR_MSG(extack, "IP-TFS options can only be used in IP-TFS mode"); + goto out; + } break; case IPPROTO_COMP: @@ -373,6 +427,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_BEET: break; + case XFRM_MODE_IPTFS: + if (p->id.proto != IPPROTO_ESP) { + NL_SET_ERR_MSG(extack, "IP-TFS mode only supported with ESP"); + goto out; + } + if (sa_dir == 0) { + NL_SET_ERR_MSG(extack, "IP-TFS mode requires in or out direction attribute"); + goto out; + } + break; default: NL_SET_ERR_MSG(extack, "Unsupported mode"); @@ -421,6 +485,18 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, goto out; } + if (attrs[XFRMA_IPTFS_DROP_TIME]) { + NL_SET_ERR_MSG(extack, "IP-TFS drop time should not be set for output SA"); + err = -EINVAL; + goto out; + } + + if (attrs[XFRMA_IPTFS_REORDER_WINDOW]) { + NL_SET_ERR_MSG(extack, "IP-TFS reorder window should not be set for output SA"); + err = -EINVAL; + goto out; + } + if (attrs[XFRMA_REPLAY_VAL]) { struct xfrm_replay_state *replay; @@ -458,6 +534,30 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, } } + + if (attrs[XFRMA_IPTFS_DONT_FRAG]) { + NL_SET_ERR_MSG(extack, "IP-TFS don't fragment should not be set for input SA"); + err = -EINVAL; + goto out; + } + + if (attrs[XFRMA_IPTFS_INIT_DELAY]) { + NL_SET_ERR_MSG(extack, "IP-TFS initial delay should not be set for input SA"); + err = -EINVAL; + goto out; + } + + if (attrs[XFRMA_IPTFS_MAX_QSIZE]) { + NL_SET_ERR_MSG(extack, "IP-TFS max queue size should not be set for input SA"); + err = -EINVAL; + goto out; + } + + if (attrs[XFRMA_IPTFS_PKT_SIZE]) { + NL_SET_ERR_MSG(extack, "IP-TFS packet size should not be set for input SA"); + err = -EINVAL; + goto out; + } } if (!sa_dir && attrs[XFRMA_SA_PCPU]) { @@ -851,7 +951,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, goto error; } - err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV], extack); + err = __xfrm_init_state(x, extack); if (err) goto error; @@ -886,6 +986,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, goto error; } + if (x->mode_cbs && x->mode_cbs->user_init) { + err = x->mode_cbs->user_init(net, x, attrs, extack); + if (err) + goto error; + } + return x; error: @@ -1099,7 +1205,7 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) if (!nla) return -EMSGSIZE; algo = nla_data(nla); - strscpy_pad(algo->alg_name, auth->alg_name, sizeof(algo->alg_name)); + strscpy_pad(algo->alg_name, auth->alg_name); if (redact_secret && auth->alg_key_len) memset(algo->alg_key, 0, (auth->alg_key_len + 7) / 8); @@ -1112,7 +1218,7 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) if (!nla) return -EMSGSIZE; ap = nla_data(nla); - strscpy_pad(ap->alg_name, auth->alg_name, sizeof(ap->alg_name)); + strscpy_pad(ap->alg_name, auth->alg_name); ap->alg_key_len = auth->alg_key_len; ap->alg_trunc_len = auth->alg_trunc_len; if (redact_secret && auth->alg_key_len) @@ -1133,7 +1239,7 @@ static int copy_to_user_aead(struct xfrm_algo_aead *aead, struct sk_buff *skb) return -EMSGSIZE; ap = nla_data(nla); - strscpy_pad(ap->alg_name, 
aead->alg_name, sizeof(ap->alg_name)); + strscpy_pad(ap->alg_name, aead->alg_name); ap->alg_key_len = aead->alg_key_len; ap->alg_icv_len = aead->alg_icv_len; @@ -1155,7 +1261,7 @@ static int copy_to_user_ealg(struct xfrm_algo *ealg, struct sk_buff *skb) return -EMSGSIZE; ap = nla_data(nla); - strscpy_pad(ap->alg_name, ealg->alg_name, sizeof(ap->alg_name)); + strscpy_pad(ap->alg_name, ealg->alg_name); ap->alg_key_len = ealg->alg_key_len; if (redact_secret && ealg->alg_key_len) @@ -1176,7 +1282,7 @@ static int copy_to_user_calg(struct xfrm_algo *calg, struct sk_buff *skb) return -EMSGSIZE; ap = nla_data(nla); - strscpy_pad(ap->alg_name, calg->alg_name, sizeof(ap->alg_name)); + strscpy_pad(ap->alg_name, calg->alg_name); ap->alg_key_len = 0; return 0; @@ -1301,6 +1407,10 @@ static int copy_to_user_state_extra(struct xfrm_state *x, if (ret) goto out; } + if (x->mode_cbs && x->mode_cbs->copy_to_user) + ret = x->mode_cbs->copy_to_user(x, skb); + if (ret) + goto out; if (x->mapping_maxage) { ret = nla_put_u32(skb, XFRMA_MTIMER_THRESH, x->mapping_maxage); if (ret) @@ -1958,6 +2068,8 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family, return -EINVAL; } break; + case XFRM_MODE_IPTFS: + break; default: if (ut[i].family != prev_family) { NL_SET_ERR_MSG(extack, "Mode in template doesn't support a family change"); @@ -2989,6 +3101,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, int n = 0; struct net *net = sock_net(skb->sk); struct xfrm_encap_tmpl *encap = NULL; + struct xfrm_user_offload *xuo = NULL; u32 if_id = 0; if (!attrs[XFRMA_MIGRATE]) { @@ -3019,11 +3132,19 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, if (attrs[XFRMA_IF_ID]) if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + if (attrs[XFRMA_OFFLOAD_DEV]) { + xuo = kmemdup(nla_data(attrs[XFRMA_OFFLOAD_DEV]), + sizeof(*xuo), GFP_KERNEL); + if (!xuo) { + err = -ENOMEM; + goto error; + } + } err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap, - if_id, extack); - + if_id, extack, xuo); +error: kfree(encap); - + kfree(xuo); return err; } #else @@ -3220,6 +3341,12 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), [XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 }, [XFRMA_SA_PCPU] = { .type = NLA_U32 }, + [XFRMA_IPTFS_DROP_TIME] = { .type = NLA_U32 }, + [XFRMA_IPTFS_REORDER_WINDOW] = { .type = NLA_U16 }, + [XFRMA_IPTFS_DONT_FRAG] = { .type = NLA_FLAG }, + [XFRMA_IPTFS_INIT_DELAY] = { .type = NLA_U32 }, + [XFRMA_IPTFS_MAX_QSIZE] = { .type = NLA_U32 }, + [XFRMA_IPTFS_PKT_SIZE] = { .type = NLA_U32 }, }; EXPORT_SYMBOL_GPL(xfrma_policy); @@ -3554,6 +3681,9 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x) if (x->nat_keepalive_interval) l += nla_total_size(sizeof(x->nat_keepalive_interval)); + if (x->mode_cbs && x->mode_cbs->sa_len) + l += x->mode_cbs->sa_len(x); + return l; }
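Tying the netlink pieces together: the new xfrma_policy entries validate the IP-TFS attributes on input, iptfs_user_init() consumes them, and the xfrm_sa_len() hook above lets the mode reserve skb room for dumping them back out. Each attribute costs a header plus its 4-byte-aligned payload via nla_total_size(); a small model of the reservation for the output-SA branch of iptfs_sa_len() (dont-frag flag plus three u32s):

#include <stdio.h>

#define NLA_HDRLEN      4                       /* aligned nlattr header */
#define NLA_ALIGN(len)  (((len) + 3U) & ~3U)

static unsigned int nla_total_size_model(unsigned int payload)
{
    return NLA_ALIGN(NLA_HDRLEN + payload);
}

int main(void)
{
    /* flag attribute (empty payload) + init delay, max queue size
     * and packet size as u32s: 4 + 3 * 8 = 28 bytes */
    unsigned int l = nla_total_size_model(0) + 3 * nla_total_size_model(4);

    printf("%u bytes reserved\n", l);
    return 0;
}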