diff options
Diffstat (limited to 'net')
589 files changed, 29238 insertions, 11885 deletions
diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c index 1c140af06d52..600b9563bfc5 100644 --- a/net/6lowpan/debugfs.c +++ b/net/6lowpan/debugfs.c @@ -170,7 +170,8 @@ static void lowpan_dev_debugfs_ctx_init(struct net_device *dev, struct dentry *root; char buf[32]; - WARN_ON_ONCE(id > LOWPAN_IPHC_CTX_TABLE_SIZE); + if (WARN_ON_ONCE(id >= LOWPAN_IPHC_CTX_TABLE_SIZE)) + return; sprintf(buf, "%d", id); diff --git a/net/802/Makefile b/net/802/Makefile index 19406a87bdaa..bfed80221b8b 100644 --- a/net/802/Makefile +++ b/net/802/Makefile @@ -8,7 +8,6 @@ obj-$(CONFIG_LLC) += p8022.o psnap.o obj-$(CONFIG_NET_FC) += fc.o obj-$(CONFIG_FDDI) += fddi.o obj-$(CONFIG_HIPPI) += hippi.o -obj-$(CONFIG_IPX) += p8022.o psnap.o p8023.o obj-$(CONFIG_ATALK) += p8022.o psnap.o obj-$(CONFIG_STP) += stp.o obj-$(CONFIG_GARP) += garp.o diff --git a/net/802/hippi.c b/net/802/hippi.c index f80b33a8f7e0..887e73d520e4 100644 --- a/net/802/hippi.c +++ b/net/802/hippi.c @@ -121,7 +121,7 @@ int hippi_mac_addr(struct net_device *dev, void *p) struct sockaddr *addr = p; if (netif_running(dev)) return -EBUSY; - memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + dev_addr_set(dev, addr->sa_data); return 0; } EXPORT_SYMBOL(hippi_mac_addr); diff --git a/net/802/p8022.c b/net/802/p8022.c index a6585627051d..79c23173116c 100644 --- a/net/802/p8022.c +++ b/net/802/p8022.c @@ -23,7 +23,7 @@ #include <net/p8022.h> static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb, - unsigned char *dest) + const unsigned char *dest) { llc_build_and_send_ui_pkt(dl->sap, skb, dest, dl->sap->laddr.lsap); return 0; diff --git a/net/802/p8023.c b/net/802/p8023.c deleted file mode 100644 index 19cd56990db2..000000000000 --- a/net/802/p8023.c +++ /dev/null @@ -1,60 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * NET3: 802.3 data link hooks used for IPX 802.3 - * - * 802.3 isn't really a protocol data link layer. Some old IPX stuff - * uses it however. Note that there is only one 802.3 protocol layer - * in the system. We don't currently support different protocols - * running raw 802.3 on different devices. Thankfully nobody else - * has done anything like the old IPX. - */ - -#include <linux/in.h> -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <linux/slab.h> - -#include <net/datalink.h> -#include <net/p8022.h> - -/* - * Place an 802.3 header on a packet. The driver will do the mac - * addresses, we just need to give it the buffer length. - */ -static int p8023_request(struct datalink_proto *dl, - struct sk_buff *skb, unsigned char *dest_node) -{ - struct net_device *dev = skb->dev; - - dev_hard_header(skb, dev, ETH_P_802_3, dest_node, NULL, skb->len); - return dev_queue_xmit(skb); -} - -/* - * Create an 802.3 client. Note there can be only one 802.3 client - */ -struct datalink_proto *make_8023_client(void) -{ - struct datalink_proto *proto = kmalloc(sizeof(*proto), GFP_ATOMIC); - - if (proto) { - proto->header_length = 0; - proto->request = p8023_request; - } - return proto; -} - -/* - * Destroy the 802.3 client. - */ -void destroy_8023_client(struct datalink_proto *dl) -{ - kfree(dl); -} - -EXPORT_SYMBOL(destroy_8023_client); -EXPORT_SYMBOL(make_8023_client); - -MODULE_LICENSE("GPL"); diff --git a/net/802/psnap.c b/net/802/psnap.c index 4492e8d7ad20..1406bfdbda13 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -79,7 +79,7 @@ drop: * Put a SNAP header on a frame and pass to 802.2 */ static int snap_request(struct datalink_proto *dl, - struct sk_buff *skb, u8 *dest) + struct sk_buff *skb, const u8 *dest) { memcpy(skb_push(skb, 5), dl->type, 5); llc_build_and_send_ui_pkt(snap_sap, skb, dest, snap_sap->laddr.lsap); diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 4cdf8416869d..55275ef9a31a 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -67,7 +67,7 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg, return 0; size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN; - array = kzalloc(size, GFP_KERNEL); + array = kzalloc(size, GFP_KERNEL_ACCOUNT); if (array == NULL) return -ENOBUFS; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index a0367b37512d..90330b893134 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -250,7 +250,7 @@ bool vlan_dev_inherit_address(struct net_device *dev, if (dev->addr_assign_type != NET_ADDR_STOLEN) return false; - ether_addr_copy(dev->dev_addr, real_dev->dev_addr); + eth_hw_addr_set(dev, real_dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return true; } @@ -349,7 +349,7 @@ static int vlan_dev_set_mac_address(struct net_device *dev, void *p) dev_uc_del(real_dev, dev->dev_addr); out: - ether_addr_copy(dev->dev_addr, addr->sa_data); + eth_hw_addr_set(dev, addr->sa_data); return 0; } @@ -372,8 +372,8 @@ static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCGMIIREG: case SIOCSMIIREG: case SIOCGHWTSTAMP: - if (netif_device_present(real_dev) && ops->ndo_do_ioctl) - err = ops->ndo_do_ioctl(real_dev, &ifrr, cmd); + if (netif_device_present(real_dev) && ops->ndo_eth_ioctl) + err = ops->ndo_eth_ioctl(real_dev, &ifrr, cmd); break; } @@ -586,7 +586,7 @@ static int vlan_dev_init(struct net_device *dev) dev->dev_id = real_dev->dev_id; if (is_zero_ether_addr(dev->dev_addr)) { - ether_addr_copy(dev->dev_addr, real_dev->dev_addr); + eth_hw_addr_set(dev, real_dev->dev_addr); dev->addr_assign_type = NET_ADDR_STOLEN; } if (is_zero_ether_addr(dev->broadcast)) @@ -814,7 +814,7 @@ static const struct net_device_ops vlan_netdev_ops = { .ndo_set_mac_address = vlan_dev_set_mac_address, .ndo_set_rx_mode = vlan_dev_set_rx_mode, .ndo_change_rx_flags = vlan_dev_change_rx_flags, - .ndo_do_ioctl = vlan_dev_ioctl, + .ndo_eth_ioctl = vlan_dev_ioctl, .ndo_neigh_setup = vlan_dev_neigh_setup, .ndo_get_stats64 = vlan_dev_get_stats64, #if IS_ENABLED(CONFIG_FCOE) diff --git a/net/9p/client.c b/net/9p/client.c index b7b958f61faf..213f12ed76cd 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -30,6 +30,8 @@ #define CREATE_TRACE_POINTS #include <trace/events/9p.h> +#define DEFAULT_MSIZE (128 * 1024) + /* * Client Option Parsing (code inspired by NFS code) * - a little lazy - parse all client options @@ -65,7 +67,7 @@ EXPORT_SYMBOL(p9_is_proto_dotu); int p9_show_client_options(struct seq_file *m, struct p9_client *clnt) { - if (clnt->msize != 8192) + if (clnt->msize != DEFAULT_MSIZE) seq_printf(m, ",msize=%u", clnt->msize); seq_printf(m, ",trans=%s", clnt->trans_mod->name); @@ -139,7 +141,7 @@ static int parse_opts(char *opts, struct p9_client *clnt) int ret = 0; clnt->proto_version = p9_proto_2000L; - clnt->msize = 8192; + clnt->msize = DEFAULT_MSIZE; if (!opts) return 0; diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index f4dd0456beaf..007bbcc68010 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -34,7 +34,7 @@ #include <linux/syscalls.h> /* killme */ #define P9_PORT 564 -#define MAX_SOCK_BUF (64*1024) +#define MAX_SOCK_BUF (1024*1024) #define MAXPOLLWADDR 2 static struct p9_trans_module p9_tcp_trans; diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 2bbd7dce0f1d..490a4c900339 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -610,7 +610,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) chan->vc_wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL); if (!chan->vc_wq) { err = -ENOMEM; - goto out_free_tag; + goto out_remove_file; } init_waitqueue_head(chan->vc_wq); chan->ring_bufs_avail = 1; @@ -628,6 +628,8 @@ static int p9_virtio_probe(struct virtio_device *vdev) return 0; +out_remove_file: + sysfs_remove_file(&vdev->dev.kobj, &dev_attr_mount_tag.attr); out_free_tag: kfree(tag); out_free_vq: diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index f4fea28e05da..3ec1a51a6944 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -138,7 +138,7 @@ static bool p9_xen_write_todo(struct xen_9pfs_dataring *ring, RING_IDX size) static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) { - struct xen_9pfs_front_priv *priv = NULL; + struct xen_9pfs_front_priv *priv; RING_IDX cons, prod, masked_cons, masked_prod; unsigned long flags; u32 size = p9_req->tc.size; @@ -151,7 +151,7 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) break; } read_unlock(&xen_9pfs_lock); - if (!priv || priv->client != client) + if (list_entry_is_head(priv, &xen_9pfs_devs, list)) return -EINVAL; num = p9_req->tc.tag % priv->num_rings; diff --git a/net/Kconfig b/net/Kconfig index c7392c449b25..074472dfa94a 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -294,7 +294,7 @@ config CGROUP_NET_CLASSID config NET_RX_BUSY_POLL bool - default y + default y if !PREEMPT_RT config BQL bool @@ -363,6 +363,7 @@ source "net/bluetooth/Kconfig" source "net/rxrpc/Kconfig" source "net/kcm/Kconfig" source "net/strparser/Kconfig" +source "net/mctp/Kconfig" config FIB_RULES bool diff --git a/net/Makefile b/net/Makefile index 9ca9572188fe..fbfeb8a0bb37 100644 --- a/net/Makefile +++ b/net/Makefile @@ -78,3 +78,4 @@ obj-$(CONFIG_QRTR) += qrtr/ obj-$(CONFIG_NET_NCSI) += ncsi/ obj-$(CONFIG_XDP_SOCKETS) += xdp/ obj-$(CONFIG_MPTCP) += mptcp/ +obj-$(CONFIG_MCTP) += mctp/ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 8ade5a4ceaf5..bf5736c1d458 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -666,7 +666,7 @@ static int atif_ioctl(int cmd, void __user *arg) struct rtentry rtdef; int add_route; - if (copy_from_user(&atreq, arg, sizeof(atreq))) + if (get_user_ifreq(&atreq, NULL, arg)) return -EFAULT; dev = __dev_get_by_name(&init_net, atreq.ifr_name); @@ -865,7 +865,7 @@ static int atif_ioctl(int cmd, void __user *arg) return 0; } - return copy_to_user(arg, &atreq, sizeof(atreq)) ? -EFAULT : 0; + return put_user_ifreq(&atreq, arg); } static int atrtr_ioctl_addrt(struct rtentry *rt) diff --git a/net/atm/br2684.c b/net/atm/br2684.c index dd2a8dabed84..f666f2f98ba5 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -577,10 +577,12 @@ static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg) pr_debug("vcc=%p, encaps=%d, brvcc=%p\n", atmvcc, be.encaps, brvcc); if (list_empty(&brdev->brvccs) && !brdev->mac_was_set) { unsigned char *esi = atmvcc->dev->esi; + const u8 one = 1; + if (esi[0] | esi[1] | esi[2] | esi[3] | esi[4] | esi[5]) - memcpy(net_dev->dev_addr, esi, net_dev->addr_len); + dev_addr_set(net_dev, esi); else - net_dev->dev_addr[2] = 1; + dev_addr_mod(net_dev, 2, &one, 1); } list_add(&brvcc->brvccs, &brdev->brvccs); write_unlock_irq(&devs_lock); diff --git a/net/atm/lec.c b/net/atm/lec.c index 7226c784dbe0..6257bf12e5a0 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -340,12 +340,12 @@ static int lec_close(struct net_device *dev) static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) { + static const u8 zero_addr[ETH_ALEN] = {}; unsigned long flags; struct net_device *dev = (struct net_device *)vcc->proto_data; struct lec_priv *priv = netdev_priv(dev); struct atmlec_msg *mesg; struct lec_arp_table *entry; - int i; char *tmp; /* FIXME */ WARN_ON(refcount_sub_and_test(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc)); @@ -355,12 +355,10 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) pr_debug("%s: msg from zeppelin:%d\n", dev->name, mesg->type); switch (mesg->type) { case l_set_mac_addr: - for (i = 0; i < 6; i++) - dev->dev_addr[i] = mesg->content.normal.mac_addr[i]; + eth_hw_addr_set(dev, mesg->content.normal.mac_addr); break; case l_del_mac_addr: - for (i = 0; i < 6; i++) - dev->dev_addr[i] = 0; + eth_hw_addr_set(dev, zero_addr); break; case l_addr_delete: lec_addr_delete(priv, mesg->content.normal.atm_addr, diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 2631efc6e359..2f34bbdde0e8 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -202,7 +202,7 @@ struct sock *ax25_get_socket(ax25_address *my_addr, ax25_address *dest_addr, * Find an AX.25 control block given both ends. It will only pick up * floating AX.25 control blocks or non Raw socket bound control blocks. */ -ax25_cb *ax25_find_cb(ax25_address *src_addr, ax25_address *dest_addr, +ax25_cb *ax25_find_cb(const ax25_address *src_addr, ax25_address *dest_addr, ax25_digi *digi, struct net_device *dev) { ax25_cb *s; diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 4ac2e0847652..d0a043a51848 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -35,7 +35,7 @@ ax25_dev *ax25_addr_ax25dev(ax25_address *addr) spin_lock_bh(&ax25_dev_lock); for (ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) - if (ax25cmp(addr, (ax25_address *)ax25_dev->dev->dev_addr) == 0) { + if (ax25cmp(addr, (const ax25_address *)ax25_dev->dev->dev_addr) == 0) { res = ax25_dev; } spin_unlock_bh(&ax25_dev_lock); diff --git a/net/ax25/ax25_iface.c b/net/ax25/ax25_iface.c index b4083f30af0d..979bc4b828a0 100644 --- a/net/ax25/ax25_iface.c +++ b/net/ax25/ax25_iface.c @@ -98,7 +98,7 @@ void ax25_linkfail_release(struct ax25_linkfail *lf) EXPORT_SYMBOL(ax25_linkfail_release); -int ax25_listen_register(ax25_address *callsign, struct net_device *dev) +int ax25_listen_register(const ax25_address *callsign, struct net_device *dev) { struct listen_struct *listen; @@ -121,7 +121,7 @@ int ax25_listen_register(ax25_address *callsign, struct net_device *dev) EXPORT_SYMBOL(ax25_listen_register); -void ax25_listen_release(ax25_address *callsign, struct net_device *dev) +void ax25_listen_release(const ax25_address *callsign, struct net_device *dev) { struct listen_struct *s, *listen; @@ -171,7 +171,7 @@ int (*ax25_protocol_function(unsigned int pid))(struct sk_buff *, ax25_cb *) return res; } -int ax25_listen_mine(ax25_address *callsign, struct net_device *dev) +int ax25_listen_mine(const ax25_address *callsign, struct net_device *dev) { struct listen_struct *listen; diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index cd6afe895db9..1cac25aca637 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -181,7 +181,7 @@ static int ax25_process_rx_frame(ax25_cb *ax25, struct sk_buff *skb, int type, i } static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, - ax25_address *dev_addr, struct packet_type *ptype) + const ax25_address *dev_addr, struct packet_type *ptype) { ax25_address src, dest, *next_digi = NULL; int type = 0, mine = 0, dama; @@ -447,5 +447,5 @@ int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev, skb_pull(skb, AX25_KISS_HEADER_LEN); /* Remove the KISS byte */ - return ax25_rcv(skb, dev, (ax25_address *)dev->dev_addr, ptype); + return ax25_rcv(skb, dev, (const ax25_address *)dev->dev_addr, ptype); } diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c index e4f63dd43cb5..36249776c021 100644 --- a/net/ax25/ax25_ip.c +++ b/net/ax25/ax25_ip.c @@ -193,10 +193,8 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb) skb_pull(skb, AX25_KISS_HEADER_LEN); if (digipeat != NULL) { - if ((ourskb = ax25_rt_build_path(skb, src, dst, route->digipeat)) == NULL) { - kfree_skb(skb); + if ((ourskb = ax25_rt_build_path(skb, src, dst, route->digipeat)) == NULL) goto put; - } skb = ourskb; } diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c index f53751ba81b3..3db76d2470e9 100644 --- a/net/ax25/ax25_out.c +++ b/net/ax25/ax25_out.c @@ -29,7 +29,7 @@ static DEFINE_SPINLOCK(ax25_frag_lock); -ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, ax25_address *src, ax25_address *dest, ax25_digi *digi, struct net_device *dev) +ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, const ax25_address *src, ax25_address *dest, ax25_digi *digi, struct net_device *dev) { ax25_dev *ax25_dev; ax25_cb *ax25; @@ -325,7 +325,6 @@ void ax25_kick(ax25_cb *ax25) void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type) { - struct sk_buff *skbn; unsigned char *ptr; int headroom; @@ -336,18 +335,12 @@ void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type) headroom = ax25_addr_size(ax25->digipeat); - if (skb_headroom(skb) < headroom) { - if ((skbn = skb_realloc_headroom(skb, headroom)) == NULL) { + if (unlikely(skb_headroom(skb) < headroom)) { + skb = skb_expand_head(skb, headroom); + if (!skb) { printk(KERN_CRIT "AX.25: ax25_transmit_buffer - out of memory\n"); - kfree_skb(skb); return; } - - if (skb->sk != NULL) - skb_set_owner_w(skbn, skb->sk); - - consume_skb(skb); - skb = skbn; } ptr = skb_push(skb, headroom); diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index b40e0bce67ea..d0b2e094bd55 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -441,24 +441,17 @@ put: struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src, ax25_address *dest, ax25_digi *digi) { - struct sk_buff *skbn; unsigned char *bp; int len; len = digi->ndigi * AX25_ADDR_LEN; - if (skb_headroom(skb) < len) { - if ((skbn = skb_realloc_headroom(skb, len)) == NULL) { + if (unlikely(skb_headroom(skb) < len)) { + skb = skb_expand_head(skb, len); + if (!skb) { printk(KERN_CRIT "AX.25: ax25_dg_build_path - out of memory\n"); return NULL; } - - if (skb->sk != NULL) - skb_set_owner_w(skbn, skb->sk); - - consume_skb(skb); - - skb = skbn; } bp = skb_push(skb, len); diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 12022378f892..f94f538fa382 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -519,8 +519,7 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet, } out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); return res; } @@ -857,8 +856,7 @@ static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface) rcu_read_unlock(); out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); } static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) @@ -1046,14 +1044,10 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, unlock: rcu_read_unlock(); out: - if (neigh_node) - batadv_neigh_node_put(neigh_node); - if (router) - batadv_neigh_node_put(router); - if (neigh_ifinfo) - batadv_neigh_ifinfo_put(neigh_ifinfo); - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); + batadv_neigh_node_put(neigh_node); + batadv_neigh_node_put(router); + batadv_neigh_ifinfo_put(neigh_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); } /** @@ -1194,8 +1188,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, ret = true; out: - if (neigh_node) - batadv_neigh_node_put(neigh_node); + batadv_neigh_node_put(neigh_node); return ret; } @@ -1496,16 +1489,11 @@ out_neigh: if (orig_neigh_node && !is_single_hop_neigh) batadv_orig_node_put(orig_neigh_node); out: - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); - if (router) - batadv_neigh_node_put(router); - if (router_router) - batadv_neigh_node_put(router_router); - if (orig_neigh_router) - batadv_neigh_node_put(orig_neigh_router); - if (hardif_neigh) - batadv_hardif_neigh_put(hardif_neigh); + batadv_neigh_ifinfo_put(router_ifinfo); + batadv_neigh_node_put(router); + batadv_neigh_node_put(router_router); + batadv_neigh_node_put(orig_neigh_router); + batadv_hardif_neigh_put(hardif_neigh); consume_skb(skb_priv); } @@ -1926,8 +1914,7 @@ batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, } out: - if (neigh_node_best) - batadv_neigh_node_put(neigh_node_best); + batadv_neigh_node_put(neigh_node_best); *sub_s = 0; return 0; @@ -2049,10 +2036,8 @@ static bool batadv_iv_ogm_neigh_diff(struct batadv_neigh_node *neigh1, *diff = (int)tq1 - (int)tq2; out: - if (neigh1_ifinfo) - batadv_neigh_ifinfo_put(neigh1_ifinfo); - if (neigh2_ifinfo) - batadv_neigh_ifinfo_put(neigh2_ifinfo); + batadv_neigh_ifinfo_put(neigh1_ifinfo); + batadv_neigh_ifinfo_put(neigh2_ifinfo); return ret; } @@ -2299,8 +2284,7 @@ batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) if (tmp_gw_factor > max_gw_factor || (tmp_gw_factor == max_gw_factor && tq_avg > max_tq)) { - if (curr_gw) - batadv_gw_node_put(curr_gw); + batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); } @@ -2314,8 +2298,7 @@ batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) * $routing_class more tq points) */ if (tq_avg > max_tq) { - if (curr_gw) - batadv_gw_node_put(curr_gw); + batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); } @@ -2332,8 +2315,7 @@ batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) next: batadv_neigh_node_put(router); - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); + batadv_neigh_ifinfo_put(router_ifinfo); } rcu_read_unlock(); @@ -2397,14 +2379,10 @@ static bool batadv_iv_gw_is_eligible(struct batadv_priv *bat_priv, ret = true; out: - if (router_gw_ifinfo) - batadv_neigh_ifinfo_put(router_gw_ifinfo); - if (router_orig_ifinfo) - batadv_neigh_ifinfo_put(router_orig_ifinfo); - if (router_gw) - batadv_neigh_node_put(router_gw); - if (router_orig) - batadv_neigh_node_put(router_orig); + batadv_neigh_ifinfo_put(router_gw_ifinfo); + batadv_neigh_ifinfo_put(router_orig_ifinfo); + batadv_neigh_node_put(router_gw); + batadv_neigh_node_put(router_orig); return ret; } @@ -2479,12 +2457,9 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, ret = 0; out: - if (curr_gw) - batadv_gw_node_put(curr_gw); - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); - if (router) - batadv_neigh_node_put(router); + batadv_gw_node_put(curr_gw); + batadv_neigh_ifinfo_put(router_ifinfo); + batadv_neigh_node_put(router); return ret; } diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index b98aea958e3d..54e41fc709c3 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -106,8 +106,7 @@ static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface) batadv_v_primary_iface_set(hard_iface); out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); } static void @@ -366,8 +365,7 @@ batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq, } out: - if (neigh_node_best) - batadv_neigh_node_put(neigh_node_best); + batadv_neigh_node_put(neigh_node_best); *sub_s = 0; return 0; @@ -568,10 +566,8 @@ static int batadv_v_gw_throughput_get(struct batadv_gw_node *gw_node, u32 *bw) ret = 0; out: - if (router) - batadv_neigh_node_put(router); - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); + batadv_neigh_node_put(router); + batadv_neigh_ifinfo_put(router_ifinfo); return ret; } @@ -599,8 +595,7 @@ batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv) if (curr_gw && bw <= max_bw) goto next; - if (curr_gw) - batadv_gw_node_put(curr_gw); + batadv_gw_node_put(curr_gw); curr_gw = gw_node; kref_get(&curr_gw->refcount); @@ -662,10 +657,8 @@ static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv, ret = true; out: - if (curr_gw) - batadv_gw_node_put(curr_gw); - if (orig_gw) - batadv_gw_node_put(orig_gw); + batadv_gw_node_put(curr_gw); + batadv_gw_node_put(orig_gw); return ret; } @@ -764,12 +757,9 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, ret = 0; out: - if (curr_gw) - batadv_gw_node_put(curr_gw); - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); - if (router) - batadv_neigh_node_put(router); + batadv_gw_node_put(curr_gw); + batadv_neigh_ifinfo_put(router_ifinfo); + batadv_neigh_node_put(router); return ret; } diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 423c2d171703..71999e13f729 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -486,14 +486,11 @@ static void batadv_v_elp_neigh_update(struct batadv_priv *bat_priv, hardif_neigh->bat_v.elp_interval = ntohl(elp_packet->elp_interval); hardif_free: - if (hardif_neigh) - batadv_hardif_neigh_put(hardif_neigh); + batadv_hardif_neigh_put(hardif_neigh); neigh_free: - if (neigh) - batadv_neigh_node_put(neigh); + batadv_neigh_node_put(neigh); orig_free: - if (orig_neigh) - batadv_orig_node_put(orig_neigh); + batadv_orig_node_put(orig_neigh); } /** diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index a0a9636d1740..1d750f3cb2e4 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -584,12 +584,9 @@ static void batadv_v_ogm_forward(struct batadv_priv *bat_priv, batadv_v_ogm_queue_on_if(skb, if_outgoing); out: - if (orig_ifinfo) - batadv_orig_ifinfo_put(orig_ifinfo); - if (router) - batadv_neigh_node_put(router); - if (neigh_ifinfo) - batadv_neigh_ifinfo_put(neigh_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); + batadv_neigh_node_put(router); + batadv_neigh_ifinfo_put(neigh_ifinfo); } /** @@ -669,10 +666,8 @@ static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv, else ret = 0; out: - if (orig_ifinfo) - batadv_orig_ifinfo_put(orig_ifinfo); - if (neigh_ifinfo) - batadv_neigh_ifinfo_put(neigh_ifinfo); + batadv_orig_ifinfo_put(orig_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); return ret; } @@ -763,16 +758,11 @@ static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv, batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node); out: - if (router) - batadv_neigh_node_put(router); - if (orig_neigh_router) - batadv_neigh_node_put(orig_neigh_router); - if (orig_neigh_node) - batadv_orig_node_put(orig_neigh_node); - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); - if (neigh_ifinfo) - batadv_neigh_ifinfo_put(neigh_ifinfo); + batadv_neigh_node_put(router); + batadv_neigh_node_put(orig_neigh_router); + batadv_orig_node_put(orig_neigh_node); + batadv_neigh_ifinfo_put(router_ifinfo); + batadv_neigh_ifinfo_put(neigh_ifinfo); return forward; } @@ -978,12 +968,9 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset, } rcu_read_unlock(); out: - if (orig_node) - batadv_orig_node_put(orig_node); - if (neigh_node) - batadv_neigh_node_put(neigh_node); - if (hardif_neigh) - batadv_hardif_neigh_put(hardif_neigh); + batadv_orig_node_put(orig_node); + batadv_neigh_node_put(neigh_node); + batadv_hardif_neigh_put(hardif_neigh); } /** diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 63d42dcc9324..2ed9496fc41f 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -162,6 +162,9 @@ static void batadv_backbone_gw_release(struct kref *ref) */ static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw) { + if (!backbone_gw) + return; + kref_put(&backbone_gw->refcount, batadv_backbone_gw_release); } @@ -197,6 +200,9 @@ static void batadv_claim_release(struct kref *ref) */ static void batadv_claim_put(struct batadv_bla_claim *claim) { + if (!claim) + return; + kref_put(&claim->refcount, batadv_claim_release); } @@ -248,7 +254,7 @@ batadv_claim_hash_find(struct batadv_priv *bat_priv, * Return: backbone gateway if found or NULL otherwise */ static struct batadv_bla_backbone_gw * -batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr, +batadv_backbone_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->bla.backbone_hash; @@ -330,7 +336,7 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) * @vid: the VLAN ID * @claimtype: the type of the claim (CLAIM, UNCLAIM, ANNOUNCE, ...) */ -static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, +static void batadv_bla_send_claim(struct batadv_priv *bat_priv, const u8 *mac, unsigned short vid, int claimtype) { struct sk_buff *skb; @@ -439,8 +445,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, netif_rx_any_context(skb); out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); } /** @@ -483,7 +488,7 @@ static void batadv_bla_loopdetect_report(struct work_struct *work) * Return: the (possibly created) backbone gateway or NULL on error */ static struct batadv_bla_backbone_gw * -batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, +batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, const u8 *orig, unsigned short vid, bool own_backbone) { struct batadv_bla_backbone_gw *entry; @@ -921,7 +926,7 @@ static bool batadv_handle_request(struct batadv_priv *bat_priv, */ static bool batadv_handle_unclaim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, - u8 *backbone_addr, u8 *claim_addr, + const u8 *backbone_addr, const u8 *claim_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; @@ -959,7 +964,7 @@ static bool batadv_handle_unclaim(struct batadv_priv *bat_priv, */ static bool batadv_handle_claim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, - u8 *backbone_addr, u8 *claim_addr, + const u8 *backbone_addr, const u8 *claim_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; @@ -1498,8 +1503,7 @@ static void batadv_bla_periodic_work(struct work_struct *work) rcu_read_unlock(); } out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work, msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH)); @@ -1556,10 +1560,14 @@ int batadv_bla_init(struct batadv_priv *bat_priv) return 0; bat_priv->bla.claim_hash = batadv_hash_new(128); - bat_priv->bla.backbone_hash = batadv_hash_new(32); + if (!bat_priv->bla.claim_hash) + return -ENOMEM; - if (!bat_priv->bla.claim_hash || !bat_priv->bla.backbone_hash) + bat_priv->bla.backbone_hash = batadv_hash_new(32); + if (!bat_priv->bla.backbone_hash) { + batadv_hash_destroy(bat_priv->bla.claim_hash); return -ENOMEM; + } batadv_hash_set_lock_class(bat_priv->bla.claim_hash, &batadv_claim_hash_lock_class_key); @@ -1808,8 +1816,7 @@ void batadv_bla_free(struct batadv_priv *bat_priv) batadv_hash_destroy(bat_priv->bla.backbone_hash); bat_priv->bla.backbone_hash = NULL; } - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); } /** @@ -1996,10 +2003,8 @@ handled: ret = true; out: - if (primary_if) - batadv_hardif_put(primary_if); - if (claim) - batadv_claim_put(claim); + batadv_hardif_put(primary_if); + batadv_claim_put(claim); return ret; } @@ -2103,10 +2108,8 @@ allow: handled: ret = true; out: - if (primary_if) - batadv_hardif_put(primary_if); - if (claim) - batadv_claim_put(claim); + batadv_hardif_put(primary_if); + batadv_claim_put(claim); return ret; } @@ -2127,7 +2130,7 @@ batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, struct batadv_hard_iface *primary_if, struct batadv_bla_claim *claim) { - u8 *primary_addr = primary_if->net_dev->dev_addr; + const u8 *primary_addr = primary_if->net_dev->dev_addr; u16 backbone_crc; bool is_own; void *hdr; @@ -2271,11 +2274,9 @@ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) ret = msg->len; out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); - if (soft_iface) - dev_put(soft_iface); + dev_put(soft_iface); return ret; } @@ -2297,7 +2298,7 @@ batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, struct batadv_hard_iface *primary_if, struct batadv_bla_backbone_gw *backbone_gw) { - u8 *primary_addr = primary_if->net_dev->dev_addr; + const u8 *primary_addr = primary_if->net_dev->dev_addr; u16 backbone_crc; bool is_own; int msecs; @@ -2443,11 +2444,9 @@ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) ret = msg->len; out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); - if (soft_iface) - dev_put(soft_iface); + dev_put(soft_iface); return ret; } diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 8c95a11a830a..2f008e329007 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -127,6 +127,9 @@ static void batadv_dat_entry_release(struct kref *ref) */ static void batadv_dat_entry_put(struct batadv_dat_entry *dat_entry) { + if (!dat_entry) + return; + kref_put(&dat_entry->refcount, batadv_dat_entry_release); } @@ -405,8 +408,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, &dat_entry->ip, dat_entry->mac_addr, batadv_print_vid(vid)); out: - if (dat_entry) - batadv_dat_entry_put(dat_entry); + batadv_dat_entry_put(dat_entry); } #ifdef CONFIG_BATMAN_ADV_DEBUG @@ -594,8 +596,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, continue; max = tmp_max; - if (max_orig_node) - batadv_orig_node_put(max_orig_node); + batadv_orig_node_put(max_orig_node); max_orig_node = orig_node; } rcu_read_unlock(); @@ -981,11 +982,9 @@ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb) ret = msg->len; out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); - if (soft_iface) - dev_put(soft_iface); + dev_put(soft_iface); return ret; } @@ -1218,8 +1217,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, BATADV_P_DAT_DHT_GET); } out: - if (dat_entry) - batadv_dat_entry_put(dat_entry); + batadv_dat_entry_put(dat_entry); return ret; } @@ -1286,8 +1284,7 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, ret = true; } out: - if (dat_entry) - batadv_dat_entry_put(dat_entry); + batadv_dat_entry_put(dat_entry); if (ret) kfree_skb(skb); return ret; @@ -1420,8 +1417,7 @@ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv, out: if (dropped) kfree_skb(skb); - if (dat_entry) - batadv_dat_entry_put(dat_entry); + batadv_dat_entry_put(dat_entry); /* if dropped == false -> deliver to the interface */ return dropped; } @@ -1830,7 +1826,6 @@ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv, ret = true; out: - if (dat_entry) - batadv_dat_entry_put(dat_entry); + batadv_dat_entry_put(dat_entry); return ret; } diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index a5d9d800082b..0899a729a23f 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -381,10 +381,8 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb, } out: - if (orig_node_dst) - batadv_orig_node_put(orig_node_dst); - if (neigh_node) - batadv_neigh_node_put(neigh_node); + batadv_orig_node_put(orig_node_dst); + batadv_neigh_node_put(neigh_node); return ret; } diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 007f2827935d..b7466136e292 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -59,7 +59,7 @@ * after rcu grace period * @ref: kref pointer of the gw_node */ -static void batadv_gw_node_release(struct kref *ref) +void batadv_gw_node_release(struct kref *ref) { struct batadv_gw_node *gw_node; @@ -70,16 +70,6 @@ static void batadv_gw_node_release(struct kref *ref) } /** - * batadv_gw_node_put() - decrement the gw_node refcounter and possibly release - * it - * @gw_node: gateway node to free - */ -void batadv_gw_node_put(struct batadv_gw_node *gw_node) -{ - kref_put(&gw_node->refcount, batadv_gw_node_release); -} - -/** * batadv_gw_get_selected_gw_node() - Get currently selected gateway * @bat_priv: the bat priv with all the soft interface information * @@ -130,8 +120,7 @@ batadv_gw_get_selected_orig(struct batadv_priv *bat_priv) unlock: rcu_read_unlock(); out: - if (gw_node) - batadv_gw_node_put(gw_node); + batadv_gw_node_put(gw_node); return orig_node; } @@ -148,8 +137,7 @@ static void batadv_gw_select(struct batadv_priv *bat_priv, curr_gw_node = rcu_replace_pointer(bat_priv->gw.curr_gw, new_gw_node, true); - if (curr_gw_node) - batadv_gw_node_put(curr_gw_node); + batadv_gw_node_put(curr_gw_node); spin_unlock_bh(&bat_priv->gw.list_lock); } @@ -284,14 +272,10 @@ void batadv_gw_election(struct batadv_priv *bat_priv) batadv_gw_select(bat_priv, next_gw); out: - if (curr_gw) - batadv_gw_node_put(curr_gw); - if (next_gw) - batadv_gw_node_put(next_gw); - if (router) - batadv_neigh_node_put(router); - if (router_ifinfo) - batadv_neigh_ifinfo_put(router_ifinfo); + batadv_gw_node_put(curr_gw); + batadv_gw_node_put(next_gw); + batadv_neigh_node_put(router); + batadv_neigh_ifinfo_put(router_ifinfo); } /** @@ -325,8 +309,7 @@ void batadv_gw_check_election(struct batadv_priv *bat_priv, reselect: batadv_gw_reselect(bat_priv); out: - if (curr_gw_orig) - batadv_orig_node_put(curr_gw_orig); + batadv_orig_node_put(curr_gw_orig); } /** @@ -466,13 +449,11 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, if (gw_node == curr_gw) batadv_gw_reselect(bat_priv); - if (curr_gw) - batadv_gw_node_put(curr_gw); + batadv_gw_node_put(curr_gw); } out: - if (gw_node) - batadv_gw_node_put(gw_node); + batadv_gw_node_put(gw_node); } /** @@ -555,10 +536,8 @@ int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb) ret = msg->len; out: - if (primary_if) - batadv_hardif_put(primary_if); - if (soft_iface) - dev_put(soft_iface); + batadv_hardif_put(primary_if); + dev_put(soft_iface); return ret; } @@ -780,15 +759,10 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, batadv_neigh_ifinfo_put(old_ifinfo); out: - if (orig_dst_node) - batadv_orig_node_put(orig_dst_node); - if (curr_gw) - batadv_gw_node_put(curr_gw); - if (gw_node) - batadv_gw_node_put(gw_node); - if (neigh_old) - batadv_neigh_node_put(neigh_old); - if (neigh_curr) - batadv_neigh_node_put(neigh_curr); + batadv_orig_node_put(orig_dst_node); + batadv_gw_node_put(curr_gw); + batadv_gw_node_put(gw_node); + batadv_neigh_node_put(neigh_old); + batadv_neigh_node_put(neigh_curr); return out_of_range; } diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 2ae5846ef958..95c2ccdaa554 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -9,6 +9,7 @@ #include "main.h" +#include <linux/kref.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/types.h> @@ -27,7 +28,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, void batadv_gw_node_delete(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node); void batadv_gw_node_free(struct batadv_priv *bat_priv); -void batadv_gw_node_put(struct batadv_gw_node *gw_node); +void batadv_gw_node_release(struct kref *ref); struct batadv_gw_node * batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv); int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb); @@ -38,4 +39,17 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node); +/** + * batadv_gw_node_put() - decrement the gw_node refcounter and possibly release + * it + * @gw_node: gateway node to free + */ +static inline void batadv_gw_node_put(struct batadv_gw_node *gw_node) +{ + if (!gw_node) + return; + + kref_put(&gw_node->refcount, batadv_gw_node_release); +} + #endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */ diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index fdde305a198e..9349c76f30c5 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -10,7 +10,7 @@ #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/errno.h> -#include <linux/kernel.h> +#include <linux/kstrtox.h> #include <linux/limits.h> #include <linux/math64.h> #include <linux/netdevice.h> diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 55d97e18aa4a..8a2b78f9c4b2 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -236,8 +236,7 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev) real_netdev = dev_get_by_index(real_net, ifindex); out: - if (hard_iface) - batadv_hardif_put(hard_iface); + batadv_hardif_put(hard_iface); return real_netdev; } @@ -457,8 +456,7 @@ static void batadv_primary_if_update_addr(struct batadv_priv *bat_priv, batadv_dat_init_own_addr(bat_priv, primary_if); batadv_bla_update_orig_address(bat_priv, primary_if, oldif); out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); } static void batadv_primary_if_select(struct batadv_priv *bat_priv, @@ -481,8 +479,7 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv, batadv_primary_if_update_addr(bat_priv, curr_hard_iface); out: - if (curr_hard_iface) - batadv_hardif_put(curr_hard_iface); + batadv_hardif_put(curr_hard_iface); } static bool @@ -657,8 +654,7 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface) bat_priv->algo_ops->iface.activate(hard_iface); out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); } static void @@ -811,8 +807,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface) new_if = batadv_hardif_get_active(hard_iface->soft_iface); batadv_primary_if_select(bat_priv, new_if); - if (new_if) - batadv_hardif_put(new_if); + batadv_hardif_put(new_if); } bat_priv->algo_ops->iface.disable(hard_iface); @@ -834,8 +829,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface) batadv_hardif_put(hard_iface); out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); } static struct batadv_hard_iface * @@ -990,8 +984,7 @@ static int batadv_hard_if_event(struct notifier_block *this, hardif_put: batadv_hardif_put(hard_iface); out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); return NOTIFY_DONE; } diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 8cb2a1f10080..64f660dbbe54 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -89,6 +89,9 @@ int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing, */ static inline void batadv_hardif_put(struct batadv_hard_iface *hard_iface) { + if (!hard_iface) + return; + kref_put(&hard_iface->refcount, batadv_hardif_release); } diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index f0e5d1429662..7a93a1e94c40 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -7,7 +7,7 @@ #include "log.h" #include "main.h" -#include <stdarg.h> +#include <linux/stdarg.h> #include "trace.h" diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 3ddd66e4c29e..5207cd8d6ad8 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -190,29 +190,41 @@ int batadv_mesh_init(struct net_device *soft_iface) bat_priv->gw.generation = 0; - ret = batadv_v_mesh_init(bat_priv); - if (ret < 0) - goto err; - ret = batadv_originator_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_orig; + } ret = batadv_tt_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_tt; + } + + ret = batadv_v_mesh_init(bat_priv); + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_v; + } ret = batadv_bla_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_bla; + } ret = batadv_dat_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_dat; + } ret = batadv_nc_mesh_init(bat_priv); - if (ret < 0) - goto err; + if (ret < 0) { + atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); + goto err_nc; + } batadv_gw_init(bat_priv); batadv_mcast_init(bat_priv); @@ -222,8 +234,20 @@ int batadv_mesh_init(struct net_device *soft_iface) return 0; -err: - batadv_mesh_free(soft_iface); +err_nc: + batadv_dat_free(bat_priv); +err_dat: + batadv_bla_free(bat_priv); +err_bla: + batadv_v_mesh_free(bat_priv); +err_v: + batadv_tt_free(bat_priv); +err_tt: + batadv_originator_free(bat_priv); +err_orig: + batadv_purge_outstanding_packets(bat_priv, NULL); + atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); + return ret; } diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 014235fd4681..058b8f2eef65 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2021.2" +#define BATADV_SOURCE_VERSION "2021.3" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 923e2197c2db..433901dcf0c3 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -89,10 +89,9 @@ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) rcu_read_lock(); do { upper = netdev_master_upper_dev_get_rcu(upper); - } while (upper && !(upper->priv_flags & IFF_EBRIDGE)); + } while (upper && !netif_is_bridge_master(upper)); - if (upper) - dev_hold(upper); + dev_hold(upper); rcu_read_unlock(); return upper; @@ -509,8 +508,7 @@ batadv_mcast_mla_softif_get(struct net_device *dev, } out: - if (bridge) - dev_put(bridge); + dev_put(bridge); return ret4 + ret6; } @@ -2239,12 +2237,11 @@ batadv_mcast_netlink_get_primary(struct netlink_callback *cb, } out: - if (soft_iface) - dev_put(soft_iface); + dev_put(soft_iface); if (!ret && primary_if) *primary_if = hard_iface; - else if (hard_iface) + else batadv_hardif_put(hard_iface); return ret; diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index b6cc746e01a6..29276284d281 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -359,15 +359,13 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, atomic_read(&bat_priv->orig_interval))) goto nla_put_failure; - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); genlmsg_end(msg, hdr); return 0; nla_put_failure: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); genlmsg_cancel(msg, hdr); return -EMSGSIZE; diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 4bb76b434d07..0a7f1d36a6a8 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -152,8 +152,10 @@ int batadv_nc_mesh_init(struct batadv_priv *bat_priv) &batadv_nc_coding_hash_lock_class_key); bat_priv->nc.decoding_hash = batadv_hash_new(128); - if (!bat_priv->nc.decoding_hash) + if (!bat_priv->nc.decoding_hash) { + batadv_hash_destroy(bat_priv->nc.coding_hash); goto err; + } batadv_hash_set_lock_class(bat_priv->nc.decoding_hash, &batadv_nc_decoding_hash_lock_class_key); @@ -217,6 +219,9 @@ static void batadv_nc_node_release(struct kref *ref) */ static void batadv_nc_node_put(struct batadv_nc_node *nc_node) { + if (!nc_node) + return; + kref_put(&nc_node->refcount, batadv_nc_node_release); } @@ -241,6 +246,9 @@ static void batadv_nc_path_release(struct kref *ref) */ static void batadv_nc_path_put(struct batadv_nc_path *nc_path) { + if (!nc_path) + return; + kref_put(&nc_path->refcount, batadv_nc_path_release); } @@ -930,10 +938,8 @@ void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, out_nc_node->last_seen = jiffies; out: - if (in_nc_node) - batadv_nc_node_put(in_nc_node); - if (out_nc_node) - batadv_nc_node_put(out_nc_node); + batadv_nc_node_put(in_nc_node); + batadv_nc_node_put(out_nc_node); } /** @@ -1209,14 +1215,10 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, batadv_send_unicast_skb(skb_dest, first_dest); res = true; out: - if (router_neigh) - batadv_neigh_node_put(router_neigh); - if (router_coding) - batadv_neigh_node_put(router_coding); - if (router_neigh_ifinfo) - batadv_neigh_ifinfo_put(router_neigh_ifinfo); - if (router_coding_ifinfo) - batadv_neigh_ifinfo_put(router_coding_ifinfo); + batadv_neigh_node_put(router_neigh); + batadv_neigh_node_put(router_coding); + batadv_neigh_ifinfo_put(router_neigh_ifinfo); + batadv_neigh_ifinfo_put(router_coding_ifinfo); return res; } diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index da7249448474..aadc653ca1d8 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -177,7 +177,7 @@ out: * and queue for free after rcu grace period * @ref: kref pointer of the originator-vlan object */ -static void batadv_orig_node_vlan_release(struct kref *ref) +void batadv_orig_node_vlan_release(struct kref *ref) { struct batadv_orig_node_vlan *orig_vlan; @@ -187,16 +187,6 @@ static void batadv_orig_node_vlan_release(struct kref *ref) } /** - * batadv_orig_node_vlan_put() - decrement the refcounter and possibly release - * the originator-vlan object - * @orig_vlan: the originator-vlan object to release - */ -void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan) -{ - kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release); -} - -/** * batadv_originator_init() - Initialize all originator structures * @bat_priv: the bat priv with all the soft interface information * @@ -231,7 +221,7 @@ err: * free after rcu grace period * @ref: kref pointer of the neigh_ifinfo */ -static void batadv_neigh_ifinfo_release(struct kref *ref) +void batadv_neigh_ifinfo_release(struct kref *ref) { struct batadv_neigh_ifinfo *neigh_ifinfo; @@ -244,21 +234,11 @@ static void batadv_neigh_ifinfo_release(struct kref *ref) } /** - * batadv_neigh_ifinfo_put() - decrement the refcounter and possibly release - * the neigh_ifinfo - * @neigh_ifinfo: the neigh_ifinfo object to release - */ -void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo) -{ - kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release); -} - -/** * batadv_hardif_neigh_release() - release hardif neigh node from lists and * queue for free after rcu grace period * @ref: kref pointer of the neigh_node */ -static void batadv_hardif_neigh_release(struct kref *ref) +void batadv_hardif_neigh_release(struct kref *ref) { struct batadv_hardif_neigh_node *hardif_neigh; @@ -274,21 +254,11 @@ static void batadv_hardif_neigh_release(struct kref *ref) } /** - * batadv_hardif_neigh_put() - decrement the hardif neighbors refcounter - * and possibly release it - * @hardif_neigh: hardif neigh neighbor to free - */ -void batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh) -{ - kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release); -} - -/** * batadv_neigh_node_release() - release neigh_node from lists and queue for * free after rcu grace period * @ref: kref pointer of the neigh_node */ -static void batadv_neigh_node_release(struct kref *ref) +void batadv_neigh_node_release(struct kref *ref) { struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; @@ -309,16 +279,6 @@ static void batadv_neigh_node_release(struct kref *ref) } /** - * batadv_neigh_node_put() - decrement the neighbors refcounter and possibly - * release it - * @neigh_node: neigh neighbor to free - */ -void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) -{ - kref_put(&neigh_node->refcount, batadv_neigh_node_release); -} - -/** * batadv_orig_router_get() - router to the originator depending on iface * @orig_node: the orig node for the router * @if_outgoing: the interface where the payload packet has been received or @@ -704,8 +664,7 @@ batadv_neigh_node_create(struct batadv_orig_node *orig_node, out: spin_unlock_bh(&orig_node->neigh_list_lock); - if (hardif_neigh) - batadv_hardif_neigh_put(hardif_neigh); + batadv_hardif_neigh_put(hardif_neigh); return neigh_node; } @@ -797,14 +756,10 @@ int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb) ret = msg->len; out: - if (hardif) - batadv_hardif_put(hardif); - if (hard_iface) - dev_put(hard_iface); - if (primary_if) - batadv_hardif_put(primary_if); - if (soft_iface) - dev_put(soft_iface); + batadv_hardif_put(hardif); + dev_put(hard_iface); + batadv_hardif_put(primary_if); + dev_put(soft_iface); return ret; } @@ -814,7 +769,7 @@ int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb) * free after rcu grace period * @ref: kref pointer of the orig_ifinfo */ -static void batadv_orig_ifinfo_release(struct kref *ref) +void batadv_orig_ifinfo_release(struct kref *ref) { struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *router; @@ -826,23 +781,12 @@ static void batadv_orig_ifinfo_release(struct kref *ref) /* this is the last reference to this object */ router = rcu_dereference_protected(orig_ifinfo->router, true); - if (router) - batadv_neigh_node_put(router); + batadv_neigh_node_put(router); kfree_rcu(orig_ifinfo, rcu); } /** - * batadv_orig_ifinfo_put() - decrement the refcounter and possibly release - * the orig_ifinfo - * @orig_ifinfo: the orig_ifinfo object to release - */ -void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo) -{ - kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release); -} - -/** * batadv_orig_node_free_rcu() - free the orig_node * @rcu: rcu pointer of the orig_node */ @@ -865,7 +809,7 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu) * free after rcu grace period * @ref: kref pointer of the orig_node */ -static void batadv_orig_node_release(struct kref *ref) +void batadv_orig_node_release(struct kref *ref) { struct hlist_node *node_tmp; struct batadv_neigh_node *neigh_node; @@ -895,8 +839,7 @@ static void batadv_orig_node_release(struct kref *ref) orig_node->last_bonding_candidate = NULL; spin_unlock_bh(&orig_node->neigh_list_lock); - if (last_candidate) - batadv_orig_ifinfo_put(last_candidate); + batadv_orig_ifinfo_put(last_candidate); spin_lock_bh(&orig_node->vlan_list_lock); hlist_for_each_entry_safe(vlan, node_tmp, &orig_node->vlan_list, list) { @@ -912,16 +855,6 @@ static void batadv_orig_node_release(struct kref *ref) } /** - * batadv_orig_node_put() - decrement the orig node refcounter and possibly - * release it - * @orig_node: the orig node to free - */ -void batadv_orig_node_put(struct batadv_orig_node *orig_node) -{ - kref_put(&orig_node->refcount, batadv_orig_node_release); -} - -/** * batadv_originator_free() - Free all originator structures * @bat_priv: the bat priv with all the soft interface information */ @@ -1213,8 +1146,7 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv, if (!kref_get_unless_zero(&neigh->refcount)) continue; - if (best) - batadv_neigh_node_put(best); + batadv_neigh_node_put(best); best = neigh; } @@ -1259,8 +1191,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, BATADV_IF_DEFAULT); batadv_update_route(bat_priv, orig_node, BATADV_IF_DEFAULT, best_neigh_node); - if (best_neigh_node) - batadv_neigh_node_put(best_neigh_node); + batadv_neigh_node_put(best_neigh_node); /* ... then for all other interfaces. */ rcu_read_lock(); @@ -1279,8 +1210,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv, hard_iface); batadv_update_route(bat_priv, orig_node, hard_iface, best_neigh_node); - if (best_neigh_node) - batadv_neigh_node_put(best_neigh_node); + batadv_neigh_node_put(best_neigh_node); batadv_hardif_put(hard_iface); } @@ -1410,14 +1340,10 @@ int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb) ret = msg->len; out: - if (hardif) - batadv_hardif_put(hardif); - if (hard_iface) - dev_put(hard_iface); - if (primary_if) - batadv_hardif_put(primary_if); - if (soft_iface) - dev_put(soft_iface); + batadv_hardif_put(hardif); + dev_put(hard_iface); + batadv_hardif_put(primary_if); + dev_put(soft_iface); return ret; } diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 805be87d55b8..ea3d69e4e670 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -12,6 +12,7 @@ #include <linux/compiler.h> #include <linux/if_ether.h> #include <linux/jhash.h> +#include <linux/kref.h> #include <linux/netlink.h> #include <linux/skbuff.h> #include <linux/types.h> @@ -20,19 +21,18 @@ bool batadv_compare_orig(const struct hlist_node *node, const void *data2); int batadv_originator_init(struct batadv_priv *bat_priv); void batadv_originator_free(struct batadv_priv *bat_priv); void batadv_purge_orig_ref(struct batadv_priv *bat_priv); -void batadv_orig_node_put(struct batadv_orig_node *orig_node); +void batadv_orig_node_release(struct kref *ref); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr); struct batadv_hardif_neigh_node * batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); -void -batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh); +void batadv_hardif_neigh_release(struct kref *ref); struct batadv_neigh_node * batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node, struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); -void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node); +void batadv_neigh_node_release(struct kref *ref); struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, const struct batadv_hard_iface *if_outgoing); @@ -42,7 +42,7 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh, struct batadv_neigh_ifinfo * batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh, struct batadv_hard_iface *if_outgoing); -void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo); +void batadv_neigh_ifinfo_release(struct kref *ref); int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb); @@ -52,7 +52,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node, struct batadv_orig_ifinfo * batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing); -void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo); +void batadv_orig_ifinfo_release(struct kref *ref); int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb); struct batadv_orig_node_vlan * @@ -61,7 +61,7 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, struct batadv_orig_node_vlan * batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, unsigned short vid); -void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan); +void batadv_orig_node_vlan_release(struct kref *ref); /** * batadv_choose_orig() - Return the index of the orig entry in the hash table @@ -82,4 +82,86 @@ static inline u32 batadv_choose_orig(const void *data, u32 size) struct batadv_orig_node * batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data); +/** + * batadv_orig_node_vlan_put() - decrement the refcounter and possibly release + * the originator-vlan object + * @orig_vlan: the originator-vlan object to release + */ +static inline void +batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan) +{ + if (!orig_vlan) + return; + + kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release); +} + +/** + * batadv_neigh_ifinfo_put() - decrement the refcounter and possibly release + * the neigh_ifinfo + * @neigh_ifinfo: the neigh_ifinfo object to release + */ +static inline void +batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo) +{ + if (!neigh_ifinfo) + return; + + kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release); +} + +/** + * batadv_hardif_neigh_put() - decrement the hardif neighbors refcounter + * and possibly release it + * @hardif_neigh: hardif neigh neighbor to free + */ +static inline void +batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh) +{ + if (!hardif_neigh) + return; + + kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release); +} + +/** + * batadv_neigh_node_put() - decrement the neighbors refcounter and possibly + * release it + * @neigh_node: neigh neighbor to free + */ +static inline void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node) +{ + if (!neigh_node) + return; + + kref_put(&neigh_node->refcount, batadv_neigh_node_release); +} + +/** + * batadv_orig_ifinfo_put() - decrement the refcounter and possibly release + * the orig_ifinfo + * @orig_ifinfo: the orig_ifinfo object to release + */ +static inline void +batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo) +{ + if (!orig_ifinfo) + return; + + kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release); +} + +/** + * batadv_orig_node_put() - decrement the orig node refcounter and possibly + * release it + * @orig_node: the orig node to free + */ +static inline void batadv_orig_node_put(struct batadv_orig_node *orig_node) +{ + if (!orig_node) + return; + + kref_put(&orig_node->refcount, batadv_orig_node_release); +} + #endif /* _NET_BATMAN_ADV_ORIGINATOR_H_ */ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index bb9e93e3d98c..83f31494ea4d 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -101,8 +101,7 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, } /* decrease refcount of previous best neighbor */ - if (curr_router) - batadv_neigh_node_put(curr_router); + batadv_neigh_node_put(curr_router); } /** @@ -128,8 +127,7 @@ void batadv_update_route(struct batadv_priv *bat_priv, _batadv_update_route(bat_priv, orig_node, recv_if, neigh_node); out: - if (router) - batadv_neigh_node_put(router); + batadv_neigh_node_put(router); } /** @@ -269,10 +267,8 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv, goto out; } out: - if (primary_if) - batadv_hardif_put(primary_if); - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_hardif_put(primary_if); + batadv_orig_node_put(orig_node); kfree_skb(skb); @@ -324,10 +320,8 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv, skb = NULL; out: - if (primary_if) - batadv_hardif_put(primary_if); - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_hardif_put(primary_if); + batadv_orig_node_put(orig_node); kfree_skb(skb); @@ -425,8 +419,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, skb = NULL; put_orig_node: - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); free_skb: kfree_skb(skb); @@ -513,8 +506,7 @@ batadv_last_bonding_replace(struct batadv_orig_node *orig_node, orig_node->last_bonding_candidate = new_candidate; spin_unlock_bh(&orig_node->neigh_list_lock); - if (old_candidate) - batadv_orig_ifinfo_put(old_candidate); + batadv_orig_ifinfo_put(old_candidate); } /** @@ -656,8 +648,7 @@ next: batadv_orig_ifinfo_put(next_candidate); } - if (last_candidate) - batadv_orig_ifinfo_put(last_candidate); + batadv_orig_ifinfo_put(last_candidate); return router; } @@ -756,7 +747,8 @@ batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_orig_node *orig_node = NULL; struct batadv_hard_iface *primary_if = NULL; bool ret = false; - u8 *orig_addr, orig_ttvn; + const u8 *orig_addr; + u8 orig_ttvn; if (batadv_is_my_client(bat_priv, dst_addr, vid)) { primary_if = batadv_primary_if_get_selected(bat_priv); @@ -785,10 +777,8 @@ batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, ret = true; out: - if (primary_if) - batadv_hardif_put(primary_if); - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_hardif_put(primary_if); + batadv_orig_node_put(orig_node); return ret; } @@ -1031,8 +1021,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, orig_node); rx_success: - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); return NET_RX_SUCCESS; } @@ -1279,7 +1268,6 @@ free_skb: kfree_skb(skb); ret = NET_RX_DROP; out: - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); return ret; } diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 0b9dd29d3b6a..477d85a3b558 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -152,8 +152,7 @@ int batadv_send_unicast_skb(struct sk_buff *skb, if (hardif_neigh && ret != NET_XMIT_DROP) hardif_neigh->bat_v.last_unicast_tx = jiffies; - if (hardif_neigh) - batadv_hardif_neigh_put(hardif_neigh); + batadv_hardif_neigh_put(hardif_neigh); #endif return ret; @@ -309,8 +308,7 @@ bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv, ret = true; out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); return ret; } @@ -425,8 +423,7 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, ret = batadv_send_skb_unicast(bat_priv, skb, packet_type, packet_subtype, orig_node, vid); - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -452,8 +449,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, ret = batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR, BATADV_P_DATA, orig_node, vid); - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); return ret; } @@ -474,10 +470,8 @@ void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet, else consume_skb(forw_packet->skb); - if (forw_packet->if_incoming) - batadv_hardif_put(forw_packet->if_incoming); - if (forw_packet->if_outgoing) - batadv_hardif_put(forw_packet->if_outgoing); + batadv_hardif_put(forw_packet->if_incoming); + batadv_hardif_put(forw_packet->if_outgoing); if (forw_packet->queue_left) atomic_inc(forw_packet->queue_left); kfree(forw_packet); @@ -748,6 +742,10 @@ void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv, * Adds a broadcast packet to the queue and sets up timers. Broadcast packets * are sent multiple times to increase probability for being received. * + * This call clones the given skb, hence the caller needs to take into + * account that the data segment of the original skb might not be + * modifiable anymore. + * * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. */ static int batadv_forw_bcast_packet_to_list(struct batadv_priv *bat_priv, @@ -761,7 +759,7 @@ static int batadv_forw_bcast_packet_to_list(struct batadv_priv *bat_priv, unsigned long send_time = jiffies; struct sk_buff *newskb; - newskb = skb_copy(skb, GFP_ATOMIC); + newskb = skb_clone(skb, GFP_ATOMIC); if (!newskb) goto err; @@ -800,6 +798,10 @@ err: * or if a delay is given after that. Furthermore, queues additional * retransmissions if this interface is a wireless one. * + * This call clones the given skb, hence the caller needs to take into + * account that the data segment of the original skb might not be + * modifiable anymore. + * * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. */ static int batadv_forw_bcast_packet_if(struct batadv_priv *bat_priv, @@ -814,7 +816,7 @@ static int batadv_forw_bcast_packet_if(struct batadv_priv *bat_priv, int ret = NETDEV_TX_OK; if (!delay) { - newskb = skb_copy(skb, GFP_ATOMIC); + newskb = skb_clone(skb, GFP_ATOMIC); if (!newskb) return NETDEV_TX_BUSY; @@ -867,8 +869,7 @@ static bool batadv_send_no_broadcast(struct batadv_priv *bat_priv, ret = batadv_hardif_no_broadcast(if_out, bcast_packet->orig, orig_neigh); - if (neigh_node) - batadv_hardif_neigh_put(neigh_node); + batadv_hardif_neigh_put(neigh_node); /* ok, may broadcast */ if (!ret) diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index ae368a42a4ad..7ee09337fc40 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -134,7 +134,7 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) return -EADDRNOTAVAIL; ether_addr_copy(old_addr, dev->dev_addr); - ether_addr_copy(dev->dev_addr, addr->sa_data); + eth_hw_addr_set(dev, addr->sa_data); /* only modify transtable if it has been initialized before */ if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) @@ -383,10 +383,8 @@ dropped: dropped_freed: batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED); end: - if (mcast_single_orig) - batadv_orig_node_put(mcast_single_orig); - if (primary_if) - batadv_hardif_put(primary_if); + batadv_orig_node_put(mcast_single_orig); + batadv_hardif_put(primary_if); return NETDEV_TX_OK; } @@ -501,7 +499,7 @@ out: * after rcu grace period * @ref: kref pointer of the vlan object */ -static void batadv_softif_vlan_release(struct kref *ref) +void batadv_softif_vlan_release(struct kref *ref) { struct batadv_softif_vlan *vlan; @@ -515,19 +513,6 @@ static void batadv_softif_vlan_release(struct kref *ref) } /** - * batadv_softif_vlan_put() - decrease the vlan object refcounter and - * possibly release it - * @vlan: the vlan object to release - */ -void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan) -{ - if (!vlan) - return; - - kref_put(&vlan->refcount, batadv_softif_vlan_release); -} - -/** * batadv_softif_vlan_get() - get the vlan object for a specific vid * @bat_priv: the bat priv with all the soft interface information * @vid: the identifier of the vlan object to retrieve @@ -851,8 +836,7 @@ static int batadv_softif_slave_add(struct net_device *dev, ret = batadv_hardif_enable_interface(hard_iface, dev); out: - if (hard_iface) - batadv_hardif_put(hard_iface); + batadv_hardif_put(hard_iface); return ret; } @@ -878,8 +862,7 @@ static int batadv_softif_slave_del(struct net_device *dev, ret = 0; out: - if (hard_iface) - batadv_hardif_put(hard_iface); + batadv_hardif_put(hard_iface); return ret; } diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 67a2ddd6832f..9f2003f1a497 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -9,6 +9,7 @@ #include "main.h" +#include <linux/kref.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> @@ -21,8 +22,21 @@ void batadv_interface_rx(struct net_device *soft_iface, bool batadv_softif_is_valid(const struct net_device *net_dev); extern struct rtnl_link_ops batadv_link_ops; int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); -void batadv_softif_vlan_put(struct batadv_softif_vlan *softif_vlan); +void batadv_softif_vlan_release(struct kref *ref); struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid); +/** + * batadv_softif_vlan_put() - decrease the vlan object refcounter and + * possibly release it + * @vlan: the vlan object to release + */ +static inline void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan) +{ + if (!vlan) + return; + + kref_put(&vlan->refcount, batadv_softif_vlan_release); +} + #endif /* _NET_BATMAN_ADV_SOFT_INTERFACE_H_ */ diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 789c851732b7..fbcb15c7c29b 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -358,6 +358,9 @@ static void batadv_tp_vars_release(struct kref *ref) */ static void batadv_tp_vars_put(struct batadv_tp_vars *tp_vars) { + if (!tp_vars) + return; + kref_put(&tp_vars->refcount, batadv_tp_vars_release); } @@ -628,9 +631,9 @@ static void batadv_tp_recv_ack(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node = NULL; const struct batadv_icmp_tp_packet *icmp; struct batadv_tp_vars *tp_vars; + const unsigned char *dev_addr; size_t packet_len, mss; u32 rtt, recv_ack, cwnd; - unsigned char *dev_addr; packet_len = BATADV_TP_PLEN; mss = BATADV_TP_PLEN; @@ -748,12 +751,9 @@ move_twnd: wake_up(&tp_vars->more_bytes); out: - if (likely(primary_if)) - batadv_hardif_put(primary_if); - if (likely(orig_node)) - batadv_orig_node_put(orig_node); - if (likely(tp_vars)) - batadv_tp_vars_put(tp_vars); + batadv_hardif_put(primary_if); + batadv_orig_node_put(orig_node); + batadv_tp_vars_put(tp_vars); } /** @@ -882,10 +882,8 @@ static int batadv_tp_send(void *arg) } out: - if (likely(primary_if)) - batadv_hardif_put(primary_if); - if (likely(orig_node)) - batadv_orig_node_put(orig_node); + batadv_hardif_put(primary_if); + batadv_orig_node_put(orig_node); batadv_tp_sender_end(bat_priv, tp_vars); batadv_tp_sender_cleanup(bat_priv, tp_vars); @@ -1205,10 +1203,8 @@ static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst, ret = 0; out: - if (likely(orig_node)) - batadv_orig_node_put(orig_node); - if (likely(primary_if)) - batadv_hardif_put(primary_if); + batadv_orig_node_put(orig_node); + batadv_hardif_put(primary_if); return ret; } @@ -1456,8 +1452,7 @@ send_ack: batadv_tp_send_ack(bat_priv, icmp->orig, tp_vars->last_recv, icmp->timestamp, icmp->session, icmp->uid); out: - if (likely(tp_vars)) - batadv_tp_vars_put(tp_vars); + batadv_tp_vars_put(tp_vars); } /** diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 434b4f042909..4b7ad6684bc4 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -247,6 +247,9 @@ static void batadv_tt_local_entry_release(struct kref *ref) static void batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry) { + if (!tt_local_entry) + return; + kref_put(&tt_local_entry->common.refcount, batadv_tt_local_entry_release); } @@ -270,7 +273,7 @@ static void batadv_tt_global_entry_free_rcu(struct rcu_head *rcu) * queue for free after rcu grace period * @ref: kref pointer of the nc_node */ -static void batadv_tt_global_entry_release(struct kref *ref) +void batadv_tt_global_entry_release(struct kref *ref) { struct batadv_tt_global_entry *tt_global_entry; @@ -283,17 +286,6 @@ static void batadv_tt_global_entry_release(struct kref *ref) } /** - * batadv_tt_global_entry_put() - decrement the tt_global_entry refcounter and - * possibly release it - * @tt_global_entry: tt_global_entry to be free'd - */ -void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) -{ - kref_put(&tt_global_entry->common.refcount, - batadv_tt_global_entry_release); -} - -/** * batadv_tt_global_hash_count() - count the number of orig entries * @bat_priv: the bat priv with all the soft interface information * @addr: the mac address of the client to count entries for @@ -452,6 +444,9 @@ static void batadv_tt_orig_list_entry_release(struct kref *ref) static void batadv_tt_orig_list_entry_put(struct batadv_tt_orig_list_entry *orig_entry) { + if (!orig_entry) + return; + kref_put(&orig_entry->refcount, batadv_tt_orig_list_entry_release); } @@ -818,14 +813,10 @@ check_roaming: ret = true; out: - if (in_hardif) - batadv_hardif_put(in_hardif); - if (in_dev) - dev_put(in_dev); - if (tt_local) - batadv_tt_local_entry_put(tt_local); - if (tt_global) - batadv_tt_global_entry_put(tt_global); + batadv_hardif_put(in_hardif); + dev_put(in_dev); + batadv_tt_local_entry_put(tt_local); + batadv_tt_global_entry_put(tt_global); return ret; } @@ -1215,10 +1206,8 @@ int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb) ret = msg->len; out: - if (primary_if) - batadv_hardif_put(primary_if); - if (soft_iface) - dev_put(soft_iface); + batadv_hardif_put(primary_if); + dev_put(soft_iface); cb->args[0] = bucket; cb->args[1] = idx; @@ -1305,8 +1294,7 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, batadv_tt_local_entry_put(tt_removed_entry); out: - if (tt_local_entry) - batadv_tt_local_entry_put(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return curr_flags; } @@ -1576,8 +1564,7 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, sync_flags: batadv_tt_global_sync_flags(tt_global); out: - if (orig_entry) - batadv_tt_orig_list_entry_put(orig_entry); + batadv_tt_orig_list_entry_put(orig_entry); spin_unlock_bh(&tt_global->list_lock); } @@ -1750,10 +1737,8 @@ out_remove: tt_global_entry->common.flags &= ~BATADV_TT_CLIENT_ROAM; out: - if (tt_global_entry) - batadv_tt_global_entry_put(tt_global_entry); - if (tt_local_entry) - batadv_tt_local_entry_put(tt_local_entry); + batadv_tt_global_entry_put(tt_global_entry); + batadv_tt_local_entry_put(tt_local_entry); return ret; } @@ -1789,15 +1774,13 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv, } /* release the refcount for the "old" best */ - if (best_router) - batadv_neigh_node_put(best_router); + batadv_neigh_node_put(best_router); best_entry = orig_entry; best_router = router; } - if (best_router) - batadv_neigh_node_put(best_router); + batadv_neigh_node_put(best_router); return best_entry; } @@ -2003,10 +1986,8 @@ int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb) ret = msg->len; out: - if (primary_if) - batadv_hardif_put(primary_if); - if (soft_iface) - dev_put(soft_iface); + batadv_hardif_put(primary_if); + dev_put(soft_iface); cb->args[0] = bucket; cb->args[1] = idx; @@ -2196,10 +2177,8 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv, } out: - if (tt_global_entry) - batadv_tt_global_entry_put(tt_global_entry); - if (local_entry) - batadv_tt_local_entry_put(local_entry); + batadv_tt_global_entry_put(tt_global_entry); + batadv_tt_local_entry_put(local_entry); } /** @@ -2426,10 +2405,8 @@ struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, rcu_read_unlock(); out: - if (tt_global_entry) - batadv_tt_global_entry_put(tt_global_entry); - if (tt_local_entry) - batadv_tt_local_entry_put(tt_local_entry); + batadv_tt_global_entry_put(tt_global_entry); + batadv_tt_local_entry_put(tt_local_entry); return orig_node; } @@ -2606,6 +2583,9 @@ static void batadv_tt_req_node_release(struct kref *ref) */ static void batadv_tt_req_node_put(struct batadv_tt_req_node *tt_req_node) { + if (!tt_req_node) + return; + kref_put(&tt_req_node->refcount, batadv_tt_req_node_release); } @@ -2987,8 +2967,7 @@ static bool batadv_send_tt_request(struct batadv_priv *bat_priv, ret = true; out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); if (ret && tt_req_node) { spin_lock_bh(&bat_priv->tt.req_list_lock); @@ -2999,8 +2978,7 @@ out: spin_unlock_bh(&bat_priv->tt.req_list_lock); } - if (tt_req_node) - batadv_tt_req_node_put(tt_req_node); + batadv_tt_req_node_put(tt_req_node); kfree(tvlv_tt_data); return ret; @@ -3131,10 +3109,8 @@ unlock: spin_unlock_bh(&req_dst_orig_node->tt_buff_lock); out: - if (res_dst_orig_node) - batadv_orig_node_put(res_dst_orig_node); - if (req_dst_orig_node) - batadv_orig_node_put(req_dst_orig_node); + batadv_orig_node_put(res_dst_orig_node); + batadv_orig_node_put(req_dst_orig_node); kfree(tvlv_tt_data); return ret; } @@ -3248,10 +3224,8 @@ unlock: spin_unlock_bh(&bat_priv->tt.last_changeset_lock); out: spin_unlock_bh(&bat_priv->tt.commit_lock); - if (orig_node) - batadv_orig_node_put(orig_node); - if (primary_if) - batadv_hardif_put(primary_if); + batadv_orig_node_put(orig_node); + batadv_hardif_put(primary_if); kfree(tvlv_tt_data); /* The packet was for this host, so it doesn't need to be re-routed */ return true; @@ -3336,8 +3310,7 @@ static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv, atomic_set(&orig_node->last_ttvn, ttvn); out: - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); } static void batadv_tt_update_changes(struct batadv_priv *bat_priv, @@ -3378,8 +3351,7 @@ bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, goto out; ret = true; out: - if (tt_local_entry) - batadv_tt_local_entry_put(tt_local_entry); + batadv_tt_local_entry_put(tt_local_entry); return ret; } @@ -3442,8 +3414,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, spin_unlock_bh(&bat_priv->tt.req_list_lock); out: - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); } static void batadv_tt_roam_list_free(struct batadv_priv *bat_priv) @@ -3574,8 +3545,7 @@ static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client, &tvlv_roam, sizeof(tvlv_roam)); out: - if (primary_if) - batadv_hardif_put(primary_if); + batadv_hardif_put(primary_if); } static void batadv_tt_purge(struct work_struct *work) @@ -4170,8 +4140,7 @@ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, atomic_read(&orig_node->last_ttvn) + 1); out: - if (orig_node) - batadv_orig_node_put(orig_node); + batadv_orig_node_put(orig_node); return NET_RX_SUCCESS; } @@ -4193,8 +4162,10 @@ int batadv_tt_init(struct batadv_priv *bat_priv) return ret; ret = batadv_tt_global_init(bat_priv); - if (ret < 0) + if (ret < 0) { + batadv_tt_local_table_free(bat_priv); return ret; + } batadv_tvlv_handler_register(bat_priv, batadv_tt_tvlv_ogm_handler_v1, batadv_tt_tvlv_unicast_handler_v1, diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index e1285904f885..d18740d9a22b 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -9,6 +9,7 @@ #include "main.h" +#include <linux/kref.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/skbuff.h> @@ -28,7 +29,7 @@ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_tt_global_entry * batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); -void batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry); +void batadv_tt_global_entry_release(struct kref *ref); int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, @@ -55,4 +56,19 @@ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, int batadv_tt_cache_init(void); void batadv_tt_cache_destroy(void); +/** + * batadv_tt_global_entry_put() - decrement the tt_global_entry refcounter and + * possibly release it + * @tt_global_entry: tt_global_entry to be free'd + */ +static inline void +batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry) +{ + if (!tt_global_entry) + return; + + kref_put(&tt_global_entry->common.refcount, + batadv_tt_global_entry_release); +} + #endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */ diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 253f5a33a914..0cb58eb04093 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -50,6 +50,9 @@ static void batadv_tvlv_handler_release(struct kref *ref) */ static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler) { + if (!tvlv_handler) + return; + kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release); } @@ -106,6 +109,9 @@ static void batadv_tvlv_container_release(struct kref *ref) */ static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv) { + if (!tvlv) + return; + kref_put(&tvlv->refcount, batadv_tvlv_container_release); } @@ -438,8 +444,7 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, ogm_source, orig_node, src, dst, tvlv_value, tvlv_value_cont_len); - if (tvlv_handler) - batadv_tvlv_handler_put(tvlv_handler); + batadv_tvlv_handler_put(tvlv_handler); tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len; tvlv_value_len -= tvlv_value_cont_len; } @@ -582,8 +587,8 @@ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length */ -void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, - u8 *dst, u8 type, u8 version, +void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, const u8 *src, + const u8 *dst, u8 type, u8 version, void *tvlv_value, u16 tvlv_value_len) { struct batadv_unicast_tvlv_packet *unicast_tvlv_packet; diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index 54f2a35653d0..4cf8af00fc11 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -42,8 +42,8 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, u8 *src, u8 *dst, void *tvlv_buff, u16 tvlv_buff_len); -void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, - u8 *dst, u8 type, u8 version, +void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, const u8 *src, + const u8 *dst, u8 type, u8 version, void *tvlv_value, u16 tvlv_value_len); #endif /* _NET_BATMAN_ADV_TVLV_H_ */ diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index fd164a248569..133d7ea063fb 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -663,6 +663,7 @@ static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev) { struct net_device *netdev; + bdaddr_t addr; int err; netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_btle_dev)), @@ -672,7 +673,8 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev) return -ENOMEM; netdev->addr_assign_type = NET_ADDR_PERM; - baswap((void *)netdev->dev_addr, &chan->src); + baswap(&addr, &chan->src); + __dev_addr_set(netdev, &addr, sizeof(addr)); netdev->netdev_ops = &netdev_ops; SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev); diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index cc0995301f93..291770fc9551 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -14,7 +14,8 @@ bluetooth_6lowpan-y := 6lowpan.o bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o lib.o \ - ecdh_helper.o hci_request.o mgmt_util.o mgmt_config.o + ecdh_helper.o hci_request.o mgmt_util.o mgmt_config.o hci_codec.o \ + eir.o bluetooth-$(CONFIG_BT_BREDR) += sco.o bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index 72f47b372705..c9add7753b9f 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -594,7 +594,7 @@ int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock) * ie. eh.h_dest is our local address. */ memcpy(s->eh.h_dest, &src, ETH_ALEN); memcpy(s->eh.h_source, &dst, ETH_ALEN); - memcpy(dev->dev_addr, s->eh.h_dest, ETH_ALEN); + eth_hw_addr_set(dev, s->eh.h_dest); s->dev = dev; s->sock = sock; diff --git a/net/bluetooth/cmtp/cmtp.h b/net/bluetooth/cmtp/cmtp.h index c32638dddbf9..f6b9dc4e408f 100644 --- a/net/bluetooth/cmtp/cmtp.h +++ b/net/bluetooth/cmtp/cmtp.h @@ -26,7 +26,7 @@ #include <linux/types.h> #include <net/bluetooth/bluetooth.h> -#define BTNAMSIZ 18 +#define BTNAMSIZ 21 /* CMTP ioctl defines */ #define CMTPCONNADD _IOW('C', 200, int) diff --git a/net/bluetooth/eir.c b/net/bluetooth/eir.c new file mode 100644 index 000000000000..7e930f77ecab --- /dev/null +++ b/net/bluetooth/eir.c @@ -0,0 +1,335 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * BlueZ - Bluetooth protocol stack for Linux + * + * Copyright (C) 2021 Intel Corporation + */ + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci_core.h> +#include <net/bluetooth/mgmt.h> + +#include "eir.h" + +#define PNP_INFO_SVCLASS_ID 0x1200 + +u8 eir_append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len) +{ + size_t short_len; + size_t complete_len; + + /* no space left for name (+ NULL + type + len) */ + if ((HCI_MAX_AD_LENGTH - ad_len) < HCI_MAX_SHORT_NAME_LENGTH + 3) + return ad_len; + + /* use complete name if present and fits */ + complete_len = strlen(hdev->dev_name); + if (complete_len && complete_len <= HCI_MAX_SHORT_NAME_LENGTH) + return eir_append_data(ptr, ad_len, EIR_NAME_COMPLETE, + hdev->dev_name, complete_len + 1); + + /* use short name if present */ + short_len = strlen(hdev->short_name); + if (short_len) + return eir_append_data(ptr, ad_len, EIR_NAME_SHORT, + hdev->short_name, short_len + 1); + + /* use shortened full name if present, we already know that name + * is longer then HCI_MAX_SHORT_NAME_LENGTH + */ + if (complete_len) { + u8 name[HCI_MAX_SHORT_NAME_LENGTH + 1]; + + memcpy(name, hdev->dev_name, HCI_MAX_SHORT_NAME_LENGTH); + name[HCI_MAX_SHORT_NAME_LENGTH] = '\0'; + + return eir_append_data(ptr, ad_len, EIR_NAME_SHORT, name, + sizeof(name)); + } + + return ad_len; +} + +u8 eir_append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len) +{ + return eir_append_le16(ptr, ad_len, EIR_APPEARANCE, hdev->appearance); +} + +static u8 *create_uuid16_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) +{ + u8 *ptr = data, *uuids_start = NULL; + struct bt_uuid *uuid; + + if (len < 4) + return ptr; + + list_for_each_entry(uuid, &hdev->uuids, list) { + u16 uuid16; + + if (uuid->size != 16) + continue; + + uuid16 = get_unaligned_le16(&uuid->uuid[12]); + if (uuid16 < 0x1100) + continue; + + if (uuid16 == PNP_INFO_SVCLASS_ID) + continue; + + if (!uuids_start) { + uuids_start = ptr; + uuids_start[0] = 1; + uuids_start[1] = EIR_UUID16_ALL; + ptr += 2; + } + + /* Stop if not enough space to put next UUID */ + if ((ptr - data) + sizeof(u16) > len) { + uuids_start[1] = EIR_UUID16_SOME; + break; + } + + *ptr++ = (uuid16 & 0x00ff); + *ptr++ = (uuid16 & 0xff00) >> 8; + uuids_start[0] += sizeof(uuid16); + } + + return ptr; +} + +static u8 *create_uuid32_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) +{ + u8 *ptr = data, *uuids_start = NULL; + struct bt_uuid *uuid; + + if (len < 6) + return ptr; + + list_for_each_entry(uuid, &hdev->uuids, list) { + if (uuid->size != 32) + continue; + + if (!uuids_start) { + uuids_start = ptr; + uuids_start[0] = 1; + uuids_start[1] = EIR_UUID32_ALL; + ptr += 2; + } + + /* Stop if not enough space to put next UUID */ + if ((ptr - data) + sizeof(u32) > len) { + uuids_start[1] = EIR_UUID32_SOME; + break; + } + + memcpy(ptr, &uuid->uuid[12], sizeof(u32)); + ptr += sizeof(u32); + uuids_start[0] += sizeof(u32); + } + + return ptr; +} + +static u8 *create_uuid128_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) +{ + u8 *ptr = data, *uuids_start = NULL; + struct bt_uuid *uuid; + + if (len < 18) + return ptr; + + list_for_each_entry(uuid, &hdev->uuids, list) { + if (uuid->size != 128) + continue; + + if (!uuids_start) { + uuids_start = ptr; + uuids_start[0] = 1; + uuids_start[1] = EIR_UUID128_ALL; + ptr += 2; + } + + /* Stop if not enough space to put next UUID */ + if ((ptr - data) + 16 > len) { + uuids_start[1] = EIR_UUID128_SOME; + break; + } + + memcpy(ptr, uuid->uuid, 16); + ptr += 16; + uuids_start[0] += 16; + } + + return ptr; +} + +void eir_create(struct hci_dev *hdev, u8 *data) +{ + u8 *ptr = data; + size_t name_len; + + name_len = strlen(hdev->dev_name); + + if (name_len > 0) { + /* EIR Data type */ + if (name_len > 48) { + name_len = 48; + ptr[1] = EIR_NAME_SHORT; + } else { + ptr[1] = EIR_NAME_COMPLETE; + } + + /* EIR Data length */ + ptr[0] = name_len + 1; + + memcpy(ptr + 2, hdev->dev_name, name_len); + + ptr += (name_len + 2); + } + + if (hdev->inq_tx_power != HCI_TX_POWER_INVALID) { + ptr[0] = 2; + ptr[1] = EIR_TX_POWER; + ptr[2] = (u8)hdev->inq_tx_power; + + ptr += 3; + } + + if (hdev->devid_source > 0) { + ptr[0] = 9; + ptr[1] = EIR_DEVICE_ID; + + put_unaligned_le16(hdev->devid_source, ptr + 2); + put_unaligned_le16(hdev->devid_vendor, ptr + 4); + put_unaligned_le16(hdev->devid_product, ptr + 6); + put_unaligned_le16(hdev->devid_version, ptr + 8); + + ptr += 10; + } + + ptr = create_uuid16_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); + ptr = create_uuid32_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); + ptr = create_uuid128_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); +} + +u8 eir_create_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) +{ + struct adv_info *adv = NULL; + u8 ad_len = 0, flags = 0; + u32 instance_flags; + + /* Return 0 when the current instance identifier is invalid. */ + if (instance) { + adv = hci_find_adv_instance(hdev, instance); + if (!adv) + return 0; + } + + instance_flags = hci_adv_instance_flags(hdev, instance); + + /* If instance already has the flags set skip adding it once + * again. + */ + if (adv && eir_get_data(adv->adv_data, adv->adv_data_len, EIR_FLAGS, + NULL)) + goto skip_flags; + + /* The Add Advertising command allows userspace to set both the general + * and limited discoverable flags. + */ + if (instance_flags & MGMT_ADV_FLAG_DISCOV) + flags |= LE_AD_GENERAL; + + if (instance_flags & MGMT_ADV_FLAG_LIMITED_DISCOV) + flags |= LE_AD_LIMITED; + + if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) + flags |= LE_AD_NO_BREDR; + + if (flags || (instance_flags & MGMT_ADV_FLAG_MANAGED_FLAGS)) { + /* If a discovery flag wasn't provided, simply use the global + * settings. + */ + if (!flags) + flags |= mgmt_get_adv_discov_flags(hdev); + + /* If flags would still be empty, then there is no need to + * include the "Flags" AD field". + */ + if (flags) { + ptr[0] = 0x02; + ptr[1] = EIR_FLAGS; + ptr[2] = flags; + + ad_len += 3; + ptr += 3; + } + } + +skip_flags: + if (adv) { + memcpy(ptr, adv->adv_data, adv->adv_data_len); + ad_len += adv->adv_data_len; + ptr += adv->adv_data_len; + } + + if (instance_flags & MGMT_ADV_FLAG_TX_POWER) { + s8 adv_tx_power; + + if (ext_adv_capable(hdev)) { + if (adv) + adv_tx_power = adv->tx_power; + else + adv_tx_power = hdev->adv_tx_power; + } else { + adv_tx_power = hdev->adv_tx_power; + } + + /* Provide Tx Power only if we can provide a valid value for it */ + if (adv_tx_power != HCI_TX_POWER_INVALID) { + ptr[0] = 0x02; + ptr[1] = EIR_TX_POWER; + ptr[2] = (u8)adv_tx_power; + + ad_len += 3; + ptr += 3; + } + } + + return ad_len; +} + +static u8 create_default_scan_rsp(struct hci_dev *hdev, u8 *ptr) +{ + u8 scan_rsp_len = 0; + + if (hdev->appearance) + scan_rsp_len = eir_append_appearance(hdev, ptr, scan_rsp_len); + + return eir_append_local_name(hdev, ptr, scan_rsp_len); +} + +u8 eir_create_scan_rsp(struct hci_dev *hdev, u8 instance, u8 *ptr) +{ + struct adv_info *adv; + u8 scan_rsp_len = 0; + + if (!instance) + return create_default_scan_rsp(hdev, ptr); + + adv = hci_find_adv_instance(hdev, instance); + if (!adv) + return 0; + + if ((adv->flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) + scan_rsp_len = eir_append_appearance(hdev, ptr, scan_rsp_len); + + memcpy(&ptr[scan_rsp_len], adv->scan_rsp_data, adv->scan_rsp_len); + + scan_rsp_len += adv->scan_rsp_len; + + if (adv->flags & MGMT_ADV_FLAG_LOCAL_NAME) + scan_rsp_len = eir_append_local_name(hdev, ptr, scan_rsp_len); + + return scan_rsp_len; +} diff --git a/net/bluetooth/eir.h b/net/bluetooth/eir.h new file mode 100644 index 000000000000..724662f8f8b1 --- /dev/null +++ b/net/bluetooth/eir.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BlueZ - Bluetooth protocol stack for Linux + * + * Copyright (C) 2021 Intel Corporation + */ + +void eir_create(struct hci_dev *hdev, u8 *data); + +u8 eir_create_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr); +u8 eir_create_scan_rsp(struct hci_dev *hdev, u8 instance, u8 *ptr); + +u8 eir_append_local_name(struct hci_dev *hdev, u8 *eir, u8 ad_len); +u8 eir_append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len); + +static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, + u8 *data, u8 data_len) +{ + eir[eir_len++] = sizeof(type) + data_len; + eir[eir_len++] = type; + memcpy(&eir[eir_len], data, data_len); + eir_len += data_len; + + return eir_len; +} + +static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data) +{ + eir[eir_len++] = sizeof(type) + sizeof(data); + eir[eir_len++] = type; + put_unaligned_le16(data, &eir[eir_len]); + eir_len += sizeof(data); + + return eir_len; +} + +static inline void *eir_get_data(u8 *eir, size_t eir_len, u8 type, + size_t *data_len) +{ + size_t parsed = 0; + + if (eir_len < 2) + return NULL; + + while (parsed < eir_len - 1) { + u8 field_len = eir[0]; + + if (field_len == 0) + break; + + parsed += field_len + 1; + + if (parsed > eir_len) + break; + + if (eir[1] != type) { + eir += field_len + 1; + continue; + } + + /* Zero length data */ + if (field_len == 1) + return NULL; + + if (data_len) + *data_len = field_len - 1; + + return &eir[2]; + } + + return NULL; +} diff --git a/net/bluetooth/hci_codec.c b/net/bluetooth/hci_codec.c new file mode 100644 index 000000000000..f0421d0edaa3 --- /dev/null +++ b/net/bluetooth/hci_codec.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Copyright (C) 2021 Intel Corporation */ + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci_core.h> +#include "hci_codec.h" + +static int hci_codec_list_add(struct list_head *list, + struct hci_op_read_local_codec_caps *sent, + struct hci_rp_read_local_codec_caps *rp, + void *caps, + __u32 len) +{ + struct codec_list *entry; + + entry = kzalloc(sizeof(*entry) + len, GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->id = sent->id; + if (sent->id == 0xFF) { + entry->cid = __le16_to_cpu(sent->cid); + entry->vid = __le16_to_cpu(sent->vid); + } + entry->transport = sent->transport; + entry->len = len; + entry->num_caps = rp->num_caps; + if (rp->num_caps) + memcpy(entry->caps, caps, len); + list_add(&entry->list, list); + + return 0; +} + +void hci_codec_list_clear(struct list_head *codec_list) +{ + struct codec_list *c, *n; + + list_for_each_entry_safe(c, n, codec_list, list) { + list_del(&c->list); + kfree(c); + } +} + +static void hci_read_codec_capabilities(struct hci_dev *hdev, __u8 transport, + struct hci_op_read_local_codec_caps + *cmd) +{ + __u8 i; + + for (i = 0; i < TRANSPORT_TYPE_MAX; i++) { + if (transport & BIT(i)) { + struct hci_rp_read_local_codec_caps *rp; + struct hci_codec_caps *caps; + struct sk_buff *skb; + __u8 j; + __u32 len; + + cmd->transport = i; + skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_CODEC_CAPS, + sizeof(*cmd), cmd, + HCI_CMD_TIMEOUT); + if (IS_ERR(skb)) { + bt_dev_err(hdev, "Failed to read codec capabilities (%ld)", + PTR_ERR(skb)); + continue; + } + + if (skb->len < sizeof(*rp)) + goto error; + + rp = (void *)skb->data; + + if (rp->status) + goto error; + + if (!rp->num_caps) { + len = 0; + /* this codec doesn't have capabilities */ + goto skip_caps_parse; + } + + skb_pull(skb, sizeof(*rp)); + + for (j = 0, len = 0; j < rp->num_caps; j++) { + caps = (void *)skb->data; + if (skb->len < sizeof(*caps)) + goto error; + if (skb->len < caps->len) + goto error; + len += sizeof(caps->len) + caps->len; + skb_pull(skb, sizeof(caps->len) + caps->len); + } + +skip_caps_parse: + hci_dev_lock(hdev); + hci_codec_list_add(&hdev->local_codecs, cmd, rp, + (__u8 *)rp + sizeof(*rp), len); + hci_dev_unlock(hdev); +error: + kfree_skb(skb); + } + } +} + +void hci_read_supported_codecs(struct hci_dev *hdev) +{ + struct sk_buff *skb; + struct hci_rp_read_local_supported_codecs *rp; + struct hci_std_codecs *std_codecs; + struct hci_vnd_codecs *vnd_codecs; + struct hci_op_read_local_codec_caps caps; + __u8 i; + + skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_CODECS, 0, NULL, + HCI_CMD_TIMEOUT); + + if (IS_ERR(skb)) { + bt_dev_err(hdev, "Failed to read local supported codecs (%ld)", + PTR_ERR(skb)); + return; + } + + if (skb->len < sizeof(*rp)) + goto error; + + rp = (void *)skb->data; + + if (rp->status) + goto error; + + skb_pull(skb, sizeof(rp->status)); + + std_codecs = (void *)skb->data; + + /* validate codecs length before accessing */ + if (skb->len < flex_array_size(std_codecs, codec, std_codecs->num) + + sizeof(std_codecs->num)) + goto error; + + /* enumerate codec capabilities of standard codecs */ + memset(&caps, 0, sizeof(caps)); + for (i = 0; i < std_codecs->num; i++) { + caps.id = std_codecs->codec[i]; + caps.direction = 0x00; + hci_read_codec_capabilities(hdev, LOCAL_CODEC_ACL_MASK, &caps); + } + + skb_pull(skb, flex_array_size(std_codecs, codec, std_codecs->num) + + sizeof(std_codecs->num)); + + vnd_codecs = (void *)skb->data; + + /* validate vendor codecs length before accessing */ + if (skb->len < + flex_array_size(vnd_codecs, codec, vnd_codecs->num) + + sizeof(vnd_codecs->num)) + goto error; + + /* enumerate vendor codec capabilities */ + for (i = 0; i < vnd_codecs->num; i++) { + caps.id = 0xFF; + caps.cid = vnd_codecs->codec[i].cid; + caps.vid = vnd_codecs->codec[i].vid; + caps.direction = 0x00; + hci_read_codec_capabilities(hdev, LOCAL_CODEC_ACL_MASK, &caps); + } + +error: + kfree_skb(skb); +} + +void hci_read_supported_codecs_v2(struct hci_dev *hdev) +{ + struct sk_buff *skb; + struct hci_rp_read_local_supported_codecs_v2 *rp; + struct hci_std_codecs_v2 *std_codecs; + struct hci_vnd_codecs_v2 *vnd_codecs; + struct hci_op_read_local_codec_caps caps; + __u8 i; + + skb = __hci_cmd_sync(hdev, HCI_OP_READ_LOCAL_CODECS_V2, 0, NULL, + HCI_CMD_TIMEOUT); + + if (IS_ERR(skb)) { + bt_dev_err(hdev, "Failed to read local supported codecs (%ld)", + PTR_ERR(skb)); + return; + } + + if (skb->len < sizeof(*rp)) + goto error; + + rp = (void *)skb->data; + + if (rp->status) + goto error; + + skb_pull(skb, sizeof(rp->status)); + + std_codecs = (void *)skb->data; + + /* check for payload data length before accessing */ + if (skb->len < flex_array_size(std_codecs, codec, std_codecs->num) + + sizeof(std_codecs->num)) + goto error; + + memset(&caps, 0, sizeof(caps)); + + for (i = 0; i < std_codecs->num; i++) { + caps.id = std_codecs->codec[i].id; + hci_read_codec_capabilities(hdev, std_codecs->codec[i].transport, + &caps); + } + + skb_pull(skb, flex_array_size(std_codecs, codec, std_codecs->num) + + sizeof(std_codecs->num)); + + vnd_codecs = (void *)skb->data; + + /* check for payload data length before accessing */ + if (skb->len < + flex_array_size(vnd_codecs, codec, vnd_codecs->num) + + sizeof(vnd_codecs->num)) + goto error; + + for (i = 0; i < vnd_codecs->num; i++) { + caps.id = 0xFF; + caps.cid = vnd_codecs->codec[i].cid; + caps.vid = vnd_codecs->codec[i].vid; + hci_read_codec_capabilities(hdev, vnd_codecs->codec[i].transport, + &caps); + } + +error: + kfree_skb(skb); +} diff --git a/net/bluetooth/hci_codec.h b/net/bluetooth/hci_codec.h new file mode 100644 index 000000000000..a2751930f123 --- /dev/null +++ b/net/bluetooth/hci_codec.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* Copyright (C) 2014 Intel Corporation */ + +void hci_read_supported_codecs(struct hci_dev *hdev); +void hci_read_supported_codecs_v2(struct hci_dev *hdev); +void hci_codec_list_clear(struct list_head *codec_list); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 2b5059a56cda..bd669c95b9a7 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -307,13 +307,133 @@ static bool find_next_esco_param(struct hci_conn *conn, return conn->attempt <= size; } -bool hci_setup_sync(struct hci_conn *conn, __u16 handle) +static bool hci_enhanced_setup_sync_conn(struct hci_conn *conn, __u16 handle) +{ + struct hci_dev *hdev = conn->hdev; + struct hci_cp_enhanced_setup_sync_conn cp; + const struct sco_param *param; + + bt_dev_dbg(hdev, "hcon %p", conn); + + /* for offload use case, codec needs to configured before opening SCO */ + if (conn->codec.data_path) + hci_req_configure_datapath(hdev, &conn->codec); + + conn->state = BT_CONNECT; + conn->out = true; + + conn->attempt++; + + memset(&cp, 0x00, sizeof(cp)); + + cp.handle = cpu_to_le16(handle); + + cp.tx_bandwidth = cpu_to_le32(0x00001f40); + cp.rx_bandwidth = cpu_to_le32(0x00001f40); + + switch (conn->codec.id) { + case BT_CODEC_MSBC: + if (!find_next_esco_param(conn, esco_param_msbc, + ARRAY_SIZE(esco_param_msbc))) + return false; + + param = &esco_param_msbc[conn->attempt - 1]; + cp.tx_coding_format.id = 0x05; + cp.rx_coding_format.id = 0x05; + cp.tx_codec_frame_size = __cpu_to_le16(60); + cp.rx_codec_frame_size = __cpu_to_le16(60); + cp.in_bandwidth = __cpu_to_le32(32000); + cp.out_bandwidth = __cpu_to_le32(32000); + cp.in_coding_format.id = 0x04; + cp.out_coding_format.id = 0x04; + cp.in_coded_data_size = __cpu_to_le16(16); + cp.out_coded_data_size = __cpu_to_le16(16); + cp.in_pcm_data_format = 2; + cp.out_pcm_data_format = 2; + cp.in_pcm_sample_payload_msb_pos = 0; + cp.out_pcm_sample_payload_msb_pos = 0; + cp.in_data_path = conn->codec.data_path; + cp.out_data_path = conn->codec.data_path; + cp.in_transport_unit_size = 1; + cp.out_transport_unit_size = 1; + break; + + case BT_CODEC_TRANSPARENT: + if (!find_next_esco_param(conn, esco_param_msbc, + ARRAY_SIZE(esco_param_msbc))) + return false; + param = &esco_param_msbc[conn->attempt - 1]; + cp.tx_coding_format.id = 0x03; + cp.rx_coding_format.id = 0x03; + cp.tx_codec_frame_size = __cpu_to_le16(60); + cp.rx_codec_frame_size = __cpu_to_le16(60); + cp.in_bandwidth = __cpu_to_le32(0x1f40); + cp.out_bandwidth = __cpu_to_le32(0x1f40); + cp.in_coding_format.id = 0x03; + cp.out_coding_format.id = 0x03; + cp.in_coded_data_size = __cpu_to_le16(16); + cp.out_coded_data_size = __cpu_to_le16(16); + cp.in_pcm_data_format = 2; + cp.out_pcm_data_format = 2; + cp.in_pcm_sample_payload_msb_pos = 0; + cp.out_pcm_sample_payload_msb_pos = 0; + cp.in_data_path = conn->codec.data_path; + cp.out_data_path = conn->codec.data_path; + cp.in_transport_unit_size = 1; + cp.out_transport_unit_size = 1; + break; + + case BT_CODEC_CVSD: + if (lmp_esco_capable(conn->link)) { + if (!find_next_esco_param(conn, esco_param_cvsd, + ARRAY_SIZE(esco_param_cvsd))) + return false; + param = &esco_param_cvsd[conn->attempt - 1]; + } else { + if (conn->attempt > ARRAY_SIZE(sco_param_cvsd)) + return false; + param = &sco_param_cvsd[conn->attempt - 1]; + } + cp.tx_coding_format.id = 2; + cp.rx_coding_format.id = 2; + cp.tx_codec_frame_size = __cpu_to_le16(60); + cp.rx_codec_frame_size = __cpu_to_le16(60); + cp.in_bandwidth = __cpu_to_le32(16000); + cp.out_bandwidth = __cpu_to_le32(16000); + cp.in_coding_format.id = 4; + cp.out_coding_format.id = 4; + cp.in_coded_data_size = __cpu_to_le16(16); + cp.out_coded_data_size = __cpu_to_le16(16); + cp.in_pcm_data_format = 2; + cp.out_pcm_data_format = 2; + cp.in_pcm_sample_payload_msb_pos = 0; + cp.out_pcm_sample_payload_msb_pos = 0; + cp.in_data_path = conn->codec.data_path; + cp.out_data_path = conn->codec.data_path; + cp.in_transport_unit_size = 16; + cp.out_transport_unit_size = 16; + break; + default: + return false; + } + + cp.retrans_effort = param->retrans_effort; + cp.pkt_type = __cpu_to_le16(param->pkt_type); + cp.max_latency = __cpu_to_le16(param->max_latency); + + if (hci_send_cmd(hdev, HCI_OP_ENHANCED_SETUP_SYNC_CONN, sizeof(cp), &cp) < 0) + return false; + + return true; +} + +static bool hci_setup_sync_conn(struct hci_conn *conn, __u16 handle) { struct hci_dev *hdev = conn->hdev; struct hci_cp_setup_sync_conn cp; const struct sco_param *param; - BT_DBG("hcon %p", conn); + bt_dev_dbg(hdev, "hcon %p", conn); conn->state = BT_CONNECT; conn->out = true; @@ -359,6 +479,14 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle) return true; } +bool hci_setup_sync(struct hci_conn *conn, __u16 handle) +{ + if (enhanced_sco_capable(conn->hdev)) + return hci_enhanced_setup_sync_conn(conn, handle); + + return hci_setup_sync_conn(conn, handle); +} + u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier) { @@ -1040,8 +1168,8 @@ static void hci_req_directed_advertising(struct hci_request *req, } struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, - u8 dst_type, u8 sec_level, u16 conn_timeout, - u8 role, bdaddr_t *direct_rpa) + u8 dst_type, bool dst_resolved, u8 sec_level, + u16 conn_timeout, u8 role, bdaddr_t *direct_rpa) { struct hci_conn_params *params; struct hci_conn *conn; @@ -1078,19 +1206,24 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, return ERR_PTR(-EBUSY); } - /* When given an identity address with existing identity - * resolving key, the connection needs to be established - * to a resolvable random address. - * - * Storing the resolvable random address is required here - * to handle connection failures. The address will later - * be resolved back into the original identity address - * from the connect request. + /* Check if the destination address has been resolved by the controller + * since if it did then the identity address shall be used. */ - irk = hci_find_irk_by_addr(hdev, dst, dst_type); - if (irk && bacmp(&irk->rpa, BDADDR_ANY)) { - dst = &irk->rpa; - dst_type = ADDR_LE_DEV_RANDOM; + if (!dst_resolved) { + /* When given an identity address with existing identity + * resolving key, the connection needs to be established + * to a resolvable random address. + * + * Storing the resolvable random address is required here + * to handle connection failures. The address will later + * be resolved back into the original identity address + * from the connect request. + */ + irk = hci_find_irk_by_addr(hdev, dst, dst_type); + if (irk && bacmp(&irk->rpa, BDADDR_ANY)) { + dst = &irk->rpa; + dst_type = ADDR_LE_DEV_RANDOM; + } } if (conn) { @@ -1319,7 +1452,7 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, } struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, - __u16 setting) + __u16 setting, struct bt_codec *codec) { struct hci_conn *acl; struct hci_conn *sco; @@ -1344,6 +1477,7 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, hci_conn_hold(sco); sco->setting = setting; + sco->codec = *codec; if (acl->state == BT_CONNECTED && (sco->state == BT_OPEN || sco->state == BT_CLOSED)) { diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index e1a545c8a69f..8d33aa64846b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -45,6 +45,7 @@ #include "leds.h" #include "msft.h" #include "aosp.h" +#include "hci_codec.h" static void hci_rx_work(struct work_struct *work); static void hci_cmd_work(struct work_struct *work); @@ -61,130 +62,6 @@ DEFINE_MUTEX(hci_cb_list_lock); /* HCI ID Numbering */ static DEFINE_IDA(hci_index_ida); -/* ---- HCI debugfs entries ---- */ - -static ssize_t dut_mode_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - char buf[3]; - - buf[0] = hci_dev_test_flag(hdev, HCI_DUT_MODE) ? 'Y' : 'N'; - buf[1] = '\n'; - buf[2] = '\0'; - return simple_read_from_buffer(user_buf, count, ppos, buf, 2); -} - -static ssize_t dut_mode_write(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - struct sk_buff *skb; - bool enable; - int err; - - if (!test_bit(HCI_UP, &hdev->flags)) - return -ENETDOWN; - - err = kstrtobool_from_user(user_buf, count, &enable); - if (err) - return err; - - if (enable == hci_dev_test_flag(hdev, HCI_DUT_MODE)) - return -EALREADY; - - hci_req_sync_lock(hdev); - if (enable) - skb = __hci_cmd_sync(hdev, HCI_OP_ENABLE_DUT_MODE, 0, NULL, - HCI_CMD_TIMEOUT); - else - skb = __hci_cmd_sync(hdev, HCI_OP_RESET, 0, NULL, - HCI_CMD_TIMEOUT); - hci_req_sync_unlock(hdev); - - if (IS_ERR(skb)) - return PTR_ERR(skb); - - kfree_skb(skb); - - hci_dev_change_flag(hdev, HCI_DUT_MODE); - - return count; -} - -static const struct file_operations dut_mode_fops = { - .open = simple_open, - .read = dut_mode_read, - .write = dut_mode_write, - .llseek = default_llseek, -}; - -static ssize_t vendor_diag_read(struct file *file, char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - char buf[3]; - - buf[0] = hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) ? 'Y' : 'N'; - buf[1] = '\n'; - buf[2] = '\0'; - return simple_read_from_buffer(user_buf, count, ppos, buf, 2); -} - -static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct hci_dev *hdev = file->private_data; - bool enable; - int err; - - err = kstrtobool_from_user(user_buf, count, &enable); - if (err) - return err; - - /* When the diagnostic flags are not persistent and the transport - * is not active or in user channel operation, then there is no need - * for the vendor callback. Instead just store the desired value and - * the setting will be programmed when the controller gets powered on. - */ - if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) && - (!test_bit(HCI_RUNNING, &hdev->flags) || - hci_dev_test_flag(hdev, HCI_USER_CHANNEL))) - goto done; - - hci_req_sync_lock(hdev); - err = hdev->set_diag(hdev, enable); - hci_req_sync_unlock(hdev); - - if (err < 0) - return err; - -done: - if (enable) - hci_dev_set_flag(hdev, HCI_VENDOR_DIAG); - else - hci_dev_clear_flag(hdev, HCI_VENDOR_DIAG); - - return count; -} - -static const struct file_operations vendor_diag_fops = { - .open = simple_open, - .read = vendor_diag_read, - .write = vendor_diag_write, - .llseek = default_llseek, -}; - -static void hci_debugfs_create_basic(struct hci_dev *hdev) -{ - debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev, - &dut_mode_fops); - - if (hdev->set_diag) - debugfs_create_file("vendor_diag", 0644, hdev->debugfs, hdev, - &vendor_diag_fops); -} - static int hci_reset_req(struct hci_request *req, unsigned long opt) { BT_DBG("%s %ld", req->hdev->name, opt); @@ -838,10 +715,6 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) if (hdev->commands[22] & 0x04) hci_set_event_mask_page_2(req); - /* Read local codec list if the HCI command is supported */ - if (hdev->commands[29] & 0x20) - hci_req_add(req, HCI_OP_READ_LOCAL_CODECS, 0, NULL); - /* Read local pairing options if the HCI command is supported */ if (hdev->commands[41] & 0x08) hci_req_add(req, HCI_OP_READ_LOCAL_PAIRING_OPTS, 0, NULL); @@ -937,6 +810,12 @@ static int __hci_init(struct hci_dev *hdev) if (err < 0) return err; + /* Read local codec list if the HCI command is supported */ + if (hdev->commands[45] & 0x04) + hci_read_supported_codecs_v2(hdev); + else if (hdev->commands[29] & 0x20) + hci_read_supported_codecs(hdev); + /* This function is only called when the controller is actually in * configured state. When the controller is marked as unconfigured, * this initialization procedure is not run. @@ -1343,6 +1222,12 @@ int hci_inquiry(void __user *arg) goto done; } + /* Restrict maximum inquiry length to 60 seconds */ + if (ir.length > 60) { + err = -EINVAL; + goto done; + } + hci_dev_lock(hdev); if (inquiry_cache_age(hdev) > INQUIRY_CACHE_AGE_MAX || inquiry_cache_empty(hdev) || ir.flags & IREQ_CACHE_FLUSH) { @@ -1718,6 +1603,7 @@ static void hci_pend_le_actions_clear(struct hci_dev *hdev) int hci_dev_do_close(struct hci_dev *hdev) { bool auto_off; + int err = 0; BT_DBG("%s %p", hdev->name, hdev); @@ -1727,10 +1613,18 @@ int hci_dev_do_close(struct hci_dev *hdev) hci_request_cancel_all(hdev); hci_req_sync_lock(hdev); + if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && + !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && + test_bit(HCI_UP, &hdev->flags)) { + /* Execute vendor specific shutdown routine */ + if (hdev->shutdown) + err = hdev->shutdown(hdev); + } + if (!test_and_clear_bit(HCI_UP, &hdev->flags)) { cancel_delayed_work_sync(&hdev->cmd_timer); hci_req_sync_unlock(hdev); - return 0; + return err; } hci_leds_update_powered(hdev, false); @@ -1798,14 +1692,6 @@ int hci_dev_do_close(struct hci_dev *hdev) clear_bit(HCI_INIT, &hdev->flags); } - if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && - !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && - test_bit(HCI_UP, &hdev->flags)) { - /* Execute vendor specific shutdown routine */ - if (hdev->shutdown) - hdev->shutdown(hdev); - } - /* flush cmd work */ flush_work(&hdev->cmd_work); @@ -1841,11 +1727,12 @@ int hci_dev_do_close(struct hci_dev *hdev) memset(hdev->eir, 0, sizeof(hdev->eir)); memset(hdev->dev_class, 0, sizeof(hdev->dev_class)); bacpy(&hdev->random_addr, BDADDR_ANY); + hci_codec_list_clear(&hdev->local_codecs); hci_req_sync_unlock(hdev); hci_dev_put(hdev); - return 0; + return err; } int hci_dev_close(__u16 dev) @@ -3074,6 +2961,60 @@ int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance, } /* This function requires the caller holds hdev->lock */ +u32 hci_adv_instance_flags(struct hci_dev *hdev, u8 instance) +{ + u32 flags; + struct adv_info *adv; + + if (instance == 0x00) { + /* Instance 0 always manages the "Tx Power" and "Flags" + * fields + */ + flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS; + + /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting + * corresponds to the "connectable" instance flag. + */ + if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) + flags |= MGMT_ADV_FLAG_CONNECTABLE; + + if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) + flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; + else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) + flags |= MGMT_ADV_FLAG_DISCOV; + + return flags; + } + + adv = hci_find_adv_instance(hdev, instance); + + /* Return 0 when we got an invalid instance identifier. */ + if (!adv) + return 0; + + return adv->flags; +} + +bool hci_adv_instance_is_scannable(struct hci_dev *hdev, u8 instance) +{ + struct adv_info *adv; + + /* Instance 0x00 always set local name */ + if (instance == 0x00) + return true; + + adv = hci_find_adv_instance(hdev, instance); + if (!adv) + return false; + + if (adv->flags & MGMT_ADV_FLAG_APPEARANCE || + adv->flags & MGMT_ADV_FLAG_LOCAL_NAME) + return true; + + return adv->scan_rsp_len ? true : false; +} + +/* This function requires the caller holds hdev->lock */ void hci_adv_monitors_clear(struct hci_dev *hdev) { struct adv_monitor *monitor; @@ -3480,15 +3421,6 @@ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, { struct hci_conn_params *param; - switch (addr_type) { - case ADDR_LE_DEV_PUBLIC_RESOLVED: - addr_type = ADDR_LE_DEV_PUBLIC; - break; - case ADDR_LE_DEV_RANDOM_RESOLVED: - addr_type = ADDR_LE_DEV_RANDOM; - break; - } - list_for_each_entry(param, list, action) { if (bacmp(¶m->addr, addr) == 0 && param->addr_type == addr_type) @@ -3694,55 +3626,12 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, struct hci_dev *hdev = container_of(nb, struct hci_dev, suspend_notifier); int ret = 0; - u8 state = BT_RUNNING; - /* If powering down, wait for completion. */ - if (mgmt_powering_down(hdev)) { - set_bit(SUSPEND_POWERING_DOWN, hdev->suspend_tasks); - ret = hci_suspend_wait_event(hdev); - if (ret) - goto done; - } - - /* Suspend notifier should only act on events when powered. */ - if (!hdev_is_powered(hdev) || - hci_dev_test_flag(hdev, HCI_UNREGISTER)) - goto done; - - if (action == PM_SUSPEND_PREPARE) { - /* Suspend consists of two actions: - * - First, disconnect everything and make the controller not - * connectable (disabling scanning) - * - Second, program event filter/accept list and enable scan - */ - ret = hci_change_suspend_state(hdev, BT_SUSPEND_DISCONNECT); - if (!ret) - state = BT_SUSPEND_DISCONNECT; + if (action == PM_SUSPEND_PREPARE) + ret = hci_suspend_dev(hdev); + else if (action == PM_POST_SUSPEND) + ret = hci_resume_dev(hdev); - /* Only configure accept list if disconnect succeeded and wake - * isn't being prevented. - */ - if (!ret && !(hdev->prevent_wake && hdev->prevent_wake(hdev))) { - ret = hci_change_suspend_state(hdev, - BT_SUSPEND_CONFIGURE_WAKE); - if (!ret) - state = BT_SUSPEND_CONFIGURE_WAKE; - } - - hci_clear_wake_reason(hdev); - mgmt_suspending(hdev, state); - - } else if (action == PM_POST_SUSPEND) { - ret = hci_change_suspend_state(hdev, BT_RUNNING); - - mgmt_resuming(hdev, hdev->wake_reason, &hdev->wake_addr, - hdev->wake_addr_type); - } - -done: - /* We always allow suspend even if suspend preparation failed and - * attempt to recover in resume. - */ if (ret) bt_dev_err(hdev, "Suspend notifier action (%lu) failed: %d", action, ret); @@ -3751,11 +3640,18 @@ done: } /* Alloc HCI device */ -struct hci_dev *hci_alloc_dev(void) +struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) { struct hci_dev *hdev; + unsigned int alloc_size; - hdev = kzalloc(sizeof(*hdev), GFP_KERNEL); + alloc_size = sizeof(*hdev); + if (sizeof_priv) { + /* Fixme: May need ALIGN-ment? */ + alloc_size += sizeof_priv; + } + + hdev = kzalloc(alloc_size, GFP_KERNEL); if (!hdev) return NULL; @@ -3843,6 +3739,7 @@ struct hci_dev *hci_alloc_dev(void) INIT_LIST_HEAD(&hdev->adv_instances); INIT_LIST_HEAD(&hdev->blocked_keys); + INIT_LIST_HEAD(&hdev->local_codecs); INIT_WORK(&hdev->rx_work, hci_rx_work); INIT_WORK(&hdev->cmd_work, hci_cmd_work); INIT_WORK(&hdev->tx_work, hci_tx_work); @@ -3869,7 +3766,7 @@ struct hci_dev *hci_alloc_dev(void) return hdev; } -EXPORT_SYMBOL(hci_alloc_dev); +EXPORT_SYMBOL(hci_alloc_dev_priv); /* Free HCI device */ void hci_free_dev(struct hci_dev *hdev) @@ -3980,6 +3877,7 @@ int hci_register_dev(struct hci_dev *hdev) queue_work(hdev->req_workqueue, &hdev->power_on); idr_init(&hdev->adv_monitors_idr); + msft_register(hdev); return id; @@ -4012,6 +3910,8 @@ void hci_unregister_dev(struct hci_dev *hdev) cancel_work_sync(&hdev->suspend_prepare); } + msft_unregister(hdev); + hci_dev_do_close(hdev); if (!test_bit(HCI_INIT, &hdev->flags) && @@ -4034,13 +3934,13 @@ void hci_unregister_dev(struct hci_dev *hdev) } device_del(&hdev->dev); - /* Actual cleanup is deferred until hci_cleanup_dev(). */ + /* Actual cleanup is deferred until hci_release_dev(). */ hci_dev_put(hdev); } EXPORT_SYMBOL(hci_unregister_dev); -/* Cleanup HCI device */ -void hci_cleanup_dev(struct hci_dev *hdev) +/* Release HCI device */ +void hci_release_dev(struct hci_dev *hdev) { debugfs_remove_recursive(hdev->debugfs); kfree_const(hdev->hw_info); @@ -4067,21 +3967,85 @@ void hci_cleanup_dev(struct hci_dev *hdev) hci_dev_unlock(hdev); ida_simple_remove(&hci_index_ida, hdev->id); + kfree(hdev); } +EXPORT_SYMBOL(hci_release_dev); /* Suspend HCI device */ int hci_suspend_dev(struct hci_dev *hdev) { + int ret; + u8 state = BT_RUNNING; + + bt_dev_dbg(hdev, ""); + + /* Suspend should only act on when powered. */ + if (!hdev_is_powered(hdev) || + hci_dev_test_flag(hdev, HCI_UNREGISTER)) + return 0; + + /* If powering down, wait for completion. */ + if (mgmt_powering_down(hdev)) { + set_bit(SUSPEND_POWERING_DOWN, hdev->suspend_tasks); + ret = hci_suspend_wait_event(hdev); + if (ret) + goto done; + } + + /* Suspend consists of two actions: + * - First, disconnect everything and make the controller not + * connectable (disabling scanning) + * - Second, program event filter/accept list and enable scan + */ + ret = hci_change_suspend_state(hdev, BT_SUSPEND_DISCONNECT); + if (ret) + goto clear; + + state = BT_SUSPEND_DISCONNECT; + + /* Only configure accept list if device may wakeup. */ + if (hdev->wakeup && hdev->wakeup(hdev)) { + ret = hci_change_suspend_state(hdev, BT_SUSPEND_CONFIGURE_WAKE); + if (!ret) + state = BT_SUSPEND_CONFIGURE_WAKE; + } + +clear: + hci_clear_wake_reason(hdev); + mgmt_suspending(hdev, state); + +done: + /* We always allow suspend even if suspend preparation failed and + * attempt to recover in resume. + */ hci_sock_dev_event(hdev, HCI_DEV_SUSPEND); - return 0; + return ret; } EXPORT_SYMBOL(hci_suspend_dev); /* Resume HCI device */ int hci_resume_dev(struct hci_dev *hdev) { + int ret; + + bt_dev_dbg(hdev, ""); + + /* Resume should only act on when powered. */ + if (!hdev_is_powered(hdev) || + hci_dev_test_flag(hdev, HCI_UNREGISTER)) + return 0; + + /* If powering down don't attempt to resume */ + if (mgmt_powering_down(hdev)) + return 0; + + ret = hci_change_suspend_state(hdev, BT_RUNNING); + + mgmt_resuming(hdev, hdev->wake_reason, &hdev->wake_addr, + hdev->wake_addr_type); + hci_sock_dev_event(hdev, HCI_DEV_RESUME); - return 0; + return ret; } EXPORT_SYMBOL(hci_resume_dev); diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 841393389f7b..902b40a90b91 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -27,6 +27,7 @@ #include <net/bluetooth/hci_core.h> #include "smp.h" +#include "hci_request.h" #include "hci_debugfs.h" #define DEFINE_QUIRK_ATTRIBUTE(__name, __quirk) \ @@ -1250,3 +1251,125 @@ void hci_debugfs_create_conn(struct hci_conn *conn) snprintf(name, sizeof(name), "%u", conn->handle); conn->debugfs = debugfs_create_dir(name, hdev->debugfs); } + +static ssize_t dut_mode_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[3]; + + buf[0] = hci_dev_test_flag(hdev, HCI_DUT_MODE) ? 'Y' : 'N'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t dut_mode_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + struct sk_buff *skb; + bool enable; + int err; + + if (!test_bit(HCI_UP, &hdev->flags)) + return -ENETDOWN; + + err = kstrtobool_from_user(user_buf, count, &enable); + if (err) + return err; + + if (enable == hci_dev_test_flag(hdev, HCI_DUT_MODE)) + return -EALREADY; + + hci_req_sync_lock(hdev); + if (enable) + skb = __hci_cmd_sync(hdev, HCI_OP_ENABLE_DUT_MODE, 0, NULL, + HCI_CMD_TIMEOUT); + else + skb = __hci_cmd_sync(hdev, HCI_OP_RESET, 0, NULL, + HCI_CMD_TIMEOUT); + hci_req_sync_unlock(hdev); + + if (IS_ERR(skb)) + return PTR_ERR(skb); + + kfree_skb(skb); + + hci_dev_change_flag(hdev, HCI_DUT_MODE); + + return count; +} + +static const struct file_operations dut_mode_fops = { + .open = simple_open, + .read = dut_mode_read, + .write = dut_mode_write, + .llseek = default_llseek, +}; + +static ssize_t vendor_diag_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[3]; + + buf[0] = hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) ? 'Y' : 'N'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + bool enable; + int err; + + err = kstrtobool_from_user(user_buf, count, &enable); + if (err) + return err; + + /* When the diagnostic flags are not persistent and the transport + * is not active or in user channel operation, then there is no need + * for the vendor callback. Instead just store the desired value and + * the setting will be programmed when the controller gets powered on. + */ + if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) && + (!test_bit(HCI_RUNNING, &hdev->flags) || + hci_dev_test_flag(hdev, HCI_USER_CHANNEL))) + goto done; + + hci_req_sync_lock(hdev); + err = hdev->set_diag(hdev, enable); + hci_req_sync_unlock(hdev); + + if (err < 0) + return err; + +done: + if (enable) + hci_dev_set_flag(hdev, HCI_VENDOR_DIAG); + else + hci_dev_clear_flag(hdev, HCI_VENDOR_DIAG); + + return count; +} + +static const struct file_operations vendor_diag_fops = { + .open = simple_open, + .read = vendor_diag_read, + .write = vendor_diag_write, + .llseek = default_llseek, +}; + +void hci_debugfs_create_basic(struct hci_dev *hdev) +{ + debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev, + &dut_mode_fops); + + if (hdev->set_diag) + debugfs_create_file("vendor_diag", 0644, hdev->debugfs, hdev, + &vendor_diag_fops); +} diff --git a/net/bluetooth/hci_debugfs.h b/net/bluetooth/hci_debugfs.h index 4444dc8cedc2..9a8a7c93bb12 100644 --- a/net/bluetooth/hci_debugfs.h +++ b/net/bluetooth/hci_debugfs.h @@ -26,6 +26,7 @@ void hci_debugfs_create_common(struct hci_dev *hdev); void hci_debugfs_create_bredr(struct hci_dev *hdev); void hci_debugfs_create_le(struct hci_dev *hdev); void hci_debugfs_create_conn(struct hci_conn *conn); +void hci_debugfs_create_basic(struct hci_dev *hdev); #else @@ -45,4 +46,8 @@ static inline void hci_debugfs_create_conn(struct hci_conn *conn) { } +static inline void hci_debugfs_create_basic(struct hci_dev *hdev) +{ +} + #endif diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 1c3018202564..7d0db1ca1248 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -36,10 +36,13 @@ #include "amp.h" #include "smp.h" #include "msft.h" +#include "eir.h" #define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00" +#define secs_to_jiffies(_secs) msecs_to_jiffies((_secs) * 1000) + /* Handle HCI Event packets */ static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb, @@ -1171,6 +1174,12 @@ static void hci_cc_le_set_random_addr(struct hci_dev *hdev, struct sk_buff *skb) bacpy(&hdev->random_addr, sent); + if (!bacmp(&hdev->rpa, sent)) { + hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED); + queue_delayed_work(hdev->workqueue, &hdev->rpa_expired, + secs_to_jiffies(hdev->rpa_timeout)); + } + hci_dev_unlock(hdev); } @@ -1201,24 +1210,30 @@ static void hci_cc_le_set_adv_set_random_addr(struct hci_dev *hdev, { __u8 status = *((__u8 *) skb->data); struct hci_cp_le_set_adv_set_rand_addr *cp; - struct adv_info *adv_instance; + struct adv_info *adv; if (status) return; cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_SET_RAND_ADDR); - if (!cp) + /* Update only in case the adv instance since handle 0x00 shall be using + * HCI_OP_LE_SET_RANDOM_ADDR since that allows both extended and + * non-extended adverting. + */ + if (!cp || !cp->handle) return; hci_dev_lock(hdev); - if (!cp->handle) { - /* Store in hdev for instance 0 (Set adv and Directed advs) */ - bacpy(&hdev->random_addr, &cp->bdaddr); - } else { - adv_instance = hci_find_adv_instance(hdev, cp->handle); - if (adv_instance) - bacpy(&adv_instance->random_addr, &cp->bdaddr); + adv = hci_find_adv_instance(hdev, cp->handle); + if (adv) { + bacpy(&adv->random_addr, &cp->bdaddr); + if (!bacmp(&hdev->rpa, &cp->bdaddr)) { + adv->rpa_expired = false; + queue_delayed_work(hdev->workqueue, + &adv->rpa_expired_cb, + secs_to_jiffies(hdev->rpa_timeout)); + } } hci_dev_unlock(hdev); @@ -1277,7 +1292,9 @@ static void hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_cp_le_set_ext_adv_enable *cp; + struct hci_cp_ext_adv_set *set; __u8 status = *((__u8 *) skb->data); + struct adv_info *adv = NULL, *n; BT_DBG("%s status 0x%2.2x", hdev->name, status); @@ -1288,22 +1305,48 @@ static void hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, if (!cp) return; + set = (void *)cp->data; + hci_dev_lock(hdev); + if (cp->num_of_sets) + adv = hci_find_adv_instance(hdev, set->handle); + if (cp->enable) { struct hci_conn *conn; hci_dev_set_flag(hdev, HCI_LE_ADV); + if (adv) + adv->enabled = true; + conn = hci_lookup_le_connect(hdev); if (conn) queue_delayed_work(hdev->workqueue, &conn->le_conn_timeout, conn->conn_timeout); } else { + if (adv) { + adv->enabled = false; + /* If just one instance was disabled check if there are + * any other instance enabled before clearing HCI_LE_ADV + */ + list_for_each_entry_safe(adv, n, &hdev->adv_instances, + list) { + if (adv->enabled) + goto unlock; + } + } else { + /* All instances shall be considered disabled */ + list_for_each_entry_safe(adv, n, &hdev->adv_instances, + list) + adv->enabled = false; + } + hci_dev_clear_flag(hdev, HCI_LE_ADV); } +unlock: hci_dev_unlock(hdev); } @@ -2236,6 +2279,41 @@ static void hci_cs_setup_sync_conn(struct hci_dev *hdev, __u8 status) hci_dev_unlock(hdev); } +static void hci_cs_enhanced_setup_sync_conn(struct hci_dev *hdev, __u8 status) +{ + struct hci_cp_enhanced_setup_sync_conn *cp; + struct hci_conn *acl, *sco; + __u16 handle; + + bt_dev_dbg(hdev, "status 0x%2.2x", status); + + if (!status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_ENHANCED_SETUP_SYNC_CONN); + if (!cp) + return; + + handle = __le16_to_cpu(cp->handle); + + bt_dev_dbg(hdev, "handle 0x%4.4x", handle); + + hci_dev_lock(hdev); + + acl = hci_conn_hash_lookup_handle(hdev, handle); + if (acl) { + sco = acl->link; + if (sco) { + sco->state = BT_CLOSED; + + hci_connect_cfm(sco, status); + hci_conn_del(sco); + } + } + + hci_dev_unlock(hdev); +} + static void hci_cs_sniff_mode(struct hci_dev *hdev, __u8 status) { struct hci_cp_sniff_mode *cp; @@ -2306,24 +2384,47 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); if (conn) { - u8 type = conn->type; - mgmt_disconnect_failed(hdev, &conn->dst, conn->type, conn->dst_type, status); + if (conn->type == LE_LINK && conn->role == HCI_ROLE_SLAVE) { + hdev->cur_adv_instance = conn->adv_instance; + hci_req_reenable_advertising(hdev); + } + /* If the disconnection failed for any reason, the upper layer * does not retry to disconnect in current implementation. * Hence, we need to do some basic cleanup here and re-enable * advertising if necessary. */ hci_conn_del(conn); - if (type == LE_LINK) - hci_req_reenable_advertising(hdev); } hci_dev_unlock(hdev); } +static u8 ev_bdaddr_type(struct hci_dev *hdev, u8 type, bool *resolved) +{ + /* When using controller based address resolution, then the new + * address types 0x02 and 0x03 are used. These types need to be + * converted back into either public address or random address type + */ + switch (type) { + case ADDR_LE_DEV_PUBLIC_RESOLVED: + if (resolved) + *resolved = true; + return ADDR_LE_DEV_PUBLIC; + case ADDR_LE_DEV_RANDOM_RESOLVED: + if (resolved) + *resolved = true; + return ADDR_LE_DEV_RANDOM; + } + + if (resolved) + *resolved = false; + return type; +} + static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr, u8 peer_addr_type, u8 own_address_type, u8 filter_policy) @@ -2335,21 +2436,7 @@ static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr, if (!conn) return; - /* When using controller based address resolution, then the new - * address types 0x02 and 0x03 are used. These types need to be - * converted back into either public address or random address type - */ - if (use_ll_privacy(hdev) && - hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) { - switch (own_address_type) { - case ADDR_LE_DEV_PUBLIC_RESOLVED: - own_address_type = ADDR_LE_DEV_PUBLIC; - break; - case ADDR_LE_DEV_RANDOM_RESOLVED: - own_address_type = ADDR_LE_DEV_RANDOM; - break; - } - } + own_address_type = ev_bdaddr_type(hdev, own_address_type, NULL); /* Store the initiator and responder address information which * is needed for SMP. These values will not change during the @@ -2844,7 +2931,6 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) struct hci_conn_params *params; struct hci_conn *conn; bool mgmt_connected; - u8 type; BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); @@ -2899,10 +2985,7 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) } } - type = conn->type; - hci_disconn_cfm(conn, ev->reason); - hci_conn_del(conn); /* The suspend notifier is waiting for all devices to disconnect so * clear the bit from pending tasks and inform the wait queue. @@ -2922,8 +3005,12 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) * or until a connection is created or until the Advertising * is timed out due to Directed Advertising." */ - if (type == LE_LINK) + if (conn->type == LE_LINK && conn->role == HCI_ROLE_SLAVE) { + hdev->cur_adv_instance = conn->adv_instance; hci_req_reenable_advertising(hdev); + } + + hci_conn_del(conn); unlock: hci_dev_unlock(hdev); @@ -3268,11 +3355,9 @@ unlock: hci_dev_unlock(hdev); } -static inline void handle_cmd_cnt_and_timer(struct hci_dev *hdev, - u16 opcode, u8 ncmd) +static inline void handle_cmd_cnt_and_timer(struct hci_dev *hdev, u8 ncmd) { - if (opcode != HCI_OP_NOP) - cancel_delayed_work(&hdev->cmd_timer); + cancel_delayed_work(&hdev->cmd_timer); if (!test_bit(HCI_RESET, &hdev->flags)) { if (ncmd) { @@ -3647,7 +3732,7 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, break; } - handle_cmd_cnt_and_timer(hdev, *opcode, ev->ncmd); + handle_cmd_cnt_and_timer(hdev, ev->ncmd); hci_req_cmd_complete(hdev, *opcode, *status, req_complete, req_complete_skb); @@ -3715,6 +3800,10 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cs_setup_sync_conn(hdev, ev->status); break; + case HCI_OP_ENHANCED_SETUP_SYNC_CONN: + hci_cs_enhanced_setup_sync_conn(hdev, ev->status); + break; + case HCI_OP_SNIFF_MODE: hci_cs_sniff_mode(hdev, ev->status); break; @@ -3748,7 +3837,7 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb, break; } - handle_cmd_cnt_and_timer(hdev, *opcode, ev->ncmd); + handle_cmd_cnt_and_timer(hdev, ev->ncmd); /* Indicate request completion if the command failed. Also, if * we're not waiting for a special event and we get a success @@ -4356,6 +4445,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, { struct hci_ev_sync_conn_complete *ev = (void *) skb->data; struct hci_conn *conn; + unsigned int notify_evt; BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); @@ -4382,6 +4472,21 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, switch (ev->status) { case 0x00: + /* The synchronous connection complete event should only be + * sent once per new connection. Receiving a successful + * complete event when the connection status is already + * BT_CONNECTED means that the device is misbehaving and sent + * multiple complete event packets for the same new connection. + * + * Registering the device more than once can corrupt kernel + * memory, hence upon detecting this invalid event, we report + * an error and ignore the packet. + */ + if (conn->state == BT_CONNECTED) { + bt_dev_err(hdev, "Ignoring connect complete event for existing connection"); + goto unlock; + } + conn->handle = __le16_to_cpu(ev->handle); conn->state = BT_CONNECTED; conn->type = ev->link_type; @@ -4415,15 +4520,21 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, switch (ev->air_mode) { case 0x02: - if (hdev->notify) - hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_CVSD); + notify_evt = HCI_NOTIFY_ENABLE_SCO_CVSD; break; case 0x03: - if (hdev->notify) - hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_TRANSP); + notify_evt = HCI_NOTIFY_ENABLE_SCO_TRANSP; break; } + /* Notify only in case of SCO over HCI transport data path which + * is zero and non-zero value shall be non-HCI transport data path + */ + if (conn->codec.data_path == 0) { + if (hdev->notify) + hdev->notify(hdev, notify_evt); + } + hci_connect_cfm(conn, ev->status); if (ev->status) hci_conn_del(conn); @@ -5104,9 +5215,64 @@ static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev, } #endif +static void le_conn_update_addr(struct hci_conn *conn, bdaddr_t *bdaddr, + u8 bdaddr_type, bdaddr_t *local_rpa) +{ + if (conn->out) { + conn->dst_type = bdaddr_type; + conn->resp_addr_type = bdaddr_type; + bacpy(&conn->resp_addr, bdaddr); + + /* Check if the controller has set a Local RPA then it must be + * used instead or hdev->rpa. + */ + if (local_rpa && bacmp(local_rpa, BDADDR_ANY)) { + conn->init_addr_type = ADDR_LE_DEV_RANDOM; + bacpy(&conn->init_addr, local_rpa); + } else if (hci_dev_test_flag(conn->hdev, HCI_PRIVACY)) { + conn->init_addr_type = ADDR_LE_DEV_RANDOM; + bacpy(&conn->init_addr, &conn->hdev->rpa); + } else { + hci_copy_identity_address(conn->hdev, &conn->init_addr, + &conn->init_addr_type); + } + } else { + conn->resp_addr_type = conn->hdev->adv_addr_type; + /* Check if the controller has set a Local RPA then it must be + * used instead or hdev->rpa. + */ + if (local_rpa && bacmp(local_rpa, BDADDR_ANY)) { + conn->resp_addr_type = ADDR_LE_DEV_RANDOM; + bacpy(&conn->resp_addr, local_rpa); + } else if (conn->hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) { + /* In case of ext adv, resp_addr will be updated in + * Adv Terminated event. + */ + if (!ext_adv_capable(conn->hdev)) + bacpy(&conn->resp_addr, + &conn->hdev->random_addr); + } else { + bacpy(&conn->resp_addr, &conn->hdev->bdaddr); + } + + conn->init_addr_type = bdaddr_type; + bacpy(&conn->init_addr, bdaddr); + + /* For incoming connections, set the default minimum + * and maximum connection interval. They will be used + * to check if the parameters are in range and if not + * trigger the connection update procedure. + */ + conn->le_conn_min_interval = conn->hdev->le_conn_min_interval; + conn->le_conn_max_interval = conn->hdev->le_conn_max_interval; + } +} + static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, - bdaddr_t *bdaddr, u8 bdaddr_type, u8 role, u16 handle, - u16 interval, u16 latency, u16 supervision_timeout) + bdaddr_t *bdaddr, u8 bdaddr_type, + bdaddr_t *local_rpa, u8 role, u16 handle, + u16 interval, u16 latency, + u16 supervision_timeout) { struct hci_conn_params *params; struct hci_conn *conn; @@ -5154,32 +5320,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, cancel_delayed_work(&conn->le_conn_timeout); } - if (!conn->out) { - /* Set the responder (our side) address type based on - * the advertising address type. - */ - conn->resp_addr_type = hdev->adv_addr_type; - if (hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) { - /* In case of ext adv, resp_addr will be updated in - * Adv Terminated event. - */ - if (!ext_adv_capable(hdev)) - bacpy(&conn->resp_addr, &hdev->random_addr); - } else { - bacpy(&conn->resp_addr, &hdev->bdaddr); - } - - conn->init_addr_type = bdaddr_type; - bacpy(&conn->init_addr, bdaddr); - - /* For incoming connections, set the default minimum - * and maximum connection interval. They will be used - * to check if the parameters are in range and if not - * trigger the connection update procedure. - */ - conn->le_conn_min_interval = hdev->le_conn_min_interval; - conn->le_conn_max_interval = hdev->le_conn_max_interval; - } + le_conn_update_addr(conn, bdaddr, bdaddr_type, local_rpa); /* Lookup the identity address from the stored connection * address and address type. @@ -5196,22 +5337,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, conn->dst_type = irk->addr_type; } - /* When using controller based address resolution, then the new - * address types 0x02 and 0x03 are used. These types need to be - * converted back into either public address or random address type - */ - if (use_ll_privacy(hdev) && - hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY) && - hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) { - switch (conn->dst_type) { - case ADDR_LE_DEV_PUBLIC_RESOLVED: - conn->dst_type = ADDR_LE_DEV_PUBLIC; - break; - case ADDR_LE_DEV_RANDOM_RESOLVED: - conn->dst_type = ADDR_LE_DEV_RANDOM; - break; - } - } + conn->dst_type = ev_bdaddr_type(hdev, conn->dst_type, NULL); if (status) { hci_le_conn_failed(conn, status); @@ -5236,6 +5362,13 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, conn->handle = handle; conn->state = BT_CONFIG; + /* Store current advertising instance as connection advertising instance + * when sotfware rotation is in use so it can be re-enabled when + * disconnected. + */ + if (!ext_adv_capable(hdev)) + conn->adv_instance = hdev->cur_adv_instance; + conn->le_conn_interval = interval; conn->le_conn_latency = latency; conn->le_supv_timeout = supervision_timeout; @@ -5290,7 +5423,7 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type, - ev->role, le16_to_cpu(ev->handle), + NULL, ev->role, le16_to_cpu(ev->handle), le16_to_cpu(ev->interval), le16_to_cpu(ev->latency), le16_to_cpu(ev->supervision_timeout)); @@ -5304,7 +5437,7 @@ static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type, - ev->role, le16_to_cpu(ev->handle), + &ev->local_rpa, ev->role, le16_to_cpu(ev->handle), le16_to_cpu(ev->interval), le16_to_cpu(ev->latency), le16_to_cpu(ev->supervision_timeout)); @@ -5319,13 +5452,13 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_evt_le_ext_adv_set_term *ev = (void *) skb->data; struct hci_conn *conn; + struct adv_info *adv; BT_DBG("%s status 0x%2.2x", hdev->name, ev->status); - if (ev->status) { - struct adv_info *adv; + adv = hci_find_adv_instance(hdev, ev->handle); - adv = hci_find_adv_instance(hdev, ev->handle); + if (ev->status) { if (!adv) return; @@ -5336,11 +5469,18 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb) return; } + if (adv) + adv->enabled = false; + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->conn_handle)); if (conn) { - struct adv_info *adv_instance; + /* Store handle in the connection so the correct advertising + * instance can be re-enabled when disconnected. + */ + conn->adv_instance = ev->handle; - if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM) + if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM || + bacmp(&conn->resp_addr, BDADDR_ANY)) return; if (!ev->handle) { @@ -5348,9 +5488,8 @@ static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb) return; } - adv_instance = hci_find_adv_instance(hdev, ev->handle); - if (adv_instance) - bacpy(&conn->resp_addr, &adv_instance->random_addr); + if (adv) + bacpy(&conn->resp_addr, &adv->random_addr); } } @@ -5380,8 +5519,8 @@ static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, /* This function requires the caller holds hdev->lock */ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, bdaddr_t *addr, - u8 addr_type, u8 adv_type, - bdaddr_t *direct_rpa) + u8 addr_type, bool addr_resolved, + u8 adv_type, bdaddr_t *direct_rpa) { struct hci_conn *conn; struct hci_conn_params *params; @@ -5433,9 +5572,9 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, } } - conn = hci_connect_le(hdev, addr, addr_type, BT_SECURITY_LOW, - hdev->def_le_autoconnect_timeout, HCI_ROLE_MASTER, - direct_rpa); + conn = hci_connect_le(hdev, addr, addr_type, addr_resolved, + BT_SECURITY_LOW, hdev->def_le_autoconnect_timeout, + HCI_ROLE_MASTER, direct_rpa); if (!IS_ERR(conn)) { /* If HCI_AUTO_CONN_EXPLICIT is set, conn is already owned * by higher layer that tried to connect, if no then @@ -5476,7 +5615,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, struct discovery_state *d = &hdev->discovery; struct smp_irk *irk; struct hci_conn *conn; - bool match; + bool match, bdaddr_resolved; u32 flags; u8 *ptr; @@ -5520,6 +5659,9 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, * controller address. */ if (direct_addr) { + direct_addr_type = ev_bdaddr_type(hdev, direct_addr_type, + &bdaddr_resolved); + /* Only resolvable random addresses are valid for these * kind of reports and others can be ignored. */ @@ -5547,13 +5689,15 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, bdaddr_type = irk->addr_type; } + bdaddr_type = ev_bdaddr_type(hdev, bdaddr_type, &bdaddr_resolved); + /* Check if we have been requested to connect to this device. * * direct_addr is set only for directed advertising reports (it is NULL * for advertising reports) and is already verified to be RPA above. */ - conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, type, - direct_addr); + conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, bdaddr_resolved, + type, direct_addr); if (!ext_adv && conn && type == LE_ADV_IND && len <= HCI_MAX_AD_LENGTH) { /* Store report for later inclusion by * mgmt_device_connected diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 1d14adc023e9..92611bfc0b9e 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -30,6 +30,7 @@ #include "smp.h" #include "hci_request.h" #include "msft.h" +#include "eir.h" #define HCI_REQ_DONE 0 #define HCI_REQ_PEND 1 @@ -521,164 +522,6 @@ void __hci_req_update_name(struct hci_request *req) hci_req_add(req, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp); } -#define PNP_INFO_SVCLASS_ID 0x1200 - -static u8 *create_uuid16_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) -{ - u8 *ptr = data, *uuids_start = NULL; - struct bt_uuid *uuid; - - if (len < 4) - return ptr; - - list_for_each_entry(uuid, &hdev->uuids, list) { - u16 uuid16; - - if (uuid->size != 16) - continue; - - uuid16 = get_unaligned_le16(&uuid->uuid[12]); - if (uuid16 < 0x1100) - continue; - - if (uuid16 == PNP_INFO_SVCLASS_ID) - continue; - - if (!uuids_start) { - uuids_start = ptr; - uuids_start[0] = 1; - uuids_start[1] = EIR_UUID16_ALL; - ptr += 2; - } - - /* Stop if not enough space to put next UUID */ - if ((ptr - data) + sizeof(u16) > len) { - uuids_start[1] = EIR_UUID16_SOME; - break; - } - - *ptr++ = (uuid16 & 0x00ff); - *ptr++ = (uuid16 & 0xff00) >> 8; - uuids_start[0] += sizeof(uuid16); - } - - return ptr; -} - -static u8 *create_uuid32_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) -{ - u8 *ptr = data, *uuids_start = NULL; - struct bt_uuid *uuid; - - if (len < 6) - return ptr; - - list_for_each_entry(uuid, &hdev->uuids, list) { - if (uuid->size != 32) - continue; - - if (!uuids_start) { - uuids_start = ptr; - uuids_start[0] = 1; - uuids_start[1] = EIR_UUID32_ALL; - ptr += 2; - } - - /* Stop if not enough space to put next UUID */ - if ((ptr - data) + sizeof(u32) > len) { - uuids_start[1] = EIR_UUID32_SOME; - break; - } - - memcpy(ptr, &uuid->uuid[12], sizeof(u32)); - ptr += sizeof(u32); - uuids_start[0] += sizeof(u32); - } - - return ptr; -} - -static u8 *create_uuid128_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len) -{ - u8 *ptr = data, *uuids_start = NULL; - struct bt_uuid *uuid; - - if (len < 18) - return ptr; - - list_for_each_entry(uuid, &hdev->uuids, list) { - if (uuid->size != 128) - continue; - - if (!uuids_start) { - uuids_start = ptr; - uuids_start[0] = 1; - uuids_start[1] = EIR_UUID128_ALL; - ptr += 2; - } - - /* Stop if not enough space to put next UUID */ - if ((ptr - data) + 16 > len) { - uuids_start[1] = EIR_UUID128_SOME; - break; - } - - memcpy(ptr, uuid->uuid, 16); - ptr += 16; - uuids_start[0] += 16; - } - - return ptr; -} - -static void create_eir(struct hci_dev *hdev, u8 *data) -{ - u8 *ptr = data; - size_t name_len; - - name_len = strlen(hdev->dev_name); - - if (name_len > 0) { - /* EIR Data type */ - if (name_len > 48) { - name_len = 48; - ptr[1] = EIR_NAME_SHORT; - } else - ptr[1] = EIR_NAME_COMPLETE; - - /* EIR Data length */ - ptr[0] = name_len + 1; - - memcpy(ptr + 2, hdev->dev_name, name_len); - - ptr += (name_len + 2); - } - - if (hdev->inq_tx_power != HCI_TX_POWER_INVALID) { - ptr[0] = 2; - ptr[1] = EIR_TX_POWER; - ptr[2] = (u8) hdev->inq_tx_power; - - ptr += 3; - } - - if (hdev->devid_source > 0) { - ptr[0] = 9; - ptr[1] = EIR_DEVICE_ID; - - put_unaligned_le16(hdev->devid_source, ptr + 2); - put_unaligned_le16(hdev->devid_vendor, ptr + 4); - put_unaligned_le16(hdev->devid_product, ptr + 6); - put_unaligned_le16(hdev->devid_version, ptr + 8); - - ptr += 10; - } - - ptr = create_uuid16_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); - ptr = create_uuid32_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); - ptr = create_uuid128_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data)); -} - void __hci_req_update_eir(struct hci_request *req) { struct hci_dev *hdev = req->hdev; @@ -698,7 +541,7 @@ void __hci_req_update_eir(struct hci_request *req) memset(&cp, 0, sizeof(cp)); - create_eir(hdev, cp.data); + eir_create(hdev, cp.data); if (memcmp(cp.data, hdev->eir, sizeof(cp.data)) == 0) return; @@ -1134,25 +977,6 @@ void hci_req_add_le_passive_scan(struct hci_request *req) addr_resolv); } -static bool adv_instance_is_scannable(struct hci_dev *hdev, u8 instance) -{ - struct adv_info *adv_instance; - - /* Instance 0x00 always set local name */ - if (instance == 0x00) - return true; - - adv_instance = hci_find_adv_instance(hdev, instance); - if (!adv_instance) - return false; - - if (adv_instance->flags & MGMT_ADV_FLAG_APPEARANCE || - adv_instance->flags & MGMT_ADV_FLAG_LOCAL_NAME) - return true; - - return adv_instance->scan_rsp_len ? true : false; -} - static void hci_req_clear_event_filter(struct hci_request *req) { struct hci_cp_set_event_filter f; @@ -1281,21 +1105,24 @@ static void suspend_req_complete(struct hci_dev *hdev, u8 status, u16 opcode) } } -static void hci_req_add_set_adv_filter_enable(struct hci_request *req, - bool enable) +static void hci_req_prepare_adv_monitor_suspend(struct hci_request *req, + bool suspending) { struct hci_dev *hdev = req->hdev; switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_MSFT: - msft_req_add_set_filter_enable(req, enable); + if (suspending) + msft_suspend(hdev); + else + msft_resume(hdev); break; default: return; } /* No need to block when enabling since it's on resume path */ - if (hdev->suspended && !enable) + if (hdev->suspended && suspending) set_bit(SUSPEND_SET_ADV_FILTER, hdev->suspend_tasks); } @@ -1362,7 +1189,7 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) } /* Disable advertisement filters */ - hci_req_add_set_adv_filter_enable(&req, false); + hci_req_prepare_adv_monitor_suspend(&req, true); /* Prevent disconnects from causing scanning to be re-enabled */ hdev->scanning_paused = true; @@ -1404,7 +1231,7 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) /* Reset passive/background scanning to normal */ __hci_update_background_scan(&req); /* Enable all of the advertisement filters */ - hci_req_add_set_adv_filter_enable(&req, true); + hci_req_prepare_adv_monitor_suspend(&req, false); /* Unpause directed advertising */ hdev->advertising_paused = false; @@ -1442,7 +1269,7 @@ done: static bool adv_cur_instance_is_scannable(struct hci_dev *hdev) { - return adv_instance_is_scannable(hdev, hdev->cur_adv_instance); + return hci_adv_instance_is_scannable(hdev, hdev->cur_adv_instance); } void __hci_req_disable_advertising(struct hci_request *req) @@ -1457,40 +1284,6 @@ void __hci_req_disable_advertising(struct hci_request *req) } } -static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance) -{ - u32 flags; - struct adv_info *adv_instance; - - if (instance == 0x00) { - /* Instance 0 always manages the "Tx Power" and "Flags" - * fields - */ - flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS; - - /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting - * corresponds to the "connectable" instance flag. - */ - if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) - flags |= MGMT_ADV_FLAG_CONNECTABLE; - - if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) - flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; - else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) - flags |= MGMT_ADV_FLAG_DISCOV; - - return flags; - } - - adv_instance = hci_find_adv_instance(hdev, instance); - - /* Return 0 when we got an invalid instance identifier. */ - if (!adv_instance) - return 0; - - return adv_instance->flags; -} - static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags) { /* If privacy is not enabled don't use RPA */ @@ -1555,15 +1348,15 @@ static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable) void __hci_req_enable_advertising(struct hci_request *req) { struct hci_dev *hdev = req->hdev; - struct adv_info *adv_instance; + struct adv_info *adv; struct hci_cp_le_set_adv_param cp; u8 own_addr_type, enable = 0x01; bool connectable; u16 adv_min_interval, adv_max_interval; u32 flags; - flags = get_adv_instance_flags(hdev, hdev->cur_adv_instance); - adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance); + flags = hci_adv_instance_flags(hdev, hdev->cur_adv_instance); + adv = hci_find_adv_instance(hdev, hdev->cur_adv_instance); /* If the "connectable" instance flag was not set, then choose between * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. @@ -1595,9 +1388,9 @@ void __hci_req_enable_advertising(struct hci_request *req) memset(&cp, 0, sizeof(cp)); - if (adv_instance) { - adv_min_interval = adv_instance->min_interval; - adv_max_interval = adv_instance->max_interval; + if (adv) { + adv_min_interval = adv->min_interval; + adv_max_interval = adv->max_interval; } else { adv_min_interval = hdev->le_adv_min_interval; adv_max_interval = hdev->le_adv_max_interval; @@ -1628,85 +1421,6 @@ void __hci_req_enable_advertising(struct hci_request *req) hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); } -u8 append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len) -{ - size_t short_len; - size_t complete_len; - - /* no space left for name (+ NULL + type + len) */ - if ((HCI_MAX_AD_LENGTH - ad_len) < HCI_MAX_SHORT_NAME_LENGTH + 3) - return ad_len; - - /* use complete name if present and fits */ - complete_len = strlen(hdev->dev_name); - if (complete_len && complete_len <= HCI_MAX_SHORT_NAME_LENGTH) - return eir_append_data(ptr, ad_len, EIR_NAME_COMPLETE, - hdev->dev_name, complete_len + 1); - - /* use short name if present */ - short_len = strlen(hdev->short_name); - if (short_len) - return eir_append_data(ptr, ad_len, EIR_NAME_SHORT, - hdev->short_name, short_len + 1); - - /* use shortened full name if present, we already know that name - * is longer then HCI_MAX_SHORT_NAME_LENGTH - */ - if (complete_len) { - u8 name[HCI_MAX_SHORT_NAME_LENGTH + 1]; - - memcpy(name, hdev->dev_name, HCI_MAX_SHORT_NAME_LENGTH); - name[HCI_MAX_SHORT_NAME_LENGTH] = '\0'; - - return eir_append_data(ptr, ad_len, EIR_NAME_SHORT, name, - sizeof(name)); - } - - return ad_len; -} - -static u8 append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len) -{ - return eir_append_le16(ptr, ad_len, EIR_APPEARANCE, hdev->appearance); -} - -static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr) -{ - u8 scan_rsp_len = 0; - - if (hdev->appearance) - scan_rsp_len = append_appearance(hdev, ptr, scan_rsp_len); - - return append_local_name(hdev, ptr, scan_rsp_len); -} - -static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance, - u8 *ptr) -{ - struct adv_info *adv_instance; - u32 instance_flags; - u8 scan_rsp_len = 0; - - adv_instance = hci_find_adv_instance(hdev, instance); - if (!adv_instance) - return 0; - - instance_flags = adv_instance->flags; - - if ((instance_flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) - scan_rsp_len = append_appearance(hdev, ptr, scan_rsp_len); - - memcpy(&ptr[scan_rsp_len], adv_instance->scan_rsp_data, - adv_instance->scan_rsp_len); - - scan_rsp_len += adv_instance->scan_rsp_len; - - if (instance_flags & MGMT_ADV_FLAG_LOCAL_NAME) - scan_rsp_len = append_local_name(hdev, ptr, scan_rsp_len); - - return scan_rsp_len; -} - void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) { struct hci_dev *hdev = req->hdev; @@ -1723,11 +1437,7 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) memset(&pdu, 0, sizeof(pdu)); - if (instance) - len = create_instance_scan_rsp_data(hdev, instance, - pdu.data); - else - len = create_default_scan_rsp_data(hdev, pdu.data); + len = eir_create_scan_rsp(hdev, instance, pdu.data); if (hdev->scan_rsp_data_len == len && !memcmp(pdu.data, hdev->scan_rsp_data, len)) @@ -1748,11 +1458,7 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) memset(&cp, 0, sizeof(cp)); - if (instance) - len = create_instance_scan_rsp_data(hdev, instance, - cp.data); - else - len = create_default_scan_rsp_data(hdev, cp.data); + len = eir_create_scan_rsp(hdev, instance, cp.data); if (hdev->scan_rsp_data_len == len && !memcmp(cp.data, hdev->scan_rsp_data, len)) @@ -1767,95 +1473,6 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) } } -static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr) -{ - struct adv_info *adv_instance = NULL; - u8 ad_len = 0, flags = 0; - u32 instance_flags; - - /* Return 0 when the current instance identifier is invalid. */ - if (instance) { - adv_instance = hci_find_adv_instance(hdev, instance); - if (!adv_instance) - return 0; - } - - instance_flags = get_adv_instance_flags(hdev, instance); - - /* If instance already has the flags set skip adding it once - * again. - */ - if (adv_instance && eir_get_data(adv_instance->adv_data, - adv_instance->adv_data_len, EIR_FLAGS, - NULL)) - goto skip_flags; - - /* The Add Advertising command allows userspace to set both the general - * and limited discoverable flags. - */ - if (instance_flags & MGMT_ADV_FLAG_DISCOV) - flags |= LE_AD_GENERAL; - - if (instance_flags & MGMT_ADV_FLAG_LIMITED_DISCOV) - flags |= LE_AD_LIMITED; - - if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) - flags |= LE_AD_NO_BREDR; - - if (flags || (instance_flags & MGMT_ADV_FLAG_MANAGED_FLAGS)) { - /* If a discovery flag wasn't provided, simply use the global - * settings. - */ - if (!flags) - flags |= mgmt_get_adv_discov_flags(hdev); - - /* If flags would still be empty, then there is no need to - * include the "Flags" AD field". - */ - if (flags) { - ptr[0] = 0x02; - ptr[1] = EIR_FLAGS; - ptr[2] = flags; - - ad_len += 3; - ptr += 3; - } - } - -skip_flags: - if (adv_instance) { - memcpy(ptr, adv_instance->adv_data, - adv_instance->adv_data_len); - ad_len += adv_instance->adv_data_len; - ptr += adv_instance->adv_data_len; - } - - if (instance_flags & MGMT_ADV_FLAG_TX_POWER) { - s8 adv_tx_power; - - if (ext_adv_capable(hdev)) { - if (adv_instance) - adv_tx_power = adv_instance->tx_power; - else - adv_tx_power = hdev->adv_tx_power; - } else { - adv_tx_power = hdev->adv_tx_power; - } - - /* Provide Tx Power only if we can provide a valid value for it */ - if (adv_tx_power != HCI_TX_POWER_INVALID) { - ptr[0] = 0x02; - ptr[1] = EIR_TX_POWER; - ptr[2] = (u8)adv_tx_power; - - ad_len += 3; - ptr += 3; - } - } - - return ad_len; -} - void __hci_req_update_adv_data(struct hci_request *req, u8 instance) { struct hci_dev *hdev = req->hdev; @@ -1872,7 +1489,7 @@ void __hci_req_update_adv_data(struct hci_request *req, u8 instance) memset(&pdu, 0, sizeof(pdu)); - len = create_instance_adv_data(hdev, instance, pdu.data); + len = eir_create_adv_data(hdev, instance, pdu.data); /* There's nothing to do if the data hasn't changed */ if (hdev->adv_data_len == len && @@ -1894,7 +1511,7 @@ void __hci_req_update_adv_data(struct hci_request *req, u8 instance) memset(&cp, 0, sizeof(cp)); - len = create_instance_adv_data(hdev, instance, cp.data); + len = eir_create_adv_data(hdev, instance, cp.data); /* There's nothing to do if the data hasn't changed */ if (hdev->adv_data_len == len && @@ -2072,8 +1689,6 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, * current RPA has expired then generate a new one. */ if (use_rpa) { - int to; - /* If Controller supports LL Privacy use own address type is * 0x03 */ @@ -2084,14 +1699,10 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, *own_addr_type = ADDR_LE_DEV_RANDOM; if (adv_instance) { - if (!adv_instance->rpa_expired && - !bacmp(&adv_instance->random_addr, &hdev->rpa)) + if (adv_rpa_valid(adv_instance)) return 0; - - adv_instance->rpa_expired = false; } else { - if (!hci_dev_test_and_clear_flag(hdev, HCI_RPA_EXPIRED) && - !bacmp(&hdev->random_addr, &hdev->rpa)) + if (rpa_valid(hdev)) return 0; } @@ -2103,14 +1714,6 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, bacpy(rand_addr, &hdev->rpa); - to = msecs_to_jiffies(hdev->rpa_timeout * 1000); - if (adv_instance) - queue_delayed_work(hdev->workqueue, - &adv_instance->rpa_expired_cb, to); - else - queue_delayed_work(hdev->workqueue, - &hdev->rpa_expired, to); - return 0; } @@ -2153,6 +1756,30 @@ void __hci_req_clear_ext_adv_sets(struct hci_request *req) hci_req_add(req, HCI_OP_LE_CLEAR_ADV_SETS, 0, NULL); } +static void set_random_addr(struct hci_request *req, bdaddr_t *rpa) +{ + struct hci_dev *hdev = req->hdev; + + /* If we're advertising or initiating an LE connection we can't + * go ahead and change the random address at this time. This is + * because the eventual initiator address used for the + * subsequently created connection will be undefined (some + * controllers use the new address and others the one we had + * when the operation started). + * + * In this kind of scenario skip the update and let the random + * address be updated at the next cycle. + */ + if (hci_dev_test_flag(hdev, HCI_LE_ADV) || + hci_lookup_le_connect(hdev)) { + bt_dev_dbg(hdev, "Deferring random address update"); + hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); + return; + } + + hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, rpa); +} + int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) { struct hci_cp_le_set_ext_adv_params cp; @@ -2173,7 +1800,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) adv_instance = NULL; } - flags = get_adv_instance_flags(hdev, instance); + flags = hci_adv_instance_flags(hdev, instance); /* If the "connectable" instance flag was not set, then choose between * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. @@ -2213,7 +1840,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND); else cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND); - } else if (adv_instance_is_scannable(hdev, instance) || + } else if (hci_adv_instance_is_scannable(hdev, instance) || (flags & MGMT_ADV_PARAM_SCAN_RSP)) { if (secondary_adv) cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND); @@ -2255,6 +1882,13 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) } else { if (!bacmp(&random_addr, &hdev->random_addr)) return 0; + /* Instance 0x00 doesn't have an adv_info, instead it + * uses hdev->random_addr to track its address so + * whenever it needs to be updated this also set the + * random address since hdev->random_addr is shared with + * scan state machine. + */ + set_random_addr(req, &random_addr); } memset(&cp, 0, sizeof(cp)); @@ -2512,30 +2146,6 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, false); } -static void set_random_addr(struct hci_request *req, bdaddr_t *rpa) -{ - struct hci_dev *hdev = req->hdev; - - /* If we're advertising or initiating an LE connection we can't - * go ahead and change the random address at this time. This is - * because the eventual initiator address used for the - * subsequently created connection will be undefined (some - * controllers use the new address and others the one we had - * when the operation started). - * - * In this kind of scenario skip the update and let the random - * address be updated at the next cycle. - */ - if (hci_dev_test_flag(hdev, HCI_LE_ADV) || - hci_lookup_le_connect(hdev)) { - bt_dev_dbg(hdev, "Deferring random address update"); - hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); - return; - } - - hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, rpa); -} - int hci_update_random_address(struct hci_request *req, bool require_privacy, bool use_rpa, u8 *own_addr_type) { @@ -2547,8 +2157,6 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, * the current RPA in use, then generate a new one. */ if (use_rpa) { - int to; - /* If Controller supports LL Privacy use own address type is * 0x03 */ @@ -2558,8 +2166,7 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, else *own_addr_type = ADDR_LE_DEV_RANDOM; - if (!hci_dev_test_and_clear_flag(hdev, HCI_RPA_EXPIRED) && - !bacmp(&hdev->random_addr, &hdev->rpa)) + if (rpa_valid(hdev)) return 0; err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); @@ -2570,9 +2177,6 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy, set_random_addr(req, &hdev->rpa); - to = msecs_to_jiffies(hdev->rpa_timeout * 1000); - queue_delayed_work(hdev->workqueue, &hdev->rpa_expired, to); - return 0; } @@ -3340,6 +2944,53 @@ bool hci_req_stop_discovery(struct hci_request *req) return ret; } +static void config_data_path_complete(struct hci_dev *hdev, u8 status, + u16 opcode) +{ + bt_dev_dbg(hdev, "status %u", status); +} + +int hci_req_configure_datapath(struct hci_dev *hdev, struct bt_codec *codec) +{ + struct hci_request req; + int err; + __u8 vnd_len, *vnd_data = NULL; + struct hci_op_configure_data_path *cmd = NULL; + + hci_req_init(&req, hdev); + + err = hdev->get_codec_config_data(hdev, ESCO_LINK, codec, &vnd_len, + &vnd_data); + if (err < 0) + goto error; + + cmd = kzalloc(sizeof(*cmd) + vnd_len, GFP_KERNEL); + if (!cmd) { + err = -ENOMEM; + goto error; + } + + err = hdev->get_data_path_id(hdev, &cmd->data_path_id); + if (err < 0) + goto error; + + cmd->vnd_len = vnd_len; + memcpy(cmd->vnd_data, vnd_data, vnd_len); + + cmd->direction = 0x00; + hci_req_add(&req, HCI_CONFIGURE_DATA_PATH, sizeof(*cmd) + vnd_len, cmd); + + cmd->direction = 0x01; + hci_req_add(&req, HCI_CONFIGURE_DATA_PATH, sizeof(*cmd) + vnd_len, cmd); + + err = hci_req_run(&req, config_data_path_complete); +error: + + kfree(cmd); + kfree(vnd_data); + return err; +} + static int stop_discovery(struct hci_request *req, unsigned long opt) { hci_dev_lock(req->hdev); diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 39ee8a18087a..f31420f58525 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -101,6 +101,8 @@ void __hci_req_update_class(struct hci_request *req); /* Returns true if HCI commands were queued */ bool hci_req_stop_discovery(struct hci_request *req); +int hci_req_configure_datapath(struct hci_dev *hdev, struct bt_codec *codec); + static inline void hci_req_update_scan(struct hci_dev *hdev) { queue_work(hdev->req_workqueue, &hdev->scan_update); @@ -122,26 +124,3 @@ static inline void hci_update_background_scan(struct hci_dev *hdev) void hci_request_setup(struct hci_dev *hdev); void hci_request_cancel_all(struct hci_dev *hdev); - -u8 append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len); - -static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type, - u8 *data, u8 data_len) -{ - eir[eir_len++] = sizeof(type) + data_len; - eir[eir_len++] = type; - memcpy(&eir[eir_len], data, data_len); - eir_len += data_len; - - return eir_len; -} - -static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data) -{ - eir[eir_len++] = sizeof(type) + sizeof(data); - eir[eir_len++] = type; - put_unaligned_le16(data, &eir[eir_len]); - eir_len += sizeof(data); - - return eir_len; -} diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index f1128c2134f0..d0dad1fafe07 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -57,6 +57,7 @@ struct hci_pinfo { unsigned long flags; __u32 cookie; char comm[TASK_COMM_LEN]; + __u16 mtu; }; static struct hci_dev *hci_hdev_from_sock(struct sock *sk) @@ -1374,6 +1375,10 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, break; } + /* Default MTU to HCI_MAX_FRAME_SIZE if not set */ + if (!hci_pi(sk)->mtu) + hci_pi(sk)->mtu = HCI_MAX_FRAME_SIZE; + sk->sk_state = BT_BOUND; done: @@ -1506,9 +1511,8 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, } static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, - struct msghdr *msg, size_t msglen) + struct sk_buff *skb) { - void *buf; u8 *cp; struct mgmt_hdr *hdr; u16 opcode, index, len; @@ -1517,40 +1521,31 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, bool var_len, no_hdev; int err; - BT_DBG("got %zu bytes", msglen); + BT_DBG("got %d bytes", skb->len); - if (msglen < sizeof(*hdr)) + if (skb->len < sizeof(*hdr)) return -EINVAL; - buf = kmalloc(msglen, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (memcpy_from_msg(buf, msg, msglen)) { - err = -EFAULT; - goto done; - } - - hdr = buf; + hdr = (void *)skb->data; opcode = __le16_to_cpu(hdr->opcode); index = __le16_to_cpu(hdr->index); len = __le16_to_cpu(hdr->len); - if (len != msglen - sizeof(*hdr)) { + if (len != skb->len - sizeof(*hdr)) { err = -EINVAL; goto done; } if (chan->channel == HCI_CHANNEL_CONTROL) { - struct sk_buff *skb; + struct sk_buff *cmd; /* Send event to monitor */ - skb = create_monitor_ctrl_command(sk, index, opcode, len, - buf + sizeof(*hdr)); - if (skb) { - hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, + cmd = create_monitor_ctrl_command(sk, index, opcode, len, + skb->data + sizeof(*hdr)); + if (cmd) { + hci_send_to_channel(HCI_CHANNEL_MONITOR, cmd, HCI_SOCK_TRUSTED, NULL); - kfree_skb(skb); + kfree_skb(cmd); } } @@ -1615,26 +1610,25 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, if (hdev && chan->hdev_init) chan->hdev_init(sk, hdev); - cp = buf + sizeof(*hdr); + cp = skb->data + sizeof(*hdr); err = handler->func(sk, hdev, cp, len); if (err < 0) goto done; - err = msglen; + err = skb->len; done: if (hdev) hci_dev_put(hdev); - kfree(buf); return err; } -static int hci_logging_frame(struct sock *sk, struct msghdr *msg, int len) +static int hci_logging_frame(struct sock *sk, struct sk_buff *skb, + unsigned int flags) { struct hci_mon_hdr *hdr; - struct sk_buff *skb; struct hci_dev *hdev; u16 index; int err; @@ -1643,24 +1637,13 @@ static int hci_logging_frame(struct sock *sk, struct msghdr *msg, int len) * the priority byte, the ident length byte and at least one string * terminator NUL byte. Anything shorter are invalid packets. */ - if (len < sizeof(*hdr) + 3) + if (skb->len < sizeof(*hdr) + 3) return -EINVAL; - skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err); - if (!skb) - return err; - - if (memcpy_from_msg(skb_put(skb, len), msg, len)) { - err = -EFAULT; - goto drop; - } - hdr = (void *)skb->data; - if (__le16_to_cpu(hdr->len) != len - sizeof(*hdr)) { - err = -EINVAL; - goto drop; - } + if (__le16_to_cpu(hdr->len) != skb->len - sizeof(*hdr)) + return -EINVAL; if (__le16_to_cpu(hdr->opcode) == 0x0000) { __u8 priority = skb->data[sizeof(*hdr)]; @@ -1679,25 +1662,20 @@ static int hci_logging_frame(struct sock *sk, struct msghdr *msg, int len) * The message follows the ident string (if present) and * must be NUL terminated. Otherwise it is not a valid packet. */ - if (priority > 7 || skb->data[len - 1] != 0x00 || - ident_len > len - sizeof(*hdr) - 3 || - skb->data[sizeof(*hdr) + ident_len + 1] != 0x00) { - err = -EINVAL; - goto drop; - } + if (priority > 7 || skb->data[skb->len - 1] != 0x00 || + ident_len > skb->len - sizeof(*hdr) - 3 || + skb->data[sizeof(*hdr) + ident_len + 1] != 0x00) + return -EINVAL; } else { - err = -EINVAL; - goto drop; + return -EINVAL; } index = __le16_to_cpu(hdr->index); if (index != MGMT_INDEX_NONE) { hdev = hci_dev_get(index); - if (!hdev) { - err = -ENODEV; - goto drop; - } + if (!hdev) + return -ENODEV; } else { hdev = NULL; } @@ -1705,13 +1683,11 @@ static int hci_logging_frame(struct sock *sk, struct msghdr *msg, int len) hdr->opcode = cpu_to_le16(HCI_MON_USER_LOGGING); hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL); - err = len; + err = skb->len; if (hdev) hci_dev_put(hdev); -drop: - kfree_skb(skb); return err; } @@ -1723,19 +1699,23 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, struct hci_dev *hdev; struct sk_buff *skb; int err; + const unsigned int flags = msg->msg_flags; BT_DBG("sock %p sk %p", sock, sk); - if (msg->msg_flags & MSG_OOB) + if (flags & MSG_OOB) return -EOPNOTSUPP; - if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_NOSIGNAL|MSG_ERRQUEUE| - MSG_CMSG_COMPAT)) + if (flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL | MSG_ERRQUEUE | MSG_CMSG_COMPAT)) return -EINVAL; - if (len < 4 || len > HCI_MAX_FRAME_SIZE) + if (len < 4 || len > hci_pi(sk)->mtu) return -EINVAL; + skb = bt_skb_sendmsg(sk, msg, len, len, 0, 0); + if (IS_ERR(skb)) + return PTR_ERR(skb); + lock_sock(sk); switch (hci_pi(sk)->channel) { @@ -1744,39 +1724,30 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, break; case HCI_CHANNEL_MONITOR: err = -EOPNOTSUPP; - goto done; + goto drop; case HCI_CHANNEL_LOGGING: - err = hci_logging_frame(sk, msg, len); - goto done; + err = hci_logging_frame(sk, skb, flags); + goto drop; default: mutex_lock(&mgmt_chan_list_lock); chan = __hci_mgmt_chan_find(hci_pi(sk)->channel); if (chan) - err = hci_mgmt_cmd(chan, sk, msg, len); + err = hci_mgmt_cmd(chan, sk, skb); else err = -EINVAL; mutex_unlock(&mgmt_chan_list_lock); - goto done; + goto drop; } hdev = hci_hdev_from_sock(sk); if (IS_ERR(hdev)) { err = PTR_ERR(hdev); - goto done; + goto drop; } if (!test_bit(HCI_UP, &hdev->flags)) { err = -ENETDOWN; - goto done; - } - - skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err); - if (!skb) - goto done; - - if (memcpy_from_msg(skb_put(skb, len), msg, len)) { - err = -EFAULT; goto drop; } @@ -1857,8 +1828,8 @@ drop: goto done; } -static int hci_sock_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int len) +static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int len) { struct hci_ufilter uf = { .opcode = 0 }; struct sock *sk = sock->sk; @@ -1866,9 +1837,6 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname, BT_DBG("sk %p, opt %d", sk, optname); - if (level != SOL_HCI) - return -ENOPROTOOPT; - lock_sock(sk); if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) { @@ -1943,18 +1911,63 @@ done: return err; } -static int hci_sock_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) +static int hci_sock_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int len) { - struct hci_ufilter uf; struct sock *sk = sock->sk; - int len, opt, err = 0; + int err = 0, opt = 0; BT_DBG("sk %p, opt %d", sk, optname); - if (level != SOL_HCI) + if (level == SOL_HCI) + return hci_sock_setsockopt_old(sock, level, optname, optval, + len); + + if (level != SOL_BLUETOOTH) return -ENOPROTOOPT; + lock_sock(sk); + + switch (optname) { + case BT_SNDMTU: + case BT_RCVMTU: + switch (hci_pi(sk)->channel) { + /* Don't allow changing MTU for channels that are meant for HCI + * traffic only. + */ + case HCI_CHANNEL_RAW: + case HCI_CHANNEL_USER: + err = -ENOPROTOOPT; + goto done; + } + + if (copy_from_sockptr(&opt, optval, sizeof(u16))) { + err = -EFAULT; + break; + } + + hci_pi(sk)->mtu = opt; + break; + + default: + err = -ENOPROTOOPT; + break; + } + +done: + release_sock(sk); + return err; +} + +static int hci_sock_getsockopt_old(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct hci_ufilter uf; + struct sock *sk = sock->sk; + int len, opt, err = 0; + + BT_DBG("sk %p, opt %d", sk, optname); + if (get_user(len, optlen)) return -EFAULT; @@ -2012,6 +2025,39 @@ done: return err; } +static int hci_sock_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sk %p, opt %d", sk, optname); + + if (level == SOL_HCI) + return hci_sock_getsockopt_old(sock, level, optname, optval, + optlen); + + if (level != SOL_BLUETOOTH) + return -ENOPROTOOPT; + + lock_sock(sk); + + switch (optname) { + case BT_SNDMTU: + case BT_RCVMTU: + if (put_user(hci_pi(sk)->mtu, (u16 __user *)optval)) + err = -EFAULT; + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + static const struct proto_ops hci_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index b69d88b88d2e..7827639ecf5c 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -85,8 +85,7 @@ static void bt_host_release(struct device *dev) struct hci_dev *hdev = to_hci_dev(dev); if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) - hci_cleanup_dev(hdev); - kfree(hdev); + hci_release_dev(hdev); module_put(THIS_MODULE); } diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 77ba68209dbd..4f8f37599962 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7902,7 +7902,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, dst_type = ADDR_LE_DEV_RANDOM; if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) - hcon = hci_connect_le(hdev, dst, dst_type, + hcon = hci_connect_le(hdev, dst, dst_type, false, chan->sec_level, HCI_LE_CONN_TIMEOUT, HCI_ROLE_SLAVE, NULL); diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index c99d65ef13b1..160c016a5dfb 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1508,6 +1508,9 @@ static void l2cap_sock_close_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; + if (!sk) + return; + l2cap_sock_kill(sk); } @@ -1516,6 +1519,9 @@ static void l2cap_sock_teardown_cb(struct l2cap_chan *chan, int err) struct sock *sk = chan->data; struct sock *parent; + if (!sk) + return; + BT_DBG("chan %p state %s", chan, state_to_string(chan->state)); /* This callback can be called both for server (BT_LISTEN) @@ -1707,8 +1713,10 @@ static void l2cap_sock_destruct(struct sock *sk) { BT_DBG("sk %p", sk); - if (l2cap_pi(sk)->chan) + if (l2cap_pi(sk)->chan) { + l2cap_pi(sk)->chan->data = NULL; l2cap_chan_put(l2cap_pi(sk)->chan); + } if (l2cap_pi(sk)->rx_busy_skb) { kfree_skb(l2cap_pi(sk)->rx_busy_skb); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 3663f880df11..3e5283607b97 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -38,6 +38,7 @@ #include "mgmt_util.h" #include "mgmt_config.h" #include "msft.h" +#include "eir.h" #define MGMT_VERSION 1 #define MGMT_REVISION 21 @@ -3791,6 +3792,18 @@ static const u8 debug_uuid[16] = { }; #endif +/* 330859bc-7506-492d-9370-9a6f0614037f */ +static const u8 quality_report_uuid[16] = { + 0x7f, 0x03, 0x14, 0x06, 0x6f, 0x9a, 0x70, 0x93, + 0x2d, 0x49, 0x06, 0x75, 0xbc, 0x59, 0x08, 0x33, +}; + +/* a6695ace-ee7f-4fb9-881a-5fac66c629af */ +static const u8 offload_codecs_uuid[16] = { + 0xaf, 0x29, 0xc6, 0x66, 0xac, 0x5f, 0x1a, 0x88, + 0xb9, 0x4f, 0x7f, 0xee, 0xce, 0x5a, 0x69, 0xa6, +}; + /* 671b10b5-42c0-4696-9227-eb28d1b049d6 */ static const u8 simult_central_periph_uuid[16] = { 0xd6, 0x49, 0xb0, 0xd1, 0x28, 0xeb, 0x27, 0x92, @@ -3806,7 +3819,7 @@ static const u8 rpa_resolution_uuid[16] = { static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { - char buf[62]; /* Enough space for 3 features */ + char buf[102]; /* Enough space for 5 features: 2 + 20 * 5 */ struct mgmt_rp_read_exp_features_info *rp = (void *)buf; u16 idx = 0; u32 flags; @@ -3850,6 +3863,28 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, idx++; } + if (hdev && hdev->set_quality_report) { + if (hci_dev_test_flag(hdev, HCI_QUALITY_REPORT)) + flags = BIT(0); + else + flags = 0; + + memcpy(rp->features[idx].uuid, quality_report_uuid, 16); + rp->features[idx].flags = cpu_to_le32(flags); + idx++; + } + + if (hdev && hdev->get_data_path_id) { + if (hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)) + flags = BIT(0); + else + flags = 0; + + memcpy(rp->features[idx].uuid, offload_codecs_uuid, 16); + rp->features[idx].flags = cpu_to_le32(flags); + idx++; + } + rp->feature_count = cpu_to_le16(idx); /* After reading the experimental features information, enable @@ -3892,150 +3927,341 @@ static int exp_debug_feature_changed(bool enabled, struct sock *skip) } #endif -static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, - void *data, u16 data_len) +static int exp_quality_report_feature_changed(bool enabled, struct sock *skip) { - struct mgmt_cp_set_exp_feature *cp = data; - struct mgmt_rp_set_exp_feature rp; + struct mgmt_ev_exp_feature_changed ev; - bt_dev_dbg(hdev, "sock %p", sk); + memset(&ev, 0, sizeof(ev)); + memcpy(ev.uuid, quality_report_uuid, 16); + ev.flags = cpu_to_le32(enabled ? BIT(0) : 0); + + return mgmt_limited_event(MGMT_EV_EXP_FEATURE_CHANGED, NULL, + &ev, sizeof(ev), + HCI_MGMT_EXP_FEATURE_EVENTS, skip); +} + +#define EXP_FEAT(_uuid, _set_func) \ +{ \ + .uuid = _uuid, \ + .set_func = _set_func, \ +} + +/* The zero key uuid is special. Multiple exp features are set through it. */ +static int set_zero_key_func(struct sock *sk, struct hci_dev *hdev, + struct mgmt_cp_set_exp_feature *cp, u16 data_len) +{ + struct mgmt_rp_set_exp_feature rp; - if (!memcmp(cp->uuid, ZERO_KEY, 16)) { - memset(rp.uuid, 0, 16); - rp.flags = cpu_to_le32(0); + memset(rp.uuid, 0, 16); + rp.flags = cpu_to_le32(0); #ifdef CONFIG_BT_FEATURE_DEBUG - if (!hdev) { - bool changed = bt_dbg_get(); + if (!hdev) { + bool changed = bt_dbg_get(); - bt_dbg_set(false); + bt_dbg_set(false); - if (changed) - exp_debug_feature_changed(false, sk); - } + if (changed) + exp_debug_feature_changed(false, sk); + } #endif - if (hdev && use_ll_privacy(hdev) && !hdev_is_powered(hdev)) { - bool changed = hci_dev_test_flag(hdev, - HCI_ENABLE_LL_PRIVACY); + if (hdev && use_ll_privacy(hdev) && !hdev_is_powered(hdev)) { + bool changed = hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY); - hci_dev_clear_flag(hdev, HCI_ENABLE_LL_PRIVACY); + hci_dev_clear_flag(hdev, HCI_ENABLE_LL_PRIVACY); - if (changed) - exp_ll_privacy_feature_changed(false, hdev, sk); - } + if (changed) + exp_ll_privacy_feature_changed(false, hdev, sk); + } - hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); - return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE, - MGMT_OP_SET_EXP_FEATURE, 0, - &rp, sizeof(rp)); - } + return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); +} #ifdef CONFIG_BT_FEATURE_DEBUG - if (!memcmp(cp->uuid, debug_uuid, 16)) { - bool val, changed; - int err; +static int set_debug_func(struct sock *sk, struct hci_dev *hdev, + struct mgmt_cp_set_exp_feature *cp, u16 data_len) +{ + struct mgmt_rp_set_exp_feature rp; - /* Command requires to use the non-controller index */ - if (hdev) - return mgmt_cmd_status(sk, hdev->id, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_INVALID_INDEX); + bool val, changed; + int err; - /* Parameters are limited to a single octet */ - if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) - return mgmt_cmd_status(sk, MGMT_INDEX_NONE, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_INVALID_PARAMS); + /* Command requires to use the non-controller index */ + if (hdev) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_INDEX); - /* Only boolean on/off is supported */ - if (cp->param[0] != 0x00 && cp->param[0] != 0x01) - return mgmt_cmd_status(sk, MGMT_INDEX_NONE, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_INVALID_PARAMS); + /* Parameters are limited to a single octet */ + if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); - val = !!cp->param[0]; - changed = val ? !bt_dbg_get() : bt_dbg_get(); - bt_dbg_set(val); + /* Only boolean on/off is supported */ + if (cp->param[0] != 0x00 && cp->param[0] != 0x01) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); - memcpy(rp.uuid, debug_uuid, 16); - rp.flags = cpu_to_le32(val ? BIT(0) : 0); + val = !!cp->param[0]; + changed = val ? !bt_dbg_get() : bt_dbg_get(); + bt_dbg_set(val); - hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + memcpy(rp.uuid, debug_uuid, 16); + rp.flags = cpu_to_le32(val ? BIT(0) : 0); - err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, - MGMT_OP_SET_EXP_FEATURE, 0, - &rp, sizeof(rp)); + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); - if (changed) - exp_debug_feature_changed(val, sk); + err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); - return err; - } + if (changed) + exp_debug_feature_changed(val, sk); + + return err; +} #endif - if (!memcmp(cp->uuid, rpa_resolution_uuid, 16)) { - bool val, changed; - int err; - u32 flags; +static int set_rpa_resolution_func(struct sock *sk, struct hci_dev *hdev, + struct mgmt_cp_set_exp_feature *cp, + u16 data_len) +{ + struct mgmt_rp_set_exp_feature rp; + bool val, changed; + int err; + u32 flags; - /* Command requires to use the controller index */ - if (!hdev) - return mgmt_cmd_status(sk, MGMT_INDEX_NONE, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_INVALID_INDEX); + /* Command requires to use the controller index */ + if (!hdev) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_INDEX); - /* Changes can only be made when controller is powered down */ - if (hdev_is_powered(hdev)) - return mgmt_cmd_status(sk, hdev->id, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_REJECTED); + /* Changes can only be made when controller is powered down */ + if (hdev_is_powered(hdev)) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_REJECTED); - /* Parameters are limited to a single octet */ - if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) - return mgmt_cmd_status(sk, hdev->id, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_INVALID_PARAMS); + /* Parameters are limited to a single octet */ + if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); - /* Only boolean on/off is supported */ - if (cp->param[0] != 0x00 && cp->param[0] != 0x01) - return mgmt_cmd_status(sk, hdev->id, - MGMT_OP_SET_EXP_FEATURE, - MGMT_STATUS_INVALID_PARAMS); + /* Only boolean on/off is supported */ + if (cp->param[0] != 0x00 && cp->param[0] != 0x01) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); - val = !!cp->param[0]; + val = !!cp->param[0]; - if (val) { - changed = !hci_dev_test_flag(hdev, - HCI_ENABLE_LL_PRIVACY); - hci_dev_set_flag(hdev, HCI_ENABLE_LL_PRIVACY); - hci_dev_clear_flag(hdev, HCI_ADVERTISING); + if (val) { + changed = !hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY); + hci_dev_set_flag(hdev, HCI_ENABLE_LL_PRIVACY); + hci_dev_clear_flag(hdev, HCI_ADVERTISING); - /* Enable LL privacy + supported settings changed */ - flags = BIT(0) | BIT(1); - } else { - changed = hci_dev_test_flag(hdev, - HCI_ENABLE_LL_PRIVACY); - hci_dev_clear_flag(hdev, HCI_ENABLE_LL_PRIVACY); + /* Enable LL privacy + supported settings changed */ + flags = BIT(0) | BIT(1); + } else { + changed = hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY); + hci_dev_clear_flag(hdev, HCI_ENABLE_LL_PRIVACY); - /* Disable LL privacy + supported settings changed */ - flags = BIT(1); + /* Disable LL privacy + supported settings changed */ + flags = BIT(1); + } + + memcpy(rp.uuid, rpa_resolution_uuid, 16); + rp.flags = cpu_to_le32(flags); + + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); + + if (changed) + exp_ll_privacy_feature_changed(val, hdev, sk); + + return err; +} + +static int set_quality_report_func(struct sock *sk, struct hci_dev *hdev, + struct mgmt_cp_set_exp_feature *cp, + u16 data_len) +{ + struct mgmt_rp_set_exp_feature rp; + bool val, changed; + int err; + + /* Command requires to use a valid controller index */ + if (!hdev) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_INDEX); + + /* Parameters are limited to a single octet */ + if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + /* Only boolean on/off is supported */ + if (cp->param[0] != 0x00 && cp->param[0] != 0x01) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + hci_req_sync_lock(hdev); + + val = !!cp->param[0]; + changed = (val != hci_dev_test_flag(hdev, HCI_QUALITY_REPORT)); + + if (!hdev->set_quality_report) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_NOT_SUPPORTED); + goto unlock_quality_report; + } + + if (changed) { + err = hdev->set_quality_report(hdev, val); + if (err) { + err = mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_FAILED); + goto unlock_quality_report; } + if (val) + hci_dev_set_flag(hdev, HCI_QUALITY_REPORT); + else + hci_dev_clear_flag(hdev, HCI_QUALITY_REPORT); + } - memcpy(rp.uuid, rpa_resolution_uuid, 16); - rp.flags = cpu_to_le32(flags); + bt_dev_dbg(hdev, "quality report enable %d changed %d", val, changed); - hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + memcpy(rp.uuid, quality_report_uuid, 16); + rp.flags = cpu_to_le32(val ? BIT(0) : 0); + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); - err = mgmt_cmd_complete(sk, hdev->id, - MGMT_OP_SET_EXP_FEATURE, 0, - &rp, sizeof(rp)); + if (changed) + exp_quality_report_feature_changed(val, sk); - if (changed) - exp_ll_privacy_feature_changed(val, hdev, sk); +unlock_quality_report: + hci_req_sync_unlock(hdev); + return err; +} - return err; +static int exp_offload_codec_feature_changed(bool enabled, struct sock *skip) +{ + struct mgmt_ev_exp_feature_changed ev; + + memset(&ev, 0, sizeof(ev)); + memcpy(ev.uuid, offload_codecs_uuid, 16); + ev.flags = cpu_to_le32(enabled ? BIT(0) : 0); + + return mgmt_limited_event(MGMT_EV_EXP_FEATURE_CHANGED, NULL, + &ev, sizeof(ev), + HCI_MGMT_EXP_FEATURE_EVENTS, skip); +} + +static int set_offload_codec_func(struct sock *sk, struct hci_dev *hdev, + struct mgmt_cp_set_exp_feature *cp, + u16 data_len) +{ + bool val, changed; + int err; + struct mgmt_rp_set_exp_feature rp; + + /* Command requires to use a valid controller index */ + if (!hdev) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_INDEX); + + /* Parameters are limited to a single octet */ + if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + /* Only boolean on/off is supported */ + if (cp->param[0] != 0x00 && cp->param[0] != 0x01) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + val = !!cp->param[0]; + changed = (val != hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)); + + if (!hdev->get_data_path_id) { + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_NOT_SUPPORTED); + } + + if (changed) { + if (val) + hci_dev_set_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED); + else + hci_dev_clear_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED); + } + + bt_dev_info(hdev, "offload codecs enable %d changed %d", + val, changed); + + memcpy(rp.uuid, offload_codecs_uuid, 16); + rp.flags = cpu_to_le32(val ? BIT(0) : 0); + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); + + if (changed) + exp_offload_codec_feature_changed(val, sk); + + return err; +} + +static const struct mgmt_exp_feature { + const u8 *uuid; + int (*set_func)(struct sock *sk, struct hci_dev *hdev, + struct mgmt_cp_set_exp_feature *cp, u16 data_len); +} exp_features[] = { + EXP_FEAT(ZERO_KEY, set_zero_key_func), +#ifdef CONFIG_BT_FEATURE_DEBUG + EXP_FEAT(debug_uuid, set_debug_func), +#endif + EXP_FEAT(rpa_resolution_uuid, set_rpa_resolution_func), + EXP_FEAT(quality_report_uuid, set_quality_report_func), + EXP_FEAT(offload_codecs_uuid, set_offload_codec_func), + + /* end with a null feature */ + EXP_FEAT(NULL, NULL) +}; + +static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_cp_set_exp_feature *cp = data; + size_t i = 0; + + bt_dev_dbg(hdev, "sock %p", sk); + + for (i = 0; exp_features[i].uuid; i++) { + if (!memcmp(cp->uuid, exp_features[i].uuid, 16)) + return exp_features[i].set_func(sk, hdev, cp, data_len); } return mgmt_cmd_status(sk, hdev ? hdev->id : MGMT_INDEX_NONE, @@ -7204,7 +7430,7 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status, if (!mgmt_rp) goto done; - if (status) + if (eir_len == 0) goto send_rsp; eir_len = eir_append_data(mgmt_rp->eir, 0, EIR_CLASS_OF_DEV, @@ -7315,6 +7541,11 @@ static int read_local_oob_ext_data(struct sock *sk, struct hci_dev *hdev, if (!rp) return -ENOMEM; + if (!status && !lmp_ssp_capable(hdev)) { + status = MGMT_STATUS_NOT_SUPPORTED; + eir_len = 0; + } + if (status) goto complete; @@ -7526,7 +7757,7 @@ static u8 calculate_name_len(struct hci_dev *hdev) { u8 buf[HCI_MAX_SHORT_NAME_LENGTH + 3]; - return append_local_name(hdev, buf, 0); + return eir_append_local_name(hdev, buf, 0); } static u8 tlv_data_max_len(struct hci_dev *hdev, u32 adv_flags, @@ -7725,7 +7956,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, * advertising. */ if (hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) - return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, MGMT_STATUS_NOT_SUPPORTED); if (cp->instance < 1 || cp->instance > hdev->le_num_of_adv_sets) @@ -8222,7 +8453,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, * advertising. */ if (hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY)) - return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING, + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING, MGMT_STATUS_NOT_SUPPORTED); hci_dev_lock(hdev); diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index b4bfae41e8a5..255cffa554ee 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -94,11 +94,14 @@ struct msft_data { __u16 pending_add_handle; __u16 pending_remove_handle; __u8 reregistering; + __u8 suspending; __u8 filter_enabled; }; static int __msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor); +static int __msft_remove_monitor(struct hci_dev *hdev, + struct adv_monitor *monitor, u16 handle); bool msft_monitor_supported(struct hci_dev *hdev) { @@ -154,7 +157,7 @@ failed: } /* This function requires the caller holds hdev->lock */ -static void reregister_monitor_on_restart(struct hci_dev *hdev, int handle) +static void reregister_monitor(struct hci_dev *hdev, int handle) { struct adv_monitor *monitor; struct msft_data *msft = hdev->msft_data; @@ -182,31 +185,102 @@ static void reregister_monitor_on_restart(struct hci_dev *hdev, int handle) } } +/* This function requires the caller holds hdev->lock */ +static void remove_monitor_on_suspend(struct hci_dev *hdev, int handle) +{ + struct adv_monitor *monitor; + struct msft_data *msft = hdev->msft_data; + int err; + + while (1) { + monitor = idr_get_next(&hdev->adv_monitors_idr, &handle); + if (!monitor) { + /* All monitors have been removed */ + msft->suspending = false; + hci_update_background_scan(hdev); + return; + } + + msft->pending_remove_handle = (u16)handle; + err = __msft_remove_monitor(hdev, monitor, handle); + + /* If success, return and wait for monitor removed callback */ + if (!err) + return; + + /* Otherwise free the monitor and keep removing */ + hci_free_adv_monitor(hdev, monitor); + handle++; + } +} + +/* This function requires the caller holds hdev->lock */ +void msft_suspend(struct hci_dev *hdev) +{ + struct msft_data *msft = hdev->msft_data; + + if (!msft) + return; + + if (msft_monitor_supported(hdev)) { + msft->suspending = true; + /* Quitely remove all monitors on suspend to avoid waking up + * the system. + */ + remove_monitor_on_suspend(hdev, 0); + } +} + +/* This function requires the caller holds hdev->lock */ +void msft_resume(struct hci_dev *hdev) +{ + struct msft_data *msft = hdev->msft_data; + + if (!msft) + return; + + if (msft_monitor_supported(hdev)) { + msft->reregistering = true; + /* Monitors are removed on suspend, so we need to add all + * monitors on resume. + */ + reregister_monitor(hdev, 0); + } +} + void msft_do_open(struct hci_dev *hdev) { - struct msft_data *msft; + struct msft_data *msft = hdev->msft_data; if (hdev->msft_opcode == HCI_OP_NOP) return; + if (!msft) { + bt_dev_err(hdev, "MSFT extension not registered"); + return; + } + bt_dev_dbg(hdev, "Initialize MSFT extension"); - msft = kzalloc(sizeof(*msft), GFP_KERNEL); - if (!msft) - return; + /* Reset existing MSFT data before re-reading */ + kfree(msft->evt_prefix); + msft->evt_prefix = NULL; + msft->evt_prefix_len = 0; + msft->features = 0; if (!read_supported_features(hdev, msft)) { + hdev->msft_data = NULL; kfree(msft); return; } - INIT_LIST_HEAD(&msft->handle_map); - hdev->msft_data = msft; - if (msft_monitor_supported(hdev)) { msft->reregistering = true; msft_set_filter_enable(hdev, true); - reregister_monitor_on_restart(hdev, 0); + /* Monitors get removed on power off, so we need to explicitly + * tell the controller to re-monitor. + */ + reregister_monitor(hdev, 0); } } @@ -221,8 +295,9 @@ void msft_do_close(struct hci_dev *hdev) bt_dev_dbg(hdev, "Cleanup of MSFT extension"); - hdev->msft_data = NULL; - + /* The controller will silently remove all monitors on power off. + * Therefore, remove handle_data mapping and reset monitor state. + */ list_for_each_entry_safe(handle_data, tmp, &msft->handle_map, list) { monitor = idr_find(&hdev->adv_monitors_idr, handle_data->mgmt_handle); @@ -233,6 +308,34 @@ void msft_do_close(struct hci_dev *hdev) list_del(&handle_data->list); kfree(handle_data); } +} + +void msft_register(struct hci_dev *hdev) +{ + struct msft_data *msft = NULL; + + bt_dev_dbg(hdev, "Register MSFT extension"); + + msft = kzalloc(sizeof(*msft), GFP_KERNEL); + if (!msft) { + bt_dev_err(hdev, "Failed to register MSFT extension"); + return; + } + + INIT_LIST_HEAD(&msft->handle_map); + hdev->msft_data = msft; +} + +void msft_unregister(struct hci_dev *hdev) +{ + struct msft_data *msft = hdev->msft_data; + + if (!msft) + return; + + bt_dev_dbg(hdev, "Unregister MSFT extension"); + + hdev->msft_data = NULL; kfree(msft->evt_prefix); kfree(msft); @@ -345,8 +448,7 @@ unlock: /* If in restart/reregister sequence, keep registering. */ if (msft->reregistering) - reregister_monitor_on_restart(hdev, - msft->pending_add_handle + 1); + reregister_monitor(hdev, msft->pending_add_handle + 1); hci_dev_unlock(hdev); @@ -383,13 +485,25 @@ static void msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, if (handle_data) { monitor = idr_find(&hdev->adv_monitors_idr, handle_data->mgmt_handle); - if (monitor) + + if (monitor && monitor->state == ADV_MONITOR_STATE_OFFLOADED) + monitor->state = ADV_MONITOR_STATE_REGISTERED; + + /* Do not free the monitor if it is being removed due to + * suspend. It will be re-monitored on resume. + */ + if (monitor && !msft->suspending) hci_free_adv_monitor(hdev, monitor); list_del(&handle_data->list); kfree(handle_data); } + /* If in suspend/remove sequence, keep removing. */ + if (msft->suspending) + remove_monitor_on_suspend(hdev, + msft->pending_remove_handle + 1); + /* If remove all monitors is required, we need to continue the process * here because the earlier it was paused when waiting for the * response from controller. @@ -408,7 +522,8 @@ static void msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev, hci_dev_unlock(hdev); done: - hci_remove_adv_monitor_complete(hdev, status); + if (!msft->suspending) + hci_remove_adv_monitor_complete(hdev, status); } static void msft_le_set_advertisement_filter_enable_cb(struct hci_dev *hdev, @@ -541,15 +656,15 @@ int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor) if (!msft) return -EOPNOTSUPP; - if (msft->reregistering) + if (msft->reregistering || msft->suspending) return -EBUSY; return __msft_add_monitor_pattern(hdev, monitor); } /* This function requires the caller holds hdev->lock */ -int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, - u16 handle) +static int __msft_remove_monitor(struct hci_dev *hdev, + struct adv_monitor *monitor, u16 handle) { struct msft_cp_le_cancel_monitor_advertisement cp; struct msft_monitor_advertisement_handle_data *handle_data; @@ -557,12 +672,6 @@ int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, struct msft_data *msft = hdev->msft_data; int err = 0; - if (!msft) - return -EOPNOTSUPP; - - if (msft->reregistering) - return -EBUSY; - handle_data = msft_find_handle_data(hdev, monitor->handle, true); /* If no matched handle, just remove without telling controller */ @@ -582,6 +691,21 @@ int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, return err; } +/* This function requires the caller holds hdev->lock */ +int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, + u16 handle) +{ + struct msft_data *msft = hdev->msft_data; + + if (!msft) + return -EOPNOTSUPP; + + if (msft->reregistering || msft->suspending) + return -EBUSY; + + return __msft_remove_monitor(hdev, monitor, handle); +} + void msft_req_add_set_filter_enable(struct hci_request *req, bool enable) { struct hci_dev *hdev = req->hdev; diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h index 6e56d94b88d8..59c6e081c789 100644 --- a/net/bluetooth/msft.h +++ b/net/bluetooth/msft.h @@ -13,6 +13,8 @@ #if IS_ENABLED(CONFIG_BT_MSFTEXT) bool msft_monitor_supported(struct hci_dev *hdev); +void msft_register(struct hci_dev *hdev); +void msft_unregister(struct hci_dev *hdev); void msft_do_open(struct hci_dev *hdev); void msft_do_close(struct hci_dev *hdev); void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb); @@ -22,6 +24,8 @@ int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor, u16 handle); void msft_req_add_set_filter_enable(struct hci_request *req, bool enable); int msft_set_filter_enable(struct hci_dev *hdev, bool enable); +void msft_suspend(struct hci_dev *hdev); +void msft_resume(struct hci_dev *hdev); bool msft_curve_validity(struct hci_dev *hdev); #else @@ -31,6 +35,8 @@ static inline bool msft_monitor_supported(struct hci_dev *hdev) return false; } +static inline void msft_register(struct hci_dev *hdev) {} +static inline void msft_unregister(struct hci_dev *hdev) {} static inline void msft_do_open(struct hci_dev *hdev) {} static inline void msft_do_close(struct hci_dev *hdev) {} static inline void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb) {} @@ -55,6 +61,9 @@ static inline int msft_set_filter_enable(struct hci_dev *hdev, bool enable) return -EOPNOTSUPP; } +static inline void msft_suspend(struct hci_dev *hdev) {} +static inline void msft_resume(struct hci_dev *hdev) {} + static inline bool msft_curve_validity(struct hci_dev *hdev) { return false; diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index f2bacb464ccf..7324764384b6 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -549,22 +549,58 @@ struct rfcomm_dlc *rfcomm_dlc_exists(bdaddr_t *src, bdaddr_t *dst, u8 channel) return dlc; } +static int rfcomm_dlc_send_frag(struct rfcomm_dlc *d, struct sk_buff *frag) +{ + int len = frag->len; + + BT_DBG("dlc %p mtu %d len %d", d, d->mtu, len); + + if (len > d->mtu) + return -EINVAL; + + rfcomm_make_uih(frag, d->addr); + __skb_queue_tail(&d->tx_queue, frag); + + return len; +} + int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb) { - int len = skb->len; + unsigned long flags; + struct sk_buff *frag, *next; + int len; if (d->state != BT_CONNECTED) return -ENOTCONN; - BT_DBG("dlc %p mtu %d len %d", d, d->mtu, len); + frag = skb_shinfo(skb)->frag_list; + skb_shinfo(skb)->frag_list = NULL; - if (len > d->mtu) - return -EINVAL; + /* Queue all fragments atomically. */ + spin_lock_irqsave(&d->tx_queue.lock, flags); - rfcomm_make_uih(skb, d->addr); - skb_queue_tail(&d->tx_queue, skb); + len = rfcomm_dlc_send_frag(d, skb); + if (len < 0 || !frag) + goto unlock; + + for (; frag; frag = next) { + int ret; + + next = frag->next; + + ret = rfcomm_dlc_send_frag(d, frag); + if (ret < 0) { + kfree_skb(frag); + goto unlock; + } + + len += ret; + } + +unlock: + spin_unlock_irqrestore(&d->tx_queue.lock, flags); - if (!test_bit(RFCOMM_TX_THROTTLED, &d->flags)) + if (len > 0 && !test_bit(RFCOMM_TX_THROTTLED, &d->flags)) rfcomm_schedule(); return len; } diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index ae6f80730561..4bf4ea6cbb5e 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -70,7 +70,7 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) BT_DBG("dlc %p state %ld err %d", d, d->state, err); - spin_lock_bh(&sk->sk_lock.slock); + lock_sock(sk); if (err) sk->sk_err = err; @@ -91,7 +91,7 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) sk->sk_state_change(sk); } - spin_unlock_bh(&sk->sk_lock.slock); + release_sock(sk); if (parent && sock_flag(sk, SOCK_ZAPPED)) { /* We have to drop DLC lock here, otherwise @@ -575,46 +575,20 @@ static int rfcomm_sock_sendmsg(struct socket *sock, struct msghdr *msg, lock_sock(sk); sent = bt_sock_wait_ready(sk, msg->msg_flags); - if (sent) - goto done; - - while (len) { - size_t size = min_t(size_t, len, d->mtu); - int err; - - skb = sock_alloc_send_skb(sk, size + RFCOMM_SKB_RESERVE, - msg->msg_flags & MSG_DONTWAIT, &err); - if (!skb) { - if (sent == 0) - sent = err; - break; - } - skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE); - - err = memcpy_from_msg(skb_put(skb, size), msg, size); - if (err) { - kfree_skb(skb); - if (sent == 0) - sent = err; - break; - } - skb->priority = sk->sk_priority; + release_sock(sk); - err = rfcomm_dlc_send(d, skb); - if (err < 0) { - kfree_skb(skb); - if (sent == 0) - sent = err; - break; - } + if (sent) + return sent; - sent += size; - len -= size; - } + skb = bt_skb_sendmmsg(sk, msg, len, d->mtu, RFCOMM_SKB_HEAD_RESERVE, + RFCOMM_SKB_TAIL_RESERVE); + if (IS_ERR(skb)) + return PTR_ERR(skb); -done: - release_sock(sk); + sent = rfcomm_dlc_send(d, skb); + if (sent < 0) + kfree_skb(skb); return sent; } @@ -974,7 +948,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc * if (!parent) return 0; - bh_lock_sock(parent); + lock_sock(parent); /* Check for backlog size */ if (sk_acceptq_is_full(parent)) { @@ -1001,7 +975,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc * result = 1; done: - bh_unlock_sock(parent); + release_sock(parent); if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) parent->sk_state_change(parent); diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 4e095746e002..ebd78fdbd6e8 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -1127,9 +1127,10 @@ int __init rfcomm_init_ttys(void) { int error; - rfcomm_tty_driver = alloc_tty_driver(RFCOMM_TTY_PORTS); - if (!rfcomm_tty_driver) - return -ENOMEM; + rfcomm_tty_driver = tty_alloc_driver(RFCOMM_TTY_PORTS, + TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV); + if (IS_ERR(rfcomm_tty_driver)) + return PTR_ERR(rfcomm_tty_driver); rfcomm_tty_driver->driver_name = "rfcomm"; rfcomm_tty_driver->name = "rfcomm"; @@ -1137,7 +1138,6 @@ int __init rfcomm_init_ttys(void) rfcomm_tty_driver->minor_start = RFCOMM_TTY_MINOR; rfcomm_tty_driver->type = TTY_DRIVER_TYPE_SERIAL; rfcomm_tty_driver->subtype = SERIAL_TYPE_NORMAL; - rfcomm_tty_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV; rfcomm_tty_driver->init_termios = tty_std_termios; rfcomm_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL; rfcomm_tty_driver->init_termios.c_lflag &= ~ICANON; @@ -1146,7 +1146,7 @@ int __init rfcomm_init_ttys(void) error = tty_register_driver(rfcomm_tty_driver); if (error) { BT_ERR("Can't register RFCOMM TTY driver"); - put_tty_driver(rfcomm_tty_driver); + tty_driver_kref_put(rfcomm_tty_driver); return error; } @@ -1158,5 +1158,5 @@ int __init rfcomm_init_ttys(void) void rfcomm_cleanup_ttys(void) { tty_unregister_driver(rfcomm_tty_driver); - put_tty_driver(rfcomm_tty_driver); + tty_driver_kref_put(rfcomm_tty_driver); } diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index d9a4e88dacbb..8eabf41b2993 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -48,6 +48,8 @@ struct sco_conn { spinlock_t lock; struct sock *sk; + struct delayed_work timeout_work; + unsigned int mtu; }; @@ -67,6 +69,7 @@ struct sco_pinfo { __u32 flags; __u16 setting; __u8 cmsg_mask; + struct bt_codec codec; struct sco_conn *conn; }; @@ -74,31 +77,47 @@ struct sco_pinfo { #define SCO_CONN_TIMEOUT (HZ * 40) #define SCO_DISCONN_TIMEOUT (HZ * 2) -static void sco_sock_timeout(struct timer_list *t) +static void sco_sock_timeout(struct work_struct *work) { - struct sock *sk = from_timer(sk, t, sk_timer); + struct sco_conn *conn = container_of(work, struct sco_conn, + timeout_work.work); + struct sock *sk; + + sco_conn_lock(conn); + sk = conn->sk; + if (sk) + sock_hold(sk); + sco_conn_unlock(conn); + + if (!sk) + return; BT_DBG("sock %p state %d", sk, sk->sk_state); - bh_lock_sock(sk); + lock_sock(sk); sk->sk_err = ETIMEDOUT; sk->sk_state_change(sk); - bh_unlock_sock(sk); - - sco_sock_kill(sk); + release_sock(sk); sock_put(sk); } static void sco_sock_set_timer(struct sock *sk, long timeout) { + if (!sco_pi(sk)->conn) + return; + BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout); - sk_reset_timer(sk, &sk->sk_timer, jiffies + timeout); + cancel_delayed_work(&sco_pi(sk)->conn->timeout_work); + schedule_delayed_work(&sco_pi(sk)->conn->timeout_work, timeout); } static void sco_sock_clear_timer(struct sock *sk) { + if (!sco_pi(sk)->conn) + return; + BT_DBG("sock %p state %d", sk, sk->sk_state); - sk_stop_timer(sk, &sk->sk_timer); + cancel_delayed_work(&sco_pi(sk)->conn->timeout_work); } /* ---- SCO connections ---- */ @@ -115,6 +134,7 @@ static struct sco_conn *sco_conn_add(struct hci_conn *hcon) return NULL; spin_lock_init(&conn->lock); + INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout); hcon->sco_data = conn; conn->hcon = hcon; @@ -169,18 +189,21 @@ static void sco_conn_del(struct hci_conn *hcon, int err) /* Kill socket */ sco_conn_lock(conn); sk = conn->sk; + if (sk) + sock_hold(sk); sco_conn_unlock(conn); if (sk) { - sock_hold(sk); - bh_lock_sock(sk); + lock_sock(sk); sco_sock_clear_timer(sk); sco_chan_del(sk, err); - bh_unlock_sock(sk); - sco_sock_kill(sk); + release_sock(sk); sock_put(sk); } + /* Ensure no more work items will run before freeing conn. */ + cancel_delayed_work_sync(&conn->timeout_work); + hcon->sco_data = NULL; kfree(conn); } @@ -212,44 +235,32 @@ static int sco_chan_add(struct sco_conn *conn, struct sock *sk, return err; } -static int sco_connect(struct sock *sk) +static int sco_connect(struct hci_dev *hdev, struct sock *sk) { struct sco_conn *conn; struct hci_conn *hcon; - struct hci_dev *hdev; int err, type; BT_DBG("%pMR -> %pMR", &sco_pi(sk)->src, &sco_pi(sk)->dst); - hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); - if (!hdev) - return -EHOSTUNREACH; - - hci_dev_lock(hdev); - if (lmp_esco_capable(hdev) && !disable_esco) type = ESCO_LINK; else type = SCO_LINK; if (sco_pi(sk)->setting == BT_VOICE_TRANSPARENT && - (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev))) { - err = -EOPNOTSUPP; - goto done; - } + (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev))) + return -EOPNOTSUPP; hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst, - sco_pi(sk)->setting); - if (IS_ERR(hcon)) { - err = PTR_ERR(hcon); - goto done; - } + sco_pi(sk)->setting, &sco_pi(sk)->codec); + if (IS_ERR(hcon)) + return PTR_ERR(hcon); conn = sco_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); - err = -ENOMEM; - goto done; + return -ENOMEM; } /* Update source addr of the socket */ @@ -257,7 +268,7 @@ static int sco_connect(struct sock *sk) err = sco_chan_add(conn, sk, NULL); if (err) - goto done; + return err; if (hcon->state == BT_CONNECTED) { sco_sock_clear_timer(sk); @@ -267,17 +278,13 @@ static int sco_connect(struct sock *sk) sco_sock_set_timer(sk, sk->sk_sndtimeo); } -done: - hci_dev_unlock(hdev); - hci_dev_put(hdev); return err; } -static int sco_send_frame(struct sock *sk, struct msghdr *msg, int len) +static int sco_send_frame(struct sock *sk, struct sk_buff *skb) { struct sco_conn *conn = sco_pi(sk)->conn; - struct sk_buff *skb; - int err; + int len = skb->len; /* Check outgoing MTU */ if (len > conn->mtu) @@ -285,15 +292,6 @@ static int sco_send_frame(struct sock *sk, struct msghdr *msg, int len) BT_DBG("sk %p len %d", sk, len); - skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err); - if (!skb) - return err; - - if (memcpy_from_msg(skb_put(skb, len), msg, len)) { - kfree_skb(skb); - return -EFAULT; - } - hci_send_sco(conn->hcon, skb); return len; @@ -394,8 +392,7 @@ static void sco_sock_cleanup_listen(struct sock *parent) */ static void sco_sock_kill(struct sock *sk) { - if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket || - sock_flag(sk, SOCK_DEAD)) + if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket) return; BT_DBG("sk %p state %d", sk, sk->sk_state); @@ -438,16 +435,16 @@ static void __sco_sock_close(struct sock *sk) sock_set_flag(sk, SOCK_ZAPPED); break; } + } /* Must be called on unlocked socket. */ static void sco_sock_close(struct sock *sk) { - sco_sock_clear_timer(sk); lock_sock(sk); + sco_sock_clear_timer(sk); __sco_sock_close(sk); release_sock(sk); - sco_sock_kill(sk); } static void sco_skb_put_cmsg(struct sk_buff *skb, struct msghdr *msg, @@ -499,8 +496,10 @@ static struct sock *sco_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = BT_OPEN; sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT; - - timer_setup(&sk->sk_timer, sco_sock_timeout, 0); + sco_pi(sk)->codec.id = BT_CODEC_CVSD; + sco_pi(sk)->codec.cid = 0xffff; + sco_pi(sk)->codec.vid = 0xffff; + sco_pi(sk)->codec.data_path = 0x00; bt_sock_link(&sco_sk_list, sk); return sk; @@ -566,6 +565,7 @@ static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; struct sock *sk = sock->sk; + struct hci_dev *hdev; int err; BT_DBG("sk %p", sk); @@ -580,12 +580,19 @@ static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen if (sk->sk_type != SOCK_SEQPACKET) return -EINVAL; + hdev = hci_get_route(&sa->sco_bdaddr, &sco_pi(sk)->src, BDADDR_BREDR); + if (!hdev) + return -EHOSTUNREACH; + hci_dev_lock(hdev); + lock_sock(sk); /* Set destination address and psm */ bacpy(&sco_pi(sk)->dst, &sa->sco_bdaddr); - err = sco_connect(sk); + err = sco_connect(hdev, sk); + hci_dev_unlock(hdev); + hci_dev_put(hdev); if (err) goto done; @@ -714,6 +721,7 @@ static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; + struct sk_buff *skb; int err; BT_DBG("sock %p, sk %p", sock, sk); @@ -725,14 +733,21 @@ static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; + skb = bt_skb_sendmsg(sk, msg, len, len, 0, 0); + if (IS_ERR(skb)) + return PTR_ERR(skb); + lock_sock(sk); if (sk->sk_state == BT_CONNECTED) - err = sco_send_frame(sk, msg, len); + err = sco_send_frame(sk, skb); else err = -ENOTCONN; release_sock(sk); + + if (err < 0) + kfree_skb(skb); return err; } @@ -773,6 +788,11 @@ static void sco_conn_defer_accept(struct hci_conn *conn, u16 setting) cp.max_latency = cpu_to_le16(0xffff); cp.retrans_effort = 0xff; break; + default: + /* use CVSD settings as fallback */ + cp.max_latency = cpu_to_le16(0xffff); + cp.retrans_effort = 0xff; + break; } hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ, @@ -809,6 +829,9 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, int len, err = 0; struct bt_voice voice; u32 opt; + struct bt_codecs *codecs; + struct hci_dev *hdev; + __u8 buffer[255]; BT_DBG("sk %p", sk); @@ -856,6 +879,16 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, } sco_pi(sk)->setting = voice.setting; + hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, + BDADDR_BREDR); + if (!hdev) { + err = -EBADFD; + break; + } + if (enhanced_sco_capable(hdev) && + voice.setting == BT_VOICE_TRANSPARENT) + sco_pi(sk)->codec.id = BT_CODEC_TRANSPARENT; + hci_dev_put(hdev); break; case BT_PKT_STATUS: @@ -870,6 +903,57 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, sco_pi(sk)->cmsg_mask &= SCO_CMSG_PKT_STATUS; break; + case BT_CODEC: + if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && + sk->sk_state != BT_CONNECT2) { + err = -EINVAL; + break; + } + + hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, + BDADDR_BREDR); + if (!hdev) { + err = -EBADFD; + break; + } + + if (!hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)) { + hci_dev_put(hdev); + err = -EOPNOTSUPP; + break; + } + + if (!hdev->get_data_path_id) { + hci_dev_put(hdev); + err = -EOPNOTSUPP; + break; + } + + if (optlen < sizeof(struct bt_codecs) || + optlen > sizeof(buffer)) { + hci_dev_put(hdev); + err = -EINVAL; + break; + } + + if (copy_from_sockptr(buffer, optval, optlen)) { + hci_dev_put(hdev); + err = -EFAULT; + break; + } + + codecs = (void *)buffer; + + if (codecs->num_codecs > 1) { + hci_dev_put(hdev); + err = -EINVAL; + break; + } + + sco_pi(sk)->codec = codecs->codecs[0]; + hci_dev_put(hdev); + break; + default: err = -ENOPROTOOPT; break; @@ -948,6 +1032,12 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, struct bt_voice voice; u32 phys; int pkt_status; + int buf_len; + struct codec_list *c; + u8 num_codecs, i, __user *ptr; + struct hci_dev *hdev; + struct hci_codec_caps *caps; + struct bt_codec codec; BT_DBG("sk %p", sk); @@ -1012,6 +1102,101 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, err = -EFAULT; break; + case BT_CODEC: + num_codecs = 0; + buf_len = 0; + + hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR); + if (!hdev) { + err = -EBADFD; + break; + } + + if (!hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)) { + hci_dev_put(hdev); + err = -EOPNOTSUPP; + break; + } + + if (!hdev->get_data_path_id) { + hci_dev_put(hdev); + err = -EOPNOTSUPP; + break; + } + + /* find total buffer size required to copy codec + caps */ + hci_dev_lock(hdev); + list_for_each_entry(c, &hdev->local_codecs, list) { + if (c->transport != HCI_TRANSPORT_SCO_ESCO) + continue; + num_codecs++; + for (i = 0, caps = c->caps; i < c->num_caps; i++) { + buf_len += 1 + caps->len; + caps = (void *)&caps->data[caps->len]; + } + buf_len += sizeof(struct bt_codec); + } + hci_dev_unlock(hdev); + + buf_len += sizeof(struct bt_codecs); + if (buf_len > len) { + hci_dev_put(hdev); + err = -ENOBUFS; + break; + } + ptr = optval; + + if (put_user(num_codecs, ptr)) { + hci_dev_put(hdev); + err = -EFAULT; + break; + } + ptr += sizeof(num_codecs); + + /* Iterate all the codecs supported over SCO and populate + * codec data + */ + hci_dev_lock(hdev); + list_for_each_entry(c, &hdev->local_codecs, list) { + if (c->transport != HCI_TRANSPORT_SCO_ESCO) + continue; + + codec.id = c->id; + codec.cid = c->cid; + codec.vid = c->vid; + err = hdev->get_data_path_id(hdev, &codec.data_path); + if (err < 0) + break; + codec.num_caps = c->num_caps; + if (copy_to_user(ptr, &codec, sizeof(codec))) { + err = -EFAULT; + break; + } + ptr += sizeof(codec); + + /* find codec capabilities data length */ + len = 0; + for (i = 0, caps = c->caps; i < c->num_caps; i++) { + len += 1 + caps->len; + caps = (void *)&caps->data[caps->len]; + } + + /* copy codec capabilities data */ + if (len && copy_to_user(ptr, c->caps, len)) { + err = -EFAULT; + break; + } + ptr += len; + } + + if (!err && put_user(buf_len, optlen)) + err = -EFAULT; + + hci_dev_unlock(hdev); + hci_dev_put(hdev); + + break; + default: err = -ENOPROTOOPT; break; @@ -1083,11 +1268,11 @@ static void sco_conn_ready(struct sco_conn *conn) BT_DBG("conn %p", conn); if (sk) { + lock_sock(sk); sco_sock_clear_timer(sk); - bh_lock_sock(sk); sk->sk_state = BT_CONNECTED; sk->sk_state_change(sk); - bh_unlock_sock(sk); + release_sock(sk); } else { sco_conn_lock(conn); @@ -1102,12 +1287,12 @@ static void sco_conn_ready(struct sco_conn *conn) return; } - bh_lock_sock(parent); + lock_sock(parent); sk = sco_sock_alloc(sock_net(parent), NULL, BTPROTO_SCO, GFP_ATOMIC, 0); if (!sk) { - bh_unlock_sock(parent); + release_sock(parent); sco_conn_unlock(conn); return; } @@ -1128,7 +1313,7 @@ static void sco_conn_ready(struct sco_conn *conn) /* Wake up parent */ parent->sk_data_ready(parent); - bh_unlock_sock(parent); + release_sock(parent); sco_conn_unlock(conn); } diff --git a/net/bpf/Makefile b/net/bpf/Makefile index 1c0a98d8c28f..1ebe270bde23 100644 --- a/net/bpf/Makefile +++ b/net/bpf/Makefile @@ -1,2 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_BPF_SYSCALL) := test_run.o +ifeq ($(CONFIG_BPF_JIT),y) +obj-$(CONFIG_BPF_SYSCALL) += bpf_dummy_struct_ops.o +endif diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c new file mode 100644 index 000000000000..fbc896323bec --- /dev/null +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd + */ +#include <linux/kernel.h> +#include <linux/bpf_verifier.h> +#include <linux/bpf.h> +#include <linux/btf.h> + +extern struct bpf_struct_ops bpf_bpf_dummy_ops; + +/* A common type for test_N with return value in bpf_dummy_ops */ +typedef int (*dummy_ops_test_ret_fn)(struct bpf_dummy_ops_state *state, ...); + +struct bpf_dummy_ops_test_args { + u64 args[MAX_BPF_FUNC_ARGS]; + struct bpf_dummy_ops_state state; +}; + +static struct bpf_dummy_ops_test_args * +dummy_ops_init_args(const union bpf_attr *kattr, unsigned int nr) +{ + __u32 size_in; + struct bpf_dummy_ops_test_args *args; + void __user *ctx_in; + void __user *u_state; + + size_in = kattr->test.ctx_size_in; + if (size_in != sizeof(u64) * nr) + return ERR_PTR(-EINVAL); + + args = kzalloc(sizeof(*args), GFP_KERNEL); + if (!args) + return ERR_PTR(-ENOMEM); + + ctx_in = u64_to_user_ptr(kattr->test.ctx_in); + if (copy_from_user(args->args, ctx_in, size_in)) + goto out; + + /* args[0] is 0 means state argument of test_N will be NULL */ + u_state = u64_to_user_ptr(args->args[0]); + if (u_state && copy_from_user(&args->state, u_state, + sizeof(args->state))) + goto out; + + return args; +out: + kfree(args); + return ERR_PTR(-EFAULT); +} + +static int dummy_ops_copy_args(struct bpf_dummy_ops_test_args *args) +{ + void __user *u_state; + + u_state = u64_to_user_ptr(args->args[0]); + if (u_state && copy_to_user(u_state, &args->state, sizeof(args->state))) + return -EFAULT; + + return 0; +} + +static int dummy_ops_call_op(void *image, struct bpf_dummy_ops_test_args *args) +{ + dummy_ops_test_ret_fn test = (void *)image; + struct bpf_dummy_ops_state *state = NULL; + + /* state needs to be NULL if args[0] is 0 */ + if (args->args[0]) + state = &args->state; + return test(state, args->args[1], args->args[2], + args->args[3], args->args[4]); +} + +int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + const struct bpf_struct_ops *st_ops = &bpf_bpf_dummy_ops; + const struct btf_type *func_proto; + struct bpf_dummy_ops_test_args *args; + struct bpf_tramp_progs *tprogs; + void *image = NULL; + unsigned int op_idx; + int prog_ret; + int err; + + if (prog->aux->attach_btf_id != st_ops->type_id) + return -EOPNOTSUPP; + + func_proto = prog->aux->attach_func_proto; + args = dummy_ops_init_args(kattr, btf_type_vlen(func_proto)); + if (IS_ERR(args)) + return PTR_ERR(args); + + tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL); + if (!tprogs) { + err = -ENOMEM; + goto out; + } + + image = bpf_jit_alloc_exec(PAGE_SIZE); + if (!image) { + err = -ENOMEM; + goto out; + } + set_vm_flush_reset_perms(image); + + op_idx = prog->expected_attach_type; + err = bpf_struct_ops_prepare_trampoline(tprogs, prog, + &st_ops->func_models[op_idx], + image, image + PAGE_SIZE); + if (err < 0) + goto out; + + set_memory_ro((long)image, 1); + set_memory_x((long)image, 1); + prog_ret = dummy_ops_call_op(image, args); + + err = dummy_ops_copy_args(args); + if (err) + goto out; + if (put_user(prog_ret, &uattr->test.retval)) + err = -EFAULT; +out: + kfree(args); + bpf_jit_free_exec(image); + kfree(tprogs); + return err; +} + +static int bpf_dummy_init(struct btf *btf) +{ + return 0; +} + +static bool bpf_dummy_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + return bpf_tracing_btf_ctx_access(off, size, type, prog, info); +} + +static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id) +{ + const struct btf_type *state; + s32 type_id; + int err; + + type_id = btf_find_by_name_kind(btf, "bpf_dummy_ops_state", + BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + + state = btf_type_by_id(btf, type_id); + if (t != state) { + bpf_log(log, "only access to bpf_dummy_ops_state is supported\n"); + return -EACCES; + } + + err = btf_struct_access(log, btf, t, off, size, atype, next_btf_id); + if (err < 0) + return err; + + return atype == BPF_READ ? err : NOT_INIT; +} + +static const struct bpf_verifier_ops bpf_dummy_verifier_ops = { + .is_valid_access = bpf_dummy_ops_is_valid_access, + .btf_struct_access = bpf_dummy_ops_btf_struct_access, +}; + +static int bpf_dummy_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + return -EOPNOTSUPP; +} + +static int bpf_dummy_reg(void *kdata) +{ + return -EOPNOTSUPP; +} + +static void bpf_dummy_unreg(void *kdata) +{ +} + +struct bpf_struct_ops bpf_bpf_dummy_ops = { + .verifier_ops = &bpf_dummy_verifier_ops, + .init = bpf_dummy_init, + .init_member = bpf_dummy_init_member, + .reg = bpf_dummy_reg, + .unreg = bpf_dummy_unreg, + .name = "bpf_dummy_ops", +}; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index caa16bf30fb5..46dd95755967 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -2,6 +2,7 @@ /* Copyright (c) 2017 Facebook */ #include <linux/bpf.h> +#include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -16,6 +17,7 @@ #include <linux/error-injection.h> #include <linux/smp.h> #include <linux/sock_diag.h> +#include <net/xdp.h> #define CREATE_TRACE_POINTS #include <trace/events/bpf_test_run.h> @@ -88,17 +90,19 @@ reset: static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { - struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = { NULL }; + struct bpf_prog_array_item item = {.prog = prog}; + struct bpf_run_ctx *old_ctx; + struct bpf_cg_run_ctx run_ctx; struct bpf_test_timer t = { NO_MIGRATE }; enum bpf_cgroup_storage_type stype; int ret; for_each_cgroup_storage_type(stype) { - storage[stype] = bpf_cgroup_storage_alloc(prog, stype); - if (IS_ERR(storage[stype])) { - storage[stype] = NULL; + item.cgroup_storage[stype] = bpf_cgroup_storage_alloc(prog, stype); + if (IS_ERR(item.cgroup_storage[stype])) { + item.cgroup_storage[stype] = NULL; for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + bpf_cgroup_storage_free(item.cgroup_storage[stype]); return -ENOMEM; } } @@ -107,22 +111,19 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, repeat = 1; bpf_test_timer_enter(&t); + old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); do { - ret = bpf_cgroup_storage_set(storage); - if (ret) - break; - + run_ctx.prog_item = &item; if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else - *retval = BPF_PROG_RUN(prog, ctx); - - bpf_cgroup_storage_unset(); + *retval = bpf_prog_run(prog, ctx); } while (bpf_test_timer_continue(&t, repeat, &ret, time)); + bpf_reset_run_ctx(old_ctx); bpf_test_timer_leave(&t); for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_free(storage[stype]); + bpf_cgroup_storage_free(item.cgroup_storage[stype]); return ret; } @@ -241,9 +242,11 @@ BTF_ID(func, bpf_kfunc_call_test2) BTF_ID(func, bpf_kfunc_call_test3) BTF_SET_END(test_sk_kfunc_ids) -bool bpf_prog_test_check_kfunc_call(u32 kfunc_id) +bool bpf_prog_test_check_kfunc_call(u32 kfunc_id, struct module *owner) { - return btf_id_set_contains(&test_sk_kfunc_ids, kfunc_id); + if (btf_id_set_contains(&test_sk_kfunc_ids, kfunc_id)) + return true; + return bpf_check_mod_kfunc_call(&prog_test_kfunc_list, kfunc_id, owner); } static void *bpf_test_init(const union bpf_attr *kattr, u32 size, @@ -327,7 +330,7 @@ __bpf_prog_test_run_raw_tp(void *data) struct bpf_raw_tp_test_run_info *info = data; rcu_read_lock(); - info->retval = BPF_PROG_RUN(info->prog, info->ctx); + info->retval = bpf_prog_run(info->prog, info->ctx); rcu_read_unlock(); } @@ -355,13 +358,9 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, return -EINVAL; if (ctx_size_in) { - info.ctx = kzalloc(ctx_size_in, GFP_USER); - if (!info.ctx) - return -ENOMEM; - if (copy_from_user(info.ctx, ctx_in, ctx_size_in)) { - err = -EFAULT; - goto out; - } + info.ctx = memdup_user(ctx_in, ctx_size_in); + if (IS_ERR(info.ctx)) + return PTR_ERR(info.ctx); } else { info.ctx = NULL; } @@ -389,7 +388,6 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, copy_to_user(&uattr->test.retval, &info.retval, sizeof(u32))) err = -EFAULT; -out: kfree(info.ctx); return err; } @@ -483,11 +481,7 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) return -EINVAL; /* priority is allowed */ - - if (!range_is_zero(__skb, offsetofend(struct __sk_buff, priority), - offsetof(struct __sk_buff, ifindex))) - return -EINVAL; - + /* ingress_ifindex is allowed */ /* ifindex is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, ifindex), @@ -511,11 +505,18 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) /* gso_size is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_size), + offsetof(struct __sk_buff, hwtstamp))) + return -EINVAL; + + /* hwtstamp is allowed */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, hwtstamp), sizeof(struct __sk_buff))) return -EINVAL; skb->mark = __skb->mark; skb->priority = __skb->priority; + skb->skb_iif = __skb->ingress_ifindex; skb->tstamp = __skb->tstamp; memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); @@ -532,6 +533,7 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) return -EINVAL; skb_shinfo(skb)->gso_segs = __skb->gso_segs; skb_shinfo(skb)->gso_size = __skb->gso_size; + skb_shinfo(skb)->hwtstamps.hwtstamp = __skb->hwtstamp; return 0; } @@ -545,13 +547,21 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) __skb->mark = skb->mark; __skb->priority = skb->priority; + __skb->ingress_ifindex = skb->skb_iif; __skb->ifindex = skb->dev->ifindex; __skb->tstamp = skb->tstamp; memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); __skb->wire_len = cb->pkt_len; __skb->gso_segs = skb_shinfo(skb)->gso_segs; + __skb->hwtstamp = skb_shinfo(skb)->hwtstamps.hwtstamp; } +static struct proto bpf_dummy_proto = { + .name = "bpf_dummy", + .owner = THIS_MODULE, + .obj_size = sizeof(struct sock), +}; + int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { @@ -596,20 +606,19 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, break; } - sk = kzalloc(sizeof(struct sock), GFP_USER); + sk = sk_alloc(net, AF_UNSPEC, GFP_USER, &bpf_dummy_proto, 1); if (!sk) { kfree(data); kfree(ctx); return -ENOMEM; } - sock_net_set(sk, net); sock_init_data(NULL, sk); skb = build_skb(data, 0); if (!skb) { kfree(data); kfree(ctx); - kfree(sk); + sk_free(sk); return -ENOMEM; } skb->sk = sk; @@ -682,12 +691,69 @@ out: if (dev && dev != net->loopback_dev) dev_put(dev); kfree_skb(skb); - bpf_sk_storage_free(sk); - kfree(sk); + sk_free(sk); kfree(ctx); return ret; } +static int xdp_convert_md_to_buff(struct xdp_md *xdp_md, struct xdp_buff *xdp) +{ + unsigned int ingress_ifindex, rx_queue_index; + struct netdev_rx_queue *rxqueue; + struct net_device *device; + + if (!xdp_md) + return 0; + + if (xdp_md->egress_ifindex != 0) + return -EINVAL; + + ingress_ifindex = xdp_md->ingress_ifindex; + rx_queue_index = xdp_md->rx_queue_index; + + if (!ingress_ifindex && rx_queue_index) + return -EINVAL; + + if (ingress_ifindex) { + device = dev_get_by_index(current->nsproxy->net_ns, + ingress_ifindex); + if (!device) + return -ENODEV; + + if (rx_queue_index >= device->real_num_rx_queues) + goto free_dev; + + rxqueue = __netif_get_rx_queue(device, rx_queue_index); + + if (!xdp_rxq_info_is_reg(&rxqueue->xdp_rxq)) + goto free_dev; + + xdp->rxq = &rxqueue->xdp_rxq; + /* The device is now tracked in the xdp->rxq for later + * dev_put() + */ + } + + xdp->data = xdp->data_meta + xdp_md->data; + return 0; + +free_dev: + dev_put(device); + return -EINVAL; +} + +static void xdp_convert_buff_to_md(struct xdp_buff *xdp, struct xdp_md *xdp_md) +{ + if (!xdp_md) + return; + + xdp_md->data = xdp->data - xdp->data_meta; + xdp_md->data_end = xdp->data_end - xdp->data_meta; + + if (xdp_md->ingress_ifindex) + dev_put(xdp->rxq->dev); +} + int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { @@ -698,38 +764,75 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, struct netdev_rx_queue *rxqueue; struct xdp_buff xdp = {}; u32 retval, duration; + struct xdp_md *ctx; u32 max_data_sz; void *data; - int ret; + int ret = -EINVAL; if (prog->expected_attach_type == BPF_XDP_DEVMAP || prog->expected_attach_type == BPF_XDP_CPUMAP) return -EINVAL; - if (kattr->test.ctx_in || kattr->test.ctx_out) - return -EINVAL; + + ctx = bpf_ctx_init(kattr, sizeof(struct xdp_md)); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + if (ctx) { + /* There can't be user provided data before the meta data */ + if (ctx->data_meta || ctx->data_end != size || + ctx->data > ctx->data_end || + unlikely(xdp_metalen_invalid(ctx->data))) + goto free_ctx; + /* Meta data is allocated from the headroom */ + headroom -= ctx->data; + } /* XDP have extra tailroom as (most) drivers use full page */ max_data_sz = 4096 - headroom - tailroom; data = bpf_test_init(kattr, max_data_sz, headroom, tailroom); - if (IS_ERR(data)) - return PTR_ERR(data); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + goto free_ctx; + } rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp_init_buff(&xdp, headroom + max_data_sz + tailroom, &rxqueue->xdp_rxq); xdp_prepare_buff(&xdp, data, headroom, size, true); - bpf_prog_change_xdp(NULL, prog); + ret = xdp_convert_md_to_buff(ctx, &xdp); + if (ret) + goto free_data; + + if (repeat > 1) + bpf_prog_change_xdp(NULL, prog); ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); + /* We convert the xdp_buff back to an xdp_md before checking the return + * code so the reference count of any held netdevice will be decremented + * even if the test run failed. + */ + xdp_convert_buff_to_md(&xdp, ctx); if (ret) goto out; - if (xdp.data != data + headroom || xdp.data_end != xdp.data + size) - size = xdp.data_end - xdp.data; - ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); + + if (xdp.data_meta != data + headroom || + xdp.data_end != xdp.data_meta + size) + size = xdp.data_end - xdp.data_meta; + + ret = bpf_test_finish(kattr, uattr, xdp.data_meta, size, retval, + duration); + if (!ret) + ret = bpf_ctx_finish(kattr, uattr, ctx, + sizeof(struct xdp_md)); + out: - bpf_prog_change_xdp(prog, NULL); + if (repeat > 1) + bpf_prog_change_xdp(prog, NULL); +free_data: kfree(data); +free_ctx: + kfree(ctx); return ret; } @@ -896,7 +999,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat bpf_test_timer_enter(&t); do { ctx.selected_sk = NULL; - retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, BPF_PROG_RUN); + retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run); } while (bpf_test_timer_continue(&t, repeat, &ret, &duration)); bpf_test_timer_leave(&t); @@ -944,13 +1047,9 @@ int bpf_prog_test_run_syscall(struct bpf_prog *prog, return -EINVAL; if (ctx_size_in) { - ctx = kzalloc(ctx_size_in, GFP_USER); - if (!ctx) - return -ENOMEM; - if (copy_from_user(ctx, ctx_in, ctx_size_in)) { - err = -EFAULT; - goto out; - } + ctx = memdup_user(ctx_in, ctx_size_in); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); } rcu_read_lock_trace(); diff --git a/net/bridge/br.c b/net/bridge/br.c index ef743f94254d..1fac72cc617f 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -36,7 +36,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v bool changed_addr; int err; - if (dev->priv_flags & IFF_EBRIDGE) { + if (netif_is_bridge_master(dev)) { err = br_vlan_bridge_event(dev, event, ptr); if (err) return notifier_from_errno(err); @@ -201,6 +201,48 @@ static struct notifier_block br_switchdev_notifier = { .notifier_call = br_switchdev_event, }; +/* called under rtnl_mutex */ +static int br_switchdev_blocking_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); + struct net_device *dev = switchdev_notifier_info_to_dev(ptr); + struct switchdev_notifier_brport_info *brport_info; + const struct switchdev_brport *b; + struct net_bridge_port *p; + int err = NOTIFY_DONE; + + p = br_port_get_rtnl(dev); + if (!p) + goto out; + + switch (event) { + case SWITCHDEV_BRPORT_OFFLOADED: + brport_info = ptr; + b = &brport_info->brport; + + err = br_switchdev_port_offload(p, b->dev, b->ctx, + b->atomic_nb, b->blocking_nb, + b->tx_fwd_offload, extack); + err = notifier_from_errno(err); + break; + case SWITCHDEV_BRPORT_UNOFFLOADED: + brport_info = ptr; + b = &brport_info->brport; + + br_switchdev_port_unoffload(p, b->ctx, b->atomic_nb, + b->blocking_nb); + break; + } + +out: + return err; +} + +static struct notifier_block br_switchdev_blocking_notifier = { + .notifier_call = br_switchdev_blocking_event, +}; + /* br_boolopt_toggle - change user-controlled boolean option * * @br: bridge device @@ -214,17 +256,22 @@ static struct notifier_block br_switchdev_notifier = { int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, struct netlink_ext_ack *extack) { + int err = 0; + switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: br_opt_toggle(br, BROPT_NO_LL_LEARN, on); break; + case BR_BOOLOPT_MCAST_VLAN_SNOOPING: + err = br_multicast_toggle_vlan_snooping(br, on, extack); + break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); break; } - return 0; + return err; } int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) @@ -232,6 +279,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: return br_opt_get(br, BROPT_NO_LL_LEARN); + case BR_BOOLOPT_MCAST_VLAN_SNOOPING: + return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -300,7 +349,7 @@ static void __net_exit br_net_exit(struct net *net) rtnl_lock(); for_each_netdev(net, dev) - if (dev->priv_flags & IFF_EBRIDGE) + if (netif_is_bridge_master(dev)) br_dev_delete(dev, &list); unregister_netdevice_many(&list); @@ -348,11 +397,15 @@ static int __init br_init(void) if (err) goto err_out4; - err = br_netlink_init(); + err = register_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); if (err) goto err_out5; - brioctl_set(br_ioctl_deviceless_stub); + err = br_netlink_init(); + if (err) + goto err_out6; + + brioctl_set(br_ioctl_stub); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = br_fdb_test_addr; @@ -366,6 +419,8 @@ static int __init br_init(void) return 0; +err_out6: + unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); err_out5: unregister_switchdev_notifier(&br_switchdev_notifier); err_out4: @@ -385,6 +440,7 @@ static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); br_netlink_fini(); + unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); unregister_switchdev_notifier(&br_switchdev_notifier); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index e8b626cc6bfd..8d6bab244c4a 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -27,11 +27,14 @@ EXPORT_SYMBOL_GPL(nf_br_ops); /* net device transmit always called with BH disabled */ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) { + struct net_bridge_mcast_port *pmctx_null = NULL; struct net_bridge *br = netdev_priv(dev); + struct net_bridge_mcast *brmctx = &br->multicast_ctx; struct net_bridge_fdb_entry *dst; struct net_bridge_mdb_entry *mdst; const struct nf_br_ops *nf_ops; u8 state = BR_STATE_FORWARDING; + struct net_bridge_vlan *vlan; const unsigned char *dest; u16 vid = 0; @@ -53,7 +56,8 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) skb_reset_mac_header(skb); skb_pull(skb, ETH_HLEN); - if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, &state)) + if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid, + &state, &vlan)) goto out; if (IS_ENABLED(CONFIG_INET) && @@ -82,15 +86,15 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) br_flood(br, skb, BR_PKT_MULTICAST, false, true); goto out; } - if (br_multicast_rcv(br, NULL, skb, vid)) { + if (br_multicast_rcv(&brmctx, &pmctx_null, vlan, skb, vid)) { kfree_skb(skb); goto out; } - mdst = br_mdb_get(br, skb, vid); + mdst = br_mdb_get(brmctx, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && - br_multicast_querier_exists(br, eth_hdr(skb), mdst)) - br_multicast_flood(mdst, skb, false, true); + br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) + br_multicast_flood(mdst, skb, brmctx, false, true); else br_flood(br, skb, BR_PKT_MULTICAST, false, true); } else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) { @@ -450,7 +454,7 @@ static const struct net_device_ops br_netdev_ops = { .ndo_set_rx_mode = br_dev_set_multicast_list, .ndo_change_rx_flags = br_dev_change_rx_flags, .ndo_change_mtu = br_change_mtu, - .ndo_do_ioctl = br_dev_ioctl, + .ndo_siocdevprivate = br_dev_siocdevprivate, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_setup = br_netpoll_setup, .ndo_netpoll_cleanup = br_netpoll_cleanup, diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 5dee30966ed3..6ccda68bd473 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -32,10 +32,6 @@ static const struct rhashtable_params br_fdb_rht_params = { }; static struct kmem_cache *br_fdb_cache __read_mostly; -static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid); -static void fdb_notify(struct net_bridge *br, - const struct net_bridge_fdb_entry *, int, bool); int __init br_fdb_init(void) { @@ -87,6 +83,128 @@ static void fdb_rcu_free(struct rcu_head *head) kmem_cache_free(br_fdb_cache, ent); } +static int fdb_to_nud(const struct net_bridge *br, + const struct net_bridge_fdb_entry *fdb) +{ + if (test_bit(BR_FDB_LOCAL, &fdb->flags)) + return NUD_PERMANENT; + else if (test_bit(BR_FDB_STATIC, &fdb->flags)) + return NUD_NOARP; + else if (has_expired(br, fdb)) + return NUD_STALE; + else + return NUD_REACHABLE; +} + +static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, + const struct net_bridge_fdb_entry *fdb, + u32 portid, u32 seq, int type, unsigned int flags) +{ + const struct net_bridge_port *dst = READ_ONCE(fdb->dst); + unsigned long now = jiffies; + struct nda_cacheinfo ci; + struct nlmsghdr *nlh; + struct ndmsg *ndm; + + nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); + if (nlh == NULL) + return -EMSGSIZE; + + ndm = nlmsg_data(nlh); + ndm->ndm_family = AF_BRIDGE; + ndm->ndm_pad1 = 0; + ndm->ndm_pad2 = 0; + ndm->ndm_flags = 0; + ndm->ndm_type = 0; + ndm->ndm_ifindex = dst ? dst->dev->ifindex : br->dev->ifindex; + ndm->ndm_state = fdb_to_nud(br, fdb); + + if (test_bit(BR_FDB_OFFLOADED, &fdb->flags)) + ndm->ndm_flags |= NTF_OFFLOADED; + if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) + ndm->ndm_flags |= NTF_EXT_LEARNED; + if (test_bit(BR_FDB_STICKY, &fdb->flags)) + ndm->ndm_flags |= NTF_STICKY; + + if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) + goto nla_put_failure; + if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex)) + goto nla_put_failure; + ci.ndm_used = jiffies_to_clock_t(now - fdb->used); + ci.ndm_confirmed = 0; + ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); + ci.ndm_refcnt = 0; + if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) + goto nla_put_failure; + + if (fdb->key.vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), + &fdb->key.vlan_id)) + goto nla_put_failure; + + if (test_bit(BR_FDB_NOTIFY, &fdb->flags)) { + struct nlattr *nest = nla_nest_start(skb, NDA_FDB_EXT_ATTRS); + u8 notify_bits = FDB_NOTIFY_BIT; + + if (!nest) + goto nla_put_failure; + if (test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) + notify_bits |= FDB_NOTIFY_INACTIVE_BIT; + + if (nla_put_u8(skb, NFEA_ACTIVITY_NOTIFY, notify_bits)) { + nla_nest_cancel(skb, nest); + goto nla_put_failure; + } + + nla_nest_end(skb, nest); + } + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static inline size_t fdb_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ndmsg)) + + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ + + nla_total_size(sizeof(u32)) /* NDA_MASTER */ + + nla_total_size(sizeof(u16)) /* NDA_VLAN */ + + nla_total_size(sizeof(struct nda_cacheinfo)) + + nla_total_size(0) /* NDA_FDB_EXT_ATTRS */ + + nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */ +} + +static void fdb_notify(struct net_bridge *br, + const struct net_bridge_fdb_entry *fdb, int type, + bool swdev_notify) +{ + struct net *net = dev_net(br->dev); + struct sk_buff *skb; + int err = -ENOBUFS; + + if (swdev_notify) + br_switchdev_fdb_notify(br, fdb, type); + + skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC); + if (skb == NULL) + goto errout; + + err = fdb_fill_info(skb, br, fdb, 0, 0, type, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in fdb_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); + return; +errout: + rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); +} + static struct net_bridge_fdb_entry *fdb_find_rcu(struct rhashtable *tbl, const unsigned char *addr, __u16 vid) @@ -257,6 +375,66 @@ void br_fdb_find_delete_local(struct net_bridge *br, spin_unlock_bh(&br->hash_lock); } +static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, + struct net_bridge_port *source, + const unsigned char *addr, + __u16 vid, + unsigned long flags) +{ + struct net_bridge_fdb_entry *fdb; + int err; + + fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC); + if (!fdb) + return NULL; + + memcpy(fdb->key.addr.addr, addr, ETH_ALEN); + WRITE_ONCE(fdb->dst, source); + fdb->key.vlan_id = vid; + fdb->flags = flags; + fdb->updated = fdb->used = jiffies; + err = rhashtable_lookup_insert_fast(&br->fdb_hash_tbl, &fdb->rhnode, + br_fdb_rht_params); + if (err) { + kmem_cache_free(br_fdb_cache, fdb); + return NULL; + } + + hlist_add_head_rcu(&fdb->fdb_node, &br->fdb_list); + + return fdb; +} + +static int fdb_add_local(struct net_bridge *br, struct net_bridge_port *source, + const unsigned char *addr, u16 vid) +{ + struct net_bridge_fdb_entry *fdb; + + if (!is_valid_ether_addr(addr)) + return -EINVAL; + + fdb = br_fdb_find(br, addr, vid); + if (fdb) { + /* it is okay to have multiple ports with same + * address, just use the first one. + */ + if (test_bit(BR_FDB_LOCAL, &fdb->flags)) + return 0; + br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n", + source ? source->dev->name : br->dev->name, addr, vid); + fdb_delete(br, fdb, true); + } + + fdb = fdb_create(br, source, addr, vid, + BIT(BR_FDB_LOCAL) | BIT(BR_FDB_STATIC)); + if (!fdb) + return -ENOMEM; + + fdb_add_hw_addr(br, addr); + fdb_notify(br, fdb, RTM_NEWNEIGH, true); + return 0; +} + void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) { struct net_bridge_vlan_group *vg; @@ -283,7 +461,7 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) insert: /* insert new address, may fail if invalid address or dup. */ - fdb_insert(br, p, newaddr, 0); + fdb_add_local(br, p, newaddr, 0); if (!vg || !vg->num_vlans) goto done; @@ -293,7 +471,7 @@ insert: * from under us. */ list_for_each_entry(v, &vg->vlan_list, vlist) - fdb_insert(br, p, newaddr, v->vid); + fdb_add_local(br, p, newaddr, v->vid); done: spin_unlock_bh(&br->hash_lock); @@ -313,7 +491,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) fdb_delete_local(br, NULL, f); - fdb_insert(br, NULL, newaddr, 0); + fdb_add_local(br, NULL, newaddr, 0); vg = br_vlan_group(br); if (!vg || !vg->num_vlans) goto out; @@ -328,7 +506,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) if (f && test_bit(BR_FDB_LOCAL, &f->flags) && !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) fdb_delete_local(br, NULL, f); - fdb_insert(br, NULL, newaddr, v->vid); + fdb_add_local(br, NULL, newaddr, v->vid); } out: spin_unlock_bh(&br->hash_lock); @@ -503,71 +681,14 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf, return num; } -static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br, - struct net_bridge_port *source, - const unsigned char *addr, - __u16 vid, - unsigned long flags) -{ - struct net_bridge_fdb_entry *fdb; - - fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC); - if (fdb) { - memcpy(fdb->key.addr.addr, addr, ETH_ALEN); - WRITE_ONCE(fdb->dst, source); - fdb->key.vlan_id = vid; - fdb->flags = flags; - fdb->updated = fdb->used = jiffies; - if (rhashtable_lookup_insert_fast(&br->fdb_hash_tbl, - &fdb->rhnode, - br_fdb_rht_params)) { - kmem_cache_free(br_fdb_cache, fdb); - fdb = NULL; - } else { - hlist_add_head_rcu(&fdb->fdb_node, &br->fdb_list); - } - } - return fdb; -} - -static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid) -{ - struct net_bridge_fdb_entry *fdb; - - if (!is_valid_ether_addr(addr)) - return -EINVAL; - - fdb = br_fdb_find(br, addr, vid); - if (fdb) { - /* it is okay to have multiple ports with same - * address, just use the first one. - */ - if (test_bit(BR_FDB_LOCAL, &fdb->flags)) - return 0; - br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n", - source ? source->dev->name : br->dev->name, addr, vid); - fdb_delete(br, fdb, true); - } - - fdb = fdb_create(br, source, addr, vid, - BIT(BR_FDB_LOCAL) | BIT(BR_FDB_STATIC)); - if (!fdb) - return -ENOMEM; - - fdb_add_hw_addr(br, addr); - fdb_notify(br, fdb, RTM_NEWNEIGH, true); - return 0; -} - /* Add entry for local address of interface */ -int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid) +int br_fdb_add_local(struct net_bridge *br, struct net_bridge_port *source, + const unsigned char *addr, u16 vid) { int ret; spin_lock_bh(&br->hash_lock); - ret = fdb_insert(br, source, addr, vid); + ret = fdb_add_local(br, source, addr, vid); spin_unlock_bh(&br->hash_lock); return ret; } @@ -638,190 +759,6 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, } } -static int fdb_to_nud(const struct net_bridge *br, - const struct net_bridge_fdb_entry *fdb) -{ - if (test_bit(BR_FDB_LOCAL, &fdb->flags)) - return NUD_PERMANENT; - else if (test_bit(BR_FDB_STATIC, &fdb->flags)) - return NUD_NOARP; - else if (has_expired(br, fdb)) - return NUD_STALE; - else - return NUD_REACHABLE; -} - -static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, - const struct net_bridge_fdb_entry *fdb, - u32 portid, u32 seq, int type, unsigned int flags) -{ - const struct net_bridge_port *dst = READ_ONCE(fdb->dst); - unsigned long now = jiffies; - struct nda_cacheinfo ci; - struct nlmsghdr *nlh; - struct ndmsg *ndm; - - nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); - if (nlh == NULL) - return -EMSGSIZE; - - ndm = nlmsg_data(nlh); - ndm->ndm_family = AF_BRIDGE; - ndm->ndm_pad1 = 0; - ndm->ndm_pad2 = 0; - ndm->ndm_flags = 0; - ndm->ndm_type = 0; - ndm->ndm_ifindex = dst ? dst->dev->ifindex : br->dev->ifindex; - ndm->ndm_state = fdb_to_nud(br, fdb); - - if (test_bit(BR_FDB_OFFLOADED, &fdb->flags)) - ndm->ndm_flags |= NTF_OFFLOADED; - if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) - ndm->ndm_flags |= NTF_EXT_LEARNED; - if (test_bit(BR_FDB_STICKY, &fdb->flags)) - ndm->ndm_flags |= NTF_STICKY; - - if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) - goto nla_put_failure; - if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex)) - goto nla_put_failure; - ci.ndm_used = jiffies_to_clock_t(now - fdb->used); - ci.ndm_confirmed = 0; - ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); - ci.ndm_refcnt = 0; - if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) - goto nla_put_failure; - - if (fdb->key.vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16), - &fdb->key.vlan_id)) - goto nla_put_failure; - - if (test_bit(BR_FDB_NOTIFY, &fdb->flags)) { - struct nlattr *nest = nla_nest_start(skb, NDA_FDB_EXT_ATTRS); - u8 notify_bits = FDB_NOTIFY_BIT; - - if (!nest) - goto nla_put_failure; - if (test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags)) - notify_bits |= FDB_NOTIFY_INACTIVE_BIT; - - if (nla_put_u8(skb, NFEA_ACTIVITY_NOTIFY, notify_bits)) { - nla_nest_cancel(skb, nest); - goto nla_put_failure; - } - - nla_nest_end(skb, nest); - } - - nlmsg_end(skb, nlh); - return 0; - -nla_put_failure: - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; -} - -static inline size_t fdb_nlmsg_size(void) -{ - return NLMSG_ALIGN(sizeof(struct ndmsg)) - + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ - + nla_total_size(sizeof(u32)) /* NDA_MASTER */ - + nla_total_size(sizeof(u16)) /* NDA_VLAN */ - + nla_total_size(sizeof(struct nda_cacheinfo)) - + nla_total_size(0) /* NDA_FDB_EXT_ATTRS */ - + nla_total_size(sizeof(u8)); /* NFEA_ACTIVITY_NOTIFY */ -} - -static int br_fdb_replay_one(struct notifier_block *nb, - const struct net_bridge_fdb_entry *fdb, - struct net_device *dev, unsigned long action, - const void *ctx) -{ - struct switchdev_notifier_fdb_info item; - int err; - - item.addr = fdb->key.addr.addr; - item.vid = fdb->key.vlan_id; - item.added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); - item.offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags); - item.is_local = test_bit(BR_FDB_LOCAL, &fdb->flags); - item.info.dev = dev; - item.info.ctx = ctx; - - err = nb->notifier_call(nb, action, &item); - return notifier_to_errno(err); -} - -int br_fdb_replay(const struct net_device *br_dev, const struct net_device *dev, - const void *ctx, bool adding, struct notifier_block *nb) -{ - struct net_bridge_fdb_entry *fdb; - struct net_bridge *br; - unsigned long action; - int err = 0; - - if (!netif_is_bridge_master(br_dev)) - return -EINVAL; - - if (!netif_is_bridge_port(dev) && !netif_is_bridge_master(dev)) - return -EINVAL; - - br = netdev_priv(br_dev); - - if (adding) - action = SWITCHDEV_FDB_ADD_TO_DEVICE; - else - action = SWITCHDEV_FDB_DEL_TO_DEVICE; - - rcu_read_lock(); - - hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) { - const struct net_bridge_port *dst = READ_ONCE(fdb->dst); - struct net_device *dst_dev; - - dst_dev = dst ? dst->dev : br->dev; - if (dst_dev && dst_dev != dev) - continue; - - err = br_fdb_replay_one(nb, fdb, dst_dev, action, ctx); - if (err) - break; - } - - rcu_read_unlock(); - - return err; -} -EXPORT_SYMBOL_GPL(br_fdb_replay); - -static void fdb_notify(struct net_bridge *br, - const struct net_bridge_fdb_entry *fdb, int type, - bool swdev_notify) -{ - struct net *net = dev_net(br->dev); - struct sk_buff *skb; - int err = -ENOBUFS; - - if (swdev_notify) - br_switchdev_fdb_notify(br, fdb, type); - - skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC); - if (skb == NULL) - goto errout; - - err = fdb_fill_info(skb, br, fdb, 0, 0, type, 0); - if (err < 0) { - /* -EMSGSIZE implies BUG in fdb_nlmsg_size() */ - WARN_ON(err == -EMSGSIZE); - kfree_skb(skb); - goto errout; - } - rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); - return; -errout: - rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); -} - /* Dump information about entries, in response to GETNEIGH */ int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, @@ -833,7 +770,7 @@ int br_fdb_dump(struct sk_buff *skb, struct net_bridge_fdb_entry *f; int err = 0; - if (!(dev->priv_flags & IFF_EBRIDGE)) + if (!netif_is_bridge_master(dev)) return err; if (!filter_dev) { @@ -1084,7 +1021,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], return -EINVAL; } - if (dev->priv_flags & IFF_EBRIDGE) { + if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group(br); } else { @@ -1181,7 +1118,7 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct net_bridge *br; int err; - if (dev->priv_flags & IFF_EBRIDGE) { + if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group(br); } else { diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 07856362538f..ec646656dbf1 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -48,6 +48,8 @@ int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb skb_set_network_header(skb, depth); } + br_switchdev_frame_set_offload_fwd_mark(skb); + dev_queue_xmit(skb); return 0; @@ -76,6 +78,11 @@ static void __br_forward(const struct net_bridge_port *to, struct net *net; int br_hook; + /* Mark the skb for forwarding offload early so that br_handle_vlan() + * can know whether to pop the VLAN header on egress or keep it. + */ + nbp_switchdev_frame_mark_tx_fwd_offload(to, skb); + vg = nbp_vlan_group_rcu(to); skb = br_handle_vlan(to->br, to, vg, skb); if (!skb) @@ -174,6 +181,8 @@ static struct net_bridge_port *maybe_deliver( if (!should_deliver(p, skb)) return prev; + nbp_switchdev_frame_mark_tx_fwd_to_hwdom(p, skb); + if (!prev) goto out; @@ -267,20 +276,19 @@ static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb, /* called with rcu_read_lock */ void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, + struct net_bridge_mcast *brmctx, bool local_rcv, bool local_orig) { - struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev; - struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *prev = NULL; struct net_bridge_port_group *p; bool allow_mode_include = true; struct hlist_node *rp; - rp = br_multicast_get_first_rport_node(br, skb); + rp = br_multicast_get_first_rport_node(brmctx, skb); if (mdst) { p = rcu_dereference(mdst->ports); - if (br_multicast_should_handle_mode(br, mdst->addr.proto) && + if (br_multicast_should_handle_mode(brmctx, mdst->addr.proto) && br_multicast_is_star_g(&mdst->addr)) allow_mode_include = false; } else { diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 14cd6ef96111..c1183fef1f21 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -456,7 +456,7 @@ int br_add_bridge(struct net *net, const char *name) dev_net_set(dev, net); dev->rtnl_link_ops = &br_link_ops; - res = register_netdev(dev); + res = register_netdevice(dev); if (res) free_netdev(dev); return res; @@ -467,12 +467,11 @@ int br_del_bridge(struct net *net, const char *name) struct net_device *dev; int ret = 0; - rtnl_lock(); dev = __dev_get_by_name(net, name); if (dev == NULL) ret = -ENXIO; /* Could not find device */ - else if (!(dev->priv_flags & IFF_EBRIDGE)) { + else if (!netif_is_bridge_master(dev)) { /* Attempt to delete non bridge device! */ ret = -EPERM; } @@ -485,7 +484,6 @@ int br_del_bridge(struct net *net, const char *name) else br_dev_delete(dev, NULL); - rtnl_unlock(); return ret; } @@ -644,10 +642,6 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, if (err) goto err5; - err = nbp_switchdev_mark_set(p); - if (err) - goto err6; - dev_disable_lro(dev); list_add_rcu(&p->list, &br->port_list); @@ -676,7 +670,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, else netdev_set_rx_headroom(dev, br_hr); - if (br_fdb_insert(br, p, dev->dev_addr, 0)) + if (br_fdb_add_local(br, p, dev->dev_addr, 0)) netdev_err(dev, "failed insert local address bridge forwarding table\n"); if (br->dev->addr_assign_type != NET_ADDR_SET) { @@ -685,13 +679,13 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, */ err = dev_pre_changeaddr_notify(br->dev, dev->dev_addr, extack); if (err) - goto err7; + goto err6; } err = nbp_vlan_init(p, extack); if (err) { netdev_err(dev, "failed to initialize vlan filtering on this port\n"); - goto err7; + goto err6; } spin_lock_bh(&br->lock); @@ -714,13 +708,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, return 0; -err7: +err6: if (fdb_synced) br_fdb_unsync_static(br, p); list_del_rcu(&p->list); br_fdb_delete_by_port(br, p, 0, 1); nbp_update_port_count(br); -err6: netdev_upper_dev_unlink(dev, br->dev); err5: dev->priv_flags &= ~IFF_BRIDGE_PORT; diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 1f506309efa8..b50382f957c1 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -69,8 +69,11 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb struct net_bridge_port *p = br_port_get_rcu(skb->dev); enum br_pkt_type pkt_type = BR_PKT_UNICAST; struct net_bridge_fdb_entry *dst = NULL; + struct net_bridge_mcast_port *pmctx; struct net_bridge_mdb_entry *mdst; bool local_rcv, mcast_hit = false; + struct net_bridge_mcast *brmctx; + struct net_bridge_vlan *vlan; struct net_bridge *br; u16 vid = 0; u8 state; @@ -78,9 +81,11 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (!p || p->state == BR_STATE_DISABLED) goto drop; + brmctx = &p->br->multicast_ctx; + pmctx = &p->multicast_ctx; state = p->state; if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid, - &state)) + &state, &vlan)) goto out; nbp_switchdev_frame_mark(p, skb); @@ -98,7 +103,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb local_rcv = true; } else { pkt_type = BR_PKT_MULTICAST; - if (br_multicast_rcv(br, p, skb, vid)) + if (br_multicast_rcv(&brmctx, &pmctx, vlan, skb, vid)) goto drop; } } @@ -128,11 +133,11 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb switch (pkt_type) { case BR_PKT_MULTICAST: - mdst = br_mdb_get(br, skb, vid); + mdst = br_mdb_get(brmctx, skb, vid); if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) && - br_multicast_querier_exists(br, eth_hdr(skb), mdst)) { + br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) { if ((mdst && mdst->host_joined) || - br_multicast_is_router(br, skb)) { + br_multicast_is_router(brmctx, skb)) { local_rcv = true; br->dev->stats.multicast++; } @@ -162,7 +167,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb if (!mcast_hit) br_flood(br, skb, pkt_type, local_rcv, false); else - br_multicast_flood(mdst, skb, local_rcv, false); + br_multicast_flood(mdst, skb, brmctx, local_rcv, false); } if (local_rcv) @@ -289,11 +294,8 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); p = br_port_get_rcu(skb->dev); - if (p->flags & BR_VLAN_TUNNEL) { - if (br_handle_ingress_vlan_tunnel(skb, p, - nbp_vlan_group_rcu(p))) - goto drop; - } + if (p->flags & BR_VLAN_TUNNEL) + br_handle_ingress_vlan_tunnel(skb, p, nbp_vlan_group_rcu(p)); if (unlikely(is_link_local_ether_addr(dest))) { u16 fwd_mask = p->br->group_fwd_mask_required; diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index 2db800fc27ca..db4ab2c2ce18 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -26,7 +26,7 @@ static int get_bridge_ifindices(struct net *net, int *indices, int num) for_each_netdev_rcu(net, dev) { if (i >= num) break; - if (dev->priv_flags & IFF_EBRIDGE) + if (netif_is_bridge_master(dev)) indices[i++] = dev->ifindex; } rcu_read_unlock(); @@ -71,7 +71,8 @@ static int get_fdb_entries(struct net_bridge *br, void __user *userbuf, num = br_fdb_fillbuf(br, buf, maxnum, offset); if (num > 0) { - if (copy_to_user(userbuf, buf, num*sizeof(struct __fdb_entry))) + if (copy_to_user(userbuf, buf, + array_size(num, sizeof(struct __fdb_entry)))) num = -EFAULT; } kfree(buf); @@ -106,15 +107,32 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) * This interface is deprecated because it was too difficult * to do the translation for 32/64bit ioctl compatibility. */ -static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) +int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq, void __user *data, int cmd) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p = NULL; unsigned long args[4]; + void __user *argp; int ret = -EOPNOTSUPP; - if (copy_from_user(args, rq->ifr_data, sizeof(args))) - return -EFAULT; + if (in_compat_syscall()) { + unsigned int cargs[4]; + + if (copy_from_user(cargs, data, sizeof(cargs))) + return -EFAULT; + + args[0] = cargs[0]; + args[1] = cargs[1]; + args[2] = cargs[2]; + args[3] = cargs[3]; + + argp = compat_ptr(args[1]); + } else { + if (copy_from_user(args, data, sizeof(args))) + return -EFAULT; + + argp = (void __user *)args[1]; + } switch (args[0]) { case BRCTL_ADD_IF: @@ -171,7 +189,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) return -ENOMEM; get_port_ifindices(br, indices, num); - if (copy_to_user((void __user *)args[1], indices, num*sizeof(int))) + if (copy_to_user(argp, indices, array_size(num, sizeof(int)))) num = -EFAULT; kfree(indices); return num; @@ -232,7 +250,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) rcu_read_unlock(); - if (copy_to_user((void __user *)args[1], &p, sizeof(p))) + if (copy_to_user(argp, &p, sizeof(p))) return -EFAULT; return 0; @@ -282,8 +300,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) } case BRCTL_GET_FDB_ENTRIES: - return get_fdb_entries(br, (void __user *)args[1], - args[2], args[3]); + return get_fdb_entries(br, argp, args[2], args[3]); } if (!ret) { @@ -320,7 +337,8 @@ static int old_deviceless(struct net *net, void __user *uarg) args[2] = get_bridge_ifindices(net, indices, args[2]); - ret = copy_to_user((void __user *)args[1], indices, args[2]*sizeof(int)) + ret = copy_to_user(uarg, indices, + array_size(args[2], sizeof(int))) ? -EFAULT : args[2]; kfree(indices); @@ -350,48 +368,47 @@ static int old_deviceless(struct net *net, void __user *uarg) return -EOPNOTSUPP; } -int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg) +int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, + struct ifreq *ifr, void __user *uarg) { + int ret = -EOPNOTSUPP; + + rtnl_lock(); + switch (cmd) { case SIOCGIFBR: case SIOCSIFBR: - return old_deviceless(net, uarg); - + ret = old_deviceless(net, uarg); + break; case SIOCBRADDBR: case SIOCBRDELBR: { char buf[IFNAMSIZ]; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + break; + } - if (copy_from_user(buf, uarg, IFNAMSIZ)) - return -EFAULT; + if (copy_from_user(buf, uarg, IFNAMSIZ)) { + ret = -EFAULT; + break; + } buf[IFNAMSIZ-1] = 0; if (cmd == SIOCBRADDBR) - return br_add_bridge(net, buf); - - return br_del_bridge(net, buf); - } + ret = br_add_bridge(net, buf); + else + ret = br_del_bridge(net, buf); } - return -EOPNOTSUPP; -} - -int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) -{ - struct net_bridge *br = netdev_priv(dev); - - switch (cmd) { - case SIOCDEVPRIVATE: - return old_dev_ioctl(dev, rq, cmd); - + break; case SIOCBRADDIF: case SIOCBRDELIF: - return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF); - + ret = add_del_if(br, ifr->ifr_ifindex, cmd == SIOCBRADDIF); + break; } - br_debug(br, "Bridge does not support ioctl 0x%x\n", cmd); - return -EOPNOTSUPP; + rtnl_unlock(); + + return ret; } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 17a720b4473f..4556d913955b 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -16,57 +16,89 @@ #include "br_private.h" -static bool br_rports_have_mc_router(struct net_bridge *br) +static bool +br_ip4_rports_get_timer(struct net_bridge_mcast_port *pmctx, + unsigned long *timer) +{ + *timer = br_timer_value(&pmctx->ip4_mc_router_timer); + return !hlist_unhashed(&pmctx->ip4_rlist); +} + +static bool +br_ip6_rports_get_timer(struct net_bridge_mcast_port *pmctx, + unsigned long *timer) { #if IS_ENABLED(CONFIG_IPV6) - return !hlist_empty(&br->ip4_mc_router_list) || - !hlist_empty(&br->ip6_mc_router_list); + *timer = br_timer_value(&pmctx->ip6_mc_router_timer); + return !hlist_unhashed(&pmctx->ip6_rlist); #else - return !hlist_empty(&br->ip4_mc_router_list); + *timer = 0; + return false; #endif } -static bool -br_ip4_rports_get_timer(struct net_bridge_port *port, unsigned long *timer) +static size_t __br_rports_one_size(void) { - *timer = br_timer_value(&port->ip4_mc_router_timer); - return !hlist_unhashed(&port->ip4_rlist); + return nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PORT */ + nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_TIMER */ + nla_total_size(sizeof(u8)) + /* MDBA_ROUTER_PATTR_TYPE */ + nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET_TIMER */ + nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET6_TIMER */ + nla_total_size(sizeof(u32)); /* MDBA_ROUTER_PATTR_VID */ } -static bool -br_ip6_rports_get_timer(struct net_bridge_port *port, unsigned long *timer) +size_t br_rports_size(const struct net_bridge_mcast *brmctx) { + struct net_bridge_mcast_port *pmctx; + size_t size = nla_total_size(0); /* MDBA_ROUTER */ + + rcu_read_lock(); + hlist_for_each_entry_rcu(pmctx, &brmctx->ip4_mc_router_list, + ip4_rlist) + size += __br_rports_one_size(); + #if IS_ENABLED(CONFIG_IPV6) - *timer = br_timer_value(&port->ip6_mc_router_timer); - return !hlist_unhashed(&port->ip6_rlist); -#else - *timer = 0; - return false; + hlist_for_each_entry_rcu(pmctx, &brmctx->ip6_mc_router_list, + ip6_rlist) + size += __br_rports_one_size(); #endif + rcu_read_unlock(); + + return size; } -static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, - struct net_device *dev) +int br_rports_fill_info(struct sk_buff *skb, + const struct net_bridge_mcast *brmctx) { - struct net_bridge *br = netdev_priv(dev); + u16 vid = brmctx->vlan ? brmctx->vlan->vid : 0; bool have_ip4_mc_rtr, have_ip6_mc_rtr; unsigned long ip4_timer, ip6_timer; struct nlattr *nest, *port_nest; struct net_bridge_port *p; - if (!br->multicast_router) - return 0; - - if (!br_rports_have_mc_router(br)) + if (!brmctx->multicast_router || !br_rports_have_mc_router(brmctx)) return 0; nest = nla_nest_start_noflag(skb, MDBA_ROUTER); if (nest == NULL) return -EMSGSIZE; - list_for_each_entry_rcu(p, &br->port_list, list) { - have_ip4_mc_rtr = br_ip4_rports_get_timer(p, &ip4_timer); - have_ip6_mc_rtr = br_ip6_rports_get_timer(p, &ip6_timer); + list_for_each_entry_rcu(p, &brmctx->br->port_list, list) { + struct net_bridge_mcast_port *pmctx; + + if (vid) { + struct net_bridge_vlan *v; + + v = br_vlan_find(nbp_vlan_group(p), vid); + if (!v) + continue; + pmctx = &v->port_mcast_ctx; + } else { + pmctx = &p->multicast_ctx; + } + + have_ip4_mc_rtr = br_ip4_rports_get_timer(pmctx, &ip4_timer); + have_ip6_mc_rtr = br_ip6_rports_get_timer(pmctx, &ip6_timer); if (!have_ip4_mc_rtr && !have_ip6_mc_rtr) continue; @@ -79,13 +111,14 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb, nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER, max(ip4_timer, ip6_timer)) || nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE, - p->multicast_router) || + p->multicast_ctx.multicast_router) || (have_ip4_mc_rtr && nla_put_u32(skb, MDBA_ROUTER_PATTR_INET_TIMER, ip4_timer)) || (have_ip6_mc_rtr && nla_put_u32(skb, MDBA_ROUTER_PATTR_INET6_TIMER, - ip6_timer))) { + ip6_timer)) || + (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid))) { nla_nest_cancel(skb, port_nest); goto fail; } @@ -240,7 +273,7 @@ static int __mdb_fill_info(struct sk_buff *skb, switch (mp->addr.proto) { case htons(ETH_P_IP): - dump_srcs_mode = !!(mp->br->multicast_igmp_version == 3); + dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_igmp_version == 3); if (mp->addr.src.ip4) { if (nla_put_in_addr(skb, MDBA_MDB_EATTR_SOURCE, mp->addr.src.ip4)) @@ -250,7 +283,7 @@ static int __mdb_fill_info(struct sk_buff *skb, break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - dump_srcs_mode = !!(mp->br->multicast_mld_version == 2); + dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_mld_version == 2); if (!ipv6_addr_any(&mp->addr.src.ip6)) { if (nla_put_in6_addr(skb, MDBA_MDB_EATTR_SOURCE, &mp->addr.src.ip6)) @@ -389,7 +422,8 @@ static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->seq = net->dev_base_seq; for_each_netdev_rcu(net, dev) { - if (dev->priv_flags & IFF_EBRIDGE) { + if (netif_is_bridge_master(dev)) { + struct net_bridge *br = netdev_priv(dev); struct br_port_msg *bpm; if (idx < s_idx) @@ -406,7 +440,7 @@ static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb) bpm->ifindex = dev->ifindex; if (br_mdb_fill_info(skb, cb, dev) < 0) goto out; - if (br_rports_fill_info(skb, cb, dev) < 0) + if (br_rports_fill_info(skb, &br->multicast_ctx) < 0) goto out; cb->args[1] = 0; @@ -483,7 +517,7 @@ static size_t rtnl_mdb_nlmsg_size(struct net_bridge_port_group *pg) /* MDBA_MDB_EATTR_SOURCE */ if (pg->key.addr.src.ip4) nlmsg_size += nla_total_size(sizeof(__be32)); - if (pg->key.port->br->multicast_igmp_version == 2) + if (pg->key.port->br->multicast_ctx.multicast_igmp_version == 2) goto out; addr_size = sizeof(__be32); break; @@ -492,7 +526,7 @@ static size_t rtnl_mdb_nlmsg_size(struct net_bridge_port_group *pg) /* MDBA_MDB_EATTR_SOURCE */ if (!ipv6_addr_any(&pg->key.addr.src.ip6)) nlmsg_size += nla_total_size(sizeof(struct in6_addr)); - if (pg->key.port->br->multicast_mld_version == 1) + if (pg->key.port->br->multicast_ctx.multicast_mld_version == 1) goto out; addr_size = sizeof(struct in6_addr); break; @@ -518,250 +552,16 @@ out: return nlmsg_size; } -struct br_mdb_complete_info { - struct net_bridge_port *port; - struct br_ip ip; -}; - -static void br_mdb_complete(struct net_device *dev, int err, void *priv) -{ - struct br_mdb_complete_info *data = priv; - struct net_bridge_port_group __rcu **pp; - struct net_bridge_port_group *p; - struct net_bridge_mdb_entry *mp; - struct net_bridge_port *port = data->port; - struct net_bridge *br = port->br; - - if (err) - goto err; - - spin_lock_bh(&br->multicast_lock); - mp = br_mdb_ip_get(br, &data->ip); - if (!mp) - goto out; - for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; - pp = &p->next) { - if (p->key.port != port) - continue; - p->flags |= MDB_PG_FLAGS_OFFLOAD; - } -out: - spin_unlock_bh(&br->multicast_lock); -err: - kfree(priv); -} - -static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb, - const struct net_bridge_mdb_entry *mp) -{ - if (mp->addr.proto == htons(ETH_P_IP)) - ip_eth_mc_map(mp->addr.dst.ip4, mdb->addr); -#if IS_ENABLED(CONFIG_IPV6) - else if (mp->addr.proto == htons(ETH_P_IPV6)) - ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb->addr); -#endif - else - ether_addr_copy(mdb->addr, mp->addr.dst.mac_addr); - - mdb->vid = mp->addr.vid; -} - -static int br_mdb_replay_one(struct notifier_block *nb, struct net_device *dev, - const struct switchdev_obj_port_mdb *mdb, - unsigned long action, const void *ctx, - struct netlink_ext_ack *extack) -{ - struct switchdev_notifier_port_obj_info obj_info = { - .info = { - .dev = dev, - .extack = extack, - .ctx = ctx, - }, - .obj = &mdb->obj, - }; - int err; - - err = nb->notifier_call(nb, action, &obj_info); - return notifier_to_errno(err); -} - -static int br_mdb_queue_one(struct list_head *mdb_list, - enum switchdev_obj_id id, - const struct net_bridge_mdb_entry *mp, - struct net_device *orig_dev) -{ - struct switchdev_obj_port_mdb *mdb; - - mdb = kzalloc(sizeof(*mdb), GFP_ATOMIC); - if (!mdb) - return -ENOMEM; - - mdb->obj.id = id; - mdb->obj.orig_dev = orig_dev; - br_switchdev_mdb_populate(mdb, mp); - list_add_tail(&mdb->obj.list, mdb_list); - - return 0; -} - -int br_mdb_replay(struct net_device *br_dev, struct net_device *dev, - const void *ctx, bool adding, struct notifier_block *nb, - struct netlink_ext_ack *extack) -{ - const struct net_bridge_mdb_entry *mp; - struct switchdev_obj *obj, *tmp; - struct net_bridge *br; - unsigned long action; - LIST_HEAD(mdb_list); - int err = 0; - - ASSERT_RTNL(); - - if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev)) - return -EINVAL; - - br = netdev_priv(br_dev); - - if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) - return 0; - - /* We cannot walk over br->mdb_list protected just by the rtnl_mutex, - * because the write-side protection is br->multicast_lock. But we - * need to emulate the [ blocking ] calling context of a regular - * switchdev event, so since both br->multicast_lock and RCU read side - * critical sections are atomic, we have no choice but to pick the RCU - * read side lock, queue up all our events, leave the critical section - * and notify switchdev from blocking context. - */ - rcu_read_lock(); - - hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { - struct net_bridge_port_group __rcu * const *pp; - const struct net_bridge_port_group *p; - - if (mp->host_joined) { - err = br_mdb_queue_one(&mdb_list, - SWITCHDEV_OBJ_ID_HOST_MDB, - mp, br_dev); - if (err) { - rcu_read_unlock(); - goto out_free_mdb; - } - } - - for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; - pp = &p->next) { - if (p->key.port->dev != dev) - continue; - - err = br_mdb_queue_one(&mdb_list, - SWITCHDEV_OBJ_ID_PORT_MDB, - mp, dev); - if (err) { - rcu_read_unlock(); - goto out_free_mdb; - } - } - } - - rcu_read_unlock(); - - if (adding) - action = SWITCHDEV_PORT_OBJ_ADD; - else - action = SWITCHDEV_PORT_OBJ_DEL; - - list_for_each_entry(obj, &mdb_list, list) { - err = br_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj), - action, ctx, extack); - if (err) - goto out_free_mdb; - } - -out_free_mdb: - list_for_each_entry_safe(obj, tmp, &mdb_list, list) { - list_del(&obj->list); - kfree(SWITCHDEV_OBJ_PORT_MDB(obj)); - } - - return err; -} -EXPORT_SYMBOL_GPL(br_mdb_replay); - -static void br_mdb_switchdev_host_port(struct net_device *dev, - struct net_device *lower_dev, - struct net_bridge_mdb_entry *mp, - int type) -{ - struct switchdev_obj_port_mdb mdb = { - .obj = { - .id = SWITCHDEV_OBJ_ID_HOST_MDB, - .flags = SWITCHDEV_F_DEFER, - .orig_dev = dev, - }, - }; - - br_switchdev_mdb_populate(&mdb, mp); - - switch (type) { - case RTM_NEWMDB: - switchdev_port_obj_add(lower_dev, &mdb.obj, NULL); - break; - case RTM_DELMDB: - switchdev_port_obj_del(lower_dev, &mdb.obj); - break; - } -} - -static void br_mdb_switchdev_host(struct net_device *dev, - struct net_bridge_mdb_entry *mp, int type) -{ - struct net_device *lower_dev; - struct list_head *iter; - - netdev_for_each_lower_dev(dev, lower_dev, iter) - br_mdb_switchdev_host_port(dev, lower_dev, mp, type); -} - void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type) { - struct br_mdb_complete_info *complete_info; - struct switchdev_obj_port_mdb mdb = { - .obj = { - .id = SWITCHDEV_OBJ_ID_PORT_MDB, - .flags = SWITCHDEV_F_DEFER, - }, - }; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; - if (pg) { - br_switchdev_mdb_populate(&mdb, mp); - - mdb.obj.orig_dev = pg->key.port->dev; - switch (type) { - case RTM_NEWMDB: - complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC); - if (!complete_info) - break; - complete_info->port = pg->key.port; - complete_info->ip = mp->addr; - mdb.obj.complete_priv = complete_info; - mdb.obj.complete = br_mdb_complete; - if (switchdev_port_obj_add(pg->key.port->dev, &mdb.obj, NULL)) - kfree(complete_info); - break; - case RTM_DELMDB: - switchdev_port_obj_del(pg->key.port->dev, &mdb.obj); - break; - } - } else { - br_mdb_switchdev_host(dev, mp, type); - } + br_switchdev_mdb_notify(dev, mp, pg, type); skb = nlmsg_new(rtnl_mdb_nlmsg_size(pg), GFP_ATOMIC); if (!skb) @@ -781,12 +581,12 @@ errout: static int nlmsg_populate_rtr_fill(struct sk_buff *skb, struct net_device *dev, - int ifindex, u32 pid, + int ifindex, u16 vid, u32 pid, u32 seq, int type, unsigned int flags) { + struct nlattr *nest, *port_nest; struct br_port_msg *bpm; struct nlmsghdr *nlh; - struct nlattr *nest; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0); if (!nlh) @@ -800,8 +600,18 @@ static int nlmsg_populate_rtr_fill(struct sk_buff *skb, if (!nest) goto cancel; - if (nla_put_u32(skb, MDBA_ROUTER_PORT, ifindex)) + port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT); + if (!port_nest) + goto end; + if (nla_put_nohdr(skb, sizeof(u32), &ifindex)) { + nla_nest_cancel(skb, port_nest); + goto end; + } + if (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid)) { + nla_nest_cancel(skb, port_nest); goto end; + } + nla_nest_end(skb, port_nest); nla_nest_end(skb, nest); nlmsg_end(skb, nlh); @@ -817,23 +627,28 @@ cancel: static inline size_t rtnl_rtr_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct br_port_msg)) - + nla_total_size(sizeof(__u32)); + + nla_total_size(sizeof(__u32)) + + nla_total_size(sizeof(u16)); } -void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, +void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; int ifindex; + u16 vid; - ifindex = port ? port->dev->ifindex : 0; + ifindex = pmctx ? pmctx->port->dev->ifindex : 0; + vid = pmctx && br_multicast_port_ctx_is_vlan(pmctx) ? pmctx->vlan->vid : + 0; skb = nlmsg_new(rtnl_rtr_nlmsg_size(), GFP_ATOMIC); if (!skb) goto errout; - err = nlmsg_populate_rtr_fill(skb, dev, ifindex, 0, 0, type, NTF_SELF); + err = nlmsg_populate_rtr_fill(skb, dev, ifindex, vid, 0, 0, type, + NTF_SELF); if (err < 0) { kfree_skb(skb); goto errout; @@ -965,7 +780,7 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, return -ENODEV; } - if (!(dev->priv_flags & IFF_EBRIDGE)) { + if (!netif_is_bridge_master(dev)) { NL_SET_ERR_MSG_MOD(extack, "Device is not a bridge"); return -EOPNOTSUPP; } @@ -1004,14 +819,47 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh, return 0; } +static struct net_bridge_mcast * +__br_mdb_choose_context(struct net_bridge *br, + const struct br_mdb_entry *entry, + struct netlink_ext_ack *extack) +{ + struct net_bridge_mcast *brmctx = NULL; + struct net_bridge_vlan *v; + + if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + brmctx = &br->multicast_ctx; + goto out; + } + + if (!entry->vid) { + NL_SET_ERR_MSG_MOD(extack, "Cannot add an entry without a vlan when vlan snooping is enabled"); + goto out; + } + + v = br_vlan_find(br_vlan_group(br), entry->vid); + if (!v) { + NL_SET_ERR_MSG_MOD(extack, "Vlan is not configured"); + goto out; + } + if (br_multicast_ctx_vlan_global_disabled(&v->br_mcast_ctx)) { + NL_SET_ERR_MSG_MOD(extack, "Vlan's multicast processing is disabled"); + goto out; + } + brmctx = &v->br_mcast_ctx; +out: + return brmctx; +} + static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, struct br_mdb_entry *entry, struct nlattr **mdb_attrs, struct netlink_ext_ack *extack) { struct net_bridge_mdb_entry *mp, *star_mp; - struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + struct net_bridge_mcast *brmctx; struct br_ip group, star_group; unsigned long now = jiffies; unsigned char flags = 0; @@ -1020,6 +868,10 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, __mdb_entry_to_br_ip(entry, &group, mdb_attrs); + brmctx = __br_mdb_choose_context(br, entry, extack); + if (!brmctx) + return -EINVAL; + /* host join errors which can happen before creating the group */ if (!port) { /* don't allow any flags for host-joined groups */ @@ -1053,7 +905,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, return -EEXIST; } - br_multicast_host_join(mp, false); + br_multicast_host_join(brmctx, mp, false); br_mdb_notify(br->dev, mp, NULL, RTM_NEWMDB); return 0; @@ -1084,14 +936,15 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, } rcu_assign_pointer(*pp, p); if (entry->state == MDB_TEMPORARY) - mod_timer(&p->timer, now + br->multicast_membership_interval); + mod_timer(&p->timer, + now + brmctx->multicast_membership_interval); br_mdb_notify(br->dev, mp, p, RTM_NEWMDB); /* if we are adding a new EXCLUDE port group (*,G) it needs to be also * added to all S,G entries for proper replication, if we are adding * a new INCLUDE port (S,G) then all of *,G EXCLUDE ports need to be * added to it for proper replication */ - if (br_multicast_should_handle_mode(br, group.proto)) { + if (br_multicast_should_handle_mode(brmctx, group.proto)) { switch (filter_mode) { case MCAST_EXCLUDE: br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index d0434dc8c03b..f3d751105343 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -49,30 +49,30 @@ static const struct rhashtable_params br_sg_port_rht_params = { .automatic_shrinking = true, }; -static void br_multicast_start_querier(struct net_bridge *br, +static void br_multicast_start_querier(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query); -static void br_ip4_multicast_add_router(struct net_bridge *br, - struct net_bridge_port *port); -static void br_ip4_multicast_leave_group(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip4_multicast_add_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx); +static void br_ip4_multicast_leave_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, __be32 group, __u16 vid, const unsigned char *src); static void br_multicast_port_group_rexmit(struct timer_list *t); static void -br_multicast_rport_del_notify(struct net_bridge_port *p, bool deleted); -static void br_ip6_multicast_add_router(struct net_bridge *br, - struct net_bridge_port *port); +br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted); +static void br_ip6_multicast_add_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx); #if IS_ENABLED(CONFIG_IPV6) -static void br_ip6_multicast_leave_group(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip6_multicast_leave_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, const struct in6_addr *group, __u16 vid, const unsigned char *src); #endif static struct net_bridge_port_group * -__br_multicast_add_group(struct net_bridge *br, - struct net_bridge_port *port, +__br_multicast_add_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct br_ip *group, const unsigned char *src, u8 filter_mode, @@ -80,6 +80,7 @@ __br_multicast_add_group(struct net_bridge *br, bool blocked); static void br_multicast_find_del_pg(struct net_bridge *br, struct net_bridge_port_group *pg); +static void __br_multicast_stop(struct net_bridge_mcast *brmctx); static struct net_bridge_port_group * br_sg_port_find(struct net_bridge *br, @@ -140,12 +141,14 @@ static struct net_bridge_mdb_entry *br_mdb_ip6_get(struct net_bridge *br, } #endif -struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, +struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb, u16 vid) { + struct net_bridge *br = brmctx->br; struct br_ip ip; - if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || + br_multicast_ctx_vlan_global_disabled(brmctx)) return NULL; if (BR_INPUT_SKB_CB(skb)->igmp) @@ -158,7 +161,7 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, switch (skb->protocol) { case htons(ETH_P_IP): ip.dst.ip4 = ip_hdr(skb)->daddr; - if (br->multicast_igmp_version == 3) { + if (brmctx->multicast_igmp_version == 3) { struct net_bridge_mdb_entry *mdb; ip.src.ip4 = ip_hdr(skb)->saddr; @@ -171,7 +174,7 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): ip.dst.ip6 = ipv6_hdr(skb)->daddr; - if (br->multicast_mld_version == 2) { + if (brmctx->multicast_mld_version == 2) { struct net_bridge_mdb_entry *mdb; ip.src.ip6 = ipv6_hdr(skb)->saddr; @@ -190,6 +193,62 @@ struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, return br_mdb_ip_get_rcu(br, &ip); } +/* IMPORTANT: this function must be used only when the contexts cannot be + * passed down (e.g. timer) and must be used for read-only purposes because + * the vlan snooping option can change, so it can return any context + * (non-vlan or vlan). Its initial intended purpose is to read timer values + * from the *current* context based on the option. At worst that could lead + * to inconsistent timers when the contexts are changed, i.e. src timer + * which needs to re-arm with a specific delay taken from the old context + */ +static struct net_bridge_mcast_port * +br_multicast_pg_to_port_ctx(const struct net_bridge_port_group *pg) +{ + struct net_bridge_mcast_port *pmctx = &pg->key.port->multicast_ctx; + struct net_bridge_vlan *vlan; + + lockdep_assert_held_once(&pg->key.port->br->multicast_lock); + + /* if vlan snooping is disabled use the port's multicast context */ + if (!pg->key.addr.vid || + !br_opt_get(pg->key.port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) + goto out; + + /* locking is tricky here, due to different rules for multicast and + * vlans we need to take rcu to find the vlan and make sure it has + * the BR_VLFLAG_MCAST_ENABLED flag set, it can only change under + * multicast_lock which must be already held here, so the vlan's pmctx + * can safely be used on return + */ + rcu_read_lock(); + vlan = br_vlan_find(nbp_vlan_group_rcu(pg->key.port), pg->key.addr.vid); + if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx)) + pmctx = &vlan->port_mcast_ctx; + else + pmctx = NULL; + rcu_read_unlock(); +out: + return pmctx; +} + +/* when snooping we need to check if the contexts should be used + * in the following order: + * - if pmctx is non-NULL (port), check if it should be used + * - if pmctx is NULL (bridge), check if brmctx should be used + */ +static bool +br_multicast_ctx_should_use(const struct net_bridge_mcast *brmctx, + const struct net_bridge_mcast_port *pmctx) +{ + if (!netif_running(brmctx->br->dev)) + return false; + + if (pmctx) + return !br_multicast_port_ctx_state_disabled(pmctx); + else + return !br_multicast_ctx_vlan_disabled(brmctx); +} + static bool br_port_group_equal(struct net_bridge_port_group *p, struct net_bridge_port *port, const unsigned char *src) @@ -203,20 +262,23 @@ static bool br_port_group_equal(struct net_bridge_port_group *p, return ether_addr_equal(src, p->eth_addr); } -static void __fwd_add_star_excl(struct net_bridge_port_group *pg, +static void __fwd_add_star_excl(struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, struct br_ip *sg_ip) { struct net_bridge_port_group_sg_key sg_key; - struct net_bridge *br = pg->key.port->br; struct net_bridge_port_group *src_pg; + struct net_bridge_mcast *brmctx; memset(&sg_key, 0, sizeof(sg_key)); + brmctx = br_multicast_port_ctx_get_global(pmctx); sg_key.port = pg->key.port; sg_key.addr = *sg_ip; - if (br_sg_port_find(br, &sg_key)) + if (br_sg_port_find(brmctx->br, &sg_key)) return; - src_pg = __br_multicast_add_group(br, pg->key.port, sg_ip, pg->eth_addr, + src_pg = __br_multicast_add_group(brmctx, pmctx, + sg_ip, pg->eth_addr, MCAST_INCLUDE, false, false); if (IS_ERR_OR_NULL(src_pg) || src_pg->rt_protocol != RTPROT_KERNEL) @@ -256,6 +318,7 @@ void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, { struct net_bridge *br = pg->key.port->br; struct net_bridge_port_group *pg_lst; + struct net_bridge_mcast_port *pmctx; struct net_bridge_mdb_entry *mp; struct br_ip sg_ip; @@ -265,9 +328,13 @@ void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, mp = br_mdb_ip_get(br, &pg->key.addr); if (!mp) return; + pmctx = br_multicast_pg_to_port_ctx(pg); + if (!pmctx) + return; memset(&sg_ip, 0, sizeof(sg_ip)); sg_ip = pg->key.addr; + for (pg_lst = mlock_dereference(mp->ports, br); pg_lst; pg_lst = mlock_dereference(pg_lst->next, br)) { @@ -284,7 +351,7 @@ void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, __fwd_del_star_excl(pg, &sg_ip); break; case MCAST_EXCLUDE: - __fwd_add_star_excl(pg, &sg_ip); + __fwd_add_star_excl(pmctx, pg, &sg_ip); break; } } @@ -377,7 +444,9 @@ void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, { struct net_bridge_port_group_sg_key sg_key; struct net_bridge *br = star_mp->br; + struct net_bridge_mcast_port *pmctx; struct net_bridge_port_group *pg; + struct net_bridge_mcast *brmctx; if (WARN_ON(br_multicast_is_star_g(&sg->key.addr))) return; @@ -400,7 +469,12 @@ void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, if (br_sg_port_find(br, &sg_key)) continue; - src_pg = __br_multicast_add_group(br, pg->key.port, + pmctx = br_multicast_pg_to_port_ctx(pg); + if (!pmctx) + continue; + brmctx = br_multicast_port_ctx_get_global(pmctx); + + src_pg = __br_multicast_add_group(brmctx, pmctx, &sg->key.addr, sg->eth_addr, MCAST_INCLUDE, false, false); @@ -414,16 +488,23 @@ void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp, static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) { struct net_bridge_mdb_entry *star_mp; + struct net_bridge_mcast_port *pmctx; struct net_bridge_port_group *sg; + struct net_bridge_mcast *brmctx; struct br_ip sg_ip; if (src->flags & BR_SGRP_F_INSTALLED) return; memset(&sg_ip, 0, sizeof(sg_ip)); + pmctx = br_multicast_pg_to_port_ctx(src->pg); + if (!pmctx) + return; + brmctx = br_multicast_port_ctx_get_global(pmctx); sg_ip = src->pg->key.addr; sg_ip.src = src->addr.src; - sg = __br_multicast_add_group(src->br, src->pg->key.port, &sg_ip, + + sg = __br_multicast_add_group(brmctx, pmctx, &sg_ip, src->pg->eth_addr, MCAST_INCLUDE, false, !timer_pending(&src->timer)); if (IS_ERR_OR_NULL(sg)) @@ -692,7 +773,28 @@ static void br_multicast_gc(struct hlist_head *head) } } -static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, +static void __br_multicast_query_handle_vlan(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct sk_buff *skb) +{ + struct net_bridge_vlan *vlan = NULL; + + if (pmctx && br_multicast_port_ctx_is_vlan(pmctx)) + vlan = pmctx->vlan; + else if (br_multicast_ctx_is_vlan(brmctx)) + vlan = brmctx->vlan; + + if (vlan && !(vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED)) { + u16 vlan_proto; + + if (br_vlan_get_proto(brmctx->br->dev, &vlan_proto) != 0) + return; + __vlan_hwaccel_put_tag(skb, htons(vlan_proto), vlan->vid); + } +} + +static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, __be32 ip_dst, __be32 group, bool with_srcs, bool over_lmqt, @@ -714,11 +816,11 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, u16 lmqt_srcs = 0; igmp_hdr_size = sizeof(*ih); - if (br->multicast_igmp_version == 3) { + if (brmctx->multicast_igmp_version == 3) { igmp_hdr_size = sizeof(*ihv3); if (pg && with_srcs) { - lmqt = now + (br->multicast_last_member_interval * - br->multicast_last_member_count); + lmqt = now + (brmctx->multicast_last_member_interval * + brmctx->multicast_last_member_count); hlist_for_each_entry(ent, &pg->src_list, node) { if (over_lmqt == time_after(ent->timer.expires, lmqt) && @@ -734,19 +836,20 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, pkt_size = sizeof(*eth) + sizeof(*iph) + 4 + igmp_hdr_size; if ((p && pkt_size > p->dev->mtu) || - pkt_size > br->dev->mtu) + pkt_size > brmctx->br->dev->mtu) return NULL; - skb = netdev_alloc_skb_ip_align(br->dev, pkt_size); + skb = netdev_alloc_skb_ip_align(brmctx->br->dev, pkt_size); if (!skb) goto out; + __br_multicast_query_handle_vlan(brmctx, pmctx, skb); skb->protocol = htons(ETH_P_IP); skb_reset_mac_header(skb); eth = eth_hdr(skb); - ether_addr_copy(eth->h_source, br->dev->dev_addr); + ether_addr_copy(eth->h_source, brmctx->br->dev->dev_addr); ip_eth_mc_map(ip_dst, eth->h_dest); eth->h_proto = htons(ETH_P_IP); skb_put(skb, sizeof(*eth)); @@ -762,8 +865,8 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, iph->frag_off = htons(IP_DF); iph->ttl = 1; iph->protocol = IPPROTO_IGMP; - iph->saddr = br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR) ? - inet_select_addr(br->dev, 0, RT_SCOPE_LINK) : 0; + iph->saddr = br_opt_get(brmctx->br, BROPT_MULTICAST_QUERY_USE_IFADDR) ? + inet_select_addr(brmctx->br->dev, 0, RT_SCOPE_LINK) : 0; iph->daddr = ip_dst; ((u8 *)&iph[1])[0] = IPOPT_RA; ((u8 *)&iph[1])[1] = 4; @@ -775,12 +878,12 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, skb_set_transport_header(skb, skb->len); *igmp_type = IGMP_HOST_MEMBERSHIP_QUERY; - switch (br->multicast_igmp_version) { + switch (brmctx->multicast_igmp_version) { case 2: ih = igmp_hdr(skb); ih->type = IGMP_HOST_MEMBERSHIP_QUERY; - ih->code = (group ? br->multicast_last_member_interval : - br->multicast_query_response_interval) / + ih->code = (group ? brmctx->multicast_last_member_interval : + brmctx->multicast_query_response_interval) / (HZ / IGMP_TIMER_SCALE); ih->group = group; ih->csum = 0; @@ -790,11 +893,11 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br, case 3: ihv3 = igmpv3_query_hdr(skb); ihv3->type = IGMP_HOST_MEMBERSHIP_QUERY; - ihv3->code = (group ? br->multicast_last_member_interval : - br->multicast_query_response_interval) / + ihv3->code = (group ? brmctx->multicast_last_member_interval : + brmctx->multicast_query_response_interval) / (HZ / IGMP_TIMER_SCALE); ihv3->group = group; - ihv3->qqic = br->multicast_query_interval / HZ; + ihv3->qqic = brmctx->multicast_query_interval / HZ; ihv3->nsrcs = htons(lmqt_srcs); ihv3->resv = 0; ihv3->suppress = sflag; @@ -837,7 +940,8 @@ out: } #if IS_ENABLED(CONFIG_IPV6) -static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, +static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, const struct in6_addr *ip6_dst, const struct in6_addr *group, @@ -862,11 +966,11 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, u8 *hopopt; mld_hdr_size = sizeof(*mldq); - if (br->multicast_mld_version == 2) { + if (brmctx->multicast_mld_version == 2) { mld_hdr_size = sizeof(*mld2q); if (pg && with_srcs) { - llqt = now + (br->multicast_last_member_interval * - br->multicast_last_member_count); + llqt = now + (brmctx->multicast_last_member_interval * + brmctx->multicast_last_member_count); hlist_for_each_entry(ent, &pg->src_list, node) { if (over_llqt == time_after(ent->timer.expires, llqt) && @@ -882,20 +986,21 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, pkt_size = sizeof(*eth) + sizeof(*ip6h) + 8 + mld_hdr_size; if ((p && pkt_size > p->dev->mtu) || - pkt_size > br->dev->mtu) + pkt_size > brmctx->br->dev->mtu) return NULL; - skb = netdev_alloc_skb_ip_align(br->dev, pkt_size); + skb = netdev_alloc_skb_ip_align(brmctx->br->dev, pkt_size); if (!skb) goto out; + __br_multicast_query_handle_vlan(brmctx, pmctx, skb); skb->protocol = htons(ETH_P_IPV6); /* Ethernet header */ skb_reset_mac_header(skb); eth = eth_hdr(skb); - ether_addr_copy(eth->h_source, br->dev->dev_addr); + ether_addr_copy(eth->h_source, brmctx->br->dev->dev_addr); eth->h_proto = htons(ETH_P_IPV6); skb_put(skb, sizeof(*eth)); @@ -908,14 +1013,14 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, ip6h->nexthdr = IPPROTO_HOPOPTS; ip6h->hop_limit = 1; ip6h->daddr = *ip6_dst; - if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0, - &ip6h->saddr)) { + if (ipv6_dev_get_saddr(dev_net(brmctx->br->dev), brmctx->br->dev, + &ip6h->daddr, 0, &ip6h->saddr)) { kfree_skb(skb); - br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, false); + br_opt_toggle(brmctx->br, BROPT_HAS_IPV6_ADDR, false); return NULL; } - br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true); + br_opt_toggle(brmctx->br, BROPT_HAS_IPV6_ADDR, true); ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest); hopopt = (u8 *)(ip6h + 1); @@ -933,10 +1038,10 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, /* ICMPv6 */ skb_set_transport_header(skb, skb->len); interval = ipv6_addr_any(group) ? - br->multicast_query_response_interval : - br->multicast_last_member_interval; + brmctx->multicast_query_response_interval : + brmctx->multicast_last_member_interval; *igmp_type = ICMPV6_MGM_QUERY; - switch (br->multicast_mld_version) { + switch (brmctx->multicast_mld_version) { case 1: mldq = (struct mld_msg *)icmp6_hdr(skb); mldq->mld_type = ICMPV6_MGM_QUERY; @@ -959,7 +1064,7 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, mld2q->mld2q_suppress = sflag; mld2q->mld2q_qrv = 2; mld2q->mld2q_nsrcs = htons(llqt_srcs); - mld2q->mld2q_qqic = br->multicast_query_interval / HZ; + mld2q->mld2q_qqic = brmctx->multicast_query_interval / HZ; mld2q->mld2q_mca = *group; csum = &mld2q->mld2q_cksum; csum_start = (void *)mld2q; @@ -1000,7 +1105,8 @@ out: } #endif -static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, +static struct sk_buff *br_multicast_alloc_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, struct br_ip *ip_dst, struct br_ip *group, @@ -1013,7 +1119,7 @@ static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, switch (group->proto) { case htons(ETH_P_IP): ip4_dst = ip_dst ? ip_dst->dst.ip4 : htonl(INADDR_ALLHOSTS_GROUP); - return br_ip4_multicast_alloc_query(br, pg, + return br_ip4_multicast_alloc_query(brmctx, pmctx, pg, ip4_dst, group->dst.ip4, with_srcs, over_lmqt, sflag, igmp_type, @@ -1028,7 +1134,7 @@ static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br, ipv6_addr_set(&ip6_dst, htonl(0xff020000), 0, 0, htonl(1)); - return br_ip6_multicast_alloc_query(br, pg, + return br_ip6_multicast_alloc_query(brmctx, pmctx, pg, &ip6_dst, &group->dst.ip6, with_srcs, over_lmqt, sflag, igmp_type, @@ -1206,7 +1312,8 @@ struct net_bridge_port_group *br_multicast_new_port_group( return p; } -void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify) +void br_multicast_host_join(const struct net_bridge_mcast *brmctx, + struct net_bridge_mdb_entry *mp, bool notify) { if (!mp->host_joined) { mp->host_joined = true; @@ -1219,7 +1326,7 @@ void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify) if (br_group_is_l2(&mp->addr)) return; - mod_timer(&mp->timer, jiffies + mp->br->multicast_membership_interval); + mod_timer(&mp->timer, jiffies + brmctx->multicast_membership_interval); } void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify) @@ -1235,8 +1342,8 @@ void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify) } static struct net_bridge_port_group * -__br_multicast_add_group(struct net_bridge *br, - struct net_bridge_port *port, +__br_multicast_add_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct br_ip *group, const unsigned char *src, u8 filter_mode, @@ -1248,29 +1355,28 @@ __br_multicast_add_group(struct net_bridge *br, struct net_bridge_mdb_entry *mp; unsigned long now = jiffies; - if (!netif_running(br->dev) || - (port && port->state == BR_STATE_DISABLED)) + if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; - mp = br_multicast_new_group(br, group); + mp = br_multicast_new_group(brmctx->br, group); if (IS_ERR(mp)) return ERR_CAST(mp); - if (!port) { - br_multicast_host_join(mp, true); + if (!pmctx) { + br_multicast_host_join(brmctx, mp, true); goto out; } for (pp = &mp->ports; - (p = mlock_dereference(*pp, br)) != NULL; + (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { - if (br_port_group_equal(p, port, src)) + if (br_port_group_equal(p, pmctx->port, src)) goto found; - if ((unsigned long)p->key.port < (unsigned long)port) + if ((unsigned long)p->key.port < (unsigned long)pmctx->port) break; } - p = br_multicast_new_port_group(port, group, *pp, 0, src, + p = br_multicast_new_port_group(pmctx->port, group, *pp, 0, src, filter_mode, RTPROT_KERNEL); if (unlikely(!p)) { p = ERR_PTR(-ENOMEM); @@ -1279,18 +1385,19 @@ __br_multicast_add_group(struct net_bridge *br, rcu_assign_pointer(*pp, p); if (blocked) p->flags |= MDB_PG_FLAGS_BLOCKED; - br_mdb_notify(br->dev, mp, p, RTM_NEWMDB); + br_mdb_notify(brmctx->br->dev, mp, p, RTM_NEWMDB); found: if (igmpv2_mldv1) - mod_timer(&p->timer, now + br->multicast_membership_interval); + mod_timer(&p->timer, + now + brmctx->multicast_membership_interval); out: return p; } -static int br_multicast_add_group(struct net_bridge *br, - struct net_bridge_port *port, +static int br_multicast_add_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct br_ip *group, const unsigned char *src, u8 filter_mode, @@ -1299,18 +1406,18 @@ static int br_multicast_add_group(struct net_bridge *br, struct net_bridge_port_group *pg; int err; - spin_lock(&br->multicast_lock); - pg = __br_multicast_add_group(br, port, group, src, filter_mode, + spin_lock(&brmctx->br->multicast_lock); + pg = __br_multicast_add_group(brmctx, pmctx, group, src, filter_mode, igmpv2_mldv1, false); /* NULL is considered valid for host joined groups */ err = PTR_ERR_OR_ZERO(pg); - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); return err; } -static int br_ip4_multicast_add_group(struct net_bridge *br, - struct net_bridge_port *port, +static int br_ip4_multicast_add_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, __be32 group, __u16 vid, const unsigned char *src, @@ -1328,13 +1435,13 @@ static int br_ip4_multicast_add_group(struct net_bridge *br, br_group.vid = vid; filter_mode = igmpv2 ? MCAST_EXCLUDE : MCAST_INCLUDE; - return br_multicast_add_group(br, port, &br_group, src, filter_mode, - igmpv2); + return br_multicast_add_group(brmctx, pmctx, &br_group, src, + filter_mode, igmpv2); } #if IS_ENABLED(CONFIG_IPV6) -static int br_ip6_multicast_add_group(struct net_bridge *br, - struct net_bridge_port *port, +static int br_ip6_multicast_add_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, const struct in6_addr *group, __u16 vid, const unsigned char *src, @@ -1352,8 +1459,8 @@ static int br_ip6_multicast_add_group(struct net_bridge *br, br_group.vid = vid; filter_mode = mldv1 ? MCAST_EXCLUDE : MCAST_INCLUDE; - return br_multicast_add_group(br, port, &br_group, src, filter_mode, - mldv1); + return br_multicast_add_group(brmctx, pmctx, &br_group, src, + filter_mode, mldv1); } #endif @@ -1366,52 +1473,54 @@ static bool br_multicast_rport_del(struct hlist_node *rlist) return true; } -static bool br_ip4_multicast_rport_del(struct net_bridge_port *p) +static bool br_ip4_multicast_rport_del(struct net_bridge_mcast_port *pmctx) { - return br_multicast_rport_del(&p->ip4_rlist); + return br_multicast_rport_del(&pmctx->ip4_rlist); } -static bool br_ip6_multicast_rport_del(struct net_bridge_port *p) +static bool br_ip6_multicast_rport_del(struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) - return br_multicast_rport_del(&p->ip6_rlist); + return br_multicast_rport_del(&pmctx->ip6_rlist); #else return false; #endif } -static void br_multicast_router_expired(struct net_bridge_port *port, +static void br_multicast_router_expired(struct net_bridge_mcast_port *pmctx, struct timer_list *t, struct hlist_node *rlist) { - struct net_bridge *br = port->br; + struct net_bridge *br = pmctx->port->br; bool del; spin_lock(&br->multicast_lock); - if (port->multicast_router == MDB_RTR_TYPE_DISABLED || - port->multicast_router == MDB_RTR_TYPE_PERM || + if (pmctx->multicast_router == MDB_RTR_TYPE_DISABLED || + pmctx->multicast_router == MDB_RTR_TYPE_PERM || timer_pending(t)) goto out; del = br_multicast_rport_del(rlist); - br_multicast_rport_del_notify(port, del); + br_multicast_rport_del_notify(pmctx, del); out: spin_unlock(&br->multicast_lock); } static void br_ip4_multicast_router_expired(struct timer_list *t) { - struct net_bridge_port *port = from_timer(port, t, ip4_mc_router_timer); + struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, + ip4_mc_router_timer); - br_multicast_router_expired(port, t, &port->ip4_rlist); + br_multicast_router_expired(pmctx, t, &pmctx->ip4_rlist); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_router_expired(struct timer_list *t) { - struct net_bridge_port *port = from_timer(port, t, ip6_mc_router_timer); + struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, + ip6_mc_router_timer); - br_multicast_router_expired(port, t, &port->ip6_rlist); + br_multicast_router_expired(pmctx, t, &pmctx->ip6_rlist); } #endif @@ -1428,80 +1537,86 @@ static void br_mc_router_state_change(struct net_bridge *p, switchdev_port_attr_set(p->dev, &attr, NULL); } -static void br_multicast_local_router_expired(struct net_bridge *br, +static void br_multicast_local_router_expired(struct net_bridge_mcast *brmctx, struct timer_list *timer) { - spin_lock(&br->multicast_lock); - if (br->multicast_router == MDB_RTR_TYPE_DISABLED || - br->multicast_router == MDB_RTR_TYPE_PERM || - br_ip4_multicast_is_router(br) || - br_ip6_multicast_is_router(br)) + spin_lock(&brmctx->br->multicast_lock); + if (brmctx->multicast_router == MDB_RTR_TYPE_DISABLED || + brmctx->multicast_router == MDB_RTR_TYPE_PERM || + br_ip4_multicast_is_router(brmctx) || + br_ip6_multicast_is_router(brmctx)) goto out; - br_mc_router_state_change(br, false); + br_mc_router_state_change(brmctx->br, false); out: - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_local_router_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, ip4_mc_router_timer); + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip4_mc_router_timer); - br_multicast_local_router_expired(br, t); + br_multicast_local_router_expired(brmctx, t); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_local_router_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, ip6_mc_router_timer); + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip6_mc_router_timer); - br_multicast_local_router_expired(br, t); + br_multicast_local_router_expired(brmctx, t); } #endif -static void br_multicast_querier_expired(struct net_bridge *br, +static void br_multicast_querier_expired(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query) { - spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || !br_opt_get(br, BROPT_MULTICAST_ENABLED)) + spin_lock(&brmctx->br->multicast_lock); + if (!netif_running(brmctx->br->dev) || + br_multicast_ctx_vlan_global_disabled(brmctx) || + !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) goto out; - br_multicast_start_querier(br, query); + br_multicast_start_querier(brmctx, query); out: - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_querier_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, ip4_other_query.timer); + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip4_other_query.timer); - br_multicast_querier_expired(br, &br->ip4_own_query); + br_multicast_querier_expired(brmctx, &brmctx->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_querier_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, ip6_other_query.timer); + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip6_other_query.timer); - br_multicast_querier_expired(br, &br->ip6_own_query); + br_multicast_querier_expired(brmctx, &brmctx->ip6_own_query); } #endif -static void br_multicast_select_own_querier(struct net_bridge *br, +static void br_multicast_select_own_querier(struct net_bridge_mcast *brmctx, struct br_ip *ip, struct sk_buff *skb) { if (ip->proto == htons(ETH_P_IP)) - br->ip4_querier.addr.src.ip4 = ip_hdr(skb)->saddr; + brmctx->ip4_querier.addr.src.ip4 = ip_hdr(skb)->saddr; #if IS_ENABLED(CONFIG_IPV6) else - br->ip6_querier.addr.src.ip6 = ipv6_hdr(skb)->saddr; + brmctx->ip6_querier.addr.src.ip6 = ipv6_hdr(skb)->saddr; #endif } -static void __br_multicast_send_query(struct net_bridge *br, - struct net_bridge_port *port, +static void __br_multicast_send_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct net_bridge_port_group *pg, struct br_ip *ip_dst, struct br_ip *group, @@ -1513,19 +1628,23 @@ static void __br_multicast_send_query(struct net_bridge *br, struct sk_buff *skb; u8 igmp_type; + if (!br_multicast_ctx_should_use(brmctx, pmctx) || + !br_multicast_ctx_matches_vlan_snooping(brmctx)) + return; + again_under_lmqt: - skb = br_multicast_alloc_query(br, pg, ip_dst, group, with_srcs, - over_lmqt, sflag, &igmp_type, + skb = br_multicast_alloc_query(brmctx, pmctx, pg, ip_dst, group, + with_srcs, over_lmqt, sflag, &igmp_type, need_rexmit); if (!skb) return; - if (port) { - skb->dev = port->dev; - br_multicast_count(br, port, skb, igmp_type, + if (pmctx) { + skb->dev = pmctx->port->dev; + br_multicast_count(brmctx->br, pmctx->port, skb, igmp_type, BR_MCAST_DIR_TX); NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, - dev_net(port->dev), NULL, skb, NULL, skb->dev, + dev_net(pmctx->port->dev), NULL, skb, NULL, skb->dev, br_dev_queue_push_xmit); if (over_lmqt && with_srcs && sflag) { @@ -1533,35 +1652,62 @@ again_under_lmqt: goto again_under_lmqt; } } else { - br_multicast_select_own_querier(br, group, skb); - br_multicast_count(br, port, skb, igmp_type, + br_multicast_select_own_querier(brmctx, group, skb); + br_multicast_count(brmctx->br, NULL, skb, igmp_type, BR_MCAST_DIR_RX); netif_rx(skb); } } -static void br_multicast_send_query(struct net_bridge *br, - struct net_bridge_port *port, +static void br_multicast_read_querier(const struct bridge_mcast_querier *querier, + struct bridge_mcast_querier *dest) +{ + unsigned int seq; + + memset(dest, 0, sizeof(*dest)); + do { + seq = read_seqcount_begin(&querier->seq); + dest->port_ifidx = querier->port_ifidx; + memcpy(&dest->addr, &querier->addr, sizeof(struct br_ip)); + } while (read_seqcount_retry(&querier->seq, seq)); +} + +static void br_multicast_update_querier(struct net_bridge_mcast *brmctx, + struct bridge_mcast_querier *querier, + int ifindex, + struct br_ip *saddr) +{ + write_seqcount_begin(&querier->seq); + querier->port_ifidx = ifindex; + memcpy(&querier->addr, saddr, sizeof(*saddr)); + write_seqcount_end(&querier->seq); +} + +static void br_multicast_send_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct bridge_mcast_own_query *own_query) { struct bridge_mcast_other_query *other_query = NULL; + struct bridge_mcast_querier *querier; struct br_ip br_group; unsigned long time; - if (!netif_running(br->dev) || - !br_opt_get(br, BROPT_MULTICAST_ENABLED) || - !br_opt_get(br, BROPT_MULTICAST_QUERIER)) + if (!br_multicast_ctx_should_use(brmctx, pmctx) || + !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED) || + !brmctx->multicast_querier) return; memset(&br_group.dst, 0, sizeof(br_group.dst)); - if (port ? (own_query == &port->ip4_own_query) : - (own_query == &br->ip4_own_query)) { - other_query = &br->ip4_other_query; + if (pmctx ? (own_query == &pmctx->ip4_own_query) : + (own_query == &brmctx->ip4_own_query)) { + querier = &brmctx->ip4_querier; + other_query = &brmctx->ip4_other_query; br_group.proto = htons(ETH_P_IP); #if IS_ENABLED(CONFIG_IPV6) } else { - other_query = &br->ip6_other_query; + querier = &brmctx->ip6_querier; + other_query = &brmctx->ip6_other_query; br_group.proto = htons(ETH_P_IPV6); #endif } @@ -1569,31 +1715,39 @@ static void br_multicast_send_query(struct net_bridge *br, if (!other_query || timer_pending(&other_query->timer)) return; - __br_multicast_send_query(br, port, NULL, NULL, &br_group, false, 0, - NULL); + /* we're about to select ourselves as querier */ + if (!pmctx && querier->port_ifidx) { + struct br_ip zeroip = {}; + + br_multicast_update_querier(brmctx, querier, 0, &zeroip); + } + + __br_multicast_send_query(brmctx, pmctx, NULL, NULL, &br_group, false, + 0, NULL); time = jiffies; - time += own_query->startup_sent < br->multicast_startup_query_count ? - br->multicast_startup_query_interval : - br->multicast_query_interval; + time += own_query->startup_sent < brmctx->multicast_startup_query_count ? + brmctx->multicast_startup_query_interval : + brmctx->multicast_query_interval; mod_timer(&own_query->timer, time); } static void -br_multicast_port_query_expired(struct net_bridge_port *port, +br_multicast_port_query_expired(struct net_bridge_mcast_port *pmctx, struct bridge_mcast_own_query *query) { - struct net_bridge *br = port->br; + struct net_bridge *br = pmctx->port->br; + struct net_bridge_mcast *brmctx; spin_lock(&br->multicast_lock); - if (port->state == BR_STATE_DISABLED || - port->state == BR_STATE_BLOCKING) + if (br_multicast_port_ctx_state_stopped(pmctx)) goto out; - if (query->startup_sent < br->multicast_startup_query_count) + brmctx = br_multicast_port_ctx_get_global(pmctx); + if (query->startup_sent < brmctx->multicast_startup_query_count) query->startup_sent++; - br_multicast_send_query(port->br, port, query); + br_multicast_send_query(brmctx, pmctx, query); out: spin_unlock(&br->multicast_lock); @@ -1601,17 +1755,19 @@ out: static void br_ip4_multicast_port_query_expired(struct timer_list *t) { - struct net_bridge_port *port = from_timer(port, t, ip4_own_query.timer); + struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, + ip4_own_query.timer); - br_multicast_port_query_expired(port, &port->ip4_own_query); + br_multicast_port_query_expired(pmctx, &pmctx->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_port_query_expired(struct timer_list *t) { - struct net_bridge_port *port = from_timer(port, t, ip6_own_query.timer); + struct net_bridge_mcast_port *pmctx = from_timer(pmctx, t, + ip6_own_query.timer); - br_multicast_port_query_expired(port, &port->ip6_own_query); + br_multicast_port_query_expired(pmctx, &pmctx->ip6_own_query); } #endif @@ -1620,19 +1776,27 @@ static void br_multicast_port_group_rexmit(struct timer_list *t) struct net_bridge_port_group *pg = from_timer(pg, t, rexmit_timer); struct bridge_mcast_other_query *other_query = NULL; struct net_bridge *br = pg->key.port->br; + struct net_bridge_mcast_port *pmctx; + struct net_bridge_mcast *brmctx; bool need_rexmit = false; spin_lock(&br->multicast_lock); if (!netif_running(br->dev) || hlist_unhashed(&pg->mglist) || - !br_opt_get(br, BROPT_MULTICAST_ENABLED) || - !br_opt_get(br, BROPT_MULTICAST_QUERIER)) + !br_opt_get(br, BROPT_MULTICAST_ENABLED)) + goto out; + + pmctx = br_multicast_pg_to_port_ctx(pg); + if (!pmctx) + goto out; + brmctx = br_multicast_port_ctx_get_global(pmctx); + if (!brmctx->multicast_querier) goto out; if (pg->key.addr.proto == htons(ETH_P_IP)) - other_query = &br->ip4_other_query; + other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else - other_query = &br->ip6_other_query; + other_query = &brmctx->ip6_other_query; #endif if (!other_query || timer_pending(&other_query->timer)) @@ -1640,15 +1804,15 @@ static void br_multicast_port_group_rexmit(struct timer_list *t) if (pg->grp_query_rexmit_cnt) { pg->grp_query_rexmit_cnt--; - __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr, + __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, false, 1, NULL); } - __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr, + __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, true, 0, &need_rexmit); if (pg->grp_query_rexmit_cnt || need_rexmit) mod_timer(&pg->rexmit_timer, jiffies + - br->multicast_last_member_interval); + brmctx->multicast_last_member_interval); out: spin_unlock(&br->multicast_lock); } @@ -1666,23 +1830,40 @@ static int br_mc_disabled_update(struct net_device *dev, bool value, return switchdev_port_attr_set(dev, &attr, extack); } -int br_multicast_add_port(struct net_bridge_port *port) +void br_multicast_port_ctx_init(struct net_bridge_port *port, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast_port *pmctx) { - int err; - - port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; - port->multicast_eht_hosts_limit = BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT; - - timer_setup(&port->ip4_mc_router_timer, + pmctx->port = port; + pmctx->vlan = vlan; + pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; + timer_setup(&pmctx->ip4_mc_router_timer, br_ip4_multicast_router_expired, 0); - timer_setup(&port->ip4_own_query.timer, + timer_setup(&pmctx->ip4_own_query.timer, br_ip4_multicast_port_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) - timer_setup(&port->ip6_mc_router_timer, + timer_setup(&pmctx->ip6_mc_router_timer, br_ip6_multicast_router_expired, 0); - timer_setup(&port->ip6_own_query.timer, + timer_setup(&pmctx->ip6_own_query.timer, br_ip6_multicast_port_query_expired, 0); #endif +} + +void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) +{ +#if IS_ENABLED(CONFIG_IPV6) + del_timer_sync(&pmctx->ip6_mc_router_timer); +#endif + del_timer_sync(&pmctx->ip4_mc_router_timer); +} + +int br_multicast_add_port(struct net_bridge_port *port) +{ + int err; + + port->multicast_eht_hosts_limit = BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT; + br_multicast_port_ctx_init(port, NULL, &port->multicast_ctx); + err = br_mc_disabled_update(port->dev, br_opt_get(port->br, BROPT_MULTICAST_ENABLED), @@ -1711,10 +1892,7 @@ void br_multicast_del_port(struct net_bridge_port *port) hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); br_multicast_gc(&deleted_head); - del_timer_sync(&port->ip4_mc_router_timer); -#if IS_ENABLED(CONFIG_IPV6) - del_timer_sync(&port->ip6_mc_router_timer); -#endif + br_multicast_port_ctx_deinit(&port->multicast_ctx); free_percpu(port->mcast_stats); } @@ -1727,20 +1905,23 @@ static void br_multicast_enable(struct bridge_mcast_own_query *query) mod_timer(&query->timer, jiffies); } -static void __br_multicast_enable_port(struct net_bridge_port *port) +static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx) { - struct net_bridge *br = port->br; + struct net_bridge *br = pmctx->port->br; + struct net_bridge_mcast *brmctx; - if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || !netif_running(br->dev)) + brmctx = br_multicast_port_ctx_get_global(pmctx); + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) || + !netif_running(br->dev)) return; - br_multicast_enable(&port->ip4_own_query); + br_multicast_enable(&pmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) - br_multicast_enable(&port->ip6_own_query); + br_multicast_enable(&pmctx->ip6_own_query); #endif - if (port->multicast_router == MDB_RTR_TYPE_PERM) { - br_ip4_multicast_add_router(br, port); - br_ip6_multicast_add_router(br, port); + if (pmctx->multicast_router == MDB_RTR_TYPE_PERM) { + br_ip4_multicast_add_router(brmctx, pmctx); + br_ip6_multicast_add_router(brmctx, pmctx); } } @@ -1748,33 +1929,39 @@ void br_multicast_enable_port(struct net_bridge_port *port) { struct net_bridge *br = port->br; - spin_lock(&br->multicast_lock); - __br_multicast_enable_port(port); - spin_unlock(&br->multicast_lock); + spin_lock_bh(&br->multicast_lock); + __br_multicast_enable_port_ctx(&port->multicast_ctx); + spin_unlock_bh(&br->multicast_lock); } -void br_multicast_disable_port(struct net_bridge_port *port) +static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx) { - struct net_bridge *br = port->br; struct net_bridge_port_group *pg; struct hlist_node *n; bool del = false; - spin_lock(&br->multicast_lock); - hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) - if (!(pg->flags & MDB_PG_FLAGS_PERMANENT)) - br_multicast_find_del_pg(br, pg); + hlist_for_each_entry_safe(pg, n, &pmctx->port->mglist, mglist) + if (!(pg->flags & MDB_PG_FLAGS_PERMANENT) && + (!br_multicast_port_ctx_is_vlan(pmctx) || + pg->key.addr.vid == pmctx->vlan->vid)) + br_multicast_find_del_pg(pmctx->port->br, pg); - del |= br_ip4_multicast_rport_del(port); - del_timer(&port->ip4_mc_router_timer); - del_timer(&port->ip4_own_query.timer); - del |= br_ip6_multicast_rport_del(port); + del |= br_ip4_multicast_rport_del(pmctx); + del_timer(&pmctx->ip4_mc_router_timer); + del_timer(&pmctx->ip4_own_query.timer); + del |= br_ip6_multicast_rport_del(pmctx); #if IS_ENABLED(CONFIG_IPV6) - del_timer(&port->ip6_mc_router_timer); - del_timer(&port->ip6_own_query.timer); + del_timer(&pmctx->ip6_mc_router_timer); + del_timer(&pmctx->ip6_own_query.timer); #endif - br_multicast_rport_del_notify(port, del); - spin_unlock(&br->multicast_lock); + br_multicast_rport_del_notify(pmctx, del); +} + +void br_multicast_disable_port(struct net_bridge_port *port) +{ + spin_lock_bh(&port->br->multicast_lock); + __br_multicast_disable_port_ctx(&port->multicast_ctx); + spin_unlock_bh(&port->br->multicast_lock); } static int __grp_src_delete_marked(struct net_bridge_port_group *pg) @@ -1799,31 +1986,32 @@ static void __grp_src_mod_timer(struct net_bridge_group_src *src, br_multicast_fwd_src_handle(src); } -static void __grp_src_query_marked_and_rexmit(struct net_bridge_port_group *pg) +static void __grp_src_query_marked_and_rexmit(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg) { struct bridge_mcast_other_query *other_query = NULL; - struct net_bridge *br = pg->key.port->br; - u32 lmqc = br->multicast_last_member_count; + u32 lmqc = brmctx->multicast_last_member_count; unsigned long lmqt, lmi, now = jiffies; struct net_bridge_group_src *ent; - if (!netif_running(br->dev) || - !br_opt_get(br, BROPT_MULTICAST_ENABLED)) + if (!netif_running(brmctx->br->dev) || + !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) return; if (pg->key.addr.proto == htons(ETH_P_IP)) - other_query = &br->ip4_other_query; + other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else - other_query = &br->ip6_other_query; + other_query = &brmctx->ip6_other_query; #endif - lmqt = now + br_multicast_lmqt(br); + lmqt = now + br_multicast_lmqt(brmctx); hlist_for_each_entry(ent, &pg->src_list, node) { if (ent->flags & BR_SGRP_F_SEND) { ent->flags &= ~BR_SGRP_F_SEND; if (ent->timer.expires > lmqt) { - if (br_opt_get(br, BROPT_MULTICAST_QUERIER) && + if (brmctx->multicast_querier && other_query && !timer_pending(&other_query->timer)) ent->src_query_rexmit_cnt = lmqc; @@ -1832,41 +2020,42 @@ static void __grp_src_query_marked_and_rexmit(struct net_bridge_port_group *pg) } } - if (!br_opt_get(br, BROPT_MULTICAST_QUERIER) || + if (!brmctx->multicast_querier || !other_query || timer_pending(&other_query->timer)) return; - __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr, + __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, true, 1, NULL); - lmi = now + br->multicast_last_member_interval; + lmi = now + brmctx->multicast_last_member_interval; if (!timer_pending(&pg->rexmit_timer) || time_after(pg->rexmit_timer.expires, lmi)) mod_timer(&pg->rexmit_timer, lmi); } -static void __grp_send_query_and_rexmit(struct net_bridge_port_group *pg) +static void __grp_send_query_and_rexmit(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg) { struct bridge_mcast_other_query *other_query = NULL; - struct net_bridge *br = pg->key.port->br; unsigned long now = jiffies, lmi; - if (!netif_running(br->dev) || - !br_opt_get(br, BROPT_MULTICAST_ENABLED)) + if (!netif_running(brmctx->br->dev) || + !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED)) return; if (pg->key.addr.proto == htons(ETH_P_IP)) - other_query = &br->ip4_other_query; + other_query = &brmctx->ip4_other_query; #if IS_ENABLED(CONFIG_IPV6) else - other_query = &br->ip6_other_query; + other_query = &brmctx->ip6_other_query; #endif - if (br_opt_get(br, BROPT_MULTICAST_QUERIER) && + if (brmctx->multicast_querier && other_query && !timer_pending(&other_query->timer)) { - lmi = now + br->multicast_last_member_interval; - pg->grp_query_rexmit_cnt = br->multicast_last_member_count - 1; - __br_multicast_send_query(br, pg->key.port, pg, &pg->key.addr, + lmi = now + brmctx->multicast_last_member_interval; + pg->grp_query_rexmit_cnt = brmctx->multicast_last_member_count - 1; + __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr, &pg->key.addr, false, 0, NULL); if (!timer_pending(&pg->rexmit_timer) || time_after(pg->rexmit_timer.expires, lmi)) @@ -1875,8 +2064,8 @@ static void __grp_send_query_and_rexmit(struct net_bridge_port_group *pg) if (pg->filter_mode == MCAST_EXCLUDE && (!timer_pending(&pg->timer) || - time_after(pg->timer.expires, now + br_multicast_lmqt(br)))) - mod_timer(&pg->timer, now + br_multicast_lmqt(br)); + time_after(pg->timer.expires, now + br_multicast_lmqt(brmctx)))) + mod_timer(&pg->timer, now + br_multicast_lmqt(brmctx)); } /* State Msg type New state Actions @@ -1884,11 +2073,11 @@ static void __grp_send_query_and_rexmit(struct net_bridge_port_group *pg) * INCLUDE (A) ALLOW (B) INCLUDE (A+B) (B)=GMI * EXCLUDE (X,Y) ALLOW (A) EXCLUDE (X+A,Y-A) (A)=GMI */ -static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, void *h_addr, +static bool br_multicast_isinc_allow(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; @@ -1907,10 +2096,11 @@ static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, void *h_a } if (ent) - __grp_src_mod_timer(ent, now + br_multicast_gmi(br)); + __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; return changed; @@ -1921,7 +2111,8 @@ static bool br_multicast_isinc_allow(struct net_bridge_port_group *pg, void *h_a * Delete (A-B) * Group Timer=GMI */ -static void __grp_src_isexc_incl(struct net_bridge_port_group *pg, void *h_addr, +static void __grp_src_isexc_incl(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { @@ -1945,7 +2136,8 @@ static void __grp_src_isexc_incl(struct net_bridge_port_group *pg, void *h_addr, br_multicast_fwd_src_handle(ent); } - br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type); + br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type); __grp_src_delete_marked(pg); } @@ -1956,11 +2148,11 @@ static void __grp_src_isexc_incl(struct net_bridge_port_group *pg, void *h_addr, * Delete (Y-A) * Group Timer=GMI */ -static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg, void *h_addr, +static bool __grp_src_isexc_excl(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; struct net_bridge_group_src *ent; unsigned long now = jiffies; bool changed = false; @@ -1981,13 +2173,14 @@ static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg, void *h_addr, ent = br_multicast_new_group_src(pg, &src_ip); if (ent) { __grp_src_mod_timer(ent, - now + br_multicast_gmi(br)); + now + br_multicast_gmi(brmctx)); changed = true; } } } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (__grp_src_delete_marked(pg)) @@ -1996,28 +2189,28 @@ static bool __grp_src_isexc_excl(struct net_bridge_port_group *pg, void *h_addr, return changed; } -static bool br_multicast_isexc(struct net_bridge_port_group *pg, void *h_addr, +static bool br_multicast_isexc(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: - __grp_src_isexc_incl(pg, h_addr, srcs, nsrcs, addr_size, + __grp_src_isexc_incl(brmctx, pg, h_addr, srcs, nsrcs, addr_size, grec_type); br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: - changed = __grp_src_isexc_excl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_isexc_excl(brmctx, pg, h_addr, srcs, nsrcs, + addr_size, grec_type); break; } pg->filter_mode = MCAST_EXCLUDE; - mod_timer(&pg->timer, jiffies + br_multicast_gmi(br)); + mod_timer(&pg->timer, jiffies + br_multicast_gmi(brmctx)); return changed; } @@ -2026,11 +2219,12 @@ static bool br_multicast_isexc(struct net_bridge_port_group *pg, void *h_addr, * INCLUDE (A) TO_IN (B) INCLUDE (A+B) (B)=GMI * Send Q(G,A-B) */ -static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, void *h_addr, +static bool __grp_src_toin_incl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; u32 src_idx, to_send = pg->src_ents; struct net_bridge_group_src *ent; unsigned long now = jiffies; @@ -2054,14 +2248,15 @@ static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, void *h_addr, changed = true; } if (ent) - __grp_src_mod_timer(ent, now + br_multicast_gmi(br)); + __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (to_send) - __grp_src_query_marked_and_rexmit(pg); + __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } @@ -2071,11 +2266,12 @@ static bool __grp_src_toin_incl(struct net_bridge_port_group *pg, void *h_addr, * Send Q(G,X-A) * Send Q(G) */ -static bool __grp_src_toin_excl(struct net_bridge_port_group *pg, void *h_addr, +static bool __grp_src_toin_excl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; u32 src_idx, to_send = pg->src_ents; struct net_bridge_group_src *ent; unsigned long now = jiffies; @@ -2102,21 +2298,24 @@ static bool __grp_src_toin_excl(struct net_bridge_port_group *pg, void *h_addr, changed = true; } if (ent) - __grp_src_mod_timer(ent, now + br_multicast_gmi(br)); + __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx)); } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (to_send) - __grp_src_query_marked_and_rexmit(pg); + __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); - __grp_send_query_and_rexmit(pg); + __grp_send_query_and_rexmit(brmctx, pmctx, pg); return changed; } -static bool br_multicast_toin(struct net_bridge_port_group *pg, void *h_addr, +static bool br_multicast_toin(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { @@ -2124,12 +2323,12 @@ static bool br_multicast_toin(struct net_bridge_port_group *pg, void *h_addr, switch (pg->filter_mode) { case MCAST_INCLUDE: - changed = __grp_src_toin_incl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_toin_incl(brmctx, pmctx, pg, h_addr, srcs, + nsrcs, addr_size, grec_type); break; case MCAST_EXCLUDE: - changed = __grp_src_toin_excl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_toin_excl(brmctx, pmctx, pg, h_addr, srcs, + nsrcs, addr_size, grec_type); break; } @@ -2151,7 +2350,9 @@ static bool br_multicast_toin(struct net_bridge_port_group *pg, void *h_addr, * Send Q(G,A*B) * Group Timer=GMI */ -static void __grp_src_toex_incl(struct net_bridge_port_group *pg, void *h_addr, +static void __grp_src_toex_incl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { @@ -2178,11 +2379,12 @@ static void __grp_src_toex_incl(struct net_bridge_port_group *pg, void *h_addr, br_multicast_fwd_src_handle(ent); } - br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type); + br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type); __grp_src_delete_marked(pg); if (to_send) - __grp_src_query_marked_and_rexmit(pg); + __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); } /* State Msg type New state Actions @@ -2192,7 +2394,9 @@ static void __grp_src_toex_incl(struct net_bridge_port_group *pg, void *h_addr, * Send Q(G,A-Y) * Group Timer=GMI */ -static bool __grp_src_toex_excl(struct net_bridge_port_group *pg, void *h_addr, +static bool __grp_src_toex_excl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { @@ -2224,39 +2428,41 @@ static bool __grp_src_toex_excl(struct net_bridge_port_group *pg, void *h_addr, } } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (__grp_src_delete_marked(pg)) changed = true; if (to_send) - __grp_src_query_marked_and_rexmit(pg); + __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } -static bool br_multicast_toex(struct net_bridge_port_group *pg, void *h_addr, +static bool br_multicast_toex(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { - struct net_bridge *br = pg->key.port->br; bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: - __grp_src_toex_incl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + __grp_src_toex_incl(brmctx, pmctx, pg, h_addr, srcs, nsrcs, + addr_size, grec_type); br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE); changed = true; break; case MCAST_EXCLUDE: - changed = __grp_src_toex_excl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_toex_excl(brmctx, pmctx, pg, h_addr, srcs, + nsrcs, addr_size, grec_type); break; } pg->filter_mode = MCAST_EXCLUDE; - mod_timer(&pg->timer, jiffies + br_multicast_gmi(br)); + mod_timer(&pg->timer, jiffies + br_multicast_gmi(brmctx)); return changed; } @@ -2264,7 +2470,9 @@ static bool br_multicast_toex(struct net_bridge_port_group *pg, void *h_addr, /* State Msg type New state Actions * INCLUDE (A) BLOCK (B) INCLUDE (A) Send Q(G,A*B) */ -static bool __grp_src_block_incl(struct net_bridge_port_group *pg, void *h_addr, +static bool __grp_src_block_incl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; @@ -2286,11 +2494,12 @@ static bool __grp_src_block_incl(struct net_bridge_port_group *pg, void *h_addr, } } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (to_send) - __grp_src_query_marked_and_rexmit(pg); + __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } @@ -2299,7 +2508,9 @@ static bool __grp_src_block_incl(struct net_bridge_port_group *pg, void *h_addr, * EXCLUDE (X,Y) BLOCK (A) EXCLUDE (X+(A-Y),Y) (A-X-Y)=Group Timer * Send Q(G,A-Y) */ -static bool __grp_src_block_excl(struct net_bridge_port_group *pg, void *h_addr, +static bool __grp_src_block_excl(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { struct net_bridge_group_src *ent; @@ -2328,28 +2539,31 @@ static bool __grp_src_block_excl(struct net_bridge_port_group *pg, void *h_addr, } } - if (br_multicast_eht_handle(pg, h_addr, srcs, nsrcs, addr_size, grec_type)) + if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size, + grec_type)) changed = true; if (to_send) - __grp_src_query_marked_and_rexmit(pg); + __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg); return changed; } -static bool br_multicast_block(struct net_bridge_port_group *pg, void *h_addr, +static bool br_multicast_block(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, size_t addr_size, int grec_type) { bool changed = false; switch (pg->filter_mode) { case MCAST_INCLUDE: - changed = __grp_src_block_incl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_block_incl(brmctx, pmctx, pg, h_addr, srcs, + nsrcs, addr_size, grec_type); break; case MCAST_EXCLUDE: - changed = __grp_src_block_excl(pg, h_addr, srcs, nsrcs, addr_size, - grec_type); + changed = __grp_src_block_excl(brmctx, pmctx, pg, h_addr, srcs, + nsrcs, addr_size, grec_type); break; } @@ -2384,12 +2598,12 @@ br_multicast_find_port(struct net_bridge_mdb_entry *mp, return NULL; } -static int br_ip4_multicast_igmp3_report(struct net_bridge *br, - struct net_bridge_port *port, +static int br_ip4_multicast_igmp3_report(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { - bool igmpv2 = br->multicast_igmp_version == 2; + bool igmpv2 = brmctx->multicast_igmp_version == 2; struct net_bridge_mdb_entry *mdst; struct net_bridge_port_group *pg; const unsigned char *src; @@ -2436,25 +2650,29 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, if (nsrcs == 0 && (type == IGMPV3_CHANGE_TO_INCLUDE || type == IGMPV3_MODE_IS_INCLUDE)) { - if (!port || igmpv2) { - br_ip4_multicast_leave_group(br, port, group, vid, src); + if (!pmctx || igmpv2) { + br_ip4_multicast_leave_group(brmctx, pmctx, + group, vid, src); continue; } } else { - err = br_ip4_multicast_add_group(br, port, group, vid, - src, igmpv2); + err = br_ip4_multicast_add_group(brmctx, pmctx, group, + vid, src, igmpv2); if (err) break; } - if (!port || igmpv2) + if (!pmctx || igmpv2) continue; - spin_lock_bh(&br->multicast_lock); - mdst = br_mdb_ip4_get(br, group, vid); + spin_lock_bh(&brmctx->br->multicast_lock); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) + goto unlock_continue; + + mdst = br_mdb_ip4_get(brmctx->br, group, vid); if (!mdst) goto unlock_continue; - pg = br_multicast_find_port(mdst, port, src); + pg = br_multicast_find_port(mdst, pmctx->port, src); if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT)) goto unlock_continue; /* reload grec and host addr */ @@ -2462,51 +2680,57 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br, h_addr = &ip_hdr(skb)->saddr; switch (type) { case IGMPV3_ALLOW_NEW_SOURCES: - changed = br_multicast_isinc_allow(pg, h_addr, grec->grec_src, + changed = br_multicast_isinc_allow(brmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_MODE_IS_INCLUDE: - changed = br_multicast_isinc_allow(pg, h_addr, grec->grec_src, + changed = br_multicast_isinc_allow(brmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_MODE_IS_EXCLUDE: - changed = br_multicast_isexc(pg, h_addr, grec->grec_src, + changed = br_multicast_isexc(brmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_CHANGE_TO_INCLUDE: - changed = br_multicast_toin(pg, h_addr, grec->grec_src, + changed = br_multicast_toin(brmctx, pmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_CHANGE_TO_EXCLUDE: - changed = br_multicast_toex(pg, h_addr, grec->grec_src, + changed = br_multicast_toex(brmctx, pmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; case IGMPV3_BLOCK_OLD_SOURCES: - changed = br_multicast_block(pg, h_addr, grec->grec_src, + changed = br_multicast_block(brmctx, pmctx, pg, h_addr, + grec->grec_src, nsrcs, sizeof(__be32), type); break; } if (changed) - br_mdb_notify(br->dev, mdst, pg, RTM_NEWMDB); + br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB); unlock_continue: - spin_unlock_bh(&br->multicast_lock); + spin_unlock_bh(&brmctx->br->multicast_lock); } return err; } #if IS_ENABLED(CONFIG_IPV6) -static int br_ip6_multicast_mld2_report(struct net_bridge *br, - struct net_bridge_port *port, +static int br_ip6_multicast_mld2_report(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { - bool mldv1 = br->multicast_mld_version == 1; + bool mldv1 = brmctx->multicast_mld_version == 1; struct net_bridge_mdb_entry *mdst; struct net_bridge_port_group *pg; unsigned int nsrcs_offset; + struct mld2_report *mld2r; const unsigned char *src; - struct icmp6hdr *icmp6h; struct in6_addr *h_addr; struct mld2_grec *grec; unsigned int grec_len; @@ -2514,12 +2738,12 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, int i, len, num; int err = 0; - if (!ipv6_mc_may_pull(skb, sizeof(*icmp6h))) + if (!ipv6_mc_may_pull(skb, sizeof(*mld2r))) return -EINVAL; - icmp6h = icmp6_hdr(skb); - num = ntohs(icmp6h->icmp6_dataun.un_data16[1]); - len = skb_transport_offset(skb) + sizeof(*icmp6h); + mld2r = (struct mld2_report *)icmp6_hdr(skb); + num = ntohs(mld2r->mld2r_ngrec); + len = skb_transport_offset(skb) + sizeof(*mld2r); for (i = 0; i < num; i++) { __be16 *_nsrcs, __nsrcs; @@ -2562,137 +2786,243 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br, if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE || grec->grec_type == MLD2_MODE_IS_INCLUDE) && nsrcs == 0) { - if (!port || mldv1) { - br_ip6_multicast_leave_group(br, port, + if (!pmctx || mldv1) { + br_ip6_multicast_leave_group(brmctx, pmctx, &grec->grec_mca, vid, src); continue; } } else { - err = br_ip6_multicast_add_group(br, port, + err = br_ip6_multicast_add_group(brmctx, pmctx, &grec->grec_mca, vid, src, mldv1); if (err) break; } - if (!port || mldv1) + if (!pmctx || mldv1) continue; - spin_lock_bh(&br->multicast_lock); - mdst = br_mdb_ip6_get(br, &grec->grec_mca, vid); + spin_lock_bh(&brmctx->br->multicast_lock); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) + goto unlock_continue; + + mdst = br_mdb_ip6_get(brmctx->br, &grec->grec_mca, vid); if (!mdst) goto unlock_continue; - pg = br_multicast_find_port(mdst, port, src); + pg = br_multicast_find_port(mdst, pmctx->port, src); if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT)) goto unlock_continue; h_addr = &ipv6_hdr(skb)->saddr; switch (grec->grec_type) { case MLD2_ALLOW_NEW_SOURCES: - changed = br_multicast_isinc_allow(pg, h_addr, + changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_MODE_IS_INCLUDE: - changed = br_multicast_isinc_allow(pg, h_addr, + changed = br_multicast_isinc_allow(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_MODE_IS_EXCLUDE: - changed = br_multicast_isexc(pg, h_addr, + changed = br_multicast_isexc(brmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_CHANGE_TO_INCLUDE: - changed = br_multicast_toin(pg, h_addr, + changed = br_multicast_toin(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_CHANGE_TO_EXCLUDE: - changed = br_multicast_toex(pg, h_addr, + changed = br_multicast_toex(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; case MLD2_BLOCK_OLD_SOURCES: - changed = br_multicast_block(pg, h_addr, + changed = br_multicast_block(brmctx, pmctx, pg, h_addr, grec->grec_src, nsrcs, sizeof(struct in6_addr), grec->grec_type); break; } if (changed) - br_mdb_notify(br->dev, mdst, pg, RTM_NEWMDB); + br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB); unlock_continue: - spin_unlock_bh(&br->multicast_lock); + spin_unlock_bh(&brmctx->br->multicast_lock); } return err; } #endif -static bool br_ip4_multicast_select_querier(struct net_bridge *br, - struct net_bridge_port *port, - __be32 saddr) +static bool br_multicast_select_querier(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, + struct br_ip *saddr) { - if (!timer_pending(&br->ip4_own_query.timer) && - !timer_pending(&br->ip4_other_query.timer)) - goto update; + int port_ifidx = pmctx ? pmctx->port->dev->ifindex : 0; + struct timer_list *own_timer, *other_timer; + struct bridge_mcast_querier *querier; - if (!br->ip4_querier.addr.src.ip4) - goto update; + switch (saddr->proto) { + case htons(ETH_P_IP): + querier = &brmctx->ip4_querier; + own_timer = &brmctx->ip4_own_query.timer; + other_timer = &brmctx->ip4_other_query.timer; + if (!querier->addr.src.ip4 || + ntohl(saddr->src.ip4) <= ntohl(querier->addr.src.ip4)) + goto update; + break; +#if IS_ENABLED(CONFIG_IPV6) + case htons(ETH_P_IPV6): + querier = &brmctx->ip6_querier; + own_timer = &brmctx->ip6_own_query.timer; + other_timer = &brmctx->ip6_other_query.timer; + if (ipv6_addr_cmp(&saddr->src.ip6, &querier->addr.src.ip6) <= 0) + goto update; + break; +#endif + default: + return false; + } - if (ntohl(saddr) <= ntohl(br->ip4_querier.addr.src.ip4)) + if (!timer_pending(own_timer) && !timer_pending(other_timer)) goto update; return false; update: - br->ip4_querier.addr.src.ip4 = saddr; - - /* update protected by general multicast_lock by caller */ - rcu_assign_pointer(br->ip4_querier.port, port); + br_multicast_update_querier(brmctx, querier, port_ifidx, saddr); return true; } +static struct net_bridge_port * +__br_multicast_get_querier_port(struct net_bridge *br, + const struct bridge_mcast_querier *querier) +{ + int port_ifidx = READ_ONCE(querier->port_ifidx); + struct net_bridge_port *p; + struct net_device *dev; + + if (port_ifidx == 0) + return NULL; + + dev = dev_get_by_index_rcu(dev_net(br->dev), port_ifidx); + if (!dev) + return NULL; + p = br_port_get_rtnl_rcu(dev); + if (!p || p->br != br) + return NULL; + + return p; +} + +size_t br_multicast_querier_state_size(void) +{ + return nla_total_size(0) + /* nest attribute */ + nla_total_size(sizeof(__be32)) + /* BRIDGE_QUERIER_IP_ADDRESS */ + nla_total_size(sizeof(int)) + /* BRIDGE_QUERIER_IP_PORT */ + nla_total_size_64bit(sizeof(u64)) + /* BRIDGE_QUERIER_IP_OTHER_TIMER */ #if IS_ENABLED(CONFIG_IPV6) -static bool br_ip6_multicast_select_querier(struct net_bridge *br, - struct net_bridge_port *port, - struct in6_addr *saddr) + nla_total_size(sizeof(struct in6_addr)) + /* BRIDGE_QUERIER_IPV6_ADDRESS */ + nla_total_size(sizeof(int)) + /* BRIDGE_QUERIER_IPV6_PORT */ + nla_total_size_64bit(sizeof(u64)) + /* BRIDGE_QUERIER_IPV6_OTHER_TIMER */ +#endif + 0; +} + +/* protected by rtnl or rcu */ +int br_multicast_dump_querier_state(struct sk_buff *skb, + const struct net_bridge_mcast *brmctx, + int nest_attr) { - if (!timer_pending(&br->ip6_own_query.timer) && - !timer_pending(&br->ip6_other_query.timer)) - goto update; + struct bridge_mcast_querier querier = {}; + struct net_bridge_port *p; + struct nlattr *nest; - if (ipv6_addr_cmp(saddr, &br->ip6_querier.addr.src.ip6) <= 0) - goto update; + if (!br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED) || + br_multicast_ctx_vlan_global_disabled(brmctx)) + return 0; - return false; + nest = nla_nest_start(skb, nest_attr); + if (!nest) + return -EMSGSIZE; -update: - br->ip6_querier.addr.src.ip6 = *saddr; + rcu_read_lock(); + if (!brmctx->multicast_querier && + !timer_pending(&brmctx->ip4_other_query.timer)) + goto out_v6; + + br_multicast_read_querier(&brmctx->ip4_querier, &querier); + if (nla_put_in_addr(skb, BRIDGE_QUERIER_IP_ADDRESS, + querier.addr.src.ip4)) { + rcu_read_unlock(); + goto out_err; + } - /* update protected by general multicast_lock by caller */ - rcu_assign_pointer(br->ip6_querier.port, port); + p = __br_multicast_get_querier_port(brmctx->br, &querier); + if (timer_pending(&brmctx->ip4_other_query.timer) && + (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IP_OTHER_TIMER, + br_timer_value(&brmctx->ip4_other_query.timer), + BRIDGE_QUERIER_PAD) || + (p && nla_put_u32(skb, BRIDGE_QUERIER_IP_PORT, p->dev->ifindex)))) { + rcu_read_unlock(); + goto out_err; + } - return true; -} +out_v6: +#if IS_ENABLED(CONFIG_IPV6) + if (!brmctx->multicast_querier && + !timer_pending(&brmctx->ip6_other_query.timer)) + goto out; + + br_multicast_read_querier(&brmctx->ip6_querier, &querier); + if (nla_put_in6_addr(skb, BRIDGE_QUERIER_IPV6_ADDRESS, + &querier.addr.src.ip6)) { + rcu_read_unlock(); + goto out_err; + } + + p = __br_multicast_get_querier_port(brmctx->br, &querier); + if (timer_pending(&brmctx->ip6_other_query.timer) && + (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IPV6_OTHER_TIMER, + br_timer_value(&brmctx->ip6_other_query.timer), + BRIDGE_QUERIER_PAD) || + (p && nla_put_u32(skb, BRIDGE_QUERIER_IPV6_PORT, + p->dev->ifindex)))) { + rcu_read_unlock(); + goto out_err; + } +out: #endif + rcu_read_unlock(); + nla_nest_end(skb, nest); + if (!nla_len(nest)) + nla_nest_cancel(skb, nest); + + return 0; + +out_err: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} static void -br_multicast_update_query_timer(struct net_bridge *br, +br_multicast_update_query_timer(struct net_bridge_mcast *brmctx, struct bridge_mcast_other_query *query, unsigned long max_delay) { if (!timer_pending(&query->timer)) query->delay_time = jiffies + max_delay; - mod_timer(&query->timer, jiffies + br->multicast_querier_interval); + mod_timer(&query->timer, jiffies + brmctx->multicast_querier_interval); } static void br_port_mc_router_state_change(struct net_bridge_port *p, @@ -2709,19 +3039,26 @@ static void br_port_mc_router_state_change(struct net_bridge_port *p, } static struct net_bridge_port * -br_multicast_rport_from_node(struct net_bridge *br, +br_multicast_rport_from_node(struct net_bridge_mcast *brmctx, struct hlist_head *mc_router_list, struct hlist_node *rlist) { + struct net_bridge_mcast_port *pmctx; + #if IS_ENABLED(CONFIG_IPV6) - if (mc_router_list == &br->ip6_mc_router_list) - return hlist_entry(rlist, struct net_bridge_port, ip6_rlist); + if (mc_router_list == &brmctx->ip6_mc_router_list) + pmctx = hlist_entry(rlist, struct net_bridge_mcast_port, + ip6_rlist); + else #endif - return hlist_entry(rlist, struct net_bridge_port, ip4_rlist); + pmctx = hlist_entry(rlist, struct net_bridge_mcast_port, + ip4_rlist); + + return pmctx->port; } static struct hlist_node * -br_multicast_get_rport_slot(struct net_bridge *br, +br_multicast_get_rport_slot(struct net_bridge_mcast *brmctx, struct net_bridge_port *port, struct hlist_head *mc_router_list) @@ -2731,7 +3068,7 @@ br_multicast_get_rport_slot(struct net_bridge *br, struct hlist_node *rlist; hlist_for_each(rlist, mc_router_list) { - p = br_multicast_rport_from_node(br, mc_router_list, rlist); + p = br_multicast_rport_from_node(brmctx, mc_router_list, rlist); if ((unsigned long)port >= (unsigned long)p) break; @@ -2742,14 +3079,14 @@ br_multicast_get_rport_slot(struct net_bridge *br, return slot; } -static bool br_multicast_no_router_otherpf(struct net_bridge_port *port, +static bool br_multicast_no_router_otherpf(struct net_bridge_mcast_port *pmctx, struct hlist_node *rnode) { #if IS_ENABLED(CONFIG_IPV6) - if (rnode != &port->ip6_rlist) - return hlist_unhashed(&port->ip6_rlist); + if (rnode != &pmctx->ip6_rlist) + return hlist_unhashed(&pmctx->ip6_rlist); else - return hlist_unhashed(&port->ip4_rlist); + return hlist_unhashed(&pmctx->ip4_rlist); #else return true; #endif @@ -2759,8 +3096,8 @@ static bool br_multicast_no_router_otherpf(struct net_bridge_port *port, * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ -static void br_multicast_add_router(struct net_bridge *br, - struct net_bridge_port *port, +static void br_multicast_add_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct hlist_node *rlist, struct hlist_head *mc_router_list) { @@ -2769,7 +3106,7 @@ static void br_multicast_add_router(struct net_bridge *br, if (!hlist_unhashed(rlist)) return; - slot = br_multicast_get_rport_slot(br, port, mc_router_list); + slot = br_multicast_get_rport_slot(brmctx, pmctx->port, mc_router_list); if (slot) hlist_add_behind_rcu(rlist, slot); @@ -2780,9 +3117,9 @@ static void br_multicast_add_router(struct net_bridge *br, * switched from no IPv4/IPv6 multicast router to a new * IPv4 or IPv6 multicast router. */ - if (br_multicast_no_router_otherpf(port, rlist)) { - br_rtr_notify(br->dev, port, RTM_NEWMDB); - br_port_mc_router_state_change(port, true); + if (br_multicast_no_router_otherpf(pmctx, rlist)) { + br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_NEWMDB); + br_port_mc_router_state_change(pmctx->port, true); } } @@ -2790,116 +3127,119 @@ static void br_multicast_add_router(struct net_bridge *br, * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ -static void br_ip4_multicast_add_router(struct net_bridge *br, - struct net_bridge_port *port) +static void br_ip4_multicast_add_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx) { - br_multicast_add_router(br, port, &port->ip4_rlist, - &br->ip4_mc_router_list); + br_multicast_add_router(brmctx, pmctx, &pmctx->ip4_rlist, + &brmctx->ip4_mc_router_list); } /* Add port to router_list * list is maintained ordered by pointer value * and locked by br->multicast_lock and RCU */ -static void br_ip6_multicast_add_router(struct net_bridge *br, - struct net_bridge_port *port) +static void br_ip6_multicast_add_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) - br_multicast_add_router(br, port, &port->ip6_rlist, - &br->ip6_mc_router_list); + br_multicast_add_router(brmctx, pmctx, &pmctx->ip6_rlist, + &brmctx->ip6_mc_router_list); #endif } -static void br_multicast_mark_router(struct net_bridge *br, - struct net_bridge_port *port, +static void br_multicast_mark_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct timer_list *timer, struct hlist_node *rlist, struct hlist_head *mc_router_list) { unsigned long now = jiffies; - if (!port) { - if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) { - if (!br_ip4_multicast_is_router(br) && - !br_ip6_multicast_is_router(br)) - br_mc_router_state_change(br, true); - mod_timer(timer, now + br->multicast_querier_interval); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) + return; + + if (!pmctx) { + if (brmctx->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) { + if (!br_ip4_multicast_is_router(brmctx) && + !br_ip6_multicast_is_router(brmctx)) + br_mc_router_state_change(brmctx->br, true); + mod_timer(timer, now + brmctx->multicast_querier_interval); } return; } - if (port->multicast_router == MDB_RTR_TYPE_DISABLED || - port->multicast_router == MDB_RTR_TYPE_PERM) + if (pmctx->multicast_router == MDB_RTR_TYPE_DISABLED || + pmctx->multicast_router == MDB_RTR_TYPE_PERM) return; - br_multicast_add_router(br, port, rlist, mc_router_list); - mod_timer(timer, now + br->multicast_querier_interval); + br_multicast_add_router(brmctx, pmctx, rlist, mc_router_list); + mod_timer(timer, now + brmctx->multicast_querier_interval); } -static void br_ip4_multicast_mark_router(struct net_bridge *br, - struct net_bridge_port *port) +static void br_ip4_multicast_mark_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx) { - struct timer_list *timer = &br->ip4_mc_router_timer; + struct timer_list *timer = &brmctx->ip4_mc_router_timer; struct hlist_node *rlist = NULL; - if (port) { - timer = &port->ip4_mc_router_timer; - rlist = &port->ip4_rlist; + if (pmctx) { + timer = &pmctx->ip4_mc_router_timer; + rlist = &pmctx->ip4_rlist; } - br_multicast_mark_router(br, port, timer, rlist, - &br->ip4_mc_router_list); + br_multicast_mark_router(brmctx, pmctx, timer, rlist, + &brmctx->ip4_mc_router_list); } -static void br_ip6_multicast_mark_router(struct net_bridge *br, - struct net_bridge_port *port) +static void br_ip6_multicast_mark_router(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx) { #if IS_ENABLED(CONFIG_IPV6) - struct timer_list *timer = &br->ip6_mc_router_timer; + struct timer_list *timer = &brmctx->ip6_mc_router_timer; struct hlist_node *rlist = NULL; - if (port) { - timer = &port->ip6_mc_router_timer; - rlist = &port->ip6_rlist; + if (pmctx) { + timer = &pmctx->ip6_mc_router_timer; + rlist = &pmctx->ip6_rlist; } - br_multicast_mark_router(br, port, timer, rlist, - &br->ip6_mc_router_list); + br_multicast_mark_router(brmctx, pmctx, timer, rlist, + &brmctx->ip6_mc_router_list); #endif } static void -br_ip4_multicast_query_received(struct net_bridge *br, - struct net_bridge_port *port, +br_ip4_multicast_query_received(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct bridge_mcast_other_query *query, struct br_ip *saddr, unsigned long max_delay) { - if (!br_ip4_multicast_select_querier(br, port, saddr->src.ip4)) + if (!br_multicast_select_querier(brmctx, pmctx, saddr)) return; - br_multicast_update_query_timer(br, query, max_delay); - br_ip4_multicast_mark_router(br, port); + br_multicast_update_query_timer(brmctx, query, max_delay); + br_ip4_multicast_mark_router(brmctx, pmctx); } #if IS_ENABLED(CONFIG_IPV6) static void -br_ip6_multicast_query_received(struct net_bridge *br, - struct net_bridge_port *port, +br_ip6_multicast_query_received(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct bridge_mcast_other_query *query, struct br_ip *saddr, unsigned long max_delay) { - if (!br_ip6_multicast_select_querier(br, port, &saddr->src.ip6)) + if (!br_multicast_select_querier(brmctx, pmctx, saddr)) return; - br_multicast_update_query_timer(br, query, max_delay); - br_ip6_multicast_mark_router(br, port); + br_multicast_update_query_timer(brmctx, query, max_delay); + br_ip6_multicast_mark_router(brmctx, pmctx); } #endif -static void br_ip4_multicast_query(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip4_multicast_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { @@ -2910,14 +3250,13 @@ static void br_ip4_multicast_query(struct net_bridge *br, struct igmpv3_query *ih3; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; - struct br_ip saddr; + struct br_ip saddr = {}; unsigned long max_delay; unsigned long now = jiffies; __be32 group; - spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || - (port && port->state == BR_STATE_DISABLED)) + spin_lock(&brmctx->br->multicast_lock); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; group = ih->group; @@ -2932,7 +3271,8 @@ static void br_ip4_multicast_query(struct net_bridge *br, } else if (transport_len >= sizeof(*ih3)) { ih3 = igmpv3_query_hdr(skb); if (ih3->nsrcs || - (br->multicast_igmp_version == 3 && group && ih3->suppress)) + (brmctx->multicast_igmp_version == 3 && group && + ih3->suppress)) goto out; max_delay = ih3->code ? @@ -2945,16 +3285,17 @@ static void br_ip4_multicast_query(struct net_bridge *br, saddr.proto = htons(ETH_P_IP); saddr.src.ip4 = iph->saddr; - br_ip4_multicast_query_received(br, port, &br->ip4_other_query, + br_ip4_multicast_query_received(brmctx, pmctx, + &brmctx->ip4_other_query, &saddr, max_delay); goto out; } - mp = br_mdb_ip4_get(br, group, vid); + mp = br_mdb_ip4_get(brmctx->br, group, vid); if (!mp) goto out; - max_delay *= br->multicast_last_member_count; + max_delay *= brmctx->multicast_last_member_count; if (mp->host_joined && (timer_pending(&mp->timer) ? @@ -2963,23 +3304,23 @@ static void br_ip4_multicast_query(struct net_bridge *br, mod_timer(&mp->timer, now + max_delay); for (pp = &mp->ports; - (p = mlock_dereference(*pp, br)) != NULL; + (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : try_to_del_timer_sync(&p->timer) >= 0 && - (br->multicast_igmp_version == 2 || + (brmctx->multicast_igmp_version == 2 || p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); } out: - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); } #if IS_ENABLED(CONFIG_IPV6) -static int br_ip6_multicast_query(struct net_bridge *br, - struct net_bridge_port *port, +static int br_ip6_multicast_query(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { @@ -2989,7 +3330,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, struct mld2_query *mld2q; struct net_bridge_port_group *p; struct net_bridge_port_group __rcu **pp; - struct br_ip saddr; + struct br_ip saddr = {}; unsigned long max_delay; unsigned long now = jiffies; unsigned int offset = skb_transport_offset(skb); @@ -2997,9 +3338,8 @@ static int br_ip6_multicast_query(struct net_bridge *br, bool is_general_query; int err = 0; - spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || - (port && port->state == BR_STATE_DISABLED)) + spin_lock(&brmctx->br->multicast_lock); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; if (transport_len == sizeof(*mld)) { @@ -3019,7 +3359,7 @@ static int br_ip6_multicast_query(struct net_bridge *br, mld2q = (struct mld2_query *)icmp6_hdr(skb); if (!mld2q->mld2q_nsrcs) group = &mld2q->mld2q_mca; - if (br->multicast_mld_version == 2 && + if (brmctx->multicast_mld_version == 2 && !ipv6_addr_any(&mld2q->mld2q_mca) && mld2q->mld2q_suppress) goto out; @@ -3033,18 +3373,19 @@ static int br_ip6_multicast_query(struct net_bridge *br, saddr.proto = htons(ETH_P_IPV6); saddr.src.ip6 = ipv6_hdr(skb)->saddr; - br_ip6_multicast_query_received(br, port, &br->ip6_other_query, + br_ip6_multicast_query_received(brmctx, pmctx, + &brmctx->ip6_other_query, &saddr, max_delay); goto out; } else if (!group) { goto out; } - mp = br_mdb_ip6_get(br, group, vid); + mp = br_mdb_ip6_get(brmctx->br, group, vid); if (!mp) goto out; - max_delay *= br->multicast_last_member_count; + max_delay *= brmctx->multicast_last_member_count; if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, now + max_delay) : @@ -3052,25 +3393,25 @@ static int br_ip6_multicast_query(struct net_bridge *br, mod_timer(&mp->timer, now + max_delay); for (pp = &mp->ports; - (p = mlock_dereference(*pp, br)) != NULL; + (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { if (timer_pending(&p->timer) ? time_after(p->timer.expires, now + max_delay) : try_to_del_timer_sync(&p->timer) >= 0 && - (br->multicast_mld_version == 1 || + (brmctx->multicast_mld_version == 1 || p->filter_mode == MCAST_EXCLUDE)) mod_timer(&p->timer, now + max_delay); } out: - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); return err; } #endif static void -br_multicast_leave_group(struct net_bridge *br, - struct net_bridge_port *port, +br_multicast_leave_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct br_ip *group, struct bridge_mcast_other_query *other_query, struct bridge_mcast_own_query *own_query, @@ -3081,22 +3422,21 @@ br_multicast_leave_group(struct net_bridge *br, unsigned long now; unsigned long time; - spin_lock(&br->multicast_lock); - if (!netif_running(br->dev) || - (port && port->state == BR_STATE_DISABLED)) + spin_lock(&brmctx->br->multicast_lock); + if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto out; - mp = br_mdb_ip_get(br, group); + mp = br_mdb_ip_get(brmctx->br, group); if (!mp) goto out; - if (port && (port->flags & BR_MULTICAST_FAST_LEAVE)) { + if (pmctx && (pmctx->port->flags & BR_MULTICAST_FAST_LEAVE)) { struct net_bridge_port_group __rcu **pp; for (pp = &mp->ports; - (p = mlock_dereference(*pp, br)) != NULL; + (p = mlock_dereference(*pp, brmctx->br)) != NULL; pp = &p->next) { - if (!br_port_group_equal(p, port, src)) + if (!br_port_group_equal(p, pmctx->port, src)) continue; if (p->flags & MDB_PG_FLAGS_PERMANENT) @@ -3111,19 +3451,19 @@ br_multicast_leave_group(struct net_bridge *br, if (timer_pending(&other_query->timer)) goto out; - if (br_opt_get(br, BROPT_MULTICAST_QUERIER)) { - __br_multicast_send_query(br, port, NULL, NULL, &mp->addr, + if (brmctx->multicast_querier) { + __br_multicast_send_query(brmctx, pmctx, NULL, NULL, &mp->addr, false, 0, NULL); - time = jiffies + br->multicast_last_member_count * - br->multicast_last_member_interval; + time = jiffies + brmctx->multicast_last_member_count * + brmctx->multicast_last_member_interval; mod_timer(&own_query->timer, time); - for (p = mlock_dereference(mp->ports, br); - p != NULL; - p = mlock_dereference(p->next, br)) { - if (!br_port_group_equal(p, port, src)) + for (p = mlock_dereference(mp->ports, brmctx->br); + p != NULL && pmctx != NULL; + p = mlock_dereference(p->next, brmctx->br)) { + if (!br_port_group_equal(p, pmctx->port, src)) continue; if (!hlist_unhashed(&p->mglist) && @@ -3138,10 +3478,10 @@ br_multicast_leave_group(struct net_bridge *br, } now = jiffies; - time = now + br->multicast_last_member_count * - br->multicast_last_member_interval; + time = now + brmctx->multicast_last_member_count * + brmctx->multicast_last_member_interval; - if (!port) { + if (!pmctx) { if (mp->host_joined && (timer_pending(&mp->timer) ? time_after(mp->timer.expires, time) : @@ -3152,10 +3492,10 @@ br_multicast_leave_group(struct net_bridge *br, goto out; } - for (p = mlock_dereference(mp->ports, br); + for (p = mlock_dereference(mp->ports, brmctx->br); p != NULL; - p = mlock_dereference(p->next, br)) { - if (p->key.port != port) + p = mlock_dereference(p->next, brmctx->br)) { + if (p->key.port != pmctx->port) continue; if (!hlist_unhashed(&p->mglist) && @@ -3168,11 +3508,11 @@ br_multicast_leave_group(struct net_bridge *br, break; } out: - spin_unlock(&br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); } -static void br_ip4_multicast_leave_group(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip4_multicast_leave_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, __be32 group, __u16 vid, const unsigned char *src) @@ -3183,20 +3523,21 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br, if (ipv4_is_local_multicast(group)) return; - own_query = port ? &port->ip4_own_query : &br->ip4_own_query; + own_query = pmctx ? &pmctx->ip4_own_query : &brmctx->ip4_own_query; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip4 = group; br_group.proto = htons(ETH_P_IP); br_group.vid = vid; - br_multicast_leave_group(br, port, &br_group, &br->ip4_other_query, + br_multicast_leave_group(brmctx, pmctx, &br_group, + &brmctx->ip4_other_query, own_query, src); } #if IS_ENABLED(CONFIG_IPV6) -static void br_ip6_multicast_leave_group(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip6_multicast_leave_group(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, const struct in6_addr *group, __u16 vid, const unsigned char *src) @@ -3207,14 +3548,15 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br, if (ipv6_addr_is_ll_all_nodes(group)) return; - own_query = port ? &port->ip6_own_query : &br->ip6_own_query; + own_query = pmctx ? &pmctx->ip6_own_query : &brmctx->ip6_own_query; memset(&br_group, 0, sizeof(br_group)); br_group.dst.ip6 = *group; br_group.proto = htons(ETH_P_IPV6); br_group.vid = vid; - br_multicast_leave_group(br, port, &br_group, &br->ip6_other_query, + br_multicast_leave_group(brmctx, pmctx, &br_group, + &brmctx->ip6_other_query, own_query, src); } #endif @@ -3252,8 +3594,8 @@ static void br_multicast_err_count(const struct net_bridge *br, u64_stats_update_end(&pstats->syncp); } -static void br_multicast_pim(struct net_bridge *br, - struct net_bridge_port *port, +static void br_multicast_pim(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, const struct sk_buff *skb) { unsigned int offset = skb_transport_offset(skb); @@ -3264,31 +3606,32 @@ static void br_multicast_pim(struct net_bridge *br, pim_hdr_type(pimhdr) != PIM_TYPE_HELLO) return; - spin_lock(&br->multicast_lock); - br_ip4_multicast_mark_router(br, port); - spin_unlock(&br->multicast_lock); + spin_lock(&brmctx->br->multicast_lock); + br_ip4_multicast_mark_router(brmctx, pmctx); + spin_unlock(&brmctx->br->multicast_lock); } -static int br_ip4_multicast_mrd_rcv(struct net_bridge *br, - struct net_bridge_port *port, +static int br_ip4_multicast_mrd_rcv(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb) { if (ip_hdr(skb)->protocol != IPPROTO_IGMP || igmp_hdr(skb)->type != IGMP_MRDISC_ADV) return -ENOMSG; - spin_lock(&br->multicast_lock); - br_ip4_multicast_mark_router(br, port); - spin_unlock(&br->multicast_lock); + spin_lock(&brmctx->br->multicast_lock); + br_ip4_multicast_mark_router(brmctx, pmctx); + spin_unlock(&brmctx->br->multicast_lock); return 0; } -static int br_multicast_ipv4_rcv(struct net_bridge *br, - struct net_bridge_port *port, +static int br_multicast_ipv4_rcv(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { + struct net_bridge_port *p = pmctx ? pmctx->port : NULL; const unsigned char *src; struct igmphdr *ih; int err; @@ -3300,14 +3643,14 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, BR_INPUT_SKB_CB(skb)->mrouters_only = 1; } else if (pim_ipv4_all_pim_routers(ip_hdr(skb)->daddr)) { if (ip_hdr(skb)->protocol == IPPROTO_PIM) - br_multicast_pim(br, port, skb); + br_multicast_pim(brmctx, pmctx, skb); } else if (ipv4_is_all_snoopers(ip_hdr(skb)->daddr)) { - br_ip4_multicast_mrd_rcv(br, port, skb); + br_ip4_multicast_mrd_rcv(brmctx, pmctx, skb); } return 0; } else if (err < 0) { - br_multicast_err_count(br, port, skb->protocol); + br_multicast_err_count(brmctx->br, p, skb->protocol); return err; } @@ -3319,44 +3662,45 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - err = br_ip4_multicast_add_group(br, port, ih->group, vid, src, - true); + err = br_ip4_multicast_add_group(brmctx, pmctx, ih->group, vid, + src, true); break; case IGMPV3_HOST_MEMBERSHIP_REPORT: - err = br_ip4_multicast_igmp3_report(br, port, skb, vid); + err = br_ip4_multicast_igmp3_report(brmctx, pmctx, skb, vid); break; case IGMP_HOST_MEMBERSHIP_QUERY: - br_ip4_multicast_query(br, port, skb, vid); + br_ip4_multicast_query(brmctx, pmctx, skb, vid); break; case IGMP_HOST_LEAVE_MESSAGE: - br_ip4_multicast_leave_group(br, port, ih->group, vid, src); + br_ip4_multicast_leave_group(brmctx, pmctx, ih->group, vid, src); break; } - br_multicast_count(br, port, skb, BR_INPUT_SKB_CB(skb)->igmp, + br_multicast_count(brmctx->br, p, skb, BR_INPUT_SKB_CB(skb)->igmp, BR_MCAST_DIR_RX); return err; } #if IS_ENABLED(CONFIG_IPV6) -static void br_ip6_multicast_mrd_rcv(struct net_bridge *br, - struct net_bridge_port *port, +static void br_ip6_multicast_mrd_rcv(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb) { if (icmp6_hdr(skb)->icmp6_type != ICMPV6_MRDISC_ADV) return; - spin_lock(&br->multicast_lock); - br_ip6_multicast_mark_router(br, port); - spin_unlock(&br->multicast_lock); + spin_lock(&brmctx->br->multicast_lock); + br_ip6_multicast_mark_router(brmctx, pmctx); + spin_unlock(&brmctx->br->multicast_lock); } -static int br_multicast_ipv6_rcv(struct net_bridge *br, - struct net_bridge_port *port, +static int br_multicast_ipv6_rcv(struct net_bridge_mcast *brmctx, + struct net_bridge_mcast_port *pmctx, struct sk_buff *skb, u16 vid) { + struct net_bridge_port *p = pmctx ? pmctx->port : NULL; const unsigned char *src; struct mld_msg *mld; int err; @@ -3368,11 +3712,11 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, BR_INPUT_SKB_CB(skb)->mrouters_only = 1; if (err == -ENODATA && ipv6_addr_is_all_snoopers(&ipv6_hdr(skb)->daddr)) - br_ip6_multicast_mrd_rcv(br, port, skb); + br_ip6_multicast_mrd_rcv(brmctx, pmctx, skb); return 0; } else if (err < 0) { - br_multicast_err_count(br, port, skb->protocol); + br_multicast_err_count(brmctx->br, p, skb->protocol); return err; } @@ -3383,29 +3727,32 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, case ICMPV6_MGM_REPORT: src = eth_hdr(skb)->h_source; BR_INPUT_SKB_CB(skb)->mrouters_only = 1; - err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid, - src, true); + err = br_ip6_multicast_add_group(brmctx, pmctx, &mld->mld_mca, + vid, src, true); break; case ICMPV6_MLD2_REPORT: - err = br_ip6_multicast_mld2_report(br, port, skb, vid); + err = br_ip6_multicast_mld2_report(brmctx, pmctx, skb, vid); break; case ICMPV6_MGM_QUERY: - err = br_ip6_multicast_query(br, port, skb, vid); + err = br_ip6_multicast_query(brmctx, pmctx, skb, vid); break; case ICMPV6_MGM_REDUCTION: src = eth_hdr(skb)->h_source; - br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid, src); + br_ip6_multicast_leave_group(brmctx, pmctx, &mld->mld_mca, vid, + src); break; } - br_multicast_count(br, port, skb, BR_INPUT_SKB_CB(skb)->igmp, + br_multicast_count(brmctx->br, p, skb, BR_INPUT_SKB_CB(skb)->igmp, BR_MCAST_DIR_RX); return err; } #endif -int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, +int br_multicast_rcv(struct net_bridge_mcast **brmctx, + struct net_bridge_mcast_port **pmctx, + struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid) { int ret = 0; @@ -3413,16 +3760,36 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, BR_INPUT_SKB_CB(skb)->igmp = 0; BR_INPUT_SKB_CB(skb)->mrouters_only = 0; - if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) + if (!br_opt_get((*brmctx)->br, BROPT_MULTICAST_ENABLED)) return 0; + if (br_opt_get((*brmctx)->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && vlan) { + const struct net_bridge_vlan *masterv; + + /* the vlan has the master flag set only when transmitting + * through the bridge device + */ + if (br_vlan_is_master(vlan)) { + masterv = vlan; + *brmctx = &vlan->br_mcast_ctx; + *pmctx = NULL; + } else { + masterv = vlan->brvlan; + *brmctx = &vlan->brvlan->br_mcast_ctx; + *pmctx = &vlan->port_mcast_ctx; + } + + if (!(masterv->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) + return 0; + } + switch (skb->protocol) { case htons(ETH_P_IP): - ret = br_multicast_ipv4_rcv(br, port, skb, vid); + ret = br_multicast_ipv4_rcv(*brmctx, *pmctx, skb, vid); break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - ret = br_multicast_ipv6_rcv(br, port, skb, vid); + ret = br_multicast_ipv6_rcv(*brmctx, *pmctx, skb, vid); break; #endif } @@ -3430,32 +3797,39 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, return ret; } -static void br_multicast_query_expired(struct net_bridge *br, +static void br_multicast_query_expired(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query, struct bridge_mcast_querier *querier) { - spin_lock(&br->multicast_lock); - if (query->startup_sent < br->multicast_startup_query_count) + spin_lock(&brmctx->br->multicast_lock); + if (br_multicast_ctx_vlan_disabled(brmctx)) + goto out; + + if (query->startup_sent < brmctx->multicast_startup_query_count) query->startup_sent++; - RCU_INIT_POINTER(querier->port, NULL); - br_multicast_send_query(br, NULL, query); - spin_unlock(&br->multicast_lock); + br_multicast_send_query(brmctx, NULL, query); +out: + spin_unlock(&brmctx->br->multicast_lock); } static void br_ip4_multicast_query_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, ip4_own_query.timer); + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip4_own_query.timer); - br_multicast_query_expired(br, &br->ip4_own_query, &br->ip4_querier); + br_multicast_query_expired(brmctx, &brmctx->ip4_own_query, + &brmctx->ip4_querier); } #if IS_ENABLED(CONFIG_IPV6) static void br_ip6_multicast_query_expired(struct timer_list *t) { - struct net_bridge *br = from_timer(br, t, ip6_own_query.timer); + struct net_bridge_mcast *brmctx = from_timer(brmctx, t, + ip6_own_query.timer); - br_multicast_query_expired(br, &br->ip6_own_query, &br->ip6_querier); + br_multicast_query_expired(brmctx, &brmctx->ip6_own_query, + &brmctx->ip6_querier); } #endif @@ -3472,47 +3846,65 @@ static void br_multicast_gc_work(struct work_struct *work) br_multicast_gc(&deleted_head); } -void br_multicast_init(struct net_bridge *br) +void br_multicast_ctx_init(struct net_bridge *br, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast *brmctx) { - br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX; - - br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; - br->multicast_last_member_count = 2; - br->multicast_startup_query_count = 2; + brmctx->br = br; + brmctx->vlan = vlan; + brmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; + brmctx->multicast_last_member_count = 2; + brmctx->multicast_startup_query_count = 2; - br->multicast_last_member_interval = HZ; - br->multicast_query_response_interval = 10 * HZ; - br->multicast_startup_query_interval = 125 * HZ / 4; - br->multicast_query_interval = 125 * HZ; - br->multicast_querier_interval = 255 * HZ; - br->multicast_membership_interval = 260 * HZ; + brmctx->multicast_last_member_interval = HZ; + brmctx->multicast_query_response_interval = 10 * HZ; + brmctx->multicast_startup_query_interval = 125 * HZ / 4; + brmctx->multicast_query_interval = 125 * HZ; + brmctx->multicast_querier_interval = 255 * HZ; + brmctx->multicast_membership_interval = 260 * HZ; - br->ip4_other_query.delay_time = 0; - br->ip4_querier.port = NULL; - br->multicast_igmp_version = 2; + brmctx->ip4_other_query.delay_time = 0; + brmctx->ip4_querier.port_ifidx = 0; + seqcount_spinlock_init(&brmctx->ip4_querier.seq, &br->multicast_lock); + brmctx->multicast_igmp_version = 2; #if IS_ENABLED(CONFIG_IPV6) - br->multicast_mld_version = 1; - br->ip6_other_query.delay_time = 0; - br->ip6_querier.port = NULL; + brmctx->multicast_mld_version = 1; + brmctx->ip6_other_query.delay_time = 0; + brmctx->ip6_querier.port_ifidx = 0; + seqcount_spinlock_init(&brmctx->ip6_querier.seq, &br->multicast_lock); #endif - br_opt_toggle(br, BROPT_MULTICAST_ENABLED, true); - br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true); - spin_lock_init(&br->multicast_lock); - timer_setup(&br->ip4_mc_router_timer, + timer_setup(&brmctx->ip4_mc_router_timer, br_ip4_multicast_local_router_expired, 0); - timer_setup(&br->ip4_other_query.timer, + timer_setup(&brmctx->ip4_other_query.timer, br_ip4_multicast_querier_expired, 0); - timer_setup(&br->ip4_own_query.timer, + timer_setup(&brmctx->ip4_own_query.timer, br_ip4_multicast_query_expired, 0); #if IS_ENABLED(CONFIG_IPV6) - timer_setup(&br->ip6_mc_router_timer, + timer_setup(&brmctx->ip6_mc_router_timer, br_ip6_multicast_local_router_expired, 0); - timer_setup(&br->ip6_other_query.timer, + timer_setup(&brmctx->ip6_other_query.timer, br_ip6_multicast_querier_expired, 0); - timer_setup(&br->ip6_own_query.timer, + timer_setup(&brmctx->ip6_own_query.timer, br_ip6_multicast_query_expired, 0); #endif +} + +void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx) +{ + __br_multicast_stop(brmctx); +} + +void br_multicast_init(struct net_bridge *br) +{ + br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX; + + br_multicast_ctx_init(br, NULL, &br->multicast_ctx); + + br_opt_toggle(br, BROPT_MULTICAST_ENABLED, true); + br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true); + + spin_lock_init(&br->multicast_lock); INIT_HLIST_HEAD(&br->mdb_list); INIT_HLIST_HEAD(&br->mcast_gc_list); INIT_WORK(&br->mcast_gc_work, br_multicast_gc_work); @@ -3580,8 +3972,8 @@ void br_multicast_leave_snoopers(struct net_bridge *br) br_ip6_multicast_leave_snoopers(br); } -static void __br_multicast_open(struct net_bridge *br, - struct bridge_mcast_own_query *query) +static void __br_multicast_open_query(struct net_bridge *br, + struct bridge_mcast_own_query *query) { query->startup_sent = 0; @@ -3591,26 +3983,194 @@ static void __br_multicast_open(struct net_bridge *br, mod_timer(&query->timer, jiffies); } -void br_multicast_open(struct net_bridge *br) +static void __br_multicast_open(struct net_bridge_mcast *brmctx) { - __br_multicast_open(br, &br->ip4_own_query); + __br_multicast_open_query(brmctx->br, &brmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) - __br_multicast_open(br, &br->ip6_own_query); + __br_multicast_open_query(brmctx->br, &brmctx->ip6_own_query); #endif } -void br_multicast_stop(struct net_bridge *br) +void br_multicast_open(struct net_bridge *br) { - del_timer_sync(&br->ip4_mc_router_timer); - del_timer_sync(&br->ip4_other_query.timer); - del_timer_sync(&br->ip4_own_query.timer); + ASSERT_RTNL(); + + if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + + vg = br_vlan_group(br); + if (vg) { + list_for_each_entry(vlan, &vg->vlan_list, vlist) { + struct net_bridge_mcast *brmctx; + + brmctx = &vlan->br_mcast_ctx; + if (br_vlan_is_brentry(vlan) && + !br_multicast_ctx_vlan_disabled(brmctx)) + __br_multicast_open(&vlan->br_mcast_ctx); + } + } + } else { + __br_multicast_open(&br->multicast_ctx); + } +} + +static void __br_multicast_stop(struct net_bridge_mcast *brmctx) +{ + del_timer_sync(&brmctx->ip4_mc_router_timer); + del_timer_sync(&brmctx->ip4_other_query.timer); + del_timer_sync(&brmctx->ip4_own_query.timer); #if IS_ENABLED(CONFIG_IPV6) - del_timer_sync(&br->ip6_mc_router_timer); - del_timer_sync(&br->ip6_other_query.timer); - del_timer_sync(&br->ip6_own_query.timer); + del_timer_sync(&brmctx->ip6_mc_router_timer); + del_timer_sync(&brmctx->ip6_other_query.timer); + del_timer_sync(&brmctx->ip6_own_query.timer); #endif } +void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on) +{ + struct net_bridge *br; + + /* it's okay to check for the flag without the multicast lock because it + * can only change under RTNL -> multicast_lock, we need the latter to + * sync with timers and packets + */ + if (on == !!(vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) + return; + + if (br_vlan_is_master(vlan)) { + br = vlan->br; + + if (!br_vlan_is_brentry(vlan) || + (on && + br_multicast_ctx_vlan_global_disabled(&vlan->br_mcast_ctx))) + return; + + spin_lock_bh(&br->multicast_lock); + vlan->priv_flags ^= BR_VLFLAG_MCAST_ENABLED; + spin_unlock_bh(&br->multicast_lock); + + if (on) + __br_multicast_open(&vlan->br_mcast_ctx); + else + __br_multicast_stop(&vlan->br_mcast_ctx); + } else { + struct net_bridge_mcast *brmctx; + + brmctx = br_multicast_port_ctx_get_global(&vlan->port_mcast_ctx); + if (on && br_multicast_ctx_vlan_global_disabled(brmctx)) + return; + + br = vlan->port->br; + spin_lock_bh(&br->multicast_lock); + vlan->priv_flags ^= BR_VLFLAG_MCAST_ENABLED; + if (on) + __br_multicast_enable_port_ctx(&vlan->port_mcast_ctx); + else + __br_multicast_disable_port_ctx(&vlan->port_mcast_ctx); + spin_unlock_bh(&br->multicast_lock); + } +} + +static void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan, bool on) +{ + struct net_bridge_port *p; + + if (WARN_ON_ONCE(!br_vlan_is_master(vlan))) + return; + + list_for_each_entry(p, &vlan->br->port_list, list) { + struct net_bridge_vlan *vport; + + vport = br_vlan_find(nbp_vlan_group(p), vlan->vid); + if (!vport) + continue; + br_multicast_toggle_one_vlan(vport, on); + } + + if (br_vlan_is_brentry(vlan)) + br_multicast_toggle_one_vlan(vlan, on); +} + +int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + struct net_bridge_port *p; + + if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) == on) + return 0; + + if (on && !br_opt_get(br, BROPT_VLAN_ENABLED)) { + NL_SET_ERR_MSG_MOD(extack, "Cannot enable multicast vlan snooping with vlan filtering disabled"); + return -EINVAL; + } + + vg = br_vlan_group(br); + if (!vg) + return 0; + + br_opt_toggle(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED, on); + + /* disable/enable non-vlan mcast contexts based on vlan snooping */ + if (on) + __br_multicast_stop(&br->multicast_ctx); + else + __br_multicast_open(&br->multicast_ctx); + list_for_each_entry(p, &br->port_list, list) { + if (on) + br_multicast_disable_port(p); + else + br_multicast_enable_port(p); + } + + list_for_each_entry(vlan, &vg->vlan_list, vlist) + br_multicast_toggle_vlan(vlan, on); + + return 0; +} + +bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on) +{ + ASSERT_RTNL(); + + /* BR_VLFLAG_GLOBAL_MCAST_ENABLED relies on eventual consistency and + * requires only RTNL to change + */ + if (on == !!(vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) + return false; + + vlan->priv_flags ^= BR_VLFLAG_GLOBAL_MCAST_ENABLED; + br_multicast_toggle_vlan(vlan, on); + + return true; +} + +void br_multicast_stop(struct net_bridge *br) +{ + ASSERT_RTNL(); + + if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *vlan; + + vg = br_vlan_group(br); + if (vg) { + list_for_each_entry(vlan, &vg->vlan_list, vlist) { + struct net_bridge_mcast *brmctx; + + brmctx = &vlan->br_mcast_ctx; + if (br_vlan_is_brentry(vlan) && + !br_multicast_ctx_vlan_disabled(brmctx)) + __br_multicast_stop(&vlan->br_mcast_ctx); + } + } + } else { + __br_multicast_stop(&br->multicast_ctx); + } +} + void br_multicast_dev_del(struct net_bridge *br) { struct net_bridge_mdb_entry *mp; @@ -3623,44 +4183,45 @@ void br_multicast_dev_del(struct net_bridge *br) hlist_move_list(&br->mcast_gc_list, &deleted_head); spin_unlock_bh(&br->multicast_lock); + br_multicast_ctx_deinit(&br->multicast_ctx); br_multicast_gc(&deleted_head); cancel_work_sync(&br->mcast_gc_work); rcu_barrier(); } -int br_multicast_set_router(struct net_bridge *br, unsigned long val) +int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val) { int err = -EINVAL; - spin_lock_bh(&br->multicast_lock); + spin_lock_bh(&brmctx->br->multicast_lock); switch (val) { case MDB_RTR_TYPE_DISABLED: case MDB_RTR_TYPE_PERM: - br_mc_router_state_change(br, val == MDB_RTR_TYPE_PERM); - del_timer(&br->ip4_mc_router_timer); + br_mc_router_state_change(brmctx->br, val == MDB_RTR_TYPE_PERM); + del_timer(&brmctx->ip4_mc_router_timer); #if IS_ENABLED(CONFIG_IPV6) - del_timer(&br->ip6_mc_router_timer); + del_timer(&brmctx->ip6_mc_router_timer); #endif - br->multicast_router = val; + brmctx->multicast_router = val; err = 0; break; case MDB_RTR_TYPE_TEMP_QUERY: - if (br->multicast_router != MDB_RTR_TYPE_TEMP_QUERY) - br_mc_router_state_change(br, false); - br->multicast_router = val; + if (brmctx->multicast_router != MDB_RTR_TYPE_TEMP_QUERY) + br_mc_router_state_change(brmctx->br, false); + brmctx->multicast_router = val; err = 0; break; } - spin_unlock_bh(&br->multicast_lock); + spin_unlock_bh(&brmctx->br->multicast_lock); return err; } static void -br_multicast_rport_del_notify(struct net_bridge_port *p, bool deleted) +br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted) { if (!deleted) return; @@ -3668,37 +4229,39 @@ br_multicast_rport_del_notify(struct net_bridge_port *p, bool deleted) /* For backwards compatibility for now, only notify if there is * no multicast router anymore for both IPv4 and IPv6. */ - if (!hlist_unhashed(&p->ip4_rlist)) + if (!hlist_unhashed(&pmctx->ip4_rlist)) return; #if IS_ENABLED(CONFIG_IPV6) - if (!hlist_unhashed(&p->ip6_rlist)) + if (!hlist_unhashed(&pmctx->ip6_rlist)) return; #endif - br_rtr_notify(p->br->dev, p, RTM_DELMDB); - br_port_mc_router_state_change(p, false); + br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_DELMDB); + br_port_mc_router_state_change(pmctx->port, false); /* don't allow timer refresh */ - if (p->multicast_router == MDB_RTR_TYPE_TEMP) - p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; + if (pmctx->multicast_router == MDB_RTR_TYPE_TEMP) + pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; } -int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) +int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx, + unsigned long val) { - struct net_bridge *br = p->br; + struct net_bridge_mcast *brmctx; unsigned long now = jiffies; int err = -EINVAL; bool del = false; - spin_lock(&br->multicast_lock); - if (p->multicast_router == val) { + brmctx = br_multicast_port_ctx_get_global(pmctx); + spin_lock_bh(&brmctx->br->multicast_lock); + if (pmctx->multicast_router == val) { /* Refresh the temp router port timer */ - if (p->multicast_router == MDB_RTR_TYPE_TEMP) { - mod_timer(&p->ip4_mc_router_timer, - now + br->multicast_querier_interval); + if (pmctx->multicast_router == MDB_RTR_TYPE_TEMP) { + mod_timer(&pmctx->ip4_mc_router_timer, + now + brmctx->multicast_querier_interval); #if IS_ENABLED(CONFIG_IPV6) - mod_timer(&p->ip6_mc_router_timer, - now + br->multicast_querier_interval); + mod_timer(&pmctx->ip6_mc_router_timer, + now + brmctx->multicast_querier_interval); #endif } err = 0; @@ -3706,63 +4269,103 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) } switch (val) { case MDB_RTR_TYPE_DISABLED: - p->multicast_router = MDB_RTR_TYPE_DISABLED; - del |= br_ip4_multicast_rport_del(p); - del_timer(&p->ip4_mc_router_timer); - del |= br_ip6_multicast_rport_del(p); + pmctx->multicast_router = MDB_RTR_TYPE_DISABLED; + del |= br_ip4_multicast_rport_del(pmctx); + del_timer(&pmctx->ip4_mc_router_timer); + del |= br_ip6_multicast_rport_del(pmctx); #if IS_ENABLED(CONFIG_IPV6) - del_timer(&p->ip6_mc_router_timer); + del_timer(&pmctx->ip6_mc_router_timer); #endif - br_multicast_rport_del_notify(p, del); + br_multicast_rport_del_notify(pmctx, del); break; case MDB_RTR_TYPE_TEMP_QUERY: - p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; - del |= br_ip4_multicast_rport_del(p); - del |= br_ip6_multicast_rport_del(p); - br_multicast_rport_del_notify(p, del); + pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY; + del |= br_ip4_multicast_rport_del(pmctx); + del |= br_ip6_multicast_rport_del(pmctx); + br_multicast_rport_del_notify(pmctx, del); break; case MDB_RTR_TYPE_PERM: - p->multicast_router = MDB_RTR_TYPE_PERM; - del_timer(&p->ip4_mc_router_timer); - br_ip4_multicast_add_router(br, p); + pmctx->multicast_router = MDB_RTR_TYPE_PERM; + del_timer(&pmctx->ip4_mc_router_timer); + br_ip4_multicast_add_router(brmctx, pmctx); #if IS_ENABLED(CONFIG_IPV6) - del_timer(&p->ip6_mc_router_timer); + del_timer(&pmctx->ip6_mc_router_timer); #endif - br_ip6_multicast_add_router(br, p); + br_ip6_multicast_add_router(brmctx, pmctx); break; case MDB_RTR_TYPE_TEMP: - p->multicast_router = MDB_RTR_TYPE_TEMP; - br_ip4_multicast_mark_router(br, p); - br_ip6_multicast_mark_router(br, p); + pmctx->multicast_router = MDB_RTR_TYPE_TEMP; + br_ip4_multicast_mark_router(brmctx, pmctx); + br_ip6_multicast_mark_router(brmctx, pmctx); break; default: goto unlock; } err = 0; unlock: - spin_unlock(&br->multicast_lock); + spin_unlock_bh(&brmctx->br->multicast_lock); return err; } -static void br_multicast_start_querier(struct net_bridge *br, +int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router) +{ + int err; + + if (br_vlan_is_master(v)) + err = br_multicast_set_router(&v->br_mcast_ctx, mcast_router); + else + err = br_multicast_set_port_router(&v->port_mcast_ctx, + mcast_router); + + return err; +} + +static void br_multicast_start_querier(struct net_bridge_mcast *brmctx, struct bridge_mcast_own_query *query) { struct net_bridge_port *port; - __br_multicast_open(br, query); + if (!br_multicast_ctx_matches_vlan_snooping(brmctx)) + return; + + __br_multicast_open_query(brmctx->br, query); rcu_read_lock(); - list_for_each_entry_rcu(port, &br->port_list, list) { - if (port->state == BR_STATE_DISABLED || - port->state == BR_STATE_BLOCKING) + list_for_each_entry_rcu(port, &brmctx->br->port_list, list) { + struct bridge_mcast_own_query *ip4_own_query; +#if IS_ENABLED(CONFIG_IPV6) + struct bridge_mcast_own_query *ip6_own_query; +#endif + + if (br_multicast_port_ctx_state_stopped(&port->multicast_ctx)) continue; - if (query == &br->ip4_own_query) - br_multicast_enable(&port->ip4_own_query); + if (br_multicast_ctx_is_vlan(brmctx)) { + struct net_bridge_vlan *vlan; + + vlan = br_vlan_find(nbp_vlan_group_rcu(port), + brmctx->vlan->vid); + if (!vlan || + br_multicast_port_ctx_state_stopped(&vlan->port_mcast_ctx)) + continue; + + ip4_own_query = &vlan->port_mcast_ctx.ip4_own_query; +#if IS_ENABLED(CONFIG_IPV6) + ip6_own_query = &vlan->port_mcast_ctx.ip6_own_query; +#endif + } else { + ip4_own_query = &port->multicast_ctx.ip4_own_query; +#if IS_ENABLED(CONFIG_IPV6) + ip6_own_query = &port->multicast_ctx.ip6_own_query; +#endif + } + + if (query == &brmctx->ip4_own_query) + br_multicast_enable(ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) else - br_multicast_enable(&port->ip6_own_query); + br_multicast_enable(ip6_own_query); #endif } rcu_read_unlock(); @@ -3796,7 +4399,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val, br_multicast_open(br); list_for_each_entry(port, &br->port_list, list) - __br_multicast_enable_port(port); + __br_multicast_enable_port_ctx(&port->multicast_ctx); change_snoopers = true; @@ -3839,47 +4442,48 @@ bool br_multicast_router(const struct net_device *dev) bool is_router; spin_lock_bh(&br->multicast_lock); - is_router = br_multicast_is_router(br, NULL); + is_router = br_multicast_is_router(&br->multicast_ctx, NULL); spin_unlock_bh(&br->multicast_lock); return is_router; } EXPORT_SYMBOL_GPL(br_multicast_router); -int br_multicast_set_querier(struct net_bridge *br, unsigned long val) +int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val) { unsigned long max_delay; val = !!val; - spin_lock_bh(&br->multicast_lock); - if (br_opt_get(br, BROPT_MULTICAST_QUERIER) == val) + spin_lock_bh(&brmctx->br->multicast_lock); + if (brmctx->multicast_querier == val) goto unlock; - br_opt_toggle(br, BROPT_MULTICAST_QUERIER, !!val); + WRITE_ONCE(brmctx->multicast_querier, val); if (!val) goto unlock; - max_delay = br->multicast_query_response_interval; + max_delay = brmctx->multicast_query_response_interval; - if (!timer_pending(&br->ip4_other_query.timer)) - br->ip4_other_query.delay_time = jiffies + max_delay; + if (!timer_pending(&brmctx->ip4_other_query.timer)) + brmctx->ip4_other_query.delay_time = jiffies + max_delay; - br_multicast_start_querier(br, &br->ip4_own_query); + br_multicast_start_querier(brmctx, &brmctx->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) - if (!timer_pending(&br->ip6_other_query.timer)) - br->ip6_other_query.delay_time = jiffies + max_delay; + if (!timer_pending(&brmctx->ip6_other_query.timer)) + brmctx->ip6_other_query.delay_time = jiffies + max_delay; - br_multicast_start_querier(br, &br->ip6_own_query); + br_multicast_start_querier(brmctx, &brmctx->ip6_own_query); #endif unlock: - spin_unlock_bh(&br->multicast_lock); + spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } -int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val) +int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx, + unsigned long val) { /* Currently we support only version 2 and 3 */ switch (val) { @@ -3890,15 +4494,16 @@ int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val) return -EINVAL; } - spin_lock_bh(&br->multicast_lock); - br->multicast_igmp_version = val; - spin_unlock_bh(&br->multicast_lock); + spin_lock_bh(&brmctx->br->multicast_lock); + brmctx->multicast_igmp_version = val; + spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } #if IS_ENABLED(CONFIG_IPV6) -int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val) +int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx, + unsigned long val) { /* Currently we support version 1 and 2 */ switch (val) { @@ -3909,9 +4514,9 @@ int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val) return -EINVAL; } - spin_lock_bh(&br->multicast_lock); - br->multicast_mld_version = val; - spin_unlock_bh(&br->multicast_lock); + spin_lock_bh(&brmctx->br->multicast_lock); + brmctx->multicast_mld_version = val; + spin_unlock_bh(&brmctx->br->multicast_lock); return 0; } @@ -4003,7 +4608,7 @@ bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto) memset(ð, 0, sizeof(eth)); eth.h_proto = htons(proto); - ret = br_multicast_querier_exists(br, ð, NULL); + ret = br_multicast_querier_exists(&br->multicast_ctx, ð, NULL); unlock: rcu_read_unlock(); @@ -4022,9 +4627,11 @@ EXPORT_SYMBOL_GPL(br_multicast_has_querier_anywhere); */ bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto) { + struct net_bridge_mcast *brmctx; struct net_bridge *br; struct net_bridge_port *port; bool ret = false; + int port_ifidx; rcu_read_lock(); if (!netif_is_bridge_port(dev)) @@ -4035,17 +4642,20 @@ bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto) goto unlock; br = port->br; + brmctx = &br->multicast_ctx; switch (proto) { case ETH_P_IP: - if (!timer_pending(&br->ip4_other_query.timer) || - rcu_dereference(br->ip4_querier.port) == port) + port_ifidx = brmctx->ip4_querier.port_ifidx; + if (!timer_pending(&brmctx->ip4_other_query.timer) || + port_ifidx == port->dev->ifindex) goto unlock; break; #if IS_ENABLED(CONFIG_IPV6) case ETH_P_IPV6: - if (!timer_pending(&br->ip6_other_query.timer) || - rcu_dereference(br->ip6_querier.port) == port) + port_ifidx = brmctx->ip6_querier.port_ifidx; + if (!timer_pending(&brmctx->ip6_other_query.timer) || + port_ifidx == port->dev->ifindex) goto unlock; break; #endif @@ -4071,7 +4681,9 @@ EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent); */ bool br_multicast_has_router_adjacent(struct net_device *dev, int proto) { - struct net_bridge_port *port, *p; + struct net_bridge_mcast_port *pmctx; + struct net_bridge_mcast *brmctx; + struct net_bridge_port *port; bool ret = false; rcu_read_lock(); @@ -4079,11 +4691,12 @@ bool br_multicast_has_router_adjacent(struct net_device *dev, int proto) if (!port) goto unlock; + brmctx = &port->br->multicast_ctx; switch (proto) { case ETH_P_IP: - hlist_for_each_entry_rcu(p, &port->br->ip4_mc_router_list, + hlist_for_each_entry_rcu(pmctx, &brmctx->ip4_mc_router_list, ip4_rlist) { - if (p == port) + if (pmctx->port == port) continue; ret = true; @@ -4092,9 +4705,9 @@ bool br_multicast_has_router_adjacent(struct net_device *dev, int proto) break; #if IS_ENABLED(CONFIG_IPV6) case ETH_P_IPV6: - hlist_for_each_entry_rcu(p, &port->br->ip6_mc_router_list, + hlist_for_each_entry_rcu(pmctx, &brmctx->ip6_mc_router_list, ip6_rlist) { - if (p == port) + if (pmctx->port == port) continue; ret = true; @@ -4186,7 +4799,8 @@ static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats, u64_stats_update_end(&pstats->syncp); } -void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, +void br_multicast_count(struct net_bridge *br, + const struct net_bridge_port *p, const struct sk_buff *skb, u8 type, u8 dir) { struct bridge_mcast_stats __percpu *stats; diff --git a/net/bridge/br_multicast_eht.c b/net/bridge/br_multicast_eht.c index 13290a749d09..f91c071d1608 100644 --- a/net/bridge/br_multicast_eht.c +++ b/net/bridge/br_multicast_eht.c @@ -33,7 +33,8 @@ static bool br_multicast_del_eht_set_entry(struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr, union net_bridge_eht_addr *h_addr); -static void br_multicast_create_eht_set_entry(struct net_bridge_port_group *pg, +static void br_multicast_create_eht_set_entry(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr, union net_bridge_eht_addr *h_addr, int filter_mode, @@ -388,7 +389,8 @@ static void br_multicast_ip_src_to_eht_addr(const struct br_ip *src, } } -static void br_eht_convert_host_filter_mode(struct net_bridge_port_group *pg, +static void br_eht_convert_host_filter_mode(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, int filter_mode) { @@ -405,14 +407,15 @@ static void br_eht_convert_host_filter_mode(struct net_bridge_port_group *pg, br_multicast_del_eht_set_entry(pg, &zero_addr, h_addr); break; case MCAST_EXCLUDE: - br_multicast_create_eht_set_entry(pg, &zero_addr, h_addr, - MCAST_EXCLUDE, + br_multicast_create_eht_set_entry(brmctx, pg, &zero_addr, + h_addr, MCAST_EXCLUDE, true); break; } } -static void br_multicast_create_eht_set_entry(struct net_bridge_port_group *pg, +static void br_multicast_create_eht_set_entry(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *src_addr, union net_bridge_eht_addr *h_addr, int filter_mode, @@ -441,8 +444,8 @@ static void br_multicast_create_eht_set_entry(struct net_bridge_port_group *pg, if (!set_h) goto fail_set_entry; - mod_timer(&set_h->timer, jiffies + br_multicast_gmi(br)); - mod_timer(&eht_set->timer, jiffies + br_multicast_gmi(br)); + mod_timer(&set_h->timer, jiffies + br_multicast_gmi(brmctx)); + mod_timer(&eht_set->timer, jiffies + br_multicast_gmi(brmctx)); return; @@ -499,7 +502,8 @@ static void br_multicast_del_eht_host(struct net_bridge_port_group *pg, } /* create new set entries from reports */ -static void __eht_create_set_entries(struct net_bridge_port_group *pg, +static void __eht_create_set_entries(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -512,8 +516,8 @@ static void __eht_create_set_entries(struct net_bridge_port_group *pg, memset(&eht_src_addr, 0, sizeof(eht_src_addr)); for (src_idx = 0; src_idx < nsrcs; src_idx++) { memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size); - br_multicast_create_eht_set_entry(pg, &eht_src_addr, h_addr, - filter_mode, + br_multicast_create_eht_set_entry(brmctx, pg, &eht_src_addr, + h_addr, filter_mode, false); } } @@ -549,7 +553,8 @@ static bool __eht_del_set_entries(struct net_bridge_port_group *pg, return changed; } -static bool br_multicast_eht_allow(struct net_bridge_port_group *pg, +static bool br_multicast_eht_allow(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -559,8 +564,8 @@ static bool br_multicast_eht_allow(struct net_bridge_port_group *pg, switch (br_multicast_eht_host_filter_mode(pg, h_addr)) { case MCAST_INCLUDE: - __eht_create_set_entries(pg, h_addr, srcs, nsrcs, addr_size, - MCAST_INCLUDE); + __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, + addr_size, MCAST_INCLUDE); break; case MCAST_EXCLUDE: changed = __eht_del_set_entries(pg, h_addr, srcs, nsrcs, @@ -571,7 +576,8 @@ static bool br_multicast_eht_allow(struct net_bridge_port_group *pg, return changed; } -static bool br_multicast_eht_block(struct net_bridge_port_group *pg, +static bool br_multicast_eht_block(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -585,7 +591,7 @@ static bool br_multicast_eht_block(struct net_bridge_port_group *pg, addr_size); break; case MCAST_EXCLUDE: - __eht_create_set_entries(pg, h_addr, srcs, nsrcs, addr_size, + __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, addr_size, MCAST_EXCLUDE); break; } @@ -594,7 +600,8 @@ static bool br_multicast_eht_block(struct net_bridge_port_group *pg, } /* flush_entries is true when changing mode */ -static bool __eht_inc_exc(struct net_bridge_port_group *pg, +static bool __eht_inc_exc(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -612,11 +619,10 @@ static bool __eht_inc_exc(struct net_bridge_port_group *pg, /* if we're changing mode del host and its entries */ if (flush_entries) br_multicast_del_eht_host(pg, h_addr); - __eht_create_set_entries(pg, h_addr, srcs, nsrcs, addr_size, + __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, addr_size, filter_mode); /* we can be missing sets only if we've deleted some entries */ if (flush_entries) { - struct net_bridge *br = pg->key.port->br; struct net_bridge_group_eht_set *eht_set; struct net_bridge_group_src *src_ent; struct hlist_node *tmp; @@ -647,14 +653,15 @@ static bool __eht_inc_exc(struct net_bridge_port_group *pg, &eht_src_addr); if (!eht_set) continue; - mod_timer(&eht_set->timer, jiffies + br_multicast_lmqt(br)); + mod_timer(&eht_set->timer, jiffies + br_multicast_lmqt(brmctx)); } } return changed; } -static bool br_multicast_eht_inc(struct net_bridge_port_group *pg, +static bool br_multicast_eht_inc(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -663,14 +670,15 @@ static bool br_multicast_eht_inc(struct net_bridge_port_group *pg, { bool changed; - changed = __eht_inc_exc(pg, h_addr, srcs, nsrcs, addr_size, + changed = __eht_inc_exc(brmctx, pg, h_addr, srcs, nsrcs, addr_size, MCAST_INCLUDE, to_report); - br_eht_convert_host_filter_mode(pg, h_addr, MCAST_INCLUDE); + br_eht_convert_host_filter_mode(brmctx, pg, h_addr, MCAST_INCLUDE); return changed; } -static bool br_multicast_eht_exc(struct net_bridge_port_group *pg, +static bool br_multicast_eht_exc(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -679,14 +687,15 @@ static bool br_multicast_eht_exc(struct net_bridge_port_group *pg, { bool changed; - changed = __eht_inc_exc(pg, h_addr, srcs, nsrcs, addr_size, + changed = __eht_inc_exc(brmctx, pg, h_addr, srcs, nsrcs, addr_size, MCAST_EXCLUDE, to_report); - br_eht_convert_host_filter_mode(pg, h_addr, MCAST_EXCLUDE); + br_eht_convert_host_filter_mode(brmctx, pg, h_addr, MCAST_EXCLUDE); return changed; } -static bool __eht_ip4_handle(struct net_bridge_port_group *pg, +static bool __eht_ip4_handle(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -696,24 +705,25 @@ static bool __eht_ip4_handle(struct net_bridge_port_group *pg, switch (grec_type) { case IGMPV3_ALLOW_NEW_SOURCES: - br_multicast_eht_allow(pg, h_addr, srcs, nsrcs, sizeof(__be32)); + br_multicast_eht_allow(brmctx, pg, h_addr, srcs, nsrcs, + sizeof(__be32)); break; case IGMPV3_BLOCK_OLD_SOURCES: - changed = br_multicast_eht_block(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_block(brmctx, pg, h_addr, srcs, nsrcs, sizeof(__be32)); break; case IGMPV3_CHANGE_TO_INCLUDE: to_report = true; fallthrough; case IGMPV3_MODE_IS_INCLUDE: - changed = br_multicast_eht_inc(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_inc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(__be32), to_report); break; case IGMPV3_CHANGE_TO_EXCLUDE: to_report = true; fallthrough; case IGMPV3_MODE_IS_EXCLUDE: - changed = br_multicast_eht_exc(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_exc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(__be32), to_report); break; } @@ -722,7 +732,8 @@ static bool __eht_ip4_handle(struct net_bridge_port_group *pg, } #if IS_ENABLED(CONFIG_IPV6) -static bool __eht_ip6_handle(struct net_bridge_port_group *pg, +static bool __eht_ip6_handle(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, union net_bridge_eht_addr *h_addr, void *srcs, u32 nsrcs, @@ -732,18 +743,18 @@ static bool __eht_ip6_handle(struct net_bridge_port_group *pg, switch (grec_type) { case MLD2_ALLOW_NEW_SOURCES: - br_multicast_eht_allow(pg, h_addr, srcs, nsrcs, + br_multicast_eht_allow(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr)); break; case MLD2_BLOCK_OLD_SOURCES: - changed = br_multicast_eht_block(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_block(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr)); break; case MLD2_CHANGE_TO_INCLUDE: to_report = true; fallthrough; case MLD2_MODE_IS_INCLUDE: - changed = br_multicast_eht_inc(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_inc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr), to_report); break; @@ -751,7 +762,7 @@ static bool __eht_ip6_handle(struct net_bridge_port_group *pg, to_report = true; fallthrough; case MLD2_MODE_IS_EXCLUDE: - changed = br_multicast_eht_exc(pg, h_addr, srcs, nsrcs, + changed = br_multicast_eht_exc(brmctx, pg, h_addr, srcs, nsrcs, sizeof(struct in6_addr), to_report); break; @@ -762,7 +773,8 @@ static bool __eht_ip6_handle(struct net_bridge_port_group *pg, #endif /* true means an entry was deleted */ -bool br_multicast_eht_handle(struct net_bridge_port_group *pg, +bool br_multicast_eht_handle(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, @@ -779,12 +791,12 @@ bool br_multicast_eht_handle(struct net_bridge_port_group *pg, memset(&eht_host_addr, 0, sizeof(eht_host_addr)); memcpy(&eht_host_addr, h_addr, addr_size); if (addr_size == sizeof(__be32)) - changed = __eht_ip4_handle(pg, &eht_host_addr, srcs, nsrcs, - grec_type); + changed = __eht_ip4_handle(brmctx, pg, &eht_host_addr, srcs, + nsrcs, grec_type); #if IS_ENABLED(CONFIG_IPV6) else - changed = __eht_ip6_handle(pg, &eht_host_addr, srcs, nsrcs, - grec_type); + changed = __eht_ip6_handle(brmctx, pg, &eht_host_addr, srcs, + nsrcs, grec_type); #endif out: diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 8edfb98ae1d5..b5af68c105a8 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -968,7 +968,7 @@ static int brnf_device_event(struct notifier_block *unused, unsigned long event, struct net *net; int ret; - if (event != NETDEV_REGISTER || !(dev->priv_flags & IFF_EBRIDGE)) + if (event != NETDEV_REGISTER || !netif_is_bridge_master(dev)) return NOTIFY_DONE; ASSERT_RTNL(); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 8642e56059fb..0c8b5f1a15bc 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -106,7 +106,7 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev, p = br_port_get_check_rcu(dev); if (p) vg = nbp_vlan_group_rcu(p); - } else if (dev->priv_flags & IFF_EBRIDGE) { + } else if (netif_is_bridge_master(dev)) { br = netdev_priv(dev); vg = br_vlan_group_rcu(br); } @@ -287,7 +287,7 @@ static int br_port_fill_attrs(struct sk_buff *skb, #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, IFLA_BRPORT_MULTICAST_ROUTER, - p->multicast_router) || + p->multicast_ctx.multicast_router) || nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, p->multicast_eht_hosts_limit) || nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, @@ -932,7 +932,8 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[], if (tb[IFLA_BRPORT_MULTICAST_ROUTER]) { u8 mcast_router = nla_get_u8(tb[IFLA_BRPORT_MULTICAST_ROUTER]); - err = br_multicast_set_port_router(p, mcast_router); + err = br_multicast_set_port_router(&p->multicast_ctx, + mcast_router); if (err) return err; } @@ -1049,7 +1050,7 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags) p = br_port_get_rtnl(dev); /* We want to accept dev as bridge itself as well */ - if (!p && !(dev->priv_flags & IFF_EBRIDGE)) + if (!p && !netif_is_bridge_master(dev)) return -EINVAL; err = br_afspec(br, p, afspec, RTM_DELLINK, &changed, NULL); @@ -1286,7 +1287,8 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_MCAST_ROUTER]) { u8 multicast_router = nla_get_u8(data[IFLA_BR_MCAST_ROUTER]); - err = br_multicast_set_router(br, multicast_router); + err = br_multicast_set_router(&br->multicast_ctx, + multicast_router); if (err) return err; } @@ -1309,7 +1311,8 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_MCAST_QUERIER]) { u8 mcast_querier = nla_get_u8(data[IFLA_BR_MCAST_QUERIER]); - err = br_multicast_set_querier(br, mcast_querier); + err = br_multicast_set_querier(&br->multicast_ctx, + mcast_querier); if (err) return err; } @@ -1324,49 +1327,49 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_MCAST_LAST_MEMBER_CNT]) { u32 val = nla_get_u32(data[IFLA_BR_MCAST_LAST_MEMBER_CNT]); - br->multicast_last_member_count = val; + br->multicast_ctx.multicast_last_member_count = val; } if (data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]) { u32 val = nla_get_u32(data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]); - br->multicast_startup_query_count = val; + br->multicast_ctx.multicast_startup_query_count = val; } if (data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]); - br->multicast_last_member_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]); - br->multicast_membership_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_QUERIER_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERIER_INTVL]); - br->multicast_querier_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_QUERY_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_INTVL]); - br->multicast_query_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_query_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]); - br->multicast_query_response_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]) { u64 val = nla_get_u64(data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]); - br->multicast_startup_query_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_startup_query_interval = clock_t_to_jiffies(val); } if (data[IFLA_BR_MCAST_STATS_ENABLED]) { @@ -1380,7 +1383,8 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], __u8 igmp_version; igmp_version = nla_get_u8(data[IFLA_BR_MCAST_IGMP_VERSION]); - err = br_multicast_set_igmp_version(br, igmp_version); + err = br_multicast_set_igmp_version(&br->multicast_ctx, + igmp_version); if (err) return err; } @@ -1390,7 +1394,8 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], __u8 mld_version; mld_version = nla_get_u8(data[IFLA_BR_MCAST_MLD_VERSION]); - err = br_multicast_set_mld_version(br, mld_version); + err = br_multicast_set_mld_version(&br->multicast_ctx, + mld_version); if (err) return err; } @@ -1497,6 +1502,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_IGMP_VERSION */ nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_MLD_VERSION */ + br_multicast_querier_state_size() + /* IFLA_BR_MCAST_QUERIER_STATE */ #endif #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */ @@ -1566,50 +1572,53 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) return -EMSGSIZE; #endif #ifdef CONFIG_BRIDGE_IGMP_SNOOPING - if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router) || + if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, + br->multicast_ctx.multicast_router) || nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, br_opt_get(br, BROPT_MULTICAST_ENABLED)) || nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR, br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)) || nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, - br_opt_get(br, BROPT_MULTICAST_QUERIER)) || + br->multicast_ctx.multicast_querier) || nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED, br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, RHT_ELASTICITY) || nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) || nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT, - br->multicast_last_member_count) || + br->multicast_ctx.multicast_last_member_count) || nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT, - br->multicast_startup_query_count) || + br->multicast_ctx.multicast_startup_query_count) || nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION, - br->multicast_igmp_version)) + br->multicast_ctx.multicast_igmp_version) || + br_multicast_dump_querier_state(skb, &br->multicast_ctx, + IFLA_BR_MCAST_QUERIER_STATE)) return -EMSGSIZE; #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, IFLA_BR_MCAST_MLD_VERSION, - br->multicast_mld_version)) + br->multicast_ctx.multicast_mld_version)) return -EMSGSIZE; #endif - clockval = jiffies_to_clock_t(br->multicast_last_member_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; - clockval = jiffies_to_clock_t(br->multicast_membership_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_MEMBERSHIP_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; - clockval = jiffies_to_clock_t(br->multicast_querier_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERIER_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; - clockval = jiffies_to_clock_t(br->multicast_query_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; - clockval = jiffies_to_clock_t(br->multicast_query_response_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; - clockval = jiffies_to_clock_t(br->multicast_startup_query_interval); + clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval); if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval, IFLA_BR_PAD)) return -EMSGSIZE; @@ -1657,7 +1666,8 @@ static size_t br_get_linkxstats_size(const struct net_device *dev, int attr) } return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) + - nla_total_size(sizeof(struct br_mcast_stats)) + + nla_total_size_64bit(sizeof(struct br_mcast_stats)) + + (p ? nla_total_size_64bit(sizeof(p->stp_xstats)) : 0) + nla_total_size(0); } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 2b48b204205e..c0efd697865a 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -29,6 +29,8 @@ #define BR_MULTICAST_DEFAULT_HASH_MAX 4096 +#define BR_HWDOM_MAX BITS_PER_LONG + #define BR_VERSION "2.3" /* Control of forwarding link local multicast */ @@ -79,7 +81,8 @@ struct bridge_mcast_other_query { /* selected querier */ struct bridge_mcast_querier { struct br_ip addr; - struct net_bridge_port __rcu *port; + int port_ifidx; + seqcount_spinlock_t seq; }; /* IGMP/MLD statistics */ @@ -89,6 +92,60 @@ struct bridge_mcast_stats { }; #endif +/* net_bridge_mcast_port must be always defined due to forwarding stubs */ +struct net_bridge_mcast_port { +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + struct net_bridge_port *port; + struct net_bridge_vlan *vlan; + + struct bridge_mcast_own_query ip4_own_query; + struct timer_list ip4_mc_router_timer; + struct hlist_node ip4_rlist; +#if IS_ENABLED(CONFIG_IPV6) + struct bridge_mcast_own_query ip6_own_query; + struct timer_list ip6_mc_router_timer; + struct hlist_node ip6_rlist; +#endif /* IS_ENABLED(CONFIG_IPV6) */ + unsigned char multicast_router; +#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */ +}; + +/* net_bridge_mcast must be always defined due to forwarding stubs */ +struct net_bridge_mcast { +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + struct net_bridge *br; + struct net_bridge_vlan *vlan; + + u32 multicast_last_member_count; + u32 multicast_startup_query_count; + + u8 multicast_querier; + u8 multicast_igmp_version; + u8 multicast_router; +#if IS_ENABLED(CONFIG_IPV6) + u8 multicast_mld_version; +#endif + unsigned long multicast_last_member_interval; + unsigned long multicast_membership_interval; + unsigned long multicast_querier_interval; + unsigned long multicast_query_interval; + unsigned long multicast_query_response_interval; + unsigned long multicast_startup_query_interval; + struct hlist_head ip4_mc_router_list; + struct timer_list ip4_mc_router_timer; + struct bridge_mcast_other_query ip4_other_query; + struct bridge_mcast_own_query ip4_own_query; + struct bridge_mcast_querier ip4_querier; +#if IS_ENABLED(CONFIG_IPV6) + struct hlist_head ip6_mc_router_list; + struct timer_list ip6_mc_router_timer; + struct bridge_mcast_other_query ip6_other_query; + struct bridge_mcast_own_query ip6_own_query; + struct bridge_mcast_querier ip6_querier; +#endif /* IS_ENABLED(CONFIG_IPV6) */ +#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */ +}; + struct br_tunnel_info { __be64 tunnel_id; struct metadata_dst __rcu *tunnel_dst; @@ -98,6 +155,8 @@ struct br_tunnel_info { enum { BR_VLFLAG_PER_PORT_STATS = BIT(0), BR_VLFLAG_ADDED_BY_SWITCHDEV = BIT(1), + BR_VLFLAG_MCAST_ENABLED = BIT(2), + BR_VLFLAG_GLOBAL_MCAST_ENABLED = BIT(3), }; /** @@ -114,6 +173,9 @@ enum { * @refcnt: if MASTER flag set, this is bumped for each port referencing it * @brvlan: if MASTER flag unset, this points to the global per-VLAN context * for this VLAN entry + * @br_mcast_ctx: if MASTER flag set, this is the global vlan multicast context + * @port_mcast_ctx: if MASTER flag unset, this is the per-port/vlan multicast + * context * @vlist: sorted list of VLAN entries * @rcu: used for entry destruction * @@ -141,6 +203,11 @@ struct net_bridge_vlan { struct br_tunnel_info tinfo; + union { + struct net_bridge_mcast br_mcast_ctx; + struct net_bridge_mcast_port port_mcast_ctx; + }; + struct list_head vlist; struct rcu_head rcu; @@ -305,19 +372,13 @@ struct net_bridge_port { struct kobject kobj; struct rcu_head rcu; + struct net_bridge_mcast_port multicast_ctx; + #ifdef CONFIG_BRIDGE_IGMP_SNOOPING - struct bridge_mcast_own_query ip4_own_query; - struct timer_list ip4_mc_router_timer; - struct hlist_node ip4_rlist; -#if IS_ENABLED(CONFIG_IPV6) - struct bridge_mcast_own_query ip6_own_query; - struct timer_list ip6_mc_router_timer; - struct hlist_node ip6_rlist; -#endif /* IS_ENABLED(CONFIG_IPV6) */ + struct bridge_mcast_stats __percpu *mcast_stats; + u32 multicast_eht_hosts_limit; u32 multicast_eht_hosts_cnt; - unsigned char multicast_router; - struct bridge_mcast_stats __percpu *mcast_stats; struct hlist_head mglist; #endif @@ -329,7 +390,12 @@ struct net_bridge_port { struct netpoll *np; #endif #ifdef CONFIG_NET_SWITCHDEV - int offload_fwd_mark; + /* Identifier used to group ports that share the same switchdev + * hardware domain. + */ + int hwdom; + int offload_count; + struct netdev_phys_item_id ppid; #endif u16 group_fwd_mask; u16 backup_redirected_cnt; @@ -367,7 +433,6 @@ enum net_bridge_opts { BROPT_NF_CALL_ARPTABLES, BROPT_GROUP_ADDR_SET, BROPT_MULTICAST_ENABLED, - BROPT_MULTICAST_QUERIER, BROPT_MULTICAST_QUERY_USE_IFADDR, BROPT_MULTICAST_STATS_ENABLED, BROPT_HAS_IPV6_ADDR, @@ -376,6 +441,7 @@ enum net_bridge_opts { BROPT_VLAN_STATS_PER_PORT, BROPT_NO_LL_LEARN, BROPT_VLAN_BRIDGE_BINDING, + BROPT_MCAST_VLAN_SNOOPING_ENABLED, }; struct net_bridge { @@ -426,25 +492,14 @@ struct net_bridge { BR_USER_STP, /* new RSTP in userspace */ } stp_enabled; + struct net_bridge_mcast multicast_ctx; + #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + struct bridge_mcast_stats __percpu *mcast_stats; u32 hash_max; - u32 multicast_last_member_count; - u32 multicast_startup_query_count; - - u8 multicast_igmp_version; - u8 multicast_router; -#if IS_ENABLED(CONFIG_IPV6) - u8 multicast_mld_version; -#endif spinlock_t multicast_lock; - unsigned long multicast_last_member_interval; - unsigned long multicast_membership_interval; - unsigned long multicast_querier_interval; - unsigned long multicast_query_interval; - unsigned long multicast_query_response_interval; - unsigned long multicast_startup_query_interval; struct rhashtable mdb_hash_tbl; struct rhashtable sg_port_tbl; @@ -452,19 +507,6 @@ struct net_bridge { struct hlist_head mcast_gc_list; struct hlist_head mdb_list; - struct hlist_head ip4_mc_router_list; - struct timer_list ip4_mc_router_timer; - struct bridge_mcast_other_query ip4_other_query; - struct bridge_mcast_own_query ip4_own_query; - struct bridge_mcast_querier ip4_querier; - struct bridge_mcast_stats __percpu *mcast_stats; -#if IS_ENABLED(CONFIG_IPV6) - struct hlist_head ip6_mc_router_list; - struct timer_list ip6_mc_router_timer; - struct bridge_mcast_other_query ip6_other_query; - struct bridge_mcast_own_query ip6_own_query; - struct bridge_mcast_querier ip6_querier; -#endif /* IS_ENABLED(CONFIG_IPV6) */ struct work_struct mcast_gc_work; #endif @@ -476,7 +518,12 @@ struct net_bridge { u32 auto_cnt; #ifdef CONFIG_NET_SWITCHDEV - int offload_fwd_mark; + /* Counter used to make sure that hardware domains get unique + * identifiers in case a bridge spans multiple switchdev instances. + */ + int last_hwdom; + /* Bit mask of hardware domain numbers in use */ + unsigned long busy_hwdoms; #endif struct hlist_head fdb_list; @@ -506,7 +553,20 @@ struct br_input_skb_cb { #endif #ifdef CONFIG_NET_SWITCHDEV - int offload_fwd_mark; + /* Set if TX data plane offloading is used towards at least one + * hardware domain. + */ + u8 tx_fwd_offload:1; + /* The switchdev hardware domain from which this packet was received. + * If skb->offload_fwd_mark was set, then this packet was already + * forwarded by hardware to the other ports in the source hardware + * domain, otherwise it wasn't. + */ + int src_hwdom; + /* Bit mask of hardware domains towards this packet has already been + * transmitted using the TX data plane offload. + */ + unsigned long fwd_hwdoms; #endif }; @@ -616,6 +676,20 @@ static inline bool br_vlan_valid_range(const struct bridge_vlan_info *cur, return true; } +static inline u8 br_vlan_multicast_router(const struct net_bridge_vlan *v) +{ + u8 mcast_router = MDB_RTR_TYPE_DISABLED; + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (!br_vlan_is_master(v)) + mcast_router = v->port_mcast_ctx.multicast_router; + else + mcast_router = v->br_mcast_ctx.multicast_router; +#endif + + return mcast_router; +} + static inline int br_afspec_cmd_to_rtm(int cmd) { switch (cmd) { @@ -693,8 +767,8 @@ struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, int br_fdb_test_addr(struct net_device *dev, unsigned char *addr); int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count, unsigned long off); -int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, - const unsigned char *addr, u16 vid); +int br_fdb_add_local(struct net_bridge *br, struct net_bridge_port *source, + const unsigned char *addr, u16 vid); void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, const unsigned char *addr, u16 vid, unsigned long flags); @@ -790,15 +864,18 @@ br_port_get_check_rtnl(const struct net_device *dev) } /* br_ioctl.c */ -int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); -int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, - void __user *arg); +int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq, + void __user *data, int cmd); +int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, + struct ifreq *ifr, void __user *uarg); /* br_multicast.c */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING -int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port, +int br_multicast_rcv(struct net_bridge_mcast **brmctx, + struct net_bridge_mcast_port **pmctx, + struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid); -struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, +struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb, u16 vid); int br_multicast_add_port(struct net_bridge_port *port); void br_multicast_del_port(struct net_bridge_port *port); @@ -810,17 +887,22 @@ void br_multicast_leave_snoopers(struct net_bridge *br); void br_multicast_open(struct net_bridge *br); void br_multicast_stop(struct net_bridge *br); void br_multicast_dev_del(struct net_bridge *br); -void br_multicast_flood(struct net_bridge_mdb_entry *mdst, - struct sk_buff *skb, bool local_rcv, bool local_orig); -int br_multicast_set_router(struct net_bridge *br, unsigned long val); -int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val); +void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, + struct net_bridge_mcast *brmctx, + bool local_rcv, bool local_orig); +int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val); +int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx, + unsigned long val); +int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router); int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack); -int br_multicast_set_querier(struct net_bridge *br, unsigned long val); +int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val); int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val); -int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val); +int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx, + unsigned long val); #if IS_ENABLED(CONFIG_IPV6) -int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val); +int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx, + unsigned long val); #endif struct net_bridge_mdb_entry * br_mdb_ip_get(struct net_bridge *br, struct br_ip *dst); @@ -835,12 +917,13 @@ int br_mdb_hash_init(struct net_bridge *br); void br_mdb_hash_fini(struct net_bridge *br); void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, int type); -void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, +void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx, int type); void br_multicast_del_pg(struct net_bridge_mdb_entry *mp, struct net_bridge_port_group *pg, struct net_bridge_port_group __rcu **pp); -void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p, +void br_multicast_count(struct net_bridge *br, + const struct net_bridge_port *p, const struct sk_buff *skb, u8 type, u8 dir); int br_multicast_init_stats(struct net_bridge *br); void br_multicast_uninit_stats(struct net_bridge *br); @@ -849,7 +932,8 @@ void br_multicast_get_stats(const struct net_bridge *br, struct br_mcast_stats *dest); void br_mdb_init(void); void br_mdb_uninit(void); -void br_multicast_host_join(struct net_bridge_mdb_entry *mp, bool notify); +void br_multicast_host_join(const struct net_bridge_mcast *brmctx, + struct net_bridge_mdb_entry *mp, bool notify); void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify); void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg, u8 filter_mode); @@ -859,6 +943,26 @@ struct net_bridge_group_src * br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip); void br_multicast_del_group_src(struct net_bridge_group_src *src, bool fastleave); +void br_multicast_ctx_init(struct net_bridge *br, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast *brmctx); +void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx); +void br_multicast_port_ctx_init(struct net_bridge_port *port, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast_port *pmctx); +void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx); +void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on); +int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack); +bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on); + +int br_rports_fill_info(struct sk_buff *skb, + const struct net_bridge_mcast *brmctx); +int br_multicast_dump_querier_state(struct sk_buff *skb, + const struct net_bridge_mcast *brmctx, + int nest_attr); +size_t br_multicast_querier_state_size(void); +size_t br_rports_size(const struct net_bridge_mcast *brmctx); static inline bool br_group_is_l2(const struct br_ip *group) { @@ -869,52 +973,65 @@ static inline bool br_group_is_l2(const struct br_ip *group) rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) static inline struct hlist_node * -br_multicast_get_first_rport_node(struct net_bridge *b, struct sk_buff *skb) { +br_multicast_get_first_rport_node(struct net_bridge_mcast *brmctx, + struct sk_buff *skb) +{ #if IS_ENABLED(CONFIG_IPV6) if (skb->protocol == htons(ETH_P_IPV6)) - return rcu_dereference(hlist_first_rcu(&b->ip6_mc_router_list)); + return rcu_dereference(hlist_first_rcu(&brmctx->ip6_mc_router_list)); #endif - return rcu_dereference(hlist_first_rcu(&b->ip4_mc_router_list)); + return rcu_dereference(hlist_first_rcu(&brmctx->ip4_mc_router_list)); } static inline struct net_bridge_port * -br_multicast_rport_from_node_skb(struct hlist_node *rp, struct sk_buff *skb) { +br_multicast_rport_from_node_skb(struct hlist_node *rp, struct sk_buff *skb) +{ + struct net_bridge_mcast_port *mctx; + #if IS_ENABLED(CONFIG_IPV6) if (skb->protocol == htons(ETH_P_IPV6)) - return hlist_entry_safe(rp, struct net_bridge_port, ip6_rlist); + mctx = hlist_entry_safe(rp, struct net_bridge_mcast_port, + ip6_rlist); + else #endif - return hlist_entry_safe(rp, struct net_bridge_port, ip4_rlist); + mctx = hlist_entry_safe(rp, struct net_bridge_mcast_port, + ip4_rlist); + + if (mctx) + return mctx->port; + else + return NULL; } -static inline bool br_ip4_multicast_is_router(struct net_bridge *br) +static inline bool br_ip4_multicast_is_router(struct net_bridge_mcast *brmctx) { - return timer_pending(&br->ip4_mc_router_timer); + return timer_pending(&brmctx->ip4_mc_router_timer); } -static inline bool br_ip6_multicast_is_router(struct net_bridge *br) +static inline bool br_ip6_multicast_is_router(struct net_bridge_mcast *brmctx) { #if IS_ENABLED(CONFIG_IPV6) - return timer_pending(&br->ip6_mc_router_timer); + return timer_pending(&brmctx->ip6_mc_router_timer); #else return false; #endif } static inline bool -br_multicast_is_router(struct net_bridge *br, struct sk_buff *skb) +br_multicast_is_router(struct net_bridge_mcast *brmctx, struct sk_buff *skb) { - switch (br->multicast_router) { + switch (brmctx->multicast_router) { case MDB_RTR_TYPE_PERM: return true; case MDB_RTR_TYPE_TEMP_QUERY: if (skb) { if (skb->protocol == htons(ETH_P_IP)) - return br_ip4_multicast_is_router(br); + return br_ip4_multicast_is_router(brmctx); else if (skb->protocol == htons(ETH_P_IPV6)) - return br_ip6_multicast_is_router(br); + return br_ip6_multicast_is_router(brmctx); } else { - return br_ip4_multicast_is_router(br) || - br_ip6_multicast_is_router(br); + return br_ip4_multicast_is_router(brmctx) || + br_ip6_multicast_is_router(brmctx); } fallthrough; default: @@ -923,14 +1040,14 @@ br_multicast_is_router(struct net_bridge *br, struct sk_buff *skb) } static inline bool -__br_multicast_querier_exists(struct net_bridge *br, - struct bridge_mcast_other_query *querier, - const bool is_ipv6) +__br_multicast_querier_exists(struct net_bridge_mcast *brmctx, + struct bridge_mcast_other_query *querier, + const bool is_ipv6) { bool own_querier_enabled; - if (br_opt_get(br, BROPT_MULTICAST_QUERIER)) { - if (is_ipv6 && !br_opt_get(br, BROPT_HAS_IPV6_ADDR)) + if (brmctx->multicast_querier) { + if (is_ipv6 && !br_opt_get(brmctx->br, BROPT_HAS_IPV6_ADDR)) own_querier_enabled = false; else own_querier_enabled = true; @@ -942,18 +1059,18 @@ __br_multicast_querier_exists(struct net_bridge *br, (own_querier_enabled || timer_pending(&querier->timer)); } -static inline bool br_multicast_querier_exists(struct net_bridge *br, +static inline bool br_multicast_querier_exists(struct net_bridge_mcast *brmctx, struct ethhdr *eth, const struct net_bridge_mdb_entry *mdb) { switch (eth->h_proto) { case (htons(ETH_P_IP)): - return __br_multicast_querier_exists(br, - &br->ip4_other_query, false); + return __br_multicast_querier_exists(brmctx, + &brmctx->ip4_other_query, false); #if IS_ENABLED(CONFIG_IPV6) case (htons(ETH_P_IPV6)): - return __br_multicast_querier_exists(br, - &br->ip6_other_query, true); + return __br_multicast_querier_exists(brmctx, + &brmctx->ip6_other_query, true); #endif default: return !!mdb && br_group_is_l2(&mdb->addr); @@ -974,15 +1091,16 @@ static inline bool br_multicast_is_star_g(const struct br_ip *ip) } } -static inline bool br_multicast_should_handle_mode(const struct net_bridge *br, - __be16 proto) +static inline bool +br_multicast_should_handle_mode(const struct net_bridge_mcast *brmctx, + __be16 proto) { switch (proto) { case htons(ETH_P_IP): - return !!(br->multicast_igmp_version == 3); + return !!(brmctx->multicast_igmp_version == 3); #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): - return !!(br->multicast_mld_version == 2); + return !!(brmctx->multicast_mld_version == 2); #endif default: return false; @@ -994,28 +1112,143 @@ static inline int br_multicast_igmp_type(const struct sk_buff *skb) return BR_INPUT_SKB_CB(skb)->igmp; } -static inline unsigned long br_multicast_lmqt(const struct net_bridge *br) +static inline unsigned long br_multicast_lmqt(const struct net_bridge_mcast *brmctx) +{ + return brmctx->multicast_last_member_interval * + brmctx->multicast_last_member_count; +} + +static inline unsigned long br_multicast_gmi(const struct net_bridge_mcast *brmctx) +{ + return brmctx->multicast_membership_interval; +} + +static inline bool +br_multicast_ctx_is_vlan(const struct net_bridge_mcast *brmctx) +{ + return !!brmctx->vlan; +} + +static inline bool +br_multicast_port_ctx_is_vlan(const struct net_bridge_mcast_port *pmctx) +{ + return !!pmctx->vlan; +} + +static inline struct net_bridge_mcast * +br_multicast_port_ctx_get_global(const struct net_bridge_mcast_port *pmctx) +{ + if (!br_multicast_port_ctx_is_vlan(pmctx)) + return &pmctx->port->br->multicast_ctx; + else + return &pmctx->vlan->brvlan->br_mcast_ctx; +} + +static inline bool +br_multicast_ctx_vlan_global_disabled(const struct net_bridge_mcast *brmctx) +{ + return br_opt_get(brmctx->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && + br_multicast_ctx_is_vlan(brmctx) && + !(brmctx->vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED); +} + +static inline bool +br_multicast_ctx_vlan_disabled(const struct net_bridge_mcast *brmctx) +{ + return br_multicast_ctx_is_vlan(brmctx) && + !(brmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED); +} + +static inline bool +br_multicast_port_ctx_vlan_disabled(const struct net_bridge_mcast_port *pmctx) +{ + return br_multicast_port_ctx_is_vlan(pmctx) && + !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED); +} + +static inline bool +br_multicast_port_ctx_state_disabled(const struct net_bridge_mcast_port *pmctx) +{ + return pmctx->port->state == BR_STATE_DISABLED || + (br_multicast_port_ctx_is_vlan(pmctx) && + (br_multicast_port_ctx_vlan_disabled(pmctx) || + pmctx->vlan->state == BR_STATE_DISABLED)); +} + +static inline bool +br_multicast_port_ctx_state_stopped(const struct net_bridge_mcast_port *pmctx) { - return br->multicast_last_member_interval * - br->multicast_last_member_count; + return br_multicast_port_ctx_state_disabled(pmctx) || + pmctx->port->state == BR_STATE_BLOCKING || + (br_multicast_port_ctx_is_vlan(pmctx) && + pmctx->vlan->state == BR_STATE_BLOCKING); +} + +static inline bool +br_rports_have_mc_router(const struct net_bridge_mcast *brmctx) +{ +#if IS_ENABLED(CONFIG_IPV6) + return !hlist_empty(&brmctx->ip4_mc_router_list) || + !hlist_empty(&brmctx->ip6_mc_router_list); +#else + return !hlist_empty(&brmctx->ip4_mc_router_list); +#endif +} + +static inline bool +br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, + const struct net_bridge_mcast *brmctx2) +{ + return brmctx1->multicast_igmp_version == + brmctx2->multicast_igmp_version && + brmctx1->multicast_last_member_count == + brmctx2->multicast_last_member_count && + brmctx1->multicast_startup_query_count == + brmctx2->multicast_startup_query_count && + brmctx1->multicast_last_member_interval == + brmctx2->multicast_last_member_interval && + brmctx1->multicast_membership_interval == + brmctx2->multicast_membership_interval && + brmctx1->multicast_querier_interval == + brmctx2->multicast_querier_interval && + brmctx1->multicast_query_interval == + brmctx2->multicast_query_interval && + brmctx1->multicast_query_response_interval == + brmctx2->multicast_query_response_interval && + brmctx1->multicast_startup_query_interval == + brmctx2->multicast_startup_query_interval && + brmctx1->multicast_querier == brmctx2->multicast_querier && + brmctx1->multicast_router == brmctx2->multicast_router && + !br_rports_have_mc_router(brmctx1) && + !br_rports_have_mc_router(brmctx2) && +#if IS_ENABLED(CONFIG_IPV6) + brmctx1->multicast_mld_version == + brmctx2->multicast_mld_version && +#endif + true; } -static inline unsigned long br_multicast_gmi(const struct net_bridge *br) +static inline bool +br_multicast_ctx_matches_vlan_snooping(const struct net_bridge_mcast *brmctx) { - /* use the RFC default of 2 for QRV */ - return 2 * br->multicast_query_interval + - br->multicast_query_response_interval; + bool vlan_snooping_enabled; + + vlan_snooping_enabled = !!br_opt_get(brmctx->br, + BROPT_MCAST_VLAN_SNOOPING_ENABLED); + + return !!(vlan_snooping_enabled == br_multicast_ctx_is_vlan(brmctx)); } #else -static inline int br_multicast_rcv(struct net_bridge *br, - struct net_bridge_port *port, +static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx, + struct net_bridge_mcast_port **pmctx, + struct net_bridge_vlan *vlan, struct sk_buff *skb, u16 vid) { return 0; } -static inline struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br, +static inline struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb, u16 vid) { return NULL; @@ -1064,17 +1297,18 @@ static inline void br_multicast_dev_del(struct net_bridge *br) static inline void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb, + struct net_bridge_mcast *brmctx, bool local_rcv, bool local_orig) { } -static inline bool br_multicast_is_router(struct net_bridge *br, +static inline bool br_multicast_is_router(struct net_bridge_mcast *brmctx, struct sk_buff *skb) { return false; } -static inline bool br_multicast_querier_exists(struct net_bridge *br, +static inline bool br_multicast_querier_exists(struct net_bridge_mcast *brmctx, struct ethhdr *eth, const struct net_bridge_mdb_entry *mdb) { @@ -1118,13 +1352,59 @@ static inline int br_multicast_igmp_type(const struct sk_buff *skb) { return 0; } + +static inline void br_multicast_ctx_init(struct net_bridge *br, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast *brmctx) +{ +} + +static inline void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx) +{ +} + +static inline void br_multicast_port_ctx_init(struct net_bridge_port *port, + struct net_bridge_vlan *vlan, + struct net_bridge_mcast_port *pmctx) +{ +} + +static inline void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx) +{ +} + +static inline void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, + bool on) +{ +} + +static inline int br_multicast_toggle_vlan_snooping(struct net_bridge *br, + bool on, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static inline bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, + bool on) +{ + return false; +} + +static inline bool +br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1, + const struct net_bridge_mcast *brmctx2) +{ + return true; +} #endif /* br_vlan.c */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid, u8 *state); + u16 *vid, u8 *state, + struct net_bridge_vlan **vlan); bool br_allowed_egress(struct net_bridge_vlan_group *vg, const struct sk_buff *skb); bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid); @@ -1236,8 +1516,11 @@ static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid) static inline bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid, u8 *state) + u16 *vid, u8 *state, + struct net_bridge_vlan **vlan) + { + *vlan = NULL; return true; } @@ -1410,6 +1693,12 @@ static inline bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, { return true; } + +static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid) +{ + return 0; +} + #endif /* br_vlan_options.c */ @@ -1424,6 +1713,14 @@ int br_vlan_process_options(const struct net_bridge *br, struct net_bridge_vlan *range_end, struct nlattr **tb, struct netlink_ext_ack *extack); +int br_vlan_rtm_process_global_options(struct net_device *dev, + const struct nlattr *attr, + int cmd, + struct netlink_ext_ack *extack); +bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *r_end); +bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, + const struct net_bridge_vlan *v_opts); /* vlan state manipulation helpers using *_ONCE to annotate lock-free access */ static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) @@ -1596,11 +1893,13 @@ static inline int br_cfm_status_fill_info(struct sk_buff *skb, static inline int br_cfm_mep_count(struct net_bridge *br, u32 *count) { + *count = 0; return -EOPNOTSUPP; } static inline int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count) { + *count = 0; return -EOPNOTSUPP; } #endif @@ -1645,7 +1944,25 @@ static inline void br_sysfs_delbr(struct net_device *dev) { return; } /* br_switchdev.c */ #ifdef CONFIG_NET_SWITCHDEV -int nbp_switchdev_mark_set(struct net_bridge_port *p); +int br_switchdev_port_offload(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + bool tx_fwd_offload, + struct netlink_ext_ack *extack); + +void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb); + +bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb); + +void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb); + +void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, + struct sk_buff *skb); +void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, + struct sk_buff *skb); void nbp_switchdev_frame_mark(const struct net_bridge_port *p, struct sk_buff *skb); bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, @@ -1656,18 +1973,57 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, struct netlink_ext_ack *extack); void br_switchdev_fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, int type); +void br_switchdev_mdb_notify(struct net_device *dev, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + int type); int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags, struct netlink_ext_ack *extack); int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid); +void br_switchdev_init(struct net_bridge *br); static inline void br_switchdev_frame_unmark(struct sk_buff *skb) { skb->offload_fwd_mark = 0; } #else -static inline int nbp_switchdev_mark_set(struct net_bridge_port *p) +static inline int +br_switchdev_port_offload(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + bool tx_fwd_offload, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static inline void +br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb) +{ +} + +static inline bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb) +{ + return false; +} + +static inline void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb) +{ +} + +static inline void +nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, + struct sk_buff *skb) +{ +} + +static inline void +nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, + struct sk_buff *skb) { - return 0; } static inline void nbp_switchdev_frame_mark(const struct net_bridge_port *p, @@ -1707,9 +2063,21 @@ br_switchdev_fdb_notify(struct net_bridge *br, { } +static inline void br_switchdev_mdb_notify(struct net_device *dev, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + int type) +{ +} + static inline void br_switchdev_frame_unmark(struct sk_buff *skb) { } + +static inline void br_switchdev_init(struct net_bridge *br) +{ +} + #endif /* CONFIG_NET_SWITCHDEV */ /* br_arp_nd_proxy.c */ diff --git a/net/bridge/br_private_mcast_eht.h b/net/bridge/br_private_mcast_eht.h index f89049f4892c..adf82a05515a 100644 --- a/net/bridge/br_private_mcast_eht.h +++ b/net/bridge/br_private_mcast_eht.h @@ -51,7 +51,8 @@ struct net_bridge_group_eht_set { #ifdef CONFIG_BRIDGE_IGMP_SNOOPING void br_multicast_eht_clean_sets(struct net_bridge_port_group *pg); -bool br_multicast_eht_handle(struct net_bridge_port_group *pg, +bool br_multicast_eht_handle(const struct net_bridge_mcast *brmctx, + struct net_bridge_port_group *pg, void *h_addr, void *srcs, u32 nsrcs, diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h index c54cc26211d7..2b053289f016 100644 --- a/net/bridge/br_private_tunnel.h +++ b/net/bridge/br_private_tunnel.h @@ -38,9 +38,9 @@ int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 vid, void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port); void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan); -int br_handle_ingress_vlan_tunnel(struct sk_buff *skb, - struct net_bridge_port *p, - struct net_bridge_vlan_group *vg); +void br_handle_ingress_vlan_tunnel(struct sk_buff *skb, + struct net_bridge_port *p, + struct net_bridge_vlan_group *vg); int br_handle_egress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_vlan *vlan); bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr, diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index ba55851fe132..75204d36d7f9 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -233,7 +233,7 @@ void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr) memcpy(oldaddr, br->bridge_id.addr, ETH_ALEN); memcpy(br->bridge_id.addr, addr, ETH_ALEN); - memcpy(br->dev->dev_addr, addr, ETH_ALEN); + eth_hw_addr_set(br->dev, addr); list_for_each_entry(p, &br->port_list, list) { if (ether_addr_equal(p->designated_bridge.addr, oldaddr)) diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index d3adee0f91f9..f8fbaaa7c501 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -4,54 +4,70 @@ #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> +#include <net/ip.h> #include <net/switchdev.h> #include "br_private.h" -static int br_switchdev_mark_get(struct net_bridge *br, struct net_device *dev) -{ - struct net_bridge_port *p; +static struct static_key_false br_switchdev_tx_fwd_offload; - /* dev is yet to be added to the port list. */ - list_for_each_entry(p, &br->port_list, list) { - if (netdev_port_same_parent_id(dev, p->dev)) - return p->offload_fwd_mark; - } +static bool nbp_switchdev_can_offload_tx_fwd(const struct net_bridge_port *p, + const struct sk_buff *skb) +{ + if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload)) + return false; - return ++br->offload_fwd_mark; + return (p->flags & BR_TX_FWD_OFFLOAD) && + (p->hwdom != BR_INPUT_SKB_CB(skb)->src_hwdom); } -int nbp_switchdev_mark_set(struct net_bridge_port *p) +bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb) { - struct netdev_phys_item_id ppid = { }; - int err; + if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload)) + return false; - ASSERT_RTNL(); + return BR_INPUT_SKB_CB(skb)->tx_fwd_offload; +} - err = dev_get_port_parent_id(p->dev, &ppid, true); - if (err) { - if (err == -EOPNOTSUPP) - return 0; - return err; - } +void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb) +{ + skb->offload_fwd_mark = br_switchdev_frame_uses_tx_fwd_offload(skb); +} - p->offload_fwd_mark = br_switchdev_mark_get(p->br, p->dev); +/* Mark the frame for TX forwarding offload if this egress port supports it */ +void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p, + struct sk_buff *skb) +{ + if (nbp_switchdev_can_offload_tx_fwd(p, skb)) + BR_INPUT_SKB_CB(skb)->tx_fwd_offload = true; +} - return 0; +/* Lazily adds the hwdom of the egress bridge port to the bit mask of hwdoms + * that the skb has been already forwarded to, to avoid further cloning to + * other ports in the same hwdom by making nbp_switchdev_allowed_egress() + * return false. + */ +void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p, + struct sk_buff *skb) +{ + if (nbp_switchdev_can_offload_tx_fwd(p, skb)) + set_bit(p->hwdom, &BR_INPUT_SKB_CB(skb)->fwd_hwdoms); } void nbp_switchdev_frame_mark(const struct net_bridge_port *p, struct sk_buff *skb) { - if (skb->offload_fwd_mark && !WARN_ON_ONCE(!p->offload_fwd_mark)) - BR_INPUT_SKB_CB(skb)->offload_fwd_mark = p->offload_fwd_mark; + if (p->hwdom) + BR_INPUT_SKB_CB(skb)->src_hwdom = p->hwdom; } bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p, const struct sk_buff *skb) { - return !skb->offload_fwd_mark || - BR_INPUT_SKB_CB(skb)->offload_fwd_mark != p->offload_fwd_mark; + struct br_input_skb_cb *cb = BR_INPUT_SKB_CB(skb); + + return !test_bit(p->hwdom, &cb->fwd_hwdoms) && + (!skb->offload_fwd_mark || cb->src_hwdom != p->hwdom); } /* Flags that can be offloaded to hardware */ @@ -107,28 +123,38 @@ int br_switchdev_set_port_flag(struct net_bridge_port *p, return 0; } +static void br_switchdev_fdb_populate(struct net_bridge *br, + struct switchdev_notifier_fdb_info *item, + const struct net_bridge_fdb_entry *fdb, + const void *ctx) +{ + const struct net_bridge_port *p = READ_ONCE(fdb->dst); + + item->addr = fdb->key.addr.addr; + item->vid = fdb->key.vlan_id; + item->added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); + item->offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags); + item->is_local = test_bit(BR_FDB_LOCAL, &fdb->flags); + item->info.dev = (!p || item->is_local) ? br->dev : p->dev; + item->info.ctx = ctx; +} + void br_switchdev_fdb_notify(struct net_bridge *br, const struct net_bridge_fdb_entry *fdb, int type) { - const struct net_bridge_port *dst = READ_ONCE(fdb->dst); - struct net_device *dev = dst ? dst->dev : br->dev; - struct switchdev_notifier_fdb_info info = { - .addr = fdb->key.addr.addr, - .vid = fdb->key.vlan_id, - .added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags), - .is_local = test_bit(BR_FDB_LOCAL, &fdb->flags), - .offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags), - }; + struct switchdev_notifier_fdb_info item; + + br_switchdev_fdb_populate(br, &item, fdb, NULL); switch (type) { case RTM_DELNEIGH: call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_DEVICE, - dev, &info.info, NULL); + item.info.dev, &item.info, NULL); break; case RTM_NEWNEIGH: call_switchdev_notifiers(SWITCHDEV_FDB_ADD_TO_DEVICE, - dev, &info.info, NULL); + item.info.dev, &item.info, NULL); break; } } @@ -156,3 +182,575 @@ int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid) return switchdev_port_obj_del(dev, &v.obj); } + +static int nbp_switchdev_hwdom_set(struct net_bridge_port *joining) +{ + struct net_bridge *br = joining->br; + struct net_bridge_port *p; + int hwdom; + + /* joining is yet to be added to the port list. */ + list_for_each_entry(p, &br->port_list, list) { + if (netdev_phys_item_id_same(&joining->ppid, &p->ppid)) { + joining->hwdom = p->hwdom; + return 0; + } + } + + hwdom = find_next_zero_bit(&br->busy_hwdoms, BR_HWDOM_MAX, 1); + if (hwdom >= BR_HWDOM_MAX) + return -EBUSY; + + set_bit(hwdom, &br->busy_hwdoms); + joining->hwdom = hwdom; + return 0; +} + +static void nbp_switchdev_hwdom_put(struct net_bridge_port *leaving) +{ + struct net_bridge *br = leaving->br; + struct net_bridge_port *p; + + /* leaving is no longer in the port list. */ + list_for_each_entry(p, &br->port_list, list) { + if (p->hwdom == leaving->hwdom) + return; + } + + clear_bit(leaving->hwdom, &br->busy_hwdoms); +} + +static int nbp_switchdev_add(struct net_bridge_port *p, + struct netdev_phys_item_id ppid, + bool tx_fwd_offload, + struct netlink_ext_ack *extack) +{ + int err; + + if (p->offload_count) { + /* Prevent unsupported configurations such as a bridge port + * which is a bonding interface, and the member ports are from + * different hardware switches. + */ + if (!netdev_phys_item_id_same(&p->ppid, &ppid)) { + NL_SET_ERR_MSG_MOD(extack, + "Same bridge port cannot be offloaded by two physical switches"); + return -EBUSY; + } + + /* Tolerate drivers that call switchdev_bridge_port_offload() + * more than once for the same bridge port, such as when the + * bridge port is an offloaded bonding/team interface. + */ + p->offload_count++; + + return 0; + } + + p->ppid = ppid; + p->offload_count = 1; + + err = nbp_switchdev_hwdom_set(p); + if (err) + return err; + + if (tx_fwd_offload) { + p->flags |= BR_TX_FWD_OFFLOAD; + static_branch_inc(&br_switchdev_tx_fwd_offload); + } + + return 0; +} + +static void nbp_switchdev_del(struct net_bridge_port *p) +{ + if (WARN_ON(!p->offload_count)) + return; + + p->offload_count--; + + if (p->offload_count) + return; + + if (p->hwdom) + nbp_switchdev_hwdom_put(p); + + if (p->flags & BR_TX_FWD_OFFLOAD) { + p->flags &= ~BR_TX_FWD_OFFLOAD; + static_branch_dec(&br_switchdev_tx_fwd_offload); + } +} + +static int +br_switchdev_fdb_replay_one(struct net_bridge *br, struct notifier_block *nb, + const struct net_bridge_fdb_entry *fdb, + unsigned long action, const void *ctx) +{ + struct switchdev_notifier_fdb_info item; + int err; + + br_switchdev_fdb_populate(br, &item, fdb, ctx); + + err = nb->notifier_call(nb, action, &item); + return notifier_to_errno(err); +} + +static int +br_switchdev_fdb_replay(const struct net_device *br_dev, const void *ctx, + bool adding, struct notifier_block *nb) +{ + struct net_bridge_fdb_entry *fdb; + struct net_bridge *br; + unsigned long action; + int err = 0; + + if (!nb) + return 0; + + if (!netif_is_bridge_master(br_dev)) + return -EINVAL; + + br = netdev_priv(br_dev); + + if (adding) + action = SWITCHDEV_FDB_ADD_TO_DEVICE; + else + action = SWITCHDEV_FDB_DEL_TO_DEVICE; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) { + err = br_switchdev_fdb_replay_one(br, nb, fdb, action, ctx); + if (err) + break; + } + + rcu_read_unlock(); + + return err; +} + +static int +br_switchdev_vlan_replay_one(struct notifier_block *nb, + struct net_device *dev, + struct switchdev_obj_port_vlan *vlan, + const void *ctx, unsigned long action, + struct netlink_ext_ack *extack) +{ + struct switchdev_notifier_port_obj_info obj_info = { + .info = { + .dev = dev, + .extack = extack, + .ctx = ctx, + }, + .obj = &vlan->obj, + }; + int err; + + err = nb->notifier_call(nb, action, &obj_info); + return notifier_to_errno(err); +} + +static int br_switchdev_vlan_replay(struct net_device *br_dev, + struct net_device *dev, + const void *ctx, bool adding, + struct notifier_block *nb, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_bridge_port *p; + struct net_bridge *br; + unsigned long action; + int err = 0; + u16 pvid; + + ASSERT_RTNL(); + + if (!nb) + return 0; + + if (!netif_is_bridge_master(br_dev)) + return -EINVAL; + + if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) + return -EINVAL; + + if (netif_is_bridge_master(dev)) { + br = netdev_priv(dev); + vg = br_vlan_group(br); + p = NULL; + } else { + p = br_port_get_rtnl(dev); + if (WARN_ON(!p)) + return -EINVAL; + vg = nbp_vlan_group(p); + br = p->br; + } + + if (!vg) + return 0; + + if (adding) + action = SWITCHDEV_PORT_OBJ_ADD; + else + action = SWITCHDEV_PORT_OBJ_DEL; + + pvid = br_get_pvid(vg); + + list_for_each_entry(v, &vg->vlan_list, vlist) { + struct switchdev_obj_port_vlan vlan = { + .obj.orig_dev = dev, + .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, + .flags = br_vlan_flags(v, pvid), + .vid = v->vid, + }; + + if (!br_vlan_should_use(v)) + continue; + + err = br_switchdev_vlan_replay_one(nb, dev, &vlan, ctx, + action, extack); + if (err) + return err; + } + + return err; +} + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING +struct br_switchdev_mdb_complete_info { + struct net_bridge_port *port; + struct br_ip ip; +}; + +static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *priv) +{ + struct br_switchdev_mdb_complete_info *data = priv; + struct net_bridge_port_group __rcu **pp; + struct net_bridge_port_group *p; + struct net_bridge_mdb_entry *mp; + struct net_bridge_port *port = data->port; + struct net_bridge *br = port->br; + + if (err) + goto err; + + spin_lock_bh(&br->multicast_lock); + mp = br_mdb_ip_get(br, &data->ip); + if (!mp) + goto out; + for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL; + pp = &p->next) { + if (p->key.port != port) + continue; + p->flags |= MDB_PG_FLAGS_OFFLOAD; + } +out: + spin_unlock_bh(&br->multicast_lock); +err: + kfree(priv); +} + +static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb, + const struct net_bridge_mdb_entry *mp) +{ + if (mp->addr.proto == htons(ETH_P_IP)) + ip_eth_mc_map(mp->addr.dst.ip4, mdb->addr); +#if IS_ENABLED(CONFIG_IPV6) + else if (mp->addr.proto == htons(ETH_P_IPV6)) + ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb->addr); +#endif + else + ether_addr_copy(mdb->addr, mp->addr.dst.mac_addr); + + mdb->vid = mp->addr.vid; +} + +static void br_switchdev_host_mdb_one(struct net_device *dev, + struct net_device *lower_dev, + struct net_bridge_mdb_entry *mp, + int type) +{ + struct switchdev_obj_port_mdb mdb = { + .obj = { + .id = SWITCHDEV_OBJ_ID_HOST_MDB, + .flags = SWITCHDEV_F_DEFER, + .orig_dev = dev, + }, + }; + + br_switchdev_mdb_populate(&mdb, mp); + + switch (type) { + case RTM_NEWMDB: + switchdev_port_obj_add(lower_dev, &mdb.obj, NULL); + break; + case RTM_DELMDB: + switchdev_port_obj_del(lower_dev, &mdb.obj); + break; + } +} + +static void br_switchdev_host_mdb(struct net_device *dev, + struct net_bridge_mdb_entry *mp, int type) +{ + struct net_device *lower_dev; + struct list_head *iter; + + netdev_for_each_lower_dev(dev, lower_dev, iter) + br_switchdev_host_mdb_one(dev, lower_dev, mp, type); +} + +static int +br_switchdev_mdb_replay_one(struct notifier_block *nb, struct net_device *dev, + const struct switchdev_obj_port_mdb *mdb, + unsigned long action, const void *ctx, + struct netlink_ext_ack *extack) +{ + struct switchdev_notifier_port_obj_info obj_info = { + .info = { + .dev = dev, + .extack = extack, + .ctx = ctx, + }, + .obj = &mdb->obj, + }; + int err; + + err = nb->notifier_call(nb, action, &obj_info); + return notifier_to_errno(err); +} + +static int br_switchdev_mdb_queue_one(struct list_head *mdb_list, + enum switchdev_obj_id id, + const struct net_bridge_mdb_entry *mp, + struct net_device *orig_dev) +{ + struct switchdev_obj_port_mdb *mdb; + + mdb = kzalloc(sizeof(*mdb), GFP_ATOMIC); + if (!mdb) + return -ENOMEM; + + mdb->obj.id = id; + mdb->obj.orig_dev = orig_dev; + br_switchdev_mdb_populate(mdb, mp); + list_add_tail(&mdb->obj.list, mdb_list); + + return 0; +} + +void br_switchdev_mdb_notify(struct net_device *dev, + struct net_bridge_mdb_entry *mp, + struct net_bridge_port_group *pg, + int type) +{ + struct br_switchdev_mdb_complete_info *complete_info; + struct switchdev_obj_port_mdb mdb = { + .obj = { + .id = SWITCHDEV_OBJ_ID_PORT_MDB, + .flags = SWITCHDEV_F_DEFER, + }, + }; + + if (!pg) + return br_switchdev_host_mdb(dev, mp, type); + + br_switchdev_mdb_populate(&mdb, mp); + + mdb.obj.orig_dev = pg->key.port->dev; + switch (type) { + case RTM_NEWMDB: + complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC); + if (!complete_info) + break; + complete_info->port = pg->key.port; + complete_info->ip = mp->addr; + mdb.obj.complete_priv = complete_info; + mdb.obj.complete = br_switchdev_mdb_complete; + if (switchdev_port_obj_add(pg->key.port->dev, &mdb.obj, NULL)) + kfree(complete_info); + break; + case RTM_DELMDB: + switchdev_port_obj_del(pg->key.port->dev, &mdb.obj); + break; + } +} +#endif + +static int +br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev, + const void *ctx, bool adding, struct notifier_block *nb, + struct netlink_ext_ack *extack) +{ +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + const struct net_bridge_mdb_entry *mp; + struct switchdev_obj *obj, *tmp; + struct net_bridge *br; + unsigned long action; + LIST_HEAD(mdb_list); + int err = 0; + + ASSERT_RTNL(); + + if (!nb) + return 0; + + if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev)) + return -EINVAL; + + br = netdev_priv(br_dev); + + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) + return 0; + + /* We cannot walk over br->mdb_list protected just by the rtnl_mutex, + * because the write-side protection is br->multicast_lock. But we + * need to emulate the [ blocking ] calling context of a regular + * switchdev event, so since both br->multicast_lock and RCU read side + * critical sections are atomic, we have no choice but to pick the RCU + * read side lock, queue up all our events, leave the critical section + * and notify switchdev from blocking context. + */ + rcu_read_lock(); + + hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) { + struct net_bridge_port_group __rcu * const *pp; + const struct net_bridge_port_group *p; + + if (mp->host_joined) { + err = br_switchdev_mdb_queue_one(&mdb_list, + SWITCHDEV_OBJ_ID_HOST_MDB, + mp, br_dev); + if (err) { + rcu_read_unlock(); + goto out_free_mdb; + } + } + + for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL; + pp = &p->next) { + if (p->key.port->dev != dev) + continue; + + err = br_switchdev_mdb_queue_one(&mdb_list, + SWITCHDEV_OBJ_ID_PORT_MDB, + mp, dev); + if (err) { + rcu_read_unlock(); + goto out_free_mdb; + } + } + } + + rcu_read_unlock(); + + if (adding) + action = SWITCHDEV_PORT_OBJ_ADD; + else + action = SWITCHDEV_PORT_OBJ_DEL; + + list_for_each_entry(obj, &mdb_list, list) { + err = br_switchdev_mdb_replay_one(nb, dev, + SWITCHDEV_OBJ_PORT_MDB(obj), + action, ctx, extack); + if (err) + goto out_free_mdb; + } + +out_free_mdb: + list_for_each_entry_safe(obj, tmp, &mdb_list, list) { + list_del(&obj->list); + kfree(SWITCHDEV_OBJ_PORT_MDB(obj)); + } + + if (err) + return err; +#endif + + return 0; +} + +static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack) +{ + struct net_device *br_dev = p->br->dev; + struct net_device *dev = p->dev; + int err; + + err = br_switchdev_vlan_replay(br_dev, dev, ctx, true, blocking_nb, + extack); + if (err && err != -EOPNOTSUPP) + return err; + + err = br_switchdev_mdb_replay(br_dev, dev, ctx, true, blocking_nb, + extack); + if (err && err != -EOPNOTSUPP) + return err; + + err = br_switchdev_fdb_replay(br_dev, ctx, true, atomic_nb); + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +static void nbp_switchdev_unsync_objs(struct net_bridge_port *p, + const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb) +{ + struct net_device *br_dev = p->br->dev; + struct net_device *dev = p->dev; + + br_switchdev_vlan_replay(br_dev, dev, ctx, false, blocking_nb, NULL); + + br_switchdev_mdb_replay(br_dev, dev, ctx, false, blocking_nb, NULL); + + br_switchdev_fdb_replay(br_dev, ctx, false, atomic_nb); +} + +/* Let the bridge know that this port is offloaded, so that it can assign a + * switchdev hardware domain to it. + */ +int br_switchdev_port_offload(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + bool tx_fwd_offload, + struct netlink_ext_ack *extack) +{ + struct netdev_phys_item_id ppid; + int err; + + err = dev_get_port_parent_id(dev, &ppid, false); + if (err) + return err; + + err = nbp_switchdev_add(p, ppid, tx_fwd_offload, extack); + if (err) + return err; + + err = nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack); + if (err) + goto out_switchdev_del; + + return 0; + +out_switchdev_del: + nbp_switchdev_del(p); + + return err; +} + +void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb) +{ + nbp_switchdev_unsync_objs(p, ctx, atomic_nb, blocking_nb); + + nbp_switchdev_del(p); +} diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 381467b691d5..d9a89ddd0331 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -384,13 +384,13 @@ static ssize_t multicast_router_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br->multicast_router); + return sprintf(buf, "%d\n", br->multicast_ctx.multicast_router); } static int set_multicast_router(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - return br_multicast_set_router(br, val); + return br_multicast_set_router(&br->multicast_ctx, val); } static ssize_t multicast_router_store(struct device *d, @@ -447,13 +447,13 @@ static ssize_t multicast_querier_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_QUERIER)); + return sprintf(buf, "%d\n", br->multicast_ctx.multicast_querier); } static int set_multicast_querier(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - return br_multicast_set_querier(br, val); + return br_multicast_set_querier(&br->multicast_ctx, val); } static ssize_t multicast_querier_store(struct device *d, @@ -514,13 +514,13 @@ static ssize_t multicast_igmp_version_show(struct device *d, { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_igmp_version); + return sprintf(buf, "%u\n", br->multicast_ctx.multicast_igmp_version); } static int set_multicast_igmp_version(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - return br_multicast_set_igmp_version(br, val); + return br_multicast_set_igmp_version(&br->multicast_ctx, val); } static ssize_t multicast_igmp_version_store(struct device *d, @@ -536,13 +536,13 @@ static ssize_t multicast_last_member_count_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_last_member_count); + return sprintf(buf, "%u\n", br->multicast_ctx.multicast_last_member_count); } static int set_last_member_count(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_last_member_count = val; + br->multicast_ctx.multicast_last_member_count = val; return 0; } @@ -558,13 +558,13 @@ static ssize_t multicast_startup_query_count_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_startup_query_count); + return sprintf(buf, "%u\n", br->multicast_ctx.multicast_startup_query_count); } static int set_startup_query_count(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_startup_query_count = val; + br->multicast_ctx.multicast_startup_query_count = val; return 0; } @@ -581,13 +581,13 @@ static ssize_t multicast_last_member_interval_show( { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_last_member_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval)); } static int set_last_member_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_last_member_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); return 0; } @@ -604,13 +604,13 @@ static ssize_t multicast_membership_interval_show( { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_membership_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval)); } static int set_membership_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_membership_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); return 0; } @@ -628,13 +628,13 @@ static ssize_t multicast_querier_interval_show(struct device *d, { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_querier_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval)); } static int set_querier_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_querier_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); return 0; } @@ -652,13 +652,13 @@ static ssize_t multicast_query_interval_show(struct device *d, { struct net_bridge *br = to_bridge(d); return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_query_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval)); } static int set_query_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_query_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_query_interval = clock_t_to_jiffies(val); return 0; } @@ -676,13 +676,13 @@ static ssize_t multicast_query_response_interval_show( struct net_bridge *br = to_bridge(d); return sprintf( buf, "%lu\n", - jiffies_to_clock_t(br->multicast_query_response_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval)); } static int set_query_response_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_query_response_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); return 0; } @@ -700,13 +700,13 @@ static ssize_t multicast_startup_query_interval_show( struct net_bridge *br = to_bridge(d); return sprintf( buf, "%lu\n", - jiffies_to_clock_t(br->multicast_startup_query_interval)); + jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval)); } static int set_startup_query_interval(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - br->multicast_startup_query_interval = clock_t_to_jiffies(val); + br->multicast_ctx.multicast_startup_query_interval = clock_t_to_jiffies(val); return 0; } @@ -751,13 +751,13 @@ static ssize_t multicast_mld_version_show(struct device *d, { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_mld_version); + return sprintf(buf, "%u\n", br->multicast_ctx.multicast_mld_version); } static int set_multicast_mld_version(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { - return br_multicast_set_mld_version(br, val); + return br_multicast_set_mld_version(&br->multicast_ctx, val); } static ssize_t multicast_mld_version_store(struct device *d, diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 72e92376eef1..07fa76080512 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -244,13 +244,13 @@ BRPORT_ATTR_FLAG(isolated, BR_ISOLATED); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->multicast_router); + return sprintf(buf, "%d\n", p->multicast_ctx.multicast_router); } static int store_multicast_router(struct net_bridge_port *p, unsigned long v) { - return br_multicast_set_port_router(p, v); + return br_multicast_set_port_router(&p->multicast_ctx, v); } static BRPORT_ATTR(multicast_router, 0644, show_multicast_router, store_multicast_router); diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index a08e9f193009..49e105e0a447 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -190,6 +190,8 @@ static void br_vlan_put_master(struct net_bridge_vlan *masterv) rhashtable_remove_fast(&vg->vlan_hash, &masterv->vnode, br_vlan_rht_params); __vlan_del_list(masterv); + br_multicast_toggle_one_vlan(masterv, false); + br_multicast_ctx_deinit(&masterv->br_mcast_ctx); call_rcu(&masterv->rcu, br_master_vlan_rcu_free); } } @@ -280,15 +282,18 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, } else { v->stats = masterv->stats; } + br_multicast_port_ctx_init(p, v, &v->port_mcast_ctx); } else { err = br_switchdev_port_vlan_add(dev, v->vid, flags, extack); if (err && err != -EOPNOTSUPP) goto out; + br_multicast_ctx_init(br, v, &v->br_mcast_ctx); + v->priv_flags |= BR_VLFLAG_GLOBAL_MCAST_ENABLED; } /* Add the dev mac and count the vlan only if it's usable */ if (br_vlan_should_use(v)) { - err = br_fdb_insert(br, p, dev->dev_addr, v->vid); + err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); if (err) { br_err(br, "failed insert local address into bridge forwarding table\n"); goto out_filt; @@ -306,6 +311,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, __vlan_add_list(v); __vlan_add_flags(v, flags); + br_multicast_toggle_one_vlan(v, true); if (p) nbp_vlan_set_vlan_dev_state(p, v->vid); @@ -374,6 +380,8 @@ static int __vlan_del(struct net_bridge_vlan *v) br_vlan_rht_params); __vlan_del_list(v); nbp_vlan_set_vlan_dev_state(p, v->vid); + br_multicast_toggle_one_vlan(v, false); + br_multicast_port_ctx_deinit(&v->port_mcast_ctx); call_rcu(&v->rcu, nbp_vlan_rcu_free); } @@ -457,7 +465,15 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br, u64_stats_update_end(&stats->syncp); } - if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED) + /* If the skb will be sent using forwarding offload, the assumption is + * that the switchdev will inject the packet into hardware together + * with the bridge VLAN, so that it can be forwarded according to that + * VLAN. The switchdev should deal with popping the VLAN header in + * hardware on each egress port as appropriate. So only strip the VLAN + * header if forwarding offload is not being used. + */ + if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED && + !br_switchdev_frame_uses_tx_fwd_offload(skb)) __vlan_hwaccel_clear_tag(skb); if (p && (p->flags & BR_VLAN_TUNNEL) && @@ -473,7 +489,8 @@ out: static bool __allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, u16 *vid, - u8 *state) + u8 *state, + struct net_bridge_vlan **vlan) { struct pcpu_sw_netstats *stats; struct net_bridge_vlan *v; @@ -538,8 +555,9 @@ static bool __allowed_ingress(const struct net_bridge *br, */ skb->vlan_tci |= pvid; - /* if stats are disabled we can avoid the lookup */ - if (!br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { + /* if snooping and stats are disabled we can avoid the lookup */ + if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && + !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) { if (*state == BR_STATE_FORWARDING) { *state = br_vlan_get_pvid_state(vg); return br_vlan_state_allowed(*state, true); @@ -566,6 +584,8 @@ static bool __allowed_ingress(const struct net_bridge *br, u64_stats_update_end(&stats->syncp); } + *vlan = v; + return true; drop: @@ -575,17 +595,19 @@ drop: bool br_allowed_ingress(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct sk_buff *skb, - u16 *vid, u8 *state) + u16 *vid, u8 *state, + struct net_bridge_vlan **vlan) { /* If VLAN filtering is disabled on the bridge, all packets are * permitted. */ + *vlan = NULL; if (!br_opt_get(br, BROPT_VLAN_ENABLED)) { BR_INPUT_SKB_CB(skb)->vlan_filtered = false; return true; } - return __allowed_ingress(br, vg, skb, vid, state); + return __allowed_ingress(br, vg, skb, vid, state, vlan); } /* Called under RCU. */ @@ -661,8 +683,7 @@ static int br_vlan_add_existing(struct net_bridge *br, goto err_flags; } /* It was only kept for port vlans, now make it real */ - err = br_fdb_insert(br, NULL, br->dev->dev_addr, - vlan->vid); + err = br_fdb_add_local(br, NULL, br->dev->dev_addr, vlan->vid); if (err) { br_err(br, "failed to insert local address into bridge forwarding table\n"); goto err_fdb_insert; @@ -672,6 +693,7 @@ static int br_vlan_add_existing(struct net_bridge *br, vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY; vg->num_vlans++; *changed = true; + br_multicast_toggle_one_vlan(vlan, true); } if (__vlan_add_flags(vlan, flags)) @@ -818,14 +840,21 @@ int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val, if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val) return 0; + br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val); + err = switchdev_port_attr_set(br->dev, &attr, extack); - if (err && err != -EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) { + br_opt_toggle(br, BROPT_VLAN_ENABLED, !val); return err; + } - br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val); br_manage_promisc(br); recalculate_group_addr(br); br_recalculate_fwd_mask(br); + if (!val && br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) { + br_info(br, "vlan filtering disabled, automatically disabling multicast vlan snooping\n"); + br_multicast_toggle_vlan_snooping(br, false, NULL); + } return 0; } @@ -1420,6 +1449,33 @@ int br_vlan_get_info(const struct net_device *dev, u16 vid, } EXPORT_SYMBOL_GPL(br_vlan_get_info); +int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid, + struct bridge_vlan_info *p_vinfo) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_bridge_port *p; + + p = br_port_get_check_rcu(dev); + if (p) + vg = nbp_vlan_group_rcu(p); + else if (netif_is_bridge_master(dev)) + vg = br_vlan_group_rcu(netdev_priv(dev)); + else + return -EINVAL; + + v = br_vlan_find(vg, vid); + if (!v) + return -ENOENT; + + p_vinfo->vid = vid; + p_vinfo->flags = v->flags; + if (vid == br_get_pvid(vg)) + p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID; + return 0; +} +EXPORT_SYMBOL_GPL(br_vlan_get_info_rcu); + static int br_vlan_is_bind_vlan_dev(const struct net_device *dev) { return is_vlan_dev(dev) && @@ -1804,88 +1860,6 @@ out_kfree: kfree_skb(skb); } -static int br_vlan_replay_one(struct notifier_block *nb, - struct net_device *dev, - struct switchdev_obj_port_vlan *vlan, - const void *ctx, unsigned long action, - struct netlink_ext_ack *extack) -{ - struct switchdev_notifier_port_obj_info obj_info = { - .info = { - .dev = dev, - .extack = extack, - .ctx = ctx, - }, - .obj = &vlan->obj, - }; - int err; - - err = nb->notifier_call(nb, action, &obj_info); - return notifier_to_errno(err); -} - -int br_vlan_replay(struct net_device *br_dev, struct net_device *dev, - const void *ctx, bool adding, struct notifier_block *nb, - struct netlink_ext_ack *extack) -{ - struct net_bridge_vlan_group *vg; - struct net_bridge_vlan *v; - struct net_bridge_port *p; - struct net_bridge *br; - unsigned long action; - int err = 0; - u16 pvid; - - ASSERT_RTNL(); - - if (!netif_is_bridge_master(br_dev)) - return -EINVAL; - - if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) - return -EINVAL; - - if (netif_is_bridge_master(dev)) { - br = netdev_priv(dev); - vg = br_vlan_group(br); - p = NULL; - } else { - p = br_port_get_rtnl(dev); - if (WARN_ON(!p)) - return -EINVAL; - vg = nbp_vlan_group(p); - br = p->br; - } - - if (!vg) - return 0; - - if (adding) - action = SWITCHDEV_PORT_OBJ_ADD; - else - action = SWITCHDEV_PORT_OBJ_DEL; - - pvid = br_get_pvid(vg); - - list_for_each_entry(v, &vg->vlan_list, vlist) { - struct switchdev_obj_port_vlan vlan = { - .obj.orig_dev = dev, - .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN, - .flags = br_vlan_flags(v, pvid), - .vid = v->vid, - }; - - if (!br_vlan_should_use(v)) - continue; - - err = br_vlan_replay_one(nb, dev, &vlan, ctx, action, extack); - if (err) - return err; - } - - return err; -} -EXPORT_SYMBOL_GPL(br_vlan_replay); - /* check if v_curr can enter a range ending in range_end */ bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end) @@ -1901,6 +1875,7 @@ static int br_vlan_dump_dev(const struct net_device *dev, u32 dump_flags) { struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL; + bool dump_global = !!(dump_flags & BRIDGE_VLANDB_DUMPF_GLOBAL); bool dump_stats = !!(dump_flags & BRIDGE_VLANDB_DUMPF_STATS); struct net_bridge_vlan_group *vg; int idx = 0, s_idx = cb->args[1]; @@ -1919,6 +1894,10 @@ static int br_vlan_dump_dev(const struct net_device *dev, vg = br_vlan_group_rcu(br); p = NULL; } else { + /* global options are dumped only for bridge devices */ + if (dump_global) + return 0; + p = br_port_get_rcu(dev); if (WARN_ON(!p)) return -EINVAL; @@ -1941,7 +1920,7 @@ static int br_vlan_dump_dev(const struct net_device *dev, /* idx must stay at range's beginning until it is filled in */ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) { - if (!br_vlan_should_use(v)) + if (!dump_global && !br_vlan_should_use(v)) continue; if (idx < s_idx) { idx++; @@ -1954,8 +1933,21 @@ static int br_vlan_dump_dev(const struct net_device *dev, continue; } - if (dump_stats || v->vid == pvid || - !br_vlan_can_enter_range(v, range_end)) { + if (dump_global) { + if (br_vlan_global_opts_can_enter_range(v, range_end)) + goto update_end; + if (!br_vlan_global_opts_fill(skb, range_start->vid, + range_end->vid, + range_start)) { + err = -EMSGSIZE; + break; + } + /* advance number of filled vlans */ + idx += range_end->vid - range_start->vid + 1; + + range_start = v; + } else if (dump_stats || v->vid == pvid || + !br_vlan_can_enter_range(v, range_end)) { u16 vlan_flags = br_vlan_flags(range_start, pvid); if (!br_vlan_fill_vids(skb, range_start->vid, @@ -1969,6 +1961,7 @@ static int br_vlan_dump_dev(const struct net_device *dev, range_start = v; } +update_end: range_end = v; } @@ -1977,11 +1970,18 @@ static int br_vlan_dump_dev(const struct net_device *dev, * - last vlan (range_start == range_end, not in range) * - last vlan range (range_start != range_end, in range) */ - if (!err && range_start && - !br_vlan_fill_vids(skb, range_start->vid, range_end->vid, - range_start, br_vlan_flags(range_start, pvid), - dump_stats)) - err = -EMSGSIZE; + if (!err && range_start) { + if (dump_global && + !br_vlan_global_opts_fill(skb, range_start->vid, + range_end->vid, range_start)) + err = -EMSGSIZE; + else if (!dump_global && + !br_vlan_fill_vids(skb, range_start->vid, + range_end->vid, range_start, + br_vlan_flags(range_start, pvid), + dump_stats)) + err = -EMSGSIZE; + } cb->args[1] = err ? idx : 0; @@ -2051,6 +2051,7 @@ static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 }, [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED }, + [BRIDGE_VLANDB_ENTRY_MCAST_ROUTER] = { .type = NLA_U8 }, }; static int br_vlan_rtm_process_one(struct net_device *dev, @@ -2185,12 +2186,22 @@ static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh, } nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) { - if (nla_type(attr) != BRIDGE_VLANDB_ENTRY) + switch (nla_type(attr)) { + case BRIDGE_VLANDB_ENTRY: + err = br_vlan_rtm_process_one(dev, attr, + nlh->nlmsg_type, + extack); + break; + case BRIDGE_VLANDB_GLOBAL_OPTIONS: + err = br_vlan_rtm_process_global_options(dev, attr, + nlh->nlmsg_type, + extack); + break; + default: continue; + } vlans++; - err = br_vlan_rtm_process_one(dev, attr, nlh->nlmsg_type, - extack); if (err) break; } diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c index b4add9ea8964..8ffd4ed2563c 100644 --- a/net/bridge/br_vlan_options.c +++ b/net/bridge/br_vlan_options.c @@ -40,22 +40,38 @@ static bool __vlan_tun_can_enter_range(const struct net_bridge_vlan *v_curr, bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end) { + u8 range_mc_rtr = br_vlan_multicast_router(range_end); + u8 curr_mc_rtr = br_vlan_multicast_router(v_curr); + return v_curr->state == range_end->state && - __vlan_tun_can_enter_range(v_curr, range_end); + __vlan_tun_can_enter_range(v_curr, range_end) && + curr_mc_rtr == range_mc_rtr; } bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v) { - return !nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, - br_vlan_get_state(v)) && - __vlan_tun_put(skb, v); + if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, br_vlan_get_state(v)) || + !__vlan_tun_put(skb, v)) + return false; + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_MCAST_ROUTER, + br_vlan_multicast_router(v))) + return false; +#endif + + return true; } size_t br_vlan_opts_nl_size(void) { return nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_STATE */ + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY_TUNNEL_INFO */ - + nla_total_size(sizeof(u32)); /* BRIDGE_VLANDB_TINFO_ID */ + + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_TINFO_ID */ +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_MCAST_ROUTER */ +#endif + + 0; } static int br_vlan_modify_state(struct net_bridge_vlan_group *vg, @@ -181,6 +197,18 @@ static int br_vlan_process_one_opts(const struct net_bridge *br, return err; } +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (tb[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]) { + u8 val; + + val = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]); + err = br_multicast_set_vlan_router(v, val); + if (err) + return err; + *changed = true; + } +#endif + return 0; } @@ -258,3 +286,392 @@ int br_vlan_process_options(const struct net_bridge *br, return err; } + +bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, + const struct net_bridge_vlan *r_end) +{ + return v_curr->vid - r_end->vid == 1 && + ((v_curr->priv_flags ^ r_end->priv_flags) & + BR_VLFLAG_GLOBAL_MCAST_ENABLED) == 0 && + br_multicast_ctx_options_equal(&v_curr->br_mcast_ctx, + &r_end->br_mcast_ctx); +} + +bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, + const struct net_bridge_vlan *v_opts) +{ + struct nlattr *nest2 __maybe_unused; + u64 clockval __maybe_unused; + struct nlattr *nest; + + nest = nla_nest_start(skb, BRIDGE_VLANDB_GLOBAL_OPTIONS); + if (!nest) + return false; + + if (nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_ID, vid)) + goto out_err; + + if (vid_range && vid < vid_range && + nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_RANGE, vid_range)) + goto out_err; + +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING, + !!(v_opts->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) || + nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION, + v_opts->br_mcast_ctx.multicast_igmp_version) || + nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT, + v_opts->br_mcast_ctx.multicast_last_member_count) || + nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT, + v_opts->br_mcast_ctx.multicast_startup_query_count) || + nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, + v_opts->br_mcast_ctx.multicast_querier) || + br_multicast_dump_querier_state(skb, &v_opts->br_mcast_ctx, + BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE)) + goto out_err; + + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_last_member_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_membership_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_querier_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_query_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_query_response_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; + clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_startup_query_interval); + if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL, + clockval, BRIDGE_VLANDB_GOPTS_PAD)) + goto out_err; + + if (br_rports_have_mc_router(&v_opts->br_mcast_ctx)) { + nest2 = nla_nest_start(skb, + BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS); + if (!nest2) + goto out_err; + + rcu_read_lock(); + if (br_rports_fill_info(skb, &v_opts->br_mcast_ctx)) { + rcu_read_unlock(); + nla_nest_cancel(skb, nest2); + goto out_err; + } + rcu_read_unlock(); + + nla_nest_end(skb, nest2); + } + +#if IS_ENABLED(CONFIG_IPV6) + if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, + v_opts->br_mcast_ctx.multicast_mld_version)) + goto out_err; +#endif +#endif + + nla_nest_end(skb, nest); + + return true; + +out_err: + nla_nest_cancel(skb, nest); + return false; +} + +static size_t rtnl_vlan_global_opts_nlmsg_size(const struct net_bridge_vlan *v) +{ + return NLMSG_ALIGN(sizeof(struct br_vlan_msg)) + + nla_total_size(0) /* BRIDGE_VLANDB_GLOBAL_OPTIONS */ + + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_GOPTS_ID */ +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING */ + + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION */ + + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION */ + + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT */ + + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT */ + + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL */ + + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL */ + + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL */ + + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL */ + + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL */ + + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL */ + + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER */ + + br_multicast_querier_state_size() /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE */ + + nla_total_size(0) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */ + + br_rports_size(&v->br_mcast_ctx) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */ +#endif + + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ +} + +static void br_vlan_global_opts_notify(const struct net_bridge *br, + u16 vid, u16 vid_range) +{ + struct net_bridge_vlan *v; + struct br_vlan_msg *bvm; + struct nlmsghdr *nlh; + struct sk_buff *skb; + int err = -ENOBUFS; + + /* right now notifications are done only with rtnl held */ + ASSERT_RTNL(); + + /* need to find the vlan due to flags/options */ + v = br_vlan_find(br_vlan_group(br), vid); + if (!v) + return; + + skb = nlmsg_new(rtnl_vlan_global_opts_nlmsg_size(v), GFP_KERNEL); + if (!skb) + goto out_err; + + err = -EMSGSIZE; + nlh = nlmsg_put(skb, 0, 0, RTM_NEWVLAN, sizeof(*bvm), 0); + if (!nlh) + goto out_err; + bvm = nlmsg_data(nlh); + memset(bvm, 0, sizeof(*bvm)); + bvm->family = AF_BRIDGE; + bvm->ifindex = br->dev->ifindex; + + if (!br_vlan_global_opts_fill(skb, vid, vid_range, v)) + goto out_err; + + nlmsg_end(skb, nlh); + rtnl_notify(skb, dev_net(br->dev), 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL); + return; + +out_err: + rtnl_set_sk_err(dev_net(br->dev), RTNLGRP_BRVLAN, err); + kfree_skb(skb); +} + +static int br_vlan_process_global_one_opts(const struct net_bridge *br, + struct net_bridge_vlan_group *vg, + struct net_bridge_vlan *v, + struct nlattr **tb, + bool *changed, + struct netlink_ext_ack *extack) +{ + int err __maybe_unused; + + *changed = false; +#ifdef CONFIG_BRIDGE_IGMP_SNOOPING + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]) { + u8 mc_snooping; + + mc_snooping = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]); + if (br_multicast_toggle_global_vlan(v, !!mc_snooping)) + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]) { + u8 ver; + + ver = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]); + err = br_multicast_set_igmp_version(&v->br_mcast_ctx, ver); + if (err) + return err; + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]) { + u32 cnt; + + cnt = nla_get_u32(tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]); + v->br_mcast_ctx.multicast_last_member_count = cnt; + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]) { + u32 cnt; + + cnt = nla_get_u32(tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]); + v->br_mcast_ctx.multicast_startup_query_count = cnt; + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]) { + u64 val; + + val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]); + v->br_mcast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]) { + u64 val; + + val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]); + v->br_mcast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]) { + u64 val; + + val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]); + v->br_mcast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]) { + u64 val; + + val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]); + v->br_mcast_ctx.multicast_query_interval = clock_t_to_jiffies(val); + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL]) { + u64 val; + + val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL]); + v->br_mcast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL]) { + u64 val; + + val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL]); + v->br_mcast_ctx.multicast_startup_query_interval = clock_t_to_jiffies(val); + *changed = true; + } + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]) { + u8 val; + + val = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]); + err = br_multicast_set_querier(&v->br_mcast_ctx, val); + if (err) + return err; + *changed = true; + } +#if IS_ENABLED(CONFIG_IPV6) + if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) { + u8 ver; + + ver = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]); + err = br_multicast_set_mld_version(&v->br_mcast_ctx, ver); + if (err) + return err; + *changed = true; + } +#endif +#endif + + return 0; +} + +static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { + [BRIDGE_VLANDB_GOPTS_ID] = { .type = NLA_U16 }, + [BRIDGE_VLANDB_GOPTS_RANGE] = { .type = NLA_U16 }, + [BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING] = { .type = NLA_U8 }, + [BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION] = { .type = NLA_U8 }, + [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL] = { .type = NLA_U64 }, + [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER] = { .type = NLA_U8 }, + [BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, + [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, + [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, + [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 }, + [BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 }, + [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL] = { .type = NLA_U64 }, + [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 }, + [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 }, +}; + +int br_vlan_rtm_process_global_options(struct net_device *dev, + const struct nlattr *attr, + int cmd, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan *v, *curr_start = NULL, *curr_end = NULL; + struct nlattr *tb[BRIDGE_VLANDB_GOPTS_MAX + 1]; + struct net_bridge_vlan_group *vg; + u16 vid, vid_range = 0; + struct net_bridge *br; + int err = 0; + + if (cmd != RTM_NEWVLAN) { + NL_SET_ERR_MSG_MOD(extack, "Global vlan options support only set operation"); + return -EINVAL; + } + if (!netif_is_bridge_master(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Global vlan options can only be set on bridge device"); + return -EINVAL; + } + br = netdev_priv(dev); + vg = br_vlan_group(br); + if (WARN_ON(!vg)) + return -ENODEV; + + err = nla_parse_nested(tb, BRIDGE_VLANDB_GOPTS_MAX, attr, + br_vlan_db_gpol, extack); + if (err) + return err; + + if (!tb[BRIDGE_VLANDB_GOPTS_ID]) { + NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry id"); + return -EINVAL; + } + vid = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_ID]); + if (!br_vlan_valid_id(vid, extack)) + return -EINVAL; + + if (tb[BRIDGE_VLANDB_GOPTS_RANGE]) { + vid_range = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_RANGE]); + if (!br_vlan_valid_id(vid_range, extack)) + return -EINVAL; + if (vid >= vid_range) { + NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id"); + return -EINVAL; + } + } else { + vid_range = vid; + } + + for (; vid <= vid_range; vid++) { + bool changed = false; + + v = br_vlan_find(vg, vid); + if (!v) { + NL_SET_ERR_MSG_MOD(extack, "Vlan in range doesn't exist, can't process global options"); + err = -ENOENT; + break; + } + + err = br_vlan_process_global_one_opts(br, vg, v, tb, &changed, + extack); + if (err) + break; + + if (changed) { + /* vlan options changed, check for range */ + if (!curr_start) { + curr_start = v; + curr_end = v; + continue; + } + + if (!br_vlan_global_opts_can_enter_range(v, curr_end)) { + br_vlan_global_opts_notify(br, curr_start->vid, + curr_end->vid); + curr_start = v; + } + curr_end = v; + } else { + /* nothing changed and nothing to notify yet */ + if (!curr_start) + continue; + + br_vlan_global_opts_notify(br, curr_start->vid, + curr_end->vid); + curr_start = NULL; + curr_end = NULL; + } + } + if (curr_start) + br_vlan_global_opts_notify(br, curr_start->vid, curr_end->vid); + + return err; +} diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index 01017448ebde..6399a8a69d07 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -158,30 +158,28 @@ void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg) rhashtable_destroy(&vg->tunnel_hash); } -int br_handle_ingress_vlan_tunnel(struct sk_buff *skb, - struct net_bridge_port *p, - struct net_bridge_vlan_group *vg) +void br_handle_ingress_vlan_tunnel(struct sk_buff *skb, + struct net_bridge_port *p, + struct net_bridge_vlan_group *vg) { struct ip_tunnel_info *tinfo = skb_tunnel_info(skb); struct net_bridge_vlan *vlan; if (!vg || !tinfo) - return 0; + return; /* if already tagged, ignore */ if (skb_vlan_tagged(skb)) - return 0; + return; /* lookup vid, given tunnel id */ vlan = br_vlan_tunnel_lookup(&vg->tunnel_hash, tinfo->key.tun_id); if (!vlan) - return 0; + return; skb_dst_drop(skb); __vlan_hwaccel_put_tag(skb, p->br->vlan_proto, vlan->vid); - - return 0; } int br_handle_egress_vlan_tunnel(struct sk_buff *skb, diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 020b1487ee0c..1a11064f9990 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -66,7 +66,7 @@ static unsigned int ebt_broute(void *priv, struct sk_buff *skb, NFPROTO_BRIDGE, s->in, NULL, NULL, s->net, NULL); - ret = ebt_do_table(skb, &state, priv); + ret = ebt_do_table(priv, skb, &state); if (ret != NF_DROP) return ret; @@ -98,7 +98,7 @@ static const struct nf_hook_ops ebt_ops_broute = { .priority = NF_BR_PRI_FIRST, }; -static int __net_init broute_net_init(struct net *net) +static int broute_table_init(struct net *net) { return ebt_register_table(net, &broute_table, &ebt_ops_broute); } @@ -114,19 +114,30 @@ static void __net_exit broute_net_exit(struct net *net) } static struct pernet_operations broute_net_ops = { - .init = broute_net_init, .exit = broute_net_exit, .pre_exit = broute_net_pre_exit, }; static int __init ebtable_broute_init(void) { - return register_pernet_subsys(&broute_net_ops); + int ret = ebt_register_template(&broute_table, broute_table_init); + + if (ret) + return ret; + + ret = register_pernet_subsys(&broute_net_ops); + if (ret) { + ebt_unregister_template(&broute_table); + return ret; + } + + return 0; } static void __exit ebtable_broute_fini(void) { unregister_pernet_subsys(&broute_net_ops); + ebt_unregister_template(&broute_table); } module_init(ebtable_broute_init); diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 8ec0b3736803..cb949436bc0e 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -58,35 +58,28 @@ static const struct ebt_table frame_filter = { .me = THIS_MODULE, }; -static unsigned int -ebt_filter_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ebt_do_table(skb, state, priv); -} - static const struct nf_hook_ops ebt_ops_filter[] = { { - .hook = ebt_filter_hook, + .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_IN, .priority = NF_BR_PRI_FILTER_BRIDGED, }, { - .hook = ebt_filter_hook, + .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_FORWARD, .priority = NF_BR_PRI_FILTER_BRIDGED, }, { - .hook = ebt_filter_hook, + .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_OUT, .priority = NF_BR_PRI_FILTER_OTHER, }, }; -static int __net_init frame_filter_net_init(struct net *net) +static int frame_filter_table_init(struct net *net) { return ebt_register_table(net, &frame_filter, ebt_ops_filter); } @@ -102,19 +95,30 @@ static void __net_exit frame_filter_net_exit(struct net *net) } static struct pernet_operations frame_filter_net_ops = { - .init = frame_filter_net_init, .exit = frame_filter_net_exit, .pre_exit = frame_filter_net_pre_exit, }; static int __init ebtable_filter_init(void) { - return register_pernet_subsys(&frame_filter_net_ops); + int ret = ebt_register_template(&frame_filter, frame_filter_table_init); + + if (ret) + return ret; + + ret = register_pernet_subsys(&frame_filter_net_ops); + if (ret) { + ebt_unregister_template(&frame_filter); + return ret; + } + + return 0; } static void __exit ebtable_filter_fini(void) { unregister_pernet_subsys(&frame_filter_net_ops); + ebt_unregister_template(&frame_filter); } module_init(ebtable_filter_init); diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 7c8a1064a531..5ee0531ae506 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -58,34 +58,28 @@ static const struct ebt_table frame_nat = { .me = THIS_MODULE, }; -static unsigned int ebt_nat_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ebt_do_table(skb, state, priv); -} - static const struct nf_hook_ops ebt_ops_nat[] = { { - .hook = ebt_nat_hook, + .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_LOCAL_OUT, .priority = NF_BR_PRI_NAT_DST_OTHER, }, { - .hook = ebt_nat_hook, + .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_POST_ROUTING, .priority = NF_BR_PRI_NAT_SRC, }, { - .hook = ebt_nat_hook, + .hook = ebt_do_table, .pf = NFPROTO_BRIDGE, .hooknum = NF_BR_PRE_ROUTING, .priority = NF_BR_PRI_NAT_DST_BRIDGED, }, }; -static int __net_init frame_nat_net_init(struct net *net) +static int frame_nat_table_init(struct net *net) { return ebt_register_table(net, &frame_nat, ebt_ops_nat); } @@ -101,19 +95,30 @@ static void __net_exit frame_nat_net_exit(struct net *net) } static struct pernet_operations frame_nat_net_ops = { - .init = frame_nat_net_init, .exit = frame_nat_net_exit, .pre_exit = frame_nat_net_pre_exit, }; static int __init ebtable_nat_init(void) { - return register_pernet_subsys(&frame_nat_net_ops); + int ret = ebt_register_template(&frame_nat, frame_nat_table_init); + + if (ret) + return ret; + + ret = register_pernet_subsys(&frame_nat_net_ops); + if (ret) { + ebt_unregister_template(&frame_nat); + return ret; + } + + return ret; } static void __exit ebtable_nat_fini(void) { unregister_pernet_subsys(&frame_nat_net_ops); + ebt_unregister_template(&frame_nat); } module_init(ebtable_nat_init); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index f022deb3721e..f2dbefb61ce8 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -44,7 +44,16 @@ struct ebt_pernet { struct list_head tables; }; +struct ebt_template { + struct list_head list; + char name[EBT_TABLE_MAXNAMELEN]; + struct module *owner; + /* called when table is needed in the given netns */ + int (*table_init)(struct net *net); +}; + static unsigned int ebt_pernet_id __read_mostly; +static LIST_HEAD(template_tables); static DEFINE_MUTEX(ebt_mutex); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT @@ -180,10 +189,10 @@ ebt_get_target_c(const struct ebt_entry *e) } /* Do some firewalling */ -unsigned int ebt_do_table(struct sk_buff *skb, - const struct nf_hook_state *state, - struct ebt_table *table) +unsigned int ebt_do_table(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) { + struct ebt_table *table = priv; unsigned int hook = state->hook; int i, nentries; struct ebt_entry *point; @@ -309,30 +318,57 @@ letscontinue: /* If it succeeds, returns element and locks mutex */ static inline void * -find_inlist_lock_noload(struct list_head *head, const char *name, int *error, +find_inlist_lock_noload(struct net *net, const char *name, int *error, struct mutex *mutex) { - struct { - struct list_head list; - char name[EBT_FUNCTION_MAXNAMELEN]; - } *e; + struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); + struct ebt_template *tmpl; + struct ebt_table *table; mutex_lock(mutex); - list_for_each_entry(e, head, list) { - if (strcmp(e->name, name) == 0) - return e; + list_for_each_entry(table, &ebt_net->tables, list) { + if (strcmp(table->name, name) == 0) + return table; } + + list_for_each_entry(tmpl, &template_tables, list) { + if (strcmp(name, tmpl->name) == 0) { + struct module *owner = tmpl->owner; + + if (!try_module_get(owner)) + goto out; + + mutex_unlock(mutex); + + *error = tmpl->table_init(net); + if (*error) { + module_put(owner); + return NULL; + } + + mutex_lock(mutex); + module_put(owner); + break; + } + } + + list_for_each_entry(table, &ebt_net->tables, list) { + if (strcmp(table->name, name) == 0) + return table; + } + +out: *error = -ENOENT; mutex_unlock(mutex); return NULL; } static void * -find_inlist_lock(struct list_head *head, const char *name, const char *prefix, +find_inlist_lock(struct net *net, const char *name, const char *prefix, int *error, struct mutex *mutex) { return try_then_request_module( - find_inlist_lock_noload(head, name, error, mutex), + find_inlist_lock_noload(net, name, error, mutex), "%s%s", prefix, name); } @@ -340,10 +376,7 @@ static inline struct ebt_table * find_table_lock(struct net *net, const char *name, int *error, struct mutex *mutex) { - struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); - - return find_inlist_lock(&ebt_net->tables, name, - "ebtable_", error, mutex); + return find_inlist_lock(net, name, "ebtable_", error, mutex); } static inline void ebt_free_table_info(struct ebt_table_info *info) @@ -893,7 +926,9 @@ static int translate_table(struct net *net, const char *name, return -ENOMEM; for_each_possible_cpu(i) { newinfo->chainstack[i] = - vmalloc(array_size(udc_cnt, sizeof(*(newinfo->chainstack[0])))); + vmalloc_node(array_size(udc_cnt, + sizeof(*(newinfo->chainstack[0]))), + cpu_to_node(i)); if (!newinfo->chainstack[i]) { while (i) vfree(newinfo->chainstack[--i]); @@ -1038,7 +1073,7 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, */ if (repl->num_counters && copy_to_user(repl->counters, counterstmp, - repl->num_counters * sizeof(struct ebt_counter))) { + array_size(repl->num_counters, sizeof(struct ebt_counter)))) { /* Silent error, can't fail, new table is already in place */ net_warn_ratelimited("ebtables: counters copy to user failed while replacing table\n"); } @@ -1258,6 +1293,54 @@ out: return ret; } +int ebt_register_template(const struct ebt_table *t, int (*table_init)(struct net *net)) +{ + struct ebt_template *tmpl; + + mutex_lock(&ebt_mutex); + list_for_each_entry(tmpl, &template_tables, list) { + if (WARN_ON_ONCE(strcmp(t->name, tmpl->name) == 0)) { + mutex_unlock(&ebt_mutex); + return -EEXIST; + } + } + + tmpl = kzalloc(sizeof(*tmpl), GFP_KERNEL); + if (!tmpl) { + mutex_unlock(&ebt_mutex); + return -ENOMEM; + } + + tmpl->table_init = table_init; + strscpy(tmpl->name, t->name, sizeof(tmpl->name)); + tmpl->owner = t->me; + list_add(&tmpl->list, &template_tables); + + mutex_unlock(&ebt_mutex); + return 0; +} +EXPORT_SYMBOL(ebt_register_template); + +void ebt_unregister_template(const struct ebt_table *t) +{ + struct ebt_template *tmpl; + + mutex_lock(&ebt_mutex); + list_for_each_entry(tmpl, &template_tables, list) { + if (strcmp(t->name, tmpl->name)) + continue; + + list_del(&tmpl->list); + mutex_unlock(&ebt_mutex); + kfree(tmpl); + return; + } + + mutex_unlock(&ebt_mutex); + WARN_ON_ONCE(1); +} +EXPORT_SYMBOL(ebt_unregister_template); + static struct ebt_table *__ebt_find_table(struct net *net, const char *name) { struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id); @@ -1318,7 +1401,8 @@ static int do_update_counters(struct net *net, const char *name, goto unlock_mutex; } - if (copy_from_user(tmp, counters, num_counters * sizeof(*counters))) { + if (copy_from_user(tmp, counters, + array_size(num_counters, sizeof(*counters)))) { ret = -EFAULT; goto unlock_mutex; } @@ -1451,7 +1535,7 @@ static int copy_counters_to_user(struct ebt_table *t, write_unlock_bh(&t->lock); if (copy_to_user(user, counterstmp, - nentries * sizeof(struct ebt_counter))) + array_size(nentries, sizeof(struct ebt_counter)))) ret = -EFAULT; vfree(counterstmp); return ret; diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c index b02e1292f7f1..4be6b04879a1 100644 --- a/net/caif/caif_usb.c +++ b/net/caif/caif_usb.c @@ -81,7 +81,7 @@ static void cfusbl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, layr->up->ctrlcmd(layr->up, ctrl, layr->id); } -static struct cflayer *cfusbl_create(int phyid, u8 ethaddr[ETH_ALEN], +static struct cflayer *cfusbl_create(int phyid, const u8 ethaddr[ETH_ALEN], u8 braddr[ETH_ALEN]) { struct cfusbl *this = kmalloc(sizeof(struct cfusbl), GFP_ATOMIC); diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 37b67194c0df..414dc5671c45 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -53,20 +53,6 @@ struct chnl_net { enum caif_states state; }; -static void robust_list_del(struct list_head *delete_node) -{ - struct list_head *list_node; - struct list_head *n; - ASSERT_RTNL(); - list_for_each_safe(list_node, n, &chnl_net_list) { - if (list_node == delete_node) { - list_del(list_node); - return; - } - } - WARN_ON(1); -} - static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt) { struct sk_buff *skb; @@ -364,6 +350,7 @@ static int chnl_net_init(struct net_device *dev) ASSERT_RTNL(); priv = netdev_priv(dev); strncpy(priv->name, dev->name, sizeof(priv->name)); + INIT_LIST_HEAD(&priv->list_field); return 0; } @@ -372,7 +359,7 @@ static void chnl_net_uninit(struct net_device *dev) struct chnl_net *priv; ASSERT_RTNL(); priv = netdev_priv(dev); - robust_list_del(&priv->list_field); + list_del_init(&priv->list_field); } static const struct net_device_ops netdev_ops = { @@ -537,7 +524,7 @@ static void __exit chnl_exit_module(void) rtnl_lock(); list_for_each_safe(list_node, _tmp, &chnl_net_list) { dev = list_entry(list_node, struct chnl_net, list_field); - list_del(list_node); + list_del_init(list_node); delete_device(dev); } rtnl_unlock(); diff --git a/net/can/bcm.c b/net/can/bcm.c index 508f67de0b80..bc88d901a1c0 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -625,7 +625,7 @@ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer) struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer); if (bcm_rx_thr_flush(op)) { - hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2); + hrtimer_forward_now(hrtimer, op->kt_ival2); return HRTIMER_RESTART; } else { /* rearm throttle handling */ diff --git a/net/can/isotp.c b/net/can/isotp.c index caaa532ece94..df6968b28bf4 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -121,7 +121,7 @@ enum { struct tpcon { int idx; int len; - u8 state; + u32 state; u8 bs; u8 sn; u8 ll_dl; @@ -848,6 +848,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); + u32 old_state = so->tx.state; struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf; @@ -860,45 +861,55 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) return -EADDRNOTAVAIL; /* we do not support multiple buffers - for now */ - if (so->tx.state != ISOTP_IDLE || wq_has_sleeper(&so->wait)) { - if (msg->msg_flags & MSG_DONTWAIT) - return -EAGAIN; + if (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE || + wq_has_sleeper(&so->wait)) { + if (msg->msg_flags & MSG_DONTWAIT) { + err = -EAGAIN; + goto err_out; + } /* wait for complete transmission of current pdu */ - wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + if (err) + goto err_out; } - if (!size || size > MAX_MSG_LENGTH) - return -EINVAL; + if (!size || size > MAX_MSG_LENGTH) { + err = -EINVAL; + goto err_out; + } /* take care of a potential SF_DL ESC offset for TX_DL > 8 */ off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 1 : 0; /* does the given data fit into a single frame for SF_BROADCAST? */ if ((so->opt.flags & CAN_ISOTP_SF_BROADCAST) && - (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) - return -EINVAL; + (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) { + err = -EINVAL; + goto err_out; + } err = memcpy_from_msg(so->tx.buf, msg, size); if (err < 0) - return err; + goto err_out; dev = dev_get_by_index(sock_net(sk), so->ifindex); - if (!dev) - return -ENXIO; + if (!dev) { + err = -ENXIO; + goto err_out; + } skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) { dev_put(dev); - return err; + goto err_out; } can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; - so->tx.state = ISOTP_SENDING; so->tx.len = size; so->tx.idx = 0; @@ -954,15 +965,25 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (err) { pr_notice_once("can-isotp: %s: can_send_ret %pe\n", __func__, ERR_PTR(err)); - return err; + goto err_out; } if (wait_tx_done) { /* wait for complete transmission of current pdu */ wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + + if (sk->sk_err) + return -sk->sk_err; } return size; + +err_out: + so->tx.state = old_state; + if (so->tx.state == ISOTP_IDLE) + wake_up_interruptible(&so->wait); + + return err; } static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h index 12369b604ce9..16af1a7f80f6 100644 --- a/net/can/j1939/j1939-priv.h +++ b/net/can/j1939/j1939-priv.h @@ -20,9 +20,12 @@ struct j1939_session; enum j1939_sk_errqueue_type { - J1939_ERRQUEUE_ACK, - J1939_ERRQUEUE_SCHED, - J1939_ERRQUEUE_ABORT, + J1939_ERRQUEUE_TX_ACK, + J1939_ERRQUEUE_TX_SCHED, + J1939_ERRQUEUE_TX_ABORT, + J1939_ERRQUEUE_RX_RTS, + J1939_ERRQUEUE_RX_DPO, + J1939_ERRQUEUE_RX_ABORT, }; /* j1939 devices */ @@ -87,6 +90,7 @@ struct j1939_priv { struct list_head j1939_socks; struct kref rx_kref; + u32 rx_tskey; }; void j1939_ecu_put(struct j1939_ecu *ecu); @@ -326,6 +330,7 @@ int j1939_session_activate(struct j1939_session *session); void j1939_tp_schedule_txtimer(struct j1939_session *session, int msec); void j1939_session_timers_cancel(struct j1939_session *session); +#define J1939_MIN_TP_PACKET_SIZE 9 #define J1939_MAX_TP_PACKET_SIZE (7 * 0xff) #define J1939_MAX_ETP_PACKET_SIZE (7 * 0x00ffffff) diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 08c8606cfd9c..9bc55ecb37f9 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -249,11 +249,14 @@ struct j1939_priv *j1939_netdev_start(struct net_device *ndev) struct j1939_priv *priv, *priv_new; int ret; - priv = j1939_priv_get_by_ndev(ndev); + spin_lock(&j1939_netdev_lock); + priv = j1939_priv_get_by_ndev_locked(ndev); if (priv) { kref_get(&priv->rx_kref); + spin_unlock(&j1939_netdev_lock); return priv; } + spin_unlock(&j1939_netdev_lock); priv = j1939_priv_create(ndev); if (!priv) @@ -269,10 +272,10 @@ struct j1939_priv *j1939_netdev_start(struct net_device *ndev) /* Someone was faster than us, use their priv and roll * back our's. */ + kref_get(&priv_new->rx_kref); spin_unlock(&j1939_netdev_lock); dev_put(ndev); kfree(priv); - kref_get(&priv_new->rx_kref); return priv_new; } j1939_priv_set(ndev, priv); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 54f6d521492f..6dff4510687a 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -352,7 +352,7 @@ static void j1939_sk_sock_destruct(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); - /* This function will be call by the generic networking code, when then + /* This function will be called by the generic networking code, when * the socket is ultimately closed (sk->sk_destruct). * * The race between @@ -905,20 +905,33 @@ failure: return NULL; } -static size_t j1939_sk_opt_stats_get_size(void) +static size_t j1939_sk_opt_stats_get_size(enum j1939_sk_errqueue_type type) { - return - nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */ - 0; + switch (type) { + case J1939_ERRQUEUE_RX_RTS: + return + nla_total_size(sizeof(u32)) + /* J1939_NLA_TOTAL_SIZE */ + nla_total_size(sizeof(u32)) + /* J1939_NLA_PGN */ + nla_total_size(sizeof(u64)) + /* J1939_NLA_SRC_NAME */ + nla_total_size(sizeof(u64)) + /* J1939_NLA_DEST_NAME */ + nla_total_size(sizeof(u8)) + /* J1939_NLA_SRC_ADDR */ + nla_total_size(sizeof(u8)) + /* J1939_NLA_DEST_ADDR */ + 0; + default: + return + nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */ + 0; + } } static struct sk_buff * -j1939_sk_get_timestamping_opt_stats(struct j1939_session *session) +j1939_sk_get_timestamping_opt_stats(struct j1939_session *session, + enum j1939_sk_errqueue_type type) { struct sk_buff *stats; u32 size; - stats = alloc_skb(j1939_sk_opt_stats_get_size(), GFP_ATOMIC); + stats = alloc_skb(j1939_sk_opt_stats_get_size(type), GFP_ATOMIC); if (!stats) return NULL; @@ -928,32 +941,67 @@ j1939_sk_get_timestamping_opt_stats(struct j1939_session *session) size = min(session->pkt.tx_acked * 7, session->total_message_size); - nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size); + switch (type) { + case J1939_ERRQUEUE_RX_RTS: + nla_put_u32(stats, J1939_NLA_TOTAL_SIZE, + session->total_message_size); + nla_put_u32(stats, J1939_NLA_PGN, + session->skcb.addr.pgn); + nla_put_u64_64bit(stats, J1939_NLA_SRC_NAME, + session->skcb.addr.src_name, J1939_NLA_PAD); + nla_put_u64_64bit(stats, J1939_NLA_DEST_NAME, + session->skcb.addr.dst_name, J1939_NLA_PAD); + nla_put_u8(stats, J1939_NLA_SRC_ADDR, + session->skcb.addr.sa); + nla_put_u8(stats, J1939_NLA_DEST_ADDR, + session->skcb.addr.da); + break; + default: + nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size); + } return stats; } -void j1939_sk_errqueue(struct j1939_session *session, - enum j1939_sk_errqueue_type type) +static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, + enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; - struct sock *sk = session->sk; struct j1939_sock *jsk; struct sock_exterr_skb *serr; struct sk_buff *skb; char *state = "UNK"; int err; - /* currently we have no sk for the RX session */ - if (!sk) - return; - jsk = j1939_sk(sk); if (!(jsk->state & J1939_SOCK_ERRQUEUE)) return; - skb = j1939_sk_get_timestamping_opt_stats(session); + switch (type) { + case J1939_ERRQUEUE_TX_ACK: + if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)) + return; + break; + case J1939_ERRQUEUE_TX_SCHED: + if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)) + return; + break; + case J1939_ERRQUEUE_TX_ABORT: + break; + case J1939_ERRQUEUE_RX_RTS: + fallthrough; + case J1939_ERRQUEUE_RX_DPO: + fallthrough; + case J1939_ERRQUEUE_RX_ABORT: + if (!(sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) + return; + break; + default: + netdev_err(priv->ndev, "Unknown errqueue type %i\n", type); + } + + skb = j1939_sk_get_timestamping_opt_stats(session, type); if (!skb) return; @@ -964,36 +1012,42 @@ void j1939_sk_errqueue(struct j1939_session *session, serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); switch (type) { - case J1939_ERRQUEUE_ACK: - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)) { - kfree_skb(skb); - return; - } - + case J1939_ERRQUEUE_TX_ACK: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_ACK; - state = "ACK"; + state = "TX ACK"; break; - case J1939_ERRQUEUE_SCHED: - if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)) { - kfree_skb(skb); - return; - } - + case J1939_ERRQUEUE_TX_SCHED: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_SCHED; - state = "SCH"; + state = "TX SCH"; break; - case J1939_ERRQUEUE_ABORT: + case J1939_ERRQUEUE_TX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_TX_ABORT; - state = "ABT"; + state = "TX ABT"; + break; + case J1939_ERRQUEUE_RX_RTS: + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_info = J1939_EE_INFO_RX_RTS; + state = "RX RTS"; + break; + case J1939_ERRQUEUE_RX_DPO: + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_info = J1939_EE_INFO_RX_DPO; + state = "RX DPO"; + break; + case J1939_ERRQUEUE_RX_ABORT: + serr->ee.ee_errno = session->err; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_info = J1939_EE_INFO_RX_ABORT; + state = "RX ABT"; break; - default: - netdev_err(priv->ndev, "Unknown errqueue type %i\n", type); } serr->opt_stats = true; @@ -1008,6 +1062,27 @@ void j1939_sk_errqueue(struct j1939_session *session, kfree_skb(skb); }; +void j1939_sk_errqueue(struct j1939_session *session, + enum j1939_sk_errqueue_type type) +{ + struct j1939_priv *priv = session->priv; + struct j1939_sock *jsk; + + if (session->sk) { + /* send TX notifications to the socket of origin */ + __j1939_sk_errqueue(session, session->sk, type); + return; + } + + /* spread RX notifications to all sockets subscribed to this session */ + spin_lock_bh(&priv->j1939_socks_lock); + list_for_each_entry(jsk, &priv->j1939_socks, list) { + if (j1939_sk_recv_match_one(jsk, &session->skcb)) + __j1939_sk_errqueue(session, &jsk->sk, type); + } + spin_unlock_bh(&priv->j1939_socks_lock); +}; + void j1939_sk_send_loop_abort(struct sock *sk, int err) { sk->sk_err = err; diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index bdc95bd7a851..6c0a0ebdd024 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -260,10 +260,14 @@ static void __j1939_session_drop(struct j1939_session *session) static void j1939_session_destroy(struct j1939_session *session) { - if (session->err) - j1939_sk_errqueue(session, J1939_ERRQUEUE_ABORT); - else - j1939_sk_errqueue(session, J1939_ERRQUEUE_ACK); + if (session->transmission) { + if (session->err) + j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_ABORT); + else + j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_ACK); + } else if (session->err) { + j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT); + } netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); @@ -776,7 +780,7 @@ static int j1939_session_tx_dpo(struct j1939_session *session) static int j1939_session_tx_dat(struct j1939_session *session) { struct j1939_priv *priv = session->priv; - struct j1939_sk_buff_cb *skcb; + struct j1939_sk_buff_cb *se_skcb; int offset, pkt_done, pkt_end; unsigned int len, pdelay; struct sk_buff *se_skb; @@ -788,7 +792,7 @@ static int j1939_session_tx_dat(struct j1939_session *session) if (!se_skb) return -ENOBUFS; - skcb = j1939_skb_to_cb(se_skb); + se_skcb = j1939_skb_to_cb(se_skb); tpdat = se_skb->data; ret = 0; pkt_done = 0; @@ -800,7 +804,7 @@ static int j1939_session_tx_dat(struct j1939_session *session) while (session->pkt.tx < pkt_end) { dat[0] = session->pkt.tx - session->pkt.dpo + 1; - offset = (session->pkt.tx * 7) - skcb->offset; + offset = (session->pkt.tx * 7) - se_skcb->offset; len = se_skb->len - offset; if (len > 7) len = 7; @@ -808,7 +812,8 @@ static int j1939_session_tx_dat(struct j1939_session *session) if (offset + len > se_skb->len) { netdev_err_once(priv->ndev, "%s: 0x%p: requested data outside of queued buffer: offset %i, len %i, pkt.tx: %i\n", - __func__, session, skcb->offset, se_skb->len , session->pkt.tx); + __func__, session, se_skcb->offset, + se_skb->len , session->pkt.tx); ret = -EOVERFLOW; goto out_free; } @@ -821,7 +826,7 @@ static int j1939_session_tx_dat(struct j1939_session *session) memcpy(&dat[1], &tpdat[offset], len); ret = j1939_tp_tx_dat(session, dat, len + 1); if (ret < 0) { - /* ENOBUS == CAN interface TX queue is full */ + /* ENOBUFS == CAN interface TX queue is full */ if (ret != -ENOBUFS) netdev_alert(priv->ndev, "%s: 0x%p: queue data error: %i\n", @@ -1043,7 +1048,7 @@ static int j1939_simple_txnext(struct j1939_session *session) if (ret) goto out_free; - j1939_sk_errqueue(session, J1939_ERRQUEUE_SCHED); + j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_SCHED); j1939_sk_queue_activate_next(session); out_free: @@ -1097,7 +1102,7 @@ j1939_session_deactivate_activate_next(struct j1939_session *session) } static void __j1939_session_cancel(struct j1939_session *session, - enum j1939_xtp_abort err) + enum j1939_xtp_abort err) { struct j1939_priv *priv = session->priv; @@ -1115,6 +1120,8 @@ static void __j1939_session_cancel(struct j1939_session *session, if (session->sk) j1939_sk_send_loop_abort(session->sk, session->err); + else + j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT); } static void j1939_session_cancel(struct j1939_session *session, @@ -1195,13 +1202,13 @@ static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer) static void j1939_session_completed(struct j1939_session *session) { - struct sk_buff *skb; + struct sk_buff *se_skb; if (!session->transmission) { - skb = j1939_session_skb_get(session); + se_skb = j1939_session_skb_get(session); /* distribute among j1939 receivers */ - j1939_sk_recv(session->priv, skb); - consume_skb(skb); + j1939_sk_recv(session->priv, se_skb); + consume_skb(se_skb); } j1939_session_deactivate_activate_next(session); @@ -1230,12 +1237,11 @@ static enum hrtimer_restart j1939_tp_rxtimer(struct hrtimer *hrtimer) session->err = -ETIME; j1939_session_deactivate(session); } else { - netdev_alert(priv->ndev, "%s: 0x%p: rx timeout, send abort\n", - __func__, session); - j1939_session_list_lock(session->priv); if (session->state >= J1939_SESSION_ACTIVE && session->state < J1939_SESSION_ACTIVE_MAX) { + netdev_alert(priv->ndev, "%s: 0x%p: rx timeout, send abort\n", + __func__, session); j1939_session_get(session); hrtimer_start(&session->rxtimer, ms_to_ktime(J1939_XTP_ABORT_TIMEOUT_MS), @@ -1268,12 +1274,14 @@ static bool j1939_xtp_rx_cmd_bad_pgn(struct j1939_session *session, break; case J1939_ETP_CMD_RTS: - case J1939_TP_CMD_RTS: /* fall through */ + fallthrough; + case J1939_TP_CMD_RTS: abort = J1939_XTP_ABORT_BUSY; break; case J1939_ETP_CMD_CTS: - case J1939_TP_CMD_CTS: /* fall through */ + fallthrough; + case J1939_TP_CMD_CTS: abort = J1939_XTP_ABORT_ECTS_UNXPECTED_PGN; break; @@ -1282,7 +1290,8 @@ static bool j1939_xtp_rx_cmd_bad_pgn(struct j1939_session *session, break; case J1939_ETP_CMD_EOMA: - case J1939_TP_CMD_EOMA: /* fall through */ + fallthrough; + case J1939_TP_CMD_EOMA: abort = J1939_XTP_ABORT_OTHER; break; @@ -1326,6 +1335,8 @@ static void j1939_xtp_rx_abort_one(struct j1939_priv *priv, struct sk_buff *skb, session->err = j1939_xtp_abort_to_errno(priv, abort); if (session->sk) j1939_sk_send_loop_abort(session->sk, session->err); + else + j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT); j1939_session_deactivate_activate_next(session); abort_put: @@ -1434,7 +1445,7 @@ j1939_xtp_rx_cts_one(struct j1939_session *session, struct sk_buff *skb) if (session->transmission) { if (session->pkt.tx_acked) j1939_sk_errqueue(session, - J1939_ERRQUEUE_SCHED); + J1939_ERRQUEUE_TX_SCHED); j1939_session_txtimer_cancel(session); j1939_tp_schedule_txtimer(session, 0); } @@ -1597,6 +1608,8 @@ j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv, abort = J1939_XTP_ABORT_FAULT; else if (len > priv->tp_max_packet_size) abort = J1939_XTP_ABORT_RESOURCE; + else if (len < J1939_MIN_TP_PACKET_SIZE) + abort = J1939_XTP_ABORT_FAULT; } if (abort != J1939_XTP_NO_ABORT) { @@ -1626,6 +1639,9 @@ j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv, session->pkt.rx = 0; session->pkt.tx = 0; + session->tskey = priv->rx_tskey++; + j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_RTS); + WARN_ON_ONCE(j1939_session_activate(session)); return session; @@ -1748,6 +1764,9 @@ static void j1939_xtp_rx_dpo_one(struct j1939_session *session, session->pkt.dpo = j1939_etp_ctl_to_packet(skb->data); session->last_cmd = dat[0]; j1939_tp_set_rxtimeout(session, 750); + + if (!session->transmission) + j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_DPO); } static void j1939_xtp_rx_dpo(struct j1939_priv *priv, struct sk_buff *skb, @@ -1771,8 +1790,9 @@ static void j1939_xtp_rx_dpo(struct j1939_priv *priv, struct sk_buff *skb, static void j1939_xtp_rx_dat_one(struct j1939_session *session, struct sk_buff *skb) { + enum j1939_xtp_abort abort = J1939_XTP_ABORT_FAULT; struct j1939_priv *priv = session->priv; - struct j1939_sk_buff_cb *skcb; + struct j1939_sk_buff_cb *skcb, *se_skcb; struct sk_buff *se_skb = NULL; const u8 *dat; u8 *tpdat; @@ -1785,9 +1805,11 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, skcb = j1939_skb_to_cb(skb); dat = skb->data; - if (skb->len <= 1) + if (skb->len != 8) { /* makes no sense */ + abort = J1939_XTP_ABORT_UNEXPECTED_DATA; goto out_session_cancel; + } switch (session->last_cmd) { case 0xff: @@ -1797,7 +1819,8 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, break; fallthrough; case J1939_TP_CMD_BAM: - case J1939_TP_CMD_CTS: /* fall through */ + fallthrough; + case J1939_TP_CMD_CTS: if (skcb->addr.type != J1939_ETP) break; fallthrough; @@ -1822,8 +1845,8 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, goto out_session_cancel; } - skcb = j1939_skb_to_cb(se_skb); - offset = packet * 7 - skcb->offset; + se_skcb = j1939_skb_to_cb(se_skb); + offset = packet * 7 - se_skcb->offset; nbytes = se_skb->len - offset; if (nbytes > 7) nbytes = 7; @@ -1851,7 +1874,7 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, if (packet == session->pkt.rx) session->pkt.rx++; - if (skcb->addr.type != J1939_ETP && + if (se_skcb->addr.type != J1939_ETP && j1939_cb_is_broadcast(&session->skcb)) { if (session->pkt.rx >= session->pkt.total) final = true; @@ -1885,7 +1908,7 @@ static void j1939_xtp_rx_dat_one(struct j1939_session *session, out_session_cancel: kfree_skb(se_skb); j1939_session_timers_cancel(session); - j1939_session_cancel(session, J1939_XTP_ABORT_FAULT); + j1939_session_cancel(session, abort); j1939_session_put(session); } @@ -2000,7 +2023,8 @@ static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb) extd = J1939_ETP; fallthrough; case J1939_TP_CMD_BAM: - case J1939_TP_CMD_RTS: /* fall through */ + fallthrough; + case J1939_TP_CMD_RTS: if (skcb->addr.type != extd) return; diff --git a/net/can/raw.c b/net/can/raw.c index cd5a49380116..7105fa4824e4 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -592,9 +592,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, ro->count = count; out_fil: - if (dev) - dev_put(dev); - + dev_put(dev); release_sock(sk); rtnl_unlock(); @@ -638,9 +636,7 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, ro->err_mask = err_mask; out_err: - if (dev) - dev_put(dev); - + dev_put(dev); release_sock(sk); rtnl_unlock(); diff --git a/net/core/Makefile b/net/core/Makefile index f7f16650fe9e..4268846f2f47 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -33,8 +33,7 @@ obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o -ifeq ($(CONFIG_INET),y) obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o -endif obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o +obj-$(CONFIG_OF) += of_net.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index f564f82e91d9..68d2cbf8331a 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -416,7 +416,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, void *, value, u64, flags) { - if (in_irq() || in_nmi()) + if (in_hardirq() || in_nmi()) return (unsigned long)NULL; return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags); @@ -425,7 +425,7 @@ BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, struct sock *, sk) { - if (in_irq() || in_nmi()) + if (in_hardirq() || in_nmi()) return -EPERM; return ____bpf_sk_storage_delete(map, sk); diff --git a/net/core/dev.c b/net/core/dev.c index 8f1a47ad6781..edeb811c454e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -140,7 +140,7 @@ #include <linux/if_macvlan.h> #include <linux/errqueue.h> #include <linux/hrtimer.h> -#include <linux/netfilter_ingress.h> +#include <linux/netfilter_netdev.h> #include <linux/crash_dump.h> #include <linux/sctp.h> #include <net/udp_tunnel.h> @@ -303,6 +303,12 @@ static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net, return NULL; } +bool netdev_name_in_use(struct net *net, const char *name) +{ + return netdev_name_node_lookup(net, name); +} +EXPORT_SYMBOL(netdev_name_in_use); + int netdev_name_node_alt_create(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; @@ -676,131 +682,6 @@ void dev_remove_offload(struct packet_offload *po) } EXPORT_SYMBOL(dev_remove_offload); -/****************************************************************************** - * - * Device Boot-time Settings Routines - * - ******************************************************************************/ - -/* Boot time configuration table */ -static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; - -/** - * netdev_boot_setup_add - add new setup entry - * @name: name of the device - * @map: configured settings for the device - * - * Adds new setup entry to the dev_boot_setup list. The function - * returns 0 on error and 1 on success. This is a generic routine to - * all netdevices. - */ -static int netdev_boot_setup_add(char *name, struct ifmap *map) -{ - struct netdev_boot_setup *s; - int i; - - s = dev_boot_setup; - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { - if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { - memset(s[i].name, 0, sizeof(s[i].name)); - strlcpy(s[i].name, name, IFNAMSIZ); - memcpy(&s[i].map, map, sizeof(s[i].map)); - break; - } - } - - return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; -} - -/** - * netdev_boot_setup_check - check boot time settings - * @dev: the netdevice - * - * Check boot time settings for the device. - * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found, 1 if they are. - */ -int netdev_boot_setup_check(struct net_device *dev) -{ - struct netdev_boot_setup *s = dev_boot_setup; - int i; - - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { - if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && - !strcmp(dev->name, s[i].name)) { - dev->irq = s[i].map.irq; - dev->base_addr = s[i].map.base_addr; - dev->mem_start = s[i].map.mem_start; - dev->mem_end = s[i].map.mem_end; - return 1; - } - } - return 0; -} -EXPORT_SYMBOL(netdev_boot_setup_check); - - -/** - * netdev_boot_base - get address from boot time settings - * @prefix: prefix for network device - * @unit: id for network device - * - * Check boot time settings for the base address of device. - * The found settings are set for the device to be used - * later in the device probing. - * Returns 0 if no settings found. - */ -unsigned long netdev_boot_base(const char *prefix, int unit) -{ - const struct netdev_boot_setup *s = dev_boot_setup; - char name[IFNAMSIZ]; - int i; - - sprintf(name, "%s%d", prefix, unit); - - /* - * If device already registered then return base of 1 - * to indicate not to probe for this interface - */ - if (__dev_get_by_name(&init_net, name)) - return 1; - - for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) - if (!strcmp(name, s[i].name)) - return s[i].map.base_addr; - return 0; -} - -/* - * Saves at boot time configured settings for any netdevice. - */ -int __init netdev_boot_setup(char *str) -{ - int ints[5]; - struct ifmap map; - - str = get_options(str, ARRAY_SIZE(ints), ints); - if (!str || !*str) - return 0; - - /* Save settings */ - memset(&map, 0, sizeof(map)); - if (ints[0] > 0) - map.irq = ints[1]; - if (ints[0] > 1) - map.base_addr = ints[2]; - if (ints[0] > 2) - map.mem_start = ints[3]; - if (ints[0] > 3) - map.mem_end = ints[4]; - - /* Add new entry to the list */ - return netdev_boot_setup_add(str, &map); -} - -__setup("netdev=", netdev_boot_setup); - /******************************************************************************* * * Device Interface Subroutines @@ -956,8 +837,7 @@ struct net_device *dev_get_by_name(struct net *net, const char *name) rcu_read_lock(); dev = dev_get_by_name_rcu(net, name); - if (dev) - dev_hold(dev); + dev_hold(dev); rcu_read_unlock(); return dev; } @@ -1030,8 +910,7 @@ struct net_device *dev_get_by_index(struct net *net, int ifindex) rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); - if (dev) - dev_hold(dev); + dev_hold(dev); rcu_read_unlock(); return dev; } @@ -1260,7 +1139,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf) } snprintf(buf, IFNAMSIZ, name, i); - if (!__dev_get_by_name(net, buf)) + if (!netdev_name_in_use(net, buf)) return i; /* It is possible to run out of possible slots @@ -1314,7 +1193,7 @@ static int dev_get_valid_name(struct net *net, struct net_device *dev, if (strchr(name, '%')) return dev_alloc_name_ns(net, dev, name); - else if (__dev_get_by_name(net, name)) + else if (netdev_name_in_use(net, name)) return -EEXIST; else if (dev->name != name) strlcpy(dev->name, name, IFNAMSIZ); @@ -1417,8 +1296,8 @@ rollback: old_assign_type = NET_NAME_RENAMED; goto rollback; } else { - pr_err("%s: name change rollback failed: %d\n", - dev->name, ret); + netdev_err(dev, "name change rollback failed: %d\n", + ret); } } @@ -2472,7 +2351,7 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq) /* If TC0 is invalidated disable TC mapping */ if (tc->offset + tc->count > txq) { - pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); + netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); dev->num_tc = 0; return; } @@ -2483,8 +2362,8 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq) tc = &dev->tc_to_txq[q]; if (tc->offset + tc->count > txq) { - pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", - i, q); + netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", + i, q); netdev_set_prio_tc_map(dev, i, 0); } } @@ -3048,6 +2927,8 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (dev->num_tc) netif_setup_tc(dev, txq); + dev_qdisc_change_real_num_tx(dev, txq); + dev->real_num_tx_queues = txq; if (disabling) { @@ -3099,6 +2980,50 @@ EXPORT_SYMBOL(netif_set_real_num_rx_queues); #endif /** + * netif_set_real_num_queues - set actual number of RX and TX queues used + * @dev: Network device + * @txq: Actual number of TX queues + * @rxq: Actual number of RX queues + * + * Set the real number of both TX and RX queues. + * Does nothing if the number of queues is already correct. + */ +int netif_set_real_num_queues(struct net_device *dev, + unsigned int txq, unsigned int rxq) +{ + unsigned int old_rxq = dev->real_num_rx_queues; + int err; + + if (txq < 1 || txq > dev->num_tx_queues || + rxq < 1 || rxq > dev->num_rx_queues) + return -EINVAL; + + /* Start from increases, so the error path only does decreases - + * decreases can't fail. + */ + if (rxq > dev->real_num_rx_queues) { + err = netif_set_real_num_rx_queues(dev, rxq); + if (err) + return err; + } + if (txq > dev->real_num_tx_queues) { + err = netif_set_real_num_tx_queues(dev, txq); + if (err) + goto undo_rx; + } + if (rxq < dev->real_num_rx_queues) + WARN_ON(netif_set_real_num_rx_queues(dev, rxq)); + if (txq < dev->real_num_tx_queues) + WARN_ON(netif_set_real_num_tx_queues(dev, txq)); + + return 0; +undo_rx: + WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq)); + return err; +} +EXPORT_SYMBOL(netif_set_real_num_queues); + +/** * netif_get_num_default_rss_queues - default number of RSS queues * * This routine should set an upper limit on the number of RSS queues @@ -3190,7 +3115,7 @@ EXPORT_SYMBOL(__dev_kfree_skb_irq); void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) { - if (in_irq() || irqs_disabled()) + if (in_hardirq() || irqs_disabled()) __dev_kfree_skb_irq(skb, reason); else dev_kfree_skb(skb); @@ -3246,6 +3171,12 @@ static u16 skb_tx_hash(const struct net_device *dev, qoffset = sb_dev->tc_to_txq[tc].offset; qcount = sb_dev->tc_to_txq[tc].count; + if (unlikely(!qcount)) { + net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n", + sb_dev->name, qoffset, tc); + qoffset = 0; + qcount = dev->real_num_tx_queues; + } } if (skb_rx_queue_recorded(skb)) { @@ -3491,7 +3422,7 @@ EXPORT_SYMBOL(__skb_gso_segment); #ifdef CONFIG_BUG static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb) { - pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); + netdev_err(dev, "hw csum failure\n"); skb_dump(KERN_ERR, skb, true); dump_stack(); } @@ -3989,7 +3920,8 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) skb_reset_mac_header(skb); __skb_pull(skb, skb_network_offset(skb)); skb->pkt_type = PACKET_LOOPBACK; - skb->ip_summed = CHECKSUM_UNNECESSARY; + if (skb->ip_summed == CHECKSUM_NONE) + skb->ip_summed = CHECKSUM_UNNECESSARY; WARN_ON(!skb_dst(skb)); skb_dst_force(skb); netif_rx_ni(skb); @@ -4001,6 +3933,7 @@ EXPORT_SYMBOL(dev_loopback_xmit); static struct sk_buff * sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { +#ifdef CONFIG_NET_CLS_ACT struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); struct tcf_result cl_res; @@ -4012,7 +3945,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) qdisc_skb_cb(skb)->post_ct = false; mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) { + switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); @@ -4036,6 +3969,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) default: break; } +#endif /* CONFIG_NET_CLS_ACT */ return skb; } @@ -4229,13 +4163,20 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) qdisc_pkt_len_init(skb); #ifdef CONFIG_NET_CLS_ACT skb->tc_at_ingress = 0; -# ifdef CONFIG_NET_EGRESS +#endif +#ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { + if (nf_hook_egress_active()) { + skb = nf_hook_egress(skb, &rc, dev); + if (!skb) + goto out; + } + nf_skip_egress(skb, true); skb = sch_handle_egress(skb, &rc, dev); if (!skb) goto out; + nf_skip_egress(skb, false); } -# endif #endif /* If device/qdisc don't need skb->dst, release it right now while * its hot in this cpu cache. @@ -4756,45 +4697,18 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) return rxqueue; } -static u32 netif_receive_generic_xdp(struct sk_buff *skb, - struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) +u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) { void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; - u32 metalen, act = XDP_DROP; bool orig_bcast, orig_host; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; + u32 metalen, act; int off; - /* Reinjected packets coming from act_mirred or similar should - * not get XDP generic processing. - */ - if (skb_is_redirected(skb)) - return XDP_PASS; - - /* XDP packets must be linear and must have sufficient headroom - * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also - * native XDP provides, thus we need to do it here as well. - */ - if (skb_cloned(skb) || skb_is_nonlinear(skb) || - skb_headroom(skb) < XDP_PACKET_HEADROOM) { - int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); - int troom = skb->tail + skb->data_len - skb->end; - - /* In case we have to go down the path and also linearize, - * then lets do the pskb_expand_head() work just once here. - */ - if (pskb_expand_head(skb, - hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, - troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) - goto do_drop; - if (skb_linearize(skb)) - goto do_drop; - } - /* The XDP program wants to see the packet starting at the MAC * header. */ @@ -4849,6 +4763,13 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, skb->protocol = eth_type_trans(skb, skb->dev); } + /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull + * before calling us again on redirect path. We do not call do_redirect + * as we leave that up to the caller. + * + * Caller is responsible for managing lifetime of skb (i.e. calling + * kfree_skb in response to actions it cannot handle/XDP_DROP). + */ switch (act) { case XDP_REDIRECT: case XDP_TX: @@ -4859,6 +4780,49 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (metalen) skb_metadata_set(skb, metalen); break; + } + + return act; +} + +static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + u32 act = XDP_DROP; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_is_redirected(skb)) + return XDP_PASS; + + /* XDP packets must be linear and must have sufficient headroom + * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also + * native XDP provides, thus we need to do it here as well. + */ + if (skb_cloned(skb) || skb_is_nonlinear(skb) || + skb_headroom(skb) < XDP_PACKET_HEADROOM) { + int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + int troom = skb->tail + skb->data_len - skb->end; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + if (pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC)) + goto do_drop; + if (skb_linearize(skb)) + goto do_drop; + } + + act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); + switch (act) { + case XDP_REDIRECT: + case XDP_TX: + case XDP_PASS: + break; default: bpf_warn_invalid_xdp_action(act); fallthrough; @@ -5141,8 +5105,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, skb->tc_at_ingress = 1; mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list, - &cl_res, false)) { + switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); @@ -5324,7 +5287,6 @@ another_round: ret = NET_RX_DROP; goto out; } - skb_reset_mac_len(skb); } if (eth_type_vlan(skb->protocol)) { @@ -5356,6 +5318,7 @@ skip_taps: if (static_branch_unlikely(&ingress_needed_key)) { bool another = false; + nf_skip_egress(skb, true); skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev, &another); if (another) @@ -5363,6 +5326,7 @@ skip_taps: if (!skb) goto out; + nf_skip_egress(skb, false); if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) goto out; } @@ -5650,25 +5614,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) struct bpf_prog *new = xdp->prog; int ret = 0; - if (new) { - u32 i; - - mutex_lock(&new->aux->used_maps_mutex); - - /* generic XDP does not work with DEVMAPs that can - * have a bpf_prog installed on an entry - */ - for (i = 0; i < new->aux->used_map_cnt; i++) { - if (dev_map_can_have_prog(new->aux->used_maps[i]) || - cpu_map_prog_allowed(new->aux->used_maps[i])) { - mutex_unlock(&new->aux->used_maps_mutex); - return -EINVAL; - } - } - - mutex_unlock(&new->aux->used_maps_mutex); - } - switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); @@ -5876,7 +5821,7 @@ static void flush_all_backlogs(void) */ ASSERT_RTNL(); - get_online_cpus(); + cpus_read_lock(); cpumask_clear(&flush_cpus); for_each_online_cpu(cpu) { @@ -5894,7 +5839,7 @@ static void flush_all_backlogs(void) for_each_cpu(cpu, &flush_cpus) flush_work(per_cpu_ptr(&flush_works, cpu)); - put_online_cpus(); + cpus_read_unlock(); } /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ @@ -5918,7 +5863,7 @@ static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int se gro_normal_list(napi); } -static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) +static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) { struct packet_offload *ptype; __be16 type = skb->protocol; @@ -5947,12 +5892,11 @@ static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) if (err) { WARN_ON(&ptype->list == head); kfree_skb(skb); - return NET_RX_SUCCESS; + return; } out: gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); - return NET_RX_SUCCESS; } static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, @@ -6011,7 +5955,6 @@ static void gro_list_prepare(const struct list_head *head, diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb); if (skb_vlan_tag_present(p)) diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb); - diffs |= skb_metadata_dst_cmp(p, skb); diffs |= skb_metadata_differs(p, skb); if (maclen == ETH_HLEN) diffs |= compare_ether_header(skb_mac_header(p), @@ -6021,17 +5964,30 @@ static void gro_list_prepare(const struct list_head *head, skb_mac_header(skb), maclen); - diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); + /* in most common scenarions 'slow_gro' is 0 + * otherwise we are already on some slower paths + * either skip all the infrequent tests altogether or + * avoid trying too hard to skip each of them individually + */ + if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) { #if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - if (!diffs) { - struct tc_skb_ext *skb_ext = skb_ext_find(skb, TC_SKB_EXT); - struct tc_skb_ext *p_ext = skb_ext_find(p, TC_SKB_EXT); + struct tc_skb_ext *skb_ext; + struct tc_skb_ext *p_ext; +#endif + + diffs |= p->sk != skb->sk; + diffs |= skb_metadata_dst_cmp(p, skb); + diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); + +#if IS_ENABLED(CONFIG_SKB_EXTENSIONS) && IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + skb_ext = skb_ext_find(skb, TC_SKB_EXT); + p_ext = skb_ext_find(p, TC_SKB_EXT); diffs |= (!!p_ext) ^ (!!skb_ext); if (!diffs && unlikely(skb_ext)) diffs |= p_ext->chain ^ skb_ext->chain; - } #endif + } NAPI_GRO_CB(p)->same_flow = !diffs; } @@ -6296,8 +6252,12 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); - skb_ext_reset(skb); - nf_reset_ct(skb); + if (unlikely(skb->slow_gro)) { + skb_orphan(skb); + skb_ext_reset(skb); + nf_reset_ct(skb); + skb->slow_gro = 0; + } napi->skb = skb; } @@ -6963,19 +6923,25 @@ EXPORT_SYMBOL(netif_napi_add); void napi_disable(struct napi_struct *n) { + unsigned long val, new; + might_sleep(); set_bit(NAPI_STATE_DISABLE, &n->state); - while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) - msleep(1); - while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) - msleep(1); + do { + val = READ_ONCE(n->state); + if (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) { + usleep_range(20, 200); + continue; + } + + new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; + new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); + } while (cmpxchg(&n->state, val, new) != val); hrtimer_cancel(&n->timer); - clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); clear_bit(NAPI_STATE_DISABLE, &n->state); - clear_bit(NAPI_STATE_THREADED, &n->state); } EXPORT_SYMBOL(napi_disable); @@ -6988,12 +6954,16 @@ EXPORT_SYMBOL(napi_disable); */ void napi_enable(struct napi_struct *n) { - BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - smp_mb__before_atomic(); - clear_bit(NAPI_STATE_SCHED, &n->state); - clear_bit(NAPI_STATE_NPSVC, &n->state); - if (n->dev->threaded && n->thread) - set_bit(NAPI_STATE_THREADED, &n->state); + unsigned long val, new; + + do { + val = READ_ONCE(n->state); + BUG_ON(!test_bit(NAPI_STATE_SCHED, &val)); + + new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC); + if (n->dev->threaded && n->thread) + new |= NAPIF_STATE_THREADED; + } while (cmpxchg(&n->state, val, new) != val); } EXPORT_SYMBOL(napi_enable); @@ -7049,8 +7019,8 @@ static int __napi_poll(struct napi_struct *n, bool *repoll) } if (unlikely(work > weight)) - pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n", - n->poll, work, weight); + netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n", + n->poll, work, weight); if (likely(work < weight)) return work; @@ -7597,7 +7567,7 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev, { struct netdev_adjacent *lower; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); @@ -8604,8 +8574,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) dev->flags &= ~IFF_PROMISC; else { dev->promiscuity -= inc; - pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n", - dev->name); + netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n"); return -EOVERFLOW; } } @@ -8675,8 +8644,7 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) dev->flags &= ~IFF_ALLMULTI; else { dev->allmulti -= inc; - pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n", - dev->name); + netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n"); return -EOVERFLOW; } } @@ -9213,14 +9181,11 @@ int dev_get_port_parent_id(struct net_device *dev, } err = devlink_compat_switch_id_get(dev, ppid); - if (!err || err != -EOPNOTSUPP) + if (!recurse || err != -EOPNOTSUPP) return err; - if (!recurse) - return -EOPNOTSUPP; - netdev_for_each_lower_dev(dev, lower_dev, iter) { - err = dev_get_port_parent_id(lower_dev, ppid, recurse); + err = dev_get_port_parent_id(lower_dev, ppid, true); if (err) break; if (!first.id_len) @@ -9362,7 +9327,7 @@ static struct bpf_prog *dev_xdp_prog(struct net_device *dev, return dev->xdp_state[mode].prog; } -static u8 dev_xdp_prog_count(struct net_device *dev) +u8 dev_xdp_prog_count(struct net_device *dev) { u8 count = 0; int i; @@ -9372,6 +9337,7 @@ static u8 dev_xdp_prog_count(struct net_device *dev) count++; return count; } +EXPORT_SYMBOL_GPL(dev_xdp_prog_count); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) { @@ -9465,6 +9431,8 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack { unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES); struct bpf_prog *cur_prog; + struct net_device *upper; + struct list_head *iter; enum bpf_xdp_mode mode; bpf_op_t bpf_op; int err; @@ -9503,6 +9471,14 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack return -EBUSY; } + /* don't allow if an upper device already has a program */ + netdev_for_each_upper_dev_rcu(dev, upper, iter) { + if (dev_xdp_prog_count(upper) > 0) { + NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program"); + return -EEXIST; + } + } + cur_prog = dev_xdp_prog(dev, mode); /* can't replace attached prog with link */ if (link && cur_prog) { @@ -9953,6 +9929,11 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } + if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) { + netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n"); + features &= ~NETIF_F_LRO; + } + if (features & NETIF_F_HW_TLS_TX) { bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) == (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); @@ -10134,7 +10115,7 @@ static int netif_alloc_rx_queues(struct net_device *dev) BUG_ON(count < 1); - rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); + rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!rx) return -ENOMEM; @@ -10201,7 +10182,7 @@ static int netif_alloc_netdev_queues(struct net_device *dev) if (count < 1 || count > 0xffff) return -EINVAL; - tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL); + tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!tx) return -ENOMEM; @@ -10841,7 +10822,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, /* ensure 32-byte alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; - p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL); + p = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); if (!p) return NULL; @@ -10910,7 +10891,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (!dev->ethtool_ops) dev->ethtool_ops = &default_ethtool_ops; - nf_hook_ingress_init(dev); + nf_hook_netdev_init(dev); return dev; @@ -11196,7 +11177,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, * we can use it in the destination network namespace. */ err = -EEXIST; - if (__dev_get_by_name(net, dev->name)) { + if (netdev_name_in_use(net, dev->name)) { /* We get here if we can't use the current device name */ if (!pat) goto out; @@ -11549,7 +11530,7 @@ static void __net_exit default_device_exit(struct net *net) /* Push remaining network devices to init_net */ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); - if (__dev_get_by_name(&init_net, fb_name)) + if (netdev_name_in_use(&init_net, fb_name)) snprintf(fb_name, IFNAMSIZ, "dev%%d"); err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 45ae6eeb2964..f0cb38344126 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -16,10 +16,9 @@ * General list handling functions */ -static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, - const unsigned char *addr, int addr_len, - unsigned char addr_type, bool global, - bool sync) +static struct netdev_hw_addr* +__hw_addr_create(const unsigned char *addr, int addr_len, + unsigned char addr_type, bool global, bool sync) { struct netdev_hw_addr *ha; int alloc_size; @@ -29,32 +28,50 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list, alloc_size = L1_CACHE_BYTES; ha = kmalloc(alloc_size, GFP_ATOMIC); if (!ha) - return -ENOMEM; + return NULL; memcpy(ha->addr, addr, addr_len); ha->type = addr_type; ha->refcount = 1; ha->global_use = global; ha->synced = sync ? 1 : 0; ha->sync_cnt = 0; - list_add_tail_rcu(&ha->list, &list->list); - list->count++; - return 0; + return ha; } static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync, - int sync_count) + int sync_count, bool exclusive) { + struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL; struct netdev_hw_addr *ha; if (addr_len > MAX_ADDR_LEN) return -EINVAL; - list_for_each_entry(ha, &list->list, list) { - if (ha->type == addr_type && - !memcmp(ha->addr, addr, addr_len)) { + ha = list_first_entry(&list->list, struct netdev_hw_addr, list); + if (ha && !memcmp(addr, ha->addr, addr_len) && + (!addr_type || addr_type == ha->type)) + goto found_it; + + while (*ins_point) { + int diff; + + ha = rb_entry(*ins_point, struct netdev_hw_addr, node); + diff = memcmp(addr, ha->addr, addr_len); + if (diff == 0) + diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); + + parent = *ins_point; + if (diff < 0) { + ins_point = &parent->rb_left; + } else if (diff > 0) { + ins_point = &parent->rb_right; + } else { +found_it: + if (exclusive) + return -EEXIST; if (global) { /* check if addr is already used as global */ if (ha->global_use) @@ -73,8 +90,25 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list, } } - return __hw_addr_create_ex(list, addr, addr_len, addr_type, global, - sync); + ha = __hw_addr_create(addr, addr_len, addr_type, global, sync); + if (!ha) + return -ENOMEM; + + /* The first address in dev->dev_addrs is pointed to by dev->dev_addr + * and mutated freely by device drivers and netdev ops, so if we insert + * it into the tree we'll end up with an invalid rbtree. + */ + if (list->count > 0) { + rb_link_node(&ha->node, parent, ins_point); + rb_insert_color(&ha->node, &list->tree); + } else { + RB_CLEAR_NODE(&ha->node); + } + + list_add_tail_rcu(&ha->list, &list->list); + list->count++; + + return 0; } static int __hw_addr_add(struct netdev_hw_addr_list *list, @@ -82,7 +116,7 @@ static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char addr_type) { return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false, - 0); + 0, false); } static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, @@ -103,24 +137,61 @@ static int __hw_addr_del_entry(struct netdev_hw_addr_list *list, if (--ha->refcount) return 0; + + if (!RB_EMPTY_NODE(&ha->node)) + rb_erase(&ha->node, &list->tree); + list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); list->count--; return 0; } +static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list, + const unsigned char *addr, int addr_len, + unsigned char addr_type) +{ + struct netdev_hw_addr *ha; + struct rb_node *node; + + /* The first address isn't inserted into the tree because in the dev->dev_addrs + * list it's the address pointed to by dev->dev_addr which is freely mutated + * in place, so we need to check it separately. + */ + ha = list_first_entry(&list->list, struct netdev_hw_addr, list); + if (ha && !memcmp(addr, ha->addr, addr_len) && + (!addr_type || addr_type == ha->type)) + return ha; + + node = list->tree.rb_node; + + while (node) { + struct netdev_hw_addr *ha = rb_entry(node, struct netdev_hw_addr, node); + int diff = memcmp(addr, ha->addr, addr_len); + + if (diff == 0 && addr_type) + diff = memcmp(&addr_type, &ha->type, sizeof(addr_type)); + + if (diff < 0) + node = node->rb_left; + else if (diff > 0) + node = node->rb_right; + else + return ha; + } + + return NULL; +} + static int __hw_addr_del_ex(struct netdev_hw_addr_list *list, const unsigned char *addr, int addr_len, unsigned char addr_type, bool global, bool sync) { - struct netdev_hw_addr *ha; + struct netdev_hw_addr *ha = __hw_addr_lookup(list, addr, addr_len, addr_type); - list_for_each_entry(ha, &list->list, list) { - if (!memcmp(ha->addr, addr, addr_len) && - (ha->type == addr_type || !addr_type)) - return __hw_addr_del_entry(list, ha, global, sync); - } - return -ENOENT; + if (!ha) + return -ENOENT; + return __hw_addr_del_entry(list, ha, global, sync); } static int __hw_addr_del(struct netdev_hw_addr_list *list, @@ -137,7 +208,7 @@ static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list, int err; err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type, - false, true, ha->sync_cnt); + false, true, ha->sync_cnt, false); if (err && err != -EEXIST) return err; @@ -407,6 +478,7 @@ static void __hw_addr_flush(struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha, *tmp; + list->tree = RB_ROOT; list_for_each_entry_safe(ha, tmp, &list->list, list) { list_del_rcu(&ha->list); kfree_rcu(ha, rcu_head); @@ -418,6 +490,7 @@ void __hw_addr_init(struct netdev_hw_addr_list *list) { INIT_LIST_HEAD(&list->list); list->count = 0; + list->tree = RB_ROOT; } EXPORT_SYMBOL(__hw_addr_init); @@ -552,22 +625,14 @@ EXPORT_SYMBOL(dev_addr_del); */ int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr) { - struct netdev_hw_addr *ha; int err; netif_addr_lock_bh(dev); - list_for_each_entry(ha, &dev->uc.list, list) { - if (!memcmp(ha->addr, addr, dev->addr_len) && - ha->type == NETDEV_HW_ADDR_T_UNICAST) { - err = -EEXIST; - goto out; - } - } - err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_UNICAST, true, false); + err = __hw_addr_add_ex(&dev->uc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_UNICAST, true, false, + 0, true); if (!err) __dev_set_rx_mode(dev); -out: netif_addr_unlock_bh(dev); return err; } @@ -745,22 +810,14 @@ EXPORT_SYMBOL(dev_uc_init); */ int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr) { - struct netdev_hw_addr *ha; int err; netif_addr_lock_bh(dev); - list_for_each_entry(ha, &dev->mc.list, list) { - if (!memcmp(ha->addr, addr, dev->addr_len) && - ha->type == NETDEV_HW_ADDR_T_MULTICAST) { - err = -EEXIST; - goto out; - } - } - err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_MULTICAST, true, false); + err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, + NETDEV_HW_ADDR_T_MULTICAST, true, false, + 0, true); if (!err) __dev_set_rx_mode(dev); -out: netif_addr_unlock_bh(dev); return err; } @@ -773,7 +830,8 @@ static int __dev_mc_add(struct net_device *dev, const unsigned char *addr, netif_addr_lock_bh(dev); err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len, - NETDEV_HW_ADDR_T_MULTICAST, global, false, 0); + NETDEV_HW_ADDR_T_MULTICAST, global, false, + 0, false); if (!err) __dev_set_rx_mode(dev); netif_addr_unlock_bh(dev); diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 478d032f34ac..cbab5fec64b1 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -1,10 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/kmod.h> #include <linux/netdevice.h> +#include <linux/inetdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/net_tstamp.h> #include <linux/wireless.h> +#include <linux/if_bridge.h> #include <net/dsa.h> #include <net/wext.h> @@ -25,79 +27,108 @@ static int dev_ifname(struct net *net, struct ifreq *ifr) return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex); } -static gifconf_func_t *gifconf_list[NPROTO]; - -/** - * register_gifconf - register a SIOCGIF handler - * @family: Address family - * @gifconf: Function handler - * - * Register protocol dependent address dumping routines. The handler - * that is passed must not be freed or reused until it has been replaced - * by another handler. - */ -int register_gifconf(unsigned int family, gifconf_func_t *gifconf) -{ - if (family >= NPROTO) - return -EINVAL; - gifconf_list[family] = gifconf; - return 0; -} -EXPORT_SYMBOL(register_gifconf); - /* * Perform a SIOCGIFCONF call. This structure will change * size eventually, and there is nothing I can do about it. * Thus we will need a 'compatibility mode'. */ - -int dev_ifconf(struct net *net, struct ifconf *ifc, int size) +int dev_ifconf(struct net *net, struct ifconf __user *uifc) { struct net_device *dev; - char __user *pos; - int len; - int total; - int i; + void __user *pos; + size_t size; + int len, total = 0, done; - /* - * Fetch the caller's info block. - */ + /* both the ifconf and the ifreq structures are slightly different */ + if (in_compat_syscall()) { + struct compat_ifconf ifc32; - pos = ifc->ifc_buf; - len = ifc->ifc_len; + if (copy_from_user(&ifc32, uifc, sizeof(struct compat_ifconf))) + return -EFAULT; - /* - * Loop over the interfaces, and write an info block for each. - */ + pos = compat_ptr(ifc32.ifcbuf); + len = ifc32.ifc_len; + size = sizeof(struct compat_ifreq); + } else { + struct ifconf ifc; + + if (copy_from_user(&ifc, uifc, sizeof(struct ifconf))) + return -EFAULT; - total = 0; + pos = ifc.ifc_buf; + len = ifc.ifc_len; + size = sizeof(struct ifreq); + } + + /* Loop over the interfaces, and write an info block for each. */ + rtnl_lock(); for_each_netdev(net, dev) { - for (i = 0; i < NPROTO; i++) { - if (gifconf_list[i]) { - int done; - if (!pos) - done = gifconf_list[i](dev, NULL, 0, size); - else - done = gifconf_list[i](dev, pos + total, - len - total, size); - if (done < 0) - return -EFAULT; - total += done; - } + if (!pos) + done = inet_gifconf(dev, NULL, 0, size); + else + done = inet_gifconf(dev, pos + total, + len - total, size); + if (done < 0) { + rtnl_unlock(); + return -EFAULT; } + total += done; } + rtnl_unlock(); - /* - * All done. Write the updated control block back to the caller. - */ - ifc->ifc_len = total; + return put_user(total, &uifc->ifc_len); +} + +static int dev_getifmap(struct net_device *dev, struct ifreq *ifr) +{ + struct ifmap *ifmap = &ifr->ifr_map; + + if (in_compat_syscall()) { + struct compat_ifmap *cifmap = (struct compat_ifmap *)ifmap; + + cifmap->mem_start = dev->mem_start; + cifmap->mem_end = dev->mem_end; + cifmap->base_addr = dev->base_addr; + cifmap->irq = dev->irq; + cifmap->dma = dev->dma; + cifmap->port = dev->if_port; + + return 0; + } + + ifmap->mem_start = dev->mem_start; + ifmap->mem_end = dev->mem_end; + ifmap->base_addr = dev->base_addr; + ifmap->irq = dev->irq; + ifmap->dma = dev->dma; + ifmap->port = dev->if_port; - /* - * Both BSD and Solaris return 0 here, so we do too. - */ return 0; } +static int dev_setifmap(struct net_device *dev, struct ifreq *ifr) +{ + struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map; + + if (!dev->netdev_ops->ndo_set_config) + return -EOPNOTSUPP; + + if (in_compat_syscall()) { + struct ifmap ifmap = { + .mem_start = cifmap->mem_start, + .mem_end = cifmap->mem_end, + .base_addr = cifmap->base_addr, + .irq = cifmap->irq, + .dma = cifmap->dma, + .port = cifmap->port, + }; + + return dev->netdev_ops->ndo_set_config(dev, &ifmap); + } + + return dev->netdev_ops->ndo_set_config(dev, &ifr->ifr_map); +} + /* * Perform the SIOCxIFxxx calls, inside rcu_read_lock() */ @@ -128,13 +159,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm break; case SIOCGIFMAP: - ifr->ifr_map.mem_start = dev->mem_start; - ifr->ifr_map.mem_end = dev->mem_end; - ifr->ifr_map.base_addr = dev->base_addr; - ifr->ifr_map.irq = dev->irq; - ifr->ifr_map.dma = dev->dma; - ifr->ifr_map.port = dev->if_port; - return 0; + return dev_getifmap(dev, ifr); case SIOCGIFINDEX: ifr->ifr_ifindex = dev->ifindex; @@ -215,19 +240,19 @@ static int net_hwtstamp_validate(struct ifreq *ifr) return 0; } -static int dev_do_ioctl(struct net_device *dev, - struct ifreq *ifr, unsigned int cmd) +static int dev_eth_ioctl(struct net_device *dev, + struct ifreq *ifr, unsigned int cmd) { const struct net_device_ops *ops = dev->netdev_ops; int err; - err = dsa_ndo_do_ioctl(dev, ifr, cmd); + err = dsa_ndo_eth_ioctl(dev, ifr, cmd); if (err == 0 || err != -EOPNOTSUPP) return err; - if (ops->ndo_do_ioctl) { + if (ops->ndo_eth_ioctl) { if (netif_device_present(dev)) - err = ops->ndo_do_ioctl(dev, ifr, cmd); + err = ops->ndo_eth_ioctl(dev, ifr, cmd); else err = -ENODEV; } @@ -235,10 +260,55 @@ static int dev_do_ioctl(struct net_device *dev, return err; } +static int dev_siocbond(struct net_device *dev, + struct ifreq *ifr, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocbond) { + if (netif_device_present(dev)) + return ops->ndo_siocbond(dev, ifr, cmd); + else + return -ENODEV; + } + + return -EOPNOTSUPP; +} + +static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr, + void __user *data, unsigned int cmd) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocdevprivate) { + if (netif_device_present(dev)) + return ops->ndo_siocdevprivate(dev, ifr, data, cmd); + else + return -ENODEV; + } + + return -EOPNOTSUPP; +} + +static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (ops->ndo_siocwandev) { + if (netif_device_present(dev)) + return ops->ndo_siocwandev(dev, ifs); + else + return -ENODEV; + } + + return -EOPNOTSUPP; +} + /* * Perform the SIOCxIFxxx calls, inside rtnl_lock() */ -static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) +static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, + unsigned int cmd) { int err; struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); @@ -275,12 +345,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) return 0; case SIOCSIFMAP: - if (ops->ndo_set_config) { - if (!netif_device_present(dev)) - return -ENODEV; - return ops->ndo_set_config(dev, &ifr->ifr_map); - } - return -EOPNOTSUPP; + return dev_setifmap(dev, ifr); case SIOCADDMULTI: if (!ops->ndo_set_rx_mode || @@ -307,6 +372,22 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) ifr->ifr_newname[IFNAMSIZ-1] = '\0'; return dev_change_name(dev, ifr->ifr_newname); + case SIOCWANDEV: + return dev_siocwandev(dev, &ifr->ifr_settings); + + case SIOCBRADDIF: + case SIOCBRDELIF: + if (!netif_device_present(dev)) + return -ENODEV; + if (!netif_is_bridge_master(dev)) + return -EOPNOTSUPP; + dev_hold(dev); + rtnl_unlock(); + err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL); + dev_put(dev); + rtnl_lock(); + return err; + case SIOCSHWTSTAMP: err = net_hwtstamp_validate(ifr); if (err) @@ -317,23 +398,23 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) * Unknown or private ioctl */ default: - if ((cmd >= SIOCDEVPRIVATE && - cmd <= SIOCDEVPRIVATE + 15) || - cmd == SIOCBONDENSLAVE || + if (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) + return dev_siocdevprivate(dev, ifr, data, cmd); + + if (cmd == SIOCGMIIPHY || + cmd == SIOCGMIIREG || + cmd == SIOCSMIIREG || + cmd == SIOCSHWTSTAMP || + cmd == SIOCGHWTSTAMP) { + err = dev_eth_ioctl(dev, ifr, cmd); + } else if (cmd == SIOCBONDENSLAVE || cmd == SIOCBONDRELEASE || cmd == SIOCBONDSETHWADDR || cmd == SIOCBONDSLAVEINFOQUERY || cmd == SIOCBONDINFOQUERY || - cmd == SIOCBONDCHANGEACTIVE || - cmd == SIOCGMIIPHY || - cmd == SIOCGMIIREG || - cmd == SIOCSMIIREG || - cmd == SIOCBRADDIF || - cmd == SIOCBRDELIF || - cmd == SIOCSHWTSTAMP || - cmd == SIOCGHWTSTAMP || - cmd == SIOCWANDEV) { - err = dev_do_ioctl(dev, ifr, cmd); + cmd == SIOCBONDCHANGEACTIVE) { + err = dev_siocbond(dev, ifr, cmd); } else err = -EINVAL; @@ -386,7 +467,8 @@ EXPORT_SYMBOL(dev_load); * positive or a negative errno code on error. */ -int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout) +int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, + void __user *data, bool *need_copyout) { int ret; char *colon; @@ -436,9 +518,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c case SIOCETHTOOL: dev_load(net, ifr->ifr_name); - rtnl_lock(); - ret = dev_ethtool(net, ifr); - rtnl_unlock(); + ret = dev_ethtool(net, ifr, data); if (colon) *colon = ':'; return ret; @@ -456,7 +536,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; rtnl_lock(); - ret = dev_ifsioc(net, ifr, cmd); + ret = dev_ifsioc(net, ifr, data, cmd); rtnl_unlock(); if (colon) *colon = ':'; @@ -502,7 +582,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c case SIOCBONDINFOQUERY: dev_load(net, ifr->ifr_name); rtnl_lock(); - ret = dev_ifsioc(net, ifr, cmd); + ret = dev_ifsioc(net, ifr, data, cmd); rtnl_unlock(); if (need_copyout) *need_copyout = false; @@ -527,7 +607,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c cmd <= SIOCDEVPRIVATE + 15)) { dev_load(net, ifr->ifr_name); rtnl_lock(); - ret = dev_ifsioc(net, ifr, cmd); + ret = dev_ifsioc(net, ifr, data, cmd); rtnl_unlock(); return ret; } diff --git a/net/core/devlink.c b/net/core/devlink.c index 85032626de24..6b5ee862429e 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -30,6 +30,63 @@ #define CREATE_TRACE_POINTS #include <trace/events/devlink.h> +#define DEVLINK_RELOAD_STATS_ARRAY_SIZE \ + (__DEVLINK_RELOAD_LIMIT_MAX * __DEVLINK_RELOAD_ACTION_MAX) + +struct devlink_dev_stats { + u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; + u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; +}; + +struct devlink { + u32 index; + struct list_head port_list; + struct list_head rate_list; + struct list_head sb_list; + struct list_head dpipe_table_list; + struct list_head resource_list; + struct list_head param_list; + struct list_head region_list; + struct list_head reporter_list; + struct mutex reporters_lock; /* protects reporter_list */ + struct devlink_dpipe_headers *dpipe_headers; + struct list_head trap_list; + struct list_head trap_group_list; + struct list_head trap_policer_list; + const struct devlink_ops *ops; + u64 features; + struct xarray snapshot_ids; + struct devlink_dev_stats stats; + struct device *dev; + possible_net_t _net; + /* Serializes access to devlink instance specific objects such as + * port, sb, dpipe, resource, params, region, traps and more. + */ + struct mutex lock; + u8 reload_failed:1; + refcount_t refcount; + struct completion comp; + char priv[0] __aligned(NETDEV_ALIGN); +}; + +void *devlink_priv(struct devlink *devlink) +{ + return &devlink->priv; +} +EXPORT_SYMBOL_GPL(devlink_priv); + +struct devlink *priv_to_devlink(void *priv) +{ + return container_of(priv, struct devlink, priv); +} +EXPORT_SYMBOL_GPL(priv_to_devlink); + +struct device *devlink_to_dev(const struct devlink *devlink) +{ + return devlink->dev; +} +EXPORT_SYMBOL_GPL(devlink_to_dev); + static struct devlink_dpipe_field devlink_dpipe_fields_ethernet[] = { { .name = "destination mac", @@ -45,7 +102,7 @@ struct devlink_dpipe_header devlink_dpipe_header_ethernet = { .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ethernet), .global = true, }; -EXPORT_SYMBOL(devlink_dpipe_header_ethernet); +EXPORT_SYMBOL_GPL(devlink_dpipe_header_ethernet); static struct devlink_dpipe_field devlink_dpipe_fields_ipv4[] = { { @@ -62,7 +119,7 @@ struct devlink_dpipe_header devlink_dpipe_header_ipv4 = { .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv4), .global = true, }; -EXPORT_SYMBOL(devlink_dpipe_header_ipv4); +EXPORT_SYMBOL_GPL(devlink_dpipe_header_ipv4); static struct devlink_dpipe_field devlink_dpipe_fields_ipv6[] = { { @@ -79,7 +136,7 @@ struct devlink_dpipe_header devlink_dpipe_header_ipv6 = { .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv6), .global = true, }; -EXPORT_SYMBOL(devlink_dpipe_header_ipv6); +EXPORT_SYMBOL_GPL(devlink_dpipe_header_ipv6); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); @@ -92,7 +149,24 @@ static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ DEVLINK_PORT_FN_STATE_ACTIVE), }; -static LIST_HEAD(devlink_list); +static DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); +#define DEVLINK_REGISTERED XA_MARK_1 + +/* devlink instances are open to the access from the user space after + * devlink_register() call. Such logical barrier allows us to have certain + * expectations related to locking. + * + * Before *_register() - we are in initialization stage and no parallel + * access possible to the devlink instance. All drivers perform that phase + * by implicitly holding device_lock. + * + * After *_register() - users and driver can access devlink instance at + * the same time. + */ +#define ASSERT_DEVLINK_REGISTERED(d) \ + WARN_ON_ONCE(!xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) +#define ASSERT_DEVLINK_NOT_REGISTERED(d) \ + WARN_ON_ONCE(xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED)) /* devlink_mutex * @@ -108,23 +182,25 @@ struct net *devlink_net(const struct devlink *devlink) } EXPORT_SYMBOL_GPL(devlink_net); -static void __devlink_net_set(struct devlink *devlink, struct net *net) +void devlink_put(struct devlink *devlink) { - write_pnet(&devlink->_net, net); + if (refcount_dec_and_test(&devlink->refcount)) + complete(&devlink->comp); } -void devlink_net_set(struct devlink *devlink, struct net *net) +struct devlink *__must_check devlink_try_get(struct devlink *devlink) { - if (WARN_ON(devlink->registered)) - return; - __devlink_net_set(devlink, net); + if (refcount_inc_not_zero(&devlink->refcount)) + return devlink; + return NULL; } -EXPORT_SYMBOL_GPL(devlink_net_set); static struct devlink *devlink_get_from_attrs(struct net *net, struct nlattr **attrs) { struct devlink *devlink; + unsigned long index; + bool found = false; char *busname; char *devname; @@ -136,19 +212,19 @@ static struct devlink *devlink_get_from_attrs(struct net *net, lockdep_assert_held(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { if (strcmp(devlink->dev->bus->name, busname) == 0 && strcmp(dev_name(devlink->dev), devname) == 0 && - net_eq(devlink_net(devlink), net)) - return devlink; + net_eq(devlink_net(devlink), net)) { + found = true; + break; + } } - return ERR_PTR(-ENODEV); -} + if (!found || !devlink_try_get(devlink)) + devlink = ERR_PTR(-ENODEV); -static struct devlink *devlink_get_from_info(struct genl_info *info) -{ - return devlink_get_from_attrs(genl_info_net(info), info->attrs); + return devlink; } static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, @@ -499,7 +575,7 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, int err; mutex_lock(&devlink_mutex); - devlink = devlink_get_from_info(info); + devlink = devlink_get_from_attrs(genl_info_net(info), info->attrs); if (IS_ERR(devlink)) { mutex_unlock(&devlink_mutex); return PTR_ERR(devlink); @@ -542,6 +618,7 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, unlock: if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) mutex_unlock(&devlink->lock); + devlink_put(devlink); mutex_unlock(&devlink_mutex); return err; } @@ -554,6 +631,7 @@ static void devlink_nl_post_doit(const struct genl_ops *ops, devlink = info->user_ptr[0]; if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) mutex_unlock(&devlink->lock); + devlink_put(devlink); mutex_unlock(&devlink_mutex); } @@ -739,6 +817,7 @@ static void devlink_notify(struct devlink *devlink, enum devlink_command cmd) int err; WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL); + WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -817,10 +896,11 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg, return 0; } -static int -devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops *ops, - struct devlink_port *port, struct sk_buff *msg, - struct netlink_ext_ack *extack, bool *msg_updated) +static int devlink_port_fn_hw_addr_fill(const struct devlink_ops *ops, + struct devlink_port *port, + struct sk_buff *msg, + struct netlink_ext_ack *extack, + bool *msg_updated) { u8 hw_addr[MAX_ADDR_LEN]; int hw_addr_len; @@ -829,7 +909,8 @@ devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops * if (!ops->port_function_hw_addr_get) return 0; - err = ops->port_function_hw_addr_get(devlink, port, hw_addr, &hw_addr_len, extack); + err = ops->port_function_hw_addr_get(port, hw_addr, &hw_addr_len, + extack); if (err) { if (err == -EOPNOTSUPP) return 0; @@ -843,12 +924,11 @@ devlink_port_fn_hw_addr_fill(struct devlink *devlink, const struct devlink_ops * } static int devlink_nl_rate_fill(struct sk_buff *msg, - struct devlink *devlink, struct devlink_rate *devlink_rate, - enum devlink_command cmd, u32 portid, - u32 seq, int flags, - struct netlink_ext_ack *extack) + enum devlink_command cmd, u32 portid, u32 seq, + int flags, struct netlink_ext_ack *extack) { + struct devlink *devlink = devlink_rate->devlink; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); @@ -906,12 +986,11 @@ devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate) opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED; } -static int -devlink_port_fn_state_fill(struct devlink *devlink, - const struct devlink_ops *ops, - struct devlink_port *port, struct sk_buff *msg, - struct netlink_ext_ack *extack, - bool *msg_updated) +static int devlink_port_fn_state_fill(const struct devlink_ops *ops, + struct devlink_port *port, + struct sk_buff *msg, + struct netlink_ext_ack *extack, + bool *msg_updated) { enum devlink_port_fn_opstate opstate; enum devlink_port_fn_state state; @@ -920,7 +999,7 @@ devlink_port_fn_state_fill(struct devlink *devlink, if (!ops->port_fn_state_get) return 0; - err = ops->port_fn_state_get(devlink, port, &state, &opstate, extack); + err = ops->port_fn_state_get(port, &state, &opstate, extack); if (err) { if (err == -EOPNOTSUPP) return 0; @@ -948,7 +1027,6 @@ static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) { - struct devlink *devlink = port->devlink; const struct devlink_ops *ops; struct nlattr *function_attr; bool msg_updated = false; @@ -958,13 +1036,12 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por if (!function_attr) return -EMSGSIZE; - ops = devlink->ops; - err = devlink_port_fn_hw_addr_fill(devlink, ops, port, msg, - extack, &msg_updated); + ops = port->devlink->ops; + err = devlink_port_fn_hw_addr_fill(ops, port, msg, extack, + &msg_updated); if (err) goto out; - err = devlink_port_fn_state_fill(devlink, ops, port, msg, extack, - &msg_updated); + err = devlink_port_fn_state_fill(ops, port, msg, extack, &msg_updated); out: if (err || !msg_updated) nla_nest_cancel(msg, function_attr); @@ -973,12 +1050,12 @@ out: return err; } -static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink, +static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink_port *devlink_port, - enum devlink_command cmd, u32 portid, - u32 seq, int flags, - struct netlink_ext_ack *extack) + enum devlink_command cmd, u32 portid, u32 seq, + int flags, struct netlink_ext_ack *extack) { + struct devlink *devlink = devlink_port->devlink; void *hdr; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); @@ -1043,24 +1120,23 @@ static void devlink_port_notify(struct devlink_port *devlink_port, struct sk_buff *msg; int err; - if (!devlink_port->registered) - return; - WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL); + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; - err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0, - NULL); + err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, + 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } static void devlink_rate_notify(struct devlink_rate *devlink_rate, @@ -1070,22 +1146,23 @@ static void devlink_rate_notify(struct devlink_rate *devlink_rate, struct sk_buff *msg; int err; - WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && - cmd != DEVLINK_CMD_RATE_DEL); + WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL); + + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; - err = devlink_nl_rate_fill(msg, devlink, devlink_rate, - cmd, 0, 0, 0, NULL); + err = devlink_nl_rate_fill(msg, devlink_rate, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; } - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, + 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, @@ -1094,13 +1171,18 @@ static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, struct devlink_rate *devlink_rate; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_rate, &devlink->rate_list, list) { enum devlink_command cmd = DEVLINK_CMD_RATE_NEW; @@ -1110,18 +1192,19 @@ static int devlink_nl_cmd_rate_get_dumpit(struct sk_buff *msg, idx++; continue; } - err = devlink_nl_rate_fill(msg, devlink, - devlink_rate, - cmd, id, + err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id, cb->nlh->nlmsg_seq, NLM_F_MULTI, NULL); if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -1136,7 +1219,6 @@ static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_rate *devlink_rate = info->user_ptr[1]; - struct devlink *devlink = devlink_rate->devlink; struct sk_buff *msg; int err; @@ -1144,8 +1226,7 @@ static int devlink_nl_cmd_rate_get_doit(struct sk_buff *skb, if (!msg) return -ENOMEM; - err = devlink_nl_rate_fill(msg, devlink, devlink_rate, - DEVLINK_CMD_RATE_NEW, + err = devlink_nl_rate_fill(msg, devlink_rate, DEVLINK_CMD_RATE_NEW, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { @@ -1193,20 +1274,30 @@ static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg, { struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) { + devlink_put(devlink); + continue; + } + if (idx < start) { idx++; + devlink_put(devlink); continue; } + err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); + devlink_put(devlink); if (err) goto out; idx++; @@ -1222,7 +1313,6 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = devlink_port->devlink; struct sk_buff *msg; int err; @@ -1230,8 +1320,7 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, if (!msg) return -ENOMEM; - err = devlink_nl_port_fill(msg, devlink, devlink_port, - DEVLINK_CMD_PORT_NEW, + err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, info->snd_portid, info->snd_seq, 0, info->extack); if (err) { @@ -1248,32 +1337,39 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, struct devlink *devlink; struct devlink_port *devlink_port; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_port, &devlink->port_list, list) { if (idx < start) { idx++; continue; } - err = devlink_nl_port_fill(msg, devlink, devlink_port, + err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI, - cb->extack); + NLM_F_MULTI, cb->extack); if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -1282,31 +1378,33 @@ out: return msg->len; } -static int devlink_port_type_set(struct devlink *devlink, - struct devlink_port *devlink_port, +static int devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type port_type) { int err; - if (devlink->ops->port_type_set) { - if (port_type == devlink_port->type) - return 0; - err = devlink->ops->port_type_set(devlink_port, port_type); - if (err) - return err; - devlink_port->desired_type = port_type; - devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + if (!devlink_port->devlink->ops->port_type_set) + return -EOPNOTSUPP; + + if (port_type == devlink_port->type) return 0; - } - return -EOPNOTSUPP; + + err = devlink_port->devlink->ops->port_type_set(devlink_port, + port_type); + if (err) + return err; + + devlink_port->desired_type = port_type; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + return 0; } -static int -devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port *port, - const struct nlattr *attr, struct netlink_ext_ack *extack) +static int devlink_port_function_hw_addr_set(struct devlink_port *port, + const struct nlattr *attr, + struct netlink_ext_ack *extack) { - const struct devlink_ops *ops; + const struct devlink_ops *ops = port->devlink->ops; const u8 *hw_addr; int hw_addr_len; @@ -1327,17 +1425,16 @@ devlink_port_function_hw_addr_set(struct devlink *devlink, struct devlink_port * } } - ops = devlink->ops; if (!ops->port_function_hw_addr_set) { NL_SET_ERR_MSG_MOD(extack, "Port doesn't support function attributes"); return -EOPNOTSUPP; } - return ops->port_function_hw_addr_set(devlink, port, hw_addr, hw_addr_len, extack); + return ops->port_function_hw_addr_set(port, hw_addr, hw_addr_len, + extack); } -static int devlink_port_fn_state_set(struct devlink *devlink, - struct devlink_port *port, +static int devlink_port_fn_state_set(struct devlink_port *port, const struct nlattr *attr, struct netlink_ext_ack *extack) { @@ -1345,18 +1442,18 @@ static int devlink_port_fn_state_set(struct devlink *devlink, const struct devlink_ops *ops; state = nla_get_u8(attr); - ops = devlink->ops; + ops = port->devlink->ops; if (!ops->port_fn_state_set) { NL_SET_ERR_MSG_MOD(extack, "Function does not support state setting"); return -EOPNOTSUPP; } - return ops->port_fn_state_set(devlink, port, state, extack); + return ops->port_fn_state_set(port, state, extack); } -static int -devlink_port_function_set(struct devlink *devlink, struct devlink_port *port, - const struct nlattr *attr, struct netlink_ext_ack *extack) +static int devlink_port_function_set(struct devlink_port *port, + const struct nlattr *attr, + struct netlink_ext_ack *extack) { struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1]; int err; @@ -1370,7 +1467,7 @@ devlink_port_function_set(struct devlink *devlink, struct devlink_port *port, attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]; if (attr) { - err = devlink_port_function_hw_addr_set(devlink, port, attr, extack); + err = devlink_port_function_hw_addr_set(port, attr, extack); if (err) return err; } @@ -1380,7 +1477,7 @@ devlink_port_function_set(struct devlink *devlink, struct devlink_port *port, */ attr = tb[DEVLINK_PORT_FN_ATTR_STATE]; if (attr) - err = devlink_port_fn_state_set(devlink, port, attr, extack); + err = devlink_port_fn_state_set(port, attr, extack); if (!err) devlink_port_notify(port, DEVLINK_CMD_PORT_NEW); @@ -1391,14 +1488,13 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink_port *devlink_port = info->user_ptr[1]; - struct devlink *devlink = devlink_port->devlink; int err; if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) { enum devlink_port_type port_type; port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]); - err = devlink_port_type_set(devlink, devlink_port, port_type); + err = devlink_port_type_set(devlink_port, port_type); if (err) return err; } @@ -1407,7 +1503,7 @@ static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb, struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION]; struct netlink_ext_ack *extack = info->extack; - err = devlink_port_function_set(devlink, devlink_port, attr, extack); + err = devlink_port_function_set(devlink_port, attr, extack); if (err) return err; } @@ -1502,9 +1598,8 @@ static int devlink_port_new_notifiy(struct devlink *devlink, goto out; } - err = devlink_nl_port_fill(msg, devlink, devlink_port, - DEVLINK_CMD_NEW, info->snd_portid, - info->snd_seq, 0, NULL); + err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, + info->snd_portid, info->snd_seq, 0, NULL); if (err) goto out; @@ -1908,13 +2003,18 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, struct devlink *devlink; struct devlink_sb *devlink_sb; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { if (idx < start) { @@ -1928,11 +2028,14 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, NLM_F_MULTI); if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -2052,14 +2155,19 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, struct devlink *devlink; struct devlink_sb *devlink_sb; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) + continue; + if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops->sb_pool_get) - continue; + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_pool_get_dumpit(msg, start, &idx, devlink, @@ -2070,10 +2178,13 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, err = 0; } else if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -2265,14 +2376,19 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, struct devlink *devlink; struct devlink_sb *devlink_sb; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) + continue; + if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops->sb_port_pool_get) - continue; + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_port_pool_get_dumpit(msg, start, &idx, @@ -2283,10 +2399,13 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, err = 0; } else if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -2506,14 +2625,18 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, struct devlink *devlink; struct devlink_sb *devlink_sb; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) + continue; + if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops->sb_tc_pool_bind_get) - continue; + goto retry; mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { @@ -2526,10 +2649,13 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, err = 0; } else if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -3241,7 +3367,7 @@ void devlink_dpipe_entry_clear(struct devlink_dpipe_entry *entry) kfree(value[value_index].mask); } } -EXPORT_SYMBOL(devlink_dpipe_entry_clear); +EXPORT_SYMBOL_GPL(devlink_dpipe_entry_clear); static int devlink_dpipe_entries_fill(struct genl_info *info, enum devlink_command cmd, int flags, @@ -3801,10 +3927,12 @@ static void devlink_param_notify(struct devlink *devlink, struct devlink_param_item *param_item, enum devlink_command cmd); -static void devlink_reload_netns_change(struct devlink *devlink, - struct net *dest_net) +static void devlink_ns_change_notify(struct devlink *devlink, + struct net *dest_net, struct net *curr_net, + bool new) { struct devlink_param_item *param_item; + enum devlink_command cmd; /* Userspace needs to be notified about devlink objects * removed from original and entering new network namespace. @@ -3812,17 +3940,18 @@ static void devlink_reload_netns_change(struct devlink *devlink, * reload process so the notifications are generated separatelly. */ - list_for_each_entry(param_item, &devlink->param_list, list) - devlink_param_notify(devlink, 0, param_item, - DEVLINK_CMD_PARAM_DEL); - devlink_notify(devlink, DEVLINK_CMD_DEL); + if (!dest_net || net_eq(dest_net, curr_net)) + return; - __devlink_net_set(devlink, dest_net); + if (new) + devlink_notify(devlink, DEVLINK_CMD_NEW); - devlink_notify(devlink, DEVLINK_CMD_NEW); + cmd = new ? DEVLINK_CMD_PARAM_NEW : DEVLINK_CMD_PARAM_DEL; list_for_each_entry(param_item, &devlink->param_list, list) - devlink_param_notify(devlink, 0, param_item, - DEVLINK_CMD_PARAM_NEW); + devlink_param_notify(devlink, 0, param_item, cmd); + + if (!new) + devlink_notify(devlink, DEVLINK_CMD_DEL); } static bool devlink_reload_supported(const struct devlink_ops *ops) @@ -3902,25 +4031,27 @@ static int devlink_reload(struct devlink *devlink, struct net *dest_net, u32 *actions_performed, struct netlink_ext_ack *extack) { u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; + struct net *curr_net; int err; - if (!devlink->reload_enabled) - return -EOPNOTSUPP; - memcpy(remote_reload_stats, devlink->stats.remote_reload_stats, sizeof(remote_reload_stats)); + + curr_net = devlink_net(devlink); + devlink_ns_change_notify(devlink, dest_net, curr_net, false); err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack); if (err) return err; - if (dest_net && !net_eq(dest_net, devlink_net(devlink))) - devlink_reload_netns_change(devlink, dest_net); + if (dest_net && !net_eq(dest_net, curr_net)) + write_pnet(&devlink->_net, dest_net); err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack); devlink_reload_failed_set(devlink, !!err); if (err) return err; + devlink_ns_change_notify(devlink, dest_net, curr_net, true); WARN_ON(!(*actions_performed & BIT(action))); /* Catch driver on updating the remote action within devlink reload */ WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats, @@ -3970,7 +4101,7 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) u32 actions_performed; int err; - if (!devlink_reload_supported(devlink->ops)) + if (!(devlink->features & DEVLINK_F_RELOAD)) return -EOPNOTSUPP; err = devlink_resources_validate(devlink, NULL, info); @@ -4098,6 +4229,7 @@ static void __devlink_flash_update_notify(struct devlink *devlink, WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE && cmd != DEVLINK_CMD_FLASH_UPDATE_END && cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS); + WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -4117,7 +4249,7 @@ out_free_msg: static void devlink_flash_update_begin_notify(struct devlink *devlink) { - struct devlink_flash_notify params = { 0 }; + struct devlink_flash_notify params = {}; __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE, @@ -4126,7 +4258,7 @@ static void devlink_flash_update_begin_notify(struct devlink *devlink) static void devlink_flash_update_end_notify(struct devlink *devlink) { - struct devlink_flash_notify params = { 0 }; + struct devlink_flash_notify params = {}; __devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE_END, @@ -4283,6 +4415,21 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH, + .name = DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA, + .name = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET, + .name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) @@ -4455,8 +4602,6 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, return -EOPNOTSUPP; param_value[i] = param_item->driverinit_value; } else { - if (!param_item->published) - continue; ctx.cmode = i; err = devlink_param_get(devlink, param, &ctx); if (err) @@ -4532,6 +4677,7 @@ static void devlink_param_notify(struct devlink *devlink, WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL && cmd != DEVLINK_CMD_PORT_PARAM_NEW && cmd != DEVLINK_CMD_PORT_PARAM_DEL); + ASSERT_DEVLINK_REGISTERED(devlink); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -4553,13 +4699,18 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, struct devlink_param_item *param_item; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(param_item, &devlink->param_list, list) { if (idx < start) { @@ -4575,11 +4726,14 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, err = 0; } else if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -4773,47 +4927,6 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, info, DEVLINK_CMD_PARAM_NEW); } -static int devlink_param_register_one(struct devlink *devlink, - unsigned int port_index, - struct list_head *param_list, - const struct devlink_param *param, - enum devlink_command cmd) -{ - struct devlink_param_item *param_item; - - if (devlink_param_find_by_name(param_list, param->name)) - return -EEXIST; - - if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) - WARN_ON(param->get || param->set); - else - WARN_ON(!param->get || !param->set); - - param_item = kzalloc(sizeof(*param_item), GFP_KERNEL); - if (!param_item) - return -ENOMEM; - param_item->param = param; - - list_add_tail(¶m_item->list, param_list); - devlink_param_notify(devlink, port_index, param_item, cmd); - return 0; -} - -static void devlink_param_unregister_one(struct devlink *devlink, - unsigned int port_index, - struct list_head *param_list, - const struct devlink_param *param, - enum devlink_command cmd) -{ - struct devlink_param_item *param_item; - - param_item = devlink_param_find_by_name(param_list, param->name); - WARN_ON(!param_item); - devlink_param_notify(devlink, port_index, param_item, cmd); - list_del(¶m_item->list); - kfree(param_item); -} - static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { @@ -4821,13 +4934,18 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, struct devlink_port *devlink_port; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_port, &devlink->port_list, list) { list_for_each_entry(param_item, @@ -4847,12 +4965,15 @@ static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, err = 0; } else if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -4987,6 +5108,11 @@ static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink, if (err) goto nla_put_failure; + err = nla_put_u32(msg, DEVLINK_ATTR_REGION_MAX_SNAPSHOTS, + region->max_snapshots); + if (err) + goto nla_put_failure; + err = devlink_nl_region_snapshots_id_put(msg, devlink, region); if (err) goto nla_put_failure; @@ -5066,13 +5192,15 @@ static void devlink_nl_region_notify(struct devlink_region *region, struct sk_buff *msg; WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; msg = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0); if (IS_ERR(msg)) return; - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, + 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } /** @@ -5390,15 +5518,22 @@ static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg, { struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; - int err; + int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + err = devlink_nl_cmd_region_get_devlink_dumpit(msg, cb, devlink, &idx, start); +retry: + devlink_put(devlink); if (err) goto out; } @@ -5761,6 +5896,7 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, nla_nest_end(skb, chunks_attr); genlmsg_end(skb, hdr); mutex_unlock(&devlink->lock); + devlink_put(devlink); mutex_unlock(&devlink_mutex); return skb->len; @@ -5769,6 +5905,7 @@ nla_put_failure: genlmsg_cancel(skb, hdr); out_unlock: mutex_unlock(&devlink->lock); + devlink_put(devlink); out_dev: mutex_unlock(&devlink_mutex); return err; @@ -5915,22 +6052,20 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, { struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err = 0; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - continue; - if (idx < start) { - idx++; + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; - } - if (!devlink->ops->info_get) { - idx++; - continue; - } + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + + if (idx < start || !devlink->ops->info_get) + goto inc; mutex_lock(&devlink->lock); err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, @@ -5940,9 +6075,14 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, mutex_unlock(&devlink->lock); if (err == -EOPNOTSUPP) err = 0; - else if (err) + else if (err) { + devlink_put(devlink); break; + } +inc: idx++; +retry: + devlink_put(devlink); } mutex_unlock(&devlink_mutex); @@ -6174,23 +6314,21 @@ static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg, return 0; } -int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value) +static int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value) { if (fmsg->putting_binary) return -EINVAL; return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG); } -EXPORT_SYMBOL_GPL(devlink_fmsg_bool_put); -int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value) +static int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value) { if (fmsg->putting_binary) return -EINVAL; return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8); } -EXPORT_SYMBOL_GPL(devlink_fmsg_u8_put); int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value) { @@ -6201,14 +6339,13 @@ int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value) } EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put); -int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value) +static int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value) { if (fmsg->putting_binary) return -EINVAL; return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64); } -EXPORT_SYMBOL_GPL(devlink_fmsg_u64_put); int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) { @@ -6756,11 +6893,11 @@ EXPORT_SYMBOL_GPL(devlink_port_health_reporter_destroy); static int devlink_nl_health_reporter_fill(struct sk_buff *msg, - struct devlink *devlink, struct devlink_health_reporter *reporter, enum devlink_command cmd, u32 portid, u32 seq, int flags) { + struct devlink *devlink = reporter->devlink; struct nlattr *reporter_attr; void *hdr; @@ -6828,25 +6965,25 @@ genlmsg_cancel: static void devlink_recover_notify(struct devlink_health_reporter *reporter, enum devlink_command cmd) { + struct devlink *devlink = reporter->devlink; struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER); + WARN_ON(!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; - err = devlink_nl_health_reporter_fill(msg, reporter->devlink, - reporter, cmd, 0, 0, 0); + err = devlink_nl_health_reporter_fill(msg, reporter, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; } - genlmsg_multicast_netns(&devlink_nl_family, - devlink_net(reporter->devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, + 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } void @@ -7028,6 +7165,7 @@ devlink_health_reporter_get_from_cb(struct netlink_callback *cb) goto unlock; reporter = devlink_health_reporter_get_from_attrs(devlink, attrs); + devlink_put(devlink); mutex_unlock(&devlink_mutex); return reporter; unlock: @@ -7071,7 +7209,7 @@ static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, goto out; } - err = devlink_nl_health_reporter_fill(msg, devlink, reporter, + err = devlink_nl_health_reporter_fill(msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, info->snd_portid, info->snd_seq, 0); @@ -7094,13 +7232,18 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, struct devlink_port *port; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry_rep; + mutex_lock(&devlink->reporters_lock); list_for_each_entry(reporter, &devlink->reporter_list, list) { @@ -7108,24 +7251,29 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, idx++; continue; } - err = devlink_nl_health_reporter_fill(msg, devlink, - reporter, - DEVLINK_CMD_HEALTH_REPORTER_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); + err = devlink_nl_health_reporter_fill( + msg, reporter, DEVLINK_CMD_HEALTH_REPORTER_GET, + NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + NLM_F_MULTI); if (err) { mutex_unlock(&devlink->reporters_lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->reporters_lock); +retry_rep: + devlink_put(devlink); } - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry_port; + mutex_lock(&devlink->lock); list_for_each_entry(port, &devlink->port_list, list) { mutex_lock(&port->reporters_lock); @@ -7134,14 +7282,15 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, idx++; continue; } - err = devlink_nl_health_reporter_fill(msg, devlink, reporter, - DEVLINK_CMD_HEALTH_REPORTER_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); + err = devlink_nl_health_reporter_fill( + msg, reporter, + DEVLINK_CMD_HEALTH_REPORTER_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); if (err) { mutex_unlock(&port->reporters_lock); mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; @@ -7149,6 +7298,8 @@ devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, mutex_unlock(&port->reporters_lock); } mutex_unlock(&devlink->lock); +retry_port: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -7677,13 +7828,18 @@ static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg, struct devlink_trap_item *trap_item; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(trap_item, &devlink->trap_list, list) { if (idx < start) { @@ -7697,11 +7853,14 @@ static int devlink_nl_cmd_trap_get_dumpit(struct sk_buff *msg, NLM_F_MULTI); if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -7896,13 +8055,18 @@ static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, u32 portid = NETLINK_CB(cb->skb).portid; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(group_item, &devlink->trap_group_list, list) { @@ -7917,11 +8081,14 @@ static int devlink_nl_cmd_trap_group_get_dumpit(struct sk_buff *msg, NLM_F_MULTI); if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -8202,13 +8369,18 @@ static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg, u32 portid = NETLINK_CB(cb->skb).portid; struct devlink *devlink; int start = cb->args[0]; + unsigned long index; int idx = 0; int err; mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) continue; + + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + goto retry; + mutex_lock(&devlink->lock); list_for_each_entry(policer_item, &devlink->trap_policer_list, list) { @@ -8223,11 +8395,14 @@ static int devlink_nl_cmd_trap_policer_get_dumpit(struct sk_buff *msg, NLM_F_MULTI); if (err) { mutex_unlock(&devlink->lock); + devlink_put(devlink); goto out; } idx++; } mutex_unlock(&devlink->lock); +retry: + devlink_put(devlink); } out: mutex_unlock(&devlink_mutex); @@ -8768,30 +8943,63 @@ static bool devlink_reload_actions_valid(const struct devlink_ops *ops) } /** - * devlink_alloc - Allocate new devlink instance resources + * devlink_set_features - Set devlink supported features + * + * @devlink: devlink + * @features: devlink support features + * + * This interface allows us to set reload ops separatelly from + * the devlink_alloc. + */ +void devlink_set_features(struct devlink *devlink, u64 features) +{ + ASSERT_DEVLINK_NOT_REGISTERED(devlink); + + WARN_ON(features & DEVLINK_F_RELOAD && + !devlink_reload_supported(devlink->ops)); + devlink->features = features; +} +EXPORT_SYMBOL_GPL(devlink_set_features); + +/** + * devlink_alloc_ns - Allocate new devlink instance resources + * in specific namespace * * @ops: ops * @priv_size: size of user private data + * @net: net namespace + * @dev: parent device * * Allocate new devlink instance resources, including devlink index * and name. */ -struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) +struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, + size_t priv_size, struct net *net, + struct device *dev) { struct devlink *devlink; + static u32 last_id; + int ret; - if (WARN_ON(!ops)) - return NULL; - + WARN_ON(!ops || !dev); if (!devlink_reload_actions_valid(ops)) return NULL; devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); if (!devlink) return NULL; + + ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b, + &last_id, GFP_KERNEL); + if (ret < 0) { + kfree(devlink); + return NULL; + } + + devlink->dev = dev; devlink->ops = ops; xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); - __devlink_net_set(devlink, &init_net); + write_pnet(&devlink->_net, net); INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->rate_list); INIT_LIST_HEAD(&devlink->sb_list); @@ -8805,78 +9013,133 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->trap_policer_list); mutex_init(&devlink->lock); mutex_init(&devlink->reporters_lock); + refcount_set(&devlink->refcount, 1); + init_completion(&devlink->comp); + return devlink; } -EXPORT_SYMBOL_GPL(devlink_alloc); +EXPORT_SYMBOL_GPL(devlink_alloc_ns); -/** - * devlink_register - Register devlink instance - * - * @devlink: devlink - * @dev: parent device - */ -int devlink_register(struct devlink *devlink, struct device *dev) +static void +devlink_trap_policer_notify(struct devlink *devlink, + const struct devlink_trap_policer_item *policer_item, + enum devlink_command cmd); +static void +devlink_trap_group_notify(struct devlink *devlink, + const struct devlink_trap_group_item *group_item, + enum devlink_command cmd); +static void devlink_trap_notify(struct devlink *devlink, + const struct devlink_trap_item *trap_item, + enum devlink_command cmd); + +static void devlink_notify_register(struct devlink *devlink) { - devlink->dev = dev; - devlink->registered = true; - mutex_lock(&devlink_mutex); - list_add_tail(&devlink->list, &devlink_list); + struct devlink_trap_policer_item *policer_item; + struct devlink_trap_group_item *group_item; + struct devlink_param_item *param_item; + struct devlink_trap_item *trap_item; + struct devlink_port *devlink_port; + struct devlink_rate *rate_node; + struct devlink_region *region; + devlink_notify(devlink, DEVLINK_CMD_NEW); - mutex_unlock(&devlink_mutex); - return 0; + list_for_each_entry(devlink_port, &devlink->port_list, list) + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); + + list_for_each_entry(policer_item, &devlink->trap_policer_list, list) + devlink_trap_policer_notify(devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_NEW); + + list_for_each_entry(group_item, &devlink->trap_group_list, list) + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_NEW); + + list_for_each_entry(trap_item, &devlink->trap_list, list) + devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_NEW); + + list_for_each_entry(rate_node, &devlink->rate_list, list) + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); + + list_for_each_entry(region, &devlink->region_list, list) + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW); + + list_for_each_entry(param_item, &devlink->param_list, list) + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_NEW); } -EXPORT_SYMBOL_GPL(devlink_register); -/** - * devlink_unregister - Unregister devlink instance - * - * @devlink: devlink - */ -void devlink_unregister(struct devlink *devlink) +static void devlink_notify_unregister(struct devlink *devlink) { - mutex_lock(&devlink_mutex); - WARN_ON(devlink_reload_supported(devlink->ops) && - devlink->reload_enabled); + struct devlink_trap_policer_item *policer_item; + struct devlink_trap_group_item *group_item; + struct devlink_param_item *param_item; + struct devlink_trap_item *trap_item; + struct devlink_port *devlink_port; + struct devlink_rate *rate_node; + struct devlink_region *region; + + list_for_each_entry_reverse(param_item, &devlink->param_list, list) + devlink_param_notify(devlink, 0, param_item, + DEVLINK_CMD_PARAM_DEL); + + list_for_each_entry_reverse(region, &devlink->region_list, list) + devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL); + + list_for_each_entry_reverse(rate_node, &devlink->rate_list, list) + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL); + + list_for_each_entry_reverse(trap_item, &devlink->trap_list, list) + devlink_trap_notify(devlink, trap_item, DEVLINK_CMD_TRAP_DEL); + + list_for_each_entry_reverse(group_item, &devlink->trap_group_list, list) + devlink_trap_group_notify(devlink, group_item, + DEVLINK_CMD_TRAP_GROUP_DEL); + list_for_each_entry_reverse(policer_item, &devlink->trap_policer_list, + list) + devlink_trap_policer_notify(devlink, policer_item, + DEVLINK_CMD_TRAP_POLICER_DEL); + + list_for_each_entry_reverse(devlink_port, &devlink->port_list, list) + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); devlink_notify(devlink, DEVLINK_CMD_DEL); - list_del(&devlink->list); - mutex_unlock(&devlink_mutex); } -EXPORT_SYMBOL_GPL(devlink_unregister); /** - * devlink_reload_enable - Enable reload of devlink instance + * devlink_register - Register devlink instance * * @devlink: devlink - * - * Should be called at end of device initialization - * process when reload operation is supported. */ -void devlink_reload_enable(struct devlink *devlink) +void devlink_register(struct devlink *devlink) { + ASSERT_DEVLINK_NOT_REGISTERED(devlink); + /* Make sure that we are in .probe() routine */ + mutex_lock(&devlink_mutex); - devlink->reload_enabled = true; + xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); + devlink_notify_register(devlink); mutex_unlock(&devlink_mutex); } -EXPORT_SYMBOL_GPL(devlink_reload_enable); +EXPORT_SYMBOL_GPL(devlink_register); /** - * devlink_reload_disable - Disable reload of devlink instance + * devlink_unregister - Unregister devlink instance * * @devlink: devlink - * - * Should be called at the beginning of device cleanup - * process when reload operation is supported. */ -void devlink_reload_disable(struct devlink *devlink) +void devlink_unregister(struct devlink *devlink) { + ASSERT_DEVLINK_REGISTERED(devlink); + /* Make sure that we are in .remove() routine */ + + devlink_put(devlink); + wait_for_completion(&devlink->comp); + mutex_lock(&devlink_mutex); - /* Mutex is taken which ensures that no reload operation is in - * progress while setting up forbidded flag. - */ - devlink->reload_enabled = false; + devlink_notify_unregister(devlink); + xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); mutex_unlock(&devlink_mutex); } -EXPORT_SYMBOL_GPL(devlink_reload_disable); +EXPORT_SYMBOL_GPL(devlink_unregister); /** * devlink_free - Free devlink instance resources @@ -8885,6 +9148,8 @@ EXPORT_SYMBOL_GPL(devlink_reload_disable); */ void devlink_free(struct devlink *devlink) { + ASSERT_DEVLINK_NOT_REGISTERED(devlink); + mutex_destroy(&devlink->reporters_lock); mutex_destroy(&devlink->lock); WARN_ON(!list_empty(&devlink->trap_policer_list)); @@ -8900,6 +9165,7 @@ void devlink_free(struct devlink *devlink) WARN_ON(!list_empty(&devlink->port_list)); xa_destroy(&devlink->snapshot_ids); + xa_erase(&devlinks, devlink->index); kfree(devlink); } @@ -8960,9 +9226,10 @@ int devlink_port_register(struct devlink *devlink, mutex_unlock(&devlink->lock); return -EEXIST; } + + WARN_ON(devlink_port->devlink); devlink_port->devlink = devlink; devlink_port->index = port_index; - devlink_port->registered = true; spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); mutex_init(&devlink_port->reporters_lock); @@ -9001,7 +9268,7 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type type, void *type_dev) { - if (WARN_ON(!devlink_port->registered)) + if (WARN_ON(!devlink_port->devlink)) return; devlink_port_type_warn_cancel(devlink_port); spin_lock_bh(&devlink_port->type_lock); @@ -9121,7 +9388,7 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, { int ret; - if (WARN_ON(devlink_port->registered)) + if (WARN_ON(devlink_port->devlink)) return; devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); @@ -9145,7 +9412,7 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->registered)) + if (WARN_ON(devlink_port->devlink)) return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); @@ -9172,7 +9439,7 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->registered)) + if (WARN_ON(devlink_port->devlink)) return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); @@ -9200,7 +9467,7 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->registered)) + if (WARN_ON(devlink_port->devlink)) return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF); @@ -9788,61 +10055,6 @@ static int devlink_param_verify(const struct devlink_param *param) return devlink_param_driver_verify(param); } -static int __devlink_params_register(struct devlink *devlink, - unsigned int port_index, - struct list_head *param_list, - const struct devlink_param *params, - size_t params_count, - enum devlink_command reg_cmd, - enum devlink_command unreg_cmd) -{ - const struct devlink_param *param = params; - int i; - int err; - - mutex_lock(&devlink->lock); - for (i = 0; i < params_count; i++, param++) { - err = devlink_param_verify(param); - if (err) - goto rollback; - - err = devlink_param_register_one(devlink, port_index, - param_list, param, reg_cmd); - if (err) - goto rollback; - } - - mutex_unlock(&devlink->lock); - return 0; - -rollback: - if (!i) - goto unlock; - for (param--; i > 0; i--, param--) - devlink_param_unregister_one(devlink, port_index, param_list, - param, unreg_cmd); -unlock: - mutex_unlock(&devlink->lock); - return err; -} - -static void __devlink_params_unregister(struct devlink *devlink, - unsigned int port_index, - struct list_head *param_list, - const struct devlink_param *params, - size_t params_count, - enum devlink_command cmd) -{ - const struct devlink_param *param = params; - int i; - - mutex_lock(&devlink->lock); - for (i = 0; i < params_count; i++, param++) - devlink_param_unregister_one(devlink, 0, param_list, param, - cmd); - mutex_unlock(&devlink->lock); -} - /** * devlink_params_register - register configuration parameters * @@ -9856,10 +10068,25 @@ int devlink_params_register(struct devlink *devlink, const struct devlink_param *params, size_t params_count) { - return __devlink_params_register(devlink, 0, &devlink->param_list, - params, params_count, - DEVLINK_CMD_PARAM_NEW, - DEVLINK_CMD_PARAM_DEL); + const struct devlink_param *param = params; + int i, err; + + ASSERT_DEVLINK_NOT_REGISTERED(devlink); + + for (i = 0; i < params_count; i++, param++) { + err = devlink_param_register(devlink, param); + if (err) + goto rollback; + } + return 0; + +rollback: + if (!i) + return err; + + for (param--; i > 0; i--, param--) + devlink_param_unregister(devlink, param); + return err; } EXPORT_SYMBOL_GPL(devlink_params_register); @@ -9873,145 +10100,70 @@ void devlink_params_unregister(struct devlink *devlink, const struct devlink_param *params, size_t params_count) { - return __devlink_params_unregister(devlink, 0, &devlink->param_list, - params, params_count, - DEVLINK_CMD_PARAM_DEL); -} -EXPORT_SYMBOL_GPL(devlink_params_unregister); + const struct devlink_param *param = params; + int i; -/** - * devlink_params_publish - publish configuration parameters - * - * @devlink: devlink - * - * Publish previously registered configuration parameters. - */ -void devlink_params_publish(struct devlink *devlink) -{ - struct devlink_param_item *param_item; + ASSERT_DEVLINK_NOT_REGISTERED(devlink); - list_for_each_entry(param_item, &devlink->param_list, list) { - if (param_item->published) - continue; - param_item->published = true; - devlink_param_notify(devlink, 0, param_item, - DEVLINK_CMD_PARAM_NEW); - } + for (i = 0; i < params_count; i++, param++) + devlink_param_unregister(devlink, param); } -EXPORT_SYMBOL_GPL(devlink_params_publish); +EXPORT_SYMBOL_GPL(devlink_params_unregister); /** - * devlink_params_unpublish - unpublish configuration parameters + * devlink_param_register - register one configuration parameter * - * @devlink: devlink + * @devlink: devlink + * @param: one configuration parameter * - * Unpublish previously registered configuration parameters. + * Register the configuration parameter supported by the driver. + * Return: returns 0 on successful registration or error code otherwise. */ -void devlink_params_unpublish(struct devlink *devlink) +int devlink_param_register(struct devlink *devlink, + const struct devlink_param *param) { struct devlink_param_item *param_item; - list_for_each_entry(param_item, &devlink->param_list, list) { - if (!param_item->published) - continue; - param_item->published = false; - devlink_param_notify(devlink, 0, param_item, - DEVLINK_CMD_PARAM_DEL); - } -} -EXPORT_SYMBOL_GPL(devlink_params_unpublish); - -/** - * devlink_port_params_register - register port configuration parameters - * - * @devlink_port: devlink port - * @params: configuration parameters array - * @params_count: number of parameters provided - * - * Register the configuration parameters supported by the port. - */ -int devlink_port_params_register(struct devlink_port *devlink_port, - const struct devlink_param *params, - size_t params_count) -{ - return __devlink_params_register(devlink_port->devlink, - devlink_port->index, - &devlink_port->param_list, params, - params_count, - DEVLINK_CMD_PORT_PARAM_NEW, - DEVLINK_CMD_PORT_PARAM_DEL); -} -EXPORT_SYMBOL_GPL(devlink_port_params_register); + ASSERT_DEVLINK_NOT_REGISTERED(devlink); -/** - * devlink_port_params_unregister - unregister port configuration - * parameters - * - * @devlink_port: devlink port - * @params: configuration parameters array - * @params_count: number of parameters provided - */ -void devlink_port_params_unregister(struct devlink_port *devlink_port, - const struct devlink_param *params, - size_t params_count) -{ - return __devlink_params_unregister(devlink_port->devlink, - devlink_port->index, - &devlink_port->param_list, - params, params_count, - DEVLINK_CMD_PORT_PARAM_DEL); -} -EXPORT_SYMBOL_GPL(devlink_port_params_unregister); + WARN_ON(devlink_param_verify(param)); + WARN_ON(devlink_param_find_by_name(&devlink->param_list, param->name)); -static int -__devlink_param_driverinit_value_get(struct list_head *param_list, u32 param_id, - union devlink_param_value *init_val) -{ - struct devlink_param_item *param_item; + if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) + WARN_ON(param->get || param->set); + else + WARN_ON(!param->get || !param->set); - param_item = devlink_param_find_by_id(param_list, param_id); + param_item = kzalloc(sizeof(*param_item), GFP_KERNEL); if (!param_item) - return -EINVAL; - - if (!param_item->driverinit_value_valid || - !devlink_param_cmode_is_supported(param_item->param, - DEVLINK_PARAM_CMODE_DRIVERINIT)) - return -EOPNOTSUPP; + return -ENOMEM; - if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING) - strcpy(init_val->vstr, param_item->driverinit_value.vstr); - else - *init_val = param_item->driverinit_value; + param_item->param = param; + list_add_tail(¶m_item->list, &devlink->param_list); return 0; } +EXPORT_SYMBOL_GPL(devlink_param_register); -static int -__devlink_param_driverinit_value_set(struct devlink *devlink, - unsigned int port_index, - struct list_head *param_list, u32 param_id, - union devlink_param_value init_val, - enum devlink_command cmd) +/** + * devlink_param_unregister - unregister one configuration parameter + * @devlink: devlink + * @param: configuration parameter to unregister + */ +void devlink_param_unregister(struct devlink *devlink, + const struct devlink_param *param) { struct devlink_param_item *param_item; - param_item = devlink_param_find_by_id(param_list, param_id); - if (!param_item) - return -EINVAL; - - if (!devlink_param_cmode_is_supported(param_item->param, - DEVLINK_PARAM_CMODE_DRIVERINIT)) - return -EOPNOTSUPP; + ASSERT_DEVLINK_NOT_REGISTERED(devlink); - if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING) - strcpy(param_item->driverinit_value.vstr, init_val.vstr); - else - param_item->driverinit_value = init_val; - param_item->driverinit_value_valid = true; - - devlink_param_notify(devlink, port_index, param_item, cmd); - return 0; + param_item = + devlink_param_find_by_name(&devlink->param_list, param->name); + WARN_ON(!param_item); + list_del(¶m_item->list); + kfree(param_item); } +EXPORT_SYMBOL_GPL(devlink_param_unregister); /** * devlink_param_driverinit_value_get - get configuration parameter @@ -10027,11 +10179,26 @@ __devlink_param_driverinit_value_set(struct devlink *devlink, int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id, union devlink_param_value *init_val) { + struct devlink_param_item *param_item; + if (!devlink_reload_supported(devlink->ops)) return -EOPNOTSUPP; - return __devlink_param_driverinit_value_get(&devlink->param_list, - param_id, init_val); + param_item = devlink_param_find_by_id(&devlink->param_list, param_id); + if (!param_item) + return -EINVAL; + + if (!param_item->driverinit_value_valid || + !devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT)) + return -EOPNOTSUPP; + + if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING) + strcpy(init_val->vstr, param_item->driverinit_value.vstr); + else + *init_val = param_item->driverinit_value; + + return 0; } EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get); @@ -10050,61 +10217,26 @@ EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get); int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, union devlink_param_value init_val) { - return __devlink_param_driverinit_value_set(devlink, 0, - &devlink->param_list, - param_id, init_val, - DEVLINK_CMD_PARAM_NEW); -} -EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); + struct devlink_param_item *param_item; -/** - * devlink_port_param_driverinit_value_get - get configuration parameter - * value for driver initializing - * - * @devlink_port: devlink_port - * @param_id: parameter ID - * @init_val: value of parameter in driverinit configuration mode - * - * This function should be used by the driver to get driverinit - * configuration for initialization after reload command. - */ -int devlink_port_param_driverinit_value_get(struct devlink_port *devlink_port, - u32 param_id, - union devlink_param_value *init_val) -{ - struct devlink *devlink = devlink_port->devlink; + ASSERT_DEVLINK_NOT_REGISTERED(devlink); - if (!devlink_reload_supported(devlink->ops)) - return -EOPNOTSUPP; + param_item = devlink_param_find_by_id(&devlink->param_list, param_id); + if (!param_item) + return -EINVAL; - return __devlink_param_driverinit_value_get(&devlink_port->param_list, - param_id, init_val); -} -EXPORT_SYMBOL_GPL(devlink_port_param_driverinit_value_get); + if (!devlink_param_cmode_is_supported(param_item->param, + DEVLINK_PARAM_CMODE_DRIVERINIT)) + return -EOPNOTSUPP; -/** - * devlink_port_param_driverinit_value_set - set value of configuration - * parameter for driverinit - * configuration mode - * - * @devlink_port: devlink_port - * @param_id: parameter ID - * @init_val: value of parameter to set for driverinit configuration mode - * - * This function should be used by the driver to set driverinit - * configuration mode default value. - */ -int devlink_port_param_driverinit_value_set(struct devlink_port *devlink_port, - u32 param_id, - union devlink_param_value init_val) -{ - return __devlink_param_driverinit_value_set(devlink_port->devlink, - devlink_port->index, - &devlink_port->param_list, - param_id, init_val, - DEVLINK_CMD_PORT_PARAM_NEW); + if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING) + strcpy(param_item->driverinit_value.vstr, init_val.vstr); + else + param_item->driverinit_value = init_val; + param_item->driverinit_value_valid = true; + return 0; } -EXPORT_SYMBOL_GPL(devlink_port_param_driverinit_value_set); +EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set); /** * devlink_param_value_changed - notify devlink on a parameter's value @@ -10130,50 +10262,6 @@ void devlink_param_value_changed(struct devlink *devlink, u32 param_id) EXPORT_SYMBOL_GPL(devlink_param_value_changed); /** - * devlink_port_param_value_changed - notify devlink on a parameter's value - * change. Should be called by the driver - * right after the change. - * - * @devlink_port: devlink_port - * @param_id: parameter ID - * - * This function should be used by the driver to notify devlink on value - * change, excluding driverinit configuration mode. - * For driverinit configuration mode driver should use the function - * devlink_port_param_driverinit_value_set() instead. - */ -void devlink_port_param_value_changed(struct devlink_port *devlink_port, - u32 param_id) -{ - struct devlink_param_item *param_item; - - param_item = devlink_param_find_by_id(&devlink_port->param_list, - param_id); - WARN_ON(!param_item); - - devlink_param_notify(devlink_port->devlink, devlink_port->index, - param_item, DEVLINK_CMD_PORT_PARAM_NEW); -} -EXPORT_SYMBOL_GPL(devlink_port_param_value_changed); - -/** - * devlink_param_value_str_fill - Safely fill-up the string preventing - * from overflow of the preallocated buffer - * - * @dst_val: destination devlink_param_value - * @src: source buffer - */ -void devlink_param_value_str_fill(union devlink_param_value *dst_val, - const char *src) -{ - size_t len; - - len = strlcpy(dst_val->vstr, src, __DEVLINK_PARAM_MAX_STRING_VALUE); - WARN_ON(len >= __DEVLINK_PARAM_MAX_STRING_VALUE); -} -EXPORT_SYMBOL_GPL(devlink_param_value_str_fill); - -/** * devlink_region_create - create a new address region * * @devlink: devlink @@ -10591,6 +10679,8 @@ devlink_trap_group_notify(struct devlink *devlink, WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_GROUP_NEW && cmd != DEVLINK_CMD_TRAP_GROUP_DEL); + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -10632,6 +10722,8 @@ static void devlink_trap_notify(struct devlink *devlink, WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_NEW && cmd != DEVLINK_CMD_TRAP_DEL); + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -11013,6 +11105,8 @@ devlink_trap_policer_notify(struct devlink *devlink, WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_POLICER_NEW && cmd != DEVLINK_CMD_TRAP_POLICER_DEL); + if (!xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED)) + return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -11181,45 +11275,36 @@ free_msg: nlmsg_free(msg); } -void devlink_compat_running_version(struct net_device *dev, - char *buf, size_t len) +static struct devlink_port *netdev_to_devlink_port(struct net_device *dev) { - struct devlink *devlink; + if (!dev->netdev_ops->ndo_get_devlink_port) + return NULL; - dev_hold(dev); - rtnl_unlock(); + return dev->netdev_ops->ndo_get_devlink_port(dev); +} - devlink = netdev_to_devlink(dev); - if (!devlink || !devlink->ops->info_get) - goto out; +void devlink_compat_running_version(struct devlink *devlink, + char *buf, size_t len) +{ + if (!devlink->ops->info_get) + return; mutex_lock(&devlink->lock); __devlink_compat_running_version(devlink, buf, len); mutex_unlock(&devlink->lock); - -out: - rtnl_lock(); - dev_put(dev); } -int devlink_compat_flash_update(struct net_device *dev, const char *file_name) +int devlink_compat_flash_update(struct devlink *devlink, const char *file_name) { struct devlink_flash_update_params params = {}; - struct devlink *devlink; int ret; - dev_hold(dev); - rtnl_unlock(); - - devlink = netdev_to_devlink(dev); - if (!devlink || !devlink->ops->flash_update) { - ret = -EOPNOTSUPP; - goto out; - } + if (!devlink->ops->flash_update) + return -EOPNOTSUPP; ret = request_firmware(¶ms.fw, file_name, devlink->dev); if (ret) - goto out; + return ret; mutex_lock(&devlink->lock); devlink_flash_update_begin_notify(devlink); @@ -11229,10 +11314,6 @@ int devlink_compat_flash_update(struct net_device *dev, const char *file_name) release_firmware(params.fw); -out: - rtnl_lock(); - dev_put(dev); - return ret; } @@ -11276,23 +11357,29 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net) { struct devlink *devlink; u32 actions_performed; + unsigned long index; int err; /* In case network namespace is getting destroyed, reload * all devlink instances from this namespace into init_net. */ mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (net_eq(devlink_net(devlink), net)) { - if (WARN_ON(!devlink_reload_supported(devlink->ops))) - continue; - err = devlink_reload(devlink, &init_net, - DEVLINK_RELOAD_ACTION_DRIVER_REINIT, - DEVLINK_RELOAD_LIMIT_UNSPEC, - &actions_performed, NULL); - if (err && err != -EOPNOTSUPP) - pr_warn("Failed to reload devlink instance into init_net\n"); - } + xa_for_each_marked(&devlinks, index, devlink, DEVLINK_REGISTERED) { + if (!devlink_try_get(devlink)) + continue; + + if (!net_eq(devlink_net(devlink), net)) + goto retry; + + WARN_ON(!(devlink->features & DEVLINK_F_RELOAD)); + err = devlink_reload(devlink, &init_net, + DEVLINK_RELOAD_ACTION_DRIVER_REINIT, + DEVLINK_RELOAD_LIMIT_UNSPEC, + &actions_performed, NULL); + if (err && err != -EOPNOTSUPP) + pr_warn("Failed to reload devlink instance into init_net\n"); +retry: + devlink_put(devlink); } mutex_unlock(&devlink_mutex); } diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index ead2a8aa57b4..49442cae6f69 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -850,8 +850,7 @@ net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata) } hw_metadata->input_dev = metadata->input_dev; - if (hw_metadata->input_dev) - dev_hold(hw_metadata->input_dev); + dev_hold(hw_metadata->input_dev); return hw_metadata; @@ -867,8 +866,7 @@ free_hw_metadata: static void net_dm_hw_metadata_free(const struct devlink_trap_metadata *hw_metadata) { - if (hw_metadata->input_dev) - dev_put(hw_metadata->input_dev); + dev_put(hw_metadata->input_dev); kfree(hw_metadata->fa_cookie); kfree(hw_metadata->trap_name); kfree(hw_metadata->trap_group_name); diff --git a/net/core/dst.c b/net/core/dst.c index fb3bcba87744..497ef9b3fc6a 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -49,8 +49,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, unsigned short flags) { dst->dev = dev; - if (dev) - dev_hold(dev); + dev_hold(dev); dst->ops = ops; dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; @@ -118,8 +117,7 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) if (dst->ops->destroy) dst->ops->destroy(dst); - if (dst->dev) - dev_put(dst->dev); + dev_put(dst->dev); lwtstate_put(dst->lwtstate); diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index a9f937975080..79df7cd9dbc1 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -57,7 +57,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops, { struct fib_rule *r; - r = kzalloc(ops->rule_size, GFP_KERNEL); + r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (r == NULL) return -ENOMEM; @@ -541,7 +541,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - nlrule = kzalloc(ops->rule_size, GFP_KERNEL); + nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT); if (!nlrule) { err = -ENOMEM; goto errout; diff --git a/net/core/filter.c b/net/core/filter.c index d70187ce851b..8e8d3b49c297 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -77,6 +77,7 @@ #include <net/transp_v6.h> #include <linux/btf_ids.h> #include <net/tls.h> +#include <net/xdp.h> static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id); @@ -113,7 +114,7 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); * Run the eBPF program and then cut skb->data to correct size returned by * the program. If pkt_len is 0 we toss packet. If skb->len is smaller * than pkt_len we keep whole skb->data. This is the socket level - * wrapper to BPF_PROG_RUN. It returns 0 if the packet should + * wrapper to bpf_prog_run. It returns 0 if the packet should * be accepted or -EPERM if the packet should be tossed. * */ @@ -2179,17 +2180,9 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, skb->tstamp = 0; if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { - struct sk_buff *skb2; - - skb2 = skb_realloc_headroom(skb, hh_len); - if (unlikely(!skb2)) { - kfree_skb(skb); + skb = skb_expand_head(skb, hh_len); + if (!skb) return -ENOMEM; - } - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - consume_skb(skb); - skb = skb2; } rcu_read_lock_bh(); @@ -2213,8 +2206,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb, } rcu_read_unlock_bh(); if (dst) - IP6_INC_STATS(dev_net(dst->dev), - ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); out_drop: kfree_skb(skb); return -ENETDOWN; @@ -2286,17 +2278,9 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb, skb->tstamp = 0; if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { - struct sk_buff *skb2; - - skb2 = skb_realloc_headroom(skb, hh_len); - if (unlikely(!skb2)) { - kfree_skb(skb); + skb = skb_expand_head(skb, hh_len); + if (!skb) return -ENOMEM; - } - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - consume_skb(skb); - skb = skb2; } rcu_read_lock_bh(); @@ -3880,8 +3864,7 @@ BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset) if (unlikely(meta < xdp_frame_end || meta > xdp->data)) return -EINVAL; - if (unlikely((metalen & (sizeof(__u32) - 1)) || - (metalen > 32))) + if (unlikely(xdp_metalen_invalid(metalen))) return -EACCES; xdp->data_meta = meta; @@ -3950,6 +3933,31 @@ void bpf_clear_redirect_map(struct bpf_map *map) } } +DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); +EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key); + +u32 xdp_master_redirect(struct xdp_buff *xdp) +{ + struct net_device *master, *slave; + struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); + + master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev); + slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp); + if (slave && slave != xdp->rxq->dev) { + /* The target device is different from the receiving device, so + * redirect it to the new device. + * Using XDP_REDIRECT gets the correct behaviour from XDP enabled + * drivers to unmap the packet from their rx ring. + */ + ri->tgt_index = slave->ifindex; + ri->map_id = INT_MAX; + ri->map_type = BPF_MAP_TYPE_UNSPEC; + return XDP_REDIRECT; + } + return XDP_TX; +} +EXPORT_SYMBOL_GPL(xdp_master_redirect); + int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { @@ -4040,8 +4048,12 @@ static int xdp_do_generic_redirect_map(struct net_device *dev, goto err; consume_skb(skb); break; + case BPF_MAP_TYPE_CPUMAP: + err = cpu_map_generic_redirect(fwd, skb); + if (unlikely(err)) + goto err; + break; default: - /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; goto err; } @@ -4664,6 +4676,30 @@ static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = { .arg1_type = ARG_PTR_TO_CTX_OR_NULL, }; +BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = { + .func = bpf_get_netns_cookie_sock_ops, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + +BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx) +{ + return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL); +} + +static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = { + .func = bpf_get_netns_cookie_sk_msg, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, +}; + BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) { struct sock *sk = sk_to_full_sk(skb->sk); @@ -5012,6 +5048,46 @@ err_clear: return -EINVAL; } +BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + if (level == SOL_TCP && optname == TCP_CONGESTION) { + if (optlen >= sizeof("cdg") - 1 && + !strncmp("cdg", optval, optlen)) + return -ENOTSUPP; + } + + return _bpf_setsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_setsockopt_proto = { + .func = bpf_sk_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(sk, level, optname, optval, optlen); +} + +const struct bpf_func_proto bpf_sk_getsockopt_proto = { + .func = bpf_sk_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, int, level, int, optname, char *, optval, int, optlen) { @@ -7445,6 +7521,8 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sock_ops_proto; #ifdef CONFIG_INET case BPF_FUNC_load_hdr_opt: return &bpf_sock_ops_load_hdr_opt_proto; @@ -7491,6 +7569,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_get_netns_cookie: + return &bpf_get_netns_cookie_sk_msg_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; @@ -7685,6 +7765,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type break; case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): return false; + case bpf_ctx_range(struct __sk_buff, hwtstamp): + if (type == BPF_WRITE || size != sizeof(__u64)) + return false; + break; case bpf_ctx_range(struct __sk_buff, tstamp): if (size != sizeof(__u64)) return false; @@ -7694,6 +7778,9 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type return false; info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; break; + case offsetofend(struct __sk_buff, gso_size) ... offsetof(struct __sk_buff, hwtstamp) - 1: + /* Explicitly prohibit access to padding in __sk_buff. */ + return false; default: /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { @@ -7722,6 +7809,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): + case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } @@ -7792,6 +7880,7 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): + case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } @@ -8293,6 +8382,7 @@ static bool sk_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, tstamp): case bpf_ctx_range(struct __sk_buff, wire_len): + case bpf_ctx_range(struct __sk_buff, hwtstamp): return false; } @@ -8804,6 +8894,17 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct sk_buff, sk)); break; + case offsetof(struct __sk_buff, hwtstamp): + BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8); + BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0); + + insn = bpf_convert_shinfo_access(si, insn); + *insn++ = BPF_LDX_MEM(BPF_DW, + si->dst_reg, si->dst_reg, + bpf_target_off(struct skb_shared_info, + hwtstamps, 8, + target_size)); + break; } return insn - insn_buf; @@ -10069,7 +10170,7 @@ struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, enum sk_action action; bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash); - action = BPF_PROG_RUN(prog, &reuse_kern); + action = bpf_prog_run(prog, &reuse_kern); if (action == SK_PASS) return reuse_kern.selected_sk; @@ -10622,6 +10723,26 @@ const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], }; +BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk) +{ + /* unix_sock type is not generated in dwarf and hence btf, + * trigger an explicit type generation here. + */ + BTF_TYPE_EMIT(struct unix_sock); + if (sk && sk_fullsock(sk) && sk->sk_family == AF_UNIX) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +const struct bpf_func_proto bpf_skc_to_unix_sock_proto = { + .func = bpf_skc_to_unix_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, + .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX], +}; + BPF_CALL_1(bpf_sock_from_file, struct file *, file) { return (unsigned long)sock_from_file(file); @@ -10661,6 +10782,9 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_skc_to_udp6_sock: func = &bpf_skc_to_udp6_sock_proto; break; + case BPF_FUNC_skc_to_unix_sock: + func = &bpf_skc_to_unix_sock_proto; + break; default: return bpf_base_func_proto(func_id); } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 4b2415d34873..3255f57f5131 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1056,8 +1056,10 @@ proto_again: FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container); - memcpy(&key_addrs->v4addrs, &iph->saddr, - sizeof(key_addrs->v4addrs)); + memcpy(&key_addrs->v4addrs.src, &iph->saddr, + sizeof(key_addrs->v4addrs.src)); + memcpy(&key_addrs->v4addrs.dst, &iph->daddr, + sizeof(key_addrs->v4addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } @@ -1101,8 +1103,10 @@ proto_again: FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); - memcpy(&key_addrs->v6addrs, &iph->saddr, - sizeof(key_addrs->v6addrs)); + memcpy(&key_addrs->v6addrs.src, &iph->saddr, + sizeof(key_addrs->v6addrs.src)); + memcpy(&key_addrs->v6addrs.dst, &iph->daddr, + sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } @@ -1192,9 +1196,8 @@ proto_again: break; } - proto = hdr->proto; nhoff += PPPOE_SES_HLEN; - switch (proto) { + switch (hdr->proto) { case htons(PPP_IP): proto = htons(ETH_P_IP); fdret = FLOW_DISSECT_RET_PROTO_AGAIN; @@ -1303,6 +1306,11 @@ ip_proto_again: switch (ip_proto) { case IPPROTO_GRE: + if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector, target_container, data, &proto, &nhoff, &hlen, flags); @@ -1360,6 +1368,11 @@ ip_proto_again: break; } case IPPROTO_IPIP: + if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + proto = htons(ETH_P_IP); key_control->flags |= FLOW_DIS_ENCAPSULATION; @@ -1372,6 +1385,11 @@ ip_proto_again: break; case IPPROTO_IPV6: + if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + proto = htons(ETH_P_IPV6); key_control->flags |= FLOW_DIS_ENCAPSULATION; diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 715b67f6c62f..6beaea13564a 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -321,13 +321,13 @@ EXPORT_SYMBOL(flow_block_cb_setup_simple); static DEFINE_MUTEX(flow_indr_block_lock); static LIST_HEAD(flow_block_indr_list); static LIST_HEAD(flow_block_indr_dev_list); +static LIST_HEAD(flow_indir_dev_list); struct flow_indr_dev { struct list_head list; flow_indr_block_bind_cb_t *cb; void *cb_priv; refcount_t refcnt; - struct rcu_head rcu; }; static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb, @@ -346,6 +346,33 @@ static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb, return indr_dev; } +struct flow_indir_dev_info { + void *data; + struct net_device *dev; + struct Qdisc *sch; + enum tc_setup_type type; + void (*cleanup)(struct flow_block_cb *block_cb); + struct list_head list; + enum flow_block_command command; + enum flow_block_binder_type binder_type; + struct list_head *cb_list; +}; + +static void existing_qdiscs_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) +{ + struct flow_block_offload bo; + struct flow_indir_dev_info *cur; + + list_for_each_entry(cur, &flow_indir_dev_list, list) { + memset(&bo, 0, sizeof(bo)); + bo.command = cur->command; + bo.binder_type = cur->binder_type; + INIT_LIST_HEAD(&bo.cb_list); + cb(cur->dev, cur->sch, cb_priv, cur->type, &bo, cur->data, cur->cleanup); + list_splice(&bo.cb_list, cur->cb_list); + } +} + int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) { struct flow_indr_dev *indr_dev; @@ -367,6 +394,7 @@ int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) } list_add(&indr_dev->list, &flow_block_indr_dev_list); + existing_qdiscs_register(cb, cb_priv); mutex_unlock(&flow_indr_block_lock); return 0; @@ -463,7 +491,59 @@ out: } EXPORT_SYMBOL(flow_indr_block_cb_alloc); -int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, +static struct flow_indir_dev_info *find_indir_dev(void *data) +{ + struct flow_indir_dev_info *cur; + + list_for_each_entry(cur, &flow_indir_dev_list, list) { + if (cur->data == data) + return cur; + } + return NULL; +} + +static int indir_dev_add(void *data, struct net_device *dev, struct Qdisc *sch, + enum tc_setup_type type, void (*cleanup)(struct flow_block_cb *block_cb), + struct flow_block_offload *bo) +{ + struct flow_indir_dev_info *info; + + info = find_indir_dev(data); + if (info) + return -EEXIST; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->data = data; + info->dev = dev; + info->sch = sch; + info->type = type; + info->cleanup = cleanup; + info->command = bo->command; + info->binder_type = bo->binder_type; + info->cb_list = bo->cb_list_head; + + list_add(&info->list, &flow_indir_dev_list); + return 0; +} + +static int indir_dev_remove(void *data) +{ + struct flow_indir_dev_info *info; + + info = find_indir_dev(data); + if (!info) + return -ENOENT; + + list_del(&info->list); + + kfree(info); + return 0; +} + +int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, enum tc_setup_type type, void *data, struct flow_block_offload *bo, void (*cleanup)(struct flow_block_cb *block_cb)) @@ -471,6 +551,12 @@ int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, struct flow_indr_dev *this; mutex_lock(&flow_indr_block_lock); + + if (bo->command == FLOW_BLOCK_BIND) + indir_dev_add(data, dev, sch, type, cleanup, bo); + else if (bo->command == FLOW_BLOCK_UNBIND) + indir_dev_remove(data); + list_for_each_entry(this, &flow_block_indr_dev_list, list) this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 8e582e29a41e..4fcbdd71c59f 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -40,10 +40,10 @@ */ struct net_rate_estimator { - struct gnet_stats_basic_packed *bstats; + struct gnet_stats_basic_sync *bstats; spinlock_t *stats_lock; - seqcount_t *running; - struct gnet_stats_basic_cpu __percpu *cpu_bstats; + bool running; + struct gnet_stats_basic_sync __percpu *cpu_bstats; u8 ewma_log; u8 intvl_log; /* period : (250ms << intvl_log) */ @@ -60,13 +60,13 @@ struct net_rate_estimator { }; static void est_fetch_counters(struct net_rate_estimator *e, - struct gnet_stats_basic_packed *b) + struct gnet_stats_basic_sync *b) { - memset(b, 0, sizeof(*b)); + gnet_stats_basic_sync_init(b); if (e->stats_lock) spin_lock(e->stats_lock); - __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats); + gnet_stats_add_basic(b, e->cpu_bstats, e->bstats, e->running); if (e->stats_lock) spin_unlock(e->stats_lock); @@ -76,14 +76,18 @@ static void est_fetch_counters(struct net_rate_estimator *e, static void est_timer(struct timer_list *t) { struct net_rate_estimator *est = from_timer(est, t, timer); - struct gnet_stats_basic_packed b; + struct gnet_stats_basic_sync b; + u64 b_bytes, b_packets; u64 rate, brate; est_fetch_counters(est, &b); - brate = (b.bytes - est->last_bytes) << (10 - est->intvl_log); + b_bytes = u64_stats_read(&b.bytes); + b_packets = u64_stats_read(&b.packets); + + brate = (b_bytes - est->last_bytes) << (10 - est->intvl_log); brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log); - rate = (b.packets - est->last_packets) << (10 - est->intvl_log); + rate = (b_packets - est->last_packets) << (10 - est->intvl_log); rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); @@ -91,8 +95,8 @@ static void est_timer(struct timer_list *t) est->avpps += rate; write_seqcount_end(&est->seq); - est->last_bytes = b.bytes; - est->last_packets = b.packets; + est->last_bytes = b_bytes; + est->last_packets = b_packets; est->next_jiffies += ((HZ/4) << est->intvl_log); @@ -109,7 +113,9 @@ static void est_timer(struct timer_list *t) * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @lock: lock for statistics and control path - * @running: qdisc running seqcount + * @running: true if @bstats represents a running qdisc, thus @bstats' + * internal values might change during basic reads. Only used + * if @bstats_cpu is NULL * @opt: rate estimator configuration TLV * * Creates a new rate estimator with &bstats as source and &rate_est @@ -121,16 +127,16 @@ static void est_timer(struct timer_list *t) * Returns 0 on success or a negative error code. * */ -int gen_new_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu_bstats, +int gen_new_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, spinlock_t *lock, - seqcount_t *running, + bool running, struct nlattr *opt) { struct gnet_estimator *parm = nla_data(opt); struct net_rate_estimator *old, *est; - struct gnet_stats_basic_packed b; + struct gnet_stats_basic_sync b; int intvl_log; if (nla_len(opt) < sizeof(*parm)) @@ -164,8 +170,8 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, est_fetch_counters(est, &b); if (lock) local_bh_enable(); - est->last_bytes = b.bytes; - est->last_packets = b.packets; + est->last_bytes = u64_stats_read(&b.bytes); + est->last_packets = u64_stats_read(&b.packets); if (lock) spin_lock_bh(lock); @@ -214,7 +220,9 @@ EXPORT_SYMBOL(gen_kill_estimator); * @cpu_bstats: bstats per cpu * @rate_est: rate estimator statistics * @lock: lock for statistics and control path - * @running: qdisc running seqcount (might be NULL) + * @running: true if @bstats represents a running qdisc, thus @bstats' + * internal values might change during basic reads. Only used + * if @cpu_bstats is NULL * @opt: rate estimator configuration TLV * * Replaces the configuration of a rate estimator by calling @@ -222,11 +230,11 @@ EXPORT_SYMBOL(gen_kill_estimator); * * Returns 0 on success or a negative error code. */ -int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu_bstats, +int gen_replace_estimator(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, spinlock_t *lock, - seqcount_t *running, struct nlattr *opt) + bool running, struct nlattr *opt) { return gen_new_estimator(bstats, cpu_bstats, rate_est, lock, running, opt); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index e491b083b348..a10335b4ba2d 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -18,7 +18,7 @@ #include <linux/gen_stats.h> #include <net/netlink.h> #include <net/gen_stats.h> - +#include <net/sch_generic.h> static inline int gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr) @@ -114,63 +114,112 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, } EXPORT_SYMBOL(gnet_stats_start_copy); -static void -__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu) +/* Must not be inlined, due to u64_stats seqcount_t lockdep key */ +void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b) { + u64_stats_set(&b->bytes, 0); + u64_stats_set(&b->packets, 0); + u64_stats_init(&b->syncp); +} +EXPORT_SYMBOL(gnet_stats_basic_sync_init); + +static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu) +{ + u64 t_bytes = 0, t_packets = 0; int i; for_each_possible_cpu(i) { - struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i); + struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); unsigned int start; u64 bytes, packets; do { start = u64_stats_fetch_begin_irq(&bcpu->syncp); - bytes = bcpu->bstats.bytes; - packets = bcpu->bstats.packets; + bytes = u64_stats_read(&bcpu->bytes); + packets = u64_stats_read(&bcpu->packets); } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); - bstats->bytes += bytes; - bstats->packets += packets; + t_bytes += bytes; + t_packets += packets; + } + _bstats_update(bstats, t_bytes, t_packets); +} + +void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running) +{ + unsigned int start; + u64 bytes = 0; + u64 packets = 0; + + WARN_ON_ONCE((cpu || running) && in_hardirq()); + + if (cpu) { + gnet_stats_add_basic_cpu(bstats, cpu); + return; } + do { + if (running) + start = u64_stats_fetch_begin_irq(&b->syncp); + bytes = u64_stats_read(&b->bytes); + packets = u64_stats_read(&b->packets); + } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); + + _bstats_update(bstats, bytes, packets); } +EXPORT_SYMBOL(gnet_stats_add_basic); -void -__gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_stats_basic_packed *bstats, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b) +static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, bool running) { - unsigned int seq; + unsigned int start; if (cpu) { - __gnet_stats_copy_basic_cpu(bstats, cpu); + u64 t_bytes = 0, t_packets = 0; + int i; + + for_each_possible_cpu(i) { + struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); + unsigned int start; + u64 bytes, packets; + + do { + start = u64_stats_fetch_begin_irq(&bcpu->syncp); + bytes = u64_stats_read(&bcpu->bytes); + packets = u64_stats_read(&bcpu->packets); + } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); + + t_bytes += bytes; + t_packets += packets; + } + *ret_bytes = t_bytes; + *ret_packets = t_packets; return; } do { if (running) - seq = read_seqcount_begin(running); - bstats->bytes = b->bytes; - bstats->packets = b->packets; - } while (running && read_seqcount_retry(running, seq)); + start = u64_stats_fetch_begin_irq(&b->syncp); + *ret_bytes = u64_stats_read(&b->bytes); + *ret_packets = u64_stats_read(&b->packets); + } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); } -EXPORT_SYMBOL(__gnet_stats_copy_basic); static int -___gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b, - int type) +___gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + int type, bool running) { - struct gnet_stats_basic_packed bstats = {0}; + u64 bstats_bytes, bstats_packets; - __gnet_stats_copy_basic(running, &bstats, cpu, b); + gnet_stats_read_basic(&bstats_bytes, &bstats_packets, cpu, b, running); if (d->compat_tc_stats && type == TCA_STATS_BASIC) { - d->tc_stats.bytes = bstats.bytes; - d->tc_stats.packets = bstats.packets; + d->tc_stats.bytes = bstats_bytes; + d->tc_stats.packets = bstats_packets; } if (d->tail) { @@ -178,24 +227,28 @@ ___gnet_stats_copy_basic(const seqcount_t *running, int res; memset(&sb, 0, sizeof(sb)); - sb.bytes = bstats.bytes; - sb.packets = bstats.packets; + sb.bytes = bstats_bytes; + sb.packets = bstats_packets; res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD); - if (res < 0 || sb.packets == bstats.packets) + if (res < 0 || sb.packets == bstats_packets) return res; /* emit 64bit stats only if needed */ - return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats.packets, - sizeof(bstats.packets), TCA_STATS_PAD); + return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats_packets, + sizeof(bstats_packets), TCA_STATS_PAD); } return 0; } /** * gnet_stats_copy_basic - copy basic statistics into statistic TLV - * @running: seqcount_t pointer * @d: dumping handle * @cpu: copy statistic per cpu * @b: basic statistics + * @running: true if @b represents a running qdisc, thus @b's + * internal values might change during basic reads. + * Only used if @cpu is NULL + * + * Context: task; must not be run from IRQ or BH contexts * * Appends the basic statistics to the top level TLV created by * gnet_stats_start_copy(). @@ -204,22 +257,25 @@ ___gnet_stats_copy_basic(const seqcount_t *running, * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b) +gnet_stats_copy_basic(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + bool running) { - return ___gnet_stats_copy_basic(running, d, cpu, b, - TCA_STATS_BASIC); + return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC, running); } EXPORT_SYMBOL(gnet_stats_copy_basic); /** * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV - * @running: seqcount_t pointer * @d: dumping handle * @cpu: copy statistic per cpu * @b: basic statistics + * @running: true if @b represents a running qdisc, thus @b's + * internal values might change during basic reads. + * Only used if @cpu is NULL + * + * Context: task; must not be run from IRQ or BH contexts * * Appends the basic statistics to the top level TLV created by * gnet_stats_start_copy(). @@ -228,13 +284,12 @@ EXPORT_SYMBOL(gnet_stats_copy_basic); * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_basic_hw(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b) +gnet_stats_copy_basic_hw(struct gnet_dump *d, + struct gnet_stats_basic_sync __percpu *cpu, + struct gnet_stats_basic_sync *b, + bool running) { - return ___gnet_stats_copy_basic(running, d, cpu, b, - TCA_STATS_BASIC_HW); + return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC_HW, running); } EXPORT_SYMBOL(gnet_stats_copy_basic_hw); @@ -282,16 +337,15 @@ gnet_stats_copy_rate_est(struct gnet_dump *d, } EXPORT_SYMBOL(gnet_stats_copy_rate_est); -static void -__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *q) +static void gnet_stats_add_queue_cpu(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *q) { int i; for_each_possible_cpu(i) { const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); - qstats->qlen = 0; + qstats->qlen += qcpu->backlog; qstats->backlog += qcpu->backlog; qstats->drops += qcpu->drops; qstats->requeues += qcpu->requeues; @@ -299,24 +353,21 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, } } -void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *cpu, - const struct gnet_stats_queue *q, - __u32 qlen) +void gnet_stats_add_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu, + const struct gnet_stats_queue *q) { if (cpu) { - __gnet_stats_copy_queue_cpu(qstats, cpu); + gnet_stats_add_queue_cpu(qstats, cpu); } else { - qstats->qlen = q->qlen; - qstats->backlog = q->backlog; - qstats->drops = q->drops; - qstats->requeues = q->requeues; - qstats->overlimits = q->overlimits; + qstats->qlen += q->qlen; + qstats->backlog += q->backlog; + qstats->drops += q->drops; + qstats->requeues += q->requeues; + qstats->overlimits += q->overlimits; } - - qstats->qlen = qlen; } -EXPORT_SYMBOL(__gnet_stats_copy_queue); +EXPORT_SYMBOL(gnet_stats_add_queue); /** * gnet_stats_copy_queue - copy queue statistics into statistics TLV @@ -339,7 +390,8 @@ gnet_stats_copy_queue(struct gnet_dump *d, { struct gnet_stats_queue qstats = {0}; - __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen); + gnet_stats_add_queue(&qstats, cpu_q, q); + qstats.qlen = qlen; if (d->compat_tc_stats) { d->tc_stats.drops = qstats.drops; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 8ec7d13d2860..2820aca2173a 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -23,6 +23,9 @@ #include <net/ip6_fib.h> #include <net/rtnh.h> +DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled); +EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled); + #ifdef CONFIG_MODULES static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) @@ -43,6 +46,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "SEG6LOCAL"; case LWTUNNEL_ENCAP_RPL: return "RPL"; + case LWTUNNEL_ENCAP_IOAM6: + return "IOAM6"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 53e85c70c6e5..47931c8be04b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -122,6 +122,8 @@ static void neigh_mark_dead(struct neighbour *n) list_del_init(&n->gc_list); atomic_dec(&n->tbl->gc_entries); } + if (!list_empty(&n->managed_list)) + list_del_init(&n->managed_list); } static void neigh_update_gc_list(struct neighbour *n) @@ -130,7 +132,6 @@ static void neigh_update_gc_list(struct neighbour *n) write_lock_bh(&n->tbl->lock); write_lock(&n->lock); - if (n->dead) goto out; @@ -149,32 +150,59 @@ static void neigh_update_gc_list(struct neighbour *n) list_add_tail(&n->gc_list, &n->tbl->gc_list); atomic_inc(&n->tbl->gc_entries); } +out: + write_unlock(&n->lock); + write_unlock_bh(&n->tbl->lock); +} +static void neigh_update_managed_list(struct neighbour *n) +{ + bool on_managed_list, add_to_managed; + + write_lock_bh(&n->tbl->lock); + write_lock(&n->lock); + if (n->dead) + goto out; + + add_to_managed = n->flags & NTF_MANAGED; + on_managed_list = !list_empty(&n->managed_list); + + if (!add_to_managed && on_managed_list) + list_del_init(&n->managed_list); + else if (add_to_managed && !on_managed_list) + list_add_tail(&n->managed_list, &n->tbl->managed_list); out: write_unlock(&n->lock); write_unlock_bh(&n->tbl->lock); } -static bool neigh_update_ext_learned(struct neighbour *neigh, u32 flags, - int *notify) +static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, + bool *gc_update, bool *managed_update) { - bool rc = false; - u8 ndm_flags; + u32 ndm_flags, old_flags = neigh->flags; if (!(flags & NEIGH_UPDATE_F_ADMIN)) - return rc; + return; - ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; - if ((neigh->flags ^ ndm_flags) & NTF_EXT_LEARNED) { + ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; + ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0; + + if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) { if (ndm_flags & NTF_EXT_LEARNED) neigh->flags |= NTF_EXT_LEARNED; else neigh->flags &= ~NTF_EXT_LEARNED; - rc = true; *notify = 1; + *gc_update = true; + } + if ((old_flags ^ ndm_flags) & NTF_MANAGED) { + if (ndm_flags & NTF_MANAGED) + neigh->flags |= NTF_MANAGED; + else + neigh->flags &= ~NTF_MANAGED; + *notify = 1; + *managed_update = true; } - - return rc; } static bool neigh_del(struct neighbour *n, struct neighbour __rcu **np, @@ -379,7 +407,7 @@ EXPORT_SYMBOL(neigh_ifdown); static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev, - bool exempt_from_gc) + u32 flags, bool exempt_from_gc) { struct neighbour *n = NULL; unsigned long now = jiffies; @@ -412,6 +440,7 @@ do_alloc: n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; + n->flags = flags; seqlock_init(&n->hh.hh_lock); n->parms = neigh_parms_clone(&tbl->parms); timer_setup(&n->timer, neigh_timer_handler, 0); @@ -421,6 +450,7 @@ do_alloc: refcount_set(&n->refcnt, 1); n->dead = 1; INIT_LIST_HEAD(&n->gc_list); + INIT_LIST_HEAD(&n->managed_list); atomic_inc(&tbl->entries); out: @@ -575,19 +605,18 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, } EXPORT_SYMBOL(neigh_lookup_nodev); -static struct neighbour *___neigh_create(struct neigh_table *tbl, - const void *pkey, - struct net_device *dev, - bool exempt_from_gc, bool want_ref) +static struct neighbour * +___neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, u32 flags, + bool exempt_from_gc, bool want_ref) { - struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev, exempt_from_gc); - u32 hash_val; - unsigned int key_len = tbl->key_len; - int error; + u32 hash_val, key_len = tbl->key_len; + struct neighbour *n1, *rc, *n; struct neigh_hash_table *nht; + int error; + n = neigh_alloc(tbl, dev, flags, exempt_from_gc); trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc); - if (!n) { rc = ERR_PTR(-ENOBUFS); goto out; @@ -650,7 +679,8 @@ static struct neighbour *___neigh_create(struct neigh_table *tbl, n->dead = 0; if (!exempt_from_gc) list_add_tail(&n->gc_list, &n->tbl->gc_list); - + if (n->flags & NTF_MANAGED) + list_add_tail(&n->managed_list, &n->tbl->managed_list); if (want_ref) neigh_hold(n); rcu_assign_pointer(n->next, @@ -674,7 +704,7 @@ out_neigh_release: struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref) { - return ___neigh_create(tbl, pkey, dev, false, want_ref); + return ___neigh_create(tbl, pkey, dev, 0, false, want_ref); } EXPORT_SYMBOL(__neigh_create); @@ -741,12 +771,10 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, write_pnet(&n->net, net); memcpy(n->key, pkey, key_len); n->dev = dev; - if (dev) - dev_hold(dev); + dev_hold(dev); if (tbl->pconstructor && tbl->pconstructor(n)) { - if (dev) - dev_put(dev); + dev_put(dev); kfree(n); n = NULL; goto out; @@ -778,8 +806,7 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, write_unlock_bh(&tbl->lock); if (tbl->pdestructor) tbl->pdestructor(n); - if (n->dev) - dev_put(n->dev); + dev_put(n->dev); kfree(n); return 0; } @@ -812,8 +839,7 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, n->next = NULL; if (tbl->pdestructor) tbl->pdestructor(n); - if (n->dev) - dev_put(n->dev); + dev_put(n->dev); kfree(n); } return -ENOENT; @@ -1209,8 +1235,6 @@ static void neigh_update_hhs(struct neighbour *neigh) } } - - /* Generic update routine. -- lladdr is new lladdr or NULL, if it is not supplied. -- new is new state. @@ -1221,7 +1245,8 @@ static void neigh_update_hhs(struct neighbour *neigh) lladdr instead of overriding it if it is different. NEIGH_UPDATE_F_ADMIN means that the change is administrative. - + NEIGH_UPDATE_F_USE means that the entry is user triggered. + NEIGH_UPDATE_F_MANAGED means that the entry will be auto-refreshed. NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as @@ -1229,17 +1254,15 @@ static void neigh_update_hhs(struct neighbour *neigh) Caller MUST hold reference count on the entry. */ - static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid, struct netlink_ext_ack *extack) { - bool ext_learn_change = false; - u8 old; - int err; - int notify = 0; - struct net_device *dev; + bool gc_update = false, managed_update = false; int update_isrouter = 0; + struct net_device *dev; + int err, notify = 0; + u8 old; trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid); @@ -1258,7 +1281,13 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, (old & (NUD_NOARP | NUD_PERMANENT))) goto out; - ext_learn_change = neigh_update_ext_learned(neigh, flags, ¬ify); + neigh_update_flags(neigh, flags, ¬ify, &gc_update, &managed_update); + if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) { + new = old & ~NUD_PERMANENT; + neigh->nud_state = new; + err = 0; + goto out; + } if (!(new & NUD_VALID)) { neigh_del_timer(neigh); @@ -1403,15 +1432,13 @@ out: if (update_isrouter) neigh_update_is_router(neigh, flags, ¬ify); write_unlock_bh(&neigh->lock); - - if (((new ^ old) & NUD_PERMANENT) || ext_learn_change) + if (((new ^ old) & NUD_PERMANENT) || gc_update) neigh_update_gc_list(neigh); - + if (managed_update) + neigh_update_managed_list(neigh); if (notify) neigh_update_notify(neigh, nlmsg_pid); - trace_neigh_update_done(neigh, err); - return err; } @@ -1537,6 +1564,20 @@ int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb) } EXPORT_SYMBOL(neigh_direct_output); +static void neigh_managed_work(struct work_struct *work) +{ + struct neigh_table *tbl = container_of(work, struct neigh_table, + managed_work.work); + struct neighbour *neigh; + + write_lock_bh(&tbl->lock); + list_for_each_entry(neigh, &tbl->managed_list, managed_list) + neigh_event_send(neigh, NULL); + queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, + NEIGH_VAR(&tbl->parms, DELAY_PROBE_TIME)); + write_unlock_bh(&tbl->lock); +} + static void neigh_proxy_process(struct timer_list *t) { struct neigh_table *tbl = from_timer(tbl, t, proxy_timer); @@ -1662,8 +1703,7 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) list_del(&parms->list); parms->dead = 1; write_unlock_bh(&tbl->lock); - if (parms->dev) - dev_put(parms->dev); + dev_put(parms->dev); call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } EXPORT_SYMBOL(neigh_parms_release); @@ -1684,6 +1724,8 @@ void neigh_table_init(int index, struct neigh_table *tbl) INIT_LIST_HEAD(&tbl->parms_list); INIT_LIST_HEAD(&tbl->gc_list); + INIT_LIST_HEAD(&tbl->managed_list); + list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); @@ -1715,9 +1757,13 @@ void neigh_table_init(int index, struct neigh_table *tbl) WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); rwlock_init(&tbl->lock); + INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, tbl->parms.reachable_time); + INIT_DEFERRABLE_WORK(&tbl->managed_work, neigh_managed_work); + queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, 0); + timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0); skb_queue_head_init_class(&tbl->proxy_queue, &neigh_table_proxy_queue_class); @@ -1788,6 +1834,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, [NDA_NH_ID] = { .type = NLA_U32 }, + [NDA_FLAGS_EXT] = NLA_POLICY_MASK(NLA_U32, NTF_EXT_MASK), [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED }, }; @@ -1860,7 +1907,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_OVERRIDE_ISROUTER; + NEIGH_UPDATE_F_OVERRIDE_ISROUTER; struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; @@ -1869,6 +1916,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct neighbour *neigh; void *dst, *lladdr; u8 protocol = 0; + u32 ndm_flags; int err; ASSERT_RTNL(); @@ -1884,6 +1932,15 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, } ndm = nlmsg_data(nlh); + ndm_flags = ndm->ndm_flags; + if (tb[NDA_FLAGS_EXT]) { + u32 ext = nla_get_u32(tb[NDA_FLAGS_EXT]); + + BUILD_BUG_ON(sizeof(neigh->flags) * BITS_PER_BYTE < + (sizeof(ndm->ndm_flags) * BITS_PER_BYTE + + hweight32(NTF_EXT_MASK))); + ndm_flags |= (ext << NTF_EXT_SHIFT); + } if (ndm->ndm_ifindex) { dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { @@ -1911,14 +1968,18 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); - - if (ndm->ndm_flags & NTF_PROXY) { + if (ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; + if (ndm_flags & NTF_MANAGED) { + NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); + goto out; + } + err = -ENOBUFS; pn = pneigh_lookup(tbl, net, dst, dev, 1); if (pn) { - pn->flags = ndm->ndm_flags; + pn->flags = ndm_flags; if (protocol) pn->protocol = protocol; err = 0; @@ -1938,16 +1999,24 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { - bool exempt_from_gc; + bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT; + bool exempt_from_gc = ndm_permanent || + ndm_flags & NTF_EXT_LEARNED; if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; goto out; } + if (ndm_permanent && (ndm_flags & NTF_MANAGED)) { + NL_SET_ERR_MSG(extack, "Invalid NTF_* flag for permanent entry"); + err = -EINVAL; + goto out; + } - exempt_from_gc = ndm->ndm_state & NUD_PERMANENT || - ndm->ndm_flags & NTF_EXT_LEARNED; - neigh = ___neigh_create(tbl, dst, dev, exempt_from_gc, true); + neigh = ___neigh_create(tbl, dst, dev, + ndm_flags & + (NTF_EXT_LEARNED | NTF_MANAGED), + exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); goto out; @@ -1966,22 +2035,22 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (protocol) neigh->protocol = protocol; - - if (ndm->ndm_flags & NTF_EXT_LEARNED) + if (ndm_flags & NTF_EXT_LEARNED) flags |= NEIGH_UPDATE_F_EXT_LEARNED; - - if (ndm->ndm_flags & NTF_ROUTER) + if (ndm_flags & NTF_ROUTER) flags |= NEIGH_UPDATE_F_ISROUTER; + if (ndm_flags & NTF_MANAGED) + flags |= NEIGH_UPDATE_F_MANAGED; + if (ndm_flags & NTF_USE) + flags |= NEIGH_UPDATE_F_USE; - if (ndm->ndm_flags & NTF_USE) { + err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, + NETLINK_CB(skb).portid, extack); + if (!err && ndm_flags & (NTF_USE | NTF_MANAGED)) { neigh_event_send(neigh, NULL); err = 0; - } else - err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, - NETLINK_CB(skb).portid, extack); - + } neigh_release(neigh); - out: return err; } @@ -2432,6 +2501,7 @@ out: static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, u32 pid, u32 seq, int type, unsigned int flags) { + u32 neigh_flags, neigh_flags_ext; unsigned long now = jiffies; struct nda_cacheinfo ci; struct nlmsghdr *nlh; @@ -2441,11 +2511,14 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, if (nlh == NULL) return -EMSGSIZE; + neigh_flags_ext = neigh->flags >> NTF_EXT_SHIFT; + neigh_flags = neigh->flags & NTF_OLD_MASK; + ndm = nlmsg_data(nlh); ndm->ndm_family = neigh->ops->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; - ndm->ndm_flags = neigh->flags; + ndm->ndm_flags = neigh_flags; ndm->ndm_type = neigh->type; ndm->ndm_ifindex = neigh->dev->ifindex; @@ -2476,6 +2549,8 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol)) goto nla_put_failure; + if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) + goto nla_put_failure; nlmsg_end(skb, nlh); return 0; @@ -2489,6 +2564,7 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 pid, u32 seq, int type, unsigned int flags, struct neigh_table *tbl) { + u32 neigh_flags, neigh_flags_ext; struct nlmsghdr *nlh; struct ndmsg *ndm; @@ -2496,11 +2572,14 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (nlh == NULL) return -EMSGSIZE; + neigh_flags_ext = pn->flags >> NTF_EXT_SHIFT; + neigh_flags = pn->flags & NTF_OLD_MASK; + ndm = nlmsg_data(nlh); ndm->ndm_family = tbl->family; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; - ndm->ndm_flags = pn->flags | NTF_PROXY; + ndm->ndm_flags = neigh_flags | NTF_PROXY; ndm->ndm_type = RTN_UNICAST; ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0; ndm->ndm_state = NUD_NONE; @@ -2510,6 +2589,8 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, if (pn->protocol && nla_put_u8(skb, NDA_PROTOCOL, pn->protocol)) goto nla_put_failure; + if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext)) + goto nla_put_failure; nlmsg_end(skb, nlh); return 0; @@ -2533,6 +2614,13 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx) return false; master = dev ? netdev_master_upper_dev_get(dev) : NULL; + + /* 0 is already used to denote NDA_MASTER wasn't passed, therefore need another + * invalid value for ifindex to denote "no master". + */ + if (master_idx == -1) + return !!master; + if (!master || master->ifindex != master_idx) return true; @@ -2818,6 +2906,7 @@ static inline size_t neigh_nlmsg_size(void) + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */ + nla_total_size(sizeof(struct nda_cacheinfo)) + nla_total_size(4) /* NDA_PROBES */ + + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } @@ -2846,6 +2935,7 @@ static inline size_t pneigh_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */ + + nla_total_size(4) /* NDA_FLAGS_EXT */ + nla_total_size(1); /* NDA_PROTOCOL */ } @@ -3315,12 +3405,13 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v) struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); + seq_puts(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); return 0; } - seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " - "%08lx %08lx %08lx %08lx %08lx %08lx\n", + seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " + "%08lx %08lx %08lx " + "%08lx %08lx %08lx\n", atomic_read(&tbl->entries), st->allocs, diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f6197774048b..9c01c642cf9e 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -175,6 +175,14 @@ static int change_carrier(struct net_device *dev, unsigned long new_carrier) static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { + struct net_device *netdev = to_net_dev(dev); + + /* The check is also done in change_carrier; this helps returning early + * without hitting the trylock/restart in netdev_store. + */ + if (!netdev->netdev_ops->ndo_change_carrier) + return -EOPNOTSUPP; + return netdev_store(dev, attr, buf, len, change_carrier); } @@ -196,6 +204,12 @@ static ssize_t speed_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); int ret = -EINVAL; + /* The check is also done in __ethtool_get_link_ksettings; this helps + * returning early without hitting the trylock/restart below. + */ + if (!netdev->ethtool_ops->get_link_ksettings) + return ret; + if (!rtnl_trylock()) return restart_syscall(); @@ -216,6 +230,12 @@ static ssize_t duplex_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); int ret = -EINVAL; + /* The check is also done in __ethtool_get_link_ksettings; this helps + * returning early without hitting the trylock/restart below. + */ + if (!netdev->ethtool_ops->get_link_ksettings) + return ret; + if (!rtnl_trylock()) return restart_syscall(); @@ -468,6 +488,14 @@ static ssize_t proto_down_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { + struct net_device *netdev = to_net_dev(dev); + + /* The check is also done in change_proto_down; this helps returning + * early without hitting the trylock/restart in netdev_store. + */ + if (!netdev->netdev_ops->ndo_change_proto_down) + return -EOPNOTSUPP; + return netdev_store(dev, attr, buf, len, change_proto_down); } NETDEVICE_SHOW_RW(proto_down, fmt_dec); @@ -478,6 +506,12 @@ static ssize_t phys_port_id_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; + /* The check is also done in dev_get_phys_port_id; this helps returning + * early without hitting the trylock/restart below. + */ + if (!netdev->netdev_ops->ndo_get_phys_port_id) + return -EOPNOTSUPP; + if (!rtnl_trylock()) return restart_syscall(); @@ -500,6 +534,13 @@ static ssize_t phys_port_name_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; + /* The checks are also done in dev_get_phys_port_name; this helps + * returning early without hitting the trylock/restart below. + */ + if (!netdev->netdev_ops->ndo_get_phys_port_name && + !netdev->netdev_ops->ndo_get_devlink_port) + return -EOPNOTSUPP; + if (!rtnl_trylock()) return restart_syscall(); @@ -522,6 +563,14 @@ static ssize_t phys_switch_id_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; + /* The checks are also done in dev_get_phys_port_name; this helps + * returning early without hitting the trylock/restart below. This works + * because recurse is false when calling dev_get_port_parent_id. + */ + if (!netdev->netdev_ops->ndo_get_port_parent_id && + !netdev->netdev_ops->ndo_get_devlink_port) + return -EOPNOTSUPP; + if (!rtnl_trylock()) return restart_syscall(); @@ -1226,6 +1275,12 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue, if (!capable(CAP_NET_ADMIN)) return -EPERM; + /* The check is also done later; this helps returning early without + * hitting the trylock/restart below. + */ + if (!dev->netdev_ops->ndo_set_tx_maxrate) + return -EOPNOTSUPP; + err = kstrtou32(buf, 10, &rate); if (err < 0) return err; @@ -1869,7 +1924,7 @@ static struct class net_class __ro_after_init = { .get_ownership = net_get_ownership, }; -#ifdef CONFIG_OF_NET +#ifdef CONFIG_OF static int of_dev_node_match(struct device *dev, const void *data) { for (; dev; dev = dev->parent) { @@ -1973,9 +2028,9 @@ int netdev_register_kobject(struct net_device *ndev) int netdev_change_owner(struct net_device *ndev, const struct net *net_old, const struct net *net_new) { + kuid_t old_uid = GLOBAL_ROOT_UID, new_uid = GLOBAL_ROOT_UID; + kgid_t old_gid = GLOBAL_ROOT_GID, new_gid = GLOBAL_ROOT_GID; struct device *dev = &ndev->dev; - kuid_t old_uid, new_uid; - kgid_t old_gid, new_gid; int error; net_ns_get_ownership(net_old, &old_uid, &old_gid); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 9b5a767eddd5..202fa5eacd0f 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -98,7 +98,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data) } ng = net_alloc_generic(); - if (ng == NULL) + if (!ng) return -ENOMEM; /* @@ -148,13 +148,6 @@ out: return err; } -static void ops_free(const struct pernet_operations *ops, struct net *net) -{ - if (ops->id && ops->size) { - kfree(net_generic(net, *ops->id)); - } -} - static void ops_pre_exit_list(const struct pernet_operations *ops, struct list_head *net_exit_list) { @@ -184,7 +177,7 @@ static void ops_free_list(const struct pernet_operations *ops, struct net *net; if (ops->size && ops->id) { list_for_each_entry(net, net_exit_list, exit_list) - ops_free(ops, net); + kfree(net_generic(net, *ops->id)); } } @@ -433,15 +426,18 @@ out_free: static void net_free(struct net *net) { - kfree(rcu_access_pointer(net->gen)); - kmem_cache_free(net_cachep, net); + if (refcount_dec_and_test(&net->passive)) { + kfree(rcu_access_pointer(net->gen)); + kmem_cache_free(net_cachep, net); + } } void net_drop_ns(void *p) { - struct net *ns = p; - if (ns && refcount_dec_and_test(&ns->passive)) - net_free(ns); + struct net *net = (struct net *)p; + + if (net) + net_free(net); } struct net *copy_net_ns(unsigned long flags, @@ -477,9 +473,11 @@ struct net *copy_net_ns(unsigned long flags, if (rv < 0) { put_userns: +#ifdef CONFIG_KEYS key_remove_domain(net->key_domain); +#endif put_user_ns(user_ns); - net_drop_ns(net); + net_free(net); dec_ucounts: dec_net_namespaces(ucounts); return ERR_PTR(rv); @@ -609,9 +607,11 @@ static void cleanup_net(struct work_struct *work) list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); dec_net_namespaces(net->ucounts); +#ifdef CONFIG_KEYS key_remove_domain(net->key_domain); +#endif put_user_ns(net->user_ns); - net_drop_ns(net); + net_free(net); } } @@ -1120,6 +1120,14 @@ static int __init net_ns_init(void) pure_initcall(net_ns_init); +static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list) +{ + ops_pre_exit_list(ops, net_exit_list); + synchronize_rcu(); + ops_exit_list(ops, net_exit_list); + ops_free_list(ops, net_exit_list); +} + #ifdef CONFIG_NET_NS static int __register_pernet_operations(struct list_head *list, struct pernet_operations *ops) @@ -1145,10 +1153,7 @@ static int __register_pernet_operations(struct list_head *list, out_undo: /* If I have an error cleanup all namespaces I initialized */ list_del(&ops->list); - ops_pre_exit_list(ops, &net_exit_list); - synchronize_rcu(); - ops_exit_list(ops, &net_exit_list); - ops_free_list(ops, &net_exit_list); + free_exit_list(ops, &net_exit_list); return error; } @@ -1161,10 +1166,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) /* See comment in __register_pernet_operations() */ for_each_net(net) list_add_tail(&net->exit_list, &net_exit_list); - ops_pre_exit_list(ops, &net_exit_list); - synchronize_rcu(); - ops_exit_list(ops, &net_exit_list); - ops_free_list(ops, &net_exit_list); + + free_exit_list(ops, &net_exit_list); } #else @@ -1187,10 +1190,7 @@ static void __unregister_pernet_operations(struct pernet_operations *ops) } else { LIST_HEAD(net_exit_list); list_add(&init_net.exit_list, &net_exit_list); - ops_pre_exit_list(ops, &net_exit_list); - synchronize_rcu(); - ops_exit_list(ops, &net_exit_list); - ops_free_list(ops, &net_exit_list); + free_exit_list(ops, &net_exit_list); } } diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c index b49c57d35a88..1a6a86693b74 100644 --- a/net/core/netclassid_cgroup.c +++ b/net/core/netclassid_cgroup.c @@ -71,11 +71,8 @@ static int update_classid_sock(const void *v, struct file *file, unsigned n) struct update_classid_context *ctx = (void *)v; struct socket *sock = sock_from_file(file); - if (sock) { - spin_lock(&cgroup_sk_update_lock); + if (sock) sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid); - spin_unlock(&cgroup_sk_update_lock); - } if (--ctx->batch == 0) { ctx->batch = UPDATE_CLASSID_BATCH; return n + 1; @@ -121,8 +118,6 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, struct css_task_iter it; struct task_struct *p; - cgroup_sk_alloc_disable(); - cs->classid = (u32)value; css_task_iter_start(css, 0, &it); diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 99a431c56f23..8456dfbe2eb4 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -207,8 +207,6 @@ static ssize_t write_priomap(struct kernfs_open_file *of, if (!dev) return -ENODEV; - cgroup_sk_alloc_disable(); - rtnl_lock(); ret = netprio_set_prio(of_css(of), dev, prio); @@ -221,12 +219,10 @@ static ssize_t write_priomap(struct kernfs_open_file *of, static int update_netprio(const void *v, struct file *file, unsigned n) { struct socket *sock = sock_from_file(file); - if (sock) { - spin_lock(&cgroup_sk_update_lock); + + if (sock) sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, (unsigned long)v); - spin_unlock(&cgroup_sk_update_lock); - } return 0; } @@ -235,8 +231,6 @@ static void net_prio_attach(struct cgroup_taskset *tset) struct task_struct *p; struct cgroup_subsys_state *css; - cgroup_sk_alloc_disable(); - cgroup_taskset_for_each(p, css, tset) { void *v = (void *)(unsigned long)css->id; diff --git a/net/core/of_net.c b/net/core/of_net.c new file mode 100644 index 000000000000..f1a9bf7578e7 --- /dev/null +++ b/net/core/of_net.c @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * OF helpers for network devices. + * + * Initially copied out of arch/powerpc/kernel/prom_parse.c + */ +#include <linux/etherdevice.h> +#include <linux/kernel.h> +#include <linux/of_net.h> +#include <linux/of_platform.h> +#include <linux/phy.h> +#include <linux/export.h> +#include <linux/device.h> +#include <linux/nvmem-consumer.h> + +/** + * of_get_phy_mode - Get phy mode for given device_node + * @np: Pointer to the given device_node + * @interface: Pointer to the result + * + * The function gets phy interface string from property 'phy-mode' or + * 'phy-connection-type'. The index in phy_modes table is set in + * interface and 0 returned. In case of error interface is set to + * PHY_INTERFACE_MODE_NA and an errno is returned, e.g. -ENODEV. + */ +int of_get_phy_mode(struct device_node *np, phy_interface_t *interface) +{ + const char *pm; + int err, i; + + *interface = PHY_INTERFACE_MODE_NA; + + err = of_property_read_string(np, "phy-mode", &pm); + if (err < 0) + err = of_property_read_string(np, "phy-connection-type", &pm); + if (err < 0) + return err; + + for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++) + if (!strcasecmp(pm, phy_modes(i))) { + *interface = i; + return 0; + } + + return -ENODEV; +} +EXPORT_SYMBOL_GPL(of_get_phy_mode); + +static int of_get_mac_addr(struct device_node *np, const char *name, u8 *addr) +{ + struct property *pp = of_find_property(np, name, NULL); + + if (pp && pp->length == ETH_ALEN && is_valid_ether_addr(pp->value)) { + memcpy(addr, pp->value, ETH_ALEN); + return 0; + } + return -ENODEV; +} + +static int of_get_mac_addr_nvmem(struct device_node *np, u8 *addr) +{ + struct platform_device *pdev = of_find_device_by_node(np); + struct nvmem_cell *cell; + const void *mac; + size_t len; + int ret; + + /* Try lookup by device first, there might be a nvmem_cell_lookup + * associated with a given device. + */ + if (pdev) { + ret = nvmem_get_mac_address(&pdev->dev, addr); + put_device(&pdev->dev); + return ret; + } + + cell = of_nvmem_cell_get(np, "mac-address"); + if (IS_ERR(cell)) + return PTR_ERR(cell); + + mac = nvmem_cell_read(cell, &len); + nvmem_cell_put(cell); + + if (IS_ERR(mac)) + return PTR_ERR(mac); + + if (len != ETH_ALEN || !is_valid_ether_addr(mac)) { + kfree(mac); + return -EINVAL; + } + + memcpy(addr, mac, ETH_ALEN); + kfree(mac); + + return 0; +} + +/** + * of_get_mac_address() + * @np: Caller's Device Node + * @addr: Pointer to a six-byte array for the result + * + * Search the device tree for the best MAC address to use. 'mac-address' is + * checked first, because that is supposed to contain to "most recent" MAC + * address. If that isn't set, then 'local-mac-address' is checked next, + * because that is the default address. If that isn't set, then the obsolete + * 'address' is checked, just in case we're using an old device tree. If any + * of the above isn't set, then try to get MAC address from nvmem cell named + * 'mac-address'. + * + * Note that the 'address' property is supposed to contain a virtual address of + * the register set, but some DTS files have redefined that property to be the + * MAC address. + * + * All-zero MAC addresses are rejected, because those could be properties that + * exist in the device tree, but were not set by U-Boot. For example, the + * DTS could define 'mac-address' and 'local-mac-address', with zero MAC + * addresses. Some older U-Boots only initialized 'local-mac-address'. In + * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists + * but is all zeros. + * + * Return: 0 on success and errno in case of error. +*/ +int of_get_mac_address(struct device_node *np, u8 *addr) +{ + int ret; + + if (!np) + return -ENODEV; + + ret = of_get_mac_addr(np, "mac-address", addr); + if (!ret) + return 0; + + ret = of_get_mac_addr(np, "local-mac-address", addr); + if (!ret) + return 0; + + ret = of_get_mac_addr(np, "address", addr); + if (!ret) + return 0; + + return of_get_mac_addr_nvmem(np, addr); +} +EXPORT_SYMBOL(of_get_mac_address); + +/** + * of_get_ethdev_address() + * @np: Caller's Device Node + * @dev: Pointer to netdevice which address will be updated + * + * Search the device tree for the best MAC address to use. + * If found set @dev->dev_addr to that address. + * + * See documentation of of_get_mac_address() for more information on how + * the best address is determined. + * + * Return: 0 on success and errno in case of error. + */ +int of_get_ethdev_address(struct device_node *np, struct net_device *dev) +{ + u8 addr[ETH_ALEN]; + int ret; + + ret = of_get_mac_address(np, addr); + if (!ret) + eth_hw_addr_set(dev, addr); + return ret; +} +EXPORT_SYMBOL(of_get_ethdev_address); diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 8ab7b402244c..9b60e4301a44 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -24,6 +24,8 @@ #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) +#define BIAS_MAX LONG_MAX + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params) { @@ -47,6 +49,12 @@ static int page_pool_init(struct page_pool *pool, * which is the XDP_TX use-case. */ if (pool->p.flags & PP_FLAG_DMA_MAP) { + /* DMA-mapping is not supported on 32-bit systems with + * 64-bit DMA mapping. + */ + if (sizeof(dma_addr_t) > sizeof(unsigned long)) + return -EOPNOTSUPP; + if ((pool->p.dma_dir != DMA_FROM_DEVICE) && (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; @@ -206,6 +214,19 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) return true; } +static void page_pool_set_pp_info(struct page_pool *pool, + struct page *page) +{ + page->pp = pool; + page->pp_magic |= PP_SIGNATURE; +} + +static void page_pool_clear_pp_info(struct page *page) +{ + page->pp_magic = 0; + page->pp = NULL; +} + static struct page *__page_pool_alloc_page_order(struct page_pool *pool, gfp_t gfp) { @@ -222,7 +243,7 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, return NULL; } - page->pp_magic |= PP_SIGNATURE; + page_pool_set_pp_info(pool, page); /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -266,7 +287,8 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, put_page(page); continue; } - page->pp_magic |= PP_SIGNATURE; + + page_pool_set_pp_info(pool, page); pool->alloc.cache[pool->alloc.count++] = page; /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; @@ -345,12 +367,12 @@ void page_pool_release_page(struct page_pool *pool, struct page *page) DMA_ATTR_SKIP_CPU_SYNC); page_pool_set_dma_addr(page, 0); skip_dma_unmap: - page->pp_magic = 0; + page_pool_clear_pp_info(page); /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. */ - count = atomic_inc_return(&pool->pages_state_release_cnt); + count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, page, count); } EXPORT_SYMBOL(page_pool_release_page); @@ -405,6 +427,11 @@ static __always_inline struct page * __page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { + /* It is not the last user for the page frag case */ + if (pool->p.flags & PP_FLAG_PAGE_FRAG && + page_pool_atomic_sub_frag_count_return(page, 1)) + return NULL; + /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the * regular page allocator APIs. @@ -497,6 +524,84 @@ void page_pool_put_page_bulk(struct page_pool *pool, void **data, } EXPORT_SYMBOL(page_pool_put_page_bulk); +static struct page *page_pool_drain_frag(struct page_pool *pool, + struct page *page) +{ + long drain_count = BIAS_MAX - pool->frag_users; + + /* Some user is still using the page frag */ + if (likely(page_pool_atomic_sub_frag_count_return(page, + drain_count))) + return NULL; + + if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) { + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) + page_pool_dma_sync_for_device(pool, page, -1); + + return page; + } + + page_pool_return_page(pool, page); + return NULL; +} + +static void page_pool_free_frag(struct page_pool *pool) +{ + long drain_count = BIAS_MAX - pool->frag_users; + struct page *page = pool->frag_page; + + pool->frag_page = NULL; + + if (!page || + page_pool_atomic_sub_frag_count_return(page, drain_count)) + return; + + page_pool_return_page(pool, page); +} + +struct page *page_pool_alloc_frag(struct page_pool *pool, + unsigned int *offset, + unsigned int size, gfp_t gfp) +{ + unsigned int max_size = PAGE_SIZE << pool->p.order; + struct page *page = pool->frag_page; + + if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) || + size > max_size)) + return NULL; + + size = ALIGN(size, dma_get_cache_alignment()); + *offset = pool->frag_offset; + + if (page && *offset + size > max_size) { + page = page_pool_drain_frag(pool, page); + if (page) + goto frag_reset; + } + + if (!page) { + page = page_pool_alloc_pages(pool, gfp); + if (unlikely(!page)) { + pool->frag_page = NULL; + return NULL; + } + + pool->frag_page = page; + +frag_reset: + pool->frag_users = 1; + *offset = 0; + pool->frag_offset = size; + page_pool_set_frag_count(page, BIAS_MAX); + return page; + } + + pool->frag_users++; + pool->frag_offset = *offset + size; + return page; +} +EXPORT_SYMBOL(page_pool_alloc_frag); + static void page_pool_empty_ring(struct page_pool *pool) { struct page *page; @@ -602,6 +707,8 @@ void page_pool_destroy(struct page_pool *pool) if (!page_pool_put(pool)) return; + page_pool_free_frag(pool); + if (!page_pool_release(pool)) return; @@ -652,7 +759,6 @@ bool page_pool_return_skb_page(struct page *page) * The page will be returned to the pool here regardless of the * 'flipped' fragment being in use or not. */ - page->pp = NULL; page_pool_put_full_page(pp, page, false); return true; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 7e258d255e90..a3d74e2704c4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -175,6 +175,9 @@ #define IP_NAME_SZ 32 #define MAX_MPLS_LABELS 16 /* This is the max label stack depth */ #define MPLS_STACK_BOTTOM htonl(0x00000100) +/* Max number of internet mix entries that can be specified in imix_weights. */ +#define MAX_IMIX_ENTRIES 20 +#define IMIX_PRECISION 100 /* Precision of IMIX distribution */ #define func_enter() pr_debug("entering %s\n", __func__); @@ -242,6 +245,12 @@ static char *pkt_flag_names[] = { #define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4) #define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4) +struct imix_pkt { + u64 size; + u64 weight; + u64 count_so_far; +}; + struct flow_state { __be32 cur_daddr; int count; @@ -343,6 +352,12 @@ struct pktgen_dev { __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6 (see RFC 3260, sec. 4) */ + /* IMIX */ + unsigned int n_imix_entries; + struct imix_pkt imix_entries[MAX_IMIX_ENTRIES]; + /* Maps 0-IMIX_PRECISION range to imix_entry based on probability*/ + __u8 imix_distribution[IMIX_PRECISION]; + /* MPLS */ unsigned int nr_labels; /* Depth of stack, 0 = no MPLS */ __be32 labels[MAX_MPLS_LABELS]; @@ -471,6 +486,7 @@ static void pktgen_stop_all_threads(struct pktgen_net *pn); static void pktgen_stop(struct pktgen_thread *t); static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); +static void fill_imix_distribution(struct pktgen_dev *pkt_dev); /* Module parameters, defaults. */ static int pg_count_d __read_mostly = 1000; @@ -552,6 +568,16 @@ static int pktgen_if_show(struct seq_file *seq, void *v) (unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size, pkt_dev->max_pkt_size); + if (pkt_dev->n_imix_entries > 0) { + seq_puts(seq, " imix_weights: "); + for (i = 0; i < pkt_dev->n_imix_entries; i++) { + seq_printf(seq, "%llu,%llu ", + pkt_dev->imix_entries[i].size, + pkt_dev->imix_entries[i].weight); + } + seq_puts(seq, "\n"); + } + seq_printf(seq, " frags: %d delay: %llu clone_skb: %d ifname: %s\n", pkt_dev->nfrags, (unsigned long long) pkt_dev->delay, @@ -669,6 +695,18 @@ static int pktgen_if_show(struct seq_file *seq, void *v) (unsigned long long)pkt_dev->sofar, (unsigned long long)pkt_dev->errors); + if (pkt_dev->n_imix_entries > 0) { + int i; + + seq_puts(seq, " imix_size_counts: "); + for (i = 0; i < pkt_dev->n_imix_entries; i++) { + seq_printf(seq, "%llu,%llu ", + pkt_dev->imix_entries[i].size, + pkt_dev->imix_entries[i].count_so_far); + } + seq_puts(seq, "\n"); + } + seq_printf(seq, " started: %lluus stopped: %lluus idle: %lluus\n", (unsigned long long) ktime_to_us(pkt_dev->started_at), @@ -792,6 +830,62 @@ done_str: return i; } +/* Parses imix entries from user buffer. + * The user buffer should consist of imix entries separated by spaces + * where each entry consists of size and weight delimited by commas. + * "size1,weight_1 size2,weight_2 ... size_n,weight_n" for example. + */ +static ssize_t get_imix_entries(const char __user *buffer, + struct pktgen_dev *pkt_dev) +{ + const int max_digits = 10; + int i = 0; + long len; + char c; + + pkt_dev->n_imix_entries = 0; + + do { + unsigned long weight; + unsigned long size; + + len = num_arg(&buffer[i], max_digits, &size); + if (len < 0) + return len; + i += len; + if (get_user(c, &buffer[i])) + return -EFAULT; + /* Check for comma between size_i and weight_i */ + if (c != ',') + return -EINVAL; + i++; + + if (size < 14 + 20 + 8) + size = 14 + 20 + 8; + + len = num_arg(&buffer[i], max_digits, &weight); + if (len < 0) + return len; + if (weight <= 0) + return -EINVAL; + + pkt_dev->imix_entries[pkt_dev->n_imix_entries].size = size; + pkt_dev->imix_entries[pkt_dev->n_imix_entries].weight = weight; + + i += len; + if (get_user(c, &buffer[i])) + return -EFAULT; + + i++; + pkt_dev->n_imix_entries++; + + if (pkt_dev->n_imix_entries > MAX_IMIX_ENTRIES) + return -E2BIG; + } while (c == ' '); + + return i; +} + static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) { unsigned int n = 0; @@ -960,6 +1054,20 @@ static ssize_t pktgen_if_write(struct file *file, return count; } + if (!strcmp(name, "imix_weights")) { + if (pkt_dev->clone_skb > 0) + return -EINVAL; + + len = get_imix_entries(&user_buffer[i], pkt_dev); + if (len < 0) + return len; + + fill_imix_distribution(pkt_dev); + + i += len; + return count; + } + if (!strcmp(name, "debug")) { len = num_arg(&user_buffer[i], 10, &value); if (len < 0) @@ -1082,10 +1190,16 @@ static ssize_t pktgen_if_write(struct file *file, len = num_arg(&user_buffer[i], 10, &value); if (len < 0) return len; + /* clone_skb is not supported for netif_receive xmit_mode and + * IMIX mode. + */ if ((value > 0) && ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) || !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) return -ENOTSUPP; + if (value > 0 && pkt_dev->n_imix_entries > 0) + return -EINVAL; + i += len; pkt_dev->clone_skb = value; @@ -1190,11 +1304,6 @@ static ssize_t pktgen_if_write(struct file *file, * pktgen_xmit() is called */ pkt_dev->last_ok = 1; - - /* override clone_skb if user passed default value - * at module loading time - */ - pkt_dev->clone_skb = 0; } else if (strcmp(f, "queue_xmit") == 0) { pkt_dev->xmit_mode = M_QUEUE_XMIT; pkt_dev->last_ok = 1; @@ -2477,6 +2586,14 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) t = pkt_dev->min_pkt_size; } pkt_dev->cur_pkt_size = t; + } else if (pkt_dev->n_imix_entries > 0) { + struct imix_pkt *entry; + __u32 t = prandom_u32() % IMIX_PRECISION; + __u8 entry_index = pkt_dev->imix_distribution[t]; + + entry = &pkt_dev->imix_entries[entry_index]; + entry->count_so_far++; + pkt_dev->cur_pkt_size = entry->size; } set_cur_queue_map(pkt_dev); @@ -2484,6 +2601,32 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->flows[flow].count++; } +static void fill_imix_distribution(struct pktgen_dev *pkt_dev) +{ + int cumulative_probabilites[MAX_IMIX_ENTRIES]; + int j = 0; + __u64 cumulative_prob = 0; + __u64 total_weight = 0; + int i = 0; + + for (i = 0; i < pkt_dev->n_imix_entries; i++) + total_weight += pkt_dev->imix_entries[i].weight; + + /* Fill cumulative_probabilites with sum of normalized probabilities */ + for (i = 0; i < pkt_dev->n_imix_entries - 1; i++) { + cumulative_prob += div64_u64(pkt_dev->imix_entries[i].weight * + IMIX_PRECISION, + total_weight); + cumulative_probabilites[i] = cumulative_prob; + } + cumulative_probabilites[pkt_dev->n_imix_entries - 1] = 100; + + for (i = 0; i < IMIX_PRECISION; i++) { + if (i == cumulative_probabilites[j]) + j++; + pkt_dev->imix_distribution[i] = j; + } +} #ifdef CONFIG_XFRM static u32 pktgen_dst_metrics[RTAX_MAX + 1] = { @@ -3145,7 +3288,19 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) pps = div64_u64(pkt_dev->sofar * NSEC_PER_SEC, ktime_to_ns(elapsed)); - bps = pps * 8 * pkt_dev->cur_pkt_size; + if (pkt_dev->n_imix_entries > 0) { + int i; + struct imix_pkt *entry; + + bps = 0; + for (i = 0; i < pkt_dev->n_imix_entries; i++) { + entry = &pkt_dev->imix_entries[i]; + bps += entry->size * entry->count_so_far; + } + bps = div64_u64(bps * 8 * NSEC_PER_SEC, ktime_to_ns(elapsed)); + } else { + bps = pps * 8 * pkt_dev->cur_pkt_size; + } mbps = bps; do_div(mbps, 1000000); @@ -3447,7 +3602,6 @@ out: static int pktgen_thread_worker(void *arg) { - DEFINE_WAIT(wait); struct pktgen_thread *t = arg; struct pktgen_dev *pkt_dev = NULL; int cpu = t->cpu; diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c index e33fde06d528..dd4cf01d1e0a 100644 --- a/net/core/ptp_classifier.c +++ b/net/core/ptp_classifier.c @@ -103,7 +103,7 @@ static struct bpf_prog *ptp_insns __read_mostly; unsigned int ptp_classify_raw(const struct sk_buff *skb) { - return BPF_PROG_RUN(ptp_insns, skb); + return bpf_prog_run(ptp_insns, skb); } EXPORT_SYMBOL_GPL(ptp_classify_raw); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 662eb1c37f47..2af8aeeadadf 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -301,7 +301,7 @@ int rtnl_unregister(int protocol, int msgtype) } link = rtnl_dereference(tab[msgindex]); - rcu_assign_pointer(tab[msgindex], NULL); + RCU_INIT_POINTER(tab[msgindex], NULL); rtnl_unlock(); kfree_rcu(link, rcu); @@ -337,7 +337,7 @@ void rtnl_unregister_all(int protocol) if (!link) continue; - rcu_assign_pointer(tab[msgindex], NULL); + RCU_INIT_POINTER(tab[msgindex], NULL); kfree_rcu(link, rcu); } rtnl_unlock(); @@ -710,15 +710,8 @@ out: int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) { struct sock *rtnl = net->rtnl; - int err = 0; - NETLINK_CB(skb).dst_group = group; - if (echo) - refcount_inc(&skb->users); - netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL); - if (echo) - err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); - return err; + return nlmsg_notify(rtnl, skb, pid, group, echo, GFP_KERNEL); } int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) @@ -733,12 +726,8 @@ void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; - int report = 0; - if (nlh) - report = nlmsg_report(nlh); - - nlmsg_notify(rtnl, skb, pid, group, report, flags); + nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), flags); } EXPORT_SYMBOL(rtnl_notify); @@ -1970,6 +1959,13 @@ static bool link_master_filtered(struct net_device *dev, int master_idx) return false; master = netdev_master_upper_dev_get(dev); + + /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need + * another invalid value for ifindex to denote "no master". + */ + if (master_idx == -1) + return !!master; + if (!master || master->ifindex != master_idx) return true; @@ -2268,7 +2264,8 @@ invalid_attr: return -EINVAL; } -static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) +static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], + struct netlink_ext_ack *extack) { if (dev) { if (tb[IFLA_ADDRESS] && @@ -2295,7 +2292,7 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) return -EOPNOTSUPP; if (af_ops->validate_link_af) { - err = af_ops->validate_link_af(dev, af); + err = af_ops->validate_link_af(dev, af, extack); if (err < 0) return err; } @@ -2603,7 +2600,7 @@ static int do_setlink(const struct sk_buff *skb, const struct net_device_ops *ops = dev->netdev_ops; int err; - err = validate_linkmsg(dev, tb); + err = validate_linkmsg(dev, tb, extack); if (err < 0) return err; @@ -3207,8 +3204,8 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, dev->mtu = mtu; } if (tb[IFLA_ADDRESS]) { - memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]), - nla_len(tb[IFLA_ADDRESS])); + __dev_addr_set(dev, nla_data(tb[IFLA_ADDRESS]), + nla_len(tb[IFLA_ADDRESS])); dev->addr_assign_type = NET_ADDR_SET; } if (tb[IFLA_BROADCAST]) @@ -3302,7 +3299,7 @@ replay: m_ops = master_dev->rtnl_link_ops; } - err = validate_linkmsg(dev, tb); + err = validate_linkmsg(dev, tb, extack); if (err < 0) return err; @@ -3807,9 +3804,8 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; - size_t if_info_size; - skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), flags); + skb = nlmsg_new(if_nlmsg_size(dev, 0), flags); if (skb == NULL) goto errout; @@ -4387,7 +4383,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; if (br_dev != netdev_master_upper_dev_get(dev) && - !(dev->priv_flags & IFF_EBRIDGE)) + !netif_is_bridge_master(dev)) continue; cops = ops; } @@ -5265,7 +5261,7 @@ nla_put_failure: static size_t if_nlmsg_stats_size(const struct net_device *dev, u32 filter_mask) { - size_t size = 0; + size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg)); if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); diff --git a/net/core/scm.c b/net/core/scm.c index ae3085d9aae8..5c356f0dee30 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -79,7 +79,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) if (!fpl) { - fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); + fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_ACCOUNT); if (!fpl) return -ENOMEM; *fplp = fpl; @@ -355,7 +355,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) return NULL; new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (new_fpl) { for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); diff --git a/net/core/selftests.c b/net/core/selftests.c index ba7b0171974c..acb1ee97bbd3 100644 --- a/net/core/selftests.c +++ b/net/core/selftests.c @@ -15,8 +15,8 @@ #include <net/udp.h> struct net_packet_attrs { - unsigned char *src; - unsigned char *dst; + const unsigned char *src; + const unsigned char *dst; u32 ip_src; u32 ip_dst; bool tcp; @@ -173,8 +173,8 @@ static int net_test_loopback_validate(struct sk_buff *skb, struct net_device *orig_ndev) { struct net_test_priv *tpriv = pt->af_packet_priv; - unsigned char *src = tpriv->packet->src; - unsigned char *dst = tpriv->packet->dst; + const unsigned char *src = tpriv->packet->src; + const unsigned char *dst = tpriv->packet->dst; struct netsfhdr *shdr; struct ethhdr *ehdr; struct udphdr *uhdr; @@ -318,6 +318,15 @@ static int net_test_phy_loopback_udp(struct net_device *ndev) return __net_test_loopback(ndev, &attr); } +static int net_test_phy_loopback_udp_mtu(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + attr.max_size = ndev->mtu; + return __net_test_loopback(ndev, &attr); +} + static int net_test_phy_loopback_tcp(struct net_device *ndev) { struct net_packet_attrs attr = { }; @@ -345,6 +354,9 @@ static const struct net_test { .name = "PHY internal loopback, UDP ", .fn = net_test_phy_loopback_udp, }, { + .name = "PHY internal loopback, MTU ", + .fn = net_test_phy_loopback_udp_mtu, + }, { .name = "PHY internal loopback, TCP ", .fn = net_test_phy_loopback_tcp, }, { diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fc7942c0dddc..67a9188d8a49 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -70,6 +70,7 @@ #include <net/xfrm.h> #include <net/mpls.h> #include <net/mptcp.h> +#include <net/mctp.h> #include <net/page_pool.h> #include <linux/uaccess.h> @@ -80,6 +81,7 @@ #include <linux/indirect_call_wrapper.h> #include "datagram.h" +#include "sock_destructor.h" struct kmem_cache *skbuff_head_cache __ro_after_init; static struct kmem_cache *skbuff_fclone_cache __ro_after_init; @@ -134,34 +136,31 @@ struct napi_alloc_cache { static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); -static void *__alloc_frag_align(unsigned int fragsz, gfp_t gfp_mask, - unsigned int align_mask) +void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - return page_frag_alloc_align(&nc->page, fragsz, gfp_mask, align_mask); -} - -void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) -{ fragsz = SKB_DATA_ALIGN(fragsz); - return __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask); + return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); } EXPORT_SYMBOL(__napi_alloc_frag_align); void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { - struct page_frag_cache *nc; void *data; fragsz = SKB_DATA_ALIGN(fragsz); - if (in_irq() || irqs_disabled()) { - nc = this_cpu_ptr(&netdev_alloc_cache); + if (in_hardirq() || irqs_disabled()) { + struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache); + data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask); } else { + struct napi_alloc_cache *nc; + local_bh_disable(); - data = __alloc_frag_align(fragsz, GFP_ATOMIC, align_mask); + nc = this_cpu_ptr(&napi_alloc_cache); + data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask); local_bh_enable(); } return data; @@ -397,8 +396,9 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, { struct kmem_cache *cache; struct sk_buff *skb; - u8 *data; + unsigned int osize; bool pfmemalloc; + u8 *data; cache = (flags & SKB_ALLOC_FCLONE) ? skbuff_fclone_cache : skbuff_head_cache; @@ -430,7 +430,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, * Put skb_shared_info exactly at the end of allocated zone, * to allow max possible filling before reallocation. */ - size = SKB_WITH_OVERHEAD(ksize(data)); + osize = ksize(data); + size = SKB_WITH_OVERHEAD(osize); prefetchw(data + size); /* @@ -439,7 +440,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, * the tail pointer in struct sk_buff! */ memset(skb, 0, offsetof(struct sk_buff, tail)); - __build_skb_around(skb, data, 0); + __build_skb_around(skb, data, osize); skb->pfmemalloc = pfmemalloc; if (flags & SKB_ALLOC_FCLONE) { @@ -502,7 +503,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - if (in_irq() || irqs_disabled()) { + if (in_hardirq() || irqs_disabled()) { nc = this_cpu_ptr(&netdev_alloc_cache); data = page_frag_alloc(nc, len, gfp_mask); pfmemalloc = nc->pfmemalloc; @@ -724,7 +725,7 @@ void skb_release_head_state(struct sk_buff *skb) { skb_dst_drop(skb); if (skb->destructor) { - WARN_ON(in_irq()); + WARN_ON(in_hardirq()); skb->destructor(skb); } #if IS_ENABLED(CONFIG_NF_CONNTRACK) @@ -954,9 +955,13 @@ void __kfree_skb_defer(struct sk_buff *skb) void napi_skb_free_stolen_head(struct sk_buff *skb) { - nf_reset_ct(skb); - skb_dst_drop(skb); - skb_ext_put(skb); + if (unlikely(skb->slow_gro)) { + nf_reset_ct(skb); + skb_dst_drop(skb); + skb_ext_put(skb); + skb_orphan(skb); + skb->slow_gro = 0; + } napi_skb_cache_put(skb); } @@ -1786,6 +1791,57 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) EXPORT_SYMBOL(skb_realloc_headroom); /** + * skb_expand_head - reallocate header of &sk_buff + * @skb: buffer to reallocate + * @headroom: needed headroom + * + * Unlike skb_realloc_headroom, this one does not allocate a new skb + * if possible; copies skb->sk to new skb as needed + * and frees original skb in case of failures. + * + * It expect increased headroom and generates warning otherwise. + */ + +struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom) +{ + int delta = headroom - skb_headroom(skb); + int osize = skb_end_offset(skb); + struct sock *sk = skb->sk; + + if (WARN_ONCE(delta <= 0, + "%s is expecting an increase in the headroom", __func__)) + return skb; + + delta = SKB_DATA_ALIGN(delta); + /* pskb_expand_head() might crash, if skb is shared. */ + if (skb_shared(skb) || !is_skb_wmem(skb)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (unlikely(!nskb)) + goto fail; + + if (sk) + skb_set_owner_w(nskb, sk); + consume_skb(skb); + skb = nskb; + } + if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC)) + goto fail; + + if (sk && is_skb_wmem(skb)) { + delta = skb_end_offset(skb) - osize; + refcount_add(delta, &sk->sk_wmem_alloc); + skb->truesize += delta; + } + return skb; + +fail: + kfree_skb(skb); + return NULL; +} +EXPORT_SYMBOL(skb_expand_head); + +/** * skb_copy_expand - copy and expand sk_buff * @skb: buffer to copy * @newheadroom: new free bytes at head @@ -3838,7 +3894,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_push(nskb, -skb_network_offset(nskb) + offset); skb_release_head_state(nskb); - __copy_skb_header(nskb, skb); + __copy_skb_header(nskb, skb); skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); skb_copy_from_linear_data_offset(skb, -tnl_hlen, @@ -3889,6 +3945,9 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) NAPI_GRO_CB(p)->last = skb; NAPI_GRO_CB(p)->count++; p->data_len += skb->len; + + /* sk owenrship - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; p->truesize += skb->truesize; p->len += skb->len; @@ -4256,6 +4315,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) unsigned int headlen = skb_headlen(skb); unsigned int len = skb_gro_len(skb); unsigned int delta_truesize; + unsigned int new_truesize; struct sk_buff *lp; if (unlikely(p->len + len >= 65536 || NAPI_GRO_CB(skb)->flush)) @@ -4287,10 +4347,10 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) skb_frag_size_sub(frag, offset); /* all fragments truesize : remove (head size + sk_buff) */ - delta_truesize = skb->truesize - - SKB_TRUESIZE(skb_end_offset(skb)); + new_truesize = SKB_TRUESIZE(skb_end_offset(skb)); + delta_truesize = skb->truesize - new_truesize; - skb->truesize -= skb->data_len; + skb->truesize = new_truesize; skb->len -= skb->data_len; skb->data_len = 0; @@ -4319,12 +4379,16 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags); /* We dont need to clear skbinfo->nr_frags here */ - delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff)); + new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff)); + delta_truesize = skb->truesize - new_truesize; + skb->truesize = new_truesize; NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD; goto done; } merge: + /* sk owenrship - if any - completely transferred to the aggregated packet */ + skb->destructor = NULL; delta_truesize = skb->truesize; if (offset > headlen) { unsigned int eat = offset - headlen; @@ -4377,6 +4441,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_MPTCP) [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext), #endif +#if IS_ENABLED(CONFIG_MCTP_FLOWS) + [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -4394,6 +4461,9 @@ static __always_inline unsigned int skb_ext_total_length(void) #if IS_ENABLED(CONFIG_MPTCP) skb_ext_type_len[SKB_EXT_MPTCP] + #endif +#if IS_ENABLED(CONFIG_MCTP_FLOWS) + skb_ext_type_len[SKB_EXT_MCTP] + +#endif 0; } @@ -6449,6 +6519,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) new->chunks = newlen; new->offset[id] = newoff; set_active: + skb->slow_gro = 1; skb->extensions = new; skb->active_extensions |= 1 << id; return skb_ext_get_ptr(new, id); @@ -6465,6 +6536,14 @@ static void skb_ext_put_sp(struct sec_path *sp) } #endif +#ifdef CONFIG_MCTP_FLOWS +static void skb_ext_put_mctp(struct mctp_flow *flow) +{ + if (flow->key) + mctp_key_unref(flow->key); +} +#endif + void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id) { struct skb_ext *ext = skb->extensions; @@ -6500,6 +6579,10 @@ free_now: if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH)) skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH)); #endif +#ifdef CONFIG_MCTP_FLOWS + if (__skb_ext_exist(ext, SKB_EXT_MCTP)) + skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP)); +#endif kmem_cache_free(skbuff_ext_cache, ext); } diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 2d6249b28928..1ae52ac943f6 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -474,6 +474,20 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); +bool sk_msg_is_readable(struct sock *sk) +{ + struct sk_psock *psock; + bool empty = true; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) + empty = list_empty(&psock->ingress_msg); + rcu_read_unlock(); + return !empty; +} +EXPORT_SYMBOL_GPL(sk_msg_is_readable); + static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, struct sk_buff *skb) { @@ -494,6 +508,7 @@ static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, } static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, + u32 off, u32 len, struct sk_psock *psock, struct sock *sk, struct sk_msg *msg) @@ -507,11 +522,11 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, */ if (skb_linearize(skb)) return -EAGAIN; - num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len); + num_sge = skb_to_sgvec(skb, msg->sg.data, off, len); if (unlikely(num_sge < 0)) return num_sge; - copied = skb->len; + copied = len; msg->sg.start = 0; msg->sg.size = copied; msg->sg.end = num_sge; @@ -522,9 +537,11 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, return copied; } -static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb); +static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, + u32 off, u32 len); -static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) +static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, + u32 off, u32 len) { struct sock *sk = psock->sk; struct sk_msg *msg; @@ -535,7 +552,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) * correctly. */ if (unlikely(skb->sk == sk)) - return sk_psock_skb_ingress_self(psock, skb); + return sk_psock_skb_ingress_self(psock, skb, off, len); msg = sk_psock_create_ingress_msg(sk, skb); if (!msg) return -EAGAIN; @@ -547,7 +564,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) * into user buffers. */ skb_set_owner_r(skb, sk); - err = sk_psock_skb_ingress_enqueue(skb, psock, sk, msg); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); if (err < 0) kfree(msg); return err; @@ -557,7 +574,8 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb) * skb. In this case we do not need to check memory limits or skb_set_owner_r * because the skb is already accounted for here. */ -static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb) +static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, + u32 off, u32 len) { struct sk_msg *msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); struct sock *sk = psock->sk; @@ -567,7 +585,7 @@ static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb return -EAGAIN; sk_msg_init(msg); skb_set_owner_r(skb, sk); - err = sk_psock_skb_ingress_enqueue(skb, psock, sk, msg); + err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg); if (err < 0) kfree(msg); return err; @@ -581,7 +599,7 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, return -EAGAIN; return skb_send_sock(psock->sk, skb, off, len); } - return sk_psock_skb_ingress(psock, skb); + return sk_psock_skb_ingress(psock, skb, off, len); } static void sk_psock_skb_state(struct sk_psock *psock, @@ -624,6 +642,12 @@ static void sk_psock_backlog(struct work_struct *work) while ((skb = skb_dequeue(&psock->ingress_skb))) { len = skb->len; off = 0; + if (skb_bpf_strparser(skb)) { + struct strp_msg *stm = strp_msg(skb); + + off = stm->offset; + len = stm->full_len; + } start: ingress = skb_bpf_ingress(skb); skb_bpf_redirect_clear(skb); @@ -863,6 +887,7 @@ static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) * return code, but then didn't set a redirect interface. */ if (unlikely(!sk_other)) { + skb_bpf_redirect_clear(skb); sock_drop(from->sk, skb); return -EIO; } @@ -930,6 +955,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, { struct sock *sk_other; int err = 0; + u32 len, off; switch (verdict) { case __SK_PASS: @@ -937,6 +963,7 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, sk_other = psock->sk; if (sock_flag(sk_other, SOCK_DEAD) || !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + skb_bpf_redirect_clear(skb); goto out_free; } @@ -949,7 +976,15 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, * retrying later from workqueue. */ if (skb_queue_empty(&psock->ingress_skb)) { - err = sk_psock_skb_ingress_self(psock, skb); + len = skb->len; + off = 0; + if (skb_bpf_strparser(skb)) { + struct strp_msg *stm = strp_msg(skb); + + off = stm->offset; + len = stm->full_len; + } + err = sk_psock_skb_ingress_self(psock, skb, off, len); } if (err < 0) { spin_lock_bh(&psock->ingress_lock); @@ -1015,6 +1050,8 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) skb_dst_drop(skb); skb_bpf_redirect_clear(skb); ret = bpf_prog_run_pin_on_cpu(prog, skb); + if (ret == SK_PASS) + skb_bpf_set_strparser(skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } diff --git a/net/core/sock.c b/net/core/sock.c index a3eea6e0b30a..9862eefce21e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -226,6 +226,7 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ + x "AF_MCTP" , \ x "AF_MAX" static const char *const af_family_key_strings[AF_MAX+1] = { @@ -349,7 +350,7 @@ void sk_error_report(struct sock *sk) } EXPORT_SYMBOL(sk_error_report); -static int sock_get_timeout(long timeo, void *optval, bool old_timeval) +int sock_get_timeout(long timeo, void *optval, bool old_timeval) { struct __kernel_sock_timeval tv; @@ -378,12 +379,11 @@ static int sock_get_timeout(long timeo, void *optval, bool old_timeval) *(struct __kernel_sock_timeval *)optval = tv; return sizeof(tv); } +EXPORT_SYMBOL(sock_get_timeout); -static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, - bool old_timeval) +int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, + sockptr_t optval, int optlen, bool old_timeval) { - struct __kernel_sock_timeval tv; - if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { struct old_timeval32 tv32; @@ -392,8 +392,8 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) return -EFAULT; - tv.tv_sec = tv32.tv_sec; - tv.tv_usec = tv32.tv_usec; + tv->tv_sec = tv32.tv_sec; + tv->tv_usec = tv32.tv_usec; } else if (old_timeval) { struct __kernel_old_timeval old_tv; @@ -401,14 +401,28 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, return -EINVAL; if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) return -EFAULT; - tv.tv_sec = old_tv.tv_sec; - tv.tv_usec = old_tv.tv_usec; + tv->tv_sec = old_tv.tv_sec; + tv->tv_usec = old_tv.tv_usec; } else { - if (optlen < sizeof(tv)) + if (optlen < sizeof(*tv)) return -EINVAL; - if (copy_from_sockptr(&tv, optval, sizeof(tv))) + if (copy_from_sockptr(tv, optval, sizeof(*tv))) return -EFAULT; } + + return 0; +} +EXPORT_SYMBOL(sock_copy_user_timeval); + +static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, + bool old_timeval) +{ + struct __kernel_sock_timeval tv; + int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); + + if (err) + return err; + if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) return -EDOM; @@ -946,6 +960,53 @@ void sock_set_mark(struct sock *sk, u32 val) } EXPORT_SYMBOL(sock_set_mark); +static void sock_release_reserved_memory(struct sock *sk, int bytes) +{ + /* Round down bytes to multiple of pages */ + bytes &= ~(SK_MEM_QUANTUM - 1); + + WARN_ON(bytes > sk->sk_reserved_mem); + sk->sk_reserved_mem -= bytes; + sk_mem_reclaim(sk); +} + +static int sock_reserve_memory(struct sock *sk, int bytes) +{ + long allocated; + bool charged; + int pages; + + if (!mem_cgroup_sockets_enabled || !sk->sk_memcg) + return -EOPNOTSUPP; + + if (!bytes) + return 0; + + pages = sk_mem_pages(bytes); + + /* pre-charge to memcg */ + charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); + if (!charged) + return -ENOMEM; + + /* pre-charge to forward_alloc */ + allocated = sk_memory_allocated_add(sk, pages); + /* If the system goes into memory pressure with this + * precharge, give up and return error. + */ + if (allocated > sk_prot_mem_limits(sk, 1)) { + sk_memory_allocated_sub(sk, pages); + mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); + return -ENOMEM; + } + sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT; + + sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT; + + return 0; +} + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -1357,6 +1418,32 @@ set_sndbuf: ret = sock_bindtoindex_locked(sk, val); break; + case SO_BUF_LOCK: + if (val & ~SOCK_BUF_LOCK_MASK) { + ret = -EINVAL; + break; + } + sk->sk_userlocks = val | (sk->sk_userlocks & + ~SOCK_BUF_LOCK_MASK); + break; + + case SO_RESERVE_MEM: + { + int delta; + + if (val < 0) { + ret = -EINVAL; + break; + } + + delta = val - sk->sk_reserved_mem; + if (delta < 0) + sock_release_reserved_memory(sk, -delta); + else + ret = sock_reserve_memory(sk, delta); + break; + } + default: ret = -ENOPROTOOPT; break; @@ -1366,6 +1453,16 @@ set_sndbuf: } EXPORT_SYMBOL(sock_setsockopt); +static const struct cred *sk_get_peer_cred(struct sock *sk) +{ + const struct cred *cred; + + spin_lock(&sk->sk_peer_lock); + cred = get_cred(sk->sk_peer_cred); + spin_unlock(&sk->sk_peer_lock); + + return cred; +} static void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred) @@ -1542,7 +1639,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname, struct ucred peercred; if (len > sizeof(peercred)) len = sizeof(peercred); + + spin_lock(&sk->sk_peer_lock); cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); + spin_unlock(&sk->sk_peer_lock); + if (copy_to_user(optval, &peercred, len)) return -EFAULT; goto lenout; @@ -1550,20 +1651,23 @@ int sock_getsockopt(struct socket *sock, int level, int optname, case SO_PEERGROUPS: { + const struct cred *cred; int ret, n; - if (!sk->sk_peer_cred) + cred = sk_get_peer_cred(sk); + if (!cred) return -ENODATA; - n = sk->sk_peer_cred->group_info->ngroups; + n = cred->group_info->ngroups; if (len < n * sizeof(gid_t)) { len = n * sizeof(gid_t); + put_cred(cred); return put_user(len, optlen) ? -EFAULT : -ERANGE; } len = n * sizeof(gid_t); - ret = groups_to_user((gid_t __user *)optval, - sk->sk_peer_cred->group_info); + ret = groups_to_user((gid_t __user *)optval, cred->group_info); + put_cred(cred); if (ret) return ret; goto lenout; @@ -1719,6 +1823,14 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val64 = sock_net(sk)->net_cookie; break; + case SO_BUF_LOCK: + v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; + break; + + case SO_RESERVE_MEM: + v.val = sk->sk_reserved_mem; + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). @@ -1921,9 +2033,10 @@ static void __sk_destruct(struct rcu_head *head) sk->sk_frag.page = NULL; } - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ + put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); + if (likely(sk->sk_net_refcnt)) put_net(sock_net(sk)); sk_prot_free(sk->sk_prot_creator, sk); @@ -2031,6 +2144,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_dst_pending_confirm = 0; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; + newsk->sk_reserved_mem = 0; atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; @@ -2560,7 +2674,6 @@ static void sk_leave_memory_pressure(struct sock *sk) } } -#define SKB_FRAG_PAGE_ORDER get_order(32768) DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); /** @@ -2714,10 +2827,12 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { struct proto *prot = sk->sk_prot; long allocated = sk_memory_allocated_add(sk, amt); + bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg; bool charged = true; - if (mem_cgroup_sockets_enabled && sk->sk_memcg && - !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt))) + if (memcg_charge && + !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt, + gfp_memcg_charge()))) goto suppress_allocation; /* Under limit. */ @@ -2771,8 +2886,14 @@ suppress_allocation: /* Fail only if socket is _under_ its sndbuf. * In this case we cannot block, so that we have to fail. */ - if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) + if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { + /* Force charge with __GFP_NOFAIL */ + if (memcg_charge && !charged) { + mem_cgroup_charge_skmem(sk->sk_memcg, amt, + gfp_memcg_charge() | __GFP_NOFAIL); + } return 1; + } } if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) @@ -2780,7 +2901,7 @@ suppress_allocation: sk_memory_allocated_sub(sk, amt); - if (mem_cgroup_sockets_enabled && sk->sk_memcg) + if (memcg_charge && charged) mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); return 0; @@ -3124,6 +3245,8 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_peer_pid = NULL; sk->sk_peer_cred = NULL; + spin_lock_init(&sk->sk_peer_lock); + sk->sk_write_pending = 0; sk->sk_rcvlowat = 1; sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; @@ -3158,17 +3281,15 @@ EXPORT_SYMBOL(sock_init_data); void lock_sock_nested(struct sock *sk, int subclass) { + /* The sk_lock has mutex_lock() semantics here. */ + mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); + might_sleep(); spin_lock_bh(&sk->sk_lock.slock); if (sk->sk_lock.owned) __lock_sock(sk); sk->sk_lock.owned = 1; - spin_unlock(&sk->sk_lock.slock); - /* - * The sk_lock has mutex_lock() semantics here: - */ - mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); - local_bh_enable(); + spin_unlock_bh(&sk->sk_lock.slock); } EXPORT_SYMBOL(lock_sock_nested); @@ -3191,42 +3312,37 @@ void release_sock(struct sock *sk) } EXPORT_SYMBOL(release_sock); -/** - * lock_sock_fast - fast version of lock_sock - * @sk: socket - * - * This version should be used for very small section, where process wont block - * return false if fast path is taken: - * - * sk_lock.slock locked, owned = 0, BH disabled - * - * return true if slow path is taken: - * - * sk_lock.slock unlocked, owned = 1, BH enabled - */ -bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) +bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) { might_sleep(); spin_lock_bh(&sk->sk_lock.slock); - if (!sk->sk_lock.owned) + if (!sk->sk_lock.owned) { /* - * Note : We must disable BH + * Fast path return with bottom halves disabled and + * sock::sk_lock.slock held. + * + * The 'mutex' is not contended and holding + * sock::sk_lock.slock prevents all other lockers to + * proceed so the corresponding unlock_sock_fast() can + * avoid the slow path of release_sock() completely and + * just release slock. + * + * From a semantical POV this is equivalent to 'acquiring' + * the 'mutex', hence the corresponding lockdep + * mutex_release() has to happen in the fast path of + * unlock_sock_fast(). */ return false; + } __lock_sock(sk); sk->sk_lock.owned = 1; - spin_unlock(&sk->sk_lock.slock); - /* - * The sk_lock has mutex_lock() semantics here: - */ - mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_); __acquire(&sk->sk_lock.slock); - local_bh_enable(); + spin_unlock_bh(&sk->sk_lock.slock); return true; } -EXPORT_SYMBOL(lock_sock_fast); +EXPORT_SYMBOL(__lock_sock_fast); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32) diff --git a/net/core/sock_destructor.h b/net/core/sock_destructor.h new file mode 100644 index 000000000000..2f396e6bfba5 --- /dev/null +++ b/net/core/sock_destructor.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_CORE_SOCK_DESTRUCTOR_H +#define _NET_CORE_SOCK_DESTRUCTOR_H +#include <net/tcp.h> + +static inline bool is_skb_wmem(const struct sk_buff *skb) +{ + return skb->destructor == sock_wfree || + skb->destructor == __sock_wfree || + (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree); +} +#endif diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 60decd6420ca..e252b8ec2b85 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -211,8 +211,6 @@ out: return psock; } -static bool sock_map_redirect_allowed(const struct sock *sk); - static int sock_map_link(struct bpf_map *map, struct sock *sk) { struct sk_psock_progs *progs = sock_map_progs(map); @@ -223,13 +221,6 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) struct sk_psock *psock; int ret; - /* Only sockets we can redirect into/from in BPF need to hold - * refs to parser/verdict progs and have their sk_data_ready - * and sk_write_space callbacks overridden. - */ - if (!sock_map_redirect_allowed(sk)) - goto no_progs; - stream_verdict = READ_ONCE(progs->stream_verdict); if (stream_verdict) { stream_verdict = bpf_prog_inc_not_zero(stream_verdict); @@ -264,7 +255,6 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) } } -no_progs: psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); @@ -527,12 +517,6 @@ static bool sk_is_tcp(const struct sock *sk) sk->sk_protocol == IPPROTO_TCP; } -static bool sk_is_udp(const struct sock *sk) -{ - return sk->sk_type == SOCK_DGRAM && - sk->sk_protocol == IPPROTO_UDP; -} - static bool sock_map_redirect_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) @@ -550,10 +534,7 @@ static bool sock_map_sk_state_allowed(const struct sock *sk) { if (sk_is_tcp(sk)) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); - else if (sk_is_udp(sk)) - return sk_hashed(sk); - - return false; + return true; } static int sock_hash_update_common(struct bpf_map *map, void *key, @@ -1513,6 +1494,7 @@ void sock_map_unhash(struct sock *sk) rcu_read_unlock(); saved_unhash(sk); } +EXPORT_SYMBOL_GPL(sock_map_unhash); void sock_map_close(struct sock *sk, long timeout) { @@ -1536,6 +1518,7 @@ void sock_map_close(struct sock *sk, long timeout) release_sock(sk); saved_close(sk, timeout); } +EXPORT_SYMBOL_GPL(sock_map_close); static int sock_map_iter_attach_target(struct bpf_prog *prog, union bpf_iter_link_info *linfo, diff --git a/net/core/stream.c b/net/core/stream.c index 4f1d4aa5fb38..06b36c730ce8 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -195,14 +195,11 @@ void sk_stream_kill_queues(struct sock *sk) /* First the read buffer. */ __skb_queue_purge(&sk->sk_receive_queue); - /* Next, the error queue. */ - __skb_queue_purge(&sk->sk_error_queue); - /* Next, the write queue. */ WARN_ON(!skb_queue_empty(&sk->sk_write_queue)); /* Account for returned memory. */ - sk_mem_reclaim(sk); + sk_mem_reclaim_final(sk); WARN_ON(sk->sk_wmem_queued); WARN_ON(sk->sk_forward_alloc); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index c8496c1142c9..5f88526ad61c 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -419,7 +419,7 @@ static struct ctl_table net_core_table[] = { .mode = 0600, .proc_handler = proc_dolongvec_minmax_bpf_restricted, .extra1 = &long_one, - .extra2 = &long_max, + .extra2 = &bpf_jit_limit_max, }, #endif { diff --git a/net/core/xdp.c b/net/core/xdp.c index cc92ccb38432..5ddc29f29bad 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -143,8 +143,6 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) if (xdp_rxq->reg_state == REG_STATE_UNUSED) return; - WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); - xdp_rxq_info_unreg_mem_model(xdp_rxq); xdp_rxq->reg_state = REG_STATE_UNREGISTERED; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index c5c1d2b8045e..5183e627468d 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -48,7 +48,7 @@ extern bool dccp_debug; extern struct inet_hashinfo dccp_hashinfo; -extern struct percpu_counter dccp_orphan_count; +DECLARE_PER_CPU(unsigned int, dccp_orphan_count); void dccp_time_wait(struct sock *sk, int state, int timeo); diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index c5c74a34d139..91e7a2202697 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -94,6 +94,8 @@ struct sock *dccp_create_openreq_child(const struct sock *sk, newdp->dccps_role = DCCP_ROLE_SERVER; newdp->dccps_hc_rx_ackvec = NULL; newdp->dccps_service_list = NULL; + newdp->dccps_hc_rx_ccid = NULL; + newdp->dccps_hc_tx_ccid = NULL; newdp->dccps_service = dreq->dreq_service; newdp->dccps_timestamp_echo = dreq->dreq_timestamp_echo; newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 7eb0fb231940..fc44dadc778b 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -42,8 +42,8 @@ DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; EXPORT_SYMBOL_GPL(dccp_statistics); -struct percpu_counter dccp_orphan_count; -EXPORT_SYMBOL_GPL(dccp_orphan_count); +DEFINE_PER_CPU(unsigned int, dccp_orphan_count); +EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count); struct inet_hashinfo dccp_hashinfo; EXPORT_SYMBOL_GPL(dccp_hashinfo); @@ -1055,7 +1055,7 @@ adjudge_to_death: bh_lock_sock(sk); WARN_ON(sock_owned_by_user(sk)); - percpu_counter_inc(sk->sk_prot->orphan_count); + this_cpu_inc(dccp_orphan_count); /* Have we already been destroyed by a softirq or backlog? */ if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED) @@ -1115,18 +1115,15 @@ static int __init dccp_init(void) BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > sizeof_field(struct sk_buff, cb)); - rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL); - if (rc) - goto out_fail; inet_hashinfo_init(&dccp_hashinfo); rc = inet_hashinfo2_init_mod(&dccp_hashinfo); if (rc) - goto out_free_percpu; + goto out_fail; rc = -ENOBUFS; dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!dccp_hashinfo.bind_bucket_cachep) goto out_free_hashinfo2; @@ -1226,8 +1223,6 @@ out_free_bind_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); out_free_hashinfo2: inet_hashinfo2_free_mod(&dccp_hashinfo); -out_free_percpu: - percpu_counter_destroy(&dccp_orphan_count); out_fail: dccp_hashinfo.bhash = NULL; dccp_hashinfo.ehash = NULL; @@ -1250,7 +1245,6 @@ static void __exit dccp_fini(void) dccp_ackvec_exit(); dccp_sysctl_exit(); inet_hashinfo2_free_mod(&dccp_hashinfo); - percpu_counter_destroy(&dccp_orphan_count); } module_init(dccp_init); diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index d1c50a48614b..0ee7d4c0c955 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -521,8 +521,7 @@ int dn_dev_set_default(struct net_device *dev, int force) } spin_unlock(&dndev_lock); - if (old) - dev_put(old); + dev_put(old); return rv; } @@ -536,8 +535,7 @@ static void dn_dev_check_default(struct net_device *dev) } spin_unlock(&dndev_lock); - if (dev) - dev_put(dev); + dev_put(dev); } /* diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index 77fbf8e9df4b..269c029ad74f 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -92,8 +92,7 @@ void dn_fib_free_info(struct dn_fib_info *fi) } change_nexthops(fi) { - if (nh->nh_dev) - dev_put(nh->nh_dev); + dev_put(nh->nh_dev); nh->nh_dev = NULL; } endfor_nexthops(fi); kfree(fi); @@ -102,7 +101,7 @@ void dn_fib_free_info(struct dn_fib_info *fi) void dn_fib_release_info(struct dn_fib_info *fi) { spin_lock(&dn_fib_info_lock); - if (fi && --fi->fib_treeref == 0) { + if (fi && refcount_dec_and_test(&fi->fib_treeref)) { if (fi->fib_next) fi->fib_next->fib_prev = fi->fib_prev; if (fi->fib_prev) @@ -385,11 +384,11 @@ link_it: if ((ofi = dn_fib_find_info(fi)) != NULL) { fi->fib_dead = 1; dn_fib_free_info(fi); - ofi->fib_treeref++; + refcount_inc(&ofi->fib_treeref); return ofi; } - fi->fib_treeref++; + refcount_set(&fi->fib_treeref, 1); refcount_set(&fi->fib_clntref, 1); spin_lock(&dn_fib_info_lock); fi->fib_next = dn_fib_info_list; diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 729d3de6020d..7e85f2a1ae25 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1026,8 +1026,7 @@ source_ok: if (!fld.daddr) { fld.daddr = fld.saddr; - if (dev_out) - dev_put(dev_out); + dev_put(dev_out); err = -EINVAL; dev_out = init_net.loopback_dev; if (!dev_out->dn_ptr) @@ -1084,8 +1083,7 @@ source_ok: neigh_release(neigh); neigh = NULL; } else { - if (dev_out) - dev_put(dev_out); + dev_put(dev_out); if (dn_dev_islocal(neigh->dev, fld.daddr)) { dev_out = init_net.loopback_dev; res.type = RTN_LOCAL; @@ -1144,8 +1142,7 @@ select_source: if (res.type == RTN_LOCAL) { if (!fld.saddr) fld.saddr = fld.daddr; - if (dev_out) - dev_put(dev_out); + dev_put(dev_out); dev_out = init_net.loopback_dev; dev_hold(dev_out); if (!dev_out->dn_ptr) @@ -1168,8 +1165,7 @@ select_source: if (!fld.saddr) fld.saddr = DN_FIB_RES_PREFSRC(res); - if (dev_out) - dev_put(dev_out); + dev_put(dev_out); dev_out = DN_FIB_RES_DEV(res); dev_hold(dev_out); fld.flowidn_oif = dev_out->ifindex; @@ -1222,8 +1218,7 @@ done: neigh_release(neigh); if (free_res) dn_fib_res_put(&res); - if (dev_out) - dev_put(dev_out); + dev_put(dev_out); out: return err; @@ -1503,8 +1498,7 @@ done: if (free_res) dn_fib_res_put(&res); dev_put(in_dev); - if (out_dev) - dev_put(out_dev); + dev_put(out_dev); out: return err; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 00bb89b2d86f..8cb87b5067ee 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -18,16 +18,6 @@ if NET_DSA # Drivers must select the appropriate tagging format(s) -config NET_DSA_TAG_8021Q - tristate - select VLAN_8021Q - help - Unlike the other tagging protocols, the 802.1Q config option simply - provides helpers for other tagging implementations that might rely on - VLAN in one way or another. It is not a complete solution. - - Drivers which use these helpers should select this as dependency. - config NET_DSA_TAG_AR9331 tristate "Tag driver for Atheros AR9331 SoC with built-in switch" help @@ -102,17 +92,8 @@ config NET_DSA_TAG_KSZ Say Y if you want to enable support for tagging frames for the Microchip 8795/9477/9893 families of switches. -config NET_DSA_TAG_RTL4_A - tristate "Tag driver for Realtek 4 byte protocol A tags" - help - Say Y or M if you want to enable support for tagging frames for the - Realtek switches with 4 byte protocol A tags, sich as found in - the Realtek RTL8366RB. - config NET_DSA_TAG_OCELOT tristate "Tag driver for Ocelot family of switches, using NPI port" - depends on MSCC_OCELOT_SWITCH_LIB || \ - (MSCC_OCELOT_SWITCH_LIB=n && COMPILE_TEST) select PACKING help Say Y or M if you want to enable NPI tagging for the Ocelot switches @@ -124,9 +105,6 @@ config NET_DSA_TAG_OCELOT config NET_DSA_TAG_OCELOT_8021Q tristate "Tag driver for Ocelot family of switches, using VLAN" - depends on MSCC_OCELOT_SWITCH_LIB || \ - (MSCC_OCELOT_SWITCH_LIB=n && COMPILE_TEST) - select NET_DSA_TAG_8021Q help Say Y or M if you want to enable support for tagging frames with a custom VLAN-based header. Frames that require timestamping, such as @@ -141,6 +119,19 @@ config NET_DSA_TAG_QCA Say Y or M if you want to enable support for tagging frames for the Qualcomm Atheros QCA8K switches. +config NET_DSA_TAG_RTL4_A + tristate "Tag driver for Realtek 4 byte protocol A tags" + help + Say Y or M if you want to enable support for tagging frames for the + Realtek switches with 4 byte protocol A tags, sich as found in + the Realtek RTL8366RB. + +config NET_DSA_TAG_RTL8_4 + tristate "Tag driver for Realtek 8 byte protocol 4 tags" + help + Say Y or M if you want to enable support for tagging frames for Realtek + switches with 8 byte protocol 4 tags, such as the Realtek RTL8365MB-VC. + config NET_DSA_TAG_LAN9303 tristate "Tag driver for SMSC/Microchip LAN9303 family of switches" help @@ -149,7 +140,6 @@ config NET_DSA_TAG_LAN9303 config NET_DSA_TAG_SJA1105 tristate "Tag driver for NXP SJA1105 switches" - select NET_DSA_TAG_8021Q select PACKING help Say Y or M if you want to enable support for tagging frames with the diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 44bc79952b8b..9f75820e7c98 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,22 +1,22 @@ # SPDX-License-Identifier: GPL-2.0 # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o +dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o tag_8021q.o # tagging formats -obj-$(CONFIG_NET_DSA_TAG_8021Q) += tag_8021q.o obj-$(CONFIG_NET_DSA_TAG_AR9331) += tag_ar9331.o obj-$(CONFIG_NET_DSA_TAG_BRCM_COMMON) += tag_brcm.o obj-$(CONFIG_NET_DSA_TAG_DSA_COMMON) += tag_dsa.o obj-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o obj-$(CONFIG_NET_DSA_TAG_HELLCREEK) += tag_hellcreek.o obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o -obj-$(CONFIG_NET_DSA_TAG_RTL4_A) += tag_rtl4_a.o obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o obj-$(CONFIG_NET_DSA_TAG_OCELOT_8021Q) += tag_ocelot_8021q.o obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o +obj-$(CONFIG_NET_DSA_TAG_RTL4_A) += tag_rtl4_a.o +obj-$(CONFIG_NET_DSA_TAG_RTL8_4) += tag_rtl8_4.o obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o obj-$(CONFIG_NET_DSA_TAG_XRS700X) += tag_xrs700x.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 84cad1be9ce4..ea5169e671ae 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -238,7 +238,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb) return 0; - nskb = cpu_dp->rcv(skb, dev, pt); + nskb = cpu_dp->rcv(skb, dev); if (!nskb) { kfree_skb(skb); return 0; @@ -280,23 +280,22 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, } #ifdef CONFIG_PM_SLEEP -static bool dsa_is_port_initialized(struct dsa_switch *ds, int p) +static bool dsa_port_is_initialized(const struct dsa_port *dp) { - const struct dsa_port *dp = dsa_to_port(ds, p); - return dp->type == DSA_PORT_TYPE_USER && dp->slave; } int dsa_switch_suspend(struct dsa_switch *ds) { - int i, ret = 0; + struct dsa_port *dp; + int ret = 0; /* Suspend slave network devices */ - for (i = 0; i < ds->num_ports; i++) { - if (!dsa_is_port_initialized(ds, i)) + dsa_switch_for_each_port(dp, ds) { + if (!dsa_port_is_initialized(dp)) continue; - ret = dsa_slave_suspend(dsa_to_port(ds, i)->slave); + ret = dsa_slave_suspend(dp->slave); if (ret) return ret; } @@ -310,7 +309,8 @@ EXPORT_SYMBOL_GPL(dsa_switch_suspend); int dsa_switch_resume(struct dsa_switch *ds) { - int i, ret = 0; + struct dsa_port *dp; + int ret = 0; if (ds->ops->resume) ret = ds->ops->resume(ds); @@ -319,11 +319,11 @@ int dsa_switch_resume(struct dsa_switch *ds) return ret; /* Resume slave network devices */ - for (i = 0; i < ds->num_ports; i++) { - if (!dsa_is_port_initialized(ds, i)) + dsa_switch_for_each_port(dp, ds) { + if (!dsa_port_is_initialized(dp)) continue; - ret = dsa_slave_resume(dsa_to_port(ds, i)->slave); + ret = dsa_slave_resume(dp->slave); if (ret) return ret; } @@ -345,6 +345,11 @@ bool dsa_schedule_work(struct work_struct *work) return queue_work(dsa_owq, work); } +void dsa_flush_workqueue(void) +{ + flush_workqueue(dsa_owq); +} + int dsa_devlink_param_get(struct devlink *dl, u32 id, struct devlink_param_gset_ctx *ctx) { diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 185629f27f80..826957b6442b 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -21,6 +21,9 @@ static DEFINE_MUTEX(dsa2_mutex); LIST_HEAD(dsa_tree_list); +/* Track the bridges with forwarding offload enabled */ +static unsigned long dsa_fwd_offloading_bridges; + /** * dsa_tree_notify - Execute code for all switches in a DSA switch tree. * @dst: collection of struct dsa_switch devices to notify. @@ -49,6 +52,9 @@ int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v) * Can be used to notify the switching fabric of events such as cross-chip * bridging between disjoint trees (such as islands of tagger-compatible * switches bridged by an incompatible middle switch). + * + * WARNING: this function is not reliable during probe time, because probing + * between trees is asynchronous and not all DSA trees might have probed. */ int dsa_broadcast(unsigned long e, void *v) { @@ -123,6 +129,51 @@ void dsa_lag_unmap(struct dsa_switch_tree *dst, struct net_device *lag) } } +static int dsa_bridge_num_find(const struct net_device *bridge_dev) +{ + struct dsa_switch_tree *dst; + struct dsa_port *dp; + + /* When preparing the offload for a port, it will have a valid + * dp->bridge_dev pointer but a not yet valid dp->bridge_num. + * However there might be other ports having the same dp->bridge_dev + * and a valid dp->bridge_num, so just ignore this port. + */ + list_for_each_entry(dst, &dsa_tree_list, list) + list_for_each_entry(dp, &dst->ports, list) + if (dp->bridge_dev == bridge_dev && + dp->bridge_num != -1) + return dp->bridge_num; + + return -1; +} + +int dsa_bridge_num_get(const struct net_device *bridge_dev, int max) +{ + int bridge_num = dsa_bridge_num_find(bridge_dev); + + if (bridge_num < 0) { + /* First port that offloads TX forwarding for this bridge */ + bridge_num = find_first_zero_bit(&dsa_fwd_offloading_bridges, + DSA_MAX_NUM_OFFLOADING_BRIDGES); + if (bridge_num >= max) + return -1; + + set_bit(bridge_num, &dsa_fwd_offloading_bridges); + } + + return bridge_num; +} + +void dsa_bridge_num_put(const struct net_device *bridge_dev, int bridge_num) +{ + /* Check if the bridge is still in use, otherwise it is time + * to clean it up so we can reuse this bridge_num later. + */ + if (dsa_bridge_num_find(bridge_dev) < 0) + clear_bit(bridge_num, &dsa_fwd_offloading_bridges); +} + struct dsa_switch *dsa_switch_find(int tree_index, int sw_index) { struct dsa_switch_tree *dst; @@ -311,6 +362,9 @@ static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst) return NULL; } +/* Assign the default CPU port (the first one in the tree) to all ports of the + * fabric which don't already have one as part of their own switch. + */ static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) { struct dsa_port *cpu_dp, *dp; @@ -321,15 +375,45 @@ static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst) return -EINVAL; } - /* Assign the default CPU port to all ports of the fabric */ - list_for_each_entry(dp, &dst->ports, list) + list_for_each_entry(dp, &dst->ports, list) { + if (dp->cpu_dp) + continue; + if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) dp->cpu_dp = cpu_dp; + } return 0; } -static void dsa_tree_teardown_default_cpu(struct dsa_switch_tree *dst) +/* Perform initial assignment of CPU ports to user ports and DSA links in the + * fabric, giving preference to CPU ports local to each switch. Default to + * using the first CPU port in the switch tree if the port does not have a CPU + * port local to this switch. + */ +static int dsa_tree_setup_cpu_ports(struct dsa_switch_tree *dst) +{ + struct dsa_port *cpu_dp, *dp; + + list_for_each_entry(cpu_dp, &dst->ports, list) { + if (!dsa_port_is_cpu(cpu_dp)) + continue; + + /* Prefer a local CPU port */ + dsa_switch_for_each_port(dp, cpu_dp->ds) { + /* Prefer the first local CPU port found */ + if (dp->cpu_dp) + continue; + + if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp)) + dp->cpu_dp = cpu_dp; + } + } + + return dsa_tree_setup_default_cpu(dst); +} + +static void dsa_tree_teardown_cpu_ports(struct dsa_switch_tree *dst) { struct dsa_port *dp; @@ -342,15 +426,23 @@ static int dsa_port_setup(struct dsa_port *dp) { struct devlink_port *dlp = &dp->devlink_port; bool dsa_port_link_registered = false; + struct dsa_switch *ds = dp->ds; bool dsa_port_enabled = false; int err = 0; if (dp->setup) return 0; + mutex_init(&dp->addr_lists_lock); INIT_LIST_HEAD(&dp->fdbs); INIT_LIST_HEAD(&dp->mdbs); + if (ds->ops->port_setup) { + err = ds->ops->port_setup(ds, dp->index); + if (err) + return err; + } + switch (dp->type) { case DSA_PORT_TYPE_UNUSED: dsa_port_disable(dp); @@ -393,8 +485,11 @@ static int dsa_port_setup(struct dsa_port *dp) dsa_port_disable(dp); if (err && dsa_port_link_registered) dsa_port_link_unregister_of(dp); - if (err) + if (err) { + if (ds->ops->port_teardown) + ds->ops->port_teardown(ds, dp->index); return err; + } dp->setup = true; @@ -446,11 +541,15 @@ static int dsa_port_devlink_setup(struct dsa_port *dp) static void dsa_port_teardown(struct dsa_port *dp) { struct devlink_port *dlp = &dp->devlink_port; + struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a, *tmp; if (!dp->setup) return; + if (ds->ops->port_teardown) + ds->ops->port_teardown(ds, dp->index); + devlink_port_type_clear(dlp); switch (dp->type) { @@ -494,6 +593,36 @@ static void dsa_port_devlink_teardown(struct dsa_port *dp) dp->devlink_port_setup = false; } +/* Destroy the current devlink port, and create a new one which has the UNUSED + * flavour. At this point, any call to ds->ops->port_setup has been already + * balanced out by a call to ds->ops->port_teardown, so we know that any + * devlink port regions the driver had are now unregistered. We then call its + * ds->ops->port_setup again, in order for the driver to re-create them on the + * new devlink port. + */ +static int dsa_port_reinit_as_unused(struct dsa_port *dp) +{ + struct dsa_switch *ds = dp->ds; + int err; + + dsa_port_devlink_teardown(dp); + dp->type = DSA_PORT_TYPE_UNUSED; + err = dsa_port_devlink_setup(dp); + if (err) + return err; + + if (ds->ops->port_setup) { + /* On error, leave the devlink port registered, + * dsa_switch_teardown will clean it up later. + */ + err = ds->ops->port_setup(ds, dp->index); + if (err) + return err; + } + + return 0; +} + static int dsa_devlink_info_get(struct devlink *dl, struct devlink_info_req *req, struct netlink_ext_ack *extack) @@ -671,16 +800,17 @@ static int dsa_switch_setup_tag_protocol(struct dsa_switch *ds) { const struct dsa_device_ops *tag_ops = ds->dst->tag_ops; struct dsa_switch_tree *dst = ds->dst; - int port, err; + struct dsa_port *cpu_dp; + int err; if (tag_ops->proto == dst->default_proto) return 0; - for (port = 0; port < ds->num_ports; port++) { - if (!dsa_is_cpu_port(ds, port)) - continue; - - err = ds->ops->change_tag_protocol(ds, port, tag_ops->proto); + dsa_switch_for_each_cpu_port(cpu_dp, ds) { + rtnl_lock(); + err = ds->ops->change_tag_protocol(ds, cpu_dp->index, + tag_ops->proto); + rtnl_unlock(); if (err) { dev_err(ds->dev, "Unable to use tag protocol \"%s\": %pe\n", tag_ops->name, ERR_PTR(err)); @@ -710,25 +840,20 @@ static int dsa_switch_setup(struct dsa_switch *ds) /* Add the switch to devlink before calling setup, so that setup can * add dpipe tables */ - ds->devlink = devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv)); + ds->devlink = + devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv), ds->dev); if (!ds->devlink) return -ENOMEM; dl_priv = devlink_priv(ds->devlink); dl_priv->ds = ds; - err = devlink_register(ds->devlink, ds->dev); - if (err) - goto free_devlink; - /* Setup devlink port instances now, so that the switch * setup() can register regions etc, against the ports */ - list_for_each_entry(dp, &ds->dst->ports, list) { - if (dp->ds == ds) { - err = dsa_port_devlink_setup(dp); - if (err) - goto unregister_devlink_ports; - } + dsa_switch_for_each_port(dp, ds) { + err = dsa_port_devlink_setup(dp); + if (err) + goto unregister_devlink_ports; } err = dsa_switch_register_notifier(ds); @@ -745,10 +870,8 @@ static int dsa_switch_setup(struct dsa_switch *ds) if (err) goto teardown; - devlink_params_publish(ds->devlink); - if (!ds->slave_mii_bus && ds->ops->phy_read) { - ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); + ds->slave_mii_bus = mdiobus_alloc(); if (!ds->slave_mii_bus) { err = -ENOMEM; goto teardown; @@ -758,27 +881,26 @@ static int dsa_switch_setup(struct dsa_switch *ds) err = mdiobus_register(ds->slave_mii_bus); if (err < 0) - goto teardown; + goto free_slave_mii_bus; } ds->setup = true; - + devlink_register(ds->devlink); return 0; +free_slave_mii_bus: + if (ds->slave_mii_bus && ds->ops->phy_read) + mdiobus_free(ds->slave_mii_bus); teardown: if (ds->ops->teardown) ds->ops->teardown(ds); unregister_notifier: dsa_switch_unregister_notifier(ds); unregister_devlink_ports: - list_for_each_entry(dp, &ds->dst->ports, list) - if (dp->ds == ds) - dsa_port_devlink_teardown(dp); - devlink_unregister(ds->devlink); -free_devlink: + dsa_switch_for_each_port(dp, ds) + dsa_port_devlink_teardown(dp); devlink_free(ds->devlink); ds->devlink = NULL; - return err; } @@ -789,19 +911,23 @@ static void dsa_switch_teardown(struct dsa_switch *ds) if (!ds->setup) return; - if (ds->slave_mii_bus && ds->ops->phy_read) - mdiobus_unregister(ds->slave_mii_bus); + if (ds->devlink) + devlink_unregister(ds->devlink); - dsa_switch_unregister_notifier(ds); + if (ds->slave_mii_bus && ds->ops->phy_read) { + mdiobus_unregister(ds->slave_mii_bus); + mdiobus_free(ds->slave_mii_bus); + ds->slave_mii_bus = NULL; + } if (ds->ops->teardown) ds->ops->teardown(ds); + dsa_switch_unregister_notifier(ds); + if (ds->devlink) { - list_for_each_entry(dp, &ds->dst->ports, list) - if (dp->ds == ds) - dsa_port_devlink_teardown(dp); - devlink_unregister(ds->devlink); + dsa_switch_for_each_port(dp, ds) + dsa_port_devlink_teardown(dp); devlink_free(ds->devlink); ds->devlink = NULL; } @@ -809,6 +935,33 @@ static void dsa_switch_teardown(struct dsa_switch *ds) ds->setup = false; } +/* First tear down the non-shared, then the shared ports. This ensures that + * all work items scheduled by our switchdev handlers for user ports have + * completed before we destroy the refcounting kept on the shared ports. + */ +static void dsa_tree_teardown_ports(struct dsa_switch_tree *dst) +{ + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_user(dp) || dsa_port_is_unused(dp)) + dsa_port_teardown(dp); + + dsa_flush_workqueue(); + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp)) + dsa_port_teardown(dp); +} + +static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst) +{ + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + dsa_switch_teardown(dp->ds); +} + static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) { struct dsa_port *dp; @@ -823,38 +976,22 @@ static int dsa_tree_setup_switches(struct dsa_switch_tree *dst) list_for_each_entry(dp, &dst->ports, list) { err = dsa_port_setup(dp); if (err) { - dsa_port_devlink_teardown(dp); - dp->type = DSA_PORT_TYPE_UNUSED; - err = dsa_port_devlink_setup(dp); + err = dsa_port_reinit_as_unused(dp); if (err) goto teardown; - continue; } } return 0; teardown: - list_for_each_entry(dp, &dst->ports, list) - dsa_port_teardown(dp); + dsa_tree_teardown_ports(dst); - list_for_each_entry(dp, &dst->ports, list) - dsa_switch_teardown(dp->ds); + dsa_tree_teardown_switches(dst); return err; } -static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst) -{ - struct dsa_port *dp; - - list_for_each_entry(dp, &dst->ports, list) - dsa_port_teardown(dp); - - list_for_each_entry(dp, &dst->ports, list) - dsa_switch_teardown(dp->ds); -} - static int dsa_tree_setup_master(struct dsa_switch_tree *dst) { struct dsa_port *dp; @@ -921,13 +1058,13 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst) if (!complete) return 0; - err = dsa_tree_setup_default_cpu(dst); + err = dsa_tree_setup_cpu_ports(dst); if (err) return err; err = dsa_tree_setup_switches(dst); if (err) - goto teardown_default_cpu; + goto teardown_cpu_ports; err = dsa_tree_setup_master(dst); if (err) @@ -946,9 +1083,10 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst) teardown_master: dsa_tree_teardown_master(dst); teardown_switches: + dsa_tree_teardown_ports(dst); dsa_tree_teardown_switches(dst); -teardown_default_cpu: - dsa_tree_teardown_default_cpu(dst); +teardown_cpu_ports: + dsa_tree_teardown_cpu_ports(dst); return err; } @@ -964,9 +1102,11 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst) dsa_tree_teardown_master(dst); + dsa_tree_teardown_ports(dst); + dsa_tree_teardown_switches(dst); - dsa_tree_teardown_default_cpu(dst); + dsa_tree_teardown_cpu_ports(dst); list_for_each_entry_safe(dl, next, &dst->rtable, list) { list_del(&dl->list); @@ -1003,7 +1143,7 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, goto out_unlock; list_for_each_entry(dp, &dst->ports, list) { - if (!dsa_is_user_port(dp->ds, dp->index)) + if (!dsa_port_is_user(dp)) continue; if (dp->slave->flags & IFF_UP) @@ -1034,8 +1174,8 @@ static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index) struct dsa_switch_tree *dst = ds->dst; struct dsa_port *dp; - list_for_each_entry(dp, &dst->ports, list) - if (dp->ds == ds && dp->index == index) + dsa_switch_for_each_port(dp, ds) + if (dp->index == index) return dp; dp = kzalloc(sizeof(*dp), GFP_KERNEL); @@ -1044,6 +1184,7 @@ static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index) dp->ds = ds; dp->index = index; + dp->bridge_num = -1; INIT_LIST_HEAD(&dp->list); list_add_tail(&dp->list, &dst->ports); @@ -1219,12 +1360,15 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds, for_each_available_child_of_node(ports, port) { err = of_property_read_u32(port, "reg", ®); - if (err) + if (err) { + of_node_put(port); goto out_put_node; + } if (reg >= ds->num_ports) { dev_err(ds->dev, "port %pOF index %u exceeds num_ports (%zu)\n", port, reg, ds->num_ports); + of_node_put(port); err = -EINVAL; goto out_put_node; } @@ -1232,8 +1376,10 @@ static int dsa_switch_parse_ports_of(struct dsa_switch *ds, dp = dsa_to_port(ds, reg); err = dsa_port_parse_of(dp, port); - if (err) + if (err) { + of_node_put(port); goto out_put_node; + } } out_put_node: @@ -1265,6 +1411,9 @@ static int dsa_switch_parse_member_of(struct dsa_switch *ds, return -EEXIST; } + if (ds->dst->last_switch < ds->index) + ds->dst->last_switch = ds->index; + return 0; } @@ -1372,12 +1521,9 @@ static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) static void dsa_switch_release_ports(struct dsa_switch *ds) { - struct dsa_switch_tree *dst = ds->dst; struct dsa_port *dp, *next; - list_for_each_entry_safe(dp, next, &dst->ports, list) { - if (dp->ds != ds) - continue; + dsa_switch_for_each_port_safe(dp, next, ds) { list_del(&dp->list); kfree(dp); } @@ -1454,3 +1600,47 @@ void dsa_unregister_switch(struct dsa_switch *ds) mutex_unlock(&dsa2_mutex); } EXPORT_SYMBOL_GPL(dsa_unregister_switch); + +/* If the DSA master chooses to unregister its net_device on .shutdown, DSA is + * blocking that operation from completion, due to the dev_hold taken inside + * netdev_upper_dev_link. Unlink the DSA slave interfaces from being uppers of + * the DSA master, so that the system can reboot successfully. + */ +void dsa_switch_shutdown(struct dsa_switch *ds) +{ + struct net_device *master, *slave_dev; + LIST_HEAD(unregister_list); + struct dsa_port *dp; + + mutex_lock(&dsa2_mutex); + rtnl_lock(); + + dsa_switch_for_each_user_port(dp, ds) { + master = dp->cpu_dp->master; + slave_dev = dp->slave; + + netdev_upper_dev_unlink(master, slave_dev); + /* Just unlinking ourselves as uppers of the master is not + * sufficient. When the master net device unregisters, that will + * also call dev_close, which we will catch as NETDEV_GOING_DOWN + * and trigger a dev_close on our own devices (dsa_slave_close). + * In turn, that will call dev_mc_unsync on the master's net + * device. If the master is also a DSA switch port, this will + * trigger dsa_slave_set_rx_mode which will call dev_mc_sync on + * its own master. Lockdep will complain about the fact that + * all cascaded masters have the same dsa_master_addr_list_lock_key, + * which it normally would not do if the cascaded masters would + * be in a proper upper/lower relationship, which we've just + * destroyed. + * To suppress the lockdep warnings, let's actually unregister + * the DSA slave interfaces too, to avoid the nonsensical + * multicast address list synchronization on shutdown. + */ + unregister_netdevice_queue(slave_dev, &unregister_list); + } + unregister_netdevice_many(&unregister_list); + + rtnl_unlock(); + mutex_unlock(&dsa2_mutex); +} +EXPORT_SYMBOL_GPL(dsa_switch_shutdown); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index f201c33980bf..a5c9bc7b66c6 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -14,6 +14,8 @@ #include <net/dsa.h> #include <net/gro_cells.h> +#define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG + enum { DSA_NOTIFIER_AGEING_TIME, DSA_NOTIFIER_BRIDGE_JOIN, @@ -39,6 +41,8 @@ enum { DSA_NOTIFIER_MRP_DEL, DSA_NOTIFIER_MRP_ADD_RING_ROLE, DSA_NOTIFIER_MRP_DEL_RING_ROLE, + DSA_NOTIFIER_TAG_8021Q_VLAN_ADD, + DSA_NOTIFIER_TAG_8021Q_VLAN_DEL, }; /* DSA_NOTIFIER_AGEING_TIME */ @@ -113,6 +117,14 @@ struct dsa_notifier_mrp_ring_role_info { int port; }; +/* DSA_NOTIFIER_TAG_8021Q_VLAN_* */ +struct dsa_notifier_tag_8021q_vlan_info { + int tree_index; + int sw_index; + int port; + u16 vid; +}; + struct dsa_switchdev_event_work { struct dsa_switch *ds; int port; @@ -158,6 +170,7 @@ void dsa_tag_driver_put(const struct dsa_device_ops *ops); const struct dsa_device_ops *dsa_find_tagger_by_name(const char *buf); bool dsa_schedule_work(struct work_struct *work); +void dsa_flush_workqueue(void); const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops); static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops) @@ -187,23 +200,21 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, /* port.c */ void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, const struct dsa_device_ops *tag_ops); -int dsa_port_set_state(struct dsa_port *dp, u8 state); +int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age); int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy); int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy); void dsa_port_disable_rt(struct dsa_port *dp); void dsa_port_disable(struct dsa_port *dp); int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, struct netlink_ext_ack *extack); -int dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br, - struct netlink_ext_ack *extack); +void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); int dsa_port_lag_change(struct dsa_port *dp, struct netdev_lag_lower_state_info *linfo); int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev, struct netdev_lag_upper_info *uinfo, struct netlink_ext_ack *extack); -int dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag_dev, - struct netlink_ext_ack *extack); +void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag_dev); void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev); int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, struct netlink_ext_ack *extack); @@ -231,11 +242,9 @@ int dsa_port_host_mdb_del(const struct dsa_port *dp, int dsa_port_pre_bridge_flags(const struct dsa_port *dp, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); -int dsa_port_bridge_flags(const struct dsa_port *dp, +int dsa_port_bridge_flags(struct dsa_port *dp, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack); -int dsa_port_mrouter(struct dsa_port *dp, bool mrouter, - struct netlink_ext_ack *extack); int dsa_port_vlan_add(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan, struct netlink_ext_ack *extack); @@ -253,16 +262,18 @@ int dsa_port_link_register_of(struct dsa_port *dp); void dsa_port_link_unregister_of(struct dsa_port *dp); int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr); void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr); +int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast); +void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast); extern const struct phylink_mac_ops dsa_port_phylink_mac_ops; static inline bool dsa_port_offloads_bridge_port(struct dsa_port *dp, - struct net_device *dev) + const struct net_device *dev) { return dsa_port_to_bridge_port(dp) == dev; } static inline bool dsa_port_offloads_bridge(struct dsa_port *dp, - struct net_device *bridge_dev) + const struct net_device *bridge_dev) { /* DSA ports connected to a bridge, and event was emitted * for the bridge. @@ -272,7 +283,7 @@ static inline bool dsa_port_offloads_bridge(struct dsa_port *dp, /* Returns true if any port of this tree offloads the given net_device */ static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst, - struct net_device *dev) + const struct net_device *dev) { struct dsa_port *dp; @@ -283,6 +294,19 @@ static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst, return false; } +/* Returns true if any port of this tree offloads the given bridge */ +static inline bool dsa_tree_offloads_bridge(struct dsa_switch_tree *dst, + const struct net_device *bridge_dev) +{ + struct dsa_port *dp; + + list_for_each_entry(dp, &dst->ports, list) + if (dsa_port_offloads_bridge(dp, bridge_dev)) + return true; + + return false; +} + /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; extern struct notifier_block dsa_slave_switchdev_notifier; @@ -297,6 +321,8 @@ int dsa_slave_register_notifier(void); void dsa_slave_unregister_notifier(void); void dsa_slave_setup_tagger(struct net_device *slave); int dsa_slave_change_mtu(struct net_device *dev, int new_mtu); +int dsa_slave_manage_vlan_filtering(struct net_device *dev, + bool vlan_filtering); static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev) { @@ -372,6 +398,141 @@ static inline struct sk_buff *dsa_untag_bridge_pvid(struct sk_buff *skb) return skb; } +/* For switches without hardware support for DSA tagging to be able + * to support termination through the bridge. + */ +static inline struct net_device * +dsa_find_designated_bridge_port_by_vid(struct net_device *master, u16 vid) +{ + struct dsa_port *cpu_dp = master->dsa_ptr; + struct dsa_switch_tree *dst = cpu_dp->dst; + struct bridge_vlan_info vinfo; + struct net_device *slave; + struct dsa_port *dp; + int err; + + list_for_each_entry(dp, &dst->ports, list) { + if (dp->type != DSA_PORT_TYPE_USER) + continue; + + if (!dp->bridge_dev) + continue; + + if (dp->stp_state != BR_STATE_LEARNING && + dp->stp_state != BR_STATE_FORWARDING) + continue; + + /* Since the bridge might learn this packet, keep the CPU port + * affinity with the port that will be used for the reply on + * xmit. + */ + if (dp->cpu_dp != cpu_dp) + continue; + + slave = dp->slave; + + err = br_vlan_get_info_rcu(slave, vid, &vinfo); + if (err) + continue; + + return slave; + } + + return NULL; +} + +/* If the ingress port offloads the bridge, we mark the frame as autonomously + * forwarded by hardware, so the software bridge doesn't forward in twice, back + * to us, because we already did. However, if we're in fallback mode and we do + * software bridging, we are not offloading it, therefore the dp->bridge_dev + * pointer is not populated, and flooding needs to be done by software (we are + * effectively operating in standalone ports mode). + */ +static inline void dsa_default_offload_fwd_mark(struct sk_buff *skb) +{ + struct dsa_port *dp = dsa_slave_to_port(skb->dev); + + skb->offload_fwd_mark = !!(dp->bridge_dev); +} + +/* Helper for removing DSA header tags from packets in the RX path. + * Must not be called before skb_pull(len). + * skb->data + * | + * v + * | | | | | | | | | | | | | | | | | | | + * +-----------------------+-----------------------+---------------+-------+ + * | Destination MAC | Source MAC | DSA header | EType | + * +-----------------------+-----------------------+---------------+-------+ + * | | + * <----- len -----> <----- len -----> + * | + * >>>>>>> v + * >>>>>>> | | | | | | | | | | | | | | | + * >>>>>>> +-----------------------+-----------------------+-------+ + * >>>>>>> | Destination MAC | Source MAC | EType | + * +-----------------------+-----------------------+-------+ + * ^ + * | + * skb->data + */ +static inline void dsa_strip_etype_header(struct sk_buff *skb, int len) +{ + memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - len, 2 * ETH_ALEN); +} + +/* Helper for creating space for DSA header tags in TX path packets. + * Must not be called before skb_push(len). + * + * Before: + * + * <<<<<<< | | | | | | | | | | | | | | | + * ^ <<<<<<< +-----------------------+-----------------------+-------+ + * | <<<<<<< | Destination MAC | Source MAC | EType | + * | +-----------------------+-----------------------+-------+ + * <----- len -----> + * | + * | + * skb->data + * + * After: + * + * | | | | | | | | | | | | | | | | | | | + * +-----------------------+-----------------------+---------------+-------+ + * | Destination MAC | Source MAC | DSA header | EType | + * +-----------------------+-----------------------+---------------+-------+ + * ^ | | + * | <----- len -----> + * skb->data + */ +static inline void dsa_alloc_etype_header(struct sk_buff *skb, int len) +{ + memmove(skb->data, skb->data + len, 2 * ETH_ALEN); +} + +/* On RX, eth_type_trans() on the DSA master pulls ETH_HLEN bytes starting from + * skb_mac_header(skb), which leaves skb->data pointing at the first byte after + * what the DSA master perceives as the EtherType (the beginning of the L3 + * protocol). Since DSA EtherType header taggers treat the EtherType as part of + * the DSA tag itself, and the EtherType is 2 bytes in length, the DSA header + * is located 2 bytes behind skb->data. Note that EtherType in this context + * means the first 2 bytes of the DSA header, not the encapsulated EtherType + * that will become visible after the DSA header is stripped. + */ +static inline void *dsa_etype_header_pos_rx(struct sk_buff *skb) +{ + return skb->data - 2; +} + +/* On TX, skb->data points to skb_mac_header(skb), which means that EtherType + * header taggers start exactly where the EtherType is (the EtherType is + * treated as part of the DSA header). + */ +static inline void *dsa_etype_header_pos_tx(struct sk_buff *skb) +{ + return skb->data + 2 * ETH_ALEN; +} + /* switch.c */ int dsa_switch_register_notifier(struct dsa_switch *ds); void dsa_switch_unregister_notifier(struct dsa_switch *ds); @@ -385,6 +546,18 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, struct net_device *master, const struct dsa_device_ops *tag_ops, const struct dsa_device_ops *old_tag_ops); +int dsa_bridge_num_get(const struct net_device *bridge_dev, int max); +void dsa_bridge_num_put(const struct net_device *bridge_dev, int bridge_num); + +/* tag_8021q.c */ +int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, + struct dsa_notifier_bridge_info *info); +int dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, + struct dsa_notifier_bridge_info *info); +int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info); +int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info); extern struct list_head dsa_tree_list; diff --git a/net/dsa/master.c b/net/dsa/master.c index 3fc90e36772d..e8e19857621b 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -210,14 +210,14 @@ static int dsa_master_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) break; } - if (dev->netdev_ops->ndo_do_ioctl) - err = dev->netdev_ops->ndo_do_ioctl(dev, ifr, cmd); + if (dev->netdev_ops->ndo_eth_ioctl) + err = dev->netdev_ops->ndo_eth_ioctl(dev, ifr, cmd); return err; } static const struct dsa_netdevice_ops dsa_netdev_ops = { - .ndo_do_ioctl = dsa_master_ioctl, + .ndo_eth_ioctl = dsa_master_ioctl, }; static int dsa_master_ethtool_setup(struct net_device *dev) diff --git a/net/dsa/port.c b/net/dsa/port.c index 28b45b7e66df..f6f12ad2b525 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -30,7 +30,52 @@ static int dsa_port_notify(const struct dsa_port *dp, unsigned long e, void *v) return dsa_tree_notify(dp->ds->dst, e, v); } -int dsa_port_set_state(struct dsa_port *dp, u8 state) +static void dsa_port_notify_bridge_fdb_flush(const struct dsa_port *dp) +{ + struct net_device *brport_dev = dsa_port_to_bridge_port(dp); + struct switchdev_notifier_fdb_info info = { + /* flush all VLANs */ + .vid = 0, + }; + + /* When the port becomes standalone it has already left the bridge. + * Don't notify the bridge in that case. + */ + if (!brport_dev) + return; + + call_switchdev_notifiers(SWITCHDEV_FDB_FLUSH_TO_BRIDGE, + brport_dev, &info.info, NULL); +} + +static void dsa_port_fast_age(const struct dsa_port *dp) +{ + struct dsa_switch *ds = dp->ds; + + if (!ds->ops->port_fast_age) + return; + + ds->ops->port_fast_age(ds, dp->index); + + dsa_port_notify_bridge_fdb_flush(dp); +} + +static bool dsa_port_can_configure_learning(struct dsa_port *dp) +{ + struct switchdev_brport_flags flags = { + .mask = BR_LEARNING, + }; + struct dsa_switch *ds = dp->ds; + int err; + + if (!ds->ops->port_bridge_flags || !ds->ops->port_pre_bridge_flags) + return false; + + err = ds->ops->port_pre_bridge_flags(ds, dp->index, flags, NULL); + return !err; +} + +int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age) { struct dsa_switch *ds = dp->ds; int port = dp->index; @@ -40,10 +85,14 @@ int dsa_port_set_state(struct dsa_port *dp, u8 state) ds->ops->port_stp_state_set(ds, port, state); - if (ds->ops->port_fast_age) { + if (!dsa_port_can_configure_learning(dp) || + (do_fast_age && dp->learning)) { /* Fast age FDB entries or flush appropriate forwarding database * for the given port, if we are moving it from Learning or * Forwarding state, to Disabled or Blocking or Listening state. + * Ports that were standalone before the STP state change don't + * need to fast age the FDB, since address learning is off in + * standalone mode. */ if ((dp->stp_state == BR_STATE_LEARNING || @@ -51,7 +100,7 @@ int dsa_port_set_state(struct dsa_port *dp, u8 state) (state == BR_STATE_DISABLED || state == BR_STATE_BLOCKING || state == BR_STATE_LISTENING)) - ds->ops->port_fast_age(ds, port); + dsa_port_fast_age(dp); } dp->stp_state = state; @@ -59,11 +108,12 @@ int dsa_port_set_state(struct dsa_port *dp, u8 state) return 0; } -static void dsa_port_set_state_now(struct dsa_port *dp, u8 state) +static void dsa_port_set_state_now(struct dsa_port *dp, u8 state, + bool do_fast_age) { int err; - err = dsa_port_set_state(dp, state); + err = dsa_port_set_state(dp, state, do_fast_age); if (err) pr_err("DSA: failed to set STP state %u (%d)\n", state, err); } @@ -81,7 +131,7 @@ int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy) } if (!dp->bridge_dev) - dsa_port_set_state_now(dp, BR_STATE_FORWARDING); + dsa_port_set_state_now(dp, BR_STATE_FORWARDING, false); if (dp->pl) phylink_start(dp->pl); @@ -109,7 +159,7 @@ void dsa_port_disable_rt(struct dsa_port *dp) phylink_stop(dp->pl); if (!dp->bridge_dev) - dsa_port_set_state_now(dp, BR_STATE_DISABLED); + dsa_port_set_state_now(dp, BR_STATE_DISABLED, false); if (ds->ops->port_disable) ds->ops->port_disable(ds, port); @@ -167,8 +217,8 @@ static void dsa_port_clear_brport_flags(struct dsa_port *dp) } } -static int dsa_port_switchdev_sync(struct dsa_port *dp, - struct netlink_ext_ack *extack) +static int dsa_port_switchdev_sync_attrs(struct dsa_port *dp, + struct netlink_ext_ack *extack) { struct net_device *brport_dev = dsa_port_to_bridge_port(dp); struct net_device *br = dp->bridge_dev; @@ -178,7 +228,7 @@ static int dsa_port_switchdev_sync(struct dsa_port *dp, if (err) return err; - err = dsa_port_set_state(dp, br_port_get_stp_state(brport_dev)); + err = dsa_port_set_state(dp, br_port_get_stp_state(brport_dev), false); if (err && err != -EOPNOTSUPP) return err; @@ -186,67 +236,10 @@ static int dsa_port_switchdev_sync(struct dsa_port *dp, if (err && err != -EOPNOTSUPP) return err; - err = dsa_port_mrouter(dp->cpu_dp, br_multicast_router(br), extack); - if (err && err != -EOPNOTSUPP) - return err; - err = dsa_port_ageing_time(dp, br_get_ageing_time(br)); if (err && err != -EOPNOTSUPP) return err; - err = br_mdb_replay(br, brport_dev, dp, true, - &dsa_slave_switchdev_blocking_notifier, extack); - if (err && err != -EOPNOTSUPP) - return err; - - /* Forwarding and termination FDB entries on the port */ - err = br_fdb_replay(br, brport_dev, dp, true, - &dsa_slave_switchdev_notifier); - if (err && err != -EOPNOTSUPP) - return err; - - /* Termination FDB entries on the bridge itself */ - err = br_fdb_replay(br, br, dp, true, &dsa_slave_switchdev_notifier); - if (err && err != -EOPNOTSUPP) - return err; - - err = br_vlan_replay(br, brport_dev, dp, true, - &dsa_slave_switchdev_blocking_notifier, extack); - if (err && err != -EOPNOTSUPP) - return err; - - return 0; -} - -static int dsa_port_switchdev_unsync_objs(struct dsa_port *dp, - struct net_device *br, - struct netlink_ext_ack *extack) -{ - struct net_device *brport_dev = dsa_port_to_bridge_port(dp); - int err; - - /* Delete the switchdev objects left on this port */ - err = br_mdb_replay(br, brport_dev, dp, false, - &dsa_slave_switchdev_blocking_notifier, extack); - if (err && err != -EOPNOTSUPP) - return err; - - /* Forwarding and termination FDB entries on the port */ - err = br_fdb_replay(br, brport_dev, dp, false, - &dsa_slave_switchdev_notifier); - if (err && err != -EOPNOTSUPP) - return err; - - /* Termination FDB entries on the bridge itself */ - err = br_fdb_replay(br, br, dp, false, &dsa_slave_switchdev_notifier); - if (err && err != -EOPNOTSUPP) - return err; - - err = br_vlan_replay(br, brport_dev, dp, false, - &dsa_slave_switchdev_blocking_notifier, extack); - if (err && err != -EOPNOTSUPP) - return err; - return 0; } @@ -268,21 +261,63 @@ static void dsa_port_switchdev_unsync_attrs(struct dsa_port *dp) /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer, * so allow it to be in BR_STATE_FORWARDING to be kept functional */ - dsa_port_set_state_now(dp, BR_STATE_FORWARDING); + dsa_port_set_state_now(dp, BR_STATE_FORWARDING, true); /* VLAN filtering is handled by dsa_switch_bridge_leave */ - /* Some drivers treat the notification for having a local multicast - * router by allowing multicast to be flooded to the CPU, so we should - * allow this in standalone mode too. - */ - dsa_port_mrouter(dp->cpu_dp, true, NULL); - /* Ageing time may be global to the switch chip, so don't change it * here because we have no good reason (or value) to change it to. */ } +static void dsa_port_bridge_tx_fwd_unoffload(struct dsa_port *dp, + struct net_device *bridge_dev) +{ + int bridge_num = dp->bridge_num; + struct dsa_switch *ds = dp->ds; + + /* No bridge TX forwarding offload => do nothing */ + if (!ds->ops->port_bridge_tx_fwd_unoffload || dp->bridge_num == -1) + return; + + dp->bridge_num = -1; + + dsa_bridge_num_put(bridge_dev, bridge_num); + + /* Notify the chips only once the offload has been deactivated, so + * that they can update their configuration accordingly. + */ + ds->ops->port_bridge_tx_fwd_unoffload(ds, dp->index, bridge_dev, + bridge_num); +} + +static bool dsa_port_bridge_tx_fwd_offload(struct dsa_port *dp, + struct net_device *bridge_dev) +{ + struct dsa_switch *ds = dp->ds; + int bridge_num, err; + + if (!ds->ops->port_bridge_tx_fwd_offload) + return false; + + bridge_num = dsa_bridge_num_get(bridge_dev, + ds->num_fwd_offloading_bridges); + if (bridge_num < 0) + return false; + + dp->bridge_num = bridge_num; + + /* Notify the driver */ + err = ds->ops->port_bridge_tx_fwd_offload(ds, dp->index, bridge_dev, + bridge_num); + if (err) { + dsa_port_bridge_tx_fwd_unoffload(dp, bridge_dev); + return false; + } + + return true; +} + int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, struct netlink_ext_ack *extack) { @@ -292,6 +327,9 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, .port = dp->index, .br = br, }; + struct net_device *dev = dp->slave; + struct net_device *brport_dev; + bool tx_fwd_offload; int err; /* Here the interface is already bridged. Reflect the current @@ -299,16 +337,31 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br, */ dp->bridge_dev = br; + brport_dev = dsa_port_to_bridge_port(dp); + err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_JOIN, &info); if (err) goto out_rollback; - err = dsa_port_switchdev_sync(dp, extack); + tx_fwd_offload = dsa_port_bridge_tx_fwd_offload(dp, br); + + err = switchdev_bridge_port_offload(brport_dev, dev, dp, + &dsa_slave_switchdev_notifier, + &dsa_slave_switchdev_blocking_notifier, + tx_fwd_offload, extack); if (err) goto out_rollback_unbridge; + err = dsa_port_switchdev_sync_attrs(dp, extack); + if (err) + goto out_rollback_unoffload; + return 0; +out_rollback_unoffload: + switchdev_bridge_port_unoffload(brport_dev, dp, + &dsa_slave_switchdev_notifier, + &dsa_slave_switchdev_blocking_notifier); out_rollback_unbridge: dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info); out_rollback: @@ -316,10 +369,19 @@ out_rollback: return err; } -int dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br, - struct netlink_ext_ack *extack) +void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br) { - return dsa_port_switchdev_unsync_objs(dp, br, extack); + struct net_device *brport_dev = dsa_port_to_bridge_port(dp); + + /* Don't try to unoffload something that is not offloaded */ + if (!brport_dev) + return; + + switchdev_bridge_port_unoffload(brport_dev, dp, + &dsa_slave_switchdev_notifier, + &dsa_slave_switchdev_blocking_notifier); + + dsa_flush_workqueue(); } void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) @@ -337,9 +399,13 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) */ dp->bridge_dev = NULL; + dsa_port_bridge_tx_fwd_unoffload(dp, br); + err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info); if (err) - pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); + dev_err(dp->ds->dev, + "port %d failed to notify DSA_NOTIFIER_BRIDGE_LEAVE: %pe\n", + dp->index, ERR_PTR(err)); dsa_port_switchdev_unsync_attrs(dp); } @@ -409,13 +475,10 @@ err_lag_join: return err; } -int dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag, - struct netlink_ext_ack *extack) +void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag) { if (dp->bridge_dev) - return dsa_port_pre_bridge_leave(dp, dp->bridge_dev, extack); - - return 0; + dsa_port_pre_bridge_leave(dp, dp->bridge_dev); } void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag) @@ -441,8 +504,9 @@ void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag) err = dsa_port_notify(dp, DSA_NOTIFIER_LAG_LEAVE, &info); if (err) - pr_err("DSA: failed to notify DSA_NOTIFIER_LAG_LEAVE: %d\n", - err); + dev_err(dp->ds->dev, + "port %d failed to notify DSA_NOTIFIER_LAG_LEAVE: %pe\n", + dp->index, ERR_PTR(err)); dsa_lag_unmap(dp->ds->dst, lag); } @@ -453,14 +517,15 @@ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, struct netlink_ext_ack *extack) { struct dsa_switch *ds = dp->ds; - int err, i; + struct dsa_port *other_dp; + int err; /* VLAN awareness was off, so the question is "can we turn it on". * We may have had 8021q uppers, those need to go. Make sure we don't * enter an inconsistent state: deny changing the VLAN awareness state * as long as we have 8021q uppers. */ - if (vlan_filtering && dsa_is_user_port(ds, dp->index)) { + if (vlan_filtering && dsa_port_is_user(dp)) { struct net_device *upper_dev, *slave = dp->slave; struct net_device *br = dp->bridge_dev; struct list_head *iter; @@ -495,10 +560,10 @@ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, * different ports of the same switch device and one of them has a * different setting than what is being requested. */ - for (i = 0; i < ds->num_ports; i++) { + dsa_switch_for_each_port(other_dp, ds) { struct net_device *other_bridge; - other_bridge = dsa_to_port(ds, i)->bridge_dev; + other_bridge = other_dp->bridge_dev; if (!other_bridge) continue; /* If it's the same bridge, it also has same @@ -518,6 +583,7 @@ static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp, int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, struct netlink_ext_ack *extack) { + bool old_vlan_filtering = dsa_port_is_vlan_filtering(dp); struct dsa_switch *ds = dp->ds; bool apply; int err; @@ -543,12 +609,45 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, if (err) return err; - if (ds->vlan_filtering_is_global) + if (ds->vlan_filtering_is_global) { + struct dsa_port *other_dp; + ds->vlan_filtering = vlan_filtering; - else + + dsa_switch_for_each_user_port(other_dp, ds) { + struct net_device *slave = dp->slave; + + /* We might be called in the unbind path, so not + * all slave devices might still be registered. + */ + if (!slave) + continue; + + err = dsa_slave_manage_vlan_filtering(slave, + vlan_filtering); + if (err) + goto restore; + } + } else { dp->vlan_filtering = vlan_filtering; + err = dsa_slave_manage_vlan_filtering(dp->slave, + vlan_filtering); + if (err) + goto restore; + } + return 0; + +restore: + ds->ops->port_vlan_filtering(ds, dp->index, old_vlan_filtering, NULL); + + if (ds->vlan_filtering_is_global) + ds->vlan_filtering = old_vlan_filtering; + else + dp->vlan_filtering = old_vlan_filtering; + + return err; } /* This enforces legacy behavior for switch drivers which assume they can't @@ -595,27 +694,35 @@ int dsa_port_pre_bridge_flags(const struct dsa_port *dp, return ds->ops->port_pre_bridge_flags(ds, dp->index, flags, extack); } -int dsa_port_bridge_flags(const struct dsa_port *dp, +int dsa_port_bridge_flags(struct dsa_port *dp, struct switchdev_brport_flags flags, struct netlink_ext_ack *extack) { struct dsa_switch *ds = dp->ds; + int err; if (!ds->ops->port_bridge_flags) return -EOPNOTSUPP; - return ds->ops->port_bridge_flags(ds, dp->index, flags, extack); -} + err = ds->ops->port_bridge_flags(ds, dp->index, flags, extack); + if (err) + return err; -int dsa_port_mrouter(struct dsa_port *dp, bool mrouter, - struct netlink_ext_ack *extack) -{ - struct dsa_switch *ds = dp->ds; + if (flags.mask & BR_LEARNING) { + bool learning = flags.val & BR_LEARNING; - if (!ds->ops->port_set_mrouter) - return -EOPNOTSUPP; + if (learning == dp->learning) + return 0; + + if ((dp->learning && !learning) && + (dp->stp_state == BR_STATE_LEARNING || + dp->stp_state == BR_STATE_FORWARDING)) + dsa_port_fast_age(dp); - return ds->ops->port_set_mrouter(ds, dp->index, mrouter, extack); + dp->learning = learning; + } + + return 0; } int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu, @@ -844,7 +951,6 @@ int dsa_port_mrp_del_ring_role(const struct dsa_port *dp, void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, const struct dsa_device_ops *tag_ops) { - cpu_dp->filter = tag_ops->filter; cpu_dp->rcv = tag_ops->rcv; cpu_dp->tag_ops = tag_ops; } @@ -934,7 +1040,7 @@ static void dsa_port_phylink_mac_link_down(struct phylink_config *config, struct phy_device *phydev = NULL; struct dsa_switch *ds = dp->ds; - if (dsa_is_user_port(ds, dp->index)) + if (dsa_port_is_user(dp)) phydev = dp->slave->phydev; if (!ds->ops->phylink_mac_link_down) { @@ -1062,6 +1168,10 @@ static int dsa_port_phylink_register(struct dsa_port *dp) dp->pl_config.type = PHYLINK_DEV; dp->pl_config.pcs_poll = ds->pcs_poll; + if (ds->ops->phylink_get_interfaces) + ds->ops->phylink_get_interfaces(ds, dp->index, + dp->pl_config.supported_interfaces); + dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode, &dsa_port_phylink_mac_ops); if (IS_ERR(dp->pl)) { @@ -1215,5 +1325,42 @@ void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr) err = dsa_port_notify(dp, DSA_NOTIFIER_HSR_LEAVE, &info); if (err) - pr_err("DSA: failed to notify DSA_NOTIFIER_HSR_LEAVE\n"); + dev_err(dp->ds->dev, + "port %d failed to notify DSA_NOTIFIER_HSR_LEAVE: %pe\n", + dp->index, ERR_PTR(err)); +} + +int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast) +{ + struct dsa_notifier_tag_8021q_vlan_info info = { + .tree_index = dp->ds->dst->index, + .sw_index = dp->ds->index, + .port = dp->index, + .vid = vid, + }; + + if (broadcast) + return dsa_broadcast(DSA_NOTIFIER_TAG_8021Q_VLAN_ADD, &info); + + return dsa_port_notify(dp, DSA_NOTIFIER_TAG_8021Q_VLAN_ADD, &info); +} + +void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast) +{ + struct dsa_notifier_tag_8021q_vlan_info info = { + .tree_index = dp->ds->dst->index, + .sw_index = dp->ds->index, + .port = dp->index, + .vid = vid, + }; + int err; + + if (broadcast) + err = dsa_broadcast(DSA_NOTIFIER_TAG_8021Q_VLAN_DEL, &info); + else + err = dsa_port_notify(dp, DSA_NOTIFIER_TAG_8021Q_VLAN_DEL, &info); + if (err) + dev_err(dp->ds->dev, + "port %d failed to notify tag_8021q VLAN %d deletion: %pe\n", + dp->index, vid, ERR_PTR(err)); } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 23be8e01026b..ad61f6bc8886 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -174,7 +174,7 @@ static int dsa_slave_set_mac_address(struct net_device *dev, void *a) dev_uc_del(master, dev->dev_addr); out: - ether_addr_copy(dev->dev_addr, addr->sa_data); + eth_hw_addr_set(dev, addr->sa_data); return 0; } @@ -286,7 +286,7 @@ static int dsa_slave_port_attr_set(struct net_device *dev, const void *ctx, if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev)) return -EOPNOTSUPP; - ret = dsa_port_set_state(dp, attr->u.stp_state); + ret = dsa_port_set_state(dp, attr->u.stp_state, true); break; case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING: if (!dsa_port_offloads_bridge(dp, attr->orig_dev)) @@ -314,12 +314,6 @@ static int dsa_slave_port_attr_set(struct net_device *dev, const void *ctx, ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack); break; - case SWITCHDEV_ATTR_ID_BRIDGE_MROUTER: - if (!dsa_port_offloads_bridge(dp, attr->orig_dev)) - return -EOPNOTSUPP; - - ret = dsa_port_mrouter(dp->cpu_dp, attr->u.mrouter, extack); - break; default: ret = -EOPNOTSUPP; break; @@ -795,6 +789,37 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset) return -EOPNOTSUPP; } +static void dsa_slave_get_eth_phy_stats(struct net_device *dev, + struct ethtool_eth_phy_stats *phy_stats) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->get_eth_phy_stats) + ds->ops->get_eth_phy_stats(ds, dp->index, phy_stats); +} + +static void dsa_slave_get_eth_mac_stats(struct net_device *dev, + struct ethtool_eth_mac_stats *mac_stats) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->get_eth_mac_stats) + ds->ops->get_eth_mac_stats(ds, dp->index, mac_stats); +} + +static void +dsa_slave_get_eth_ctrl_stats(struct net_device *dev, + struct ethtool_eth_ctrl_stats *ctrl_stats) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + + if (ds->ops->get_eth_ctrl_stats) + ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats); +} + static void dsa_slave_net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) { @@ -1415,6 +1440,76 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, return 0; } +static int dsa_slave_restore_vlan(struct net_device *vdev, int vid, void *arg) +{ + __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q); + + return dsa_slave_vlan_rx_add_vid(arg, proto, vid); +} + +static int dsa_slave_clear_vlan(struct net_device *vdev, int vid, void *arg) +{ + __be16 proto = vdev ? vlan_dev_vlan_proto(vdev) : htons(ETH_P_8021Q); + + return dsa_slave_vlan_rx_kill_vid(arg, proto, vid); +} + +/* Keep the VLAN RX filtering list in sync with the hardware only if VLAN + * filtering is enabled. The baseline is that only ports that offload a + * VLAN-aware bridge are VLAN-aware, and standalone ports are VLAN-unaware, + * but there are exceptions for quirky hardware. + * + * If ds->vlan_filtering_is_global = true, then standalone ports which share + * the same switch with other ports that offload a VLAN-aware bridge are also + * inevitably VLAN-aware. + * + * To summarize, a DSA switch port offloads: + * + * - If standalone (this includes software bridge, software LAG): + * - if ds->needs_standalone_vlan_filtering = true, OR if + * (ds->vlan_filtering_is_global = true AND there are bridges spanning + * this switch chip which have vlan_filtering=1) + * - the 8021q upper VLANs + * - else (standalone VLAN filtering is not needed, VLAN filtering is not + * global, or it is, but no port is under a VLAN-aware bridge): + * - no VLAN (any 8021q upper is a software VLAN) + * + * - If under a vlan_filtering=0 bridge which it offload: + * - if ds->configure_vlan_while_not_filtering = true (default): + * - the bridge VLANs. These VLANs are committed to hardware but inactive. + * - else (deprecated): + * - no VLAN. The bridge VLANs are not restored when VLAN awareness is + * enabled, so this behavior is broken and discouraged. + * + * - If under a vlan_filtering=1 bridge which it offload: + * - the bridge VLANs + * - the 8021q upper VLANs + */ +int dsa_slave_manage_vlan_filtering(struct net_device *slave, + bool vlan_filtering) +{ + int err; + + if (vlan_filtering) { + slave->features |= NETIF_F_HW_VLAN_CTAG_FILTER; + + err = vlan_for_each(slave, dsa_slave_restore_vlan, slave); + if (err) { + vlan_for_each(slave, dsa_slave_clear_vlan, slave); + slave->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; + return err; + } + } else { + err = vlan_for_each(slave, dsa_slave_clear_vlan, slave); + if (err) + return err; + + slave->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER; + } + + return 0; +} + struct dsa_hw_port { struct list_head list; struct net_device *dev; @@ -1631,6 +1726,9 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_strings = dsa_slave_get_strings, .get_ethtool_stats = dsa_slave_get_ethtool_stats, .get_sset_count = dsa_slave_get_sset_count, + .get_eth_phy_stats = dsa_slave_get_eth_phy_stats, + .get_eth_mac_stats = dsa_slave_get_eth_mac_stats, + .get_eth_ctrl_stats = dsa_slave_get_eth_ctrl_stats, .set_wol = dsa_slave_set_wol, .get_wol = dsa_slave_get_wol, .set_eee = dsa_slave_set_eee, @@ -1687,7 +1785,7 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_set_rx_mode = dsa_slave_set_rx_mode, .ndo_set_mac_address = dsa_slave_set_mac_address, .ndo_fdb_dump = dsa_slave_fdb_dump, - .ndo_do_ioctl = dsa_slave_ioctl, + .ndo_eth_ioctl = dsa_slave_ioctl, .ndo_get_iflink = dsa_slave_get_iflink, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_setup = dsa_slave_netpoll_setup, @@ -1773,6 +1871,10 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) dp->pl_config.poll_fixed_state = true; } + if (ds->ops->phylink_get_interfaces) + ds->ops->phylink_get_interfaces(ds, dp->index, + dp->pl_config.supported_interfaces); + dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode, &dsa_port_phylink_mac_ops); if (IS_ERR(dp->pl)) { @@ -1790,13 +1892,11 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) * use the switch internal MDIO bus instead */ ret = dsa_slave_phy_connect(slave_dev, dp->index, phy_flags); - if (ret) { - netdev_err(slave_dev, - "failed to connect to port %d: %d\n", - dp->index, ret); - phylink_destroy(dp->pl); - return ret; - } + } + if (ret) { + netdev_err(slave_dev, "failed to connect to PHY: %pe\n", + ERR_PTR(ret)); + phylink_destroy(dp->pl); } return ret; @@ -1822,12 +1922,12 @@ void dsa_slave_setup_tagger(struct net_device *slave) p->xmit = cpu_dp->tag_ops->xmit; slave->features = master->vlan_features | NETIF_F_HW_TC; - if (ds->ops->port_vlan_add && ds->ops->port_vlan_del) - slave->features |= NETIF_F_HW_VLAN_CTAG_FILTER; slave->hw_features |= NETIF_F_HW_TC; slave->features |= NETIF_F_LLTX; if (slave->needed_tailroom) slave->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST); + if (ds->needs_standalone_vlan_filtering) + slave->features |= NETIF_F_HW_VLAN_CTAG_FILTER; } static struct lock_class_key dsa_slave_netdev_xmit_lock_key; @@ -1892,7 +1992,7 @@ int dsa_slave_create(struct dsa_port *port) slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; if (!is_zero_ether_addr(port->mac)) - ether_addr_copy(slave_dev->dev_addr, port->mac); + eth_hw_addr_set(slave_dev, port->mac); else eth_hw_addr_inherit(slave_dev, master); slave_dev->priv_flags |= IFF_NO_QUEUE; @@ -2015,6 +2115,11 @@ static int dsa_slave_changeupper(struct net_device *dev, err = dsa_port_bridge_join(dp, info->upper_dev, extack); if (!err) dsa_bridge_mtu_normalization(dp); + if (err == -EOPNOTSUPP) { + NL_SET_ERR_MSG_MOD(extack, + "Offloading not supported"); + err = 0; + } err = notifier_from_errno(err); } else { dsa_port_bridge_leave(dp, info->upper_dev); @@ -2056,20 +2161,16 @@ static int dsa_slave_prechangeupper(struct net_device *dev, struct netdev_notifier_changeupper_info *info) { struct dsa_port *dp = dsa_slave_to_port(dev); - struct netlink_ext_ack *extack; - int err = 0; - - extack = netdev_notifier_info_to_extack(&info->info); if (netif_is_bridge_master(info->upper_dev) && !info->linking) - err = dsa_port_pre_bridge_leave(dp, info->upper_dev, extack); + dsa_port_pre_bridge_leave(dp, info->upper_dev); else if (netif_is_lag_master(info->upper_dev) && !info->linking) - err = dsa_port_pre_lag_leave(dp, info->upper_dev, extack); + dsa_port_pre_lag_leave(dp, info->upper_dev); /* dsa_port_pre_hsr_leave is not yet necessary since hsr cannot be * meaningfully enslaved to a bridge yet */ - return notifier_from_errno(err); + return NOTIFY_DONE; } static int @@ -2271,7 +2372,7 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, dst = cpu_dp->ds->dst; list_for_each_entry(dp, &dst->ports, list) { - if (!dsa_is_user_port(dp->ds, dp->index)) + if (!dsa_port_is_user(dp)) continue; list_add(&dp->slave->close_list, &close_list); @@ -2316,7 +2417,6 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) dp = dsa_to_port(ds, switchdev_work->port); - rtnl_lock(); switch (switchdev_work->event) { case SWITCHDEV_FDB_ADD_TO_DEVICE: if (switchdev_work->host_addr) @@ -2351,32 +2451,81 @@ static void dsa_slave_switchdev_event_work(struct work_struct *work) break; } - rtnl_unlock(); - dev_put(switchdev_work->dev); kfree(switchdev_work); } -static int dsa_lower_dev_walk(struct net_device *lower_dev, - struct netdev_nested_priv *priv) +static bool dsa_foreign_dev_check(const struct net_device *dev, + const struct net_device *foreign_dev) { - if (dsa_slave_dev_check(lower_dev)) { - priv->data = (void *)netdev_priv(lower_dev); - return 1; - } + const struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch_tree *dst = dp->ds->dst; - return 0; + if (netif_is_bridge_master(foreign_dev)) + return !dsa_tree_offloads_bridge(dst, foreign_dev); + + if (netif_is_bridge_port(foreign_dev)) + return !dsa_tree_offloads_bridge_port(dst, foreign_dev); + + /* Everything else is foreign */ + return true; } -static struct dsa_slave_priv *dsa_slave_dev_lower_find(struct net_device *dev) +static int dsa_slave_fdb_event(struct net_device *dev, + struct net_device *orig_dev, + unsigned long event, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info) { - struct netdev_nested_priv priv = { - .data = NULL, - }; + struct dsa_switchdev_event_work *switchdev_work; + struct dsa_port *dp = dsa_slave_to_port(dev); + bool host_addr = fdb_info->is_local; + struct dsa_switch *ds = dp->ds; - netdev_walk_all_lower_dev_rcu(dev, dsa_lower_dev_walk, &priv); + if (ctx && ctx != dp) + return 0; + + if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del) + return -EOPNOTSUPP; + + if (dsa_slave_dev_check(orig_dev) && + switchdev_fdb_is_dynamically_learned(fdb_info)) + return 0; - return (struct dsa_slave_priv *)priv.data; + /* FDB entries learned by the software bridge should be installed as + * host addresses only if the driver requests assisted learning. + */ + if (switchdev_fdb_is_dynamically_learned(fdb_info) && + !ds->assisted_learning_on_cpu_port) + return 0; + + /* Also treat FDB entries on foreign interfaces bridged with us as host + * addresses. + */ + if (dsa_foreign_dev_check(dev, orig_dev)) + host_addr = true; + + switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); + if (!switchdev_work) + return -ENOMEM; + + netdev_dbg(dev, "%s FDB entry towards %s, addr %pM vid %d%s\n", + event == SWITCHDEV_FDB_ADD_TO_DEVICE ? "Adding" : "Deleting", + orig_dev->name, fdb_info->addr, fdb_info->vid, + host_addr ? " as host address" : ""); + + INIT_WORK(&switchdev_work->work, dsa_slave_switchdev_event_work); + switchdev_work->ds = ds; + switchdev_work->port = dp->index; + switchdev_work->event = event; + switchdev_work->dev = dev; + + ether_addr_copy(switchdev_work->addr, fdb_info->addr); + switchdev_work->vid = fdb_info->vid; + switchdev_work->host_addr = host_addr; + + dsa_schedule_work(&switchdev_work->work); + + return 0; } /* Called under rcu_read_lock() */ @@ -2384,10 +2533,6 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); - const struct switchdev_notifier_fdb_info *fdb_info; - struct dsa_switchdev_event_work *switchdev_work; - bool host_addr = false; - struct dsa_port *dp; int err; switch (event) { @@ -2398,91 +2543,12 @@ static int dsa_slave_switchdev_event(struct notifier_block *unused, return notifier_from_errno(err); case SWITCHDEV_FDB_ADD_TO_DEVICE: case SWITCHDEV_FDB_DEL_TO_DEVICE: - fdb_info = ptr; - - if (dsa_slave_dev_check(dev)) { - dp = dsa_slave_to_port(dev); - - if (fdb_info->is_local) - host_addr = true; - else if (!fdb_info->added_by_user) - return NOTIFY_OK; - } else { - /* Snoop addresses added to foreign interfaces - * bridged with us, or the bridge - * itself. Dynamically learned addresses can - * also be added for switches that don't - * automatically learn SA from CPU-injected - * traffic. - */ - struct net_device *br_dev; - struct dsa_slave_priv *p; - - if (netif_is_bridge_master(dev)) - br_dev = dev; - else - br_dev = netdev_master_upper_dev_get_rcu(dev); - - if (!br_dev) - return NOTIFY_DONE; - - if (!netif_is_bridge_master(br_dev)) - return NOTIFY_DONE; - - p = dsa_slave_dev_lower_find(br_dev); - if (!p) - return NOTIFY_DONE; - - dp = p->dp; - host_addr = fdb_info->is_local; - - /* FDB entries learned by the software bridge should - * be installed as host addresses only if the driver - * requests assisted learning. - * On the other hand, FDB entries for local termination - * should always be installed. - */ - if (!fdb_info->added_by_user && !fdb_info->is_local && - !dp->ds->assisted_learning_on_cpu_port) - return NOTIFY_DONE; - - /* When the bridge learns an address on an offloaded - * LAG we don't want to send traffic to the CPU, the - * other ports bridged with the LAG should be able to - * autonomously forward towards it. - * On the other hand, if the address is local - * (therefore not learned) then we want to trap it to - * the CPU regardless of whether the interface it - * belongs to is offloaded or not. - */ - if (dsa_tree_offloads_bridge_port(dp->ds->dst, dev) && - !fdb_info->is_local) - return NOTIFY_DONE; - } - - if (!dp->ds->ops->port_fdb_add || !dp->ds->ops->port_fdb_del) - return NOTIFY_DONE; - - switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC); - if (!switchdev_work) - return NOTIFY_BAD; - - INIT_WORK(&switchdev_work->work, - dsa_slave_switchdev_event_work); - switchdev_work->ds = dp->ds; - switchdev_work->port = dp->index; - switchdev_work->event = event; - switchdev_work->dev = dev; - - ether_addr_copy(switchdev_work->addr, - fdb_info->addr); - switchdev_work->vid = fdb_info->vid; - switchdev_work->host_addr = host_addr; - - /* Hold a reference for dsa_fdb_offload_notify */ - dev_hold(dev); - dsa_schedule_work(&switchdev_work->work); - break; + err = switchdev_handle_fdb_event_to_device(dev, event, ptr, + dsa_slave_dev_check, + dsa_foreign_dev_check, + dsa_slave_fdb_event, + NULL); + return notifier_from_errno(err); default: return NOTIFY_DONE; } diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 5ece05dfd8f2..bb155a16d454 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -17,14 +17,11 @@ static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds, unsigned int ageing_time) { - int i; - - for (i = 0; i < ds->num_ports; ++i) { - struct dsa_port *dp = dsa_to_port(ds, i); + struct dsa_port *dp; + dsa_switch_for_each_port(dp, ds) if (dp->ageing_time && dp->ageing_time < ageing_time) ageing_time = dp->ageing_time; - } return ageing_time; } @@ -49,10 +46,10 @@ static int dsa_switch_ageing_time(struct dsa_switch *ds, return 0; } -static bool dsa_switch_mtu_match(struct dsa_switch *ds, int port, - struct dsa_notifier_mtu_info *info) +static bool dsa_port_mtu_match(struct dsa_port *dp, + struct dsa_notifier_mtu_info *info) { - if (ds->index == info->sw_index && port == info->port) + if (dp->ds->index == info->sw_index && dp->index == info->port) return true; /* Do not propagate to other switches in the tree if the notifier was @@ -61,7 +58,7 @@ static bool dsa_switch_mtu_match(struct dsa_switch *ds, int port, if (info->targeted_match) return false; - if (dsa_is_dsa_port(ds, port) || dsa_is_cpu_port(ds, port)) + if (dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp)) return true; return false; @@ -70,14 +67,16 @@ static bool dsa_switch_mtu_match(struct dsa_switch *ds, int port, static int dsa_switch_mtu(struct dsa_switch *ds, struct dsa_notifier_mtu_info *info) { - int port, ret; + struct dsa_port *dp; + int ret; if (!ds->ops->port_change_mtu) return -EOPNOTSUPP; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_mtu_match(ds, port, info)) { - ret = ds->ops->port_change_mtu(ds, port, info->mtu); + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_mtu_match(dp, info)) { + ret = ds->ops->port_change_mtu(ds, dp->index, + info->mtu); if (ret) return ret; } @@ -90,27 +89,38 @@ static int dsa_switch_bridge_join(struct dsa_switch *ds, struct dsa_notifier_bridge_info *info) { struct dsa_switch_tree *dst = ds->dst; + int err; - if (dst->index == info->tree_index && ds->index == info->sw_index && - ds->ops->port_bridge_join) - return ds->ops->port_bridge_join(ds, info->port, info->br); + if (dst->index == info->tree_index && ds->index == info->sw_index) { + if (!ds->ops->port_bridge_join) + return -EOPNOTSUPP; + + err = ds->ops->port_bridge_join(ds, info->port, info->br); + if (err) + return err; + } if ((dst->index != info->tree_index || ds->index != info->sw_index) && - ds->ops->crosschip_bridge_join) - return ds->ops->crosschip_bridge_join(ds, info->tree_index, - info->sw_index, - info->port, info->br); + ds->ops->crosschip_bridge_join) { + err = ds->ops->crosschip_bridge_join(ds, info->tree_index, + info->sw_index, + info->port, info->br); + if (err) + return err; + } - return 0; + return dsa_tag_8021q_bridge_join(ds, info); } static int dsa_switch_bridge_leave(struct dsa_switch *ds, struct dsa_notifier_bridge_info *info) { - bool unset_vlan_filtering = br_vlan_enabled(info->br); struct dsa_switch_tree *dst = ds->dst; struct netlink_ext_ack extack = {0}; - int err, port; + bool change_vlan_filtering = false; + bool vlan_filtering; + struct dsa_port *dp; + int err; if (dst->index == info->tree_index && ds->index == info->sw_index && ds->ops->port_bridge_leave) @@ -122,6 +132,15 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, info->sw_index, info->port, info->br); + if (ds->needs_standalone_vlan_filtering && !br_vlan_enabled(info->br)) { + change_vlan_filtering = true; + vlan_filtering = true; + } else if (!ds->needs_standalone_vlan_filtering && + br_vlan_enabled(info->br)) { + change_vlan_filtering = true; + vlan_filtering = false; + } + /* If the bridge was vlan_filtering, the bridge core doesn't trigger an * event for changing vlan_filtering setting upon slave ports leaving * it. That is a good thing, because that lets us handle it and also @@ -130,47 +149,49 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, * vlan_filtering callback is only when the last port leaves the last * VLAN-aware bridge. */ - if (unset_vlan_filtering && ds->vlan_filtering_is_global) { - for (port = 0; port < ds->num_ports; port++) { + if (change_vlan_filtering && ds->vlan_filtering_is_global) { + dsa_switch_for_each_port(dp, ds) { struct net_device *bridge_dev; - bridge_dev = dsa_to_port(ds, port)->bridge_dev; + bridge_dev = dp->bridge_dev; if (bridge_dev && br_vlan_enabled(bridge_dev)) { - unset_vlan_filtering = false; + change_vlan_filtering = false; break; } } } - if (unset_vlan_filtering) { + + if (change_vlan_filtering) { err = dsa_port_vlan_filtering(dsa_to_port(ds, info->port), - false, &extack); + vlan_filtering, &extack); if (extack._msg) dev_err(ds->dev, "port %d: %s\n", info->port, extack._msg); - if (err && err != EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) return err; } - return 0; + + return dsa_tag_8021q_bridge_leave(ds, info); } /* Matches for all upstream-facing ports (the CPU port and all upstream-facing * DSA links) that sit between the targeted port on which the notifier was * emitted and its dedicated CPU port. */ -static bool dsa_switch_host_address_match(struct dsa_switch *ds, int port, - int info_sw_index, int info_port) +static bool dsa_port_host_address_match(struct dsa_port *dp, + int info_sw_index, int info_port) { struct dsa_port *targeted_dp, *cpu_dp; struct dsa_switch *targeted_ds; - targeted_ds = dsa_switch_find(ds->dst->index, info_sw_index); + targeted_ds = dsa_switch_find(dp->ds->dst->index, info_sw_index); targeted_dp = dsa_to_port(targeted_ds, info_port); cpu_dp = targeted_dp->cpu_dp; - if (dsa_switch_is_upstream_of(ds, targeted_ds)) - return port == dsa_towards_port(ds, cpu_dp->ds->index, - cpu_dp->index); + if (dsa_switch_is_upstream_of(dp->ds, targeted_ds)) + return dp->index == dsa_towards_port(dp->ds, cpu_dp->ds->index, + cpu_dp->index); return false; } @@ -188,31 +209,36 @@ static struct dsa_mac_addr *dsa_mac_addr_find(struct list_head *addr_list, return NULL; } -static int dsa_switch_do_mdb_add(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb) +static int dsa_port_do_mdb_add(struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb) { - struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a; - int err; + int port = dp->index; + int err = 0; /* No need to bother with refcounting for user ports */ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) return ds->ops->port_mdb_add(ds, port, mdb); + mutex_lock(&dp->addr_lists_lock); + a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid); if (a) { refcount_inc(&a->refcount); - return 0; + goto out; } a = kzalloc(sizeof(*a), GFP_KERNEL); - if (!a) - return -ENOMEM; + if (!a) { + err = -ENOMEM; + goto out; + } err = ds->ops->port_mdb_add(ds, port, mdb); if (err) { kfree(a); - return err; + goto out; } ether_addr_copy(a->addr, mdb->addr); @@ -220,64 +246,80 @@ static int dsa_switch_do_mdb_add(struct dsa_switch *ds, int port, refcount_set(&a->refcount, 1); list_add_tail(&a->list, &dp->mdbs); - return 0; +out: + mutex_unlock(&dp->addr_lists_lock); + + return err; } -static int dsa_switch_do_mdb_del(struct dsa_switch *ds, int port, - const struct switchdev_obj_port_mdb *mdb) +static int dsa_port_do_mdb_del(struct dsa_port *dp, + const struct switchdev_obj_port_mdb *mdb) { - struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a; - int err; + int port = dp->index; + int err = 0; /* No need to bother with refcounting for user ports */ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) return ds->ops->port_mdb_del(ds, port, mdb); + mutex_lock(&dp->addr_lists_lock); + a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid); - if (!a) - return -ENOENT; + if (!a) { + err = -ENOENT; + goto out; + } if (!refcount_dec_and_test(&a->refcount)) - return 0; + goto out; err = ds->ops->port_mdb_del(ds, port, mdb); if (err) { - refcount_inc(&a->refcount); - return err; + refcount_set(&a->refcount, 1); + goto out; } list_del(&a->list); kfree(a); - return 0; +out: + mutex_unlock(&dp->addr_lists_lock); + + return err; } -static int dsa_switch_do_fdb_add(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid) +static int dsa_port_do_fdb_add(struct dsa_port *dp, const unsigned char *addr, + u16 vid) { - struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a; - int err; + int port = dp->index; + int err = 0; /* No need to bother with refcounting for user ports */ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) return ds->ops->port_fdb_add(ds, port, addr, vid); + mutex_lock(&dp->addr_lists_lock); + a = dsa_mac_addr_find(&dp->fdbs, addr, vid); if (a) { refcount_inc(&a->refcount); - return 0; + goto out; } a = kzalloc(sizeof(*a), GFP_KERNEL); - if (!a) - return -ENOMEM; + if (!a) { + err = -ENOMEM; + goto out; + } err = ds->ops->port_fdb_add(ds, port, addr, vid); if (err) { kfree(a); - return err; + goto out; } ether_addr_copy(a->addr, addr); @@ -285,53 +327,63 @@ static int dsa_switch_do_fdb_add(struct dsa_switch *ds, int port, refcount_set(&a->refcount, 1); list_add_tail(&a->list, &dp->fdbs); - return 0; +out: + mutex_unlock(&dp->addr_lists_lock); + + return err; } -static int dsa_switch_do_fdb_del(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid) +static int dsa_port_do_fdb_del(struct dsa_port *dp, const unsigned char *addr, + u16 vid) { - struct dsa_port *dp = dsa_to_port(ds, port); + struct dsa_switch *ds = dp->ds; struct dsa_mac_addr *a; - int err; + int port = dp->index; + int err = 0; /* No need to bother with refcounting for user ports */ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) return ds->ops->port_fdb_del(ds, port, addr, vid); + mutex_lock(&dp->addr_lists_lock); + a = dsa_mac_addr_find(&dp->fdbs, addr, vid); - if (!a) - return -ENOENT; + if (!a) { + err = -ENOENT; + goto out; + } if (!refcount_dec_and_test(&a->refcount)) - return 0; + goto out; err = ds->ops->port_fdb_del(ds, port, addr, vid); if (err) { - refcount_inc(&a->refcount); - return err; + refcount_set(&a->refcount, 1); + goto out; } list_del(&a->list); kfree(a); - return 0; +out: + mutex_unlock(&dp->addr_lists_lock); + + return err; } static int dsa_switch_host_fdb_add(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { + struct dsa_port *dp; int err = 0; - int port; if (!ds->ops->port_fdb_add) return -EOPNOTSUPP; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_host_address_match(ds, port, info->sw_index, - info->port)) { - err = dsa_switch_do_fdb_add(ds, port, info->addr, - info->vid); + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_host_address_match(dp, info->sw_index, + info->port)) { + err = dsa_port_do_fdb_add(dp, info->addr, info->vid); if (err) break; } @@ -343,17 +395,16 @@ static int dsa_switch_host_fdb_add(struct dsa_switch *ds, static int dsa_switch_host_fdb_del(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { + struct dsa_port *dp; int err = 0; - int port; if (!ds->ops->port_fdb_del) return -EOPNOTSUPP; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_host_address_match(ds, port, info->sw_index, - info->port)) { - err = dsa_switch_do_fdb_del(ds, port, info->addr, - info->vid); + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_host_address_match(dp, info->sw_index, + info->port)) { + err = dsa_port_do_fdb_del(dp, info->addr, info->vid); if (err) break; } @@ -366,22 +417,24 @@ static int dsa_switch_fdb_add(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { int port = dsa_towards_port(ds, info->sw_index, info->port); + struct dsa_port *dp = dsa_to_port(ds, port); if (!ds->ops->port_fdb_add) return -EOPNOTSUPP; - return dsa_switch_do_fdb_add(ds, port, info->addr, info->vid); + return dsa_port_do_fdb_add(dp, info->addr, info->vid); } static int dsa_switch_fdb_del(struct dsa_switch *ds, struct dsa_notifier_fdb_info *info) { int port = dsa_towards_port(ds, info->sw_index, info->port); + struct dsa_port *dp = dsa_to_port(ds, port); if (!ds->ops->port_fdb_del) return -EOPNOTSUPP; - return dsa_switch_do_fdb_del(ds, port, info->addr, info->vid); + return dsa_port_do_fdb_del(dp, info->addr, info->vid); } static int dsa_switch_hsr_join(struct dsa_switch *ds, @@ -447,37 +500,39 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds, struct dsa_notifier_mdb_info *info) { int port = dsa_towards_port(ds, info->sw_index, info->port); + struct dsa_port *dp = dsa_to_port(ds, port); if (!ds->ops->port_mdb_add) return -EOPNOTSUPP; - return dsa_switch_do_mdb_add(ds, port, info->mdb); + return dsa_port_do_mdb_add(dp, info->mdb); } static int dsa_switch_mdb_del(struct dsa_switch *ds, struct dsa_notifier_mdb_info *info) { int port = dsa_towards_port(ds, info->sw_index, info->port); + struct dsa_port *dp = dsa_to_port(ds, port); if (!ds->ops->port_mdb_del) return -EOPNOTSUPP; - return dsa_switch_do_mdb_del(ds, port, info->mdb); + return dsa_port_do_mdb_del(dp, info->mdb); } static int dsa_switch_host_mdb_add(struct dsa_switch *ds, struct dsa_notifier_mdb_info *info) { + struct dsa_port *dp; int err = 0; - int port; if (!ds->ops->port_mdb_add) return -EOPNOTSUPP; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_host_address_match(ds, port, info->sw_index, - info->port)) { - err = dsa_switch_do_mdb_add(ds, port, info->mdb); + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_host_address_match(dp, info->sw_index, + info->port)) { + err = dsa_port_do_mdb_add(dp, info->mdb); if (err) break; } @@ -489,16 +544,16 @@ static int dsa_switch_host_mdb_add(struct dsa_switch *ds, static int dsa_switch_host_mdb_del(struct dsa_switch *ds, struct dsa_notifier_mdb_info *info) { + struct dsa_port *dp; int err = 0; - int port; if (!ds->ops->port_mdb_del) return -EOPNOTSUPP; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_host_address_match(ds, port, info->sw_index, - info->port)) { - err = dsa_switch_do_mdb_del(ds, port, info->mdb); + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_host_address_match(dp, info->sw_index, + info->port)) { + err = dsa_port_do_mdb_del(dp, info->mdb); if (err) break; } @@ -507,13 +562,13 @@ static int dsa_switch_host_mdb_del(struct dsa_switch *ds, return err; } -static bool dsa_switch_vlan_match(struct dsa_switch *ds, int port, - struct dsa_notifier_vlan_info *info) +static bool dsa_port_vlan_match(struct dsa_port *dp, + struct dsa_notifier_vlan_info *info) { - if (ds->index == info->sw_index && port == info->port) + if (dp->ds->index == info->sw_index && dp->index == info->port) return true; - if (dsa_is_dsa_port(ds, port)) + if (dsa_port_is_dsa(dp)) return true; return false; @@ -522,14 +577,15 @@ static bool dsa_switch_vlan_match(struct dsa_switch *ds, int port, static int dsa_switch_vlan_add(struct dsa_switch *ds, struct dsa_notifier_vlan_info *info) { - int port, err; + struct dsa_port *dp; + int err; if (!ds->ops->port_vlan_add) return -EOPNOTSUPP; - for (port = 0; port < ds->num_ports; port++) { - if (dsa_switch_vlan_match(ds, port, info)) { - err = ds->ops->port_vlan_add(ds, port, info->vlan, + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_vlan_match(dp, info)) { + err = ds->ops->port_vlan_add(ds, dp->index, info->vlan, info->extack); if (err) return err; @@ -558,38 +614,34 @@ static int dsa_switch_change_tag_proto(struct dsa_switch *ds, struct dsa_notifier_tag_proto_info *info) { const struct dsa_device_ops *tag_ops = info->tag_ops; - int port, err; + struct dsa_port *dp, *cpu_dp; + int err; if (!ds->ops->change_tag_protocol) return -EOPNOTSUPP; ASSERT_RTNL(); - for (port = 0; port < ds->num_ports; port++) { - if (!dsa_is_cpu_port(ds, port)) - continue; - - err = ds->ops->change_tag_protocol(ds, port, tag_ops->proto); + dsa_switch_for_each_cpu_port(cpu_dp, ds) { + err = ds->ops->change_tag_protocol(ds, cpu_dp->index, + tag_ops->proto); if (err) return err; - dsa_port_set_tag_protocol(dsa_to_port(ds, port), tag_ops); + dsa_port_set_tag_protocol(cpu_dp, tag_ops); } /* Now that changing the tag protocol can no longer fail, let's update * the remaining bits which are "duplicated for faster access", and the * bits that depend on the tagger, such as the MTU. */ - for (port = 0; port < ds->num_ports; port++) { - if (dsa_is_user_port(ds, port)) { - struct net_device *slave; + dsa_switch_for_each_user_port(dp, ds) { + struct net_device *slave = dp->slave; - slave = dsa_to_port(ds, port)->slave; - dsa_slave_setup_tagger(slave); + dsa_slave_setup_tagger(slave); - /* rtnl_mutex is held in dsa_tree_change_tag_proto */ - dsa_slave_change_mtu(slave, slave->mtu); - } + /* rtnl_mutex is held in dsa_tree_change_tag_proto */ + dsa_slave_change_mtu(slave, slave->mtu); } return 0; @@ -726,6 +778,12 @@ static int dsa_switch_event(struct notifier_block *nb, case DSA_NOTIFIER_MRP_DEL_RING_ROLE: err = dsa_switch_mrp_del_ring_role(ds, info); break; + case DSA_NOTIFIER_TAG_8021Q_VLAN_ADD: + err = dsa_switch_tag_8021q_vlan_add(ds, info); + break; + case DSA_NOTIFIER_TAG_8021Q_VLAN_DEL: + err = dsa_switch_tag_8021q_vlan_del(ds, info); + break; default: err = -EOPNOTSUPP; break; diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 4aa29f90ecea..72cac2c0af7b 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -6,7 +6,6 @@ * dsa_8021q_netdev_ops is registered for API compliance and not used * directly by callers. */ -#include <linux/if_bridge.h> #include <linux/if_vlan.h> #include <linux/dsa/8021q.h> @@ -17,7 +16,7 @@ * * | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | * +-----------+-----+-----------------+-----------+-----------------------+ - * | DIR | SVL | SWITCH_ID | SUBVLAN | PORT | + * | DIR | VBID| SWITCH_ID | VBID | PORT | * +-----------+-----+-----------------+-----------+-----------------------+ * * DIR - VID[11:10]: @@ -27,24 +26,14 @@ * These values make the special VIDs of 0, 1 and 4095 to be left * unused by this coding scheme. * - * SVL/SUBVLAN - { VID[9], VID[5:4] }: - * Sub-VLAN encoding. Valid only when DIR indicates an RX VLAN. - * * 0 (0b000): Field does not encode a sub-VLAN, either because - * received traffic is untagged, PVID-tagged or because a second - * VLAN tag is present after this tag and not inside of it. - * * 1 (0b001): Received traffic is tagged with a VID value private - * to the host. This field encodes the index in the host's lookup - * table through which the value of the ingress VLAN ID can be - * recovered. - * * 2 (0b010): Field encodes a sub-VLAN. - * ... - * * 7 (0b111): Field encodes a sub-VLAN. - * When DIR indicates a TX VLAN, SUBVLAN must be transmitted as zero - * (by the host) and ignored on receive (by the switch). - * * SWITCH_ID - VID[8:6]: * Index of switch within DSA tree. Must be between 0 and 7. * + * VBID - { VID[9], VID[5:4] }: + * Virtual bridge ID. If between 1 and 7, packet targets the broadcast + * domain of a bridge. If transmitted as zero, packet targets a single + * port. Field only valid on transmit, must be ignored on receive. + * * PORT - VID[3:0]: * Index of switch port. Must be between 0 and 15. */ @@ -61,49 +50,49 @@ #define DSA_8021Q_SWITCH_ID(x) (((x) << DSA_8021Q_SWITCH_ID_SHIFT) & \ DSA_8021Q_SWITCH_ID_MASK) -#define DSA_8021Q_SUBVLAN_HI_SHIFT 9 -#define DSA_8021Q_SUBVLAN_HI_MASK GENMASK(9, 9) -#define DSA_8021Q_SUBVLAN_LO_SHIFT 4 -#define DSA_8021Q_SUBVLAN_LO_MASK GENMASK(5, 4) -#define DSA_8021Q_SUBVLAN_HI(x) (((x) & GENMASK(2, 2)) >> 2) -#define DSA_8021Q_SUBVLAN_LO(x) ((x) & GENMASK(1, 0)) -#define DSA_8021Q_SUBVLAN(x) \ - (((DSA_8021Q_SUBVLAN_LO(x) << DSA_8021Q_SUBVLAN_LO_SHIFT) & \ - DSA_8021Q_SUBVLAN_LO_MASK) | \ - ((DSA_8021Q_SUBVLAN_HI(x) << DSA_8021Q_SUBVLAN_HI_SHIFT) & \ - DSA_8021Q_SUBVLAN_HI_MASK)) +#define DSA_8021Q_VBID_HI_SHIFT 9 +#define DSA_8021Q_VBID_HI_MASK GENMASK(9, 9) +#define DSA_8021Q_VBID_LO_SHIFT 4 +#define DSA_8021Q_VBID_LO_MASK GENMASK(5, 4) +#define DSA_8021Q_VBID_HI(x) (((x) & GENMASK(2, 2)) >> 2) +#define DSA_8021Q_VBID_LO(x) ((x) & GENMASK(1, 0)) +#define DSA_8021Q_VBID(x) \ + (((DSA_8021Q_VBID_LO(x) << DSA_8021Q_VBID_LO_SHIFT) & \ + DSA_8021Q_VBID_LO_MASK) | \ + ((DSA_8021Q_VBID_HI(x) << DSA_8021Q_VBID_HI_SHIFT) & \ + DSA_8021Q_VBID_HI_MASK)) #define DSA_8021Q_PORT_SHIFT 0 #define DSA_8021Q_PORT_MASK GENMASK(3, 0) #define DSA_8021Q_PORT(x) (((x) << DSA_8021Q_PORT_SHIFT) & \ DSA_8021Q_PORT_MASK) +u16 dsa_8021q_bridge_tx_fwd_offload_vid(int bridge_num) +{ + /* The VBID value of 0 is reserved for precise TX */ + return DSA_8021Q_DIR_TX | DSA_8021Q_VBID(bridge_num + 1); +} +EXPORT_SYMBOL_GPL(dsa_8021q_bridge_tx_fwd_offload_vid); + /* Returns the VID to be inserted into the frame from xmit for switch steering * instructions on egress. Encodes switch ID and port ID. */ -u16 dsa_8021q_tx_vid(struct dsa_switch *ds, int port) +u16 dsa_tag_8021q_tx_vid(const struct dsa_port *dp) { - return DSA_8021Q_DIR_TX | DSA_8021Q_SWITCH_ID(ds->index) | - DSA_8021Q_PORT(port); + return DSA_8021Q_DIR_TX | DSA_8021Q_SWITCH_ID(dp->ds->index) | + DSA_8021Q_PORT(dp->index); } -EXPORT_SYMBOL_GPL(dsa_8021q_tx_vid); +EXPORT_SYMBOL_GPL(dsa_tag_8021q_tx_vid); /* Returns the VID that will be installed as pvid for this switch port, sent as * tagged egress towards the CPU port and decoded by the rcv function. */ -u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port) +u16 dsa_tag_8021q_rx_vid(const struct dsa_port *dp) { - return DSA_8021Q_DIR_RX | DSA_8021Q_SWITCH_ID(ds->index) | - DSA_8021Q_PORT(port); + return DSA_8021Q_DIR_RX | DSA_8021Q_SWITCH_ID(dp->ds->index) | + DSA_8021Q_PORT(dp->index); } -EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid); - -u16 dsa_8021q_rx_vid_subvlan(struct dsa_switch *ds, int port, u16 subvlan) -{ - return DSA_8021Q_DIR_RX | DSA_8021Q_SWITCH_ID(ds->index) | - DSA_8021Q_PORT(port) | DSA_8021Q_SUBVLAN(subvlan); -} -EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid_subvlan); +EXPORT_SYMBOL_GPL(dsa_tag_8021q_rx_vid); /* Returns the decoded switch ID from the RX VID. */ int dsa_8021q_rx_switch_id(u16 vid) @@ -119,20 +108,6 @@ int dsa_8021q_rx_source_port(u16 vid) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port); -/* Returns the decoded subvlan from the RX VID. */ -u16 dsa_8021q_rx_subvlan(u16 vid) -{ - u16 svl_hi, svl_lo; - - svl_hi = (vid & DSA_8021Q_SUBVLAN_HI_MASK) >> - DSA_8021Q_SUBVLAN_HI_SHIFT; - svl_lo = (vid & DSA_8021Q_SUBVLAN_LO_MASK) >> - DSA_8021Q_SUBVLAN_LO_SHIFT; - - return (svl_hi << 2) | svl_lo; -} -EXPORT_SYMBOL_GPL(dsa_8021q_rx_subvlan); - bool vid_is_dsa_8021q_rxvlan(u16 vid) { return (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX; @@ -151,21 +126,155 @@ bool vid_is_dsa_8021q(u16 vid) } EXPORT_SYMBOL_GPL(vid_is_dsa_8021q); -/* If @enabled is true, installs @vid with @flags into the switch port's HW - * filter. - * If @enabled is false, deletes @vid (ignores @flags) from the port. Had the - * user explicitly configured this @vid through the bridge core, then the @vid - * is installed again, but this time with the flags from the bridge layer. - */ -static int dsa_8021q_vid_apply(struct dsa_8021q_context *ctx, int port, u16 vid, - u16 flags, bool enabled) +static struct dsa_tag_8021q_vlan * +dsa_tag_8021q_vlan_find(struct dsa_8021q_context *ctx, int port, u16 vid) { - struct dsa_port *dp = dsa_to_port(ctx->ds, port); + struct dsa_tag_8021q_vlan *v; - if (enabled) - return ctx->ops->vlan_add(ctx->ds, dp->index, vid, flags); + list_for_each_entry(v, &ctx->vlans, list) + if (v->vid == vid && v->port == port) + return v; - return ctx->ops->vlan_del(ctx->ds, dp->index, vid); + return NULL; +} + +static int dsa_port_do_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, + u16 flags) +{ + struct dsa_8021q_context *ctx = dp->ds->tag_8021q_ctx; + struct dsa_switch *ds = dp->ds; + struct dsa_tag_8021q_vlan *v; + int port = dp->index; + int err; + + /* No need to bother with refcounting for user ports */ + if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) + return ds->ops->tag_8021q_vlan_add(ds, port, vid, flags); + + v = dsa_tag_8021q_vlan_find(ctx, port, vid); + if (v) { + refcount_inc(&v->refcount); + return 0; + } + + v = kzalloc(sizeof(*v), GFP_KERNEL); + if (!v) + return -ENOMEM; + + err = ds->ops->tag_8021q_vlan_add(ds, port, vid, flags); + if (err) { + kfree(v); + return err; + } + + v->vid = vid; + v->port = port; + refcount_set(&v->refcount, 1); + list_add_tail(&v->list, &ctx->vlans); + + return 0; +} + +static int dsa_port_do_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid) +{ + struct dsa_8021q_context *ctx = dp->ds->tag_8021q_ctx; + struct dsa_switch *ds = dp->ds; + struct dsa_tag_8021q_vlan *v; + int port = dp->index; + int err; + + /* No need to bother with refcounting for user ports */ + if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) + return ds->ops->tag_8021q_vlan_del(ds, port, vid); + + v = dsa_tag_8021q_vlan_find(ctx, port, vid); + if (!v) + return -ENOENT; + + if (!refcount_dec_and_test(&v->refcount)) + return 0; + + err = ds->ops->tag_8021q_vlan_del(ds, port, vid); + if (err) { + refcount_inc(&v->refcount); + return err; + } + + list_del(&v->list); + kfree(v); + + return 0; +} + +static bool +dsa_port_tag_8021q_vlan_match(struct dsa_port *dp, + struct dsa_notifier_tag_8021q_vlan_info *info) +{ + struct dsa_switch *ds = dp->ds; + + if (dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp)) + return true; + + if (ds->dst->index == info->tree_index && ds->index == info->sw_index) + return dp->index == info->port; + + return false; +} + +int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info) +{ + struct dsa_port *dp; + int err; + + /* Since we use dsa_broadcast(), there might be other switches in other + * trees which don't support tag_8021q, so don't return an error. + * Or they might even support tag_8021q but have not registered yet to + * use it (maybe they use another tagger currently). + */ + if (!ds->ops->tag_8021q_vlan_add || !ds->tag_8021q_ctx) + return 0; + + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_tag_8021q_vlan_match(dp, info)) { + u16 flags = 0; + + if (dsa_port_is_user(dp)) + flags |= BRIDGE_VLAN_INFO_UNTAGGED; + + if (vid_is_dsa_8021q_rxvlan(info->vid) && + dsa_8021q_rx_switch_id(info->vid) == ds->index && + dsa_8021q_rx_source_port(info->vid) == dp->index) + flags |= BRIDGE_VLAN_INFO_PVID; + + err = dsa_port_do_tag_8021q_vlan_add(dp, info->vid, + flags); + if (err) + return err; + } + } + + return 0; +} + +int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info) +{ + struct dsa_port *dp; + int err; + + if (!ds->ops->tag_8021q_vlan_del || !ds->tag_8021q_ctx) + return 0; + + dsa_switch_for_each_port(dp, ds) { + if (dsa_port_tag_8021q_vlan_match(dp, info)) { + err = dsa_port_do_tag_8021q_vlan_del(dp, info->vid); + if (err) + return err; + } + } + + return 0; } /* RX VLAN tagging (left) and TX VLAN tagging (right) setup shown for a single @@ -181,12 +290,6 @@ static int dsa_8021q_vid_apply(struct dsa_8021q_context *ctx, int port, u16 vid, * force all switched traffic to pass through the CPU. So we must also make * the other front-panel ports members of this VID we're adding, albeit * we're not making it their PVID (they'll still have their own). - * By the way - just because we're installing the same VID in multiple - * switch ports doesn't mean that they'll start to talk to one another, even - * while not bridged: the final forwarding decision is still an AND between - * the L2 forwarding information (which is limiting forwarding in this case) - * and the VLAN-based restrictions (of which there are none in this case, - * since all ports are members). * - On TX (ingress from CPU and towards network) we are faced with a problem. * If we were to tag traffic (from within DSA) with the port's pvid, all * would be well, assuming the switch ports were standalone. Frames would @@ -200,9 +303,10 @@ static int dsa_8021q_vid_apply(struct dsa_8021q_context *ctx, int port, u16 vid, * a member of the VID we're tagging the traffic with - the desired one. * * So at the end, each front-panel port will have one RX VID (also the PVID), - * the RX VID of all other front-panel ports, and one TX VID. Whereas the CPU - * port will have the RX and TX VIDs of all front-panel ports, and on top of - * that, is also tagged-input and tagged-output (VLAN trunk). + * the RX VID of all other front-panel ports that are in the same bridge, and + * one TX VID. Whereas the CPU port will have the RX and TX VIDs of all + * front-panel ports, and on top of that, is also tagged-input and + * tagged-output (VLAN trunk). * * CPU port CPU port * +-------------+-----+-------------+ +-------------+-----+-------------+ @@ -220,246 +324,244 @@ static int dsa_8021q_vid_apply(struct dsa_8021q_context *ctx, int port, u16 vid, * +-+-----+-+-----+-+-----+-+-----+-+ +-+-----+-+-----+-+-----+-+-----+-+ * swp0 swp1 swp2 swp3 swp0 swp1 swp2 swp3 */ -static int dsa_8021q_setup_port(struct dsa_8021q_context *ctx, int port, - bool enabled) +static bool +dsa_port_tag_8021q_bridge_match(struct dsa_port *dp, + struct dsa_notifier_bridge_info *info) +{ + /* Don't match on self */ + if (dp->ds->dst->index == info->tree_index && + dp->ds->index == info->sw_index && + dp->index == info->port) + return false; + + if (dsa_port_is_user(dp)) + return dp->bridge_dev == info->br; + + return false; +} + +int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, + struct dsa_notifier_bridge_info *info) { - int upstream = dsa_upstream_port(ctx->ds, port); - u16 rx_vid = dsa_8021q_rx_vid(ctx->ds, port); - u16 tx_vid = dsa_8021q_tx_vid(ctx->ds, port); + struct dsa_switch *targeted_ds; + struct dsa_port *targeted_dp; + struct dsa_port *dp; + u16 targeted_rx_vid; + int err; + + if (!ds->tag_8021q_ctx) + return 0; + + targeted_ds = dsa_switch_find(info->tree_index, info->sw_index); + targeted_dp = dsa_to_port(targeted_ds, info->port); + targeted_rx_vid = dsa_tag_8021q_rx_vid(targeted_dp); + + dsa_switch_for_each_port(dp, ds) { + u16 rx_vid = dsa_tag_8021q_rx_vid(dp); + + if (!dsa_port_tag_8021q_bridge_match(dp, info)) + continue; + + /* Install the RX VID of the targeted port in our VLAN table */ + err = dsa_port_tag_8021q_vlan_add(dp, targeted_rx_vid, true); + if (err) + return err; + + /* Install our RX VID into the targeted port's VLAN table */ + err = dsa_port_tag_8021q_vlan_add(targeted_dp, rx_vid, true); + if (err) + return err; + } + + return 0; +} + +int dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, + struct dsa_notifier_bridge_info *info) +{ + struct dsa_switch *targeted_ds; + struct dsa_port *targeted_dp; + struct dsa_port *dp; + u16 targeted_rx_vid; + + if (!ds->tag_8021q_ctx) + return 0; + + targeted_ds = dsa_switch_find(info->tree_index, info->sw_index); + targeted_dp = dsa_to_port(targeted_ds, info->port); + targeted_rx_vid = dsa_tag_8021q_rx_vid(targeted_dp); + + dsa_switch_for_each_port(dp, ds) { + u16 rx_vid = dsa_tag_8021q_rx_vid(dp); + + if (!dsa_port_tag_8021q_bridge_match(dp, info)) + continue; + + /* Remove the RX VID of the targeted port from our VLAN table */ + dsa_port_tag_8021q_vlan_del(dp, targeted_rx_vid, true); + + /* Remove our RX VID from the targeted port's VLAN table */ + dsa_port_tag_8021q_vlan_del(targeted_dp, rx_vid, true); + } + + return 0; +} + +int dsa_tag_8021q_bridge_tx_fwd_offload(struct dsa_switch *ds, int port, + struct net_device *br, + int bridge_num) +{ + u16 tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge_num); + + return dsa_port_tag_8021q_vlan_add(dsa_to_port(ds, port), tx_vid, + true); +} +EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_tx_fwd_offload); + +void dsa_tag_8021q_bridge_tx_fwd_unoffload(struct dsa_switch *ds, int port, + struct net_device *br, + int bridge_num) +{ + u16 tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(bridge_num); + + dsa_port_tag_8021q_vlan_del(dsa_to_port(ds, port), tx_vid, true); +} +EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_tx_fwd_unoffload); + +/* Set up a port's tag_8021q RX and TX VLAN for standalone mode operation */ +static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port) +{ + struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; + struct dsa_port *dp = dsa_to_port(ds, port); + u16 rx_vid = dsa_tag_8021q_rx_vid(dp); + u16 tx_vid = dsa_tag_8021q_tx_vid(dp); struct net_device *master; - int i, err, subvlan; + int err; /* The CPU port is implicitly configured by * configuring the front-panel ports */ - if (!dsa_is_user_port(ctx->ds, port)) + if (!dsa_port_is_user(dp)) return 0; - master = dsa_to_port(ctx->ds, port)->cpu_dp->master; + master = dp->cpu_dp->master; /* Add this user port's RX VID to the membership list of all others * (including itself). This is so that bridging will not be hindered. * L2 forwarding rules still take precedence when there are no VLAN * restrictions, so there are no concerns about leaking traffic. */ - for (i = 0; i < ctx->ds->num_ports; i++) { - u16 flags; - - if (i == upstream) - continue; - else if (i == port) - /* The RX VID is pvid on this port */ - flags = BRIDGE_VLAN_INFO_UNTAGGED | - BRIDGE_VLAN_INFO_PVID; - else - /* The RX VID is a regular VLAN on all others */ - flags = BRIDGE_VLAN_INFO_UNTAGGED; - - err = dsa_8021q_vid_apply(ctx, i, rx_vid, flags, enabled); - if (err) { - dev_err(ctx->ds->dev, - "Failed to apply RX VID %d to port %d: %d\n", - rx_vid, port, err); - return err; - } - } - - /* CPU port needs to see this port's RX VID - * as tagged egress. - */ - err = dsa_8021q_vid_apply(ctx, upstream, rx_vid, 0, enabled); + err = dsa_port_tag_8021q_vlan_add(dp, rx_vid, false); if (err) { - dev_err(ctx->ds->dev, - "Failed to apply RX VID %d to port %d: %d\n", - rx_vid, port, err); + dev_err(ds->dev, + "Failed to apply RX VID %d to port %d: %pe\n", + rx_vid, port, ERR_PTR(err)); return err; } - /* Add to the master's RX filter not only @rx_vid, but in fact - * the entire subvlan range, just in case this DSA switch might - * want to use sub-VLANs. - */ - for (subvlan = 0; subvlan < DSA_8021Q_N_SUBVLAN; subvlan++) { - u16 vid = dsa_8021q_rx_vid_subvlan(ctx->ds, port, subvlan); - - if (enabled) - vlan_vid_add(master, ctx->proto, vid); - else - vlan_vid_del(master, ctx->proto, vid); - } + /* Add @rx_vid to the master's RX filter. */ + vlan_vid_add(master, ctx->proto, rx_vid); /* Finally apply the TX VID on this port and on the CPU port */ - err = dsa_8021q_vid_apply(ctx, port, tx_vid, BRIDGE_VLAN_INFO_UNTAGGED, - enabled); - if (err) { - dev_err(ctx->ds->dev, - "Failed to apply TX VID %d on port %d: %d\n", - tx_vid, port, err); - return err; - } - err = dsa_8021q_vid_apply(ctx, upstream, tx_vid, 0, enabled); + err = dsa_port_tag_8021q_vlan_add(dp, tx_vid, false); if (err) { - dev_err(ctx->ds->dev, - "Failed to apply TX VID %d on port %d: %d\n", - tx_vid, upstream, err); + dev_err(ds->dev, + "Failed to apply TX VID %d on port %d: %pe\n", + tx_vid, port, ERR_PTR(err)); return err; } return err; } -int dsa_8021q_setup(struct dsa_8021q_context *ctx, bool enabled) +static void dsa_tag_8021q_port_teardown(struct dsa_switch *ds, int port) { - int rc, port; + struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; + struct dsa_port *dp = dsa_to_port(ds, port); + u16 rx_vid = dsa_tag_8021q_rx_vid(dp); + u16 tx_vid = dsa_tag_8021q_tx_vid(dp); + struct net_device *master; - ASSERT_RTNL(); + /* The CPU port is implicitly configured by + * configuring the front-panel ports + */ + if (!dsa_port_is_user(dp)) + return; - for (port = 0; port < ctx->ds->num_ports; port++) { - rc = dsa_8021q_setup_port(ctx, port, enabled); - if (rc < 0) { - dev_err(ctx->ds->dev, - "Failed to setup VLAN tagging for port %d: %d\n", - port, rc); - return rc; - } - } + master = dp->cpu_dp->master; - return 0; -} -EXPORT_SYMBOL_GPL(dsa_8021q_setup); + dsa_port_tag_8021q_vlan_del(dp, rx_vid, false); -static int dsa_8021q_crosschip_link_apply(struct dsa_8021q_context *ctx, - int port, - struct dsa_8021q_context *other_ctx, - int other_port, bool enabled) -{ - u16 rx_vid = dsa_8021q_rx_vid(ctx->ds, port); + vlan_vid_del(master, ctx->proto, rx_vid); - /* @rx_vid of local @ds port @port goes to @other_port of - * @other_ds - */ - return dsa_8021q_vid_apply(other_ctx, other_port, rx_vid, - BRIDGE_VLAN_INFO_UNTAGGED, enabled); + dsa_port_tag_8021q_vlan_del(dp, tx_vid, false); } -static int dsa_8021q_crosschip_link_add(struct dsa_8021q_context *ctx, int port, - struct dsa_8021q_context *other_ctx, - int other_port) +static int dsa_tag_8021q_setup(struct dsa_switch *ds) { - struct dsa_8021q_crosschip_link *c; + int err, port; + + ASSERT_RTNL(); - list_for_each_entry(c, &ctx->crosschip_links, list) { - if (c->port == port && c->other_ctx == other_ctx && - c->other_port == other_port) { - refcount_inc(&c->refcount); - return 0; + for (port = 0; port < ds->num_ports; port++) { + err = dsa_tag_8021q_port_setup(ds, port); + if (err < 0) { + dev_err(ds->dev, + "Failed to setup VLAN tagging for port %d: %pe\n", + port, ERR_PTR(err)); + return err; } } - dev_dbg(ctx->ds->dev, - "adding crosschip link from port %d to %s port %d\n", - port, dev_name(other_ctx->ds->dev), other_port); - - c = kzalloc(sizeof(*c), GFP_KERNEL); - if (!c) - return -ENOMEM; - - c->port = port; - c->other_ctx = other_ctx; - c->other_port = other_port; - refcount_set(&c->refcount, 1); - - list_add(&c->list, &ctx->crosschip_links); - return 0; } -static void dsa_8021q_crosschip_link_del(struct dsa_8021q_context *ctx, - struct dsa_8021q_crosschip_link *c, - bool *keep) +static void dsa_tag_8021q_teardown(struct dsa_switch *ds) { - *keep = !refcount_dec_and_test(&c->refcount); + int port; - if (*keep) - return; - - dev_dbg(ctx->ds->dev, - "deleting crosschip link from port %d to %s port %d\n", - c->port, dev_name(c->other_ctx->ds->dev), c->other_port); + ASSERT_RTNL(); - list_del(&c->list); - kfree(c); + for (port = 0; port < ds->num_ports; port++) + dsa_tag_8021q_port_teardown(ds, port); } -/* Make traffic from local port @port be received by remote port @other_port. - * This means that our @rx_vid needs to be installed on @other_ds's upstream - * and user ports. The user ports should be egress-untagged so that they can - * pop the dsa_8021q VLAN. But the @other_upstream can be either egress-tagged - * or untagged: it doesn't matter, since it should never egress a frame having - * our @rx_vid. - */ -int dsa_8021q_crosschip_bridge_join(struct dsa_8021q_context *ctx, int port, - struct dsa_8021q_context *other_ctx, - int other_port) +int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto) { - /* @other_upstream is how @other_ds reaches us. If we are part - * of disjoint trees, then we are probably connected through - * our CPU ports. If we're part of the same tree though, we should - * probably use dsa_towards_port. - */ - int other_upstream = dsa_upstream_port(other_ctx->ds, other_port); - int rc; + struct dsa_8021q_context *ctx; - rc = dsa_8021q_crosschip_link_add(ctx, port, other_ctx, other_port); - if (rc) - return rc; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; - rc = dsa_8021q_crosschip_link_apply(ctx, port, other_ctx, - other_port, true); - if (rc) - return rc; + ctx->proto = proto; + ctx->ds = ds; - rc = dsa_8021q_crosschip_link_add(ctx, port, other_ctx, other_upstream); - if (rc) - return rc; + INIT_LIST_HEAD(&ctx->vlans); - return dsa_8021q_crosschip_link_apply(ctx, port, other_ctx, - other_upstream, true); + ds->tag_8021q_ctx = ctx; + + return dsa_tag_8021q_setup(ds); } -EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_join); +EXPORT_SYMBOL_GPL(dsa_tag_8021q_register); -int dsa_8021q_crosschip_bridge_leave(struct dsa_8021q_context *ctx, int port, - struct dsa_8021q_context *other_ctx, - int other_port) +void dsa_tag_8021q_unregister(struct dsa_switch *ds) { - int other_upstream = dsa_upstream_port(other_ctx->ds, other_port); - struct dsa_8021q_crosschip_link *c, *n; - - list_for_each_entry_safe(c, n, &ctx->crosschip_links, list) { - if (c->port == port && c->other_ctx == other_ctx && - (c->other_port == other_port || - c->other_port == other_upstream)) { - struct dsa_8021q_context *other_ctx = c->other_ctx; - int other_port = c->other_port; - bool keep; - int rc; - - dsa_8021q_crosschip_link_del(ctx, c, &keep); - if (keep) - continue; - - rc = dsa_8021q_crosschip_link_apply(ctx, port, - other_ctx, - other_port, - false); - if (rc) - return rc; - } + struct dsa_8021q_context *ctx = ds->tag_8021q_ctx; + struct dsa_tag_8021q_vlan *v, *n; + + dsa_tag_8021q_teardown(ds); + + list_for_each_entry_safe(v, n, &ctx->vlans, list) { + list_del(&v->list); + kfree(v); } - return 0; + ds->tag_8021q_ctx = NULL; + + kfree(ctx); } -EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_leave); +EXPORT_SYMBOL_GPL(dsa_tag_8021q_unregister); struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci) @@ -471,8 +573,7 @@ struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, } EXPORT_SYMBOL_GPL(dsa_8021q_xmit); -void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, - int *subvlan) +void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id) { u16 vid, tci; @@ -489,9 +590,6 @@ void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, *source_port = dsa_8021q_rx_source_port(vid); *switch_id = dsa_8021q_rx_switch_id(vid); - *subvlan = dsa_8021q_rx_subvlan(vid); skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } EXPORT_SYMBOL_GPL(dsa_8021q_rcv); - -MODULE_LICENSE("GPL v2"); diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c index 0efae1a372b3..8a02ac44282f 100644 --- a/net/dsa/tag_ar9331.c +++ b/net/dsa/tag_ar9331.c @@ -44,8 +44,7 @@ static struct sk_buff *ar9331_tag_xmit(struct sk_buff *skb, } static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb, - struct net_device *ndev, - struct packet_type *pt) + struct net_device *ndev) { u8 ver, port; u16 hdr; diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 0750af951fc9..96dbb8ee2fee 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -99,7 +99,7 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb, skb_push(skb, BRCM_TAG_LEN); if (offset) - memmove(skb->data, skb->data + BRCM_TAG_LEN, offset); + dsa_alloc_etype_header(skb, BRCM_TAG_LEN); brcm_tag = skb->data + offset; @@ -136,7 +136,6 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb, */ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, unsigned int offset) { int source_port; @@ -167,7 +166,7 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, BRCM_TAG_LEN); - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); return skb; } @@ -182,20 +181,16 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, } -static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *nskb; /* skb->data points to the EtherType, the tag is right before it */ - nskb = brcm_tag_rcv_ll(skb, dev, pt, 2); + nskb = brcm_tag_rcv_ll(skb, dev, 2); if (!nskb) return nskb; - /* Move the Ethernet DA and SA */ - memmove(nskb->data - ETH_HLEN, - nskb->data - ETH_HLEN - BRCM_TAG_LEN, - 2 * ETH_ALEN); + dsa_strip_etype_header(skb, BRCM_TAG_LEN); return nskb; } @@ -233,7 +228,7 @@ static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb, skb_push(skb, BRCM_LEG_TAG_LEN); - memmove(skb->data, skb->data + BRCM_LEG_TAG_LEN, 2 * ETH_ALEN); + dsa_alloc_etype_header(skb, BRCM_LEG_TAG_LEN); brcm_tag = skb->data + 2 * ETH_ALEN; @@ -251,8 +246,7 @@ static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb, } static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt) + struct net_device *dev) { int source_port; u8 *brcm_tag; @@ -260,7 +254,7 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, BRCM_LEG_PORT_ID))) return NULL; - brcm_tag = skb->data - 2; + brcm_tag = dsa_etype_header_pos_rx(skb); source_port = brcm_tag[5] & BRCM_LEG_PORT_ID; @@ -271,12 +265,9 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, BRCM_LEG_TAG_LEN); - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); - /* Move the Ethernet DA and SA */ - memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - BRCM_LEG_TAG_LEN, - 2 * ETH_ALEN); + dsa_strip_etype_header(skb, BRCM_LEG_TAG_LEN); return skb; } @@ -302,11 +293,10 @@ static struct sk_buff *brcm_tag_xmit_prepend(struct sk_buff *skb, } static struct sk_buff *brcm_tag_rcv_prepend(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt) + struct net_device *dev) { /* tag is prepended to the packet */ - return brcm_tag_rcv_ll(skb, dev, pt, ETH_HLEN); + return brcm_tag_rcv_ll(skb, dev, ETH_HLEN); } static const struct dsa_device_ops brcm_prepend_netdev_ops = { diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index a822355afc90..b3da4b2ea11c 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -45,6 +45,7 @@ * 6 6 2 2 4 2 N */ +#include <linux/dsa/mv88e6xxx.h> #include <linux/etherdevice.h> #include <linux/list.h> #include <linux/slab.h> @@ -126,18 +127,37 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, u8 extra) { struct dsa_port *dp = dsa_slave_to_port(dev); + u8 tag_dev, tag_port; + enum dsa_cmd cmd; u8 *dsa_header; + if (skb->offload_fwd_mark) { + struct dsa_switch_tree *dst = dp->ds->dst; + + cmd = DSA_CMD_FORWARD; + + /* When offloading forwarding for a bridge, inject FORWARD + * packets on behalf of a virtual switch device with an index + * past the physical switches. + */ + tag_dev = dst->last_switch + 1 + dp->bridge_num; + tag_port = 0; + } else { + cmd = DSA_CMD_FROM_CPU; + tag_dev = dp->ds->index; + tag_port = dp->index; + } + if (skb->protocol == htons(ETH_P_8021Q)) { if (extra) { skb_push(skb, extra); - memmove(skb->data, skb->data + extra, 2 * ETH_ALEN); + dsa_alloc_etype_header(skb, extra); } - /* Construct tagged FROM_CPU DSA tag from 802.1Q tag. */ - dsa_header = skb->data + 2 * ETH_ALEN + extra; - dsa_header[0] = (DSA_CMD_FROM_CPU << 6) | 0x20 | dp->ds->index; - dsa_header[1] = dp->index << 3; + /* Construct tagged DSA tag from 802.1Q tag. */ + dsa_header = dsa_etype_header_pos_tx(skb) + extra; + dsa_header[0] = (cmd << 6) | 0x20 | tag_dev; + dsa_header[1] = tag_port << 3; /* Move CFI field from byte 2 to byte 1. */ if (dsa_header[2] & 0x10) { @@ -145,15 +165,21 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, dsa_header[2] &= ~0x10; } } else { + struct net_device *br = dp->bridge_dev; + u16 vid; + + vid = br ? MV88E6XXX_VID_BRIDGED : MV88E6XXX_VID_STANDALONE; + skb_push(skb, DSA_HLEN + extra); - memmove(skb->data, skb->data + DSA_HLEN + extra, 2 * ETH_ALEN); - - /* Construct untagged FROM_CPU DSA tag. */ - dsa_header = skb->data + 2 * ETH_ALEN + extra; - dsa_header[0] = (DSA_CMD_FROM_CPU << 6) | dp->ds->index; - dsa_header[1] = dp->index << 3; - dsa_header[2] = 0x00; - dsa_header[3] = 0x00; + dsa_alloc_etype_header(skb, DSA_HLEN + extra); + + /* Construct DSA header from untagged frame. */ + dsa_header = dsa_etype_header_pos_tx(skb) + extra; + + dsa_header[0] = (cmd << 6) | tag_dev; + dsa_header[1] = tag_port << 3; + dsa_header[2] = vid >> 8; + dsa_header[3] = vid & 0xff; } return skb; @@ -162,21 +188,19 @@ static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev, static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, u8 extra) { + bool trap = false, trunk = false; int source_device, source_port; - bool trunk = false; enum dsa_code code; enum dsa_cmd cmd; u8 *dsa_header; /* The ethertype field is part of the DSA header. */ - dsa_header = skb->data - 2; + dsa_header = dsa_etype_header_pos_rx(skb); cmd = dsa_header[0] >> 6; switch (cmd) { case DSA_CMD_FORWARD: - skb->offload_fwd_mark = 1; - - trunk = !!(dsa_header[1] & 7); + trunk = !!(dsa_header[1] & 4); break; case DSA_CMD_TO_CPU: @@ -194,7 +218,6 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, * device (like a bridge) that forwarding has * already been done by hardware. */ - skb->offload_fwd_mark = 1; break; case DSA_CODE_MGMT_TRAP: case DSA_CODE_IGMP_MLD_TRAP: @@ -202,6 +225,7 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, /* Traps have, by definition, not been * forwarded by hardware, so don't mark them. */ + trap = true; break; default: /* Reserved code, this could be anything. Drop @@ -235,6 +259,15 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, if (!skb->dev) return NULL; + /* When using LAG offload, skb->dev is not a DSA slave interface, + * so we cannot call dsa_default_offload_fwd_mark and we need to + * special-case it. + */ + if (trunk) + skb->offload_fwd_mark = true; + else if (!trap) + dsa_default_offload_fwd_mark(skb); + /* If the 'tagged' bit is set; convert the DSA tag to a 802.1Q * tag, and delete the ethertype (extra) if applicable. If the * 'tagged' bit is cleared; delete the DSA tag, and ethertype @@ -269,14 +302,10 @@ static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev, memcpy(dsa_header, new_header, DSA_HLEN); if (extra) - memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - extra, - 2 * ETH_ALEN); + dsa_strip_etype_header(skb, extra); } else { skb_pull_rcsum(skb, DSA_HLEN); - memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - DSA_HLEN - extra, - 2 * ETH_ALEN); + dsa_strip_etype_header(skb, DSA_HLEN + extra); } return skb; @@ -289,8 +318,7 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) return dsa_xmit_ll(skb, dev, 0); } -static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev) { if (unlikely(!pskb_may_pull(skb, DSA_HLEN))) return NULL; @@ -322,7 +350,7 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) if (!skb) return NULL; - edsa_header = skb->data + 2 * ETH_ALEN; + edsa_header = dsa_etype_header_pos_tx(skb); edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff; edsa_header[1] = ETH_P_EDSA & 0xff; edsa_header[2] = 0x00; @@ -330,8 +358,7 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) return skb; } -static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev) { if (unlikely(!pskb_may_pull(skb, EDSA_HLEN))) return NULL; diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c index 5985dab06ab8..df7140984da3 100644 --- a/net/dsa/tag_gswip.c +++ b/net/dsa/tag_gswip.c @@ -75,8 +75,7 @@ static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb, } static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt) + struct net_device *dev) { int port; u8 *gswip_tag; diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c index 424130f85f59..f64b805303cd 100644 --- a/net/dsa/tag_hellcreek.c +++ b/net/dsa/tag_hellcreek.c @@ -29,8 +29,7 @@ static struct sk_buff *hellcreek_xmit(struct sk_buff *skb, } static struct sk_buff *hellcreek_rcv(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt) + struct net_device *dev) { /* Tag decoding */ u8 *tag = skb_tail_pointer(skb) - HELLCREEK_TAG_LEN; @@ -44,7 +43,7 @@ static struct sk_buff *hellcreek_rcv(struct sk_buff *skb, pskb_trim_rcsum(skb, skb->len - HELLCREEK_TAG_LEN); - skb->offload_fwd_mark = true; + dsa_default_offload_fwd_mark(skb); return skb; } diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index a201ccf2435d..3509fc967ca9 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -6,7 +6,6 @@ #include <linux/etherdevice.h> #include <linux/list.h> -#include <linux/slab.h> #include <net/dsa.h> #include "dsa_priv.h" @@ -24,7 +23,7 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, pskb_trim_rcsum(skb, skb->len - len); - skb->offload_fwd_mark = true; + dsa_default_offload_fwd_mark(skb); return skb; } @@ -67,8 +66,7 @@ static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev) return skb; } -static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev) { u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; @@ -134,8 +132,7 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, return skb; } -static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev) { /* Tag decoding */ u8 *tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN; diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c index 26207ef39ebc..cb548188f813 100644 --- a/net/dsa/tag_lan9303.c +++ b/net/dsa/tag_lan9303.c @@ -62,9 +62,10 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) skb_push(skb, LAN9303_TAG_LEN); /* make room between MACs and Ether-Type */ - memmove(skb->data, skb->data + LAN9303_TAG_LEN, 2 * ETH_ALEN); + dsa_alloc_etype_header(skb, LAN9303_TAG_LEN); + + lan9303_tag = dsa_etype_header_pos_tx(skb); - lan9303_tag = (__be16 *)(skb->data + 2 * ETH_ALEN); tag = lan9303_xmit_use_arl(dp, skb->data) ? LAN9303_TAG_TX_USE_ALR : dp->index | LAN9303_TAG_TX_STP_OVERRIDE; @@ -74,8 +75,7 @@ static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) return skb; } -static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev) { __be16 *lan9303_tag; u16 lan9303_tag1; @@ -87,13 +87,7 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, return NULL; } - /* '->data' points into the middle of our special VLAN tag information: - * - * ~ MAC src | 0x81 | 0x00 | 0xyy | 0xzz | ether type - * ^ - * ->data - */ - lan9303_tag = (__be16 *)(skb->data - 2); + lan9303_tag = dsa_etype_header_pos_rx(skb); if (lan9303_tag[0] != htons(ETH_P_8021Q)) { dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid VLAN marker\n"); @@ -113,9 +107,11 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, * and the current ethertype field. */ skb_pull_rcsum(skb, 2 + 2); - memmove(skb->data - ETH_HLEN, skb->data - (ETH_HLEN + LAN9303_TAG_LEN), - 2 * ETH_ALEN); - skb->offload_fwd_mark = !(lan9303_tag1 & LAN9303_TAG_RX_TRAPPED_TO_CPU); + + dsa_strip_etype_header(skb, LAN9303_TAG_LEN); + + if (!(lan9303_tag1 & LAN9303_TAG_RX_TRAPPED_TO_CPU)) + dsa_default_offload_fwd_mark(skb); return skb; } diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index cc3ba864ad5b..415d8ece242a 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -41,10 +41,10 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, default: xmit_tpid = MTK_HDR_XMIT_UNTAGGED; skb_push(skb, MTK_HDR_LEN); - memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN); + dsa_alloc_etype_header(skb, MTK_HDR_LEN); } - mtk_tag = skb->data + 2 * ETH_ALEN; + mtk_tag = dsa_etype_header_pos_tx(skb); /* Mark tag attribute on special tag insertion to notify hardware * whether that's a combined special tag with 802.1Q header. @@ -61,8 +61,7 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, return skb; } -static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev) { u16 hdr; int port; @@ -71,19 +70,13 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely(!pskb_may_pull(skb, MTK_HDR_LEN))) return NULL; - /* The MTK header is added by the switch between src addr - * and ethertype at this point, skb->data points to 2 bytes - * after src addr so header should be 2 bytes right before. - */ - phdr = (__be16 *)(skb->data - 2); + phdr = dsa_etype_header_pos_rx(skb); hdr = ntohs(*phdr); /* Remove MTK tag and recalculate checksum. */ skb_pull_rcsum(skb, MTK_HDR_LEN); - memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - MTK_HDR_LEN, - 2 * ETH_ALEN); + dsa_strip_etype_header(skb, MTK_HDR_LEN); /* Get source port information */ port = (hdr & MTK_HDR_RECV_SOURCE_PORT_MASK); @@ -92,7 +85,7 @@ static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb->dev) return NULL; - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); return skb; } diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 190f4bfd3bef..cd60b94fc175 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -1,19 +1,55 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright 2019 NXP Semiconductors +/* Copyright 2019 NXP */ #include <linux/dsa/ocelot.h> -#include <soc/mscc/ocelot.h> #include "dsa_priv.h" +/* If the port is under a VLAN-aware bridge, remove the VLAN header from the + * payload and move it into the DSA tag, which will make the switch classify + * the packet to the bridge VLAN. Otherwise, leave the classified VLAN at zero, + * which is the pvid of standalone and VLAN-unaware bridge ports. + */ +static void ocelot_xmit_get_vlan_info(struct sk_buff *skb, struct dsa_port *dp, + u64 *vlan_tci, u64 *tag_type) +{ + struct net_device *br = READ_ONCE(dp->bridge_dev); + struct vlan_ethhdr *hdr; + u16 proto, tci; + + if (!br || !br_vlan_enabled(br)) { + *vlan_tci = 0; + *tag_type = IFH_TAG_TYPE_C; + return; + } + + hdr = (struct vlan_ethhdr *)skb_mac_header(skb); + br_vlan_get_proto(br, &proto); + + if (ntohs(hdr->h_vlan_proto) == proto) { + __skb_vlan_pop(skb, &tci); + *vlan_tci = tci; + } else { + rcu_read_lock(); + br_vlan_get_pvid_rcu(br, &tci); + rcu_read_unlock(); + *vlan_tci = tci; + } + + *tag_type = (proto != ETH_P_8021Q) ? IFH_TAG_TYPE_S : IFH_TAG_TYPE_C; +} + static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, __be32 ifh_prefix, void **ifh) { struct dsa_port *dp = dsa_slave_to_port(netdev); struct dsa_switch *ds = dp->ds; + u64 vlan_tci, tag_type; void *injection; __be32 *prefix; u32 rew_op = 0; + ocelot_xmit_get_vlan_info(skb, dp, &vlan_tci, &tag_type); + injection = skb_push(skb, OCELOT_TAG_LEN); prefix = skb_push(skb, OCELOT_SHORT_PREFIX_LEN); @@ -22,6 +58,8 @@ static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, ocelot_ifh_set_bypass(injection, 1); ocelot_ifh_set_src(injection, ds->num_ports); ocelot_ifh_set_qos_class(injection, skb->priority); + ocelot_ifh_set_vlan_tci(injection, vlan_tci); + ocelot_ifh_set_tag_type(injection, tag_type); rew_op = ocelot_ptp_rew_op(skb); if (rew_op) @@ -55,8 +93,7 @@ static struct sk_buff *seville_xmit(struct sk_buff *skb, } static struct sk_buff *ocelot_rcv(struct sk_buff *skb, - struct net_device *netdev, - struct packet_type *pt) + struct net_device *netdev) { u64 src_port, qos_class; u64 vlan_tci, tag_type; @@ -104,7 +141,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, */ return NULL; - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); skb->priority = qos_class; /* Ocelot switches copy frames unmodified to the CPU. However, it is diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 85ac85c3af8c..a1919ea5e828 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright 2020-2021 NXP Semiconductors +/* Copyright 2020-2021 NXP * * An implementation of the software-defined tag_8021q.c tagger format, which * also preserves full functionality under a vlan_filtering bridge. It does @@ -9,47 +9,60 @@ * that on egress */ #include <linux/dsa/8021q.h> -#include <soc/mscc/ocelot.h> -#include <soc/mscc/ocelot_ptp.h> +#include <linux/dsa/ocelot.h> #include "dsa_priv.h" +static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp, + struct sk_buff *skb) +{ + struct felix_deferred_xmit_work *xmit_work; + struct felix_port *felix_port = dp->priv; + + xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC); + if (!xmit_work) + return NULL; + + /* Calls felix_port_deferred_xmit in felix.c */ + kthread_init_work(&xmit_work->work, felix_port->xmit_work_fn); + /* Increase refcount so the kfree_skb in dsa_slave_xmit + * won't really free the packet. + */ + xmit_work->dp = dp; + xmit_work->skb = skb_get(skb); + + kthread_queue_work(felix_port->xmit_worker, &xmit_work->work); + + return NULL; +} + static struct sk_buff *ocelot_xmit(struct sk_buff *skb, struct net_device *netdev) { struct dsa_port *dp = dsa_slave_to_port(netdev); - u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); - struct ocelot *ocelot = dp->ds->priv; - int port = dp->index; - u32 rew_op = 0; + u16 tx_vid = dsa_tag_8021q_tx_vid(dp); + struct ethhdr *hdr = eth_hdr(skb); - rew_op = ocelot_ptp_rew_op(skb); - if (rew_op) { - if (!ocelot_can_inject(ocelot, 0)) - return NULL; - - ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb); - return NULL; - } + if (ocelot_ptp_rew_op(skb) || is_link_local_ether_addr(hdr->h_dest)) + return ocelot_defer_xmit(dp, skb); return dsa_8021q_xmit(skb, netdev, ETH_P_8021Q, ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); } static struct sk_buff *ocelot_rcv(struct sk_buff *skb, - struct net_device *netdev, - struct packet_type *pt) + struct net_device *netdev) { - int src_port, switch_id, subvlan; + int src_port, switch_id; - dsa_8021q_rcv(skb, &src_port, &switch_id, &subvlan); + dsa_8021q_rcv(skb, &src_port, &switch_id); skb->dev = dsa_master_find_slave(netdev, switch_id, src_port); if (!skb->dev) return NULL; - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); return skb; } diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 693bda013065..1ea9401b8ace 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -36,8 +36,8 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) skb_push(skb, QCA_HDR_LEN); - memmove(skb->data, skb->data + QCA_HDR_LEN, 2 * ETH_ALEN); - phdr = (__be16 *)(skb->data + 2 * ETH_ALEN); + dsa_alloc_etype_header(skb, QCA_HDR_LEN); + phdr = dsa_etype_header_pos_tx(skb); /* Set the version field, and set destination port information */ hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S | @@ -48,8 +48,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) return skb; } -static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev) { u8 ver; u16 hdr; @@ -59,11 +58,7 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN))) return NULL; - /* The QCA header is added by the switch between src addr and Ethertype - * At this point, skb->data points to ethertype so header should be - * right before - */ - phdr = (__be16 *)(skb->data - 2); + phdr = dsa_etype_header_pos_rx(skb); hdr = ntohs(*phdr); /* Make sure the version is correct */ @@ -73,8 +68,7 @@ static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, /* Remove QCA tag and recalculate checksum */ skb_pull_rcsum(skb, QCA_HDR_LEN); - memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - QCA_HDR_LEN, - ETH_HLEN - QCA_HDR_LEN); + dsa_strip_etype_header(skb, QCA_HDR_LEN); /* Get source port information */ port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK); diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c index 57c46b4ab2b3..6d928ee3ef7a 100644 --- a/net/dsa/tag_rtl4_a.c +++ b/net/dsa/tag_rtl4_a.c @@ -47,16 +47,17 @@ static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb, dp->index); skb_push(skb, RTL4_A_HDR_LEN); - memmove(skb->data, skb->data + RTL4_A_HDR_LEN, 2 * ETH_ALEN); - tag = skb->data + 2 * ETH_ALEN; + dsa_alloc_etype_header(skb, RTL4_A_HDR_LEN); + tag = dsa_etype_header_pos_tx(skb); /* Set Ethertype */ p = (__be16 *)tag; *p = htons(RTL4_A_ETHERTYPE); - out = (RTL4_A_PROTOCOL_RTL8366RB << 12) | (2 << 8); - /* The lower bits is the port number */ - out |= (u8)dp->index; + out = (RTL4_A_PROTOCOL_RTL8366RB << RTL4_A_PROTOCOL_SHIFT); + /* The lower bits indicate the port number */ + out |= BIT(dp->index); + p = (__be16 *)(tag + 2); *p = htons(out); @@ -64,8 +65,7 @@ static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb, } static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb, - struct net_device *dev, - struct packet_type *pt) + struct net_device *dev) { u16 protport; __be16 *p; @@ -77,12 +77,7 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, RTL4_A_HDR_LEN))) return NULL; - /* The RTL4 header has its own custom Ethertype 0x8899 and that - * starts right at the beginning of the packet, after the src - * ethernet addr. Apparently skb->data always points 2 bytes in, - * behind the Ethertype. - */ - tag = skb->data - 2; + tag = dsa_etype_header_pos_rx(skb); p = (__be16 *)tag; etype = ntohs(*p); if (etype != RTL4_A_ETHERTYPE) { @@ -109,12 +104,9 @@ static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb, /* Remove RTL4 tag and recalculate checksum */ skb_pull_rcsum(skb, RTL4_A_HDR_LEN); - /* Move ethernet DA and SA in front of the data */ - memmove(skb->data - ETH_HLEN, - skb->data - ETH_HLEN - RTL4_A_HDR_LEN, - 2 * ETH_ALEN); + dsa_strip_etype_header(skb, RTL4_A_HDR_LEN); - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); return skb; } diff --git a/net/dsa/tag_rtl8_4.c b/net/dsa/tag_rtl8_4.c new file mode 100644 index 000000000000..02686ad4045d --- /dev/null +++ b/net/dsa/tag_rtl8_4.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Handler for Realtek 8 byte switch tags + * + * Copyright (C) 2021 Alvin Å ipraga <alsi@bang-olufsen.dk> + * + * NOTE: Currently only supports protocol "4" found in the RTL8365MB, hence + * named tag_rtl8_4. + * + * This tag header has the following format: + * + * ------------------------------------------- + * | MAC DA | MAC SA | 8 byte tag | Type | ... + * ------------------------------------------- + * _______________/ \______________________________________ + * / \ + * 0 7|8 15 + * |-----------------------------------+-----------------------------------|--- + * | (16-bit) | ^ + * | Realtek EtherType [0x8899] | | + * |-----------------------------------+-----------------------------------| 8 + * | (8-bit) | (8-bit) | + * | Protocol [0x04] | REASON | b + * |-----------------------------------+-----------------------------------| y + * | (1) | (1) | (2) | (1) | (3) | (1) | (1) | (1) | (5) | t + * | FID_EN | X | FID | PRI_EN | PRI | KEEP | X | LEARN_DIS | X | e + * |-----------------------------------+-----------------------------------| s + * | (1) | (15-bit) | | + * | ALLOW | TX/RX | v + * |-----------------------------------+-----------------------------------|--- + * + * With the following field descriptions: + * + * field | description + * ------------+------------- + * Realtek | 0x8899: indicates that this is a proprietary Realtek tag; + * EtherType | note that Realtek uses the same EtherType for + * | other incompatible tag formats (e.g. tag_rtl4_a.c) + * Protocol | 0x04: indicates that this tag conforms to this format + * X | reserved + * ------------+------------- + * REASON | reason for forwarding packet to CPU + * | 0: packet was forwarded or flooded to CPU + * | 80: packet was trapped to CPU + * FID_EN | 1: packet has an FID + * | 0: no FID + * FID | FID of packet (if FID_EN=1) + * PRI_EN | 1: force priority of packet + * | 0: don't force priority + * PRI | priority of packet (if PRI_EN=1) + * KEEP | preserve packet VLAN tag format + * LEARN_DIS | don't learn the source MAC address of the packet + * ALLOW | 1: treat TX/RX field as an allowance port mask, meaning the + * | packet may only be forwarded to ports specified in the + * | mask + * | 0: no allowance port mask, TX/RX field is the forwarding + * | port mask + * TX/RX | TX (switch->CPU): port number the packet was received on + * | RX (CPU->switch): forwarding port mask (if ALLOW=0) + * | allowance port mask (if ALLOW=1) + */ + +#include <linux/bitfield.h> +#include <linux/bits.h> +#include <linux/etherdevice.h> + +#include "dsa_priv.h" + +/* Protocols supported: + * + * 0x04 = RTL8365MB DSA protocol + */ + +#define RTL8_4_TAG_LEN 8 + +#define RTL8_4_PROTOCOL GENMASK(15, 8) +#define RTL8_4_PROTOCOL_RTL8365MB 0x04 +#define RTL8_4_REASON GENMASK(7, 0) +#define RTL8_4_REASON_FORWARD 0 +#define RTL8_4_REASON_TRAP 80 + +#define RTL8_4_LEARN_DIS BIT(5) + +#define RTL8_4_TX GENMASK(3, 0) +#define RTL8_4_RX GENMASK(10, 0) + +static struct sk_buff *rtl8_4_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + __be16 *tag; + + skb_push(skb, RTL8_4_TAG_LEN); + + dsa_alloc_etype_header(skb, RTL8_4_TAG_LEN); + tag = dsa_etype_header_pos_tx(skb); + + /* Set Realtek EtherType */ + tag[0] = htons(ETH_P_REALTEK); + + /* Set Protocol; zero REASON */ + tag[1] = htons(FIELD_PREP(RTL8_4_PROTOCOL, RTL8_4_PROTOCOL_RTL8365MB)); + + /* Zero FID_EN, FID, PRI_EN, PRI, KEEP; set LEARN_DIS */ + tag[2] = htons(FIELD_PREP(RTL8_4_LEARN_DIS, 1)); + + /* Zero ALLOW; set RX (CPU->switch) forwarding port mask */ + tag[3] = htons(FIELD_PREP(RTL8_4_RX, BIT(dp->index))); + + return skb; +} + +static struct sk_buff *rtl8_4_tag_rcv(struct sk_buff *skb, + struct net_device *dev) +{ + __be16 *tag; + u16 etype; + u8 reason; + u8 proto; + u8 port; + + if (unlikely(!pskb_may_pull(skb, RTL8_4_TAG_LEN))) + return NULL; + + tag = dsa_etype_header_pos_rx(skb); + + /* Parse Realtek EtherType */ + etype = ntohs(tag[0]); + if (unlikely(etype != ETH_P_REALTEK)) { + dev_warn_ratelimited(&dev->dev, + "non-realtek ethertype 0x%04x\n", etype); + return NULL; + } + + /* Parse Protocol */ + proto = FIELD_GET(RTL8_4_PROTOCOL, ntohs(tag[1])); + if (unlikely(proto != RTL8_4_PROTOCOL_RTL8365MB)) { + dev_warn_ratelimited(&dev->dev, + "unknown realtek protocol 0x%02x\n", + proto); + return NULL; + } + + /* Parse REASON */ + reason = FIELD_GET(RTL8_4_REASON, ntohs(tag[1])); + + /* Parse TX (switch->CPU) */ + port = FIELD_GET(RTL8_4_TX, ntohs(tag[3])); + skb->dev = dsa_master_find_slave(dev, 0, port); + if (!skb->dev) { + dev_warn_ratelimited(&dev->dev, + "could not find slave for port %d\n", + port); + return NULL; + } + + /* Remove tag and recalculate checksum */ + skb_pull_rcsum(skb, RTL8_4_TAG_LEN); + + dsa_strip_etype_header(skb, RTL8_4_TAG_LEN); + + if (reason != RTL8_4_REASON_TRAP) + dsa_default_offload_fwd_mark(skb); + + return skb; +} + +static const struct dsa_device_ops rtl8_4_netdev_ops = { + .name = "rtl8_4", + .proto = DSA_TAG_PROTO_RTL8_4, + .xmit = rtl8_4_tag_xmit, + .rcv = rtl8_4_tag_rcv, + .needed_headroom = RTL8_4_TAG_LEN, +}; +module_dsa_tag_driver(rtl8_4_netdev_ops); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL8_4); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 9c2df9ece01b..262c8833a910 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -4,6 +4,7 @@ #include <linux/if_vlan.h> #include <linux/dsa/sja1105.h> #include <linux/dsa/8021q.h> +#include <linux/skbuff.h> #include <linux/packing.h> #include "dsa_priv.h" @@ -53,6 +54,11 @@ #define SJA1110_TX_TRAILER_LEN 4 #define SJA1110_MAX_PADDING_LEN 15 +enum sja1110_meta_tstamp { + SJA1110_META_TSTAMP_TX = 0, + SJA1110_META_TSTAMP_RX = 1, +}; + /* Similar to is_link_local_ether_addr(hdr->h_dest) but also covers PTP */ static inline bool sja1105_is_link_local(const struct sk_buff *skb) { @@ -115,74 +121,140 @@ static inline bool sja1105_is_meta_frame(const struct sk_buff *skb) return true; } -static bool sja1105_can_use_vlan_as_tags(const struct sk_buff *skb) +/* Calls sja1105_port_deferred_xmit in sja1105_main.c */ +static struct sk_buff *sja1105_defer_xmit(struct dsa_port *dp, + struct sk_buff *skb) { - struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); - u16 vlan_tci; - - if (hdr->h_vlan_proto == htons(ETH_P_SJA1105)) - return true; + struct sja1105_port *sp = dp->priv; - if (hdr->h_vlan_proto != htons(ETH_P_8021Q) && - !skb_vlan_tag_present(skb)) - return false; + if (!dsa_port_is_sja1105(dp)) + return skb; - if (skb_vlan_tag_present(skb)) - vlan_tci = skb_vlan_tag_get(skb); - else - vlan_tci = ntohs(hdr->h_vlan_TCI); + /* Increase refcount so the kfree_skb in dsa_slave_xmit + * won't really free the packet. + */ + skb_queue_tail(&sp->xmit_queue, skb_get(skb)); + kthread_queue_work(sp->xmit_worker, &sp->xmit_work); - return vid_is_dsa_8021q(vlan_tci & VLAN_VID_MASK); + return NULL; } -/* This is the first time the tagger sees the frame on RX. - * Figure out if we can decode it. +/* Send VLAN tags with a TPID that blends in with whatever VLAN protocol a + * bridge spanning ports of this switch might have. */ -static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev) +static u16 sja1105_xmit_tpid(struct dsa_port *dp) { - if (sja1105_can_use_vlan_as_tags(skb)) - return true; - if (sja1105_is_link_local(skb)) - return true; - if (sja1105_is_meta_frame(skb)) - return true; - return false; + struct dsa_switch *ds = dp->ds; + struct dsa_port *other_dp; + u16 proto; + + /* Since VLAN awareness is global, then if this port is VLAN-unaware, + * all ports are. Use the VLAN-unaware TPID used for tag_8021q. + */ + if (!dsa_port_is_vlan_filtering(dp)) + return ETH_P_SJA1105; + + /* Port is VLAN-aware, so there is a bridge somewhere (a single one, + * we're sure about that). It may not be on this port though, so we + * need to find it. + */ + dsa_switch_for_each_port(other_dp, ds) { + if (!other_dp->bridge_dev) + continue; + + /* Error is returned only if CONFIG_BRIDGE_VLAN_FILTERING, + * which seems pointless to handle, as our port cannot become + * VLAN-aware in that case. + */ + br_vlan_get_proto(other_dp->bridge_dev, &proto); + + return proto; + } + + WARN_ONCE(1, "Port is VLAN-aware but cannot find associated bridge!\n"); + + return ETH_P_SJA1105; } -/* Calls sja1105_port_deferred_xmit in sja1105_main.c */ -static struct sk_buff *sja1105_defer_xmit(struct sja1105_port *sp, - struct sk_buff *skb) +static struct sk_buff *sja1105_imprecise_xmit(struct sk_buff *skb, + struct net_device *netdev) { - /* Increase refcount so the kfree_skb in dsa_slave_xmit - * won't really free the packet. + struct dsa_port *dp = dsa_slave_to_port(netdev); + struct net_device *br = dp->bridge_dev; + u16 tx_vid; + + /* If the port is under a VLAN-aware bridge, just slide the + * VLAN-tagged packet into the FDB and hope for the best. + * This works because we support a single VLAN-aware bridge + * across the entire dst, and its VLANs cannot be shared with + * any standalone port. */ - skb_queue_tail(&sp->xmit_queue, skb_get(skb)); - kthread_queue_work(sp->xmit_worker, &sp->xmit_work); + if (br_vlan_enabled(br)) + return skb; - return NULL; + /* If the port is under a VLAN-unaware bridge, use an imprecise + * TX VLAN that targets the bridge's entire broadcast domain, + * instead of just the specific port. + */ + tx_vid = dsa_8021q_bridge_tx_fwd_offload_vid(dp->bridge_num); + + return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp), tx_vid); } -static u16 sja1105_xmit_tpid(struct sja1105_port *sp) +/* Transform untagged control packets into pvid-tagged control packets so that + * all packets sent by this tagger are VLAN-tagged and we can configure the + * switch to drop untagged packets coming from the DSA master. + */ +static struct sk_buff *sja1105_pvid_tag_control_pkt(struct dsa_port *dp, + struct sk_buff *skb, u8 pcp) { - return sp->xmit_tpid; + __be16 xmit_tpid = htons(sja1105_xmit_tpid(dp)); + struct vlan_ethhdr *hdr; + + /* If VLAN tag is in hwaccel area, move it to the payload + * to deal with both cases uniformly and to ensure that + * the VLANs are added in the right order. + */ + if (unlikely(skb_vlan_tag_present(skb))) { + skb = __vlan_hwaccel_push_inside(skb); + if (!skb) + return NULL; + } + + hdr = (struct vlan_ethhdr *)skb_mac_header(skb); + + /* If skb is already VLAN-tagged, leave that VLAN ID in place */ + if (hdr->h_vlan_proto == xmit_tpid) + return skb; + + return vlan_insert_tag(skb, xmit_tpid, (pcp << VLAN_PRIO_SHIFT) | + SJA1105_DEFAULT_VLAN); } static struct sk_buff *sja1105_xmit(struct sk_buff *skb, struct net_device *netdev) { struct dsa_port *dp = dsa_slave_to_port(netdev); - u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); + u16 tx_vid = dsa_tag_8021q_tx_vid(dp); + + if (skb->offload_fwd_mark) + return sja1105_imprecise_xmit(skb, netdev); /* Transmitting management traffic does not rely upon switch tagging, * but instead SPI-installed management routes. Part 2 of this * is the .port_deferred_xmit driver callback. */ - if (unlikely(sja1105_is_link_local(skb))) - return sja1105_defer_xmit(dp->priv, skb); + if (unlikely(sja1105_is_link_local(skb))) { + skb = sja1105_pvid_tag_control_pkt(dp, skb, pcp); + if (!skb) + return NULL; - return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp->priv), + return sja1105_defer_xmit(dp, skb); + } + + return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp), ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); } @@ -191,46 +263,48 @@ static struct sk_buff *sja1110_xmit(struct sk_buff *skb, { struct sk_buff *clone = SJA1105_SKB_CB(skb)->clone; struct dsa_port *dp = dsa_slave_to_port(netdev); - u16 tx_vid = dsa_8021q_tx_vid(dp->ds, dp->index); u16 queue_mapping = skb_get_queue_mapping(skb); u8 pcp = netdev_txq_to_tc(netdev, queue_mapping); - struct ethhdr *eth_hdr; + u16 tx_vid = dsa_tag_8021q_tx_vid(dp); __be32 *tx_trailer; __be16 *tx_header; int trailer_pos; + if (skb->offload_fwd_mark) + return sja1105_imprecise_xmit(skb, netdev); + /* Transmitting control packets is done using in-band control * extensions, while data packets are transmitted using * tag_8021q TX VLANs. */ if (likely(!sja1105_is_link_local(skb))) - return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp->priv), + return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp), ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); + skb = sja1105_pvid_tag_control_pkt(dp, skb, pcp); + if (!skb) + return NULL; + skb_push(skb, SJA1110_HEADER_LEN); - /* Move Ethernet header to the left, making space for DSA tag */ - memmove(skb->data, skb->data + SJA1110_HEADER_LEN, 2 * ETH_ALEN); + dsa_alloc_etype_header(skb, SJA1110_HEADER_LEN); trailer_pos = skb->len; - /* On TX, skb->data points to skb_mac_header(skb) */ - eth_hdr = (struct ethhdr *)skb->data; - tx_header = (__be16 *)(eth_hdr + 1); + tx_header = dsa_etype_header_pos_tx(skb); tx_trailer = skb_put(skb, SJA1110_TX_TRAILER_LEN); - eth_hdr->h_proto = htons(ETH_P_SJA1110); - - *tx_header = htons(SJA1110_HEADER_HOST_TO_SWITCH | - SJA1110_TX_HEADER_HAS_TRAILER | - SJA1110_TX_HEADER_TRAILER_POS(trailer_pos)); + tx_header[0] = htons(ETH_P_SJA1110); + tx_header[1] = htons(SJA1110_HEADER_HOST_TO_SWITCH | + SJA1110_TX_HEADER_HAS_TRAILER | + SJA1110_TX_HEADER_TRAILER_POS(trailer_pos)); *tx_trailer = cpu_to_be32(SJA1110_TX_TRAILER_PRIO(pcp) | SJA1110_TX_TRAILER_SWITCHID(dp->ds->index) | SJA1110_TX_TRAILER_DESTPORTS(BIT(dp->index))); if (clone) { u8 ts_id = SJA1105_SKB_CB(clone)->ts_id; - *tx_header |= htons(SJA1110_TX_HEADER_TAKE_TS); + tx_header[1] |= htons(SJA1110_TX_HEADER_TAKE_TS); *tx_trailer |= cpu_to_be32(SJA1110_TX_TRAILER_TSTAMP_ID(ts_id)); } @@ -273,16 +347,16 @@ static struct sk_buff bool is_link_local, bool is_meta) { - struct sja1105_port *sp; - struct dsa_port *dp; - - dp = dsa_slave_to_port(skb->dev); - sp = dp->priv; - /* Step 1: A timestampable frame was received. * Buffer it until we get its meta frame. */ if (is_link_local) { + struct dsa_port *dp = dsa_slave_to_port(skb->dev); + struct sja1105_port *sp = dp->priv; + + if (unlikely(!dsa_port_is_sja1105(dp))) + return skb; + if (!test_bit(SJA1105_HWTS_RX_EN, &sp->data->state)) /* Do normal processing. */ return skb; @@ -315,8 +389,13 @@ static struct sk_buff * frame, which serves no further purpose). */ } else if (is_meta) { + struct dsa_port *dp = dsa_slave_to_port(skb->dev); + struct sja1105_port *sp = dp->priv; struct sk_buff *stampable_skb; + if (unlikely(!dsa_port_is_sja1105(dp))) + return skb; + /* Drop the meta frame if we're not in the right state * to process it. */ @@ -358,20 +437,6 @@ static struct sk_buff return skb; } -static void sja1105_decode_subvlan(struct sk_buff *skb, u16 subvlan) -{ - struct dsa_port *dp = dsa_slave_to_port(skb->dev); - struct sja1105_port *sp = dp->priv; - u16 vid = sp->subvlan_map[subvlan]; - u16 vlan_tci; - - if (vid == VLAN_N_VID) - return; - - vlan_tci = (skb->priority << VLAN_PRIO_SHIFT) | vid; - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci); -} - static bool sja1105_skb_has_tag_8021q(const struct sk_buff *skb) { u16 tpid = ntohs(eth_hdr(skb)->h_proto); @@ -385,25 +450,45 @@ static bool sja1110_skb_has_inband_control_extension(const struct sk_buff *skb) return ntohs(eth_hdr(skb)->h_proto) == ETH_P_SJA1110; } +/* If the VLAN in the packet is a tag_8021q one, set @source_port and + * @switch_id and strip the header. Otherwise set @vid and keep it in the + * packet. + */ +static void sja1105_vlan_rcv(struct sk_buff *skb, int *source_port, + int *switch_id, u16 *vid) +{ + struct vlan_ethhdr *hdr = (struct vlan_ethhdr *)skb_mac_header(skb); + u16 vlan_tci; + + if (skb_vlan_tag_present(skb)) + vlan_tci = skb_vlan_tag_get(skb); + else + vlan_tci = ntohs(hdr->h_vlan_TCI); + + if (vid_is_dsa_8021q_rxvlan(vlan_tci & VLAN_VID_MASK)) + return dsa_8021q_rcv(skb, source_port, switch_id); + + /* Try our best with imprecise RX */ + *vid = vlan_tci & VLAN_VID_MASK; +} + static struct sk_buff *sja1105_rcv(struct sk_buff *skb, - struct net_device *netdev, - struct packet_type *pt) + struct net_device *netdev) { - int source_port, switch_id, subvlan = 0; + int source_port = -1, switch_id = -1; struct sja1105_meta meta = {0}; struct ethhdr *hdr; bool is_link_local; bool is_meta; + u16 vid; hdr = eth_hdr(skb); is_link_local = sja1105_is_link_local(skb); is_meta = sja1105_is_meta_frame(skb); - skb->offload_fwd_mark = 1; - if (sja1105_skb_has_tag_8021q(skb)) { /* Normal traffic path. */ - dsa_8021q_rcv(skb, &source_port, &switch_id, &subvlan); + sja1105_vlan_rcv(skb, &source_port, &switch_id, &vid); } else if (is_link_local) { /* Management traffic path. Switch embeds the switch ID and * port ID into bytes of the destination MAC, courtesy of @@ -422,26 +507,66 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, return NULL; } - skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); + if (source_port == -1 || switch_id == -1) + skb->dev = dsa_find_designated_bridge_port_by_vid(netdev, vid); + else + skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); if (!skb->dev) { netdev_warn(netdev, "Couldn't decode source port\n"); return NULL; } - if (subvlan) - sja1105_decode_subvlan(skb, subvlan); + if (!is_link_local) + dsa_default_offload_fwd_mark(skb); return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local, is_meta); } +static void sja1110_process_meta_tstamp(struct dsa_switch *ds, int port, + u8 ts_id, enum sja1110_meta_tstamp dir, + u64 tstamp) +{ + struct sk_buff *skb, *skb_tmp, *skb_match = NULL; + struct dsa_port *dp = dsa_to_port(ds, port); + struct skb_shared_hwtstamps shwt = {0}; + struct sja1105_port *sp = dp->priv; + + if (!dsa_port_is_sja1105(dp)) + return; + + /* We don't care about RX timestamps on the CPU port */ + if (dir == SJA1110_META_TSTAMP_RX) + return; + + spin_lock(&sp->data->skb_txtstamp_queue.lock); + + skb_queue_walk_safe(&sp->data->skb_txtstamp_queue, skb, skb_tmp) { + if (SJA1105_SKB_CB(skb)->ts_id != ts_id) + continue; + + __skb_unlink(skb, &sp->data->skb_txtstamp_queue); + skb_match = skb; + + break; + } + + spin_unlock(&sp->data->skb_txtstamp_queue.lock); + + if (WARN_ON(!skb_match)) + return; + + shwt.hwtstamp = ns_to_ktime(sja1105_ticks_to_ns(tstamp)); + skb_complete_tx_timestamp(skb_match, &shwt); +} + static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header) { + u8 *buf = dsa_etype_header_pos_rx(skb) + SJA1110_HEADER_LEN; int switch_id = SJA1110_RX_HEADER_SWITCH_ID(rx_header); int n_ts = SJA1110_RX_HEADER_N_TS(rx_header); struct net_device *master = skb->dev; struct dsa_port *cpu_dp; - u8 *buf = skb->data + 2; struct dsa_switch *ds; int i; @@ -474,7 +599,8 @@ static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header) static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb, int *source_port, - int *switch_id) + int *switch_id, + bool *host_only) { u16 rx_header; @@ -488,6 +614,9 @@ static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb, */ rx_header = ntohs(*(__be16 *)skb->data); + if (rx_header & SJA1110_RX_HEADER_HOST_ONLY) + *host_only = true; + if (rx_header & SJA1110_RX_HEADER_IS_METADATA) return sja1110_rcv_meta(skb, rx_header); @@ -522,9 +651,7 @@ static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb, /* Advance skb->data past the DSA header */ skb_pull_rcsum(skb, SJA1110_HEADER_LEN); - /* Remove the DSA header */ - memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - SJA1110_HEADER_LEN, - 2 * ETH_ALEN); + dsa_strip_etype_header(skb, SJA1110_HEADER_LEN); /* With skb->data in its final place, update the MAC header * so that eth_hdr() continues to works properly. @@ -535,34 +662,35 @@ static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb, } static struct sk_buff *sja1110_rcv(struct sk_buff *skb, - struct net_device *netdev, - struct packet_type *pt) + struct net_device *netdev) { - int source_port = -1, switch_id = -1, subvlan = 0; - - skb->offload_fwd_mark = 1; + int source_port = -1, switch_id = -1; + bool host_only = false; + u16 vid = 0; if (sja1110_skb_has_inband_control_extension(skb)) { skb = sja1110_rcv_inband_control_extension(skb, &source_port, - &switch_id); + &switch_id, + &host_only); if (!skb) return NULL; } /* Packets with in-band control extensions might still have RX VLANs */ if (likely(sja1105_skb_has_tag_8021q(skb))) - dsa_8021q_rcv(skb, &source_port, &switch_id, &subvlan); + sja1105_vlan_rcv(skb, &source_port, &switch_id, &vid); - skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); + if (source_port == -1 || switch_id == -1) + skb->dev = dsa_find_designated_bridge_port_by_vid(netdev, vid); + else + skb->dev = dsa_master_find_slave(netdev, switch_id, source_port); if (!skb->dev) { - netdev_warn(netdev, - "Couldn't decode source port %d and switch id %d\n", - source_port, switch_id); + netdev_warn(netdev, "Couldn't decode source port\n"); return NULL; } - if (subvlan) - sja1105_decode_subvlan(skb, subvlan); + if (!host_only) + dsa_default_offload_fwd_mark(skb); return skb; } @@ -596,7 +724,6 @@ static const struct dsa_device_ops sja1105_netdev_ops = { .proto = DSA_TAG_PROTO_SJA1105, .xmit = sja1105_xmit, .rcv = sja1105_rcv, - .filter = sja1105_filter, .needed_headroom = VLAN_HLEN, .flow_dissect = sja1105_flow_dissect, .promisc_on_master = true, @@ -610,7 +737,6 @@ static const struct dsa_device_ops sja1110_netdev_ops = { .proto = DSA_TAG_PROTO_SJA1110, .xmit = sja1110_xmit, .rcv = sja1110_rcv, - .filter = sja1105_filter, .flow_dissect = sja1110_flow_dissect, .needed_headroom = SJA1110_HEADER_LEN + VLAN_HLEN, .needed_tailroom = SJA1110_RX_TRAILER_LEN + SJA1110_MAX_PADDING_LEN, diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index ba73804340a5..5749ba85c2b8 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -24,8 +24,7 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) return skb; } -static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev) { u8 *trailer; int source_port; diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c index a31ff7fcb45f..ff442b8af636 100644 --- a/net/dsa/tag_xrs700x.c +++ b/net/dsa/tag_xrs700x.c @@ -25,8 +25,7 @@ static struct sk_buff *xrs700x_xmit(struct sk_buff *skb, struct net_device *dev) return skb; } -static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt) +static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev) { int source_port; u8 *trailer; @@ -46,7 +45,7 @@ static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev, return NULL; /* Frame is forwarded by hardware, don't forward in software. */ - skb->offload_fwd_mark = 1; + dsa_default_offload_fwd_mark(skb); return skb; } diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 9cce612e8976..c7d9e08107cb 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -51,6 +51,7 @@ #include <linux/if_ether.h> #include <linux/of_net.h> #include <linux/pci.h> +#include <linux/property.h> #include <net/dst.h> #include <net/arp.h> #include <net/sock.h> @@ -62,8 +63,6 @@ #include <linux/uaccess.h> #include <net/pkt_sched.h> -__setup("ether=", netdev_boot_setup); - /** * eth_header - create the Ethernet header * @skb: buffer to alter @@ -182,12 +181,8 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) * at all, so we check here whether one of those tagging * variants has been configured on the receiving interface, * and if so, set skb->protocol without looking at the packet. - * The DSA tagging protocol may be able to decode some but not all - * traffic (for example only for management). In that case give it the - * option to filter the packets from which it can decode source port - * information. */ - if (unlikely(netdev_uses_dsa(dev)) && dsa_can_decode(skb, dev)) + if (unlikely(netdev_uses_dsa(dev))) return htons(ETH_P_XDSA); if (likely(eth_proto_is_802_3(eth->h_proto))) @@ -310,7 +305,7 @@ void eth_commit_mac_addr_change(struct net_device *dev, void *p) { struct sockaddr *addr = p; - memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); + eth_hw_addr_set(dev, addr->sa_data); } EXPORT_SYMBOL(eth_commit_mac_addr_change); @@ -529,6 +524,26 @@ int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr) EXPORT_SYMBOL(eth_platform_get_mac_address); /** + * platform_get_ethdev_address - Set netdev's MAC address from a given device + * @dev: Pointer to the device + * @netdev: Pointer to netdev to write the address to + * + * Wrapper around eth_platform_get_mac_address() which writes the address + * directly to netdev->dev_addr. + */ +int platform_get_ethdev_address(struct device *dev, struct net_device *netdev) +{ + u8 addr[ETH_ALEN] __aligned(2); + int ret; + + ret = eth_platform_get_mac_address(dev, addr); + if (!ret) + eth_hw_addr_set(netdev, addr); + return ret; +} +EXPORT_SYMBOL(platform_get_ethdev_address); + +/** * nvmem_get_mac_address - Obtain the MAC address from an nvmem cell named * 'mac-address' associated with given device. * @@ -563,4 +578,81 @@ int nvmem_get_mac_address(struct device *dev, void *addrbuf) return 0; } -EXPORT_SYMBOL(nvmem_get_mac_address); + +static int fwnode_get_mac_addr(struct fwnode_handle *fwnode, + const char *name, char *addr) +{ + int ret; + + ret = fwnode_property_read_u8_array(fwnode, name, addr, ETH_ALEN); + if (ret) + return ret; + + if (!is_valid_ether_addr(addr)) + return -EINVAL; + return 0; +} + +/** + * fwnode_get_mac_address - Get the MAC from the firmware node + * @fwnode: Pointer to the firmware node + * @addr: Address of buffer to store the MAC in + * + * Search the firmware node for the best MAC address to use. 'mac-address' is + * checked first, because that is supposed to contain to "most recent" MAC + * address. If that isn't set, then 'local-mac-address' is checked next, + * because that is the default address. If that isn't set, then the obsolete + * 'address' is checked, just in case we're using an old device tree. + * + * Note that the 'address' property is supposed to contain a virtual address of + * the register set, but some DTS files have redefined that property to be the + * MAC address. + * + * All-zero MAC addresses are rejected, because those could be properties that + * exist in the firmware tables, but were not updated by the firmware. For + * example, the DTS could define 'mac-address' and 'local-mac-address', with + * zero MAC addresses. Some older U-Boots only initialized 'local-mac-address'. + * In this case, the real MAC is in 'local-mac-address', and 'mac-address' + * exists but is all zeros. + */ +int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr) +{ + if (!fwnode_get_mac_addr(fwnode, "mac-address", addr) || + !fwnode_get_mac_addr(fwnode, "local-mac-address", addr) || + !fwnode_get_mac_addr(fwnode, "address", addr)) + return 0; + + return -ENOENT; +} +EXPORT_SYMBOL(fwnode_get_mac_address); + +/** + * device_get_mac_address - Get the MAC for a given device + * @dev: Pointer to the device + * @addr: Address of buffer to store the MAC in + */ +int device_get_mac_address(struct device *dev, char *addr) +{ + return fwnode_get_mac_address(dev_fwnode(dev), addr); +} +EXPORT_SYMBOL(device_get_mac_address); + +/** + * device_get_ethdev_address - Set netdev's MAC address from a given device + * @dev: Pointer to the device + * @netdev: Pointer to netdev to write the address to + * + * Wrapper around device_get_mac_address() which writes the address + * directly to netdev->dev_addr. + */ +int device_get_ethdev_address(struct device *dev, struct net_device *netdev) +{ + u8 addr[ETH_ALEN]; + int ret; + + ret = device_get_mac_address(dev, addr); + if (!ret) + eth_hw_addr_set(netdev, addr); + return ret; +} +EXPORT_SYMBOL(device_get_ethdev_address); diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 0a19470efbfb..b76432e70e6b 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ - tunnels.o fec.o eeprom.o stats.o phc_vclocks.o + tunnels.o fec.o eeprom.o stats.o phc_vclocks.o module.o diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c index 1d6bc132aa4d..46776ea42a92 100644 --- a/net/ethtool/coalesce.c +++ b/net/ethtool/coalesce.c @@ -10,6 +10,7 @@ struct coalesce_req_info { struct coalesce_reply_data { struct ethnl_reply_data base; struct ethtool_coalesce coalesce; + struct kernel_ethtool_coalesce kernel_coalesce; u32 supported_params; }; @@ -61,6 +62,7 @@ static int coalesce_prepare_data(const struct ethnl_req_info *req_base, struct genl_info *info) { struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base); + struct netlink_ext_ack *extack = info ? info->extack : NULL; struct net_device *dev = reply_base->dev; int ret; @@ -70,7 +72,8 @@ static int coalesce_prepare_data(const struct ethnl_req_info *req_base, ret = ethnl_ops_begin(dev); if (ret < 0) return ret; - ret = dev->ethtool_ops->get_coalesce(dev, &data->coalesce); + ret = dev->ethtool_ops->get_coalesce(dev, &data->coalesce, + &data->kernel_coalesce, extack); ethnl_ops_complete(dev); return ret; @@ -100,7 +103,9 @@ static int coalesce_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_HIGH */ nla_total_size(sizeof(u32)) + /* _TX_USECS_HIGH */ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_HIGH */ - nla_total_size(sizeof(u32)); /* _RATE_SAMPLE_INTERVAL */ + nla_total_size(sizeof(u32)) + /* _RATE_SAMPLE_INTERVAL */ + nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_TX */ + nla_total_size(sizeof(u8)); /* _USE_CQE_MODE_RX */ } static bool coalesce_put_u32(struct sk_buff *skb, u16 attr_type, u32 val, @@ -124,6 +129,7 @@ static int coalesce_fill_reply(struct sk_buff *skb, const struct ethnl_reply_data *reply_base) { const struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base); + const struct kernel_ethtool_coalesce *kcoal = &data->kernel_coalesce; const struct ethtool_coalesce *coal = &data->coalesce; u32 supported = data->supported_params; @@ -170,7 +176,11 @@ static int coalesce_fill_reply(struct sk_buff *skb, coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, coal->tx_max_coalesced_frames_high, supported) || coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, - coal->rate_sample_interval, supported)) + coal->rate_sample_interval, supported) || + coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_TX, + kcoal->use_cqe_mode_tx, supported) || + coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_RX, + kcoal->use_cqe_mode_rx, supported)) return -EMSGSIZE; return 0; @@ -215,10 +225,13 @@ const struct nla_policy ethnl_coalesce_set_policy[] = { [ETHTOOL_A_COALESCE_TX_USECS_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_USE_CQE_MODE_TX] = NLA_POLICY_MAX(NLA_U8, 1), + [ETHTOOL_A_COALESCE_USE_CQE_MODE_RX] = NLA_POLICY_MAX(NLA_U8, 1), }; int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) { + struct kernel_ethtool_coalesce kernel_coalesce = {}; struct ethtool_coalesce coalesce = {}; struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; @@ -255,7 +268,8 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; - ret = ops->get_coalesce(dev, &coalesce); + ret = ops->get_coalesce(dev, &coalesce, &kernel_coalesce, + info->extack); if (ret < 0) goto out_ops; @@ -303,11 +317,16 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH], &mod); ethnl_update_u32(&coalesce.rate_sample_interval, tb[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL], &mod); + ethnl_update_u8(&kernel_coalesce.use_cqe_mode_tx, + tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_TX], &mod); + ethnl_update_u8(&kernel_coalesce.use_cqe_mode_rx, + tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_RX], &mod); ret = 0; if (!mod) goto out_ops; - ret = dev->ethtool_ops->set_coalesce(dev, &coalesce); + ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce, + info->extack); if (ret < 0) goto out_ops; ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL); diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index baa5d10043cb..65e9bc1058b5 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -7,6 +7,7 @@ * the information ethtool needs. */ +#include <linux/compat.h> #include <linux/module.h> #include <linux/types.h> #include <linux/capability.h> @@ -23,6 +24,7 @@ #include <linux/rtnetlink.h> #include <linux/sched/signal.h> #include <linux/net.h> +#include <linux/pm_runtime.h> #include <net/devlink.h> #include <net/xdp_sock_drv.h> #include <net/flow_offload.h> @@ -30,6 +32,29 @@ #include <generated/utsrelease.h> #include "common.h" +/* State held across locks and calls for commands which have devlink fallback */ +struct ethtool_devlink_compat { + struct devlink *devlink; + union { + struct ethtool_flash efl; + struct ethtool_drvinfo info; + }; +}; + +static struct devlink *netdev_to_devlink_get(struct net_device *dev) +{ + struct devlink_port *devlink_port; + + if (!dev->netdev_ops->ndo_get_devlink_port) + return NULL; + + devlink_port = dev->netdev_ops->ndo_get_devlink_port(dev); + if (!devlink_port) + return NULL; + + return devlink_try_get(devlink_port->devlink); +} + /* * Some useful ethtool_ops methods that're device independent. * If we find that all drivers want to do the same thing here, @@ -87,7 +112,8 @@ static int ethtool_get_features(struct net_device *dev, void __user *useraddr) if (copy_to_user(useraddr, &cmd, sizeof(cmd))) return -EFAULT; useraddr += sizeof(cmd); - if (copy_to_user(useraddr, features, copy_size * sizeof(*features))) + if (copy_to_user(useraddr, features, + array_size(copy_size, sizeof(*features)))) return -EFAULT; return 0; @@ -333,7 +359,7 @@ EXPORT_SYMBOL(ethtool_intersect_link_masks); void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32) { - bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); + linkmode_zero(dst); dst[0] = legacy_u32; } EXPORT_SYMBOL(ethtool_convert_legacy_u32_to_link_mode); @@ -348,11 +374,10 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) { __ETHTOOL_DECLARE_LINK_MODE_MASK(ext); - bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS); + linkmode_zero(ext); bitmap_fill(ext, 32); bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS); - if (bitmap_intersects(ext, src, - __ETHTOOL_LINK_MODE_MASK_NBITS)) { + if (linkmode_intersects(ext, src)) { /* src mask goes beyond bit 31 */ retval = false; } @@ -695,22 +720,20 @@ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) return ret; } -static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, - void __user *useraddr) +static int +ethtool_get_drvinfo(struct net_device *dev, struct ethtool_devlink_compat *rsp) { - struct ethtool_drvinfo info; const struct ethtool_ops *ops = dev->ethtool_ops; - memset(&info, 0, sizeof(info)); - info.cmd = ETHTOOL_GDRVINFO; - strlcpy(info.version, UTS_RELEASE, sizeof(info.version)); + rsp->info.cmd = ETHTOOL_GDRVINFO; + strlcpy(rsp->info.version, UTS_RELEASE, sizeof(rsp->info.version)); if (ops->get_drvinfo) { - ops->get_drvinfo(dev, &info); + ops->get_drvinfo(dev, &rsp->info); } else if (dev->dev.parent && dev->dev.parent->driver) { - strlcpy(info.bus_info, dev_name(dev->dev.parent), - sizeof(info.bus_info)); - strlcpy(info.driver, dev->dev.parent->driver->name, - sizeof(info.driver)); + strlcpy(rsp->info.bus_info, dev_name(dev->dev.parent), + sizeof(rsp->info.bus_info)); + strlcpy(rsp->info.driver, dev->dev.parent->driver->name, + sizeof(rsp->info.driver)); } else { return -EOPNOTSUPP; } @@ -724,30 +747,27 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, rc = ops->get_sset_count(dev, ETH_SS_TEST); if (rc >= 0) - info.testinfo_len = rc; + rsp->info.testinfo_len = rc; rc = ops->get_sset_count(dev, ETH_SS_STATS); if (rc >= 0) - info.n_stats = rc; + rsp->info.n_stats = rc; rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS); if (rc >= 0) - info.n_priv_flags = rc; + rsp->info.n_priv_flags = rc; } if (ops->get_regs_len) { int ret = ops->get_regs_len(dev); if (ret > 0) - info.regdump_len = ret; + rsp->info.regdump_len = ret; } if (ops->get_eeprom_len) - info.eedump_len = ops->get_eeprom_len(dev); + rsp->info.eedump_len = ops->get_eeprom_len(dev); - if (!info.fw_version[0]) - devlink_compat_running_version(dev, info.fw_version, - sizeof(info.fw_version)); + if (!rsp->info.fw_version[0]) + rsp->devlink = netdev_to_devlink_get(dev); - if (copy_to_user(useraddr, &info, sizeof(info))) - return -EFAULT; return 0; } @@ -797,7 +817,7 @@ static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev, goto out; useraddr += offsetof(struct ethtool_sset_info, data); - if (copy_to_user(useraddr, info_buf, idx * sizeof(u32))) + if (copy_to_user(useraddr, info_buf, array_size(idx, sizeof(u32)))) goto out; ret = 0; @@ -807,6 +827,120 @@ out: return ret; } +static noinline_for_stack int +ethtool_rxnfc_copy_from_compat(struct ethtool_rxnfc *rxnfc, + const struct compat_ethtool_rxnfc __user *useraddr, + size_t size) +{ + struct compat_ethtool_rxnfc crxnfc = {}; + + /* We expect there to be holes between fs.m_ext and + * fs.ring_cookie and at the end of fs, but nowhere else. + * On non-x86, no conversion should be needed. + */ + BUILD_BUG_ON(!IS_ENABLED(CONFIG_X86_64) && + sizeof(struct compat_ethtool_rxnfc) != + sizeof(struct ethtool_rxnfc)); + BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) + + sizeof(useraddr->fs.m_ext) != + offsetof(struct ethtool_rxnfc, fs.m_ext) + + sizeof(rxnfc->fs.m_ext)); + BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.location) - + offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) != + offsetof(struct ethtool_rxnfc, fs.location) - + offsetof(struct ethtool_rxnfc, fs.ring_cookie)); + + if (copy_from_user(&crxnfc, useraddr, min(size, sizeof(crxnfc)))) + return -EFAULT; + + *rxnfc = (struct ethtool_rxnfc) { + .cmd = crxnfc.cmd, + .flow_type = crxnfc.flow_type, + .data = crxnfc.data, + .fs = { + .flow_type = crxnfc.fs.flow_type, + .h_u = crxnfc.fs.h_u, + .h_ext = crxnfc.fs.h_ext, + .m_u = crxnfc.fs.m_u, + .m_ext = crxnfc.fs.m_ext, + .ring_cookie = crxnfc.fs.ring_cookie, + .location = crxnfc.fs.location, + }, + .rule_cnt = crxnfc.rule_cnt, + }; + + return 0; +} + +static int ethtool_rxnfc_copy_from_user(struct ethtool_rxnfc *rxnfc, + const void __user *useraddr, + size_t size) +{ + if (compat_need_64bit_alignment_fixup()) + return ethtool_rxnfc_copy_from_compat(rxnfc, useraddr, size); + + if (copy_from_user(rxnfc, useraddr, size)) + return -EFAULT; + + return 0; +} + +static int ethtool_rxnfc_copy_to_compat(void __user *useraddr, + const struct ethtool_rxnfc *rxnfc, + size_t size, const u32 *rule_buf) +{ + struct compat_ethtool_rxnfc crxnfc; + + memset(&crxnfc, 0, sizeof(crxnfc)); + crxnfc = (struct compat_ethtool_rxnfc) { + .cmd = rxnfc->cmd, + .flow_type = rxnfc->flow_type, + .data = rxnfc->data, + .fs = { + .flow_type = rxnfc->fs.flow_type, + .h_u = rxnfc->fs.h_u, + .h_ext = rxnfc->fs.h_ext, + .m_u = rxnfc->fs.m_u, + .m_ext = rxnfc->fs.m_ext, + .ring_cookie = rxnfc->fs.ring_cookie, + .location = rxnfc->fs.location, + }, + .rule_cnt = rxnfc->rule_cnt, + }; + + if (copy_to_user(useraddr, &crxnfc, min(size, sizeof(crxnfc)))) + return -EFAULT; + + return 0; +} + +static int ethtool_rxnfc_copy_to_user(void __user *useraddr, + const struct ethtool_rxnfc *rxnfc, + size_t size, const u32 *rule_buf) +{ + int ret; + + if (compat_need_64bit_alignment_fixup()) { + ret = ethtool_rxnfc_copy_to_compat(useraddr, rxnfc, size, + rule_buf); + useraddr += offsetof(struct compat_ethtool_rxnfc, rule_locs); + } else { + ret = copy_to_user(useraddr, rxnfc, size); + useraddr += offsetof(struct ethtool_rxnfc, rule_locs); + } + + if (ret) + return -EFAULT; + + if (rule_buf) { + if (copy_to_user(useraddr, rule_buf, + rxnfc->rule_cnt * sizeof(u32))) + return -EFAULT; + } + + return 0; +} + static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { @@ -825,7 +959,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, info_size = (offsetof(struct ethtool_rxnfc, data) + sizeof(info.data)); - if (copy_from_user(&info, useraddr, info_size)) + if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) return -EFAULT; rc = dev->ethtool_ops->set_rxnfc(dev, &info); @@ -833,7 +967,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, return rc; if (cmd == ETHTOOL_SRXCLSRLINS && - copy_to_user(useraddr, &info, info_size)) + ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL)) return -EFAULT; return 0; @@ -859,7 +993,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, info_size = (offsetof(struct ethtool_rxnfc, data) + sizeof(info.data)); - if (copy_from_user(&info, useraddr, info_size)) + if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) return -EFAULT; /* If FLOW_RSS was requested then user-space must be using the @@ -867,7 +1001,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, */ if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) { info_size = sizeof(info); - if (copy_from_user(&info, useraddr, info_size)) + if (ethtool_rxnfc_copy_from_user(&info, useraddr, info_size)) return -EFAULT; /* Since malicious users may modify the original data, * we need to check whether FLOW_RSS is still requested. @@ -893,18 +1027,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, if (ret < 0) goto err_out; - ret = -EFAULT; - if (copy_to_user(useraddr, &info, info_size)) - goto err_out; - - if (rule_buf) { - useraddr += offsetof(struct ethtool_rxnfc, rule_locs); - if (copy_to_user(useraddr, rule_buf, - info.rule_cnt * sizeof(u32))) - goto err_out; - } - ret = 0; - + ret = ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf); err_out: kfree(rule_buf); @@ -917,7 +1040,7 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, { int i; - if (copy_from_user(indir, useraddr, size * sizeof(indir[0]))) + if (copy_from_user(indir, useraddr, array_size(size, sizeof(indir[0])))) return -EFAULT; /* Validate ring indices */ @@ -1432,6 +1555,10 @@ static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr, ret = getter(dev, &eeprom, data); if (ret) break; + if (!eeprom.len) { + ret = -EIO; + break; + } if (copy_to_user(userbuf, data, eeprom.len)) { ret = -EFAULT; break; @@ -1514,12 +1641,14 @@ static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) { struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; + struct kernel_ethtool_coalesce kernel_coalesce = {}; int ret; if (!dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; - ret = dev->ethtool_ops->get_coalesce(dev, &coalesce); + ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce, + NULL); if (ret) return ret; @@ -1586,19 +1715,26 @@ ethtool_set_coalesce_supported(struct net_device *dev, static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev, void __user *useraddr) { + struct kernel_ethtool_coalesce kernel_coalesce = {}; struct ethtool_coalesce coalesce; int ret; - if (!dev->ethtool_ops->set_coalesce) + if (!dev->ethtool_ops->set_coalesce && !dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; + ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce, + NULL); + if (ret) + return ret; + if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) return -EFAULT; if (!ethtool_set_coalesce_supported(dev, &coalesce)) return -EOPNOTSUPP; - ret = dev->ethtool_ops->set_coalesce(dev, &coalesce); + ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce, + NULL); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF, NULL); return ret; @@ -1777,7 +1913,7 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr) if (copy_to_user(useraddr, &test, sizeof(test))) goto out; useraddr += sizeof(test); - if (copy_to_user(useraddr, data, test.len * sizeof(u64))) + if (copy_to_user(useraddr, data, array_size(test.len, sizeof(u64)))) goto out; ret = 0; @@ -1819,7 +1955,8 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) goto out; useraddr += sizeof(gstrings); if (gstrings.len && - copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) + copy_to_user(useraddr, data, + array_size(gstrings.len, ETH_GSTRING_LEN))) goto out; ret = 0; @@ -2059,19 +2196,15 @@ static int ethtool_set_value(struct net_device *dev, char __user *useraddr, return actor(dev, edata.data); } -static noinline_for_stack int ethtool_flash_device(struct net_device *dev, - char __user *useraddr) +static int +ethtool_flash_device(struct net_device *dev, struct ethtool_devlink_compat *req) { - struct ethtool_flash efl; - - if (copy_from_user(&efl, useraddr, sizeof(efl))) - return -EFAULT; - efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; - - if (!dev->ethtool_ops->flash_device) - return devlink_compat_flash_update(dev, efl.data); + if (!dev->ethtool_ops->flash_device) { + req->devlink = netdev_to_devlink_get(dev); + return 0; + } - return dev->ethtool_ops->flash_device(dev, &efl); + return dev->ethtool_ops->flash_device(dev, &req->efl); } static int ethtool_set_dump(struct net_device *dev, @@ -2581,20 +2714,19 @@ static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr) /* The main entry point in this file. Called from net/core/dev_ioctl.c */ -int dev_ethtool(struct net *net, struct ifreq *ifr) +static int +__dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr, + u32 ethcmd, struct ethtool_devlink_compat *devlink_state) { - struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); - void __user *useraddr = ifr->ifr_data; - u32 ethcmd, sub_cmd; + struct net_device *dev; + u32 sub_cmd; int rc; netdev_features_t old_features; - if (!dev || !netif_device_present(dev)) + dev = __dev_get_by_name(net, ifr->ifr_name); + if (!dev) return -ENODEV; - if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) - return -EFAULT; - if (ethcmd == ETHTOOL_PERQUEUE) { if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd))) return -EFAULT; @@ -2645,10 +2777,18 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) return -EPERM; } + if (dev->dev.parent) + pm_runtime_get_sync(dev->dev.parent); + + if (!netif_device_present(dev)) { + rc = -ENODEV; + goto out; + } + if (dev->ethtool_ops->begin) { rc = dev->ethtool_ops->begin(dev); - if (rc < 0) - return rc; + if (rc < 0) + goto out; } old_features = dev->features; @@ -2660,7 +2800,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) rc = ethtool_set_settings(dev, useraddr); break; case ETHTOOL_GDRVINFO: - rc = ethtool_get_drvinfo(dev, useraddr); + rc = ethtool_get_drvinfo(dev, devlink_state); break; case ETHTOOL_GREGS: rc = ethtool_get_regs(dev, useraddr); @@ -2762,7 +2902,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) rc = ethtool_set_rxnfc(dev, ethcmd, useraddr); break; case ETHTOOL_FLASHDEV: - rc = ethtool_flash_device(dev, useraddr); + rc = ethtool_flash_device(dev, devlink_state); break; case ETHTOOL_RESET: rc = ethtool_reset(dev, useraddr); @@ -2867,7 +3007,64 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) if (old_features != dev->features) netdev_features_change(dev); +out: + if (dev->dev.parent) + pm_runtime_put(dev->dev.parent); + + return rc; +} + +int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr) +{ + struct ethtool_devlink_compat *state; + u32 ethcmd; + int rc; + + if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) + return -EFAULT; + + state = kzalloc(sizeof(*state), GFP_KERNEL); + if (!state) + return -ENOMEM; + + switch (ethcmd) { + case ETHTOOL_FLASHDEV: + if (copy_from_user(&state->efl, useraddr, sizeof(state->efl))) { + rc = -EFAULT; + goto exit_free; + } + state->efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; + break; + } + + rtnl_lock(); + rc = __dev_ethtool(net, ifr, useraddr, ethcmd, state); + rtnl_unlock(); + if (rc) + goto exit_free; + + switch (ethcmd) { + case ETHTOOL_FLASHDEV: + if (state->devlink) + rc = devlink_compat_flash_update(state->devlink, + state->efl.data); + break; + case ETHTOOL_GDRVINFO: + if (state->devlink) + devlink_compat_running_version(state->devlink, + state->info.fw_version, + sizeof(state->info.fw_version)); + if (copy_to_user(useraddr, &state->info, sizeof(state->info))) { + rc = -EFAULT; + goto exit_free; + } + break; + } +exit_free: + if (state->devlink) + devlink_put(state->devlink); + kfree(state); return rc; } diff --git a/net/ethtool/module.c b/net/ethtool/module.c new file mode 100644 index 000000000000..bc2cef11bbda --- /dev/null +++ b/net/ethtool/module.c @@ -0,0 +1,180 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/ethtool.h> + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct module_req_info { + struct ethnl_req_info base; +}; + +struct module_reply_data { + struct ethnl_reply_data base; + struct ethtool_module_power_mode_params power; +}; + +#define MODULE_REPDATA(__reply_base) \ + container_of(__reply_base, struct module_reply_data, base) + +/* MODULE_GET */ + +const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1] = { + [ETHTOOL_A_MODULE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static int module_get_power_mode(struct net_device *dev, + struct module_reply_data *data, + struct netlink_ext_ack *extack) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + + if (!ops->get_module_power_mode) + return 0; + + return ops->get_module_power_mode(dev, &data->power, extack); +} + +static int module_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct module_reply_data *data = MODULE_REPDATA(reply_base); + struct netlink_ext_ack *extack = info ? info->extack : NULL; + struct net_device *dev = reply_base->dev; + int ret; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + ret = module_get_power_mode(dev, data, extack); + if (ret < 0) + goto out_complete; + +out_complete: + ethnl_ops_complete(dev); + return ret; +} + +static int module_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + struct module_reply_data *data = MODULE_REPDATA(reply_base); + int len = 0; + + if (data->power.policy) + len += nla_total_size(sizeof(u8)); /* _MODULE_POWER_MODE_POLICY */ + + if (data->power.mode) + len += nla_total_size(sizeof(u8)); /* _MODULE_POWER_MODE */ + + return len; +} + +static int module_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct module_reply_data *data = MODULE_REPDATA(reply_base); + + if (data->power.policy && + nla_put_u8(skb, ETHTOOL_A_MODULE_POWER_MODE_POLICY, + data->power.policy)) + return -EMSGSIZE; + + if (data->power.mode && + nla_put_u8(skb, ETHTOOL_A_MODULE_POWER_MODE, data->power.mode)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_module_request_ops = { + .request_cmd = ETHTOOL_MSG_MODULE_GET, + .reply_cmd = ETHTOOL_MSG_MODULE_GET_REPLY, + .hdr_attr = ETHTOOL_A_MODULE_HEADER, + .req_info_size = sizeof(struct module_req_info), + .reply_data_size = sizeof(struct module_reply_data), + + .prepare_data = module_prepare_data, + .reply_size = module_reply_size, + .fill_reply = module_fill_reply, +}; + +/* MODULE_SET */ + +const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1] = { + [ETHTOOL_A_MODULE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_MODULE_POWER_MODE_POLICY] = + NLA_POLICY_RANGE(NLA_U8, ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH, + ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO), +}; + +static int module_set_power_mode(struct net_device *dev, struct nlattr **tb, + bool *p_mod, struct netlink_ext_ack *extack) +{ + struct ethtool_module_power_mode_params power = {}; + struct ethtool_module_power_mode_params power_new; + const struct ethtool_ops *ops = dev->ethtool_ops; + int ret; + + if (!tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]) + return 0; + + if (!ops->get_module_power_mode || !ops->set_module_power_mode) { + NL_SET_ERR_MSG_ATTR(extack, + tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY], + "Setting power mode policy is not supported by this device"); + return -EOPNOTSUPP; + } + + power_new.policy = nla_get_u8(tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]); + ret = ops->get_module_power_mode(dev, &power, extack); + if (ret < 0) + return ret; + + if (power_new.policy == power.policy) + return 0; + *p_mod = true; + + return ops->set_module_power_mode(dev, &power_new, extack); +} + +int ethnl_set_module(struct sk_buff *skb, struct genl_info *info) +{ + struct ethnl_req_info req_info = {}; + struct nlattr **tb = info->attrs; + struct net_device *dev; + bool mod = false; + int ret; + + ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_MODULE_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + dev = req_info.dev; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = module_set_power_mode(dev, tb, &mod, info->extack); + if (ret < 0) + goto out_ops; + + if (!mod) + goto out_ops; + + ethtool_notify(dev, ETHTOOL_MSG_MODULE_NTF, NULL); + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); + dev_put(dev); + return ret; +} diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 73e0f5b626bf..38b44c0291b1 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -2,6 +2,7 @@ #include <net/sock.h> #include <linux/ethtool_netlink.h> +#include <linux/pm_runtime.h> #include "netlink.h" static struct genl_family ethtool_genl_family; @@ -29,6 +30,44 @@ const struct nla_policy ethnl_header_policy_stats[] = { ETHTOOL_FLAGS_STATS), }; +int ethnl_ops_begin(struct net_device *dev) +{ + int ret; + + if (!dev) + return -ENODEV; + + if (dev->dev.parent) + pm_runtime_get_sync(dev->dev.parent); + + if (!netif_device_present(dev)) { + ret = -ENODEV; + goto err; + } + + if (dev->ethtool_ops->begin) { + ret = dev->ethtool_ops->begin(dev); + if (ret) + goto err; + } + + return 0; +err: + if (dev->dev.parent) + pm_runtime_put(dev->dev.parent); + + return ret; +} + +void ethnl_ops_complete(struct net_device *dev) +{ + if (dev->ethtool_ops->complete) + dev->ethtool_ops->complete(dev); + + if (dev->dev.parent) + pm_runtime_put(dev->dev.parent); +} + /** * ethnl_parse_header_dev_get() - parse request header * @req_info: structure to put results into @@ -101,12 +140,6 @@ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, return -EINVAL; } - if (dev && !netif_device_present(dev)) { - dev_put(dev); - NL_SET_ERR_MSG(extack, "device not present"); - return -ENODEV; - } - req_info->dev = dev; req_info->flags = flags; return 0; @@ -249,6 +282,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_MODULE_EEPROM_GET] = ðnl_module_eeprom_request_ops, [ETHTOOL_MSG_STATS_GET] = ðnl_stats_request_ops, [ETHTOOL_MSG_PHC_VCLOCKS_GET] = ðnl_phc_vclocks_request_ops, + [ETHTOOL_MSG_MODULE_GET] = ðnl_module_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -365,8 +399,7 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) ops->cleanup_data(reply_data); genlmsg_end(rskb, reply_payload); - if (req_info->dev) - dev_put(req_info->dev); + dev_put(req_info->dev); kfree(reply_data); kfree(req_info); return genlmsg_reply(rskb, info); @@ -378,8 +411,7 @@ err_cleanup: if (ops->cleanup_data) ops->cleanup_data(reply_data); err_dev: - if (req_info->dev) - dev_put(req_info->dev); + dev_put(req_info->dev); kfree(reply_data); kfree(req_info); return ret; @@ -562,6 +594,7 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_PAUSE_NTF] = ðnl_pause_request_ops, [ETHTOOL_MSG_EEE_NTF] = ðnl_eee_request_ops, [ETHTOOL_MSG_FEC_NTF] = ðnl_fec_request_ops, + [ETHTOOL_MSG_MODULE_NTF] = ðnl_module_request_ops, }; /* default notification handler */ @@ -655,6 +688,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_PAUSE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_FEC_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_MODULE_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) @@ -968,6 +1002,22 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_phc_vclocks_get_policy, .maxattr = ARRAY_SIZE(ethnl_phc_vclocks_get_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_MODULE_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_module_get_policy, + .maxattr = ARRAY_SIZE(ethnl_module_get_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_MODULE_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_module, + .policy = ethnl_module_set_policy, + .maxattr = ARRAY_SIZE(ethnl_module_set_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 3fc395c86702..836ee7157848 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -247,19 +247,8 @@ struct ethnl_reply_data { struct net_device *dev; }; -static inline int ethnl_ops_begin(struct net_device *dev) -{ - if (dev && dev->ethtool_ops->begin) - return dev->ethtool_ops->begin(dev); - else - return 0; -} - -static inline void ethnl_ops_complete(struct net_device *dev) -{ - if (dev && dev->ethtool_ops->complete) - dev->ethtool_ops->complete(dev); -} +int ethnl_ops_begin(struct net_device *dev); +void ethnl_ops_complete(struct net_device *dev); /** * struct ethnl_request_ops - unified handling of GET requests @@ -348,6 +337,7 @@ extern const struct ethnl_request_ops ethnl_fec_request_ops; extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops; extern const struct ethnl_request_ops ethnl_stats_request_ops; extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops; +extern const struct ethnl_request_ops ethnl_module_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -370,7 +360,7 @@ extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_TX + 1]; extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1]; extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; -extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL + 1]; +extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1]; extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_HEADER + 1]; extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1]; extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1]; @@ -384,6 +374,8 @@ extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS + 1]; extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1]; extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1]; +extern const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1]; +extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); @@ -402,6 +394,7 @@ int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_start(struct netlink_callback *cb); int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_module(struct sk_buff *skb, struct genl_info *info); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 26c32407f029..737e4f17e1c6 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -309,9 +309,9 @@ static void send_hsr_supervision_frame(struct hsr_port *master, } spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags); - hsr_stag->HSR_TLV_type = type; + hsr_stag->tlv.HSR_TLV_type = type; /* TODO: Why 12 in HSRv0? */ - hsr_stag->HSR_TLV_length = hsr->prot_version ? + hsr_stag->tlv.HSR_TLV_length = hsr->prot_version ? sizeof(struct hsr_sup_payload) : 12; /* Payload: MacAddressA */ @@ -350,8 +350,8 @@ static void send_prp_supervision_frame(struct hsr_port *master, spin_lock_irqsave(&master->hsr->seqnr_lock, irqflags); hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr); hsr->sup_sequence_nr++; - hsr_stag->HSR_TLV_type = PRP_TLV_LIFE_CHECK_DD; - hsr_stag->HSR_TLV_length = sizeof(struct hsr_sup_payload); + hsr_stag->tlv.HSR_TLV_type = PRP_TLV_LIFE_CHECK_DD; + hsr_stag->tlv.HSR_TLV_length = sizeof(struct hsr_sup_payload); /* Payload: MacAddressA */ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload)); @@ -493,7 +493,7 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2], INIT_LIST_HEAD(&hsr->self_node_db); spin_lock_init(&hsr->list_lock); - ether_addr_copy(hsr_dev->dev_addr, slave[0]->dev_addr); + eth_hw_addr_set(hsr_dev, slave[0]->dev_addr); /* initialize protocol specific functions */ if (protocol_version == PRP_V1) { diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index ceb8afb2a62f..e59cbb4f0cd1 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -37,6 +37,8 @@ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) struct ethhdr *eth_hdr; struct hsr_sup_tag *hsr_sup_tag; struct hsrv1_ethhdr_sp *hsr_V1_hdr; + struct hsr_sup_tlv *hsr_sup_tlv; + u16 total_length = 0; WARN_ON_ONCE(!skb_mac_header_was_set(skb)); eth_hdr = (struct ethhdr *)skb_mac_header(skb); @@ -53,23 +55,63 @@ static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb) /* Get the supervision header from correct location. */ if (eth_hdr->h_proto == htons(ETH_P_HSR)) { /* Okay HSRv1. */ + total_length = sizeof(struct hsrv1_ethhdr_sp); + if (!pskb_may_pull(skb, total_length)) + return false; + hsr_V1_hdr = (struct hsrv1_ethhdr_sp *)skb_mac_header(skb); if (hsr_V1_hdr->hsr.encap_proto != htons(ETH_P_PRP)) return false; hsr_sup_tag = &hsr_V1_hdr->hsr_sup; } else { + total_length = sizeof(struct hsrv0_ethhdr_sp); + if (!pskb_may_pull(skb, total_length)) + return false; + hsr_sup_tag = &((struct hsrv0_ethhdr_sp *)skb_mac_header(skb))->hsr_sup; } - if (hsr_sup_tag->HSR_TLV_type != HSR_TLV_ANNOUNCE && - hsr_sup_tag->HSR_TLV_type != HSR_TLV_LIFE_CHECK && - hsr_sup_tag->HSR_TLV_type != PRP_TLV_LIFE_CHECK_DD && - hsr_sup_tag->HSR_TLV_type != PRP_TLV_LIFE_CHECK_DA) + if (hsr_sup_tag->tlv.HSR_TLV_type != HSR_TLV_ANNOUNCE && + hsr_sup_tag->tlv.HSR_TLV_type != HSR_TLV_LIFE_CHECK && + hsr_sup_tag->tlv.HSR_TLV_type != PRP_TLV_LIFE_CHECK_DD && + hsr_sup_tag->tlv.HSR_TLV_type != PRP_TLV_LIFE_CHECK_DA) return false; - if (hsr_sup_tag->HSR_TLV_length != 12 && - hsr_sup_tag->HSR_TLV_length != sizeof(struct hsr_sup_payload)) + if (hsr_sup_tag->tlv.HSR_TLV_length != 12 && + hsr_sup_tag->tlv.HSR_TLV_length != sizeof(struct hsr_sup_payload)) + return false; + + /* Get next tlv */ + total_length += sizeof(struct hsr_sup_tlv) + hsr_sup_tag->tlv.HSR_TLV_length; + if (!pskb_may_pull(skb, total_length)) + return false; + skb_pull(skb, total_length); + hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data; + skb_push(skb, total_length); + + /* if this is a redbox supervision frame we need to verify + * that more data is available + */ + if (hsr_sup_tlv->HSR_TLV_type == PRP_TLV_REDBOX_MAC) { + /* tlv length must be a length of a mac address */ + if (hsr_sup_tlv->HSR_TLV_length != sizeof(struct hsr_sup_payload)) + return false; + + /* make sure another tlv follows */ + total_length += sizeof(struct hsr_sup_tlv) + hsr_sup_tlv->HSR_TLV_length; + if (!pskb_may_pull(skb, total_length)) + return false; + + /* get next tlv */ + skb_pull(skb, total_length); + hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data; + skb_push(skb, total_length); + } + + /* end of tlvs must follow at the end */ + if (hsr_sup_tlv->HSR_TLV_type == HSR_TLV_EOT && + hsr_sup_tlv->HSR_TLV_length != 0) return false; return true; diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index e31949479305..0775f0f95dbf 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -76,8 +76,8 @@ static struct hsr_node *find_node_by_addr_A(struct list_head *node_db, * frames from self that's been looped over the HSR ring. */ int hsr_create_self_node(struct hsr_priv *hsr, - unsigned char addr_a[ETH_ALEN], - unsigned char addr_b[ETH_ALEN]) + const unsigned char addr_a[ETH_ALEN], + const unsigned char addr_b[ETH_ALEN]) { struct list_head *self_node_db = &hsr->self_node_db; struct hsr_node *node, *oldnode; @@ -265,11 +265,14 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) struct hsr_port *port_rcv = frame->port_rcv; struct hsr_priv *hsr = port_rcv->hsr; struct hsr_sup_payload *hsr_sp; + struct hsr_sup_tlv *hsr_sup_tlv; struct hsr_node *node_real; struct sk_buff *skb = NULL; struct list_head *node_db; struct ethhdr *ethhdr; int i; + unsigned int pull_size = 0; + unsigned int total_pull_size = 0; /* Here either frame->skb_hsr or frame->skb_prp should be * valid as supervision frame always will have protocol @@ -284,18 +287,26 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) if (!skb) return; - ethhdr = (struct ethhdr *)skb_mac_header(skb); - /* Leave the ethernet header. */ - skb_pull(skb, sizeof(struct ethhdr)); + pull_size = sizeof(struct ethhdr); + skb_pull(skb, pull_size); + total_pull_size += pull_size; + + ethhdr = (struct ethhdr *)skb_mac_header(skb); /* And leave the HSR tag. */ - if (ethhdr->h_proto == htons(ETH_P_HSR)) - skb_pull(skb, sizeof(struct hsr_tag)); + if (ethhdr->h_proto == htons(ETH_P_HSR)) { + pull_size = sizeof(struct ethhdr); + skb_pull(skb, pull_size); + total_pull_size += pull_size; + } /* And leave the HSR sup tag. */ - skb_pull(skb, sizeof(struct hsr_sup_tag)); + pull_size = sizeof(struct hsr_tag); + skb_pull(skb, pull_size); + total_pull_size += pull_size; + /* get HSR sup payload */ hsr_sp = (struct hsr_sup_payload *)skb->data; /* Merge node_curr (registered on macaddress_B) into node_real */ @@ -312,6 +323,37 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) /* Node has already been merged */ goto done; + /* Leave the first HSR sup payload. */ + pull_size = sizeof(struct hsr_sup_payload); + skb_pull(skb, pull_size); + total_pull_size += pull_size; + + /* Get second supervision tlv */ + hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data; + /* And check if it is a redbox mac TLV */ + if (hsr_sup_tlv->HSR_TLV_type == PRP_TLV_REDBOX_MAC) { + /* We could stop here after pushing hsr_sup_payload, + * or proceed and allow macaddress_B and for redboxes. + */ + /* Sanity check length */ + if (hsr_sup_tlv->HSR_TLV_length != 6) + goto done; + + /* Leave the second HSR sup tlv. */ + pull_size = sizeof(struct hsr_sup_tlv); + skb_pull(skb, pull_size); + total_pull_size += pull_size; + + /* Get redbox mac address. */ + hsr_sp = (struct hsr_sup_payload *)skb->data; + + /* Check if redbox mac and node mac are equal. */ + if (!ether_addr_equal(node_real->macaddress_A, hsr_sp->macaddress_A)) { + /* This is a redbox supervision frame for a VDAN! */ + goto done; + } + } + ether_addr_copy(node_real->macaddress_B, ethhdr->h_source); for (i = 0; i < HSR_PT_PORTS; i++) { if (!node_curr->time_in_stale[i] && @@ -331,11 +373,8 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) kfree_rcu(node_curr, rcu_head); done: - /* PRP uses v0 header */ - if (ethhdr->h_proto == htons(ETH_P_HSR)) - skb_push(skb, sizeof(struct hsrv1_ethhdr_sp)); - else - skb_push(skb, sizeof(struct hsrv0_ethhdr_sp)); + /* Push back here */ + skb_push(skb, total_pull_size); } /* 'skb' is a frame meant for this host, that is to be passed to upper layers. diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index d9628e7a5f05..bdbb8c822ba1 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -48,8 +48,8 @@ int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node, void hsr_prune_nodes(struct timer_list *t); int hsr_create_self_node(struct hsr_priv *hsr, - unsigned char addr_a[ETH_ALEN], - unsigned char addr_b[ETH_ALEN]); + const unsigned char addr_a[ETH_ALEN], + const unsigned char addr_b[ETH_ALEN]); void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos, unsigned char addr[ETH_ALEN]); diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index f7e284f23b1f..b099c3150150 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -75,7 +75,7 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (port->type == HSR_PT_SLAVE_A) { - ether_addr_copy(master->dev->dev_addr, dev->dev_addr); + eth_hw_addr_set(master->dev, dev->dev_addr); call_netdevice_notifiers(NETDEV_CHANGEADDR, master->dev); } diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 53d1f7a82463..043e4e9a1694 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -35,13 +35,15 @@ * HSR_NODE_FORGET_TIME? */ #define PRUNE_PERIOD 3000 /* ms */ - +#define HSR_TLV_EOT 0 /* End of TLVs */ #define HSR_TLV_ANNOUNCE 22 #define HSR_TLV_LIFE_CHECK 23 /* PRP V1 life check for Duplicate discard */ #define PRP_TLV_LIFE_CHECK_DD 20 /* PRP V1 life check for Duplicate Accept */ #define PRP_TLV_LIFE_CHECK_DA 21 +/* PRP V1 life redundancy box MAC address */ +#define PRP_TLV_REDBOX_MAC 30 /* HSR Tag. * As defined in IEC-62439-3:2010, the HSR tag is really { ethertype = 0x88FB, @@ -94,14 +96,18 @@ struct hsr_vlan_ethhdr { struct hsr_tag hsr_tag; } __packed; +struct hsr_sup_tlv { + u8 HSR_TLV_type; + u8 HSR_TLV_length; +}; + /* HSR/PRP Supervision Frame data types. * Field names as defined in the IEC:2010 standard for HSR. */ struct hsr_sup_tag { - __be16 path_and_HSR_ver; - __be16 sequence_nr; - __u8 HSR_TLV_type; - __u8 HSR_TLV_length; + __be16 path_and_HSR_ver; + __be16 sequence_nr; + struct hsr_sup_tlv tlv; } __packed; struct hsr_sup_payload { diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 3297e7fa9945..2cf62718a282 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -157,7 +157,7 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev, lowpan_802154_dev(ldev)->wdev = wdev; /* Set the lowpan hardware address to the wpan hardware address. */ - memcpy(ldev->dev_addr, wdev->dev_addr, IEEE802154_ADDR_LEN); + __dev_addr_set(ldev, wdev->dev_addr, IEEE802154_ADDR_LEN); /* We need headroom for possible wpan_dev_hard_header call and tailroom * for encryption/fcs handling. The lowpan interface will replace * the IPv6 header with 6LoWPAN header. At worst case the 6LoWPAN diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c index 88215b5c93aa..dd5a45f8a78a 100644 --- a/net/ieee802154/nl-phy.c +++ b/net/ieee802154/nl-phy.c @@ -340,8 +340,7 @@ nla_put_failure: out_dev: wpan_phy_put(phy); out: - if (dev) - dev_put(dev); + dev_put(dev); return rc; } diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 0cf2374c143b..277124f206e0 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -2226,8 +2226,7 @@ static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) { struct wpan_dev *wpan_dev = info->user_ptr[1]; - if (wpan_dev->netdev) - dev_put(wpan_dev->netdev); + dev_put(wpan_dev->netdev); } else { dev_put(info->user_ptr[1]); } diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index c25f7617770c..7bb9ef35c570 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -41,8 +41,7 @@ ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr) ieee802154_devaddr_to_raw(hwaddr, addr->extended_addr); rcu_read_lock(); dev = dev_getbyhwaddr_rcu(net, ARPHRD_IEEE802154, hwaddr); - if (dev) - dev_hold(dev); + dev_hold(dev); rcu_read_unlock(); break; case IEEE802154_ADDR_SHORT: @@ -129,7 +128,7 @@ static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg, int ret = -ENOIOCTLCMD; struct net_device *dev; - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + if (get_user_ifreq(&ifr, NULL, arg)) return -EFAULT; ifr.ifr_name[IFNAMSIZ-1] = 0; @@ -143,7 +142,7 @@ static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg, if (dev->type == ARPHRD_IEEE802154 && dev->netdev_ops->ndo_do_ioctl) ret = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, cmd); - if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq))) + if (!ret && put_user_ifreq(&ifr, arg)) ret = -EFAULT; dev_put(dev); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 54648181dd56..0189e3cd4a7d 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -133,13 +133,9 @@ void inet_sock_destruct(struct sock *sk) struct inet_sock *inet = inet_sk(sk); __skb_queue_purge(&sk->sk_receive_queue); - if (sk->sk_rx_skb_cache) { - __kfree_skb(sk->sk_rx_skb_cache); - sk->sk_rx_skb_cache = NULL; - } __skb_queue_purge(&sk->sk_error_queue); - sk_mem_reclaim(sk); + sk_mem_reclaim_final(sk); if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { pr_err("Attempt to release TCP socket in state %d %p\n", @@ -154,7 +150,7 @@ void inet_sock_destruct(struct sock *sk) WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); WARN_ON(sk->sk_wmem_queued); - WARN_ON(sk->sk_forward_alloc); + WARN_ON(sk_forward_alloc_get(sk)); kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); @@ -452,7 +448,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, - BPF_CGROUP_INET4_BIND, &flags); + CGROUP_INET4_BIND, &flags); if (err) return err; @@ -773,26 +769,28 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr); sin->sin_family = AF_INET; + lock_sock(sk); if (peer) { if (!inet->inet_dport || (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && - peer == 1)) + peer == 1)) { + release_sock(sk); return -ENOTCONN; + } sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET4_GETPEERNAME, - NULL); + BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, + CGROUP_INET4_GETPEERNAME); } else { __be32 addr = inet->inet_rcv_saddr; if (!addr) addr = inet->inet_saddr; sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET4_GETSOCKNAME, - NULL); + BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, + CGROUP_INET4_GETSOCKNAME); } + release_sock(sk); memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } @@ -953,10 +951,10 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCGIFNETMASK: case SIOCGIFDSTADDR: case SIOCGIFPFLAGS: - if (copy_from_user(&ifr, p, sizeof(struct ifreq))) + if (get_user_ifreq(&ifr, NULL, p)) return -EFAULT; err = devinet_ioctl(net, cmd, &ifr); - if (!err && copy_to_user(p, &ifr, sizeof(struct ifreq))) + if (!err && put_user_ifreq(&ifr, p)) err = -EFAULT; break; @@ -966,7 +964,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFDSTADDR: case SIOCSIFPFLAGS: case SIOCSIFFLAGS: - if (copy_from_user(&ifr, p, sizeof(struct ifreq))) + if (get_user_ifreq(&ifr, NULL, p)) return -EFAULT; err = devinet_ioctl(net, cmd, &ifr); break; @@ -1666,12 +1664,6 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, } EXPORT_SYMBOL_GPL(inet_ctl_sock_create); -u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt) -{ - return *(((unsigned long *)per_cpu_ptr(mib, cpu)) + offt); -} -EXPORT_SYMBOL_GPL(snmp_get_cpu_field); - unsigned long snmp_fold_field(void __percpu *mib, int offt) { unsigned long res = 0; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 922dd73e5740..857a144b1ea9 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1247,6 +1247,8 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_change_info *change_info; + struct in_device *in_dev; + bool evict_nocarrier; switch (event) { case NETDEV_CHANGEADDR: @@ -1257,7 +1259,14 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&arp_tbl, dev); - if (!netif_carrier_ok(dev)) + + in_dev = __in_dev_get_rtnl(dev); + if (!in_dev) + evict_nocarrier = true; + else + evict_nocarrier = IN_DEV_ARP_EVICT_NOCARRIER(in_dev); + + if (evict_nocarrier && !netif_carrier_ok(dev)) neigh_carrier_down(&arp_tbl, dev); break; default: diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 9e41eff4a685..2cf02b4d77fb 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -10,6 +10,9 @@ #include <net/tcp.h> #include <net/bpf_sk_storage.h> +/* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. */ +extern struct bpf_struct_ops bpf_tcp_congestion_ops; + static u32 optional_ops[] = { offsetof(struct tcp_congestion_ops, init), offsetof(struct tcp_congestion_ops, release), @@ -78,14 +81,7 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { - if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) - return false; - if (type != BPF_READ) - return false; - if (off % size != 0) - return false; - - if (!btf_ctx_access(off, size, type, prog, info)) + if (!bpf_tracing_btf_ctx_access(off, size, type, prog, info)) return false; if (info->reg_type == PTR_TO_BTF_ID && info->btf_id == sock_id) @@ -163,6 +159,19 @@ static const struct bpf_func_proto bpf_tcp_send_ack_proto = { .arg2_type = ARG_ANYTHING, }; +static u32 prog_ops_moff(const struct bpf_prog *prog) +{ + const struct btf_member *m; + const struct btf_type *t; + u32 midx; + + midx = prog->expected_attach_type; + t = bpf_tcp_congestion_ops.type; + m = &btf_type_member(t)[midx]; + + return btf_member_bit_offset(t, m) / 8; +} + static const struct bpf_func_proto * bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -174,6 +183,28 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_setsockopt: + /* Does not allow release() to call setsockopt. + * release() is called when the current bpf-tcp-cc + * is retiring. It is not allowed to call + * setsockopt() to make further changes which + * may potentially allocate new resources. + */ + if (prog_ops_moff(prog) != + offsetof(struct tcp_congestion_ops, release)) + return &bpf_sk_setsockopt_proto; + return NULL; + case BPF_FUNC_getsockopt: + /* Since get/setsockopt is usually expected to + * be available together, disable getsockopt for + * release also to avoid usage surprise. + * The bpf-tcp-cc already has a more powerful way + * to read tcp_sock from the PTR_TO_BTF_ID. + */ + if (prog_ops_moff(prog) != + offsetof(struct tcp_congestion_ops, release)) + return &bpf_sk_getsockopt_proto; + return NULL; default: return bpf_base_func_proto(func_id); } @@ -185,41 +216,13 @@ BTF_ID(func, tcp_reno_cong_avoid) BTF_ID(func, tcp_reno_undo_cwnd) BTF_ID(func, tcp_slow_start) BTF_ID(func, tcp_cong_avoid_ai) -#ifdef CONFIG_X86 -#ifdef CONFIG_DYNAMIC_FTRACE -#if IS_BUILTIN(CONFIG_TCP_CONG_CUBIC) -BTF_ID(func, cubictcp_init) -BTF_ID(func, cubictcp_recalc_ssthresh) -BTF_ID(func, cubictcp_cong_avoid) -BTF_ID(func, cubictcp_state) -BTF_ID(func, cubictcp_cwnd_event) -BTF_ID(func, cubictcp_acked) -#endif -#if IS_BUILTIN(CONFIG_TCP_CONG_DCTCP) -BTF_ID(func, dctcp_init) -BTF_ID(func, dctcp_update_alpha) -BTF_ID(func, dctcp_cwnd_event) -BTF_ID(func, dctcp_ssthresh) -BTF_ID(func, dctcp_cwnd_undo) -BTF_ID(func, dctcp_state) -#endif -#if IS_BUILTIN(CONFIG_TCP_CONG_BBR) -BTF_ID(func, bbr_init) -BTF_ID(func, bbr_main) -BTF_ID(func, bbr_sndbuf_expand) -BTF_ID(func, bbr_undo_cwnd) -BTF_ID(func, bbr_cwnd_event) -BTF_ID(func, bbr_ssthresh) -BTF_ID(func, bbr_min_tso_segs) -BTF_ID(func, bbr_set_state) -#endif -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_X86 */ BTF_SET_END(bpf_tcp_ca_kfunc_ids) -static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id) +static bool bpf_tcp_ca_check_kfunc_call(u32 kfunc_btf_id, struct module *owner) { - return btf_id_set_contains(&bpf_tcp_ca_kfunc_ids, kfunc_btf_id); + if (btf_id_set_contains(&bpf_tcp_ca_kfunc_ids, kfunc_btf_id)) + return true; + return bpf_check_mod_kfunc_call(&bpf_tcp_ca_kfunc_list, kfunc_btf_id, owner); } static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { @@ -286,9 +289,6 @@ static void bpf_tcp_ca_unreg(void *kdata) tcp_unregister_congestion_control(kdata); } -/* Avoid sparse warning. It is only used in bpf_struct_ops.c. */ -extern struct bpf_struct_ops bpf_tcp_congestion_ops; - struct bpf_struct_ops bpf_tcp_congestion_ops = { .verifier_ops = &bpf_tcp_ca_verifier_ops, .reg = bpf_tcp_ca_reg, diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 7fbd0b532f52..62d5f99760aa 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -73,7 +73,7 @@ struct cipso_v4_map_cache_entry { static struct cipso_v4_map_cache_bkt *cipso_v4_cache; /* Restricted bitmap (tag #1) flags */ -int cipso_v4_rbm_optfmt = 0; +int cipso_v4_rbm_optfmt; int cipso_v4_rbm_strictvalid = 1; /* @@ -465,16 +465,14 @@ void cipso_v4_doi_free(struct cipso_v4_doi *doi_def) if (!doi_def) return; - if (doi_def->map.std) { - switch (doi_def->type) { - case CIPSO_V4_MAP_TRANS: - kfree(doi_def->map.std->lvl.cipso); - kfree(doi_def->map.std->lvl.local); - kfree(doi_def->map.std->cat.cipso); - kfree(doi_def->map.std->cat.local); - kfree(doi_def->map.std); - break; - } + switch (doi_def->type) { + case CIPSO_V4_MAP_TRANS: + kfree(doi_def->map.std->lvl.cipso); + kfree(doi_def->map.std->lvl.local); + kfree(doi_def->map.std->cat.cipso); + kfree(doi_def->map.std->cat.local); + kfree(doi_def->map.std); + break; } kfree(doi_def); } diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 4a8550c49202..48f337ccf949 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -9,7 +9,6 @@ #include <linux/types.h> #include <linux/module.h> -#include <linux/ip.h> #include <linux/in.h> #include <net/ip.h> #include <net/sock.h> diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 73721a4448bd..ec73a0d52d3e 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -75,6 +75,7 @@ static struct ipv4_devconf ipv4_devconf = { [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, + [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1, }, }; @@ -87,6 +88,7 @@ static struct ipv4_devconf ipv4_devconf_dflt = { [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/, [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/, + [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1, }, }; @@ -215,7 +217,7 @@ static void devinet_sysctl_unregister(struct in_device *idev) static struct in_ifaddr *inet_alloc_ifa(void) { - return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL); + return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL_ACCOUNT); } static void inet_rcu_free_ifa(struct rcu_head *head) @@ -1243,7 +1245,7 @@ out: return ret; } -static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) +int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size) { struct in_device *in_dev = __in_dev_get_rtnl(dev); const struct in_ifaddr *ifa; @@ -1950,7 +1952,8 @@ static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = { }; static int inet_validate_link_af(const struct net_device *dev, - const struct nlattr *nla) + const struct nlattr *nla, + struct netlink_ext_ack *extack) { struct nlattr *a, *tb[IFLA_INET_MAX+1]; int err, rem; @@ -1959,7 +1962,7 @@ static int inet_validate_link_af(const struct net_device *dev, return -EAFNOSUPPORT; err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, - inet_af_policy, NULL); + inet_af_policy, extack); if (err < 0) return err; @@ -2424,11 +2427,15 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write, int *valp = ctl->data; int val = *valp; loff_t pos = *ppos; - int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + struct net *net = ctl->extra2; + int ret; - if (write && *valp != val) { - struct net *net = ctl->extra2; + if (write && !ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + if (write && *valp != val) { if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { if (!rtnl_trylock()) { /* Restore the original values before restarting */ @@ -2527,6 +2534,8 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), + DEVINET_SYSCTL_RW_ENTRY(ARP_EVICT_NOCARRIER, + "arp_evict_nocarrier"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION, "force_igmp_version"), @@ -2762,8 +2771,6 @@ void __init devinet_init(void) INIT_HLIST_HEAD(&inet_addr_lst[i]); register_pernet_subsys(&devinet_ops); - - register_gifconf(PF_INET, inet_gifconf); register_netdevice_notifier(&ip_netdev_notifier); queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index a09e36c4a413..851f542928a3 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -97,7 +97,6 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, static void esp_ssg_unref(struct xfrm_state *x, void *tmp) { - struct esp_output_extra *extra = esp_tmp_extra(tmp); struct crypto_aead *aead = x->data; int extralen = 0; u8 *iv; @@ -105,9 +104,8 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) struct scatterlist *sg; if (x->props.flags & XFRM_STATE_ESN) - extralen += sizeof(*extra); + extralen += sizeof(struct esp_output_extra); - extra = esp_tmp_extra(tmp); iv = esp_tmp_iv(aead, tmp, extralen); req = esp_tmp_req(aead, iv); diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c index 0c28bd469a68..0e23ade74493 100644 --- a/net/ipv4/fib_notifier.c +++ b/net/ipv4/fib_notifier.c @@ -6,7 +6,6 @@ #include <linux/export.h> #include <net/net_namespace.h> #include <net/fib_notifier.h> -#include <net/netns/ipv4.h> #include <net/ip_fib.h> int call_fib4_notifier(struct notifier_block *nb, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 4c0c33e4710d..3364cb9c67e0 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -208,9 +208,7 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp) void fib_nh_common_release(struct fib_nh_common *nhc) { - if (nhc->nhc_dev) - dev_put(nhc->nhc_dev); - + dev_put(nhc->nhc_dev); lwtstate_put(nhc->nhc_lwtstate); rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output); rt_fibinfo_free(&nhc->nhc_rth_input); @@ -260,7 +258,7 @@ EXPORT_SYMBOL_GPL(free_fib_info); void fib_release_info(struct fib_info *fi) { spin_lock_bh(&fib_info_lock); - if (fi && --fi->fib_treeref == 0) { + if (fi && refcount_dec_and_test(&fi->fib_treeref)) { hlist_del(&fi->fib_hash); if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); @@ -1373,7 +1371,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg, if (!cfg->fc_mx) { fi = fib_find_info_nh(net, cfg); if (fi) { - fi->fib_treeref++; + refcount_inc(&fi->fib_treeref); return fi; } } @@ -1547,11 +1545,11 @@ link_it: if (ofi) { fi->fib_dead = 1; free_fib_info(fi); - ofi->fib_treeref++; + refcount_inc(&ofi->fib_treeref); return ofi; } - fi->fib_treeref++; + refcount_set(&fi->fib_treeref, 1); refcount_set(&fi->fib_clntref, 1); spin_lock_bh(&fib_info_lock); hlist_add_head(&fi->fib_hash, @@ -1663,7 +1661,7 @@ EXPORT_SYMBOL_GPL(fib_nexthop_info); #if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6) int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, - int nh_weight, u8 rt_family) + int nh_weight, u8 rt_family, u32 nh_tclassid) { const struct net_device *dev = nhc->nhc_dev; struct rtnexthop *rtnh; @@ -1681,6 +1679,9 @@ int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc, rtnh->rtnh_flags = flags; + if (nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh_tclassid)) + goto nla_put_failure; + /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh; @@ -1708,14 +1709,13 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) } for_nexthops(fi) { - if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight, - AF_INET) < 0) - goto nla_put_failure; + u32 nh_tclassid = 0; #ifdef CONFIG_IP_ROUTE_CLASSID - if (nh->nh_tclassid && - nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) - goto nla_put_failure; + nh_tclassid = nh->nh_tclassid; #endif + if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight, + AF_INET, nh_tclassid) < 0) + goto nla_put_failure; } endfor_nexthops(fi); mp_end: diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 25cf387cca5b..8060524f4256 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2380,11 +2380,11 @@ void __init fib_trie_init(void) { fn_alias_kmem = kmem_cache_create("ip_fib_alias", sizeof(struct fib_alias), - 0, SLAB_PANIC, NULL); + 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); trie_leaf_kmem = kmem_cache_create("ip_fib_trie", LEAF_SIZE, - 0, SLAB_PANIC, NULL); + 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); } struct fib_table *fib_trie_table(u32 id, struct fib_table *alias) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index e5f69b0bf3df..8fcbc6258ec5 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -230,8 +230,8 @@ static struct sk_buff *fou_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { + const struct net_offload __rcu **offloads; u8 proto = fou_from_sock(sk)->protocol; - const struct net_offload **offloads; const struct net_offload *ops; struct sk_buff *pp = NULL; @@ -263,10 +263,10 @@ out_unlock: static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { - const struct net_offload *ops; + const struct net_offload __rcu **offloads; u8 proto = fou_from_sock(sk)->protocol; + const struct net_offload *ops; int err = -ENOSYS; - const struct net_offload **offloads; rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; @@ -311,7 +311,7 @@ static struct sk_buff *gue_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { - const struct net_offload **offloads; + const struct net_offload __rcu **offloads; const struct net_offload *ops; struct sk_buff *pp = NULL; struct sk_buff *p; @@ -457,8 +457,8 @@ out: static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { - const struct net_offload **offloads; struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); + const struct net_offload __rcu **offloads; const struct net_offload *ops; unsigned int guehlen = 0; u8 proto; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index c695d294a5df..b7e277d8a84d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -1054,14 +1054,19 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr), &_iio); if (!ext_hdr || !iio) goto send_mal_query; - if (ntohs(iio->extobj_hdr.length) <= sizeof(iio->extobj_hdr)) + if (ntohs(iio->extobj_hdr.length) <= sizeof(iio->extobj_hdr) || + ntohs(iio->extobj_hdr.length) > sizeof(_iio)) goto send_mal_query; ident_len = ntohs(iio->extobj_hdr.length) - sizeof(iio->extobj_hdr); + iio = skb_header_pointer(skb, sizeof(_ext_hdr), + sizeof(iio->extobj_hdr) + ident_len, &_iio); + if (!iio) + goto send_mal_query; + status = 0; dev = NULL; switch (iio->extobj_hdr.class_type) { case ICMP_EXT_ECHO_CTYPE_NAME: - iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(_iio), &_iio); if (ident_len >= IFNAMSIZ) goto send_mal_query; memset(buff, 0, sizeof(buff)); @@ -1069,34 +1074,27 @@ bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) dev = dev_get_by_name(net, buff); break; case ICMP_EXT_ECHO_CTYPE_INDEX: - iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) + - sizeof(iio->ident.ifindex), &_iio); if (ident_len != sizeof(iio->ident.ifindex)) goto send_mal_query; dev = dev_get_by_index(net, ntohl(iio->ident.ifindex)); break; case ICMP_EXT_ECHO_CTYPE_ADDR: - if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + + if (ident_len < sizeof(iio->ident.addr.ctype3_hdr) || + ident_len != sizeof(iio->ident.addr.ctype3_hdr) + iio->ident.addr.ctype3_hdr.addrlen) goto send_mal_query; switch (ntohs(iio->ident.addr.ctype3_hdr.afi)) { case ICMP_AFI_IP: - iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr) + - sizeof(struct in_addr), &_iio); - if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + - sizeof(struct in_addr)) + if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in_addr)) goto send_mal_query; dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr); break; #if IS_ENABLED(CONFIG_IPV6) case ICMP_AFI_IP6: - iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(_iio), &_iio); - if (ident_len != sizeof(iio->ident.addr.ctype3_hdr) + - sizeof(struct in6_addr)) + if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in6_addr)) goto send_mal_query; dev = ipv6_stub->ipv6_dev_find(net, &iio->ident.addr.ip_addr.ipv6_addr, dev); - if (dev) - dev_hold(dev); + dev_hold(dev); break; #endif default: diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 00576bae183d..d2e2b3d18c66 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2240,7 +2240,7 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, iml->sfmode, psf->sl_count, psf->sl_addr, 0); RCU_INIT_POINTER(iml->sflist, NULL); /* decrease mem now to avoid the memleak warning */ - atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); + atomic_sub(struct_size(psf, sl_addr, psf->sl_max), &sk->sk_omem_alloc); kfree_rcu(psf, rcu); return err; } @@ -2389,7 +2389,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (psl) count += psl->sl_max; - newpsl = sock_kmalloc(sk, IP_SFLSIZE(count), GFP_KERNEL); + newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count), + GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; @@ -2400,7 +2401,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; /* decrease mem now to avoid the memleak warning */ - atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + atomic_sub(struct_size(psl, sl_addr, psl->sl_max), + &sk->sk_omem_alloc); kfree_rcu(psl, rcu); } rcu_assign_pointer(pmc->sflist, newpsl); @@ -2475,19 +2477,22 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) goto done; } if (msf->imsf_numsrc) { - newpsl = sock_kmalloc(sk, IP_SFLSIZE(msf->imsf_numsrc), - GFP_KERNEL); + newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, + msf->imsf_numsrc), + GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; } newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc; - memcpy(newpsl->sl_addr, msf->imsf_slist, - msf->imsf_numsrc * sizeof(msf->imsf_slist[0])); + memcpy(newpsl->sl_addr, msf->imsf_slist_flex, + flex_array_size(msf, imsf_slist_flex, msf->imsf_numsrc)); err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr, msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0); if (err) { - sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max)); + sock_kfree_s(sk, newpsl, + struct_size(newpsl, sl_addr, + newpsl->sl_max)); goto done; } } else { @@ -2500,7 +2505,8 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); /* decrease mem now to avoid the memleak warning */ - atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + atomic_sub(struct_size(psl, sl_addr, psl->sl_max), + &sk->sk_omem_alloc); kfree_rcu(psl, rcu); } else (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, @@ -2558,14 +2564,14 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, count = psl->sl_count; } copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc; - len = copycount * sizeof(psl->sl_addr[0]); + len = flex_array_size(psl, sl_addr, copycount); msf->imsf_numsrc = count; if (put_user(IP_MSFILTER_SIZE(copycount), optlen) || copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) { return -EFAULT; } if (len && - copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len)) + copy_to_user(&optval->imsf_slist_flex[0], psl->sl_addr, len)) return -EFAULT; return 0; done: @@ -2720,6 +2726,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u rv = 1; } else if (im) { if (src_addr) { + spin_lock_bh(&im->lock); for (psf = im->sources; psf; psf = psf->sf_next) { if (psf->sf_inaddr == src_addr) break; @@ -2730,6 +2737,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u im->sfcount[MCAST_EXCLUDE]; else rv = im->sfcount[MCAST_EXCLUDE] != 0; + spin_unlock_bh(&im->lock); } else rv = 1; /* unspecified source; tentatively allow */ } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 754013fa393b..f7fea3a7c5e6 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -534,7 +534,8 @@ out: atomic_read(&newsk->sk_rmem_alloc)); mem_cgroup_sk_alloc(newsk); if (newsk->sk_memcg && amt) - mem_cgroup_charge_skmem(newsk->sk_memcg, amt); + mem_cgroup_charge_skmem(newsk->sk_memcg, amt, + GFP_KERNEL | __GFP_NOFAIL); release_sock(newsk); } @@ -1014,7 +1015,7 @@ void inet_csk_destroy_sock(struct sock *sk) sk_refcnt_debug_release(sk); - percpu_counter_dec(sk->sk_prot->orphan_count); + this_cpu_dec(*sk->sk_prot->orphan_count); sock_put(sk); } @@ -1073,7 +1074,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, sock_orphan(child); - percpu_counter_inc(sk->sk_prot->orphan_count); + this_cpu_inc(*sk->sk_prot->orphan_count); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index ef7897226f08..c8fa6e7f7d12 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -271,7 +271,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct inet_diag_meminfo minfo = { .idiag_rmem = sk_rmem_alloc_get(sk), .idiag_wmem = READ_ONCE(sk->sk_wmem_queued), - .idiag_fmem = sk->sk_forward_alloc, + .idiag_fmem = sk_forward_alloc_get(sk), .idiag_tmem = sk_wmem_alloc_get(sk), }; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 80aeaf9e6e16..75737267746f 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -242,8 +242,10 @@ static inline int compute_score(struct sock *sk, struct net *net, if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; + score = sk->sk_bound_dev_if ? 2 : 1; - score = sk->sk_family == PF_INET ? 2 : 1; + if (sk->sk_family == PF_INET) + score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } @@ -596,7 +598,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { - percpu_counter_inc(sk->sk_prot->orphan_count); + this_cpu_inc(*sk->sk_prot->orphan_count); inet_sk_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 95419b7adf5c..2ac2b95c5694 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -473,8 +473,6 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, static int gre_handle_offloads(struct sk_buff *skb, bool csum) { - if (csum && skb_checksum_start(skb) < skb->data) - return -EINVAL; return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } @@ -632,15 +630,20 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, } if (dev->header_ops) { + const int pull_len = tunnel->hlen + sizeof(struct iphdr); + if (skb_cow_head(skb, 0)) goto free_skb; tnl_params = (const struct iphdr *)skb->data; + if (pull_len > skb_transport_offset(skb)) + goto free_skb; + /* Pull skb since ip_tunnel_xmit() needs skb->data pointing * to gre header. */ - skb_pull(skb, tunnel->hlen + sizeof(struct iphdr)); + skb_pull(skb, pull_len); skb_reset_mac_header(skb); } else { if (skb_cow_head(skb, dev->needed_headroom)) @@ -925,7 +928,7 @@ static const struct net_device_ops ipgre_netdev_ops = { .ndo_stop = ipgre_close, #endif .ndo_start_xmit = ipgre_xmit, - .ndo_do_ioctl = ip_tunnel_ioctl, + .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, @@ -983,7 +986,7 @@ static int ipgre_tunnel_init(struct net_device *dev) __gre_tunnel_init(dev); - memcpy(dev->dev_addr, &iph->saddr, 4); + __dev_addr_set(dev, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); dev->flags = IFF_NOARP; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8d8a8da3ae7e..9bca57ef8b83 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -198,19 +198,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s } else if (rt->rt_type == RTN_BROADCAST) IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len); - /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { - struct sk_buff *skb2; - - skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); - if (!skb2) { - kfree_skb(skb); + skb = skb_expand_head(skb, hh_len); + if (!skb) return -ENOMEM; - } - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - consume_skb(skb); - skb = skb2; } if (lwtunnel_xmit_redirect(dst->lwtstate)) { @@ -446,8 +437,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4) { BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) != offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr)); - memcpy(&iph->saddr, &fl4->saddr, - sizeof(fl4->saddr) + sizeof(fl4->daddr)); + + iph->saddr = fl4->saddr; + iph->daddr = fl4->daddr; } /* Note: skb->sk can be different from sk, in case of tunnels */ @@ -614,18 +606,6 @@ void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph, } EXPORT_SYMBOL(ip_fraglist_init); -static void ip_fraglist_ipcb_prepare(struct sk_buff *skb, - struct ip_fraglist_iter *iter) -{ - struct sk_buff *to = iter->frag; - - /* Copy the flags to each fragment. */ - IPCB(to)->flags = IPCB(skb)->flags; - - if (iter->offset == 0) - ip_options_fragment(to); -} - void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter) { unsigned int hlen = iter->hlen; @@ -671,7 +651,7 @@ void ip_frag_init(struct sk_buff *skb, unsigned int hlen, EXPORT_SYMBOL(ip_frag_init); static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to, - bool first_frag, struct ip_frag_state *state) + bool first_frag) { /* Copy the flags to each fragment. */ IPCB(to)->flags = IPCB(from)->flags; @@ -846,11 +826,14 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, /* Everything is OK. Generate! */ ip_fraglist_init(skb, iph, hlen, &iter); + if (iter.frag) + ip_options_fragment(iter.frag); + for (;;) { /* Prepare header of the next frame, * before previous one went down. */ if (iter.frag) { - ip_fraglist_ipcb_prepare(skb, &iter); + IPCB(iter.frag)->flags = IPCB(skb)->flags; ip_fraglist_prepare(skb, &iter); } @@ -905,7 +888,7 @@ slow_path: err = PTR_ERR(skb2); goto fail; } - ip_frag_ipcb(skb, skb2, first_frag, &state); + ip_frag_ipcb(skb, skb2, first_frag); /* * Put this fragment into the sending queue. diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ec6036713e2c..38d29b175ca6 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -663,12 +663,11 @@ static int set_mcast_msfilter(struct sock *sk, int ifindex, struct sockaddr_storage *group, struct sockaddr_storage *list) { - int msize = IP_MSFILTER_SIZE(numsrc); struct ip_msfilter *msf; struct sockaddr_in *psin; int err, i; - msf = kmalloc(msize, GFP_KERNEL); + msf = kmalloc(IP_MSFILTER_SIZE(numsrc), GFP_KERNEL); if (!msf) return -ENOBUFS; @@ -684,7 +683,7 @@ static int set_mcast_msfilter(struct sock *sk, int ifindex, if (psin->sin_family != AF_INET) goto Eaddrnotavail; - msf->imsf_slist[i] = psin->sin_addr.s_addr; + msf->imsf_slist_flex[i] = psin->sin_addr.s_addr; } err = ip_mc_msfilter(sk, msf, ifindex); kfree(msf); @@ -791,7 +790,8 @@ static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) goto out_free_gsf; err = set_mcast_msfilter(sk, gsf->gf_interface, gsf->gf_numsrc, - gsf->gf_fmode, &gsf->gf_group, gsf->gf_slist); + gsf->gf_fmode, &gsf->gf_group, + gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return err; @@ -800,7 +800,7 @@ out_free_gsf: static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { - const int size0 = offsetof(struct compat_group_filter, gf_slist); + const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; unsigned int n; void *p; @@ -814,7 +814,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, p = kmalloc(optlen + 4, GFP_KERNEL); if (!p) return -ENOMEM; - gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ + gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ err = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) @@ -827,7 +827,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, goto out_free_gsf; err = -EINVAL; - if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) + if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_gsf; /* numsrc >= (4G-140)/128 overflow in 32 bits */ @@ -835,7 +835,7 @@ static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (n > sock_net(sk)->ipv4.sysctl_igmp_max_msf) goto out_free_gsf; err = set_mcast_msfilter(sk, gf32->gf_interface, n, gf32->gf_fmode, - &gf32->gf_group, gf32->gf_slist); + &gf32->gf_group, gf32->gf_slist_flex); out_free_gsf: kfree(p); return err; @@ -886,6 +886,8 @@ static int compat_ip_mcast_join_leave(struct sock *sk, int optname, return ip_mc_leave_group(sk, &mreq); } +DEFINE_STATIC_KEY_FALSE(ip4_min_ttl); + static int do_ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { @@ -1352,7 +1354,14 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, goto e_inval; if (val < 0 || val > 255) goto e_inval; - inet->min_ttl = val; + + if (val) + static_branch_enable(&ip4_min_ttl); + + /* tcp_v4_err() and tcp_v4_rcv() might read min_ttl + * while we are changint it. + */ + WRITE_ONCE(inet->min_ttl, val); break; default: @@ -1456,7 +1465,7 @@ static bool getsockopt_needs_rtnl(int optname) static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval, int __user *optlen, int len) { - const int size0 = offsetof(struct group_filter, gf_slist); + const int size0 = offsetof(struct group_filter, gf_slist_flex); struct group_filter __user *p = optval; struct group_filter gsf; int num; @@ -1468,7 +1477,7 @@ static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval, return -EFAULT; num = gsf.gf_numsrc; - err = ip_mc_gsfget(sk, &gsf, p->gf_slist); + err = ip_mc_gsfget(sk, &gsf, p->gf_slist_flex); if (err) return err; if (gsf.gf_numsrc < num) @@ -1482,7 +1491,7 @@ static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval, static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval, int __user *optlen, int len) { - const int size0 = offsetof(struct compat_group_filter, gf_slist); + const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter __user *p = optval; struct compat_group_filter gf32; struct group_filter gf; @@ -1499,7 +1508,7 @@ static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval, num = gf.gf_numsrc = gf32.gf_numsrc; gf.gf_group = gf32.gf_group; - err = ip_mc_gsfget(sk, &gf, p->gf_slist); + err = ip_mc_gsfget(sk, &gf, p->gf_slist_flex); if (err) return err; if (gf.gf_numsrc < num) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index be75b409445c..5a473319d3a5 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -834,7 +834,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; if (dev->type != ARPHRD_ETHER) { - memcpy(dev->dev_addr, &p->iph.saddr, 4); + __dev_addr_set(dev, &p->iph.saddr, 4); memcpy(dev->broadcast, &p->iph.daddr, 4); } ip_tunnel_add(itn, t); @@ -958,19 +958,20 @@ done: } EXPORT_SYMBOL_GPL(ip_tunnel_ctl); -int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, + void __user *data, int cmd) { struct ip_tunnel_parm p; int err; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) return -EFAULT; err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd); - if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (!err && copy_to_user(data, &p, sizeof(p))) return -EFAULT; return err; } -EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); +EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) { diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index eb560eecee08..8c2bd1d9ddce 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -405,7 +405,7 @@ static const struct net_device_ops vti_netdev_ops = { .ndo_init = vti_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = vti_tunnel_xmit, - .ndo_do_ioctl = ip_tunnel_ioctl, + .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, @@ -425,7 +425,7 @@ static int vti_tunnel_init(struct net_device *dev) struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; - memcpy(dev->dev_addr, &iph->saddr, 4); + __dev_addr_set(dev, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); dev->flags = IFF_NOARP; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 816d8aad5a68..9d41d5d5cd1e 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -262,6 +262,11 @@ static int __init ic_open_devs(void) dev->name, able, d->xid); } } + /* Devices with a complex topology like SFP ethernet interfaces needs + * the rtnl_lock at init. The carrier wait-loop must therefore run + * without holding it. + */ + rtnl_unlock(); /* no point in waiting if we could not bring up at least one device */ if (!ic_first_dev) @@ -274,9 +279,13 @@ static int __init ic_open_devs(void) msecs_to_jiffies(carrier_timeout * 1000))) { int wait, elapsed; + rtnl_lock(); for_each_netdev(&init_net, dev) - if (ic_is_init_dev(dev) && netif_carrier_ok(dev)) + if (ic_is_init_dev(dev) && netif_carrier_ok(dev)) { + rtnl_unlock(); goto have_carrier; + } + rtnl_unlock(); msleep(1); @@ -289,7 +298,6 @@ static int __init ic_open_devs(void) next_msg = jiffies + msecs_to_jiffies(20000); } have_carrier: - rtnl_unlock(); *last = NULL; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 266c65577ba6..123ea63a04cb 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -347,7 +347,7 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = ipip_tunnel_xmit, - .ndo_do_ioctl = ip_tunnel_ioctl, + .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, @@ -380,7 +380,7 @@ static int ipip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); + __dev_addr_set(dev, &tunnel->parms.iph.saddr, 4); memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); tunnel->tun_hlen = 0; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index c53f14b94356..ffc0cab7cf18 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -179,10 +179,11 @@ struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry) return (void *)entry + entry->next_offset; } -unsigned int arpt_do_table(struct sk_buff *skb, - const struct nf_hook_state *state, - struct xt_table *table) +unsigned int arpt_do_table(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) { + const struct xt_table *table = priv; unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); unsigned int verdict = NF_DROP; diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 6922612df456..78cd5ee24448 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -18,28 +18,17 @@ MODULE_DESCRIPTION("arptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \ (1 << NF_ARP_FORWARD)) -static int __net_init arptable_filter_table_init(struct net *net); - static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_ARP, .priority = NF_IP_PRI_FILTER, - .table_init = arptable_filter_table_init, }; -/* The work comes in here from netfilter.c */ -static unsigned int -arptable_filter_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return arpt_do_table(skb, state, priv); -} - static struct nf_hook_ops *arpfilter_ops __read_mostly; -static int __net_init arptable_filter_table_init(struct net *net) +static int arptable_filter_table_init(struct net *net) { struct arpt_replace *repl; int err; @@ -69,30 +58,32 @@ static struct pernet_operations arptable_filter_net_ops = { static int __init arptable_filter_init(void) { - int ret; + int ret = xt_register_template(&packet_filter, + arptable_filter_table_init); + + if (ret < 0) + return ret; - arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arptable_filter_hook); - if (IS_ERR(arpfilter_ops)) + arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arpt_do_table); + if (IS_ERR(arpfilter_ops)) { + xt_unregister_template(&packet_filter); return PTR_ERR(arpfilter_ops); + } ret = register_pernet_subsys(&arptable_filter_net_ops); if (ret < 0) { + xt_unregister_template(&packet_filter); kfree(arpfilter_ops); return ret; } - ret = arptable_filter_table_init(&init_net); - if (ret) { - unregister_pernet_subsys(&arptable_filter_net_ops); - kfree(arpfilter_ops); - } - return ret; } static void __exit arptable_filter_fini(void) { unregister_pernet_subsys(&arptable_filter_net_ops); + xt_unregister_template(&packet_filter); kfree(arpfilter_ops); } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 13acb687c19a..2ed7c58b471a 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -222,10 +222,11 @@ struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry) /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int -ipt_do_table(struct sk_buff *skb, - const struct nf_hook_state *state, - struct xt_table *table) +ipt_do_table(void *priv, + struct sk_buff *skb, + const struct nf_hook_state *state) { + const struct xt_table *table = priv; unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); const struct iphdr *ip; diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 8f7ca67475b7..8fd1aba8af31 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -66,11 +66,22 @@ struct clusterip_net { /* lock protects the configs list */ spinlock_t lock; + bool clusterip_deprecated_warning; #ifdef CONFIG_PROC_FS struct proc_dir_entry *procdir; /* mutex protects the config->pde*/ struct mutex mutex; #endif + unsigned int hook_users; +}; + +static unsigned int clusterip_arp_mangle(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); + +static const struct nf_hook_ops cip_arp_ops = { + .hook = clusterip_arp_mangle, + .pf = NFPROTO_ARP, + .hooknum = NF_ARP_OUT, + .priority = -1 }; static unsigned int clusterip_net_id __read_mostly; @@ -458,6 +469,7 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) static int clusterip_tg_check(const struct xt_tgchk_param *par) { struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; + struct clusterip_net *cn = clusterip_pernet(par->net); const struct ipt_entry *e = par->entryinfo; struct clusterip_config *config; int ret, i; @@ -467,6 +479,9 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) return -EOPNOTSUPP; } + if (cn->hook_users == UINT_MAX) + return -EOVERFLOW; + if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { @@ -517,10 +532,23 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) return ret; } - if (!par->net->xt.clusterip_deprecated_warning) { + if (cn->hook_users == 0) { + ret = nf_register_net_hook(par->net, &cip_arp_ops); + + if (ret < 0) { + clusterip_config_entry_put(config); + clusterip_config_put(config); + nf_ct_netns_put(par->net, par->family); + return ret; + } + } + + cn->hook_users++; + + if (!cn->clusterip_deprecated_warning) { pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, " "use xt_cluster instead\n"); - par->net->xt.clusterip_deprecated_warning = true; + cn->clusterip_deprecated_warning = true; } cipinfo->config = config; @@ -531,6 +559,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par) static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) { const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; + struct clusterip_net *cn = clusterip_pernet(par->net); /* if no more entries are referencing the config, remove it * from the list and destroy the proc entry */ @@ -539,6 +568,10 @@ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) clusterip_config_put(cipinfo->config); nf_ct_netns_put(par->net, par->family); + cn->hook_users--; + + if (cn->hook_users == 0) + nf_unregister_net_hook(par->net, &cip_arp_ops); } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT @@ -602,9 +635,8 @@ static void arp_print(struct arp_payload *payload) #endif static unsigned int -arp_mangle(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) +clusterip_arp_mangle(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) { struct arphdr *arp = arp_hdr(skb); struct arp_payload *payload; @@ -654,13 +686,6 @@ arp_mangle(void *priv, return NF_ACCEPT; } -static const struct nf_hook_ops cip_arp_ops = { - .hook = arp_mangle, - .pf = NFPROTO_ARP, - .hooknum = NF_ARP_OUT, - .priority = -1 -}; - /*********************************************************************** * PROC DIR HANDLING ***********************************************************************/ @@ -817,20 +842,14 @@ static const struct proc_ops clusterip_proc_ops = { static int clusterip_net_init(struct net *net) { struct clusterip_net *cn = clusterip_pernet(net); - int ret; INIT_LIST_HEAD(&cn->configs); spin_lock_init(&cn->lock); - ret = nf_register_net_hook(net, &cip_arp_ops); - if (ret < 0) - return ret; - #ifdef CONFIG_PROC_FS cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net); if (!cn->procdir) { - nf_unregister_net_hook(net, &cip_arp_ops); pr_err("Unable to proc dir entry\n"); return -ENOMEM; } @@ -850,7 +869,6 @@ static void clusterip_net_exit(struct net *net) cn->procdir = NULL; mutex_unlock(&cn->mutex); #endif - nf_unregister_net_hook(net, &cip_arp_ops); } static struct pernet_operations clusterip_net_ops = { diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index 8272df7c6ad5..b9062f4552ac 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -19,7 +19,6 @@ MODULE_DESCRIPTION("iptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) -static int __net_init iptable_filter_table_init(struct net *net); static const struct xt_table packet_filter = { .name = "filter", @@ -27,23 +26,15 @@ static const struct xt_table packet_filter = { .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_FILTER, - .table_init = iptable_filter_table_init, }; -static unsigned int -iptable_filter_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ipt_do_table(skb, state, priv); -} - static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ static bool forward __read_mostly = true; module_param(forward, bool, 0000); -static int __net_init iptable_filter_table_init(struct net *net) +static int iptable_filter_table_init(struct net *net) { struct ipt_replace *repl; int err; @@ -62,7 +53,7 @@ static int __net_init iptable_filter_table_init(struct net *net) static int __net_init iptable_filter_net_init(struct net *net) { - if (net == &init_net || !forward) + if (!forward) return iptable_filter_table_init(net); return 0; @@ -86,22 +77,32 @@ static struct pernet_operations iptable_filter_net_ops = { static int __init iptable_filter_init(void) { - int ret; + int ret = xt_register_template(&packet_filter, + iptable_filter_table_init); - filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook); - if (IS_ERR(filter_ops)) + if (ret < 0) + return ret; + + filter_ops = xt_hook_ops_alloc(&packet_filter, ipt_do_table); + if (IS_ERR(filter_ops)) { + xt_unregister_template(&packet_filter); return PTR_ERR(filter_ops); + } ret = register_pernet_subsys(&iptable_filter_net_ops); - if (ret < 0) + if (ret < 0) { + xt_unregister_template(&packet_filter); kfree(filter_ops); + return ret; + } - return ret; + return 0; } static void __exit iptable_filter_fini(void) { unregister_pernet_subsys(&iptable_filter_net_ops); + xt_unregister_template(&packet_filter); kfree(filter_ops); } diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 2abc3836f391..3abb430af9e6 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -25,19 +25,16 @@ MODULE_DESCRIPTION("iptables mangle table"); (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) -static int __net_init iptable_mangle_table_init(struct net *net); - static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_MANGLE, - .table_init = iptable_mangle_table_init, }; static unsigned int -ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *priv) +ipt_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret; const struct iphdr *iph; @@ -53,7 +50,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *pri daddr = iph->daddr; tos = iph->tos; - ret = ipt_do_table(skb, state, priv); + ret = ipt_do_table(priv, skb, state); /* Reroute for ANY change. */ if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); @@ -78,12 +75,12 @@ iptable_mangle_hook(void *priv, const struct nf_hook_state *state) { if (state->hook == NF_INET_LOCAL_OUT) - return ipt_mangle_out(skb, state, priv); - return ipt_do_table(skb, state, priv); + return ipt_mangle_out(priv, skb, state); + return ipt_do_table(priv, skb, state); } static struct nf_hook_ops *mangle_ops __read_mostly; -static int __net_init iptable_mangle_table_init(struct net *net) +static int iptable_mangle_table_init(struct net *net) { struct ipt_replace *repl; int ret; @@ -113,32 +110,32 @@ static struct pernet_operations iptable_mangle_net_ops = { static int __init iptable_mangle_init(void) { - int ret; + int ret = xt_register_template(&packet_mangler, + iptable_mangle_table_init); + if (ret < 0) + return ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook); if (IS_ERR(mangle_ops)) { + xt_unregister_template(&packet_mangler); ret = PTR_ERR(mangle_ops); return ret; } ret = register_pernet_subsys(&iptable_mangle_net_ops); if (ret < 0) { + xt_unregister_template(&packet_mangler); kfree(mangle_ops); return ret; } - ret = iptable_mangle_table_init(&init_net); - if (ret) { - unregister_pernet_subsys(&iptable_mangle_net_ops); - kfree(mangle_ops); - } - return ret; } static void __exit iptable_mangle_fini(void) { unregister_pernet_subsys(&iptable_mangle_net_ops); + xt_unregister_template(&packet_mangler); kfree(mangle_ops); } diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c index a9913842ef18..56f6ecc43451 100644 --- a/net/ipv4/netfilter/iptable_nat.c +++ b/net/ipv4/netfilter/iptable_nat.c @@ -17,8 +17,6 @@ struct iptable_nat_pernet { struct nf_hook_ops *nf_nat_ops; }; -static int __net_init iptable_nat_table_init(struct net *net); - static unsigned int iptable_nat_net_id __read_mostly; static const struct xt_table nf_nat_ipv4_table = { @@ -29,37 +27,29 @@ static const struct xt_table nf_nat_ipv4_table = { (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, .af = NFPROTO_IPV4, - .table_init = iptable_nat_table_init, }; -static unsigned int iptable_nat_do_chain(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ipt_do_table(skb, state, priv); -} - static const struct nf_hook_ops nf_nat_ipv4_ops[] = { { - .hook = iptable_nat_do_chain, + .hook = ipt_do_table, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, { - .hook = iptable_nat_do_chain, + .hook = ipt_do_table, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, { - .hook = iptable_nat_do_chain, + .hook = ipt_do_table, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, }, { - .hook = iptable_nat_do_chain, + .hook = ipt_do_table, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, @@ -113,7 +103,7 @@ static void ipt_nat_unregister_lookups(struct net *net) kfree(ops); } -static int __net_init iptable_nat_table_init(struct net *net) +static int iptable_nat_table_init(struct net *net) { struct ipt_replace *repl; int ret; @@ -155,20 +145,25 @@ static struct pernet_operations iptable_nat_net_ops = { static int __init iptable_nat_init(void) { - int ret = register_pernet_subsys(&iptable_nat_net_ops); + int ret = xt_register_template(&nf_nat_ipv4_table, + iptable_nat_table_init); - if (ret) + if (ret < 0) return ret; - ret = iptable_nat_table_init(&init_net); - if (ret) - unregister_pernet_subsys(&iptable_nat_net_ops); + ret = register_pernet_subsys(&iptable_nat_net_ops); + if (ret < 0) { + xt_unregister_template(&nf_nat_ipv4_table); + return ret; + } + return ret; } static void __exit iptable_nat_exit(void) { unregister_pernet_subsys(&iptable_nat_net_ops); + xt_unregister_template(&nf_nat_ipv4_table); } module_init(iptable_nat_init); diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index ceef397c1f5f..ca5e5b21587c 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -12,8 +12,6 @@ #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) -static int __net_init iptable_raw_table_init(struct net *net); - static bool raw_before_defrag __read_mostly; MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag"); module_param(raw_before_defrag, bool, 0000); @@ -24,7 +22,6 @@ static const struct xt_table packet_raw = { .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_RAW, - .table_init = iptable_raw_table_init, }; static const struct xt_table packet_raw_before_defrag = { @@ -33,20 +30,11 @@ static const struct xt_table packet_raw_before_defrag = { .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_RAW_BEFORE_DEFRAG, - .table_init = iptable_raw_table_init, }; -/* The work comes in here from netfilter.c. */ -static unsigned int -iptable_raw_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ipt_do_table(skb, state, priv); -} - static struct nf_hook_ops *rawtable_ops __read_mostly; -static int __net_init iptable_raw_table_init(struct net *net) +static int iptable_raw_table_init(struct net *net) { struct ipt_replace *repl; const struct xt_table *table = &packet_raw; @@ -89,22 +77,24 @@ static int __init iptable_raw_init(void) pr_info("Enabling raw table before defrag\n"); } - rawtable_ops = xt_hook_ops_alloc(table, iptable_raw_hook); - if (IS_ERR(rawtable_ops)) + ret = xt_register_template(table, + iptable_raw_table_init); + if (ret < 0) + return ret; + + rawtable_ops = xt_hook_ops_alloc(table, ipt_do_table); + if (IS_ERR(rawtable_ops)) { + xt_unregister_template(table); return PTR_ERR(rawtable_ops); + } ret = register_pernet_subsys(&iptable_raw_net_ops); if (ret < 0) { + xt_unregister_template(table); kfree(rawtable_ops); return ret; } - ret = iptable_raw_table_init(&init_net); - if (ret) { - unregister_pernet_subsys(&iptable_raw_net_ops); - kfree(rawtable_ops); - } - return ret; } @@ -112,6 +102,7 @@ static void __exit iptable_raw_fini(void) { unregister_pernet_subsys(&iptable_raw_net_ops); kfree(rawtable_ops); + xt_unregister_template(&packet_raw); } module_init(iptable_raw_init); diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 77973f5fd8f6..d885443cb267 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -25,27 +25,17 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) -static int __net_init iptable_security_table_init(struct net *net); - static const struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_SECURITY, - .table_init = iptable_security_table_init, }; -static unsigned int -iptable_security_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ipt_do_table(skb, state, priv); -} - static struct nf_hook_ops *sectbl_ops __read_mostly; -static int __net_init iptable_security_table_init(struct net *net) +static int iptable_security_table_init(struct net *net) { struct ipt_replace *repl; int ret; @@ -75,24 +65,25 @@ static struct pernet_operations iptable_security_net_ops = { static int __init iptable_security_init(void) { - int ret; + int ret = xt_register_template(&security_table, + iptable_security_table_init); - sectbl_ops = xt_hook_ops_alloc(&security_table, iptable_security_hook); - if (IS_ERR(sectbl_ops)) + if (ret < 0) + return ret; + + sectbl_ops = xt_hook_ops_alloc(&security_table, ipt_do_table); + if (IS_ERR(sectbl_ops)) { + xt_unregister_template(&security_table); return PTR_ERR(sectbl_ops); + } ret = register_pernet_subsys(&iptable_security_net_ops); if (ret < 0) { + xt_unregister_template(&security_table); kfree(sectbl_ops); return ret; } - ret = iptable_security_table_init(&init_net); - if (ret) { - unregister_pernet_subsys(&iptable_security_net_ops); - kfree(sectbl_ops); - } - return ret; } @@ -100,6 +91,7 @@ static void __exit iptable_security_fini(void) { unregister_pernet_subsys(&iptable_security_net_ops); kfree(sectbl_ops); + xt_unregister_template(&security_table); } module_init(iptable_security_init); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 613432a36f0a..e61ea428ea18 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -20,13 +20,8 @@ #endif #include <net/netfilter/nf_conntrack_zones.h> -static unsigned int defrag4_pernet_id __read_mostly; static DEFINE_MUTEX(defrag4_mutex); -struct defrag4_pernet { - unsigned int users; -}; - static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, u_int32_t user) { @@ -111,19 +106,15 @@ static const struct nf_hook_ops ipv4_defrag_ops[] = { static void __net_exit defrag4_net_exit(struct net *net) { - struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); - - if (nf_defrag->users) { + if (net->nf.defrag_ipv4_users) { nf_unregister_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); - nf_defrag->users = 0; + net->nf.defrag_ipv4_users = 0; } } static struct pernet_operations defrag4_net_ops = { .exit = defrag4_net_exit, - .id = &defrag4_pernet_id, - .size = sizeof(struct defrag4_pernet), }; static int __init nf_defrag_init(void) @@ -138,24 +129,23 @@ static void __exit nf_defrag_fini(void) int nf_defrag_ipv4_enable(struct net *net) { - struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); int err = 0; mutex_lock(&defrag4_mutex); - if (nf_defrag->users == UINT_MAX) { + if (net->nf.defrag_ipv4_users == UINT_MAX) { err = -EOVERFLOW; goto out_unlock; } - if (nf_defrag->users) { - nf_defrag->users++; + if (net->nf.defrag_ipv4_users) { + net->nf.defrag_ipv4_users++; goto out_unlock; } err = nf_register_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); if (err == 0) - nf_defrag->users = 1; + net->nf.defrag_ipv4_users = 1; out_unlock: mutex_unlock(&defrag4_mutex); @@ -165,12 +155,10 @@ EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); void nf_defrag_ipv4_disable(struct net *net) { - struct defrag4_pernet *nf_defrag = net_generic(net, defrag4_pernet_id); - mutex_lock(&defrag4_mutex); - if (nf_defrag->users) { - nf_defrag->users--; - if (nf_defrag->users == 0) + if (net->nf.defrag_ipv4_users) { + net->nf.defrag_ipv4_users--; + if (net->nf.defrag_ipv4_users == 0) nf_unregister_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); } diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 4075230b14c6..9e8100728d46 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1982,6 +1982,8 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old, rcu_assign_pointer(old->nh_grp, newg); if (newg->resilient) { + /* Make sure concurrent readers are not using 'oldg' anymore. */ + synchronize_net(); rcu_assign_pointer(oldg->res_table, tmp_table); rcu_assign_pointer(oldg->spare->res_table, tmp_table); } @@ -2490,6 +2492,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, .fc_gw4 = cfg->gw.ipv4, .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0, .fc_flags = cfg->nh_flags, + .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; @@ -2528,6 +2531,7 @@ static int nh_create_ipv6(struct net *net, struct nexthop *nh, .fc_ifindex = cfg->nh_ifindex, .fc_gateway = cfg->gw.ipv6, .fc_flags = cfg->nh_flags, + .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, .fc_is_fdb = cfg->nh_fdb, @@ -3563,6 +3567,7 @@ static struct notifier_block nh_netdev_notifier = { }; static int nexthops_dump(struct net *net, struct notifier_block *nb, + enum nexthop_event_type event_type, struct netlink_ext_ack *extack) { struct rb_root *root = &net->nexthop.rb_root; @@ -3573,8 +3578,7 @@ static int nexthops_dump(struct net *net, struct notifier_block *nb, struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); - err = call_nexthop_notifier(nb, net, NEXTHOP_EVENT_REPLACE, nh, - extack); + err = call_nexthop_notifier(nb, net, event_type, nh, extack); if (err) break; } @@ -3588,7 +3592,7 @@ int register_nexthop_notifier(struct net *net, struct notifier_block *nb, int err; rtnl_lock(); - err = nexthops_dump(net, nb, extack); + err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack); if (err) goto unlock; err = blocking_notifier_chain_register(&net->nexthop.notifier_chain, @@ -3601,8 +3605,17 @@ EXPORT_SYMBOL(register_nexthop_notifier); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { - return blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, - nb); + int err; + + rtnl_lock(); + err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, + nb); + if (err) + goto unlock; + nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL); +unlock: + rtnl_unlock(); + return err; } EXPORT_SYMBOL(unregister_nexthop_notifier); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index b0d3a09dc84e..f30273afb539 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -53,7 +53,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) struct net *net = seq->private; int orphans, sockets; - orphans = percpu_counter_sum_positive(&tcp_orphan_count); + orphans = tcp_orphan_count_sum(); sockets = proto_sockets_allocated_sum_positive(&tcp_prot); socket_seq_show(seq); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a6f20ee35335..0b4103b1e622 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -61,15 +61,11 @@ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/module.h> -#include <linux/uaccess.h> #include <linux/bitops.h> -#include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/memblock.h> -#include <linux/string.h> #include <linux/socket.h> -#include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> @@ -84,20 +80,17 @@ #include <linux/netfilter_ipv4.h> #include <linux/random.h> #include <linux/rcupdate.h> -#include <linux/times.h> #include <linux/slab.h> #include <linux/jhash.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/net_namespace.h> -#include <net/protocol.h> #include <net/ip.h> #include <net/route.h> #include <net/inetpeer.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/nexthop.h> -#include <net/arp.h> #include <net/tcp.h> #include <net/icmp.h> #include <net/xfrm.h> @@ -109,7 +102,6 @@ #endif #include <net/secure_seq.h> #include <net/ip_tunnels.h> -#include <net/l3mdev.h> #include "fib_lookup.h" @@ -276,12 +268,13 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) struct rt_cache_stat *st = v; if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); + seq_puts(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); return 0; } - seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " - " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x " + "%08x %08x %08x %08x %08x %08x " + "%08x %08x %08x %08x\n", dst_entries_get_slow(&ipv4_dst_ops), 0, /* st->in_hit */ st->in_slow_tot, @@ -586,18 +579,25 @@ static void fnhe_flush_routes(struct fib_nh_exception *fnhe) } } -static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) +static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash) { - struct fib_nh_exception *fnhe, *oldest; + struct fib_nh_exception __rcu **fnhe_p, **oldest_p; + struct fib_nh_exception *fnhe, *oldest = NULL; - oldest = rcu_dereference(hash->chain); - for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; - fnhe = rcu_dereference(fnhe->fnhe_next)) { - if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) + for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) { + fnhe = rcu_dereference_protected(*fnhe_p, + lockdep_is_held(&fnhe_lock)); + if (!fnhe) + break; + if (!oldest || + time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) { oldest = fnhe; + oldest_p = fnhe_p; + } } fnhe_flush_routes(oldest); - return oldest; + *oldest_p = oldest->fnhe_next; + kfree_rcu(oldest, rcu); } static u32 fnhe_hashfun(__be32 daddr) @@ -676,16 +676,21 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, if (rt) fill_route_from_fnhe(rt, fnhe); } else { - if (depth > FNHE_RECLAIM_DEPTH) - fnhe = fnhe_oldest(hash); - else { - fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); - if (!fnhe) - goto out_unlock; - - fnhe->fnhe_next = hash->chain; - rcu_assign_pointer(hash->chain, fnhe); + /* Randomize max depth to avoid some side channels attacks. */ + int max_depth = FNHE_RECLAIM_DEPTH + + prandom_u32_max(FNHE_RECLAIM_DEPTH); + + while (depth > max_depth) { + fnhe_remove_oldest(hash); + depth--; } + + fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); + if (!fnhe) + goto out_unlock; + + fnhe->fnhe_next = hash->chain; + fnhe->fnhe_genid = genid; fnhe->fnhe_daddr = daddr; fnhe->fnhe_gw = gw; @@ -693,6 +698,8 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, fnhe->fnhe_mtu_locked = lock; fnhe->fnhe_expires = max(1UL, expires); + rcu_assign_pointer(hash->chain, fnhe); + /* Exception created; mark the cached routes for the nexthop * stale, so anyone caching it rechecks if this exception * applies to them. @@ -1299,26 +1306,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) { - const struct rtable *rt = (const struct rtable *)dst; - unsigned int mtu = rt->rt_pmtu; - - if (!mtu || time_after_eq(jiffies, rt->dst.expires)) - mtu = dst_metric_raw(dst, RTAX_MTU); - - if (mtu) - goto out; - - mtu = READ_ONCE(dst->dev->mtu); - - if (unlikely(ip_mtu_locked(dst))) { - if (rt->rt_uses_gateway && mtu > 576) - mtu = 576; - } - -out: - mtu = min_t(unsigned int, mtu, IP_MAX_MTU); - - return mtu - lwtunnel_headroom(dst->lwtstate, mtu); + return ip_dst_mtu_maybe_forward(dst, false); } EXPORT_INDIRECT_CALLABLE(ipv4_mtu); @@ -2831,8 +2819,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or new->output = dst_discard_out; new->dev = net->loopback_dev; - if (new->dev) - dev_hold(new->dev); + dev_hold(new->dev); rt->rt_is_input = ort->rt_is_input; rt->rt_iif = ort->rt_iif; @@ -3170,7 +3157,7 @@ static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, udph = skb_put_zero(skb, sizeof(struct udphdr)); udph->source = sport; udph->dest = dport; - udph->len = sizeof(struct udphdr); + udph->len = htons(sizeof(struct udphdr)); udph->check = 0; break; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 33792cf55a79..8696dc343ad2 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -7,8 +7,6 @@ */ #include <linux/tcp.h> -#include <linux/slab.h> -#include <linux/random.h> #include <linux/siphash.h> #include <linux/kernel.h> #include <linux/export.h> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6f1e64d49232..97eb54774924 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -6,25 +6,16 @@ * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS] */ -#include <linux/mm.h> -#include <linux/module.h> #include <linux/sysctl.h> -#include <linux/igmp.h> -#include <linux/inetdevice.h> #include <linux/seqlock.h> #include <linux/init.h> #include <linux/slab.h> -#include <linux/nsproxy.h> -#include <linux/swap.h> -#include <net/snmp.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ip_fib.h> -#include <net/route.h> #include <net/tcp.h> #include <net/udp.h> #include <net/cipso_ipv4.h> -#include <net/inet_frag.h> #include <net/ping.h> #include <net/protocol.h> #include <net/netevent.h> @@ -594,18 +585,6 @@ static struct ctl_table ipv4_table[] = { .extra1 = &sysctl_fib_sync_mem_min, .extra2 = &sysctl_fib_sync_mem_max, }, - { - .procname = "tcp_rx_skb_cache", - .data = &tcp_rx_skb_cache_key.key, - .mode = 0644, - .proc_handler = proc_do_static_key, - }, - { - .procname = "tcp_tx_skb_cache", - .data = &tcp_tx_skb_cache_key.key, - .mode = 0644, - .proc_handler = proc_do_static_key, - }, { } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8cb44040ec68..bc7f419184aa 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -287,8 +287,8 @@ enum { TCP_CMSG_TS = 2 }; -struct percpu_counter tcp_orphan_count; -EXPORT_SYMBOL_GPL(tcp_orphan_count); +DEFINE_PER_CPU(unsigned int, tcp_orphan_count); +EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); long sysctl_tcp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_tcp_mem); @@ -325,11 +325,6 @@ struct tcp_splice_state { unsigned long tcp_memory_pressure __read_mostly; EXPORT_SYMBOL_GPL(tcp_memory_pressure); -DEFINE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key); -EXPORT_SYMBOL(tcp_rx_skb_cache_key); - -DEFINE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key); - void tcp_enter_memory_pressure(struct sock *sk) { unsigned long val; @@ -486,10 +481,7 @@ static bool tcp_stream_is_readable(struct sock *sk, int target) { if (tcp_epollin_ready(sk, target)) return true; - - if (sk->sk_prot->stream_memory_read) - return sk->sk_prot->stream_memory_read(sk); - return false; + return sk_is_readable(sk); } /* @@ -647,7 +639,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) } EXPORT_SYMBOL(tcp_ioctl); -static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) +void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; tp->pushed_seq = tp->write_seq; @@ -658,15 +650,13 @@ static inline bool forced_push(const struct tcp_sock *tp) return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } -static void skb_entail(struct sock *sk, struct sk_buff *skb) +void tcp_skb_entail(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); - skb->csum = 0; tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; - tcb->sacked = 0; __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); @@ -861,30 +851,15 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, } EXPORT_SYMBOL(tcp_splice_read); -struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, - bool force_schedule) +struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, + bool force_schedule) { struct sk_buff *skb; - if (likely(!size)) { - skb = sk->sk_tx_skb_cache; - if (skb) { - skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); - sk->sk_tx_skb_cache = NULL; - pskb_trim(skb, 0); - INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); - skb_shinfo(skb)->tx_flags = 0; - memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb)); - return skb; - } - } - /* The TCP header must be at least 32-bit aligned. */ - size = ALIGN(size, 4); - if (unlikely(tcp_under_memory_pressure(sk))) sk_mem_reclaim_partial(sk); - skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); + skb = alloc_skb_fclone(size + MAX_TCP_HEADER, gfp); if (likely(skb)) { bool mem_scheduled; @@ -895,12 +870,8 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, mem_scheduled = sk_wmem_schedule(sk, skb->truesize); } if (likely(mem_scheduled)) { - skb_reserve(skb, sk->sk_prot->max_header); - /* - * Make sure that we have exactly size bytes - * available to the caller, no more, no less. - */ - skb->reserved_tailroom = skb->end - skb->tail - size; + skb_reserve(skb, MAX_TCP_HEADER); + skb->ip_summed = CHECKSUM_PARTIAL; INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } @@ -953,18 +924,20 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags) * importantly be able to generate EPOLLOUT for Edge Trigger epoll() * users. */ -void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb) +void tcp_remove_empty_skb(struct sock *sk) { - if (skb && !skb->len) { + struct sk_buff *skb = tcp_write_queue_tail(sk); + + if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { tcp_unlink_write_queue(skb, sk); if (tcp_write_queue_empty(sk)) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - sk_wmem_free_skb(sk, skb); + tcp_wmem_free_skb(sk, skb); } } -struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags, - struct page *page, int offset, size_t *size) +static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags, + struct page *page, int offset, size_t *size) { struct sk_buff *skb = tcp_write_queue_tail(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -977,15 +950,15 @@ new_segment: if (!sk_stream_memory_free(sk)) return NULL; - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, - tcp_rtx_and_write_queues_empty(sk)); + skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation, + tcp_rtx_and_write_queues_empty(sk)); if (!skb) return NULL; #ifdef CONFIG_TLS_DEVICE skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); #endif - skb_entail(sk, skb); + tcp_skb_entail(sk, skb); copy = size_goal; } @@ -1016,7 +989,6 @@ new_segment: skb->truesize += copy; sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); - skb->ip_summed = CHECKSUM_PARTIAL; WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); @@ -1107,7 +1079,7 @@ out: return copied; do_error: - tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk)); + tcp_remove_empty_skb(sk); if (copied) goto out; out_err: @@ -1306,15 +1278,14 @@ new_segment: goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); - skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, - first_skb); + skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation, + first_skb); if (!skb) goto wait_for_space; process_backlog++; - skb->ip_summed = CHECKSUM_PARTIAL; - skb_entail(sk, skb); + tcp_skb_entail(sk, skb); copy = size_goal; /* All packets are restored as if they have @@ -1329,14 +1300,7 @@ new_segment: if (copy > msg_data_left(msg)) copy = msg_data_left(msg); - /* Where to copy to? */ - if (skb_availroom(skb) > 0 && !zc) { - /* We have some space in skb head. Superb! */ - copy = min_t(int, copy, skb_availroom(skb)); - err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy); - if (err) - goto do_fault; - } else if (!zc) { + if (!zc) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); @@ -1435,9 +1399,7 @@ out_nopush: return copied + copied_syn; do_error: - skb = tcp_write_queue_tail(sk); -do_fault: - tcp_remove_empty_skb(sk, skb); + tcp_remove_empty_skb(sk); if (copied + copied_syn) goto out; @@ -2690,11 +2652,36 @@ void tcp_shutdown(struct sock *sk, int how) } EXPORT_SYMBOL(tcp_shutdown); +int tcp_orphan_count_sum(void) +{ + int i, total = 0; + + for_each_possible_cpu(i) + total += per_cpu(tcp_orphan_count, i); + + return max(total, 0); +} + +static int tcp_orphan_cache; +static struct timer_list tcp_orphan_timer; +#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100) + +static void tcp_orphan_update(struct timer_list *unused) +{ + WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum()); + mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); +} + +static bool tcp_too_many_orphans(int shift) +{ + return READ_ONCE(tcp_orphan_cache) << shift > sysctl_tcp_max_orphans; +} + bool tcp_check_oom(struct sock *sk, int shift) { bool too_many_orphans, out_of_socket_memory; - too_many_orphans = tcp_too_many_orphans(sk, shift); + too_many_orphans = tcp_too_many_orphans(shift); out_of_socket_memory = tcp_out_of_memory(sk); if (too_many_orphans) @@ -2803,7 +2790,7 @@ adjudge_to_death: /* remove backlog if any, without releasing ownership. */ __release_sock(sk); - percpu_counter_inc(sk->sk_prot->orphan_count); + this_cpu_inc(tcp_orphan_count); /* Have we already been destroyed by a softirq or backlog? */ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) @@ -2906,7 +2893,7 @@ static void tcp_rtx_queue_purge(struct sock *sk) * list_del(&skb->tcp_tsorted_anchor) */ tcp_rtx_queue_unlink(skb, sk); - sk_wmem_free_skb(sk, skb); + tcp_wmem_free_skb(sk, skb); } } @@ -2917,14 +2904,9 @@ void tcp_write_queue_purge(struct sock *sk) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { tcp_skb_tsorted_anchor_cleanup(skb); - sk_wmem_free_skb(sk, skb); + tcp_wmem_free_skb(sk, skb); } tcp_rtx_queue_purge(sk); - skb = sk->sk_tx_skb_cache; - if (skb) { - __kfree_skb(skb); - sk->sk_tx_skb_cache = NULL; - } INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); sk_mem_reclaim(sk); tcp_clear_all_retrans_hints(tcp_sk(sk)); @@ -2961,10 +2943,6 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); - if (sk->sk_rx_skb_cache) { - __kfree_skb(sk->sk_rx_skb_cache); - sk->sk_rx_skb_cache = NULL; - } WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); tp->urg_data = 0; tcp_write_queue_purge(sk); @@ -3338,6 +3316,7 @@ int tcp_set_window_clamp(struct sock *sk, int val) } else { tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? SOCK_MIN_RCVBUF / 2 : val; + tp->rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp); } return 0; } @@ -4504,7 +4483,10 @@ void __init tcp_init(void) sizeof_field(struct sk_buff, cb)); percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); - percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); + + timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE); + mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); + inet_hashinfo_init(&tcp_hashinfo); inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", thash_entries, 21, /* one slot per 2 MB*/ @@ -4512,7 +4494,9 @@ void __init tcp_init(void) tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN | SLAB_PANIC | + SLAB_ACCOUNT, + NULL); /* Size and allocate the main established and bind bucket * hash tables. diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 6274462b86b4..ec5550089b4d 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -56,6 +56,8 @@ * otherwise TCP stack falls back to an internal pacing using one high * resolution timer per TCP socket and may use more resources. */ +#include <linux/btf.h> +#include <linux/btf_ids.h> #include <linux/module.h> #include <net/tcp.h> #include <linux/inet_diag.h> @@ -1152,14 +1154,38 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { .set_state = bbr_set_state, }; +BTF_SET_START(tcp_bbr_kfunc_ids) +#ifdef CONFIG_X86 +#ifdef CONFIG_DYNAMIC_FTRACE +BTF_ID(func, bbr_init) +BTF_ID(func, bbr_main) +BTF_ID(func, bbr_sndbuf_expand) +BTF_ID(func, bbr_undo_cwnd) +BTF_ID(func, bbr_cwnd_event) +BTF_ID(func, bbr_ssthresh) +BTF_ID(func, bbr_min_tso_segs) +BTF_ID(func, bbr_set_state) +#endif +#endif +BTF_SET_END(tcp_bbr_kfunc_ids) + +static DEFINE_KFUNC_BTF_ID_SET(&tcp_bbr_kfunc_ids, tcp_bbr_kfunc_btf_set); + static int __init bbr_register(void) { + int ret; + BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE); - return tcp_register_congestion_control(&tcp_bbr_cong_ops); + ret = tcp_register_congestion_control(&tcp_bbr_cong_ops); + if (ret) + return ret; + register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_bbr_kfunc_btf_set); + return 0; } static void __exit bbr_unregister(void) { + unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_bbr_kfunc_btf_set); tcp_unregister_congestion_control(&tcp_bbr_cong_ops); } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index d3e9386b493e..5f4d6f45d87f 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -150,19 +150,6 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir); #ifdef CONFIG_BPF_SYSCALL -static bool tcp_bpf_stream_read(const struct sock *sk) -{ - struct sk_psock *psock; - bool empty = true; - - rcu_read_lock(); - psock = sk_psock(sk); - if (likely(psock)) - empty = list_empty(&psock->ingress_msg); - rcu_read_unlock(); - return !empty; -} - static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { @@ -232,6 +219,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, bool cork = false, enospc = sk_msg_full(msg); struct sock *sk_redir; u32 tosend, delta = 0; + u32 eval = __SK_NONE; int ret; more_data: @@ -275,13 +263,24 @@ more_data: case __SK_REDIRECT: sk_redir = psock->sk_redir; sk_msg_apply_bytes(psock, tosend); + if (!psock->apply_bytes) { + /* Clean up before releasing the sock lock. */ + eval = psock->eval; + psock->eval = __SK_NONE; + psock->sk_redir = NULL; + } if (psock->cork) { cork = true; psock->cork = NULL; } sk_msg_return(sk, msg, tosend); release_sock(sk); + ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); + + if (eval == __SK_REDIRECT) + sock_put(sk_redir); + lock_sock(sk); if (unlikely(ret < 0)) { int free = sk_msg_free_nocharge(sk, msg); @@ -479,7 +478,7 @@ static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS], prot[TCP_BPF_BASE].unhash = sock_map_unhash; prot[TCP_BPF_BASE].close = sock_map_close; prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg; - prot[TCP_BPF_BASE].stream_memory_read = tcp_bpf_stream_read; + prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable; prot[TCP_BPF_TX] = prot[TCP_BPF_BASE]; prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 4a30deaa9a37..5e9d9c51164c 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -25,6 +25,8 @@ */ #include <linux/mm.h> +#include <linux/btf.h> +#include <linux/btf_ids.h> #include <linux/module.h> #include <linux/math64.h> #include <net/tcp.h> @@ -482,8 +484,25 @@ static struct tcp_congestion_ops cubictcp __read_mostly = { .name = "cubic", }; +BTF_SET_START(tcp_cubic_kfunc_ids) +#ifdef CONFIG_X86 +#ifdef CONFIG_DYNAMIC_FTRACE +BTF_ID(func, cubictcp_init) +BTF_ID(func, cubictcp_recalc_ssthresh) +BTF_ID(func, cubictcp_cong_avoid) +BTF_ID(func, cubictcp_state) +BTF_ID(func, cubictcp_cwnd_event) +BTF_ID(func, cubictcp_acked) +#endif +#endif +BTF_SET_END(tcp_cubic_kfunc_ids) + +static DEFINE_KFUNC_BTF_ID_SET(&tcp_cubic_kfunc_ids, tcp_cubic_kfunc_btf_set); + static int __init cubictcp_register(void) { + int ret; + BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); /* Precompute a bunch of the scaling factors that are used per-packet @@ -514,11 +533,16 @@ static int __init cubictcp_register(void) /* divide by bic_scale and by constant Srtt (100ms) */ do_div(cube_factor, bic_scale * 10); - return tcp_register_congestion_control(&cubictcp); + ret = tcp_register_congestion_control(&cubictcp); + if (ret) + return ret; + register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_cubic_kfunc_btf_set); + return 0; } static void __exit cubictcp_unregister(void) { + unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_cubic_kfunc_btf_set); tcp_unregister_congestion_control(&cubictcp); } diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 79f705450c16..0d7ab3cc7b61 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -36,6 +36,8 @@ * Glenn Judd <glenn.judd@morganstanley.com> */ +#include <linux/btf.h> +#include <linux/btf_ids.h> #include <linux/module.h> #include <linux/mm.h> #include <net/tcp.h> @@ -236,14 +238,36 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = { .name = "dctcp-reno", }; +BTF_SET_START(tcp_dctcp_kfunc_ids) +#ifdef CONFIG_X86 +#ifdef CONFIG_DYNAMIC_FTRACE +BTF_ID(func, dctcp_init) +BTF_ID(func, dctcp_update_alpha) +BTF_ID(func, dctcp_cwnd_event) +BTF_ID(func, dctcp_ssthresh) +BTF_ID(func, dctcp_cwnd_undo) +BTF_ID(func, dctcp_state) +#endif +#endif +BTF_SET_END(tcp_dctcp_kfunc_ids) + +static DEFINE_KFUNC_BTF_ID_SET(&tcp_dctcp_kfunc_ids, tcp_dctcp_kfunc_btf_set); + static int __init dctcp_register(void) { + int ret; + BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE); - return tcp_register_congestion_control(&dctcp); + ret = tcp_register_congestion_control(&dctcp); + if (ret) + return ret; + register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_dctcp_kfunc_btf_set); + return 0; } static void __exit dctcp_unregister(void) { + unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &tcp_dctcp_kfunc_btf_set); tcp_unregister_congestion_control(&dctcp); } diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 25fa4c01a17f..fdbcf2a6d08e 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -1,13 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/crypto.h> -#include <linux/err.h> -#include <linux/init.h> #include <linux/kernel.h> -#include <linux/list.h> #include <linux/tcp.h> #include <linux/rcupdate.h> -#include <linux/rculist.h> -#include <net/inetpeer.h> #include <net/tcp.h> void tcp_fastopen_init_key_once(struct net *net) @@ -55,12 +49,7 @@ void tcp_fastopen_ctx_destroy(struct net *net) { struct tcp_fastopen_context *ctxt; - spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); - - ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, - lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); - rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL); - spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); + ctxt = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, NULL); if (ctxt) call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free); @@ -89,18 +78,12 @@ int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, ctx->num = 1; } - spin_lock(&net->ipv4.tcp_fastopen_ctx_lock); if (sk) { q = &inet_csk(sk)->icsk_accept_queue.fastopenq; - octx = rcu_dereference_protected(q->ctx, - lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); - rcu_assign_pointer(q->ctx, ctx); + octx = xchg((__force struct tcp_fastopen_context **)&q->ctx, ctx); } else { - octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx, - lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock)); - rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx); + octx = xchg((__force struct tcp_fastopen_context **)&net->ipv4.tcp_fastopen_ctx, ctx); } - spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock); if (octx) call_rcu(&octx->rcu, tcp_fastopen_ctx_free); @@ -379,8 +362,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, return NULL; } - if (syn_data && - tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) + if (tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD)) goto fastopen; if (foc->len == 0) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 149ceb5c94ff..246ab7b5e857 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -100,6 +100,7 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE; #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ #define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */ +#define FLAG_DSACK_TLP 0x20000 /* DSACK for tail loss probe */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -454,11 +455,12 @@ static void tcp_sndbuf_expand(struct sock *sk) */ /* Slow part of check#2. */ -static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) +static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb, + unsigned int skbtruesize) { struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ - int truesize = tcp_win_from_space(sk, skb->truesize) >> 1; + int truesize = tcp_win_from_space(sk, skbtruesize) >> 1; int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; while (tp->rcv_ssthresh <= window) { @@ -471,30 +473,59 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) return 0; } -static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) +/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing + * can play nice with us, as sk_buff and skb->head might be either + * freed or shared with up to MAX_SKB_FRAGS segments. + * Only give a boost to drivers using page frag(s) to hold the frame(s), + * and if no payload was pulled in skb->head before reaching us. + */ +static u32 truesize_adjust(bool adjust, const struct sk_buff *skb) +{ + u32 truesize = skb->truesize; + + if (adjust && !skb_headlen(skb)) { + truesize -= SKB_TRUESIZE(skb_end_offset(skb)); + /* paranoid check, some drivers might be buggy */ + if (unlikely((int)truesize < (int)skb->len)) + truesize = skb->truesize; + } + return truesize; +} + +static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb, + bool adjust) { struct tcp_sock *tp = tcp_sk(sk); int room; room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh; + if (room <= 0) + return; + /* Check #1 */ - if (room > 0 && !tcp_under_memory_pressure(sk)) { + if (!tcp_under_memory_pressure(sk)) { + unsigned int truesize = truesize_adjust(adjust, skb); int incr; /* Check #2. Increase window, if skb with such overhead * will fit to rcvbuf in future. */ - if (tcp_win_from_space(sk, skb->truesize) <= skb->len) + if (tcp_win_from_space(sk, truesize) <= skb->len) incr = 2 * tp->advmss; else - incr = __tcp_grow_window(sk, skb); + incr = __tcp_grow_window(sk, skb, truesize); if (incr) { incr = max_t(int, incr, 2 * skb->len); tp->rcv_ssthresh += min(room, incr); inet_csk(sk)->icsk_ack.quick |= 1; } + } else { + /* Under pressure: + * Adjust rcv_ssthresh according to reserved mem + */ + tcp_adjust_rcv_ssthresh(sk); } } @@ -782,7 +813,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) tcp_ecn_check_ce(sk, skb); if (skb->len >= 128) - tcp_grow_window(sk, skb); + tcp_grow_window(sk, skb, true); } /* Called to compute a smoothed rtt estimate. The data fed to this @@ -969,6 +1000,8 @@ static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, return 0; if (seq_len > tp->mss_cache) dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache); + else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq) + state->flag |= FLAG_DSACK_TLP; tp->dsack_dups += dup_segs; /* Skip the DSACK if dup segs weren't retransmitted by sender */ @@ -976,7 +1009,14 @@ static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq, return 0; tp->rx_opt.sack_ok |= TCP_DSACK_SEEN; - tp->rack.dsack_seen = 1; + /* We increase the RACK ordering window in rounds where we receive + * DSACKs that may have been due to reordering causing RACK to trigger + * a spurious fast recovery. Thus RACK ignores DSACKs that happen + * without having seen reordering, or that match TLP probes (TLP + * is timer-driven, not triggered by RACK). + */ + if (tp->reord_seen && !(state->flag & FLAG_DSACK_TLP)) + tp->rack.dsack_seen = 1; state->flag |= FLAG_DSACKING_ACK; /* A spurious retransmission is delivered */ @@ -1314,7 +1354,7 @@ static u8 tcp_sacktag_one(struct sock *sk, if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) - tp->undo_retrans--; + tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount); if ((sacked & TCPCB_SACKED_ACKED) && before(start_seq, state->reord)) state->reord = start_seq; @@ -3189,7 +3229,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, long seq_rtt_us = -1L; long ca_rtt_us = -1L; u32 pkts_acked = 0; - u32 last_in_flight = 0; bool rtt_update; int flag = 0; @@ -3225,7 +3264,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, if (!first_ackt) first_ackt = last_ackt; - last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; if (before(start_seq, reord)) reord = start_seq; if (!after(scb->end_seq, tp->high_seq)) @@ -3291,8 +3329,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); - if (pkts_acked == 1 && last_in_flight < tp->mss_cache && - last_in_flight && !prior_sacked && fully_acked && + if (pkts_acked == 1 && fully_acked && !prior_sacked && + (tp->snd_una - prior_snd_una) < tp->mss_cache && sack->rate->prior_delivered + 1 == tp->delivered && !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) { /* Conservatively mark a delayed ACK. It's typically @@ -3349,9 +3387,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, if (icsk->icsk_ca_ops->pkts_acked) { struct ack_sample sample = { .pkts_acked = pkts_acked, - .rtt_us = sack->rate->rtt_us, - .in_flight = last_in_flight }; + .rtt_us = sack->rate->rtt_us }; + sample.in_flight = tp->mss_cache * + (tp->delivered - sack->rate->prior_delivered); icsk->icsk_ca_ops->pkts_acked(sk, &sample); } @@ -3628,7 +3667,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) if (!tp->tlp_retrans) { /* TLP of new data has been acknowledged */ tp->tlp_high_seq = 0; - } else if (flag & FLAG_DSACKING_ACK) { + } else if (flag & FLAG_DSACK_TLP) { /* This DSACK means original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; } else if (after(ack, tp->tlp_high_seq)) { @@ -4769,7 +4808,7 @@ coalesce_done: * and trigger fast retransmit. */ if (tcp_is_sack(tp)) - tcp_grow_window(sk, skb); + tcp_grow_window(sk, skb, true); kfree_skb_partial(skb, fragstolen); skb = NULL; goto add_sack; @@ -4857,7 +4896,7 @@ end: * and trigger fast retransmit. */ if (tcp_is_sack(tp)) - tcp_grow_window(sk, skb); + tcp_grow_window(sk, skb, false); skb_condense(skb); skb_set_owner_r(skb, sk); } @@ -5314,7 +5353,7 @@ static int tcp_prune_queue(struct sock *sk) if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); else if (tcp_under_memory_pressure(sk)) - tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + tcp_adjust_rcv_ssthresh(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) return 0; @@ -5349,7 +5388,7 @@ static int tcp_prune_queue(struct sock *sk) return -1; } -static bool tcp_should_expand_sndbuf(const struct sock *sk) +static bool tcp_should_expand_sndbuf(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -5360,8 +5399,18 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk) return false; /* If we are under global TCP memory pressure, do not expand. */ - if (tcp_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) { + int unused_mem = sk_unused_reserved_mem(sk); + + /* Adjust sndbuf according to reserved mem. But make sure + * it never goes below SOCK_MIN_SNDBUF. + * See sk_stream_moderate_sndbuf() for more details. + */ + if (unused_mem > SOCK_MIN_SNDBUF) + WRITE_ONCE(sk->sk_sndbuf, unused_mem); + return false; + } /* If we are under soft global TCP memory pressure, do not expand. */ if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0)) @@ -5383,7 +5432,7 @@ static void tcp_new_space(struct sock *sk) tp->snd_cwnd_stamp = tcp_jiffies32; } - sk->sk_write_space(sk); + INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk); } static void tcp_check_space(struct sock *sk) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a692626c19e4..13d868c43284 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -508,9 +508,12 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) if (sk->sk_state == TCP_CLOSE) goto out; - if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { - __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); - goto out; + if (static_branch_unlikely(&ip4_min_ttl)) { + /* min_ttl can be changed concurrently from do_ip_setsockopt() */ + if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { + __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); + goto out; + } } tp = tcp_sk(sk); @@ -1037,6 +1040,20 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); EXPORT_SYMBOL(tcp_md5_needed); +static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) +{ + if (!old) + return true; + + /* l3index always overrides non-l3index */ + if (old->l3index && new->l3index == 0) + return false; + if (old->l3index == 0 && new->l3index) + return true; + + return old->prefixlen < new->prefixlen; +} + /* Find the Key structure for an address. */ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, @@ -1059,7 +1076,7 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, lockdep_sock_is_held(sk)) { if (key->family != family) continue; - if (key->l3index && key->l3index != l3index) + if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index) continue; if (family == AF_INET) { mask = inet_make_mask(key->prefixlen); @@ -1074,8 +1091,7 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, match = false; } - if (match && (!best_match || - key->prefixlen > best_match->prefixlen)) + if (match && better_md5_match(best_match, key)) best_match = key; } return best_match; @@ -1085,7 +1101,7 @@ EXPORT_SYMBOL(__tcp_md5_do_lookup); static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, - int l3index) + int l3index, u8 flags) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; @@ -1105,7 +1121,9 @@ static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, lockdep_sock_is_held(sk)) { if (key->family != family) continue; - if (key->l3index && key->l3index != l3index) + if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX)) + continue; + if (key->l3index != l3index) continue; if (!memcmp(&key->addr, addr, size) && key->prefixlen == prefixlen) @@ -1129,7 +1147,7 @@ EXPORT_SYMBOL(tcp_v4_md5_lookup); /* This can be called on a newly created socket, from other files */ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen, int l3index, + int family, u8 prefixlen, int l3index, u8 flags, const u8 *newkey, u8 newkeylen, gfp_t gfp) { /* Add Key to the list */ @@ -1137,7 +1155,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *md5sig; - key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); if (key) { /* Pre-existing entry - just update that one. * Note that the key might be used concurrently. @@ -1182,6 +1200,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, key->family = family; key->prefixlen = prefixlen; key->l3index = l3index; + key->flags = flags; memcpy(&key->addr, addr, (family == AF_INET6) ? sizeof(struct in6_addr) : sizeof(struct in_addr)); @@ -1191,11 +1210,11 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, EXPORT_SYMBOL(tcp_md5_do_add); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, - u8 prefixlen, int l3index) + u8 prefixlen, int l3index, u8 flags) { struct tcp_md5sig_key *key; - key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags); if (!key) return -ENOENT; hlist_del_rcu(&key->node); @@ -1229,6 +1248,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, const union tcp_md5_addr *addr; u8 prefixlen = 32; int l3index = 0; + u8 flags; if (optlen < sizeof(cmd)) return -EINVAL; @@ -1239,6 +1259,8 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, if (sin->sin_family != AF_INET) return -EINVAL; + flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; + if (optname == TCP_MD5SIG_EXT && cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { prefixlen = cmd.tcpm_prefixlen; @@ -1246,7 +1268,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, return -EINVAL; } - if (optname == TCP_MD5SIG_EXT && + if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { struct net_device *dev; @@ -1267,12 +1289,12 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; if (!cmd.tcpm_keylen) - return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index); + return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags); if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; - return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, + return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } @@ -1596,7 +1618,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, * memory, then we end up not copying the key * across. Shucks. */ - tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, + tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags, key->key, key->keylen, GFP_ATOMIC); sk_nocaps_add(newsk, NETIF_F_GSO_MASK); } @@ -1684,7 +1706,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); if (dst) { - if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || + if (sk->sk_rx_dst_ifindex != skb->skb_iif || !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check, dst, 0)) { dst_release(dst); @@ -1769,7 +1791,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) if (dst) dst = dst_check(dst, 0); if (dst && - inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) + sk->sk_rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); } } @@ -1941,7 +1963,6 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, int tcp_v4_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); - struct sk_buff *skb_to_free; int sdif = inet_sdif(skb); int dif = inet_iif(skb); const struct iphdr *iph; @@ -2050,9 +2071,13 @@ process: return 0; } } - if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { - __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); - goto discard_and_relse; + + if (static_branch_unlikely(&ip4_min_ttl)) { + /* min_ttl can be changed concurrently from do_ip_setsockopt() */ + if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) { + __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); + goto discard_and_relse; + } } if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) @@ -2082,17 +2107,12 @@ process: tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { - skb_to_free = sk->sk_rx_skb_cache; - sk->sk_rx_skb_cache = NULL; ret = tcp_v4_do_rcv(sk, skb); } else { if (tcp_add_backlog(sk, skb)) goto discard_and_relse; - skb_to_free = NULL; } bh_unlock_sock(sk); - if (skb_to_free) - __kfree_skb(skb_to_free); put_and_return: if (refcounted) @@ -2182,7 +2202,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) if (dst && dst_hold_safe(dst)) { sk->sk_rx_dst = dst; - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; + sk->sk_rx_dst_ifindex = skb->skb_iif; } } EXPORT_SYMBOL(inet_sk_rx_dst_set); @@ -2277,51 +2297,72 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ -/* - * Get next listener socket follow cur. If cur is NULL, get first socket - * starting from bucket given in st->bucket; when st->bucket is zero the - * very first socket in the hash table is returned. +static unsigned short seq_file_family(const struct seq_file *seq); + +static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) +{ + unsigned short family = seq_file_family(seq); + + /* AF_UNSPEC is used as a match all */ + return ((family == AF_UNSPEC || family == sk->sk_family) && + net_eq(sock_net(sk), seq_file_net(seq))); +} + +/* Find a non empty bucket (starting from st->bucket) + * and return the first sk from it. */ -static void *listening_get_next(struct seq_file *seq, void *cur) +static void *listening_get_first(struct seq_file *seq) { - struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; - struct net *net = seq_file_net(seq); - struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; - struct sock *sk = cur; - if (st->bpf_seq_afinfo) - afinfo = st->bpf_seq_afinfo; - else - afinfo = PDE_DATA(file_inode(seq->file)); + st->offset = 0; + for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { + struct inet_listen_hashbucket *ilb2; + struct inet_connection_sock *icsk; + struct sock *sk; - if (!sk) { -get_head: - ilb = &tcp_hashinfo.listening_hash[st->bucket]; - spin_lock(&ilb->lock); - sk = sk_nulls_head(&ilb->nulls_head); - st->offset = 0; - goto get_sk; + ilb2 = &tcp_hashinfo.lhash2[st->bucket]; + if (hlist_empty(&ilb2->head)) + continue; + + spin_lock(&ilb2->lock); + inet_lhash2_for_each_icsk(icsk, &ilb2->head) { + sk = (struct sock *)icsk; + if (seq_sk_match(seq, sk)) + return sk; + } + spin_unlock(&ilb2->lock); } - ilb = &tcp_hashinfo.listening_hash[st->bucket]; + + return NULL; +} + +/* Find the next sk of "cur" within the same bucket (i.e. st->bucket). + * If "cur" is the last one in the st->bucket, + * call listening_get_first() to return the first sk of the next + * non empty bucket. + */ +static void *listening_get_next(struct seq_file *seq, void *cur) +{ + struct tcp_iter_state *st = seq->private; + struct inet_listen_hashbucket *ilb2; + struct inet_connection_sock *icsk; + struct sock *sk = cur; + ++st->num; ++st->offset; - sk = sk_nulls_next(sk); -get_sk: - sk_nulls_for_each_from(sk, node) { - if (!net_eq(sock_net(sk), net)) - continue; - if (afinfo->family == AF_UNSPEC || - sk->sk_family == afinfo->family) + icsk = inet_csk(sk); + inet_lhash2_for_each_icsk_continue(icsk) { + sk = (struct sock *)icsk; + if (seq_sk_match(seq, sk)) return sk; } - spin_unlock(&ilb->lock); - st->offset = 0; - if (++st->bucket < INET_LHTABLE_SIZE) - goto get_head; - return NULL; + + ilb2 = &tcp_hashinfo.lhash2[st->bucket]; + spin_unlock(&ilb2->lock); + ++st->bucket; + return listening_get_first(seq); } static void *listening_get_idx(struct seq_file *seq, loff_t *pos) @@ -2331,7 +2372,7 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) st->bucket = 0; st->offset = 0; - rc = listening_get_next(seq, NULL); + rc = listening_get_first(seq); while (rc && *pos) { rc = listening_get_next(seq, rc); @@ -2351,15 +2392,7 @@ static inline bool empty_bucket(const struct tcp_iter_state *st) */ static void *established_get_first(struct seq_file *seq) { - struct tcp_seq_afinfo *afinfo; struct tcp_iter_state *st = seq->private; - struct net *net = seq_file_net(seq); - void *rc = NULL; - - if (st->bpf_seq_afinfo) - afinfo = st->bpf_seq_afinfo; - else - afinfo = PDE_DATA(file_inode(seq->file)); st->offset = 0; for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { @@ -2373,32 +2406,20 @@ static void *established_get_first(struct seq_file *seq) spin_lock_bh(lock); sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { - if ((afinfo->family != AF_UNSPEC && - sk->sk_family != afinfo->family) || - !net_eq(sock_net(sk), net)) { - continue; - } - rc = sk; - goto out; + if (seq_sk_match(seq, sk)) + return sk; } spin_unlock_bh(lock); } -out: - return rc; + + return NULL; } static void *established_get_next(struct seq_file *seq, void *cur) { - struct tcp_seq_afinfo *afinfo; struct sock *sk = cur; struct hlist_nulls_node *node; struct tcp_iter_state *st = seq->private; - struct net *net = seq_file_net(seq); - - if (st->bpf_seq_afinfo) - afinfo = st->bpf_seq_afinfo; - else - afinfo = PDE_DATA(file_inode(seq->file)); ++st->num; ++st->offset; @@ -2406,9 +2427,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) sk = sk_nulls_next(sk); sk_nulls_for_each_from(sk, node) { - if ((afinfo->family == AF_UNSPEC || - sk->sk_family == afinfo->family) && - net_eq(sock_net(sk), net)) + if (seq_sk_match(seq, sk)) return sk; } @@ -2451,17 +2470,18 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) static void *tcp_seek_last_pos(struct seq_file *seq) { struct tcp_iter_state *st = seq->private; + int bucket = st->bucket; int offset = st->offset; int orig_num = st->num; void *rc = NULL; switch (st->state) { case TCP_SEQ_STATE_LISTENING: - if (st->bucket >= INET_LHTABLE_SIZE) + if (st->bucket > tcp_hashinfo.lhash2_mask) break; st->state = TCP_SEQ_STATE_LISTENING; - rc = listening_get_next(seq, NULL); - while (offset-- && rc) + rc = listening_get_first(seq); + while (offset-- && rc && bucket == st->bucket) rc = listening_get_next(seq, rc); if (rc) break; @@ -2472,7 +2492,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) if (st->bucket > tcp_hashinfo.ehash_mask) break; rc = established_get_first(seq); - while (offset-- && rc) + while (offset-- && rc && bucket == st->bucket) rc = established_get_next(seq, rc); } @@ -2542,7 +2562,7 @@ void tcp_seq_stop(struct seq_file *seq, void *v) switch (st->state) { case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) - spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock); + spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); break; case TCP_SEQ_STATE_ESTABLISHED: if (v) @@ -2687,6 +2707,15 @@ out: } #ifdef CONFIG_BPF_SYSCALL +struct bpf_tcp_iter_state { + struct tcp_iter_state state; + unsigned int cur_sk; + unsigned int end_sk; + unsigned int max_sk; + struct sock **batch; + bool st_bucket_done; +}; + struct bpf_iter__tcp { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct sock_common *, sk_common); @@ -2705,16 +2734,204 @@ static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, return bpf_iter_run_prog(prog, &ctx); } +static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter) +{ + while (iter->cur_sk < iter->end_sk) + sock_put(iter->batch[iter->cur_sk++]); +} + +static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, + unsigned int new_batch_sz) +{ + struct sock **new_batch; + + new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, + GFP_USER | __GFP_NOWARN); + if (!new_batch) + return -ENOMEM; + + bpf_iter_tcp_put_batch(iter); + kvfree(iter->batch); + iter->batch = new_batch; + iter->max_sk = new_batch_sz; + + return 0; +} + +static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, + struct sock *start_sk) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + struct inet_connection_sock *icsk; + unsigned int expected = 1; + struct sock *sk; + + sock_hold(start_sk); + iter->batch[iter->end_sk++] = start_sk; + + icsk = inet_csk(start_sk); + inet_lhash2_for_each_icsk_continue(icsk) { + sk = (struct sock *)icsk; + if (seq_sk_match(seq, sk)) { + if (iter->end_sk < iter->max_sk) { + sock_hold(sk); + iter->batch[iter->end_sk++] = sk; + } + expected++; + } + } + spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); + + return expected; +} + +static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, + struct sock *start_sk) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + struct hlist_nulls_node *node; + unsigned int expected = 1; + struct sock *sk; + + sock_hold(start_sk); + iter->batch[iter->end_sk++] = start_sk; + + sk = sk_nulls_next(start_sk); + sk_nulls_for_each_from(sk, node) { + if (seq_sk_match(seq, sk)) { + if (iter->end_sk < iter->max_sk) { + sock_hold(sk); + iter->batch[iter->end_sk++] = sk; + } + expected++; + } + } + spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); + + return expected; +} + +static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + unsigned int expected; + bool resized = false; + struct sock *sk; + + /* The st->bucket is done. Directly advance to the next + * bucket instead of having the tcp_seek_last_pos() to skip + * one by one in the current bucket and eventually find out + * it has to advance to the next bucket. + */ + if (iter->st_bucket_done) { + st->offset = 0; + st->bucket++; + if (st->state == TCP_SEQ_STATE_LISTENING && + st->bucket > tcp_hashinfo.lhash2_mask) { + st->state = TCP_SEQ_STATE_ESTABLISHED; + st->bucket = 0; + } + } + +again: + /* Get a new batch */ + iter->cur_sk = 0; + iter->end_sk = 0; + iter->st_bucket_done = false; + + sk = tcp_seek_last_pos(seq); + if (!sk) + return NULL; /* Done */ + + if (st->state == TCP_SEQ_STATE_LISTENING) + expected = bpf_iter_tcp_listening_batch(seq, sk); + else + expected = bpf_iter_tcp_established_batch(seq, sk); + + if (iter->end_sk == expected) { + iter->st_bucket_done = true; + return sk; + } + + if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) { + resized = true; + goto again; + } + + return sk; +} + +static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos) +{ + /* bpf iter does not support lseek, so it always + * continue from where it was stop()-ped. + */ + if (*pos) + return bpf_iter_tcp_batch(seq); + + return SEQ_START_TOKEN; +} + +static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_tcp_iter_state *iter = seq->private; + struct tcp_iter_state *st = &iter->state; + struct sock *sk; + + /* Whenever seq_next() is called, the iter->cur_sk is + * done with seq_show(), so advance to the next sk in + * the batch. + */ + if (iter->cur_sk < iter->end_sk) { + /* Keeping st->num consistent in tcp_iter_state. + * bpf_iter_tcp does not use st->num. + * meta.seq_num is used instead. + */ + st->num++; + /* Move st->offset to the next sk in the bucket such that + * the future start() will resume at st->offset in + * st->bucket. See tcp_seek_last_pos(). + */ + st->offset++; + sock_put(iter->batch[iter->cur_sk++]); + } + + if (iter->cur_sk < iter->end_sk) + sk = iter->batch[iter->cur_sk]; + else + sk = bpf_iter_tcp_batch(seq); + + ++*pos; + /* Keeping st->last_pos consistent in tcp_iter_state. + * bpf iter does not do lseek, so st->last_pos always equals to *pos. + */ + st->last_pos = *pos; + return sk; +} + static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; + bool slow; uid_t uid; + int ret; if (v == SEQ_START_TOKEN) return 0; + if (sk_fullsock(sk)) + slow = lock_sock_fast(sk); + + if (unlikely(sk_unhashed(sk))) { + ret = SEQ_SKIP; + goto unlock; + } + if (sk->sk_state == TCP_TIME_WAIT) { uid = 0; } else if (sk->sk_state == TCP_NEW_SYN_RECV) { @@ -2728,11 +2945,18 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) meta.seq = seq; prog = bpf_iter_get_info(&meta, false); - return tcp_prog_seq_show(prog, &meta, v, uid); + ret = tcp_prog_seq_show(prog, &meta, v, uid); + +unlock: + if (sk_fullsock(sk)) + unlock_sock_fast(sk, slow); + return ret; + } static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) { + struct bpf_tcp_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; @@ -2743,16 +2967,33 @@ static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v) (void)tcp_prog_seq_show(prog, &meta, v, 0); } - tcp_seq_stop(seq, v); + if (iter->cur_sk < iter->end_sk) { + bpf_iter_tcp_put_batch(iter); + iter->st_bucket_done = false; + } } static const struct seq_operations bpf_iter_tcp_seq_ops = { .show = bpf_iter_tcp_seq_show, - .start = tcp_seq_start, - .next = tcp_seq_next, + .start = bpf_iter_tcp_seq_start, + .next = bpf_iter_tcp_seq_next, .stop = bpf_iter_tcp_seq_stop, }; #endif +static unsigned short seq_file_family(const struct seq_file *seq) +{ + const struct tcp_seq_afinfo *afinfo; + +#ifdef CONFIG_BPF_SYSCALL + /* Iterated from bpf_iter. Let the bpf prog to filter instead. */ + if (seq->op == &bpf_iter_tcp_seq_ops) + return AF_UNSPEC; +#endif + + /* Iterated from proc fs */ + afinfo = PDE_DATA(file_inode(seq->file)); + return afinfo->family; +} static const struct seq_operations tcp4_seq_ops = { .show = tcp4_seq_show, @@ -2964,7 +3205,6 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; net->ipv4.sysctl_tcp_comp_sack_nr = 44; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; - spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; atomic_set(&net->ipv4.tfo_active_disable_times, 0); @@ -3003,39 +3243,55 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, struct sock_common *sk_common, uid_t uid) +#define INIT_BATCH_SZ 16 + static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) { - struct tcp_iter_state *st = priv_data; - struct tcp_seq_afinfo *afinfo; - int ret; + struct bpf_tcp_iter_state *iter = priv_data; + int err; - afinfo = kmalloc(sizeof(*afinfo), GFP_USER | __GFP_NOWARN); - if (!afinfo) - return -ENOMEM; + err = bpf_iter_init_seq_net(priv_data, aux); + if (err) + return err; - afinfo->family = AF_UNSPEC; - st->bpf_seq_afinfo = afinfo; - ret = bpf_iter_init_seq_net(priv_data, aux); - if (ret) - kfree(afinfo); - return ret; + err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ); + if (err) { + bpf_iter_fini_seq_net(priv_data); + return err; + } + + return 0; } static void bpf_iter_fini_tcp(void *priv_data) { - struct tcp_iter_state *st = priv_data; + struct bpf_tcp_iter_state *iter = priv_data; - kfree(st->bpf_seq_afinfo); bpf_iter_fini_seq_net(priv_data); + kvfree(iter->batch); } static const struct bpf_iter_seq_info tcp_seq_info = { .seq_ops = &bpf_iter_tcp_seq_ops, .init_seq_private = bpf_iter_init_tcp, .fini_seq_private = bpf_iter_fini_tcp, - .seq_priv_size = sizeof(struct tcp_iter_state), + .seq_priv_size = sizeof(struct bpf_tcp_iter_state), }; +static const struct bpf_func_proto * +bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_setsockopt: + return &bpf_sk_setsockopt_proto; + case BPF_FUNC_getsockopt: + return &bpf_sk_getsockopt_proto; + default: + return NULL; + } +} + static struct bpf_iter_reg tcp_reg_info = { .target = "tcp", .ctx_arg_info_size = 1, @@ -3043,6 +3299,7 @@ static struct bpf_iter_reg tcp_reg_info = { { offsetof(struct bpf_iter__tcp, sk_common), PTR_TO_BTF_ID_OR_NULL }, }, + .get_func_proto = bpf_iter_tcp_get_func_proto, .seq_info = &tcp_seq_info, }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 0a4f3f16140a..cf913a66df17 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -19,14 +19,7 @@ * Jorge Cwik, <jorge@laser.satlink.net> */ -#include <linux/mm.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/sysctl.h> -#include <linux/workqueue.h> -#include <linux/static_key.h> #include <net/tcp.h> -#include <net/inet_common.h> #include <net/xfrm.h> #include <net/busy_poll.h> diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 95db7a11ba2a..ab552356bdba 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -25,7 +25,6 @@ * 1) Add mechanism to deal with reverse congestion. */ -#include <linux/mm.h> #include <linux/module.h> #include <linux/math64.h> #include <net/tcp.h> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 29553fce8502..6fbbf1558033 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -394,7 +394,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) skb->ip_summed = CHECKSUM_PARTIAL; TCP_SKB_CB(skb)->tcp_flags = flags; - TCP_SKB_CB(skb)->sacked = 0; tcp_skb_pcount_set(skb, 1); @@ -1256,8 +1255,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); skb->skb_mstamp_ns = tp->tcp_wstamp_ns; if (clone_it) { - TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - - tp->snd_una; oskb = skb; tcp_skb_tsorted_save(oskb) { @@ -1566,7 +1563,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, return -ENOMEM; /* Get a new skb... force flag on. */ - buff = sk_stream_alloc_skb(sk, nsize, gfp, true); + buff = tcp_stream_alloc_skb(sk, nsize, gfp, true); if (!buff) return -ENOMEM; /* We'll just try again later. */ skb_copy_decrypted(buff, skb); @@ -1592,8 +1589,6 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, skb_split(skb, buff, len); - buff->ip_summed = CHECKSUM_PARTIAL; - buff->tstamp = skb->tstamp; tcp_fragment_tstamp(skb, buff); @@ -1678,7 +1673,6 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) delta_truesize = __pskb_trim_head(skb, len); TCP_SKB_CB(skb)->seq += len; - skb->ip_summed = CHECKSUM_PARTIAL; if (delta_truesize) { skb->truesize -= delta_truesize; @@ -2123,7 +2117,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, return tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, skb, len, mss_now, gfp); - buff = sk_stream_alloc_skb(sk, 0, gfp, true); + buff = tcp_stream_alloc_skb(sk, 0, gfp, true); if (unlikely(!buff)) return -ENOMEM; skb_copy_decrypted(buff, skb); @@ -2144,12 +2138,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH); TCP_SKB_CB(buff)->tcp_flags = flags; - /* This packet was never sent out yet, so no SACK bits. */ - TCP_SKB_CB(buff)->sacked = 0; - tcp_skb_fragment_eor(skb, buff); - buff->ip_summed = CHECKSUM_PARTIAL; skb_split(skb, buff, len); tcp_fragment_tstamp(skb, buff); @@ -2390,7 +2380,7 @@ static int tcp_mtu_probe(struct sock *sk) return -1; /* We're allowed to probe. Build it now. */ - nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); + nskb = tcp_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); if (!nskb) return -1; sk_wmem_queued_add(sk, nskb->truesize); @@ -2403,9 +2393,6 @@ static int tcp_mtu_probe(struct sock *sk) TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK; - TCP_SKB_CB(nskb)->sacked = 0; - nskb->csum = 0; - nskb->ip_summed = CHECKSUM_PARTIAL; tcp_insert_write_queue_before(nskb, skb, sk); tcp_highest_sack_replace(sk, skb, nskb); @@ -2425,7 +2412,7 @@ static int tcp_mtu_probe(struct sock *sk) TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor; tcp_skb_collapse_tstamp(nskb, skb); tcp_unlink_write_queue(skb, sk); - sk_wmem_free_skb(sk, skb); + tcp_wmem_free_skb(sk, skb); } else { TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags & ~(TCPHDR_FIN|TCPHDR_PSH); @@ -2969,8 +2956,7 @@ u32 __tcp_select_window(struct sock *sk) icsk->icsk_ack.quick = 0; if (tcp_under_memory_pressure(sk)) - tp->rcv_ssthresh = min(tp->rcv_ssthresh, - 4U * tp->advmss); + tcp_adjust_rcv_ssthresh(sk); /* free_space might become our new window, make sure we don't * increase it due to wscale. @@ -3048,13 +3034,9 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); - if (next_skb_size) { - if (next_skb_size <= skb_availroom(skb)) - skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size), - next_skb_size); - else if (!tcp_skb_shift(skb, next_skb, 1, next_skb_size)) - return false; - } + if (next_skb_size && !tcp_skb_shift(skb, next_skb, 1, next_skb_size)) + return false; + tcp_highest_sack_replace(sk, next_skb, skb); /* Update sequence range on original skb. */ @@ -3373,7 +3355,8 @@ void sk_forced_mem_schedule(struct sock *sk, int size) sk_memory_allocated_add(sk, amt); if (mem_cgroup_sockets_enabled && sk->sk_memcg) - mem_cgroup_charge_skmem(sk->sk_memcg, amt); + mem_cgroup_charge_skmem(sk->sk_memcg, amt, + gfp_memcg_charge() | __GFP_NOFAIL); } /* Send a FIN. The caller locks the socket for us. @@ -3756,10 +3739,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) /* limit to order-0 allocations */ space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER)); - syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false); + syn_data = tcp_stream_alloc_skb(sk, space, sk->sk_allocation, false); if (!syn_data) goto fallback; - syn_data->ip_summed = CHECKSUM_PARTIAL; memcpy(syn_data->cb, syn->cb, sizeof(syn->cb)); if (space) { int copied = copy_from_iter(skb_put(syn_data, space), space, @@ -3837,7 +3819,7 @@ int tcp_connect(struct sock *sk) return 0; } - buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true); + buff = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation, true); if (unlikely(!buff)) return -ENOBUFS; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 0de693565963..fbab921670cc 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -65,6 +65,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; } @@ -86,6 +87,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, if (!rs->prior_delivered || after(scb->tx.delivered, rs->prior_delivered)) { + rs->prior_delivered_ce = scb->tx.delivered_ce; rs->prior_delivered = scb->tx.delivered; rs->prior_mstamp = scb->tx.delivered_mstamp; rs->is_app_limited = scb->tx.is_app_limited; @@ -138,6 +140,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, } rs->delivered = tp->delivered - rs->prior_delivered; + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ + rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK; + /* Model sending data and receiving ACKs as separate pipeline phases * for a window. Usually the ACK phase is longer, but with ACK * compression the send phase can be longer. To be safe we use the diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 6f1b4ac7fe99..fd113f6226ef 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -172,7 +172,8 @@ void tcp_rack_reo_timeout(struct sock *sk) /* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. * - * If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded + * If a DSACK is received that seems like it may have been due to reordering + * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded * by srtt), since there is possibility that spurious retransmission was * due to reordering delay longer than reo_wnd. * diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1a742b710e54..2fffcf2b54f3 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -390,7 +390,8 @@ static int compute_score(struct sock *sk, struct net *net, dif, sdif); if (!dev_match) return -1; - score += 4; + if (sk->sk_bound_dev_if) + score += 4; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; @@ -1053,7 +1054,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) __be16 dport; u8 tos; int err, is_udplite = IS_UDPLITE(sk); - int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sk_buff *skb; struct ip_options_data opt_copy; @@ -1143,7 +1144,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) { + if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &ipc.addr); if (err) @@ -1361,7 +1362,7 @@ int udp_sendpage(struct sock *sk, struct page *page, int offset, } up->len += size; - if (!(up->corkflag || (flags&MSG_MORE))) + if (!(READ_ONCE(up->corkflag) || (flags&MSG_MORE))) ret = udp_push_pending_frames(sk); if (!ret) ret = size; @@ -2662,9 +2663,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, switch (optname) { case UDP_CORK: if (val != 0) { - up->corkflag = 1; + WRITE_ONCE(up->corkflag, 1); } else { - up->corkflag = 0; + WRITE_ONCE(up->corkflag, 0); lock_sock(sk); push_pending_frames(sk); release_sock(sk); @@ -2787,7 +2788,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, switch (optname) { case UDP_CORK: - val = up->corkflag; + val = READ_ONCE(up->corkflag); break; case UDP_ENCAP: @@ -2866,6 +2867,9 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1) mask &= ~(EPOLLIN | EPOLLRDNORM); + /* psock ingress_msg queue should not contain any bad checksum frames */ + if (sk_is_readable(sk)) + mask |= EPOLLIN | EPOLLRDNORM; return mask; } diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index 9f5a5cdc38e6..bbe6569c9ad3 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -112,9 +112,9 @@ static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS]; static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; - prot->unhash = sock_map_unhash; prot->close = sock_map_close; prot->recvmsg = udp_bpf_recvmsg; + prot->sock_is_readable = sk_msg_is_readable; } static void udp_bpf_check_v6_needs_rebuild(struct proto *ops) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 1380a6b6f4ff..86d32a1e62ac 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -152,8 +152,8 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, bool is_ipv6) { + const struct net_offload __rcu **offloads; __be16 protocol = skb->protocol; - const struct net_offload **offloads; const struct net_offload *ops; struct sk_buff *segs = ERR_PTR(-EINVAL); struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb, diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index b97e3635acf5..8efaf8c3fe2a 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -2,11 +2,8 @@ #include <linux/module.h> #include <linux/errno.h> #include <linux/socket.h> -#include <linux/udp.h> -#include <linux/types.h> #include <linux/kernel.h> #include <net/dst_metadata.h> -#include <net/net_namespace.h> #include <net/udp.h> #include <net/udp_tunnel.h> diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c index 0d122edc368d..b91003538d87 100644 --- a/net/ipv4/udp_tunnel_nic.c +++ b/net/ipv4/udp_tunnel_nic.c @@ -935,7 +935,7 @@ static int __init udp_tunnel_nic_init_module(void) { int err; - udp_tunnel_nic_workqueue = alloc_workqueue("udp_tunnel_nic", 0, 0); + udp_tunnel_nic_workqueue = alloc_ordered_workqueue("udp_tunnel_nic", 0); if (!udp_tunnel_nic_workqueue) return -ENOMEM; diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index f4555a88f86b..9d4f418f1bf8 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -8,9 +8,7 @@ #include <linux/skbuff.h> #include <linux/module.h> -#include <linux/mutex.h> #include <net/xfrm.h> -#include <net/ip.h> #include <net/protocol.h> static int ipip_output(struct xfrm_state *x, struct sk_buff *skb) diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 747f56e0c636..bf2e5e5fe142 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -328,4 +328,15 @@ config IPV6_RPL_LWTUNNEL If unsure, say N. +config IPV6_IOAM6_LWTUNNEL + bool "IPv6: IOAM Pre-allocated Trace insertion support" + depends on IPV6 + select LWTUNNEL + select DST_CACHE + help + Support for the insertion of IOAM Pre-allocated Trace + Header using the lightweight tunnels mechanism. + + If unsure, say N. + endif # IPV6 diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index cf7b47bdb9b3..3036a45e8a1e 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -5,16 +5,14 @@ obj-$(CONFIG_IPV6) += ipv6.o -ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ +ipv6-y := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \ addrlabel.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \ raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \ exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \ - udp_offload.o seg6.o fib6_notifier.o rpl.o + udp_offload.o seg6.o fib6_notifier.o rpl.o ioam6.o -ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o - -ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o +ipv6-$(CONFIG_SYSCTL) += sysctl_net_ipv6.o ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ @@ -27,8 +25,7 @@ ipv6-$(CONFIG_NETLABEL) += calipso.o ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o seg6_local.o ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o ipv6-$(CONFIG_IPV6_RPL_LWTUNNEL) += rpl_iptunnel.o - -ipv6-objs += $(ipv6-y) +ipv6-$(CONFIG_IPV6_IOAM6_LWTUNNEL) += ioam6_iptunnel.o obj-$(CONFIG_INET6_AH) += ah6.o obj-$(CONFIG_INET6_ESP) += esp6.o @@ -47,7 +44,8 @@ obj-$(CONFIG_IPV6_GRE) += ip6_gre.o obj-$(CONFIG_IPV6_FOU) += fou6.o obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o -obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload) +obj-$(CONFIG_INET) += output_core.o protocol.o \ + ip6_offload.o tcpv6_offload.o exthdrs_offload.o obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3bf685fe64b9..3445f8017430 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -89,6 +89,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/export.h> +#include <linux/ioam6.h> #define INFINITY_LIFE_TIME 0xFFFFFFFF @@ -237,6 +238,10 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, .disable_policy = 0, .rpl_seg_enabled = 0, + .ioam6_enabled = 0, + .ioam6_id = IOAM6_DEFAULT_IF_ID, + .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE, + .ndisc_evict_nocarrier = 1, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -293,6 +298,10 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, .disable_policy = 0, .rpl_seg_enabled = 0, + .ioam6_enabled = 0, + .ioam6_id = IOAM6_DEFAULT_IF_ID, + .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE, + .ndisc_evict_nocarrier = 1, }; /* Check if link is ready: is it up and is a valid qdisc available */ @@ -387,6 +396,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY; ndev->cnf.mtu6 = dev->mtu; + ndev->ra_mtu = 0; ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); if (!ndev->nd_parms) { kfree(ndev); @@ -694,8 +704,7 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, errout: if (in6_dev) in6_dev_put(in6_dev); - if (dev) - dev_put(dev); + dev_put(dev); return err; } @@ -1080,7 +1089,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg, goto out; } - ifa = kzalloc(sizeof(*ifa), gfp_flags); + ifa = kzalloc(sizeof(*ifa), gfp_flags | __GFP_ACCOUNT); if (!ifa) { err = -ENOBUFS; goto out; @@ -2230,12 +2239,12 @@ static int addrconf_ifid_6lowpan(u8 *eui, struct net_device *dev) static int addrconf_ifid_ieee1394(u8 *eui, struct net_device *dev) { - union fwnet_hwaddr *ha; + const union fwnet_hwaddr *ha; if (dev->addr_len != FWNET_ALEN) return -1; - ha = (union fwnet_hwaddr *)dev->dev_addr; + ha = (const union fwnet_hwaddr *)dev->dev_addr; memcpy(eui, &ha->uc.uniq_id, sizeof(ha->uc.uniq_id)); eui[0] ^= 2; @@ -3085,21 +3094,27 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, } } -#if IS_ENABLED(CONFIG_IPV6_SIT) -static void sit_add_v4_addrs(struct inet6_dev *idev) +#if IS_ENABLED(CONFIG_IPV6_SIT) || IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) +static void add_v4_addrs(struct inet6_dev *idev) { struct in6_addr addr; struct net_device *dev; struct net *net = dev_net(idev->dev); - int scope, plen; + int scope, plen, offset = 0; u32 pflags = 0; ASSERT_RTNL(); memset(&addr, 0, sizeof(struct in6_addr)); - memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); + /* in case of IP6GRE the dev_addr is an IPv6 and therefore we use only the last 4 bytes */ + if (idev->dev->addr_len == sizeof(struct in6_addr)) + offset = sizeof(struct in6_addr) - 4; + memcpy(&addr.s6_addr32[3], idev->dev->dev_addr + offset, 4); if (idev->dev->flags&IFF_POINTOPOINT) { + if (idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_NONE) + return; + addr.s6_addr32[0] = htonl(0xfe800000); scope = IFA_LINK; plen = 64; @@ -3335,8 +3350,6 @@ static void addrconf_dev_config(struct net_device *dev) (dev->type != ARPHRD_IEEE1394) && (dev->type != ARPHRD_TUNNEL6) && (dev->type != ARPHRD_6LOWPAN) && - (dev->type != ARPHRD_IP6GRE) && - (dev->type != ARPHRD_IPGRE) && (dev->type != ARPHRD_TUNNEL) && (dev->type != ARPHRD_NONE) && (dev->type != ARPHRD_RAWIP)) { @@ -3384,14 +3397,14 @@ static void addrconf_sit_config(struct net_device *dev) return; } - sit_add_v4_addrs(idev); + add_v4_addrs(idev); if (dev->flags&IFF_POINTOPOINT) addrconf_add_mroute(dev); } #endif -#if IS_ENABLED(CONFIG_NET_IPGRE) +#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) static void addrconf_gre_config(struct net_device *dev) { struct inet6_dev *idev; @@ -3404,7 +3417,13 @@ static void addrconf_gre_config(struct net_device *dev) return; } - addrconf_addr_gen(idev, true); + if (dev->type == ARPHRD_ETHER) { + addrconf_addr_gen(idev, true); + return; + } + + add_v4_addrs(idev); + if (dev->flags & IFF_POINTOPOINT) addrconf_add_mroute(dev); } @@ -3580,7 +3599,8 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, addrconf_sit_config(dev); break; #endif -#if IS_ENABLED(CONFIG_NET_IPGRE) +#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) + case ARPHRD_IP6GRE: case ARPHRD_IPGRE: addrconf_gre_config(dev); break; @@ -3843,6 +3863,7 @@ restart: } idev->tstamp = jiffies; + idev->ra_mtu = 0; /* Last: Shot the device (if unregistered) */ if (unregister) { @@ -5211,8 +5232,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, .netnsid = -1, .type = type, }; - struct net *net = sock_net(skb->sk); - struct net *tgt_net = net; + struct net *tgt_net = sock_net(skb->sk); int idx, s_idx, s_ip_idx; int h, s_h; struct net_device *dev; @@ -5351,7 +5371,7 @@ static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb, static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { - struct net *net = sock_net(in_skb->sk); + struct net *tgt_net = sock_net(in_skb->sk); struct inet6_fill_args fillargs = { .portid = NETLINK_CB(in_skb).portid, .seq = nlh->nlmsg_seq, @@ -5359,7 +5379,6 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, .flags = 0, .netnsid = -1, }; - struct net *tgt_net = net; struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; struct in6_addr *addr = NULL, *peer; @@ -5412,8 +5431,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, errout_ifa: in6_ifa_put(ifa); errout: - if (dev) - dev_put(dev); + dev_put(dev); if (fillargs.netnsid >= 0) put_net(tgt_net); @@ -5526,6 +5544,10 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy; array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass; array[DEVCONF_RPL_SEG_ENABLED] = cnf->rpl_seg_enabled; + array[DEVCONF_IOAM6_ENABLED] = cnf->ioam6_enabled; + array[DEVCONF_IOAM6_ID] = cnf->ioam6_id; + array[DEVCONF_IOAM6_ID_WIDE] = cnf->ioam6_id_wide; + array[DEVCONF_NDISC_EVICT_NOCARRIER] = cnf->ndisc_evict_nocarrier; } static inline size_t inet6_ifla6_size(void) @@ -5537,6 +5559,7 @@ static inline size_t inet6_ifla6_size(void) + nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */ + nla_total_size(sizeof(struct in6_addr)) /* IFLA_INET6_TOKEN */ + nla_total_size(1) /* IFLA_INET6_ADDR_GEN_MODE */ + + nla_total_size(4) /* IFLA_INET6_RA_MTU */ + 0; } @@ -5645,6 +5668,10 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev, if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode)) goto nla_put_failure; + if (idev->ra_mtu && + nla_put_u32(skb, IFLA_INET6_RA_MTU, idev->ra_mtu)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -5761,6 +5788,9 @@ update_lft: static const struct nla_policy inet6_af_policy[IFLA_INET6_MAX + 1] = { [IFLA_INET6_ADDR_GEN_MODE] = { .type = NLA_U8 }, [IFLA_INET6_TOKEN] = { .len = sizeof(struct in6_addr) }, + [IFLA_INET6_RA_MTU] = { .type = NLA_REJECT, + .reject_message = + "IFLA_INET6_RA_MTU can not be set" }, }; static int check_addr_gen_mode(int mode) @@ -5784,7 +5814,8 @@ static int check_stable_privacy(struct inet6_dev *idev, struct net *net, } static int inet6_validate_link_af(const struct net_device *dev, - const struct nlattr *nla) + const struct nlattr *nla, + struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_INET6_MAX + 1]; struct inet6_dev *idev = NULL; @@ -5797,7 +5828,7 @@ static int inet6_validate_link_af(const struct net_device *dev, } err = nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla, - inet6_af_policy, NULL); + inet6_af_policy, extack); if (err) return err; @@ -6540,6 +6571,7 @@ static int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, static int minus_one = -1; static const int two_five_five = 255; +static u32 ioam6_if_id_max = U16_MAX; static const struct ctl_table addrconf_sysctl[] = { { @@ -6933,6 +6965,40 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, { + .procname = "ioam6_enabled", + .data = &ipv6_devconf.ioam6_enabled, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = (void *)SYSCTL_ZERO, + .extra2 = (void *)SYSCTL_ONE, + }, + { + .procname = "ioam6_id", + .data = &ipv6_devconf.ioam6_id, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = (void *)SYSCTL_ZERO, + .extra2 = (void *)&ioam6_if_id_max, + }, + { + .procname = "ioam6_id_wide", + .data = &ipv6_devconf.ioam6_id_wide, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_douintvec, + }, + { + .procname = "ndisc_evict_nocarrier", + .data = &ipv6_devconf.ndisc_evict_nocarrier, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = (void *)SYSCTL_ZERO, + .extra2 = (void *)SYSCTL_ONE, + }, + { /* sentinel */ } }; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2389ff702f51..0c4da163535a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -62,6 +62,7 @@ #include <net/rpl.h> #include <net/compat.h> #include <net/xfrm.h> +#include <net/ioam6.h> #include <linux/uaccess.h> #include <linux/mroute6.h> @@ -454,7 +455,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, - BPF_CGROUP_INET6_BIND, &flags); + CGROUP_INET6_BIND, &flags); if (err) return err; @@ -520,31 +521,32 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; sin->sin6_scope_id = 0; + lock_sock(sk); if (peer) { - if (!inet->inet_dport) - return -ENOTCONN; - if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && - peer == 1) + if (!inet->inet_dport || + (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && + peer == 1)) { + release_sock(sk); return -ENOTCONN; + } sin->sin6_port = inet->inet_dport; sin->sin6_addr = sk->sk_v6_daddr; if (np->sndflow) sin->sin6_flowinfo = np->flow_label; - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET6_GETPEERNAME, - NULL); + BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, + CGROUP_INET6_GETPEERNAME); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - BPF_CGROUP_INET6_GETSOCKNAME, - NULL); + BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, + CGROUP_INET6_GETSOCKNAME); } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); + release_sock(sk); return sizeof(*sin); } EXPORT_SYMBOL(inet6_getname); @@ -961,6 +963,9 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.fib_notify_on_flag_change = 0; atomic_set(&net->ipv6.fib6_sernum, 1); + net->ipv6.sysctl.ioam6_id = IOAM6_DEFAULT_ID; + net->ipv6.sysctl.ioam6_id_wide = IOAM6_DEFAULT_ID_WIDE; + err = ipv6_init_mibs(net); if (err) return err; @@ -1191,6 +1196,10 @@ static int __init inet6_init(void) if (err) goto rpl_fail; + err = ioam6_init(); + if (err) + goto ioam6_fail; + err = igmp6_late_init(); if (err) goto igmp6_late_err; @@ -1213,6 +1222,8 @@ sysctl_fail: igmp6_late_cleanup(); #endif igmp6_late_err: + ioam6_exit(); +ioam6_fail: rpl_exit(); rpl_fail: seg6_exit(); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 26882e165c9e..38ece3b7b839 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -49,22 +49,12 @@ #include <net/seg6_hmac.h> #endif #include <net/rpl.h> +#include <linux/ioam6.h> +#include <net/ioam6.h> +#include <net/dst_metadata.h> #include <linux/uaccess.h> -/* - * Parsing tlv encoded headers. - * - * Parsing function "func" returns true, if parsing succeed - * and false, if it failed. - * It MUST NOT touch skb->h. - */ - -struct tlvtype_proc { - int type; - bool (*func)(struct sk_buff *skb, int offset); -}; - /********************* Generic functions *********************/ @@ -109,16 +99,23 @@ drop: return false; } +static bool ipv6_hop_ra(struct sk_buff *skb, int optoff); +static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff); +static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff); +static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff); +#if IS_ENABLED(CONFIG_IPV6_MIP6) +static bool ipv6_dest_hao(struct sk_buff *skb, int optoff); +#endif + /* Parse tlv encoded option header (hop-by-hop or destination) */ -static bool ip6_parse_tlv(const struct tlvtype_proc *procs, +static bool ip6_parse_tlv(bool hopbyhop, struct sk_buff *skb, int max_count) { int len = (skb_transport_header(skb)[1] + 1) << 3; const unsigned char *nh = skb_network_header(skb); int off = skb_network_header_len(skb); - const struct tlvtype_proc *curr; bool disallow_unknowns = false; int tlv_count = 0; int padlen = 0; @@ -173,20 +170,45 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs, if (tlv_count > max_count) goto bad; - for (curr = procs; curr->type >= 0; curr++) { - if (curr->type == nh[off]) { - /* type specific length/alignment - checks will be performed in the - func(). */ - if (curr->func(skb, off) == false) + if (hopbyhop) { + switch (nh[off]) { + case IPV6_TLV_ROUTERALERT: + if (!ipv6_hop_ra(skb, off)) + return false; + break; + case IPV6_TLV_IOAM: + if (!ipv6_hop_ioam(skb, off)) + return false; + break; + case IPV6_TLV_JUMBO: + if (!ipv6_hop_jumbo(skb, off)) + return false; + break; + case IPV6_TLV_CALIPSO: + if (!ipv6_hop_calipso(skb, off)) + return false; + break; + default: + if (!ip6_tlvopt_unknown(skb, off, + disallow_unknowns)) + return false; + break; + } + } else { + switch (nh[off]) { +#if IS_ENABLED(CONFIG_IPV6_MIP6) + case IPV6_TLV_HAO: + if (!ipv6_dest_hao(skb, off)) + return false; + break; +#endif + default: + if (!ip6_tlvopt_unknown(skb, off, + disallow_unknowns)) return false; break; } } - if (curr->type < 0 && - !ip6_tlvopt_unknown(skb, off, disallow_unknowns)) - return false; - padlen = 0; } off += optlen; @@ -264,16 +286,6 @@ static bool ipv6_dest_hao(struct sk_buff *skb, int optoff) } #endif -static const struct tlvtype_proc tlvprocdestopt_lst[] = { -#if IS_ENABLED(CONFIG_IPV6_MIP6) - { - .type = IPV6_TLV_HAO, - .func = ipv6_dest_hao, - }, -#endif - {-1, NULL} -}; - static int ipv6_destopt_rcv(struct sk_buff *skb) { struct inet6_dev *idev = __in6_dev_get(skb->dev); @@ -304,8 +316,7 @@ fail_and_free: dstbuf = opt->dst1; #endif - if (ip6_parse_tlv(tlvprocdestopt_lst, skb, - net->ipv6.sysctl.max_dst_opts_cnt)) { + if (ip6_parse_tlv(false, skb, net->ipv6.sysctl.max_dst_opts_cnt)) { skb->transport_header += extlen; opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) @@ -928,6 +939,60 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff) return false; } +/* IOAM */ + +static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff) +{ + struct ioam6_trace_hdr *trace; + struct ioam6_namespace *ns; + struct ioam6_hdr *hdr; + + /* Bad alignment (must be 4n-aligned) */ + if (optoff & 3) + goto drop; + + /* Ignore if IOAM is not enabled on ingress */ + if (!__in6_dev_get(skb->dev)->cnf.ioam6_enabled) + goto ignore; + + /* Truncated Option header */ + hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff); + if (hdr->opt_len < 2) + goto drop; + + switch (hdr->type) { + case IOAM6_TYPE_PREALLOC: + /* Truncated Pre-allocated Trace header */ + if (hdr->opt_len < 2 + sizeof(*trace)) + goto drop; + + /* Malformed Pre-allocated Trace header */ + trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr)); + if (hdr->opt_len < 2 + sizeof(*trace) + trace->remlen * 4) + goto drop; + + /* Ignore if the IOAM namespace is unknown */ + ns = ioam6_namespace(ipv6_skb_net(skb), trace->namespace_id); + if (!ns) + goto ignore; + + if (!skb_valid_dst(skb)) + ip6_route_input(skb); + + ioam6_fill_trace_data(skb, ns, trace, true); + break; + default: + break; + } + +ignore: + return true; + +drop: + kfree_skb(skb); + return false; +} + /* Jumbo payload */ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff) @@ -994,22 +1059,6 @@ drop: return false; } -static const struct tlvtype_proc tlvprochopopt_lst[] = { - { - .type = IPV6_TLV_ROUTERALERT, - .func = ipv6_hop_ra, - }, - { - .type = IPV6_TLV_JUMBO, - .func = ipv6_hop_jumbo, - }, - { - .type = IPV6_TLV_CALIPSO, - .func = ipv6_hop_calipso, - }, - { -1, } -}; - int ipv6_parse_hopopts(struct sk_buff *skb) { struct inet6_skb_parm *opt = IP6CB(skb); @@ -1035,8 +1084,7 @@ fail_and_free: goto fail_and_free; opt->flags |= IP6SKB_HOPBYHOP; - if (ip6_parse_tlv(tlvprochopopt_lst, skb, - net->ipv6.sysctl.max_hbh_opts_cnt)) { + if (ip6_parse_tlv(true, skb, net->ipv6.sysctl.max_hbh_opts_cnt)) { skb->transport_header += extlen; opt = IP6CB(skb); opt->nhoff = sizeof(struct ipv6hdr); diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index a1ac0e3d8c60..47447f0241df 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -610,7 +610,11 @@ int ila_xlat_init_net(struct net *net) if (err) return err; - rhashtable_init(&ilan->xlat.rhash_table, &rht_params); + err = rhashtable_init(&ilan->xlat.rhash_table, &rht_params); + if (err) { + free_bucket_spinlocks(ilan->xlat.locks); + return err; + } return 0; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 55c290d55605..67c9114835c8 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -106,7 +106,7 @@ static inline int compute_score(struct sock *sk, struct net *net, if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return -1; - score = 1; + score = sk->sk_bound_dev_if ? 2 : 1; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; } diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c new file mode 100644 index 000000000000..122a3d47424c --- /dev/null +++ b/net/ipv6/ioam6.c @@ -0,0 +1,965 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * IPv6 IOAM implementation + * + * Author: + * Justin Iurman <justin.iurman@uliege.be> + */ + +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/net.h> +#include <linux/ioam6.h> +#include <linux/ioam6_genl.h> +#include <linux/rhashtable.h> + +#include <net/addrconf.h> +#include <net/genetlink.h> +#include <net/ioam6.h> + +static void ioam6_ns_release(struct ioam6_namespace *ns) +{ + kfree_rcu(ns, rcu); +} + +static void ioam6_sc_release(struct ioam6_schema *sc) +{ + kfree_rcu(sc, rcu); +} + +static void ioam6_free_ns(void *ptr, void *arg) +{ + struct ioam6_namespace *ns = (struct ioam6_namespace *)ptr; + + if (ns) + ioam6_ns_release(ns); +} + +static void ioam6_free_sc(void *ptr, void *arg) +{ + struct ioam6_schema *sc = (struct ioam6_schema *)ptr; + + if (sc) + ioam6_sc_release(sc); +} + +static int ioam6_ns_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct ioam6_namespace *ns = obj; + + return (ns->id != *(__be16 *)arg->key); +} + +static int ioam6_sc_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct ioam6_schema *sc = obj; + + return (sc->id != *(u32 *)arg->key); +} + +static const struct rhashtable_params rht_ns_params = { + .key_len = sizeof(__be16), + .key_offset = offsetof(struct ioam6_namespace, id), + .head_offset = offsetof(struct ioam6_namespace, head), + .automatic_shrinking = true, + .obj_cmpfn = ioam6_ns_cmpfn, +}; + +static const struct rhashtable_params rht_sc_params = { + .key_len = sizeof(u32), + .key_offset = offsetof(struct ioam6_schema, id), + .head_offset = offsetof(struct ioam6_schema, head), + .automatic_shrinking = true, + .obj_cmpfn = ioam6_sc_cmpfn, +}; + +static struct genl_family ioam6_genl_family; + +static const struct nla_policy ioam6_genl_policy_addns[] = { + [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 }, + [IOAM6_ATTR_NS_DATA] = { .type = NLA_U32 }, + [IOAM6_ATTR_NS_DATA_WIDE] = { .type = NLA_U64 }, +}; + +static const struct nla_policy ioam6_genl_policy_delns[] = { + [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 }, +}; + +static const struct nla_policy ioam6_genl_policy_addsc[] = { + [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 }, + [IOAM6_ATTR_SC_DATA] = { .type = NLA_BINARY, + .len = IOAM6_MAX_SCHEMA_DATA_LEN }, +}; + +static const struct nla_policy ioam6_genl_policy_delsc[] = { + [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 }, +}; + +static const struct nla_policy ioam6_genl_policy_ns_sc[] = { + [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 }, + [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 }, + [IOAM6_ATTR_SC_NONE] = { .type = NLA_FLAG }, +}; + +static int ioam6_genl_addns(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_pernet_data *nsdata; + struct ioam6_namespace *ns; + u64 data64; + u32 data32; + __be16 id; + int err; + + if (!info->attrs[IOAM6_ATTR_NS_ID]) + return -EINVAL; + + id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID])); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + ns = rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params); + if (ns) { + err = -EEXIST; + goto out_unlock; + } + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) { + err = -ENOMEM; + goto out_unlock; + } + + ns->id = id; + + if (!info->attrs[IOAM6_ATTR_NS_DATA]) + data32 = IOAM6_U32_UNAVAILABLE; + else + data32 = nla_get_u32(info->attrs[IOAM6_ATTR_NS_DATA]); + + if (!info->attrs[IOAM6_ATTR_NS_DATA_WIDE]) + data64 = IOAM6_U64_UNAVAILABLE; + else + data64 = nla_get_u64(info->attrs[IOAM6_ATTR_NS_DATA_WIDE]); + + ns->data = cpu_to_be32(data32); + ns->data_wide = cpu_to_be64(data64); + + err = rhashtable_lookup_insert_fast(&nsdata->namespaces, &ns->head, + rht_ns_params); + if (err) + kfree(ns); + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +} + +static int ioam6_genl_delns(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_pernet_data *nsdata; + struct ioam6_namespace *ns; + struct ioam6_schema *sc; + __be16 id; + int err; + + if (!info->attrs[IOAM6_ATTR_NS_ID]) + return -EINVAL; + + id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID])); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + ns = rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params); + if (!ns) { + err = -ENOENT; + goto out_unlock; + } + + sc = rcu_dereference_protected(ns->schema, + lockdep_is_held(&nsdata->lock)); + + err = rhashtable_remove_fast(&nsdata->namespaces, &ns->head, + rht_ns_params); + if (err) + goto out_unlock; + + if (sc) + rcu_assign_pointer(sc->ns, NULL); + + ioam6_ns_release(ns); + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +} + +static int __ioam6_genl_dumpns_element(struct ioam6_namespace *ns, + u32 portid, + u32 seq, + u32 flags, + struct sk_buff *skb, + u8 cmd) +{ + struct ioam6_schema *sc; + u64 data64; + u32 data32; + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd); + if (!hdr) + return -ENOMEM; + + data32 = be32_to_cpu(ns->data); + data64 = be64_to_cpu(ns->data_wide); + + if (nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id)) || + (data32 != IOAM6_U32_UNAVAILABLE && + nla_put_u32(skb, IOAM6_ATTR_NS_DATA, data32)) || + (data64 != IOAM6_U64_UNAVAILABLE && + nla_put_u64_64bit(skb, IOAM6_ATTR_NS_DATA_WIDE, + data64, IOAM6_ATTR_PAD))) + goto nla_put_failure; + + rcu_read_lock(); + + sc = rcu_dereference(ns->schema); + if (sc && nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id)) { + rcu_read_unlock(); + goto nla_put_failure; + } + + rcu_read_unlock(); + + genlmsg_end(skb, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ioam6_genl_dumpns_start(struct netlink_callback *cb) +{ + struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk)); + struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; + + if (!iter) { + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + cb->args[0] = (long)iter; + } + + rhashtable_walk_enter(&nsdata->namespaces, iter); + + return 0; +} + +static int ioam6_genl_dumpns_done(struct netlink_callback *cb) +{ + struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; + + rhashtable_walk_exit(iter); + kfree(iter); + + return 0; +} + +static int ioam6_genl_dumpns(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rhashtable_iter *iter; + struct ioam6_namespace *ns; + int err; + + iter = (struct rhashtable_iter *)cb->args[0]; + rhashtable_walk_start(iter); + + for (;;) { + ns = rhashtable_walk_next(iter); + + if (IS_ERR(ns)) { + if (PTR_ERR(ns) == -EAGAIN) + continue; + err = PTR_ERR(ns); + goto done; + } else if (!ns) { + break; + } + + err = __ioam6_genl_dumpns_element(ns, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + skb, + IOAM6_CMD_DUMP_NAMESPACES); + if (err) + goto done; + } + + err = skb->len; + +done: + rhashtable_walk_stop(iter); + return err; +} + +static int ioam6_genl_addsc(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_pernet_data *nsdata; + int len, len_aligned, err; + struct ioam6_schema *sc; + u32 id; + + if (!info->attrs[IOAM6_ATTR_SC_ID] || !info->attrs[IOAM6_ATTR_SC_DATA]) + return -EINVAL; + + id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + sc = rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params); + if (sc) { + err = -EEXIST; + goto out_unlock; + } + + len = nla_len(info->attrs[IOAM6_ATTR_SC_DATA]); + len_aligned = ALIGN(len, 4); + + sc = kzalloc(sizeof(*sc) + len_aligned, GFP_KERNEL); + if (!sc) { + err = -ENOMEM; + goto out_unlock; + } + + sc->id = id; + sc->len = len_aligned; + sc->hdr = cpu_to_be32(sc->id | ((u8)(sc->len / 4) << 24)); + nla_memcpy(sc->data, info->attrs[IOAM6_ATTR_SC_DATA], len); + + err = rhashtable_lookup_insert_fast(&nsdata->schemas, &sc->head, + rht_sc_params); + if (err) + goto free_sc; + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +free_sc: + kfree(sc); + goto out_unlock; +} + +static int ioam6_genl_delsc(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_pernet_data *nsdata; + struct ioam6_namespace *ns; + struct ioam6_schema *sc; + int err; + u32 id; + + if (!info->attrs[IOAM6_ATTR_SC_ID]) + return -EINVAL; + + id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + sc = rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params); + if (!sc) { + err = -ENOENT; + goto out_unlock; + } + + ns = rcu_dereference_protected(sc->ns, lockdep_is_held(&nsdata->lock)); + + err = rhashtable_remove_fast(&nsdata->schemas, &sc->head, + rht_sc_params); + if (err) + goto out_unlock; + + if (ns) + rcu_assign_pointer(ns->schema, NULL); + + ioam6_sc_release(sc); + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +} + +static int __ioam6_genl_dumpsc_element(struct ioam6_schema *sc, + u32 portid, u32 seq, u32 flags, + struct sk_buff *skb, u8 cmd) +{ + struct ioam6_namespace *ns; + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd); + if (!hdr) + return -ENOMEM; + + if (nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id) || + nla_put(skb, IOAM6_ATTR_SC_DATA, sc->len, sc->data)) + goto nla_put_failure; + + rcu_read_lock(); + + ns = rcu_dereference(sc->ns); + if (ns && nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id))) { + rcu_read_unlock(); + goto nla_put_failure; + } + + rcu_read_unlock(); + + genlmsg_end(skb, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ioam6_genl_dumpsc_start(struct netlink_callback *cb) +{ + struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk)); + struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; + + if (!iter) { + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + cb->args[0] = (long)iter; + } + + rhashtable_walk_enter(&nsdata->schemas, iter); + + return 0; +} + +static int ioam6_genl_dumpsc_done(struct netlink_callback *cb) +{ + struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0]; + + rhashtable_walk_exit(iter); + kfree(iter); + + return 0; +} + +static int ioam6_genl_dumpsc(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rhashtable_iter *iter; + struct ioam6_schema *sc; + int err; + + iter = (struct rhashtable_iter *)cb->args[0]; + rhashtable_walk_start(iter); + + for (;;) { + sc = rhashtable_walk_next(iter); + + if (IS_ERR(sc)) { + if (PTR_ERR(sc) == -EAGAIN) + continue; + err = PTR_ERR(sc); + goto done; + } else if (!sc) { + break; + } + + err = __ioam6_genl_dumpsc_element(sc, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI, + skb, + IOAM6_CMD_DUMP_SCHEMAS); + if (err) + goto done; + } + + err = skb->len; + +done: + rhashtable_walk_stop(iter); + return err; +} + +static int ioam6_genl_ns_set_schema(struct sk_buff *skb, struct genl_info *info) +{ + struct ioam6_namespace *ns, *ns_ref; + struct ioam6_schema *sc, *sc_ref; + struct ioam6_pernet_data *nsdata; + __be16 ns_id; + u32 sc_id; + int err; + + if (!info->attrs[IOAM6_ATTR_NS_ID] || + (!info->attrs[IOAM6_ATTR_SC_ID] && + !info->attrs[IOAM6_ATTR_SC_NONE])) + return -EINVAL; + + ns_id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID])); + nsdata = ioam6_pernet(genl_info_net(info)); + + mutex_lock(&nsdata->lock); + + ns = rhashtable_lookup_fast(&nsdata->namespaces, &ns_id, rht_ns_params); + if (!ns) { + err = -ENOENT; + goto out_unlock; + } + + if (info->attrs[IOAM6_ATTR_SC_NONE]) { + sc = NULL; + } else { + sc_id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]); + sc = rhashtable_lookup_fast(&nsdata->schemas, &sc_id, + rht_sc_params); + if (!sc) { + err = -ENOENT; + goto out_unlock; + } + } + + sc_ref = rcu_dereference_protected(ns->schema, + lockdep_is_held(&nsdata->lock)); + if (sc_ref) + rcu_assign_pointer(sc_ref->ns, NULL); + rcu_assign_pointer(ns->schema, sc); + + if (sc) { + ns_ref = rcu_dereference_protected(sc->ns, + lockdep_is_held(&nsdata->lock)); + if (ns_ref) + rcu_assign_pointer(ns_ref->schema, NULL); + rcu_assign_pointer(sc->ns, ns); + } + + err = 0; + +out_unlock: + mutex_unlock(&nsdata->lock); + return err; +} + +static const struct genl_ops ioam6_genl_ops[] = { + { + .cmd = IOAM6_CMD_ADD_NAMESPACE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_addns, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_addns, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_addns) - 1, + }, + { + .cmd = IOAM6_CMD_DEL_NAMESPACE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_delns, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_delns, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_delns) - 1, + }, + { + .cmd = IOAM6_CMD_DUMP_NAMESPACES, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .start = ioam6_genl_dumpns_start, + .dumpit = ioam6_genl_dumpns, + .done = ioam6_genl_dumpns_done, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = IOAM6_CMD_ADD_SCHEMA, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_addsc, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_addsc, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_addsc) - 1, + }, + { + .cmd = IOAM6_CMD_DEL_SCHEMA, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_delsc, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_delsc, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_delsc) - 1, + }, + { + .cmd = IOAM6_CMD_DUMP_SCHEMAS, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .start = ioam6_genl_dumpsc_start, + .dumpit = ioam6_genl_dumpsc, + .done = ioam6_genl_dumpsc_done, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = IOAM6_CMD_NS_SET_SCHEMA, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = ioam6_genl_ns_set_schema, + .flags = GENL_ADMIN_PERM, + .policy = ioam6_genl_policy_ns_sc, + .maxattr = ARRAY_SIZE(ioam6_genl_policy_ns_sc) - 1, + }, +}; + +static struct genl_family ioam6_genl_family __ro_after_init = { + .name = IOAM6_GENL_NAME, + .version = IOAM6_GENL_VERSION, + .netnsok = true, + .parallel_ops = true, + .ops = ioam6_genl_ops, + .n_ops = ARRAY_SIZE(ioam6_genl_ops), + .module = THIS_MODULE, +}; + +struct ioam6_namespace *ioam6_namespace(struct net *net, __be16 id) +{ + struct ioam6_pernet_data *nsdata = ioam6_pernet(net); + + return rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params); +} + +static void __ioam6_fill_trace_data(struct sk_buff *skb, + struct ioam6_namespace *ns, + struct ioam6_trace_hdr *trace, + struct ioam6_schema *sc, + u8 sclen, bool is_input) +{ + struct __kernel_sock_timeval ts; + u64 raw64; + u32 raw32; + u16 raw16; + u8 *data; + u8 byte; + + data = trace->data + trace->remlen * 4 - trace->nodelen * 4 - sclen * 4; + + /* hop_lim and node_id */ + if (trace->type.bit0) { + byte = ipv6_hdr(skb)->hop_limit; + if (is_input) + byte--; + + raw32 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id; + + *(__be32 *)data = cpu_to_be32((byte << 24) | raw32); + data += sizeof(__be32); + } + + /* ingress_if_id and egress_if_id */ + if (trace->type.bit1) { + if (!skb->dev) + raw16 = IOAM6_U16_UNAVAILABLE; + else + raw16 = (__force u16)__in6_dev_get(skb->dev)->cnf.ioam6_id; + + *(__be16 *)data = cpu_to_be16(raw16); + data += sizeof(__be16); + + if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) + raw16 = IOAM6_U16_UNAVAILABLE; + else + raw16 = (__force u16)__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id; + + *(__be16 *)data = cpu_to_be16(raw16); + data += sizeof(__be16); + } + + /* timestamp seconds */ + if (trace->type.bit2) { + if (!skb->dev) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + } else { + if (!skb->tstamp) + __net_timestamp(skb); + + skb_get_new_timestamp(skb, &ts); + *(__be32 *)data = cpu_to_be32((u32)ts.tv_sec); + } + data += sizeof(__be32); + } + + /* timestamp subseconds */ + if (trace->type.bit3) { + if (!skb->dev) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + } else { + if (!skb->tstamp) + __net_timestamp(skb); + + if (!trace->type.bit2) + skb_get_new_timestamp(skb, &ts); + + *(__be32 *)data = cpu_to_be32((u32)ts.tv_usec); + } + data += sizeof(__be32); + } + + /* transit delay */ + if (trace->type.bit4) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* namespace data */ + if (trace->type.bit5) { + *(__be32 *)data = ns->data; + data += sizeof(__be32); + } + + /* queue depth */ + if (trace->type.bit6) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* checksum complement */ + if (trace->type.bit7) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* hop_lim and node_id (wide) */ + if (trace->type.bit8) { + byte = ipv6_hdr(skb)->hop_limit; + if (is_input) + byte--; + + raw64 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id_wide; + + *(__be64 *)data = cpu_to_be64(((u64)byte << 56) | raw64); + data += sizeof(__be64); + } + + /* ingress_if_id and egress_if_id (wide) */ + if (trace->type.bit9) { + if (!skb->dev) + raw32 = IOAM6_U32_UNAVAILABLE; + else + raw32 = __in6_dev_get(skb->dev)->cnf.ioam6_id_wide; + + *(__be32 *)data = cpu_to_be32(raw32); + data += sizeof(__be32); + + if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) + raw32 = IOAM6_U32_UNAVAILABLE; + else + raw32 = __in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id_wide; + + *(__be32 *)data = cpu_to_be32(raw32); + data += sizeof(__be32); + } + + /* namespace data (wide) */ + if (trace->type.bit10) { + *(__be64 *)data = ns->data_wide; + data += sizeof(__be64); + } + + /* buffer occupancy */ + if (trace->type.bit11) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit12 undefined: filled with empty value */ + if (trace->type.bit12) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit13 undefined: filled with empty value */ + if (trace->type.bit13) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit14 undefined: filled with empty value */ + if (trace->type.bit14) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit15 undefined: filled with empty value */ + if (trace->type.bit15) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit16 undefined: filled with empty value */ + if (trace->type.bit16) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit17 undefined: filled with empty value */ + if (trace->type.bit17) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit18 undefined: filled with empty value */ + if (trace->type.bit18) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit19 undefined: filled with empty value */ + if (trace->type.bit19) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit20 undefined: filled with empty value */ + if (trace->type.bit20) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* bit21 undefined: filled with empty value */ + if (trace->type.bit21) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); + data += sizeof(__be32); + } + + /* opaque state snapshot */ + if (trace->type.bit22) { + if (!sc) { + *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE >> 8); + } else { + *(__be32 *)data = sc->hdr; + data += sizeof(__be32); + + memcpy(data, sc->data, sc->len); + } + } +} + +/* called with rcu_read_lock() */ +void ioam6_fill_trace_data(struct sk_buff *skb, + struct ioam6_namespace *ns, + struct ioam6_trace_hdr *trace, + bool is_input) +{ + struct ioam6_schema *sc; + u8 sclen = 0; + + /* Skip if Overflow flag is set + */ + if (trace->overflow) + return; + + /* NodeLen does not include Opaque State Snapshot length. We need to + * take it into account if the corresponding bit is set (bit 22) and + * if the current IOAM namespace has an active schema attached to it + */ + sc = rcu_dereference(ns->schema); + if (trace->type.bit22) { + sclen = sizeof_field(struct ioam6_schema, hdr) / 4; + + if (sc) + sclen += sc->len / 4; + } + + /* If there is no space remaining, we set the Overflow flag and we + * skip without filling the trace + */ + if (!trace->remlen || trace->remlen < trace->nodelen + sclen) { + trace->overflow = 1; + return; + } + + __ioam6_fill_trace_data(skb, ns, trace, sc, sclen, is_input); + trace->remlen -= trace->nodelen + sclen; +} + +static int __net_init ioam6_net_init(struct net *net) +{ + struct ioam6_pernet_data *nsdata; + int err = -ENOMEM; + + nsdata = kzalloc(sizeof(*nsdata), GFP_KERNEL); + if (!nsdata) + goto out; + + mutex_init(&nsdata->lock); + net->ipv6.ioam6_data = nsdata; + + err = rhashtable_init(&nsdata->namespaces, &rht_ns_params); + if (err) + goto free_nsdata; + + err = rhashtable_init(&nsdata->schemas, &rht_sc_params); + if (err) + goto free_rht_ns; + +out: + return err; +free_rht_ns: + rhashtable_destroy(&nsdata->namespaces); +free_nsdata: + kfree(nsdata); + net->ipv6.ioam6_data = NULL; + goto out; +} + +static void __net_exit ioam6_net_exit(struct net *net) +{ + struct ioam6_pernet_data *nsdata = ioam6_pernet(net); + + rhashtable_free_and_destroy(&nsdata->namespaces, ioam6_free_ns, NULL); + rhashtable_free_and_destroy(&nsdata->schemas, ioam6_free_sc, NULL); + + kfree(nsdata); +} + +static struct pernet_operations ioam6_net_ops = { + .init = ioam6_net_init, + .exit = ioam6_net_exit, +}; + +int __init ioam6_init(void) +{ + int err = register_pernet_subsys(&ioam6_net_ops); + if (err) + goto out; + + err = genl_register_family(&ioam6_genl_family); + if (err) + goto out_unregister_pernet_subsys; + +#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL + err = ioam6_iptunnel_init(); + if (err) + goto out_unregister_genl; +#endif + + pr_info("In-situ OAM (IOAM) with IPv6\n"); + +out: + return err; +#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL +out_unregister_genl: + genl_unregister_family(&ioam6_genl_family); +#endif +out_unregister_pernet_subsys: + unregister_pernet_subsys(&ioam6_net_ops); + goto out; +} + +void ioam6_exit(void) +{ +#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL + ioam6_iptunnel_exit(); +#endif + genl_unregister_family(&ioam6_genl_family); + unregister_pernet_subsys(&ioam6_net_ops); +} diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c new file mode 100644 index 000000000000..f90a87389fcc --- /dev/null +++ b/net/ipv6/ioam6_iptunnel.c @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * IPv6 IOAM Lightweight Tunnel implementation + * + * Author: + * Justin Iurman <justin.iurman@uliege.be> + */ + +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/net.h> +#include <linux/in6.h> +#include <linux/ioam6.h> +#include <linux/ioam6_iptunnel.h> +#include <net/dst.h> +#include <net/sock.h> +#include <net/lwtunnel.h> +#include <net/ioam6.h> +#include <net/netlink.h> +#include <net/ipv6.h> +#include <net/dst_cache.h> +#include <net/ip6_route.h> +#include <net/addrconf.h> + +#define IOAM6_MASK_SHORT_FIELDS 0xff100000 +#define IOAM6_MASK_WIDE_FIELDS 0xe00000 + +struct ioam6_lwt_encap { + struct ipv6_hopopt_hdr eh; + u8 pad[2]; /* 2-octet padding for 4n-alignment */ + struct ioam6_hdr ioamh; + struct ioam6_trace_hdr traceh; +} __packed; + +struct ioam6_lwt { + struct dst_cache cache; + u8 mode; + struct in6_addr tundst; + struct ioam6_lwt_encap tuninfo; +}; + +static struct ioam6_lwt *ioam6_lwt_state(struct lwtunnel_state *lwt) +{ + return (struct ioam6_lwt *)lwt->data; +} + +static struct ioam6_lwt_encap *ioam6_lwt_info(struct lwtunnel_state *lwt) +{ + return &ioam6_lwt_state(lwt)->tuninfo; +} + +static struct ioam6_trace_hdr *ioam6_lwt_trace(struct lwtunnel_state *lwt) +{ + return &(ioam6_lwt_state(lwt)->tuninfo.traceh); +} + +static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = { + [IOAM6_IPTUNNEL_MODE] = NLA_POLICY_RANGE(NLA_U8, + IOAM6_IPTUNNEL_MODE_MIN, + IOAM6_IPTUNNEL_MODE_MAX), + [IOAM6_IPTUNNEL_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), + [IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN(sizeof(struct ioam6_trace_hdr)), +}; + +static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace) +{ + u32 fields; + + if (!trace->type_be32 || !trace->remlen || + trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4 || + trace->type.bit12 | trace->type.bit13 | trace->type.bit14 | + trace->type.bit15 | trace->type.bit16 | trace->type.bit17 | + trace->type.bit18 | trace->type.bit19 | trace->type.bit20 | + trace->type.bit21) + return false; + + trace->nodelen = 0; + fields = be32_to_cpu(trace->type_be32); + + trace->nodelen += hweight32(fields & IOAM6_MASK_SHORT_FIELDS) + * (sizeof(__be32) / 4); + trace->nodelen += hweight32(fields & IOAM6_MASK_WIDE_FIELDS) + * (sizeof(__be64) / 4); + + return true; +} + +static int ioam6_build_state(struct net *net, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IOAM6_IPTUNNEL_MAX + 1]; + struct ioam6_lwt_encap *tuninfo; + struct ioam6_trace_hdr *trace; + struct lwtunnel_state *lwt; + struct ioam6_lwt *ilwt; + int len_aligned, err; + u8 mode; + + if (family != AF_INET6) + return -EINVAL; + + err = nla_parse_nested(tb, IOAM6_IPTUNNEL_MAX, nla, + ioam6_iptunnel_policy, extack); + if (err < 0) + return err; + + if (!tb[IOAM6_IPTUNNEL_MODE]) + mode = IOAM6_IPTUNNEL_MODE_INLINE; + else + mode = nla_get_u8(tb[IOAM6_IPTUNNEL_MODE]); + + if (!tb[IOAM6_IPTUNNEL_DST] && mode != IOAM6_IPTUNNEL_MODE_INLINE) { + NL_SET_ERR_MSG(extack, "this mode needs a tunnel destination"); + return -EINVAL; + } + + if (!tb[IOAM6_IPTUNNEL_TRACE]) { + NL_SET_ERR_MSG(extack, "missing trace"); + return -EINVAL; + } + + trace = nla_data(tb[IOAM6_IPTUNNEL_TRACE]); + if (!ioam6_validate_trace_hdr(trace)) { + NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_TRACE], + "invalid trace validation"); + return -EINVAL; + } + + len_aligned = ALIGN(trace->remlen * 4, 8); + lwt = lwtunnel_state_alloc(sizeof(*ilwt) + len_aligned); + if (!lwt) + return -ENOMEM; + + ilwt = ioam6_lwt_state(lwt); + err = dst_cache_init(&ilwt->cache, GFP_ATOMIC); + if (err) { + kfree(lwt); + return err; + } + + ilwt->mode = mode; + if (tb[IOAM6_IPTUNNEL_DST]) + ilwt->tundst = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_DST]); + + tuninfo = ioam6_lwt_info(lwt); + tuninfo->eh.hdrlen = ((sizeof(*tuninfo) + len_aligned) >> 3) - 1; + tuninfo->pad[0] = IPV6_TLV_PADN; + tuninfo->ioamh.type = IOAM6_TYPE_PREALLOC; + tuninfo->ioamh.opt_type = IPV6_TLV_IOAM; + tuninfo->ioamh.opt_len = sizeof(tuninfo->ioamh) - 2 + sizeof(*trace) + + trace->remlen * 4; + + memcpy(&tuninfo->traceh, trace, sizeof(*trace)); + + if (len_aligned - trace->remlen * 4) { + tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PADN; + tuninfo->traceh.data[trace->remlen * 4 + 1] = 2; + } + + lwt->type = LWTUNNEL_ENCAP_IOAM6; + lwt->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + + *ts = lwt; + + return 0; +} + +static int ioam6_do_fill(struct net *net, struct sk_buff *skb) +{ + struct ioam6_trace_hdr *trace; + struct ioam6_namespace *ns; + + trace = (struct ioam6_trace_hdr *)(skb_transport_header(skb) + + sizeof(struct ipv6_hopopt_hdr) + 2 + + sizeof(struct ioam6_hdr)); + + ns = ioam6_namespace(net, trace->namespace_id); + if (ns) + ioam6_fill_trace_data(skb, ns, trace, false); + + return 0; +} + +static int ioam6_do_inline(struct net *net, struct sk_buff *skb, + struct ioam6_lwt_encap *tuninfo) +{ + struct ipv6hdr *oldhdr, *hdr; + int hdrlen, err; + + hdrlen = (tuninfo->eh.hdrlen + 1) << 3; + + err = skb_cow_head(skb, hdrlen + skb->mac_len); + if (unlikely(err)) + return err; + + oldhdr = ipv6_hdr(skb); + skb_pull(skb, sizeof(*oldhdr)); + skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(*oldhdr)); + + skb_push(skb, sizeof(*oldhdr) + hdrlen); + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + + hdr = ipv6_hdr(skb); + memmove(hdr, oldhdr, sizeof(*oldhdr)); + tuninfo->eh.nexthdr = hdr->nexthdr; + + skb_set_transport_header(skb, sizeof(*hdr)); + skb_postpush_rcsum(skb, hdr, sizeof(*hdr) + hdrlen); + + memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen); + + hdr->nexthdr = NEXTHDR_HOP; + hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr)); + + return ioam6_do_fill(net, skb); +} + +static int ioam6_do_encap(struct net *net, struct sk_buff *skb, + struct ioam6_lwt_encap *tuninfo, + struct in6_addr *tundst) +{ + struct dst_entry *dst = skb_dst(skb); + struct ipv6hdr *hdr, *inner_hdr; + int hdrlen, len, err; + + hdrlen = (tuninfo->eh.hdrlen + 1) << 3; + len = sizeof(*hdr) + hdrlen; + + err = skb_cow_head(skb, len + skb->mac_len); + if (unlikely(err)) + return err; + + inner_hdr = ipv6_hdr(skb); + + skb_push(skb, len); + skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + skb_set_transport_header(skb, sizeof(*hdr)); + + tuninfo->eh.nexthdr = NEXTHDR_IPV6; + memcpy(skb_transport_header(skb), (u8 *)tuninfo, hdrlen); + + hdr = ipv6_hdr(skb); + memcpy(hdr, inner_hdr, sizeof(*hdr)); + + hdr->nexthdr = NEXTHDR_HOP; + hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr)); + hdr->daddr = *tundst; + ipv6_dev_get_saddr(net, dst->dev, &hdr->daddr, + IPV6_PREFER_SRC_PUBLIC, &hdr->saddr); + + skb_postpush_rcsum(skb, hdr, len); + + return ioam6_do_fill(net, skb); +} + +static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct in6_addr orig_daddr; + struct ioam6_lwt *ilwt; + int err = -EINVAL; + + if (skb->protocol != htons(ETH_P_IPV6)) + goto drop; + + ilwt = ioam6_lwt_state(dst->lwtstate); + orig_daddr = ipv6_hdr(skb)->daddr; + + switch (ilwt->mode) { + case IOAM6_IPTUNNEL_MODE_INLINE: +do_inline: + /* Direct insertion - if there is no Hop-by-Hop yet */ + if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) + goto out; + + err = ioam6_do_inline(net, skb, &ilwt->tuninfo); + if (unlikely(err)) + goto drop; + + break; + case IOAM6_IPTUNNEL_MODE_ENCAP: +do_encap: + /* Encapsulation (ip6ip6) */ + err = ioam6_do_encap(net, skb, &ilwt->tuninfo, &ilwt->tundst); + if (unlikely(err)) + goto drop; + + break; + case IOAM6_IPTUNNEL_MODE_AUTO: + /* Automatic (RFC8200 compliant): + * - local packets -> INLINE mode + * - in-transit packets -> ENCAP mode + */ + if (!skb->dev) + goto do_inline; + + goto do_encap; + default: + goto drop; + } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; + + if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) { + preempt_disable(); + dst = dst_cache_get(&ilwt->cache); + preempt_enable(); + + if (unlikely(!dst)) { + struct ipv6hdr *hdr = ipv6_hdr(skb); + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + fl6.daddr = hdr->daddr; + fl6.saddr = hdr->saddr; + fl6.flowlabel = ip6_flowinfo(hdr); + fl6.flowi6_mark = skb->mark; + fl6.flowi6_proto = hdr->nexthdr; + + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + err = dst->error; + dst_release(dst); + goto drop; + } + + preempt_disable(); + dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); + preempt_enable(); + } + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + + return dst_output(net, sk, skb); + } +out: + return dst->lwtstate->orig_output(net, sk, skb); +drop: + kfree_skb(skb); + return err; +} + +static void ioam6_destroy_state(struct lwtunnel_state *lwt) +{ + dst_cache_destroy(&ioam6_lwt_state(lwt)->cache); +} + +static int ioam6_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate); + int err; + + err = nla_put_u8(skb, IOAM6_IPTUNNEL_MODE, ilwt->mode); + if (err) + goto ret; + + if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) { + err = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_DST, &ilwt->tundst); + if (err) + goto ret; + } + + err = nla_put(skb, IOAM6_IPTUNNEL_TRACE, sizeof(ilwt->tuninfo.traceh), + &ilwt->tuninfo.traceh); +ret: + return err; +} + +static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate); + int nlsize; + + nlsize = nla_total_size(sizeof(ilwt->mode)) + + nla_total_size(sizeof(ilwt->tuninfo.traceh)); + + if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) + nlsize += nla_total_size(sizeof(ilwt->tundst)); + + return nlsize; +} + +static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct ioam6_trace_hdr *trace_a = ioam6_lwt_trace(a); + struct ioam6_trace_hdr *trace_b = ioam6_lwt_trace(b); + struct ioam6_lwt *ilwt_a = ioam6_lwt_state(a); + struct ioam6_lwt *ilwt_b = ioam6_lwt_state(b); + + return (ilwt_a->mode != ilwt_b->mode || + (ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE && + !ipv6_addr_equal(&ilwt_a->tundst, &ilwt_b->tundst)) || + trace_a->namespace_id != trace_b->namespace_id); +} + +static const struct lwtunnel_encap_ops ioam6_iptun_ops = { + .build_state = ioam6_build_state, + .destroy_state = ioam6_destroy_state, + .output = ioam6_output, + .fill_encap = ioam6_fill_encap_info, + .get_encap_size = ioam6_encap_nlsize, + .cmp_encap = ioam6_encap_cmp, + .owner = THIS_MODULE, +}; + +int __init ioam6_iptunnel_init(void) +{ + return lwtunnel_encap_add_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6); +} + +void ioam6_iptunnel_exit(void) +{ + lwtunnel_encap_del_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6); +} diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index ef75c9b05f17..0371d2c14145 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1378,7 +1378,6 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, int err = -ENOMEM; int allow_create = 1; int replace_required = 0; - int sernum = fib6_new_sernum(info->nl_net); if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) @@ -1478,7 +1477,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, if (!err) { if (rt->nh) list_add(&rt->nh_list, &rt->nh->f6i_list); - __fib6_update_sernum_upto_root(rt, sernum); + __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); fib6_start_gc(info->nl_net, rt); } @@ -2449,8 +2448,8 @@ int __init fib6_init(void) int ret = -ENOMEM; fib6_node_kmem = kmem_cache_create("fib6_nodes", - sizeof(struct fib6_node), - 0, SLAB_HWCACHE_ALIGN, + sizeof(struct fib6_node), 0, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!fib6_node_kmem) goto out; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 7a5e90e09363..d831d2439693 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -629,8 +629,6 @@ drop: static int gre_handle_offloads(struct sk_buff *skb, bool csum) { - if (csum && skb_checksum_start(skb) < skb->data) - return -EINVAL; return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } @@ -1090,7 +1088,7 @@ static void ip6gre_tnl_link_config_common(struct ip6_tnl *t) struct flowi6 *fl6 = &t->fl.u.ip6; if (dev->type != ARPHRD_ETHER) { - memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); + __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); } @@ -1246,8 +1244,9 @@ static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u, memcpy(u->name, p->name, sizeof(u->name)); } -static int ip6gre_tunnel_ioctl(struct net_device *dev, - struct ifreq *ifr, int cmd) +static int ip6gre_tunnel_siocdevprivate(struct net_device *dev, + struct ifreq *ifr, void __user *data, + int cmd) { int err = 0; struct ip6_tnl_parm2 p; @@ -1261,7 +1260,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, switch (cmd) { case SIOCGETTUNNEL: if (dev == ign->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } @@ -1272,7 +1271,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, } memset(&p, 0, sizeof(p)); ip6gre_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; @@ -1283,7 +1282,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, goto done; err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) goto done; err = -EINVAL; @@ -1320,7 +1319,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, memset(&p, 0, sizeof(p)); ip6gre_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); @@ -1333,7 +1332,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev, if (dev == ign->fb_tunnel_dev) { err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) goto done; err = -ENOENT; ip6gre_tnl_parm_from_user(&p1, &p); @@ -1400,7 +1399,7 @@ static const struct net_device_ops ip6gre_netdev_ops = { .ndo_init = ip6gre_tunnel_init, .ndo_uninit = ip6gre_tunnel_uninit, .ndo_start_xmit = ip6gre_tunnel_xmit, - .ndo_do_ioctl = ip6gre_tunnel_ioctl, + .ndo_siocdevprivate = ip6gre_tunnel_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, @@ -1522,7 +1521,7 @@ static int ip6gre_tunnel_init(struct net_device *dev) if (tunnel->parms.collect_md) return 0; - memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr)); + __dev_addr_set(dev, &tunnel->parms.laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr)); if (ipv6_addr_any(&tunnel->parms.raddr)) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8e6ca9ad6812..2f044a49afa8 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -60,46 +60,29 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * { struct dst_entry *dst = skb_dst(skb); struct net_device *dev = dst->dev; + struct inet6_dev *idev = ip6_dst_idev(dst); unsigned int hh_len = LL_RESERVED_SPACE(dev); - int delta = hh_len - skb_headroom(skb); - const struct in6_addr *nexthop; + const struct in6_addr *daddr, *nexthop; + struct ipv6hdr *hdr; struct neighbour *neigh; int ret; /* Be paranoid, rather than too clever. */ - if (unlikely(delta > 0) && dev->header_ops) { - /* pskb_expand_head() might crash, if skb is shared */ - if (skb_shared(skb)) { - struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); - - if (likely(nskb)) { - if (skb->sk) - skb_set_owner_w(nskb, skb->sk); - consume_skb(skb); - } else { - kfree_skb(skb); - } - skb = nskb; - } - if (skb && - pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) { - kfree_skb(skb); - skb = NULL; - } + if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) { + skb = skb_expand_head(skb, hh_len); if (!skb) { - IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); return -ENOMEM; } } - if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { - struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); - + hdr = ipv6_hdr(skb); + daddr = &hdr->daddr; + if (ipv6_addr_is_multicast(daddr)) { if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) && ((mroute6_is_socket(net, skb) && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || - ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, - &ipv6_hdr(skb)->saddr))) { + ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); /* Do not check for IFF_ALLMULTI; multicast routing @@ -110,7 +93,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * net, sk, newskb, NULL, newskb->dev, dev_loopback_xmit); - if (ipv6_hdr(skb)->hop_limit == 0) { + if (hdr->hop_limit == 0) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); kfree_skb(skb); @@ -119,9 +102,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * } IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len); - - if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <= - IPV6_ADDR_SCOPE_NODELOCAL && + if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL && !(dev->flags & IFF_LOOPBACK)) { kfree_skb(skb); return 0; @@ -136,10 +117,10 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * } rcu_read_lock_bh(); - nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); - neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); + nexthop = rt6_nexthop((struct rt6_info *)dst, daddr); + neigh = __ipv6_neigh_lookup_noref(dev, nexthop); if (unlikely(!neigh)) - neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); + neigh = __neigh_create(&nd_tbl, nexthop, dev, false); if (!IS_ERR(neigh)) { sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); @@ -148,7 +129,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * } rcu_read_unlock_bh(); - IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EINVAL; } @@ -268,6 +249,8 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); + struct net_device *dev = dst->dev; + struct inet6_dev *idev = ip6_dst_idev(dst); unsigned int head_room; struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; @@ -275,22 +258,16 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, int hlimit = -1; u32 mtu; - head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); + head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev); if (opt) head_room += opt->opt_nflen + opt->opt_flen; - if (unlikely(skb_headroom(skb) < head_room)) { - struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); - if (!skb2) { - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_OUTDISCARDS); - kfree_skb(skb); + if (unlikely(head_room > skb_headroom(skb))) { + skb = skb_expand_head(skb, head_room); + if (!skb) { + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); return -ENOBUFS; } - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - consume_skb(skb); - skb = skb2; } if (opt) { @@ -332,8 +309,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, mtu = dst_mtu(dst); if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { - IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_OUT, skb->len); + IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -346,17 +322,17 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, * we promote our socket to non const */ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, - net, (struct sock *)sk, skb, NULL, dst->dev, + net, (struct sock *)sk, skb, NULL, dev, dst_output); } - skb->dev = dst->dev; + skb->dev = dev; /* ipv6_local_error() does not require socket lock, * we promote our socket to non const */ ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu); - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); return -EMSGSIZE; } @@ -488,13 +464,14 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) int ip6_forward(struct sk_buff *skb) { - struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(dst->dev); + struct inet6_dev *idev; u32 mtu; + idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); if (net->ipv6.devconf_all->forwarding == 0) goto error; @@ -608,7 +585,7 @@ int ip6_forward(struct sk_buff *skb) } } - mtu = ip6_dst_mtu_forward(dst); + mtu = ip6_dst_mtu_maybe_forward(dst, true); if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 322698d9fcf4..484aca492cc0 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1449,7 +1449,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) unsigned int mtu; int t_hlen; - memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); + __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); /* Set up flowi template */ @@ -1581,9 +1581,10 @@ ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) } /** - * ip6_tnl_ioctl - configure ipv6 tunnels from userspace + * ip6_tnl_siocdevprivate - configure ipv6 tunnels from userspace * @dev: virtual device associated with tunnel - * @ifr: parameters passed from userspace + * @ifr: unused + * @data: parameters passed from userspace * @cmd: command to be performed * * Description: @@ -1609,7 +1610,8 @@ ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) **/ static int -ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr, + void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm p; @@ -1623,7 +1625,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } @@ -1635,9 +1637,8 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) memset(&p, 0, sizeof(p)); } ip6_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) { + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; - } break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: @@ -1645,7 +1646,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && @@ -1669,7 +1670,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!IS_ERR(t)) { err = 0; ip6_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else { @@ -1683,7 +1684,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); @@ -1802,7 +1803,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, .ndo_start_xmit = ip6_tnl_start_xmit, - .ndo_do_ioctl = ip6_tnl_ioctl, + .ndo_siocdevprivate = ip6_tnl_siocdevprivate, .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 2d048e21abbb..527e9ead7449 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -660,7 +660,7 @@ static void vti6_link_config(struct ip6_tnl *t, bool keep_mtu) struct net_device *tdev = NULL; int mtu; - memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); + __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); p->flags &= ~(IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV | @@ -771,13 +771,14 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p) } /** - * vti6_ioctl - configure vti6 tunnels from userspace + * vti6_siocdevprivate - configure vti6 tunnels from userspace * @dev: virtual device associated with tunnel - * @ifr: parameters passed from userspace + * @ifr: unused + * @data: parameters passed from userspace * @cmd: command to be performed * * Description: - * vti6_ioctl() is used for managing vti6 tunnels + * vti6_siocdevprivate() is used for managing vti6 tunnels * from userspace. * * The possible commands are the following: @@ -798,7 +799,7 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p) * %-ENODEV if attempting to change or delete a nonexisting device **/ static int -vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +vti6_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm2 p; @@ -810,7 +811,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } @@ -822,7 +823,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!t) t = netdev_priv(dev); vti6_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: @@ -831,7 +832,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != 0) @@ -852,7 +853,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (t) { err = 0; vti6_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else @@ -865,7 +866,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; vti6_parm_from_user(&p1, &p); @@ -890,7 +891,7 @@ static const struct net_device_ops vti6_netdev_ops = { .ndo_init = vti6_dev_init, .ndo_uninit = vti6_dev_uninit, .ndo_start_xmit = vti6_tnl_xmit, - .ndo_do_ioctl = vti6_ioctl, + .ndo_siocdevprivate = vti6_siocdevprivate, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, }; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 06b0d2c329b9..36ed9efb8825 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -559,8 +559,7 @@ static int pim6_rcv(struct sk_buff *skb) read_lock(&mrt_lock); if (reg_vif_num >= 0) reg_dev = mrt->vif_table[reg_vif_num].dev; - if (reg_dev) - dev_hold(reg_dev); + dev_hold(reg_dev); read_unlock(&mrt_lock); if (!reg_dev) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a6804a7e34c1..41efca817db4 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -55,6 +55,8 @@ struct ip6_ra_chain *ip6_ra_chain; DEFINE_RWLOCK(ip6_ra_lock); +DEFINE_STATIC_KEY_FALSE(ip6_min_hopcount); + int ip6_ra_control(struct sock *sk, int sel) { struct ip6_ra_chain *ra, *new_ra, **rap; @@ -225,7 +227,7 @@ static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) goto out_free_gsf; - ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist); + ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist_flex); out_free_gsf: kfree(gsf); return ret; @@ -234,7 +236,7 @@ out_free_gsf: static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen) { - const int size0 = offsetof(struct compat_group_filter, gf_slist); + const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter *gf32; void *p; int ret; @@ -249,7 +251,7 @@ static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, if (!p) return -ENOMEM; - gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ + gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */ ret = -EFAULT; if (copy_from_sockptr(gf32, optval, optlen)) goto out_free_p; @@ -261,14 +263,14 @@ static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval, goto out_free_p; ret = -EINVAL; - if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) + if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen) goto out_free_p; ret = ip6_mc_msfilter(sk, &(struct group_filter){ .gf_interface = gf32->gf_interface, .gf_group = gf32->gf_group, .gf_fmode = gf32->gf_fmode, - .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist); + .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist_flex); out_free_p: kfree(p); @@ -950,7 +952,14 @@ done: goto e_inval; if (val < 0 || val > 255) goto e_inval; - np->min_hopcount = val; + + if (val) + static_branch_enable(&ip6_min_hopcount); + + /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount + * while we are changing it. + */ + WRITE_ONCE(np->min_hopcount, val); retv = 0; break; case IPV6_DONTFRAG: @@ -1048,7 +1057,7 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, static int ipv6_get_msfilter(struct sock *sk, void __user *optval, int __user *optlen, int len) { - const int size0 = offsetof(struct group_filter, gf_slist); + const int size0 = offsetof(struct group_filter, gf_slist_flex); struct group_filter __user *p = optval; struct group_filter gsf; int num; @@ -1062,7 +1071,7 @@ static int ipv6_get_msfilter(struct sock *sk, void __user *optval, return -EADDRNOTAVAIL; num = gsf.gf_numsrc; lock_sock(sk); - err = ip6_mc_msfget(sk, &gsf, p->gf_slist); + err = ip6_mc_msfget(sk, &gsf, p->gf_slist_flex); if (!err) { if (num > gsf.gf_numsrc) num = gsf.gf_numsrc; @@ -1077,7 +1086,7 @@ static int ipv6_get_msfilter(struct sock *sk, void __user *optval, static int compat_ipv6_get_msfilter(struct sock *sk, void __user *optval, int __user *optlen) { - const int size0 = offsetof(struct compat_group_filter, gf_slist); + const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); struct compat_group_filter __user *p = optval; struct compat_group_filter gf32; struct group_filter gf; @@ -1100,7 +1109,7 @@ static int compat_ipv6_get_msfilter(struct sock *sk, void __user *optval, return -EADDRNOTAVAIL; lock_sock(sk); - err = ip6_mc_msfget(sk, &gf, p->gf_slist); + err = ip6_mc_msfget(sk, &gf, p->gf_slist_flex); release_sock(sk); if (err) return err; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 54ec163fbafa..bed8155508c8 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -447,7 +447,8 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (psl) count += psl->sl_max; - newpsl = sock_kmalloc(sk, IP6_SFLSIZE(count), GFP_KERNEL); + newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count), + GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; @@ -457,7 +458,8 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (psl) { for (i = 0; i < psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; - atomic_sub(IP6_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + atomic_sub(struct_size(psl, sl_addr, psl->sl_max), + &sk->sk_omem_alloc); kfree_rcu(psl, rcu); } psl = newpsl; @@ -525,8 +527,9 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, goto done; } if (gsf->gf_numsrc) { - newpsl = sock_kmalloc(sk, IP6_SFLSIZE(gsf->gf_numsrc), - GFP_KERNEL); + newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, + gsf->gf_numsrc), + GFP_KERNEL); if (!newpsl) { err = -ENOBUFS; goto done; @@ -543,7 +546,8 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, newpsl->sl_count, newpsl->sl_addr, 0); if (err) { mutex_unlock(&idev->mc_lock); - sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max)); + sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr, + newpsl->sl_max)); goto done; } mutex_unlock(&idev->mc_lock); @@ -559,7 +563,8 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, if (psl) { ip6_mc_del_src(idev, group, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); - atomic_sub(IP6_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + atomic_sub(struct_size(psl, sl_addr, psl->sl_max), + &sk->sk_omem_alloc); kfree_rcu(psl, rcu); } else { ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); @@ -1351,8 +1356,8 @@ static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, return 0; } -static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, - unsigned long *max_delay) +static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, + unsigned long *max_delay) { *max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL); @@ -1362,7 +1367,7 @@ static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, idev->mc_maxdelay = *max_delay; - return 0; + return; } /* called with rcu_read_lock() */ @@ -1449,9 +1454,7 @@ static void __mld_query_work(struct sk_buff *skb) mlh2 = (struct mld2_query *)skb_transport_header(skb); - err = mld_process_v2(idev, mlh2, &max_delay); - if (err < 0) - goto out; + mld_process_v2(idev, mlh2, &max_delay); if (group_type == IPV6_ADDR_ANY) { /* general query */ if (mlh2->mld2q_nsrcs) @@ -2607,7 +2610,8 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, psl->sl_count, psl->sl_addr, 0); RCU_INIT_POINTER(iml->sflist, NULL); - atomic_sub(IP6_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + atomic_sub(struct_size(psl, sl_addr, psl->sl_max), + &sk->sk_omem_alloc); kfree_rcu(psl, rcu); } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index c467c6419893..f03b597e4121 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -142,7 +142,7 @@ struct neigh_table nd_tbl = { }; EXPORT_SYMBOL_GPL(nd_tbl); -void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, +void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data, int data_len, int pad) { int space = __ndisc_opt_addr_space(data_len, pad); @@ -165,7 +165,7 @@ void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data, EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option); static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type, - void *data, u8 icmp6_type) + const void *data, u8 icmp6_type) { __ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len, ndisc_addr_option_pad(skb->dev->type)); @@ -1391,12 +1391,6 @@ skip_defrtr: } } - /* - * Send a notify if RA changed managed/otherconf flags or timer settings - */ - if (send_ifinfo_notify) - inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); - skip_linkparms: /* @@ -1496,6 +1490,11 @@ skip_routeinfo: memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu)); mtu = ntohl(n); + if (in6_dev->ra_mtu != mtu) { + in6_dev->ra_mtu = mtu; + send_ifinfo_notify = true; + } + if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); } else if (in6_dev->cnf.mtu6 != mtu) { @@ -1519,6 +1518,12 @@ skip_routeinfo: ND_PRINTK(2, warn, "RA: invalid RA options\n"); } out: + /* Send a notify if RA changed managed/otherconf flags or + * timer settings or ra_mtu value + */ + if (send_ifinfo_notify) + inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + fib6_info_release(rt); if (neigh) neigh_release(neigh); @@ -1789,6 +1794,7 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, struct netdev_notifier_change_info *change_info; struct net *net = dev_net(dev); struct inet6_dev *idev; + bool evict_nocarrier; switch (event) { case NETDEV_CHANGEADDR: @@ -1805,10 +1811,19 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, in6_dev_put(idev); break; case NETDEV_CHANGE: + idev = in6_dev_get(dev); + if (!idev) + evict_nocarrier = true; + else { + evict_nocarrier = idev->cnf.ndisc_evict_nocarrier && + net->ipv6.devconf_all->ndisc_evict_nocarrier; + in6_dev_put(idev); + } + change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&nd_tbl, dev); - if (!netif_carrier_ok(dev)) + if (evict_nocarrier && !netif_carrier_ok(dev)) neigh_carrier_down(&nd_tbl, dev); break; case NETDEV_DOWN: diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index de2cf3943b91..2d816277f2c5 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -247,10 +247,10 @@ ip6t_next_entry(const struct ip6t_entry *entry) /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int -ip6t_do_table(struct sk_buff *skb, - const struct nf_hook_state *state, - struct xt_table *table) +ip6t_do_table(void *priv, struct sk_buff *skb, + const struct nf_hook_state *state) { + const struct xt_table *table = priv; unsigned int hook = state->hook; static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); /* Initializing verdict to NF_DROP keeps gcc happy. */ @@ -273,6 +273,7 @@ ip6t_do_table(struct sk_buff *skb, * things we don't know, ie. tcp syn flag or ports). If the * rule is also a fragment-specific rule, non-fragments won't * match it. */ + acpar.fragoff = 0; acpar.hotdrop = false; acpar.state = state; diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c index 733c83d38b30..4ad8b2032f1f 100644 --- a/net/ipv6/netfilter/ip6t_rt.c +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -25,12 +25,7 @@ MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); static inline bool segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) { - bool r; - pr_debug("segsleft_match:%c 0x%x <= 0x%x <= 0x%x\n", - invert ? '!' : ' ', min, id, max); - r = (id >= min && id <= max) ^ invert; - pr_debug(" result %s\n", r ? "PASS" : "FAILED"); - return r; + return (id >= min && id <= max) ^ invert; } static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) @@ -65,30 +60,6 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) return false; } - pr_debug("IPv6 RT LEN %u %u ", hdrlen, rh->hdrlen); - pr_debug("TYPE %04X ", rh->type); - pr_debug("SGS_LEFT %u %02X\n", rh->segments_left, rh->segments_left); - - pr_debug("IPv6 RT segsleft %02X ", - segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], - rh->segments_left, - !!(rtinfo->invflags & IP6T_RT_INV_SGS))); - pr_debug("type %02X %02X %02X ", - rtinfo->rt_type, rh->type, - (!(rtinfo->flags & IP6T_RT_TYP) || - ((rtinfo->rt_type == rh->type) ^ - !!(rtinfo->invflags & IP6T_RT_INV_TYP)))); - pr_debug("len %02X %04X %02X ", - rtinfo->hdrlen, hdrlen, - !(rtinfo->flags & IP6T_RT_LEN) || - ((rtinfo->hdrlen == hdrlen) ^ - !!(rtinfo->invflags & IP6T_RT_INV_LEN))); - pr_debug("res %02X %02X %02X ", - rtinfo->flags & IP6T_RT_RES, - ((const struct rt0_hdr *)rh)->reserved, - !((rtinfo->flags & IP6T_RT_RES) && - (((const struct rt0_hdr *)rh)->reserved))); - ret = (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], rh->segments_left, !!(rtinfo->invflags & IP6T_RT_INV_SGS))) && @@ -107,22 +78,22 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) reserved), sizeof(_reserved), &_reserved); + if (!rp) { + par->hotdrop = true; + return false; + } ret = (*rp == 0); } - pr_debug("#%d ", rtinfo->addrnr); if (!(rtinfo->flags & IP6T_RT_FST)) { return ret; } else if (rtinfo->flags & IP6T_RT_FST_NSTRICT) { - pr_debug("Not strict "); if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) { - pr_debug("There isn't enough space\n"); return false; } else { unsigned int i = 0; - pr_debug("#%d ", rtinfo->addrnr); for (temp = 0; temp < (unsigned int)((hdrlen - 8) / 16); temp++) { @@ -138,26 +109,20 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) return false; } - if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) { - pr_debug("i=%d temp=%d;\n", i, temp); + if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) i++; - } if (i == rtinfo->addrnr) break; } - pr_debug("i=%d #%d\n", i, rtinfo->addrnr); if (i == rtinfo->addrnr) return ret; else return false; } } else { - pr_debug("Strict "); if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) { - pr_debug("There isn't enough space\n"); return false; } else { - pr_debug("#%d ", rtinfo->addrnr); for (temp = 0; temp < rtinfo->addrnr; temp++) { ap = skb_header_pointer(skb, ptr @@ -173,7 +138,6 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp])) break; } - pr_debug("temp=%d #%d\n", temp, rtinfo->addrnr); if (temp == rtinfo->addrnr && temp == (unsigned int)((hdrlen - 8) / 16)) return ret; diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index bb784ea7bbd3..df785ebda0ca 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -19,32 +19,21 @@ MODULE_DESCRIPTION("ip6tables filter table"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) -static int __net_init ip6table_filter_table_init(struct net *net); - static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_FILTER, - .table_init = ip6table_filter_table_init, }; -/* The work comes in here from netfilter.c. */ -static unsigned int -ip6table_filter_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip6t_do_table(skb, state, priv); -} - static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ static bool forward = true; module_param(forward, bool, 0000); -static int __net_init ip6table_filter_table_init(struct net *net) +static int ip6table_filter_table_init(struct net *net) { struct ip6t_replace *repl; int err; @@ -63,7 +52,7 @@ static int __net_init ip6table_filter_table_init(struct net *net) static int __net_init ip6table_filter_net_init(struct net *net) { - if (net == &init_net || !forward) + if (!forward) return ip6table_filter_table_init(net); return 0; @@ -87,15 +76,24 @@ static struct pernet_operations ip6table_filter_net_ops = { static int __init ip6table_filter_init(void) { - int ret; + int ret = xt_register_template(&packet_filter, + ip6table_filter_table_init); - filter_ops = xt_hook_ops_alloc(&packet_filter, ip6table_filter_hook); - if (IS_ERR(filter_ops)) + if (ret < 0) + return ret; + + filter_ops = xt_hook_ops_alloc(&packet_filter, ip6t_do_table); + if (IS_ERR(filter_ops)) { + xt_unregister_template(&packet_filter); return PTR_ERR(filter_ops); + } ret = register_pernet_subsys(&ip6table_filter_net_ops); - if (ret < 0) + if (ret < 0) { + xt_unregister_template(&packet_filter); kfree(filter_ops); + return ret; + } return ret; } @@ -103,6 +101,7 @@ static int __init ip6table_filter_init(void) static void __exit ip6table_filter_fini(void) { unregister_pernet_subsys(&ip6table_filter_net_ops); + xt_unregister_template(&packet_filter); kfree(filter_ops); } diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c index c76cffd63041..a88b2ce4a3cb 100644 --- a/net/ipv6/netfilter/ip6table_mangle.c +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -20,19 +20,16 @@ MODULE_DESCRIPTION("ip6tables mangle table"); (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) -static int __net_init ip6table_mangle_table_init(struct net *net); - static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_MANGLE, - .table_init = ip6table_mangle_table_init, }; static unsigned int -ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *priv) +ip6t_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret; struct in6_addr saddr, daddr; @@ -49,7 +46,7 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *pr /* flowlabel and prio (includes version, which shouldn't change either */ flowlabel = *((u_int32_t *)ipv6_hdr(skb)); - ret = ip6t_do_table(skb, state, priv); + ret = ip6t_do_table(priv, skb, state); if (ret != NF_DROP && ret != NF_STOLEN && (!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) || @@ -71,12 +68,12 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (state->hook == NF_INET_LOCAL_OUT) - return ip6t_mangle_out(skb, state, priv); - return ip6t_do_table(skb, state, priv); + return ip6t_mangle_out(priv, skb, state); + return ip6t_do_table(priv, skb, state); } static struct nf_hook_ops *mangle_ops __read_mostly; -static int __net_init ip6table_mangle_table_init(struct net *net) +static int ip6table_mangle_table_init(struct net *net) { struct ip6t_replace *repl; int ret; @@ -106,29 +103,32 @@ static struct pernet_operations ip6table_mangle_net_ops = { static int __init ip6table_mangle_init(void) { - int ret; + int ret = xt_register_template(&packet_mangler, + ip6table_mangle_table_init); + + if (ret < 0) + return ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook); - if (IS_ERR(mangle_ops)) + if (IS_ERR(mangle_ops)) { + xt_unregister_template(&packet_mangler); return PTR_ERR(mangle_ops); + } ret = register_pernet_subsys(&ip6table_mangle_net_ops); if (ret < 0) { + xt_unregister_template(&packet_mangler); kfree(mangle_ops); return ret; } - ret = ip6table_mangle_table_init(&init_net); - if (ret) { - unregister_pernet_subsys(&ip6table_mangle_net_ops); - kfree(mangle_ops); - } return ret; } static void __exit ip6table_mangle_fini(void) { unregister_pernet_subsys(&ip6table_mangle_net_ops); + xt_unregister_template(&packet_mangler); kfree(mangle_ops); } diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c index b0292251e655..bf3cb3a13600 100644 --- a/net/ipv6/netfilter/ip6table_nat.c +++ b/net/ipv6/netfilter/ip6table_nat.c @@ -19,8 +19,6 @@ struct ip6table_nat_pernet { struct nf_hook_ops *nf_nat_ops; }; -static int __net_init ip6table_nat_table_init(struct net *net); - static unsigned int ip6table_nat_net_id __read_mostly; static const struct xt_table nf_nat_ipv6_table = { @@ -31,37 +29,29 @@ static const struct xt_table nf_nat_ipv6_table = { (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, .af = NFPROTO_IPV6, - .table_init = ip6table_nat_table_init, }; -static unsigned int ip6table_nat_do_chain(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip6t_do_table(skb, state, priv); -} - static const struct nf_hook_ops nf_nat_ipv6_ops[] = { { - .hook = ip6table_nat_do_chain, + .hook = ip6t_do_table, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_NAT_DST, }, { - .hook = ip6table_nat_do_chain, + .hook = ip6t_do_table, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_NAT_SRC, }, { - .hook = ip6table_nat_do_chain, + .hook = ip6t_do_table, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST, }, { - .hook = ip6table_nat_do_chain, + .hook = ip6t_do_table, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC, @@ -115,7 +105,7 @@ static void ip6t_nat_unregister_lookups(struct net *net) kfree(ops); } -static int __net_init ip6table_nat_table_init(struct net *net) +static int ip6table_nat_table_init(struct net *net) { struct ip6t_replace *repl; int ret; @@ -157,20 +147,23 @@ static struct pernet_operations ip6table_nat_net_ops = { static int __init ip6table_nat_init(void) { - int ret = register_pernet_subsys(&ip6table_nat_net_ops); + int ret = xt_register_template(&nf_nat_ipv6_table, + ip6table_nat_table_init); - if (ret) + if (ret < 0) return ret; - ret = ip6table_nat_table_init(&init_net); + ret = register_pernet_subsys(&ip6table_nat_net_ops); if (ret) - unregister_pernet_subsys(&ip6table_nat_net_ops); + xt_unregister_template(&nf_nat_ipv6_table); + return ret; } static void __exit ip6table_nat_exit(void) { unregister_pernet_subsys(&ip6table_nat_net_ops); + xt_unregister_template(&nf_nat_ipv6_table); } module_init(ip6table_nat_init); diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c index f63c106c521e..08861d5d1f4d 100644 --- a/net/ipv6/netfilter/ip6table_raw.c +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -11,8 +11,6 @@ #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) -static int __net_init ip6table_raw_table_init(struct net *net); - static bool raw_before_defrag __read_mostly; MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag"); module_param(raw_before_defrag, bool, 0000); @@ -23,7 +21,6 @@ static const struct xt_table packet_raw = { .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_RAW, - .table_init = ip6table_raw_table_init, }; static const struct xt_table packet_raw_before_defrag = { @@ -32,20 +29,11 @@ static const struct xt_table packet_raw_before_defrag = { .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_RAW_BEFORE_DEFRAG, - .table_init = ip6table_raw_table_init, }; -/* The work comes in here from netfilter.c. */ -static unsigned int -ip6table_raw_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip6t_do_table(skb, state, priv); -} - static struct nf_hook_ops *rawtable_ops __read_mostly; -static int __net_init ip6table_raw_table_init(struct net *net) +static int ip6table_raw_table_init(struct net *net) { struct ip6t_replace *repl; const struct xt_table *table = &packet_raw; @@ -79,37 +67,39 @@ static struct pernet_operations ip6table_raw_net_ops = { static int __init ip6table_raw_init(void) { - int ret; const struct xt_table *table = &packet_raw; + int ret; if (raw_before_defrag) { table = &packet_raw_before_defrag; - pr_info("Enabling raw table before defrag\n"); } + ret = xt_register_template(table, ip6table_raw_table_init); + if (ret < 0) + return ret; + /* Register hooks */ - rawtable_ops = xt_hook_ops_alloc(table, ip6table_raw_hook); - if (IS_ERR(rawtable_ops)) + rawtable_ops = xt_hook_ops_alloc(table, ip6t_do_table); + if (IS_ERR(rawtable_ops)) { + xt_unregister_template(table); return PTR_ERR(rawtable_ops); + } ret = register_pernet_subsys(&ip6table_raw_net_ops); if (ret < 0) { kfree(rawtable_ops); + xt_unregister_template(table); return ret; } - ret = ip6table_raw_table_init(&init_net); - if (ret) { - unregister_pernet_subsys(&ip6table_raw_net_ops); - kfree(rawtable_ops); - } return ret; } static void __exit ip6table_raw_fini(void) { unregister_pernet_subsys(&ip6table_raw_net_ops); + xt_unregister_template(&packet_raw); kfree(rawtable_ops); } diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c index 8dc335cf450b..4df14a9bae78 100644 --- a/net/ipv6/netfilter/ip6table_security.c +++ b/net/ipv6/netfilter/ip6table_security.c @@ -24,27 +24,17 @@ MODULE_DESCRIPTION("ip6tables security table, for MAC rules"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) -static int __net_init ip6table_security_table_init(struct net *net); - static const struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV6, .priority = NF_IP6_PRI_SECURITY, - .table_init = ip6table_security_table_init, }; -static unsigned int -ip6table_security_hook(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip6t_do_table(skb, state, priv); -} - static struct nf_hook_ops *sectbl_ops __read_mostly; -static int __net_init ip6table_security_table_init(struct net *net) +static int ip6table_security_table_init(struct net *net) { struct ip6t_replace *repl; int ret; @@ -74,29 +64,32 @@ static struct pernet_operations ip6table_security_net_ops = { static int __init ip6table_security_init(void) { - int ret; + int ret = xt_register_template(&security_table, + ip6table_security_table_init); - sectbl_ops = xt_hook_ops_alloc(&security_table, ip6table_security_hook); - if (IS_ERR(sectbl_ops)) + if (ret < 0) + return ret; + + sectbl_ops = xt_hook_ops_alloc(&security_table, ip6t_do_table); + if (IS_ERR(sectbl_ops)) { + xt_unregister_template(&security_table); return PTR_ERR(sectbl_ops); + } ret = register_pernet_subsys(&ip6table_security_net_ops); if (ret < 0) { kfree(sectbl_ops); + xt_unregister_template(&security_table); return ret; } - ret = ip6table_security_table_init(&init_net); - if (ret) { - unregister_pernet_subsys(&ip6table_security_net_ops); - kfree(sectbl_ops); - } return ret; } static void __exit ip6table_security_fini(void) { unregister_pernet_subsys(&ip6table_security_net_ops); + xt_unregister_template(&security_table); kfree(sectbl_ops); } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index a0108415275f..5c47be29b9ee 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -33,7 +33,7 @@ static const char nf_frags_cache_name[] = "nf-frags"; -unsigned int nf_frag_pernet_id __read_mostly; +static unsigned int nf_frag_pernet_id __read_mostly; static struct inet_frags nf_frags; static struct nft_ct_frag6_pernet *nf_frag_pernet(struct net *net) diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index e8a59d8bf2ad..cb4eb1d2c620 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -25,8 +25,6 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> -extern unsigned int nf_frag_pernet_id; - static DEFINE_MUTEX(defrag6_mutex); static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, @@ -91,12 +89,10 @@ static const struct nf_hook_ops ipv6_defrag_ops[] = { static void __net_exit defrag6_net_exit(struct net *net) { - struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); - - if (nf_frag->users) { + if (net->nf.defrag_ipv6_users) { nf_unregister_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); - nf_frag->users = 0; + net->nf.defrag_ipv6_users = 0; } } @@ -134,24 +130,23 @@ static void __exit nf_defrag_fini(void) int nf_defrag_ipv6_enable(struct net *net) { - struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); int err = 0; mutex_lock(&defrag6_mutex); - if (nf_frag->users == UINT_MAX) { + if (net->nf.defrag_ipv6_users == UINT_MAX) { err = -EOVERFLOW; goto out_unlock; } - if (nf_frag->users) { - nf_frag->users++; + if (net->nf.defrag_ipv6_users) { + net->nf.defrag_ipv6_users++; goto out_unlock; } err = nf_register_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); if (err == 0) - nf_frag->users = 1; + net->nf.defrag_ipv6_users = 1; out_unlock: mutex_unlock(&defrag6_mutex); @@ -161,12 +156,10 @@ EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable); void nf_defrag_ipv6_disable(struct net *net) { - struct nft_ct_frag6_pernet *nf_frag = net_generic(net, nf_frag_pernet_id); - mutex_lock(&defrag6_mutex); - if (nf_frag->users) { - nf_frag->users--; - if (nf_frag->users == 0) + if (net->nf.defrag_ipv6_users) { + net->nf.defrag_ipv6_users--; + if (net->nf.defrag_ipv6_users == 0) nf_unregister_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); } diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index 6fd54744cbc3..aa5bb8789ba0 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -99,7 +99,7 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, { __be16 dport, sport; const struct in6_addr *daddr = NULL, *saddr = NULL; - struct ipv6hdr *iph = ipv6_hdr(skb); + struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var; struct sk_buff *data_skb = NULL; int doff = 0; int thoff = 0, tproto; @@ -129,8 +129,6 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, thoff + sizeof(*hp); } else if (tproto == IPPROTO_ICMPV6) { - struct ipv6hdr ipv6_var; - if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr, &sport, &dport, &ipv6_var)) return NULL; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c5e8ecb96426..3ae25b8ffbd6 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1657,6 +1657,7 @@ static int rt6_insert_exception(struct rt6_info *nrt, struct in6_addr *src_key = NULL; struct rt6_exception *rt6_ex; struct fib6_nh *nh = res->nh; + int max_depth; int err = 0; spin_lock_bh(&rt6_exception_lock); @@ -1711,7 +1712,9 @@ static int rt6_insert_exception(struct rt6_info *nrt, bucket->depth++; net->ipv6.rt6_stats->fib_rt_cache++; - if (bucket->depth > FIB6_MAX_DEPTH) + /* Randomize max depth to avoid some side channels attacks. */ + max_depth = FIB6_MAX_DEPTH + prandom_u32_max(FIB6_MAX_DEPTH); + while (bucket->depth > max_depth) rt6_exception_remove_oldest(bucket); out: @@ -3209,25 +3212,7 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst) { - struct inet6_dev *idev; - unsigned int mtu; - - mtu = dst_metric_raw(dst, RTAX_MTU); - if (mtu) - goto out; - - mtu = IPV6_MIN_MTU; - - rcu_read_lock(); - idev = __in6_dev_get(dst->dev); - if (idev) - mtu = idev->cnf.mtu6; - rcu_read_unlock(); - -out: - mtu = min_t(unsigned int, mtu, IP6_MAX_MTU); - - return mtu - lwtunnel_headroom(dst->lwtstate, mtu); + return ip6_dst_mtu_maybe_forward(dst, false); } EXPORT_INDIRECT_CALLABLE(ip6_mtu); @@ -3652,8 +3637,7 @@ out: if (err) { lwtstate_put(fib6_nh->fib_nh_lws); fib6_nh->fib_nh_lws = NULL; - if (dev) - dev_put(dev); + dev_put(dev); } return err; @@ -5697,14 +5681,15 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common, - rt->fib6_nh->fib_nh_weight, AF_INET6) < 0) + rt->fib6_nh->fib_nh_weight, AF_INET6, + 0) < 0) goto nla_put_failure; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) { if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, sibling->fib6_nh->fib_nh_weight, - AF_INET6) < 0) + AF_INET6, 0) < 0) goto nla_put_failure; } @@ -6321,11 +6306,11 @@ static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, static struct ctl_table ipv6_route_table_template[] = { { - .procname = "flush", - .data = &init_net.ipv6.sysctl.flush_delay, + .procname = "max_size", + .data = &init_net.ipv6.sysctl.ip6_rt_max_size, .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = ipv6_sysctl_rtcache_flush + .mode = 0644, + .proc_handler = proc_dointvec, }, { .procname = "gc_thresh", @@ -6335,11 +6320,11 @@ static struct ctl_table ipv6_route_table_template[] = { .proc_handler = proc_dointvec, }, { - .procname = "max_size", - .data = &init_net.ipv6.sysctl.ip6_rt_max_size, + .procname = "flush", + .data = &init_net.ipv6.sysctl.flush_delay, .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, + .mode = 0200, + .proc_handler = ipv6_sysctl_rtcache_flush }, { .procname = "gc_min_interval", @@ -6411,10 +6396,10 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) GFP_KERNEL); if (table) { - table[0].data = &net->ipv6.sysctl.flush_delay; - table[0].extra1 = net; + table[0].data = &net->ipv6.sysctl.ip6_rt_max_size; table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh; - table[2].data = &net->ipv6.sysctl.ip6_rt_max_size; + table[2].data = &net->ipv6.sysctl.flush_delay; + table[2].extra1 = net; table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval; table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout; table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval; @@ -6426,7 +6411,7 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net) /* Don't export sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) - table[0].procname = NULL; + table[1].procname = NULL; } return table; @@ -6646,7 +6631,7 @@ int __init ip6_route_init(void) ret = -ENOMEM; ip6_dst_ops_template.kmem_cachep = kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0, - SLAB_HWCACHE_ALIGN, NULL); + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!ip6_dst_ops_template.kmem_cachep) goto out; diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index e412817fba2f..5daa1c3ed83b 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -374,7 +374,11 @@ static int __net_init seg6_net_init(struct net *net) net->ipv6.seg6_data = sdata; #ifdef CONFIG_IPV6_SEG6_HMAC - seg6_hmac_net_init(net); + if (seg6_hmac_net_init(net)) { + kfree(rcu_dereference_raw(sdata->tun_src)); + kfree(sdata); + return -ENOMEM; + }; #endif return 0; @@ -388,7 +392,7 @@ static void __net_exit seg6_net_exit(struct net *net) seg6_hmac_net_exit(net); #endif - kfree(sdata->tun_src); + kfree(rcu_dereference_raw(sdata->tun_src)); kfree(sdata); } diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 687d95dce085..29bc4e7c3046 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -405,9 +405,7 @@ int __net_init seg6_hmac_net_init(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); - rhashtable_init(&sdata->hmac_infos, &rht_params); - - return 0; + return rhashtable_init(&sdata->hmac_infos, &rht_params); } EXPORT_SYMBOL(seg6_hmac_net_init); diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 897fa59c47de..3adc5d9211ad 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -26,6 +26,7 @@ #ifdef CONFIG_IPV6_SEG6_HMAC #include <net/seg6_hmac.h> #endif +#include <linux/netfilter.h> static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) { @@ -295,11 +296,19 @@ static int seg6_do_srh(struct sk_buff *skb) ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + nf_reset_ct(skb); return 0; } -static int seg6_input(struct sk_buff *skb) +static int seg6_input_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + return dst_input(skb); +} + +static int seg6_input_core(struct net *net, struct sock *sk, + struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; @@ -337,15 +346,46 @@ static int seg6_input(struct sk_buff *skb) if (unlikely(err)) return err; - return dst_input(skb); + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, + dev_net(skb->dev), NULL, skb, NULL, + skb_dst(skb)->dev, seg6_input_finish); + + return seg6_input_finish(dev_net(skb->dev), NULL, skb); } -static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) +static int seg6_input_nf(struct sk_buff *skb) +{ + struct net_device *dev = skb_dst(skb)->dev; + struct net *net = dev_net(skb->dev); + + switch (skb->protocol) { + case htons(ETH_P_IP): + return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, NULL, + skb, NULL, dev, seg6_input_core); + case htons(ETH_P_IPV6): + return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, NULL, + skb, NULL, dev, seg6_input_core); + } + + return -EINVAL; +} + +static int seg6_input(struct sk_buff *skb) +{ + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return seg6_input_nf(skb); + + return seg6_input_core(dev_net(skb->dev), NULL, skb); +} + +static int seg6_output_core(struct net *net, struct sock *sk, + struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct seg6_lwt *slwt; - int err = -EINVAL; + int err; err = seg6_do_srh(skb); if (unlikely(err)) @@ -387,12 +427,40 @@ static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (unlikely(err)) goto drop; + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, + NULL, skb_dst(skb)->dev, dst_output); + return dst_output(net, sk, skb); drop: kfree_skb(skb); return err; } +static int seg6_output_nf(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct net_device *dev = skb_dst(skb)->dev; + + switch (skb->protocol) { + case htons(ETH_P_IP): + return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, + NULL, dev, seg6_output_core); + case htons(ETH_P_IPV6): + return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, + NULL, dev, seg6_output_core); + } + + return -EINVAL; +} + +static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return seg6_output_nf(net, sk, skb); + + return seg6_output_core(net, sk, skb); +} + static int seg6_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 60bf3b877957..2dc40b3f373e 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -30,6 +30,7 @@ #include <net/seg6_local.h> #include <linux/etherdevice.h> #include <linux/bpf.h> +#include <linux/netfilter.h> #define SEG6_F_ATTR(i) BIT(i) @@ -413,12 +414,33 @@ drop: return -EINVAL; } +static int input_action_end_dx6_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + struct dst_entry *orig_dst = skb_dst(skb); + struct in6_addr *nhaddr = NULL; + struct seg6_local_lwt *slwt; + + slwt = seg6_local_lwtunnel(orig_dst->lwtstate); + + /* The inner packet is not associated to any local interface, + * so we do not call netif_rx(). + * + * If slwt->nh6 is set to ::, then lookup the nexthop for the + * inner packet's DA. Otherwise, use the specified nexthop. + */ + if (!ipv6_addr_any(&slwt->nh6)) + nhaddr = &slwt->nh6; + + seg6_lookup_nexthop(skb, nhaddr, 0); + + return dst_input(skb); +} + /* decapsulate and forward to specified nexthop */ static int input_action_end_dx6(struct sk_buff *skb, struct seg6_local_lwt *slwt) { - struct in6_addr *nhaddr = NULL; - /* this function accepts IPv6 encapsulated packets, with either * an SRH with SL=0, or no SRH. */ @@ -429,40 +451,30 @@ static int input_action_end_dx6(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; - /* The inner packet is not associated to any local interface, - * so we do not call netif_rx(). - * - * If slwt->nh6 is set to ::, then lookup the nexthop for the - * inner packet's DA. Otherwise, use the specified nexthop. - */ - - if (!ipv6_addr_any(&slwt->nh6)) - nhaddr = &slwt->nh6; - skb_set_transport_header(skb, sizeof(struct ipv6hdr)); + nf_reset_ct(skb); - seg6_lookup_nexthop(skb, nhaddr, 0); + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, + dev_net(skb->dev), NULL, skb, NULL, + skb_dst(skb)->dev, input_action_end_dx6_finish); - return dst_input(skb); + return input_action_end_dx6_finish(dev_net(skb->dev), NULL, skb); drop: kfree_skb(skb); return -EINVAL; } -static int input_action_end_dx4(struct sk_buff *skb, - struct seg6_local_lwt *slwt) +static int input_action_end_dx4_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) { + struct dst_entry *orig_dst = skb_dst(skb); + struct seg6_local_lwt *slwt; struct iphdr *iph; __be32 nhaddr; int err; - if (!decap_and_validate(skb, IPPROTO_IPIP)) - goto drop; - - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto drop; - - skb->protocol = htons(ETH_P_IP); + slwt = seg6_local_lwtunnel(orig_dst->lwtstate); iph = ip_hdr(skb); @@ -470,14 +482,34 @@ static int input_action_end_dx4(struct sk_buff *skb, skb_dst_drop(skb); - skb_set_transport_header(skb, sizeof(struct iphdr)); - err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev); - if (err) - goto drop; + if (err) { + kfree_skb(skb); + return -EINVAL; + } return dst_input(skb); +} + +static int input_action_end_dx4(struct sk_buff *skb, + struct seg6_local_lwt *slwt) +{ + if (!decap_and_validate(skb, IPPROTO_IPIP)) + goto drop; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto drop; + + skb->protocol = htons(ETH_P_IP); + skb_set_transport_header(skb, sizeof(struct iphdr)); + nf_reset_ct(skb); + + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, + dev_net(skb->dev), NULL, skb, NULL, + skb_dst(skb)->dev, input_action_end_dx4_finish); + return input_action_end_dx4_finish(dev_net(skb->dev), NULL, skb); drop: kfree_skb(skb); return -EINVAL; @@ -645,6 +677,7 @@ static struct sk_buff *end_dt_vrf_core(struct sk_buff *skb, skb_dst_drop(skb); skb_set_transport_header(skb, hdrlen); + nf_reset_ct(skb); return end_dt_vrf_rcv(skb, family, vrf); @@ -1078,7 +1111,8 @@ static void seg6_local_update_counters(struct seg6_local_lwt *slwt, u64_stats_update_end(&pcounters->syncp); } -static int seg6_local_input(struct sk_buff *skb) +static int seg6_local_input_core(struct net *net, struct sock *sk, + struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct seg6_action_desc *desc; @@ -1086,11 +1120,6 @@ static int seg6_local_input(struct sk_buff *skb) unsigned int len = skb->len; int rc; - if (skb->protocol != htons(ETH_P_IPV6)) { - kfree_skb(skb); - return -EINVAL; - } - slwt = seg6_local_lwtunnel(orig_dst->lwtstate); desc = slwt->desc; @@ -1104,6 +1133,21 @@ static int seg6_local_input(struct sk_buff *skb) return rc; } +static int seg6_local_input(struct sk_buff *skb) +{ + if (skb->protocol != htons(ETH_P_IPV6)) { + kfree_skb(skb); + return -EINVAL; + } + + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, + seg6_local_input_core); + + return seg6_local_input_core(dev_net(skb->dev), NULL, skb); +} + static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { [SEG6_LOCAL_ACTION] = { .type = NLA_U32 }, [SEG6_LOCAL_SRH] = { .type = NLA_BINARY }, diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index df5bea818410..1b57ee36d668 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -204,7 +204,7 @@ static int ipip6_tunnel_create(struct net_device *dev) struct sit_net *sitn = net_generic(net, sit_net_id); int err; - memcpy(dev->dev_addr, &t->parms.iph.saddr, 4); + __dev_addr_set(dev, &t->parms.iph.saddr, 4); memcpy(dev->broadcast, &t->parms.iph.daddr, 4); if ((__force u16)t->parms.i_flags & SIT_ISATAP) @@ -299,9 +299,8 @@ __ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr) } -static int ipip6_tunnel_get_prl(struct net_device *dev, struct ifreq *ifr) +static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __user *a) { - struct ip_tunnel_prl __user *a = ifr->ifr_ifru.ifru_data; struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_prl kprl, *kp; struct ip_tunnel_prl_entry *prl; @@ -321,7 +320,7 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ifreq *ifr) * we try harder to allocate. */ kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ? - kcalloc(cmax, sizeof(*kp), GFP_KERNEL | __GFP_NOWARN) : + kcalloc(cmax, sizeof(*kp), GFP_KERNEL_ACCOUNT | __GFP_NOWARN) : NULL; rcu_read_lock(); @@ -334,7 +333,8 @@ static int ipip6_tunnel_get_prl(struct net_device *dev, struct ifreq *ifr) * For root users, retry allocating enough memory for * the answer. */ - kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC); + kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC | __GFP_ACCOUNT | + __GFP_NOWARN); if (!kp) { ret = -ENOMEM; goto out; @@ -453,8 +453,8 @@ out: return err; } -static int ipip6_tunnel_prl_ctl(struct net_device *dev, struct ifreq *ifr, - int cmd) +static int ipip6_tunnel_prl_ctl(struct net_device *dev, + struct ip_tunnel_prl __user *data, int cmd) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_prl prl; @@ -465,7 +465,7 @@ static int ipip6_tunnel_prl_ctl(struct net_device *dev, struct ifreq *ifr, if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) return -EINVAL; - if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl))) + if (copy_from_user(&prl, data, sizeof(prl))) return -EFAULT; switch (cmd) { @@ -1149,7 +1149,7 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p, synchronize_net(); t->parms.iph.saddr = p->iph.saddr; t->parms.iph.daddr = p->iph.daddr; - memcpy(t->dev->dev_addr, &p->iph.saddr, 4); + __dev_addr_set(t->dev, &p->iph.saddr, 4); memcpy(t->dev->broadcast, &p->iph.daddr, 4); ipip6_tunnel_link(sitn, t); t->parms.iph.ttl = p->iph.ttl; @@ -1197,14 +1197,14 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, } static int -ipip6_tunnel_get6rd(struct net_device *dev, struct ifreq *ifr) +ipip6_tunnel_get6rd(struct net_device *dev, struct ip_tunnel_parm __user *data) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_6rd ip6rd; struct ip_tunnel_parm p; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + if (copy_from_user(&p, data, sizeof(p))) return -EFAULT; t = ipip6_tunnel_locate(t->net, &p, 0); } @@ -1215,13 +1215,14 @@ ipip6_tunnel_get6rd(struct net_device *dev, struct ifreq *ifr) ip6rd.relay_prefix = t->ip6rd.relay_prefix; ip6rd.prefixlen = t->ip6rd.prefixlen; ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; - if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd, sizeof(ip6rd))) + if (copy_to_user(data, &ip6rd, sizeof(ip6rd))) return -EFAULT; return 0; } static int -ipip6_tunnel_6rdctl(struct net_device *dev, struct ifreq *ifr, int cmd) +ipip6_tunnel_6rdctl(struct net_device *dev, struct ip_tunnel_6rd __user *data, + int cmd) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_6rd ip6rd; @@ -1229,7 +1230,7 @@ ipip6_tunnel_6rdctl(struct net_device *dev, struct ifreq *ifr, int cmd) if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) return -EPERM; - if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data, sizeof(ip6rd))) + if (copy_from_user(&ip6rd, data, sizeof(ip6rd))) return -EFAULT; if (cmd != SIOCDEL6RD) { @@ -1368,27 +1369,28 @@ ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) } static int -ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +ipip6_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, + void __user *data, int cmd) { switch (cmd) { case SIOCGETTUNNEL: case SIOCADDTUNNEL: case SIOCCHGTUNNEL: case SIOCDELTUNNEL: - return ip_tunnel_ioctl(dev, ifr, cmd); + return ip_tunnel_siocdevprivate(dev, ifr, data, cmd); case SIOCGETPRL: - return ipip6_tunnel_get_prl(dev, ifr); + return ipip6_tunnel_get_prl(dev, data); case SIOCADDPRL: case SIOCDELPRL: case SIOCCHGPRL: - return ipip6_tunnel_prl_ctl(dev, ifr, cmd); + return ipip6_tunnel_prl_ctl(dev, data, cmd); #ifdef CONFIG_IPV6_SIT_6RD case SIOCGET6RD: - return ipip6_tunnel_get6rd(dev, ifr); + return ipip6_tunnel_get6rd(dev, data); case SIOCADD6RD: case SIOCCHG6RD: case SIOCDEL6RD: - return ipip6_tunnel_6rdctl(dev, ifr, cmd); + return ipip6_tunnel_6rdctl(dev, data, cmd); #endif default: return -EINVAL; @@ -1399,7 +1401,7 @@ static const struct net_device_ops ipip6_netdev_ops = { .ndo_init = ipip6_tunnel_init, .ndo_uninit = ipip6_tunnel_uninit, .ndo_start_xmit = sit_tunnel_xmit, - .ndo_do_ioctl = ipip6_tunnel_ioctl, + .ndo_siocdevprivate = ipip6_tunnel_siocdevprivate, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip6_tunnel_ctl, diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index d7cf26f730d7..d53dd142bf87 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -21,6 +21,7 @@ #ifdef CONFIG_NETLABEL #include <net/calipso.h> #endif +#include <linux/ioam6.h> static int two = 2; static int three = 3; @@ -28,6 +29,8 @@ static int flowlabel_reflect_max = 0x7; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; static u32 rt6_multipath_hash_fields_all_mask = FIB_MULTIPATH_HASH_FIELD_ALL_MASK; +static u32 ioam6_id_max = IOAM6_DEFAULT_ID; +static u64 ioam6_id_wide_max = IOAM6_DEFAULT_ID_WIDE; static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -196,6 +199,22 @@ static struct ctl_table ipv6_table_template[] = { .extra1 = SYSCTL_ZERO, .extra2 = &two, }, + { + .procname = "ioam6_id", + .data = &init_net.ipv6.sysctl.ioam6_id, + .maxlen = sizeof(u32), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra2 = &ioam6_id_max, + }, + { + .procname = "ioam6_id_wide", + .data = &init_net.ipv6.sysctl.ioam6_id_wide, + .maxlen = sizeof(u64), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra2 = &ioam6_id_wide_max, + }, { } }; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0ce52d46e4f8..2cc9b0e53ad1 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -108,8 +108,8 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) const struct rt6_info *rt = (const struct rt6_info *)dst; sk->sk_rx_dst = dst; - inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; - tcp_inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); + sk->sk_rx_dst_ifindex = skb->skb_iif; + sk->sk_rx_dst_cookie = rt6_get_cookie(rt); } } @@ -414,9 +414,12 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sk->sk_state == TCP_CLOSE) goto out; - if (ipv6_hdr(skb)->hop_limit < tcp_inet6_sk(sk)->min_hopcount) { - __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); - goto out; + if (static_branch_unlikely(&ip6_min_hopcount)) { + /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ + if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) { + __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); + goto out; + } } tp = tcp_sk(sk); @@ -569,7 +572,7 @@ done: static void tcp_v6_reqsk_destructor(struct request_sock *req) { kfree(inet_rsk(req)->ipv6_opt); - kfree_skb(inet_rsk(req)->pktopts); + consume_skb(inet_rsk(req)->pktopts); } #ifdef CONFIG_TCP_MD5SIG @@ -599,6 +602,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; int l3index = 0; u8 prefixlen; + u8 flags; if (optlen < sizeof(cmd)) return -EINVAL; @@ -609,6 +613,8 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (sin6->sin6_family != AF_INET6) return -EINVAL; + flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; + if (optname == TCP_MD5SIG_EXT && cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { prefixlen = cmd.tcpm_prefixlen; @@ -619,7 +625,7 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 32 : 128; } - if (optname == TCP_MD5SIG_EXT && + if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { struct net_device *dev; @@ -640,9 +646,9 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], AF_INET, prefixlen, - l3index); + l3index, flags); return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr, - AF_INET6, prefixlen, l3index); + AF_INET6, prefixlen, l3index, flags); } if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) @@ -650,12 +656,12 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], - AF_INET, prefixlen, l3index, + AF_INET, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr, - AF_INET6, prefixlen, l3index, + AF_INET6, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } @@ -963,7 +969,6 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowlabel = label; buff->ip_summed = CHECKSUM_PARTIAL; - buff->csum = 0; __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); @@ -1404,7 +1409,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * * across. Shucks. */ tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr, - AF_INET6, 128, l3index, key->key, key->keylen, + AF_INET6, 128, l3index, key->flags, key->key, key->keylen, sk_gfp_mask(sk, GFP_ATOMIC)); } #endif @@ -1506,9 +1511,9 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); if (dst) { - if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || + if (sk->sk_rx_dst_ifindex != skb->skb_iif || INDIRECT_CALL_1(dst->ops->check, ip6_dst_check, - dst, np->rx_dst_cookie) == NULL) { + dst, sk->sk_rx_dst_cookie) == NULL) { dst_release(dst); sk->sk_rx_dst = NULL; } @@ -1588,7 +1593,7 @@ ipv6_pktoptions: } } - kfree_skb(opt_skb); + consume_skb(opt_skb); return 0; } @@ -1618,7 +1623,6 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { - struct sk_buff *skb_to_free; int sdif = inet6_sdif(skb); int dif = inet6_iif(skb); const struct tcphdr *th; @@ -1724,9 +1728,13 @@ process: return 0; } } - if (hdr->hop_limit < tcp_inet6_sk(sk)->min_hopcount) { - __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); - goto discard_and_relse; + + if (static_branch_unlikely(&ip6_min_hopcount)) { + /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ + if (hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) { + __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); + goto discard_and_relse; + } } if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) @@ -1754,17 +1762,12 @@ process: tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { - skb_to_free = sk->sk_rx_skb_cache; - sk->sk_rx_skb_cache = NULL; ret = tcp_v6_do_rcv(sk, skb); } else { if (tcp_add_backlog(sk, skb)) goto discard_and_relse; - skb_to_free = NULL; } bh_unlock_sock(sk); - if (skb_to_free) - __kfree_skb(skb_to_free); put_and_return: if (refcounted) sock_put(sk); @@ -1875,9 +1878,9 @@ INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb) struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); if (dst) - dst = dst_check(dst, tcp_inet6_sk(sk)->rx_dst_cookie); + dst = dst_check(dst, sk->sk_rx_dst_cookie); if (dst && - inet_sk(sk)->rx_dst_ifindex == skb->skb_iif) + sk->sk_rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); } } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c5e15e94bb00..12c12619ee35 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -133,7 +133,8 @@ static int compute_score(struct sock *sk, struct net *net, dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif); if (!dev_match) return -1; - score++; + if (sk->sk_bound_dev_if) + score++; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; @@ -883,7 +884,7 @@ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) if (udp_sk_rx_dst_set(sk, dst)) { const struct rt6_info *rt = (const struct rt6_info *)dst; - inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); + sk->sk_rx_dst_cookie = rt6_get_cookie(rt); } } @@ -1072,7 +1073,7 @@ INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb) dst = READ_ONCE(sk->sk_rx_dst); if (dst) - dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie); + dst = dst_check(dst, sk->sk_rx_dst_cookie); if (dst) { /* set noref for now. * any place which wants to hold dst has to call @@ -1303,7 +1304,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int addr_len = msg->msg_namelen; bool connected = false; int ulen = len; - int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE; int err; int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); @@ -1434,7 +1435,6 @@ do_udp_sendmsg: if (!fl6.flowi6_oif) fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; - fl6.flowi6_mark = ipc6.sockc.mark; fl6.flowi6_uid = sk->sk_uid; if (msg->msg_controllen) { @@ -1470,12 +1470,13 @@ do_udp_sendmsg: ipc6.opt = opt; fl6.flowi6_proto = sk->sk_protocol; + fl6.flowi6_mark = ipc6.sockc.mark; fl6.daddr = *daddr; if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; - if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) { + if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, (struct sockaddr *)sin6, &fl6.saddr); if (err) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 44453b35c7b7..18316ee3c692 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1044,7 +1044,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, if (err == 0) { atomic_dec(&iucv->skbs_in_xmit); skb_unlink(skb, &iucv->send_skb_q); - kfree_skb(skb); + consume_skb(skb); } /* this error should never happen since the */ @@ -1293,7 +1293,7 @@ static int iucv_sock_recvmsg(struct socket *sock, struct msghdr *msg, } } - kfree_skb(skb); + consume_skb(skb); if (iucv->transport == AF_IUCV_TRANS_HIPER) { atomic_inc(&iucv->msg_recv); if (atomic_read(&iucv->msg_recv) > iucv->msglimit) { @@ -1756,7 +1756,7 @@ static void iucv_callback_txdone(struct iucv_path *path, spin_unlock_irqrestore(&list->lock, flags); if (this) { - kfree_skb(this); + consume_skb(this); /* wake up any process waiting for sending */ iucv_sock_wake_msglim(sk); } @@ -1903,17 +1903,17 @@ static int afiucv_hs_callback_synack(struct sock *sk, struct sk_buff *skb) { struct iucv_sock *iucv = iucv_sk(sk); - if (!iucv) - goto out; - if (sk->sk_state != IUCV_BOUND) - goto out; + if (!iucv || sk->sk_state != IUCV_BOUND) { + kfree_skb(skb); + return NET_RX_SUCCESS; + } + bh_lock_sock(sk); iucv->msglimit_peer = iucv_trans_hdr(skb)->window; sk->sk_state = IUCV_CONNECTED; sk->sk_state_change(sk); bh_unlock_sock(sk); -out: - kfree_skb(skb); + consume_skb(skb); return NET_RX_SUCCESS; } @@ -1924,16 +1924,16 @@ static int afiucv_hs_callback_synfin(struct sock *sk, struct sk_buff *skb) { struct iucv_sock *iucv = iucv_sk(sk); - if (!iucv) - goto out; - if (sk->sk_state != IUCV_BOUND) - goto out; + if (!iucv || sk->sk_state != IUCV_BOUND) { + kfree_skb(skb); + return NET_RX_SUCCESS; + } + bh_lock_sock(sk); sk->sk_state = IUCV_DISCONN; sk->sk_state_change(sk); bh_unlock_sock(sk); -out: - kfree_skb(skb); + consume_skb(skb); return NET_RX_SUCCESS; } @@ -1945,16 +1945,18 @@ static int afiucv_hs_callback_fin(struct sock *sk, struct sk_buff *skb) struct iucv_sock *iucv = iucv_sk(sk); /* other end of connection closed */ - if (!iucv) - goto out; + if (!iucv) { + kfree_skb(skb); + return NET_RX_SUCCESS; + } + bh_lock_sock(sk); if (sk->sk_state == IUCV_CONNECTED) { sk->sk_state = IUCV_DISCONN; sk->sk_state_change(sk); } bh_unlock_sock(sk); -out: - kfree_skb(skb); + consume_skb(skb); return NET_RX_SUCCESS; } @@ -2107,7 +2109,7 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, case (AF_IUCV_FLAG_WIN): err = afiucv_hs_callback_win(sk, skb); if (skb->len == sizeof(struct af_iucv_trans_hdr)) { - kfree_skb(skb); + consume_skb(skb); break; } fallthrough; /* and receive non-zero length data */ @@ -2262,21 +2264,11 @@ static struct packet_type iucv_packet_type = { .func = afiucv_hs_rcv, }; -static int afiucv_iucv_init(void) -{ - return pr_iucv->iucv_register(&af_iucv_handler, 0); -} - -static void afiucv_iucv_exit(void) -{ - pr_iucv->iucv_unregister(&af_iucv_handler, 0); -} - static int __init afiucv_init(void) { int err; - if (MACHINE_IS_VM) { + if (MACHINE_IS_VM && IS_ENABLED(CONFIG_IUCV)) { cpcmd("QUERY USERID", iucv_userid, sizeof(iucv_userid), &err); if (unlikely(err)) { WARN_ON(err); @@ -2284,11 +2276,7 @@ static int __init afiucv_init(void) goto out; } - pr_iucv = try_then_request_module(symbol_get(iucv_if), "iucv"); - if (!pr_iucv) { - printk(KERN_WARNING "iucv_if lookup failed\n"); - memset(&iucv_userid, 0, sizeof(iucv_userid)); - } + pr_iucv = &iucv_if; } else { memset(&iucv_userid, 0, sizeof(iucv_userid)); pr_iucv = NULL; @@ -2302,7 +2290,7 @@ static int __init afiucv_init(void) goto out_proto; if (pr_iucv) { - err = afiucv_iucv_init(); + err = pr_iucv->iucv_register(&af_iucv_handler, 0); if (err) goto out_sock; } @@ -2316,23 +2304,19 @@ static int __init afiucv_init(void) out_notifier: if (pr_iucv) - afiucv_iucv_exit(); + pr_iucv->iucv_unregister(&af_iucv_handler, 0); out_sock: sock_unregister(PF_IUCV); out_proto: proto_unregister(&iucv_proto); out: - if (pr_iucv) - symbol_put(iucv_if); return err; } static void __exit afiucv_exit(void) { - if (pr_iucv) { - afiucv_iucv_exit(); - symbol_put(iucv_if); - } + if (pr_iucv) + pr_iucv->iucv_unregister(&af_iucv_handler, 0); unregister_netdevice_notifier(&afiucv_netdev_notifier); dev_remove_pack(&iucv_packet_type); diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index e6795d5a546a..f3343a8541a5 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -286,19 +286,19 @@ static union iucv_param *iucv_param_irq[NR_CPUS]; */ static inline int __iucv_call_b2f0(int command, union iucv_param *parm) { - register unsigned long reg0 asm ("0"); - register unsigned long reg1 asm ("1"); - int ccode; + int cc; - reg0 = command; - reg1 = (unsigned long)parm; asm volatile( - " .long 0xb2f01000\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (ccode), "=m" (*parm), "+d" (reg0), "+a" (reg1) - : "m" (*parm) : "cc"); - return ccode; + " lgr 0,%[reg0]\n" + " lgr 1,%[reg1]\n" + " .long 0xb2f01000\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=&d" (cc), "+m" (*parm) + : [reg0] "d" ((unsigned long)command), + [reg1] "d" ((unsigned long)parm) + : "cc", "0", "1"); + return cc; } static inline int iucv_call_b2f0(int command, union iucv_param *parm) @@ -319,19 +319,21 @@ static inline int iucv_call_b2f0(int command, union iucv_param *parm) */ static int __iucv_query_maxconn(void *param, unsigned long *max_pathid) { - register unsigned long reg0 asm ("0"); - register unsigned long reg1 asm ("1"); - int ccode; + unsigned long reg1 = (unsigned long)param; + int cc; - reg0 = IUCV_QUERY; - reg1 = (unsigned long) param; asm volatile ( + " lghi 0,%[cmd]\n" + " lgr 1,%[reg1]\n" " .long 0xb2f01000\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (ccode), "+d" (reg0), "+d" (reg1) : : "cc"); + " ipm %[cc]\n" + " srl %[cc],28\n" + " lgr %[reg1],1\n" + : [cc] "=&d" (cc), [reg1] "+&d" (reg1) + : [cmd] "K" (IUCV_QUERY) + : "cc", "0", "1"); *max_pathid = reg1; - return ccode; + return cc; } static int iucv_query_maxconn(void) @@ -500,14 +502,14 @@ static void iucv_setmask_mp(void) { int cpu; - get_online_cpus(); + cpus_read_lock(); for_each_online_cpu(cpu) /* Enable all cpus with a declared buffer. */ if (cpumask_test_cpu(cpu, &iucv_buffer_cpumask) && !cpumask_test_cpu(cpu, &iucv_irq_cpumask)) smp_call_function_single(cpu, iucv_allow_cpu, NULL, 1); - put_online_cpus(); + cpus_read_unlock(); } /** @@ -540,7 +542,7 @@ static int iucv_enable(void) size_t alloc_size; int cpu, rc; - get_online_cpus(); + cpus_read_lock(); rc = -ENOMEM; alloc_size = iucv_max_pathid * sizeof(struct iucv_path); iucv_path_table = kzalloc(alloc_size, GFP_KERNEL); @@ -553,12 +555,12 @@ static int iucv_enable(void) if (cpumask_empty(&iucv_buffer_cpumask)) /* No cpu could declare an iucv buffer. */ goto out; - put_online_cpus(); + cpus_read_unlock(); return 0; out: kfree(iucv_path_table); iucv_path_table = NULL; - put_online_cpus(); + cpus_read_unlock(); return rc; } @@ -571,11 +573,11 @@ out: */ static void iucv_disable(void) { - get_online_cpus(); + cpus_read_lock(); on_each_cpu(iucv_retrieve_cpu, NULL, 1); kfree(iucv_path_table); iucv_path_table = NULL; - put_online_cpus(); + cpus_read_unlock(); } static int iucv_cpu_dead(unsigned int cpu) @@ -784,7 +786,7 @@ static int iucv_reboot_event(struct notifier_block *this, if (cpumask_empty(&iucv_irq_cpumask)) return NOTIFY_DONE; - get_online_cpus(); + cpus_read_lock(); on_each_cpu_mask(&iucv_irq_cpumask, iucv_block_cpu, NULL, 1); preempt_disable(); for (i = 0; i < iucv_max_pathid; i++) { @@ -792,7 +794,7 @@ static int iucv_reboot_event(struct notifier_block *this, iucv_sever_pathid(i, NULL); } preempt_enable(); - put_online_cpus(); + cpus_read_unlock(); iucv_disable(); return NOTIFY_DONE; } diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 53486b162f01..93271a2632b8 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -869,8 +869,10 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb) } if (tunnel->version == L2TP_HDR_VER_3 && - l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) + l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) { + l2tp_session_dec_refcount(session); goto invalid; + } l2tp_recv_common(session, skb, ptr, optr, hdrflags, length); l2tp_session_dec_refcount(session); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index ac5cadd02cfa..3086f4a6ae68 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -224,8 +224,7 @@ static int llc_ui_release(struct socket *sock) } else { release_sock(sk); } - if (llc->dev) - dev_put(llc->dev); + dev_put(llc->dev); sock_put(sk); llc_sk_free(sk); out: @@ -363,8 +362,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) } else llc->dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd, addr->sllc_mac); - if (llc->dev) - dev_hold(llc->dev); + dev_hold(llc->dev); rcu_read_unlock(); if (!llc->dev) goto out; diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c index 647c0554d04c..40ca3c1e42a2 100644 --- a/net/llc/llc_c_ac.c +++ b/net/llc/llc_c_ac.c @@ -781,7 +781,7 @@ int llc_conn_ac_send_sabme_cmd_p_set_x(struct sock *sk, struct sk_buff *skb) if (nskb) { struct llc_sap *sap = llc->sap; - u8 *dmac = llc->daddr.mac; + const u8 *dmac = llc->daddr.mac; if (llc->dev->flags & IFF_LOOPBACK) dmac = llc->dev->dev_addr; diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c index ad6547736c21..dde9bf08a593 100644 --- a/net/llc/llc_if.c +++ b/net/llc/llc_if.c @@ -80,7 +80,7 @@ out_free: * establishment will inform to upper layer via calling it's confirm * function and passing proper information. */ -int llc_establish_connection(struct sock *sk, u8 *lmac, u8 *dmac, u8 dsap) +int llc_establish_connection(struct sock *sk, const u8 *lmac, u8 *dmac, u8 dsap) { int rc = -EISCONN; struct llc_addr laddr, daddr; diff --git a/net/llc/llc_output.c b/net/llc/llc_output.c index b9ad087bcbd7..5a6466fc626a 100644 --- a/net/llc/llc_output.c +++ b/net/llc/llc_output.c @@ -56,7 +56,7 @@ int llc_mac_hdr_init(struct sk_buff *skb, * package primitive as an event and send to SAP event handler */ int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, - unsigned char *dmac, unsigned char dsap) + const unsigned char *dmac, unsigned char dsap) { int rc; llc_pdu_header_init(skb, LLC_PDU_TYPE_U, sap->laddr.lsap, diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c index a4eccb98220a..0ff490a73fae 100644 --- a/net/llc/llc_proc.c +++ b/net/llc/llc_proc.c @@ -26,7 +26,7 @@ #include <net/llc_c_st.h> #include <net/llc_conn.h> -static void llc_ui_format_mac(struct seq_file *seq, u8 *addr) +static void llc_ui_format_mac(struct seq_file *seq, const u8 *addr) { seq_printf(seq, "%pM", addr); } diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index cce28e3b2232..470ff0ce3dc7 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -477,7 +477,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, size_t len) { u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num; - struct ieee802_11_elems elems = { }; + struct ieee802_11_elems *elems = NULL; u8 dialog_token; int ies_len; @@ -495,16 +495,18 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, ies_len = len - offsetof(struct ieee80211_mgmt, u.action.u.addba_req.variable); if (ies_len) { - ieee802_11_parse_elems(mgmt->u.action.u.addba_req.variable, - ies_len, true, &elems, mgmt->bssid, NULL); - if (elems.parse_error) - return; + elems = ieee802_11_parse_elems(mgmt->u.action.u.addba_req.variable, + ies_len, true, mgmt->bssid, NULL); + if (!elems || elems->parse_error) + goto free; } __ieee80211_start_rx_ba_session(sta, dialog_token, timeout, start_seq_num, ba_policy, tid, buf_size, true, false, - elems.addba_ext_ie); + elems ? elems->addba_ext_ie : NULL); +free: + kfree(elems); } void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 4e6f11e63df3..e2b791c37591 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -111,6 +111,36 @@ static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, return 0; } +static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata, + struct cfg80211_mbssid_config params) +{ + struct ieee80211_sub_if_data *tx_sdata; + + sdata->vif.mbssid_tx_vif = NULL; + sdata->vif.bss_conf.bssid_index = 0; + sdata->vif.bss_conf.nontransmitted = false; + sdata->vif.bss_conf.ema_ap = false; + + if (sdata->vif.type != NL80211_IFTYPE_AP || !params.tx_wdev) + return -EINVAL; + + tx_sdata = IEEE80211_WDEV_TO_SUB_IF(params.tx_wdev); + if (!tx_sdata) + return -EINVAL; + + if (tx_sdata == sdata) { + sdata->vif.mbssid_tx_vif = &sdata->vif; + } else { + sdata->vif.mbssid_tx_vif = &tx_sdata->vif; + sdata->vif.bss_conf.nontransmitted = true; + sdata->vif.bss_conf.bssid_index = params.index; + } + if (params.ema) + sdata->vif.bss_conf.ema_ap = true; + + return 0; +} + static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy, const char *name, unsigned char name_assign_type, @@ -828,9 +858,11 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy, return ret; } -static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, - const u8 *resp, size_t resp_len, - const struct ieee80211_csa_settings *csa) +static int +ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, + const u8 *resp, size_t resp_len, + const struct ieee80211_csa_settings *csa, + const struct ieee80211_color_change_settings *cca) { struct probe_resp *new, *old; @@ -850,6 +882,8 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata, memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_presp, csa->n_counter_offsets_presp * sizeof(new->cntdwn_counter_offsets[0])); + else if (cca) + new->cntdwn_counter_offsets[0] = cca->counter_offset_presp; rcu_assign_pointer(sdata->u.ap.probe_resp, new); if (old) @@ -955,7 +989,8 @@ static int ieee80211_set_ftm_responder_params( static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, struct cfg80211_beacon_data *params, - const struct ieee80211_csa_settings *csa) + const struct ieee80211_csa_settings *csa, + const struct ieee80211_color_change_settings *cca) { struct beacon_data *new, *old; int new_head_len, new_tail_len; @@ -1004,6 +1039,9 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_beacon, csa->n_counter_offsets_beacon * sizeof(new->cntdwn_counter_offsets[0])); + } else if (cca) { + new->cntdwn_current_counter = cca->count; + new->cntdwn_counter_offsets[0] = cca->counter_offset_beacon; } /* copy in head */ @@ -1020,7 +1058,7 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata, memcpy(new->tail, old->tail, new_tail_len); err = ieee80211_set_probe_resp(sdata, params->probe_resp, - params->probe_resp_len, csa); + params->probe_resp_len, csa, cca); if (err < 0) { kfree(new); return err; @@ -1097,6 +1135,14 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, changed |= BSS_CHANGED_HE_BSS_COLOR; } + if (sdata->vif.type == NL80211_IFTYPE_AP && + params->mbssid_config.tx_wdev) { + err = ieee80211_set_ap_mbssid_options(sdata, + params->mbssid_config); + if (err) + return err; + } + mutex_lock(&local->mtx); err = ieee80211_vif_use_channel(sdata, ¶ms->chandef, IEEE80211_CHANCTX_SHARED); @@ -1175,7 +1221,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) sdata->vif.bss_conf.beacon_tx_rate = params->beacon_rate; - err = ieee80211_assign_beacon(sdata, ¶ms->beacon, NULL); + err = ieee80211_assign_beacon(sdata, ¶ms->beacon, NULL, NULL); if (err < 0) goto error; changed |= err; @@ -1230,17 +1276,17 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev, sdata = IEEE80211_DEV_TO_SUB_IF(dev); sdata_assert_lock(sdata); - /* don't allow changing the beacon while CSA is in place - offset + /* don't allow changing the beacon while a countdown is in place - offset * of channel switch counter may change */ - if (sdata->vif.csa_active) + if (sdata->vif.csa_active || sdata->vif.color_change_active) return -EBUSY; old = sdata_dereference(sdata->u.ap.beacon, sdata); if (!old) return -ENOENT; - err = ieee80211_assign_beacon(sdata, params, NULL); + err = ieee80211_assign_beacon(sdata, params, NULL, NULL); if (err < 0) return err; ieee80211_bss_info_change_notify(sdata, err); @@ -3156,7 +3202,7 @@ static int ieee80211_set_after_csa_beacon(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP: err = ieee80211_assign_beacon(sdata, sdata->u.ap.next_beacon, - NULL); + NULL, NULL); kfree(sdata->u.ap.next_beacon); sdata->u.ap.next_beacon = NULL; @@ -3322,7 +3368,7 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, csa.n_counter_offsets_presp = params->n_counter_offsets_presp; csa.count = params->count; - err = ieee80211_assign_beacon(sdata, ¶ms->beacon_csa, &csa); + err = ieee80211_assign_beacon(sdata, ¶ms->beacon_csa, &csa, NULL); if (err < 0) { kfree(sdata->u.ap.next_beacon); return err; @@ -3411,6 +3457,15 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata, return 0; } +static void ieee80211_color_change_abort(struct ieee80211_sub_if_data *sdata) +{ + sdata->vif.color_change_active = false; + kfree(sdata->u.ap.next_beacon); + sdata->u.ap.next_beacon = NULL; + + cfg80211_color_change_aborted_notify(sdata->dev); +} + static int __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_csa_settings *params) @@ -3479,6 +3534,10 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, goto out; } + /* if there is a color change in progress, abort it */ + if (sdata->vif.color_change_active) + ieee80211_color_change_abort(sdata); + err = ieee80211_set_csa_beacon(sdata, params, &changed); if (err) { ieee80211_vif_unreserve_chanctx(sdata); @@ -4130,6 +4189,196 @@ static int ieee80211_set_sar_specs(struct wiphy *wiphy, return local->ops->set_sar_specs(&local->hw, sar); } +static int +ieee80211_set_after_color_change_beacon(struct ieee80211_sub_if_data *sdata, + u32 *changed) +{ + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP: { + int ret; + + ret = ieee80211_assign_beacon(sdata, sdata->u.ap.next_beacon, + NULL, NULL); + kfree(sdata->u.ap.next_beacon); + sdata->u.ap.next_beacon = NULL; + + if (ret < 0) + return ret; + + *changed |= ret; + break; + } + default: + WARN_ON_ONCE(1); + return -EINVAL; + } + + return 0; +} + +static int +ieee80211_set_color_change_beacon(struct ieee80211_sub_if_data *sdata, + struct cfg80211_color_change_settings *params, + u32 *changed) +{ + struct ieee80211_color_change_settings color_change = {}; + int err; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP: + sdata->u.ap.next_beacon = + cfg80211_beacon_dup(¶ms->beacon_next); + if (!sdata->u.ap.next_beacon) + return -ENOMEM; + + if (params->count <= 1) + break; + + color_change.counter_offset_beacon = + params->counter_offset_beacon; + color_change.counter_offset_presp = + params->counter_offset_presp; + color_change.count = params->count; + + err = ieee80211_assign_beacon(sdata, ¶ms->beacon_color_change, + NULL, &color_change); + if (err < 0) { + kfree(sdata->u.ap.next_beacon); + return err; + } + *changed |= err; + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static void +ieee80211_color_change_bss_config_notify(struct ieee80211_sub_if_data *sdata, + u8 color, int enable, u32 changed) +{ + sdata->vif.bss_conf.he_bss_color.color = color; + sdata->vif.bss_conf.he_bss_color.enabled = enable; + changed |= BSS_CHANGED_HE_BSS_COLOR; + + ieee80211_bss_info_change_notify(sdata, changed); +} + +static int ieee80211_color_change_finalize(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + u32 changed = 0; + int err; + + sdata_assert_lock(sdata); + lockdep_assert_held(&local->mtx); + + sdata->vif.color_change_active = false; + + err = ieee80211_set_after_color_change_beacon(sdata, &changed); + if (err) { + cfg80211_color_change_aborted_notify(sdata->dev); + return err; + } + + ieee80211_color_change_bss_config_notify(sdata, + sdata->vif.color_change_color, + 1, changed); + cfg80211_color_change_notify(sdata->dev); + + return 0; +} + +void ieee80211_color_change_finalize_work(struct work_struct *work) +{ + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, + color_change_finalize_work); + struct ieee80211_local *local = sdata->local; + + sdata_lock(sdata); + mutex_lock(&local->mtx); + + /* AP might have been stopped while waiting for the lock. */ + if (!sdata->vif.color_change_active) + goto unlock; + + if (!ieee80211_sdata_running(sdata)) + goto unlock; + + ieee80211_color_change_finalize(sdata); + +unlock: + mutex_unlock(&local->mtx); + sdata_unlock(sdata); +} + +void ieee80211_color_change_finish(struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + ieee80211_queue_work(&sdata->local->hw, + &sdata->color_change_finalize_work); +} +EXPORT_SYMBOL_GPL(ieee80211_color_change_finish); + +void +ieeee80211_obss_color_collision_notify(struct ieee80211_vif *vif, + u64 color_bitmap) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + if (sdata->vif.color_change_active || sdata->vif.csa_active) + return; + + cfg80211_obss_color_collision_notify(sdata->dev, color_bitmap); +} +EXPORT_SYMBOL_GPL(ieeee80211_obss_color_collision_notify); + +static int +ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev, + struct cfg80211_color_change_settings *params) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + u32 changed = 0; + int err; + + sdata_assert_lock(sdata); + + mutex_lock(&local->mtx); + + /* don't allow another color change if one is already active or if csa + * is active + */ + if (sdata->vif.color_change_active || sdata->vif.csa_active) { + err = -EBUSY; + goto out; + } + + err = ieee80211_set_color_change_beacon(sdata, params, &changed); + if (err) + goto out; + + sdata->vif.color_change_active = true; + sdata->vif.color_change_color = params->color; + + cfg80211_color_change_started_notify(sdata->dev, params->count); + + if (changed) + ieee80211_color_change_bss_config_notify(sdata, 0, 0, changed); + else + /* if the beacon didn't change, we can finalize immediately */ + ieee80211_color_change_finalize(sdata); + +out: + mutex_unlock(&local->mtx); + + return err; +} + const struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -4233,4 +4482,5 @@ const struct cfg80211_ops mac80211_config_ops = { .set_tid_config = ieee80211_set_tid_config, .reset_tid_config = ieee80211_reset_tid_config, .set_sar_specs = ieee80211_set_sar_specs, + .color_change = ieee80211_color_change, }; diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 8be28cfd6f64..481f01b0f65c 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -5,7 +5,7 @@ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright (C) 2018 - 2020 Intel Corporation + * Copyright (C) 2018 - 2021 Intel Corporation */ #include <linux/debugfs.h> @@ -153,20 +153,20 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, rcu_read_lock(); p += scnprintf(p, - bufsz+buf-p, + bufsz + buf - p, "target %uus interval %uus ecn %s\n", codel_time_to_us(sta->cparams.target), codel_time_to_us(sta->cparams.interval), sta->cparams.ecn ? "yes" : "no"); p += scnprintf(p, - bufsz+buf-p, + bufsz + buf - p, "tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets flags\n"); for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { if (!sta->sta.txq[i]) continue; txqi = to_txq_info(sta->sta.txq[i]); - p += scnprintf(p, bufsz+buf-p, + p += scnprintf(p, bufsz + buf - p, "%d %d %u %u %u %u %u %u %u %u %u 0x%lx(%s%s%s)\n", txqi->txq.tid, txqi->txq.ac, @@ -314,17 +314,24 @@ STA_OPS_RW(aql); static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { - char buf[71 + IEEE80211_NUM_TIDS * 40], *p = buf; + char *buf, *p; + ssize_t bufsz = 71 + IEEE80211_NUM_TIDS * 40; int i; struct sta_info *sta = file->private_data; struct tid_ampdu_rx *tid_rx; struct tid_ampdu_tx *tid_tx; + ssize_t ret; + + buf = kzalloc(bufsz, GFP_KERNEL); + if (!buf) + return -ENOMEM; + p = buf; rcu_read_lock(); - p += scnprintf(p, sizeof(buf) + buf - p, "next dialog_token: %#02x\n", + p += scnprintf(p, bufsz + buf - p, "next dialog_token: %#02x\n", sta->ampdu_mlme.dialog_token_allocator + 1); - p += scnprintf(p, sizeof(buf) + buf - p, + p += scnprintf(p, bufsz + buf - p, "TID\t\tRX\tDTKN\tSSN\t\tTX\tDTKN\tpending\n"); for (i = 0; i < IEEE80211_NUM_TIDS; i++) { @@ -334,25 +341,27 @@ static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf, tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[i]); tid_rx_valid = test_bit(i, sta->ampdu_mlme.agg_session_valid); - p += scnprintf(p, sizeof(buf) + buf - p, "%02d", i); - p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x", + p += scnprintf(p, bufsz + buf - p, "%02d", i); + p += scnprintf(p, bufsz + buf - p, "\t\t%x", tid_rx_valid); - p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.2x", + p += scnprintf(p, bufsz + buf - p, "\t%#.2x", tid_rx_valid ? sta->ampdu_mlme.tid_rx_token[i] : 0); - p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.3x", + p += scnprintf(p, bufsz + buf - p, "\t%#.3x", tid_rx ? tid_rx->ssn : 0); - p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x", !!tid_tx); - p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.2x", + p += scnprintf(p, bufsz + buf - p, "\t\t%x", !!tid_tx); + p += scnprintf(p, bufsz + buf - p, "\t%#.2x", tid_tx ? tid_tx->dialog_token : 0); - p += scnprintf(p, sizeof(buf) + buf - p, "\t%03d", + p += scnprintf(p, bufsz + buf - p, "\t%03d", tid_tx ? skb_queue_len(&tid_tx->pending) : 0); - p += scnprintf(p, sizeof(buf) + buf - p, "\n"); + p += scnprintf(p, bufsz + buf - p, "\n"); } rcu_read_unlock(); - return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); + ret = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); + kfree(buf); + return ret; } static ssize_t sta_agg_status_write(struct file *file, const char __user *userbuf, @@ -434,15 +443,22 @@ static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf, if (_cond) \ p += scnprintf(p, sizeof(buf)+buf-p, "\t" _str "\n"); \ } while (0) - char buf[512], *p = buf; + char *buf, *p; int i; + ssize_t bufsz = 512; struct sta_info *sta = file->private_data; struct ieee80211_sta_ht_cap *htc = &sta->sta.ht_cap; + ssize_t ret; - p += scnprintf(p, sizeof(buf) + buf - p, "ht %ssupported\n", + buf = kzalloc(bufsz, GFP_KERNEL); + if (!buf) + return -ENOMEM; + p = buf; + + p += scnprintf(p, bufsz + buf - p, "ht %ssupported\n", htc->ht_supported ? "" : "not "); if (htc->ht_supported) { - p += scnprintf(p, sizeof(buf)+buf-p, "cap: %#.4x\n", htc->cap); + p += scnprintf(p, bufsz + buf - p, "cap: %#.4x\n", htc->cap); PRINT_HT_CAP((htc->cap & BIT(0)), "RX LDPC"); PRINT_HT_CAP((htc->cap & BIT(1)), "HT20/HT40"); @@ -484,81 +500,90 @@ static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf, PRINT_HT_CAP((htc->cap & BIT(15)), "L-SIG TXOP protection"); - p += scnprintf(p, sizeof(buf)+buf-p, "ampdu factor/density: %d/%d\n", + p += scnprintf(p, bufsz + buf - p, "ampdu factor/density: %d/%d\n", htc->ampdu_factor, htc->ampdu_density); - p += scnprintf(p, sizeof(buf)+buf-p, "MCS mask:"); + p += scnprintf(p, bufsz + buf - p, "MCS mask:"); for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) - p += scnprintf(p, sizeof(buf)+buf-p, " %.2x", + p += scnprintf(p, bufsz + buf - p, " %.2x", htc->mcs.rx_mask[i]); - p += scnprintf(p, sizeof(buf)+buf-p, "\n"); + p += scnprintf(p, bufsz + buf - p, "\n"); /* If not set this is meaningless */ if (le16_to_cpu(htc->mcs.rx_highest)) { - p += scnprintf(p, sizeof(buf)+buf-p, + p += scnprintf(p, bufsz + buf - p, "MCS rx highest: %d Mbps\n", le16_to_cpu(htc->mcs.rx_highest)); } - p += scnprintf(p, sizeof(buf)+buf-p, "MCS tx params: %x\n", + p += scnprintf(p, bufsz + buf - p, "MCS tx params: %x\n", htc->mcs.tx_params); } - return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); + ret = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); + kfree(buf); + return ret; } STA_OPS(ht_capa); static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { - char buf[512], *p = buf; + char *buf, *p; struct sta_info *sta = file->private_data; struct ieee80211_sta_vht_cap *vhtc = &sta->sta.vht_cap; + ssize_t ret; + ssize_t bufsz = 512; - p += scnprintf(p, sizeof(buf) + buf - p, "VHT %ssupported\n", + buf = kzalloc(bufsz, GFP_KERNEL); + if (!buf) + return -ENOMEM; + p = buf; + + p += scnprintf(p, bufsz + buf - p, "VHT %ssupported\n", vhtc->vht_supported ? "" : "not "); if (vhtc->vht_supported) { - p += scnprintf(p, sizeof(buf) + buf - p, "cap: %#.8x\n", + p += scnprintf(p, bufsz + buf - p, "cap: %#.8x\n", vhtc->cap); #define PFLAG(a, b) \ do { \ if (vhtc->cap & IEEE80211_VHT_CAP_ ## a) \ - p += scnprintf(p, sizeof(buf) + buf - p, \ + p += scnprintf(p, bufsz + buf - p, \ "\t\t%s\n", b); \ } while (0) switch (vhtc->cap & 0x3) { case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895: - p += scnprintf(p, sizeof(buf) + buf - p, + p += scnprintf(p, bufsz + buf - p, "\t\tMAX-MPDU-3895\n"); break; case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991: - p += scnprintf(p, sizeof(buf) + buf - p, + p += scnprintf(p, bufsz + buf - p, "\t\tMAX-MPDU-7991\n"); break; case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: - p += scnprintf(p, sizeof(buf) + buf - p, + p += scnprintf(p, bufsz + buf - p, "\t\tMAX-MPDU-11454\n"); break; default: - p += scnprintf(p, sizeof(buf) + buf - p, + p += scnprintf(p, bufsz + buf - p, "\t\tMAX-MPDU-UNKNOWN\n"); } switch (vhtc->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case 0: - p += scnprintf(p, sizeof(buf) + buf - p, + p += scnprintf(p, bufsz + buf - p, "\t\t80Mhz\n"); break; case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: - p += scnprintf(p, sizeof(buf) + buf - p, + p += scnprintf(p, bufsz + buf - p, "\t\t160Mhz\n"); break; case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: - p += scnprintf(p, sizeof(buf) + buf - p, + p += scnprintf(p, bufsz + buf - p, "\t\t80+80Mhz\n"); break; default: - p += scnprintf(p, sizeof(buf) + buf - p, + p += scnprintf(p, bufsz + buf - p, "\t\tUNKNOWN-MHZ: 0x%x\n", (vhtc->cap >> 2) & 0x3); } @@ -566,15 +591,15 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, PFLAG(SHORT_GI_80, "SHORT-GI-80"); PFLAG(SHORT_GI_160, "SHORT-GI-160"); PFLAG(TXSTBC, "TXSTBC"); - p += scnprintf(p, sizeof(buf) + buf - p, + p += scnprintf(p, bufsz + buf - p, "\t\tRXSTBC_%d\n", (vhtc->cap >> 8) & 0x7); PFLAG(SU_BEAMFORMER_CAPABLE, "SU-BEAMFORMER-CAPABLE"); PFLAG(SU_BEAMFORMEE_CAPABLE, "SU-BEAMFORMEE-CAPABLE"); - p += scnprintf(p, sizeof(buf) + buf - p, + p += scnprintf(p, bufsz + buf - p, "\t\tBEAMFORMEE-STS: 0x%x\n", (vhtc->cap & IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK) >> IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT); - p += scnprintf(p, sizeof(buf) + buf - p, + p += scnprintf(p, bufsz + buf - p, "\t\tSOUNDING-DIMENSIONS: 0x%x\n", (vhtc->cap & IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK) >> IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT); @@ -582,34 +607,36 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, PFLAG(MU_BEAMFORMEE_CAPABLE, "MU-BEAMFORMEE-CAPABLE"); PFLAG(VHT_TXOP_PS, "TXOP-PS"); PFLAG(HTC_VHT, "HTC-VHT"); - p += scnprintf(p, sizeof(buf) + buf - p, + p += scnprintf(p, bufsz + buf - p, "\t\tMPDU-LENGTH-EXPONENT: 0x%x\n", (vhtc->cap & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK) >> IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT); PFLAG(VHT_LINK_ADAPTATION_VHT_UNSOL_MFB, "LINK-ADAPTATION-VHT-UNSOL-MFB"); - p += scnprintf(p, sizeof(buf) + buf - p, + p += scnprintf(p, bufsz + buf - p, "\t\tLINK-ADAPTATION-VHT-MRQ-MFB: 0x%x\n", (vhtc->cap & IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB) >> 26); PFLAG(RX_ANTENNA_PATTERN, "RX-ANTENNA-PATTERN"); PFLAG(TX_ANTENNA_PATTERN, "TX-ANTENNA-PATTERN"); - p += scnprintf(p, sizeof(buf)+buf-p, "RX MCS: %.4x\n", + p += scnprintf(p, bufsz + buf - p, "RX MCS: %.4x\n", le16_to_cpu(vhtc->vht_mcs.rx_mcs_map)); if (vhtc->vht_mcs.rx_highest) - p += scnprintf(p, sizeof(buf)+buf-p, + p += scnprintf(p, bufsz + buf - p, "MCS RX highest: %d Mbps\n", le16_to_cpu(vhtc->vht_mcs.rx_highest)); - p += scnprintf(p, sizeof(buf)+buf-p, "TX MCS: %.4x\n", + p += scnprintf(p, bufsz + buf - p, "TX MCS: %.4x\n", le16_to_cpu(vhtc->vht_mcs.tx_mcs_map)); if (vhtc->vht_mcs.tx_highest) - p += scnprintf(p, sizeof(buf)+buf-p, + p += scnprintf(p, bufsz + buf - p, "MCS TX highest: %d Mbps\n", le16_to_cpu(vhtc->vht_mcs.tx_highest)); #undef PFLAG } - return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); + ret = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); + kfree(buf); + return ret; } STA_OPS(vht_capa); diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index bcb7cc06db3d..cd3731cbf6c6 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1447,4 +1447,40 @@ static inline void drv_sta_set_decap_offload(struct ieee80211_local *local, trace_drv_return_void(local); } +static inline void drv_add_twt_setup(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, + struct ieee80211_twt_setup *twt) +{ + struct ieee80211_twt_params *twt_agrt; + + might_sleep(); + + if (!check_sdata_in_driver(sdata)) + return; + + twt_agrt = (void *)twt->params; + + trace_drv_add_twt_setup(local, sta, twt, twt_agrt); + local->ops->add_twt_setup(&local->hw, sta, twt); + trace_drv_return_void(local); +} + +static inline void drv_twt_teardown_request(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, + u8 flowid) +{ + might_sleep(); + if (!check_sdata_in_driver(sdata)) + return; + + if (!local->ops->twt_teardown_request) + return; + + trace_drv_twt_teardown_request(local, sta, flowid); + local->ops->twt_teardown_request(&local->hw, sta, flowid); + trace_drv_return_void(local); +} + #endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/fils_aead.c b/net/mac80211/fils_aead.c index a13ae148937e..e1d4cfd99128 100644 --- a/net/mac80211/fils_aead.c +++ b/net/mac80211/fils_aead.c @@ -219,7 +219,8 @@ int fils_encrypt_assoc_req(struct sk_buff *skb, { struct ieee80211_mgmt *mgmt = (void *)skb->data; u8 *capab, *ies, *encr; - const u8 *addr[5 + 1], *session; + const u8 *addr[5 + 1]; + const struct element *session; size_t len[5 + 1]; size_t crypt_len; @@ -231,12 +232,12 @@ int fils_encrypt_assoc_req(struct sk_buff *skb, ies = mgmt->u.assoc_req.variable; } - session = cfg80211_find_ext_ie(WLAN_EID_EXT_FILS_SESSION, - ies, skb->data + skb->len - ies); - if (!session || session[1] != 1 + 8) + session = cfg80211_find_ext_elem(WLAN_EID_EXT_FILS_SESSION, + ies, skb->data + skb->len - ies); + if (!session || session->datalen != 1 + 8) return -EINVAL; /* encrypt after FILS Session element */ - encr = (u8 *)session + 2 + 1 + 8; + encr = (u8 *)session->data + 1 + 8; /* AES-SIV AAD vectors */ @@ -270,7 +271,8 @@ int fils_decrypt_assoc_resp(struct ieee80211_sub_if_data *sdata, { struct ieee80211_mgmt *mgmt = (void *)frame; u8 *capab, *ies, *encr; - const u8 *addr[5 + 1], *session; + const u8 *addr[5 + 1]; + const struct element *session; size_t len[5 + 1]; int res; size_t crypt_len; @@ -280,16 +282,16 @@ int fils_decrypt_assoc_resp(struct ieee80211_sub_if_data *sdata, capab = (u8 *)&mgmt->u.assoc_resp.capab_info; ies = mgmt->u.assoc_resp.variable; - session = cfg80211_find_ext_ie(WLAN_EID_EXT_FILS_SESSION, - ies, frame + *frame_len - ies); - if (!session || session[1] != 1 + 8) { + session = cfg80211_find_ext_elem(WLAN_EID_EXT_FILS_SESSION, + ies, frame + *frame_len - ies); + if (!session || session->datalen != 1 + 8) { mlme_dbg(sdata, "No (valid) FILS Session element in (Re)Association Response frame from %pM", mgmt->sa); return -EINVAL; } /* decrypt after FILS Session element */ - encr = (u8 *)session + 2 + 1 + 8; + encr = (u8 *)session->data + 1 + 8; /* AES-SIV AAD vectors */ diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index a7ac53a2f00d..0416c4d22292 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -9,7 +9,7 @@ * Copyright 2009, Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright(c) 2018-2020 Intel Corporation + * Copyright(c) 2018-2021 Intel Corporation */ #include <linux/delay.h> @@ -489,7 +489,6 @@ int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, const struct cfg80211_bss_ies *ies; u16 capability = WLAN_CAPABILITY_IBSS; u64 tsf; - int ret = 0; sdata_assert_lock(sdata); @@ -501,10 +500,8 @@ int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, ifibss->ssid_len, IEEE80211_BSS_TYPE_IBSS, IEEE80211_PRIVACY(ifibss->privacy)); - if (WARN_ON(!cbss)) { - ret = -EINVAL; - goto out; - } + if (WARN_ON(!cbss)) + return -EINVAL; rcu_read_lock(); ies = rcu_dereference(cbss->ies); @@ -520,18 +517,14 @@ int ieee80211_ibss_csa_beacon(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.basic_rates, capability, tsf, &ifibss->chandef, NULL, csa_settings); - if (!presp) { - ret = -ENOMEM; - goto out; - } + if (!presp) + return -ENOMEM; rcu_assign_pointer(ifibss->presp, presp); if (old_presp) kfree_rcu(old_presp, rcu_head); return BSS_CHANGED_BEACON; - out: - return ret; } int ieee80211_ibss_finish_csa(struct ieee80211_sub_if_data *sdata) @@ -1596,7 +1589,7 @@ void ieee80211_rx_mgmt_probe_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_rx_status *rx_status) { size_t baselen; - struct ieee802_11_elems elems; + struct ieee802_11_elems *elems; BUILD_BUG_ON(offsetof(typeof(mgmt->u.probe_resp), variable) != offsetof(typeof(mgmt->u.beacon), variable)); @@ -1609,10 +1602,14 @@ void ieee80211_rx_mgmt_probe_beacon(struct ieee80211_sub_if_data *sdata, if (baselen > len) return; - ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen, - false, &elems, mgmt->bssid, NULL); + elems = ieee802_11_parse_elems(mgmt->u.probe_resp.variable, + len - baselen, false, + mgmt->bssid, NULL); - ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems); + if (elems) { + ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, elems); + kfree(elems); + } } void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, @@ -1621,7 +1618,7 @@ void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct ieee80211_rx_status *rx_status; struct ieee80211_mgmt *mgmt; u16 fc; - struct ieee802_11_elems elems; + struct ieee802_11_elems *elems; int ies_len; rx_status = IEEE80211_SKB_RXCB(skb); @@ -1658,15 +1655,16 @@ void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, if (ies_len < 0) break; - ieee802_11_parse_elems( + elems = ieee802_11_parse_elems( mgmt->u.action.u.chan_switch.variable, - ies_len, true, &elems, mgmt->bssid, NULL); - - if (elems.parse_error) - break; - - ieee80211_rx_mgmt_spectrum_mgmt(sdata, mgmt, skb->len, - rx_status, &elems); + ies_len, true, mgmt->bssid, NULL); + + if (elems && !elems->parse_error) + ieee80211_rx_mgmt_spectrum_mgmt(sdata, mgmt, + skb->len, + rx_status, + elems); + kfree(elems); break; } } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 30ce6d2ec7ce..5666bbb8860b 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -25,6 +25,7 @@ #include <linux/leds.h> #include <linux/idr.h> #include <linux/rhashtable.h> +#include <linux/rbtree.h> #include <net/ieee80211_radiotap.h> #include <net/cfg80211.h> #include <net/mac80211.h> @@ -244,6 +245,12 @@ struct ieee80211_csa_settings { u8 count; }; +struct ieee80211_color_change_settings { + u16 counter_offset_beacon; + u16 counter_offset_presp; + u8 count; +}; + struct beacon_data { u8 *head, *tail; int head_len, tail_len; @@ -624,10 +631,9 @@ struct ieee80211_if_ocb { */ struct ieee802_11_elems; struct ieee80211_mesh_sync_ops { - void (*rx_bcn_presp)(struct ieee80211_sub_if_data *sdata, - u16 stype, - struct ieee80211_mgmt *mgmt, - struct ieee802_11_elems *elems, + void (*rx_bcn_presp)(struct ieee80211_sub_if_data *sdata, u16 stype, + struct ieee80211_mgmt *mgmt, unsigned int len, + const struct ieee80211_meshconf_ie *mesh_cfg, struct ieee80211_rx_status *rx_status); /* should be called with beacon_data under RCU read lock */ @@ -923,6 +929,8 @@ struct ieee80211_sub_if_data { bool csa_block_tx; /* write-protected by sdata_lock and local->mtx */ struct cfg80211_chan_def csa_chandef; + struct work_struct color_change_finalize_work; + struct list_head assigned_chanctx_list; /* protected by chanctx_mtx */ struct list_head reserved_chanctx_list; /* protected by chanctx_mtx */ @@ -937,6 +945,7 @@ struct ieee80211_sub_if_data { struct work_struct work; struct sk_buff_head skb_queue; + struct sk_buff_head status_queue; u8 needed_rx_chains; enum ieee80211_smps_mode smps_mode; @@ -1232,6 +1241,9 @@ struct ieee80211_local { */ bool suspended; + /* suspending is true during the whole suspend process */ + bool suspending; + /* * Resuming is true while suspended, but when we're reprogramming the * hardware -- at that time it's allowed to use ieee80211_queue_work() @@ -1498,6 +1510,7 @@ struct ieee80211_csa_ie { struct ieee802_11_elems { const u8 *ie_start; size_t total_len; + u32 crc; /* pointers to IEs */ const struct ieee80211_tdls_lnkie *lnk_id; @@ -1507,7 +1520,6 @@ struct ieee802_11_elems { const u8 *supp_rates; const u8 *ds_params; const struct ieee80211_tim_ie *tim; - const u8 *challenge; const u8 *rsn; const u8 *rsnx; const u8 *erp_info; @@ -1524,6 +1536,7 @@ struct ieee802_11_elems { const struct ieee80211_he_spr *he_spr; const struct ieee80211_mu_edca_param_set *mu_edca_param_set; const struct ieee80211_he_6ghz_capa *he_6ghz_capa; + const struct ieee80211_tx_pwr_env *tx_pwr_env[IEEE80211_TPE_MAX_IE_COUNT]; const u8 *uora_element; const u8 *mesh_id; const u8 *peering; @@ -1560,7 +1573,6 @@ struct ieee802_11_elems { u8 ssid_len; u8 supp_rates_len; u8 tim_len; - u8 challenge_len; u8 rsn_len; u8 rsnx_len; u8 ext_supp_rates_len; @@ -1574,6 +1586,8 @@ struct ieee802_11_elems { u8 perr_len; u8 country_elem_len; u8 bssid_index_len; + u8 tx_pwr_env_len[IEEE80211_TPE_MAX_IE_COUNT]; + u8 tx_pwr_env_num; /* whether a parse error occurred while retrieving these elements */ bool parse_error; @@ -1887,6 +1901,9 @@ void ieee80211_csa_finalize_work(struct work_struct *work); int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_csa_settings *params); +/* color change handling */ +void ieee80211_color_change_finalize_work(struct work_struct *work); + /* interface handling */ #define MAC80211_SUPPORTED_FEATURES_TX (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | \ NETIF_F_HW_CSUM | NETIF_F_SG | \ @@ -2068,6 +2085,11 @@ ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif, /* S1G */ void ieee80211_s1g_sta_rate_init(struct sta_info *sta); +bool ieee80211_s1g_is_twt_setup(struct sk_buff *skb); +void ieee80211_s1g_rx_twt_action(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); +void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, @@ -2173,18 +2195,18 @@ static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, ieee80211_tx_skb_tid(sdata, skb, 7); } -u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, - struct ieee802_11_elems *elems, - u64 filter, u32 crc, u8 *transmitter_bssid, - u8 *bss_bssid); -static inline void ieee802_11_parse_elems(const u8 *start, size_t len, - bool action, - struct ieee802_11_elems *elems, - u8 *transmitter_bssid, - u8 *bss_bssid) +struct ieee802_11_elems *ieee802_11_parse_elems_crc(const u8 *start, size_t len, + bool action, + u64 filter, u32 crc, + const u8 *transmitter_bssid, + const u8 *bss_bssid); +static inline struct ieee802_11_elems * +ieee802_11_parse_elems(const u8 *start, size_t len, bool action, + const u8 *transmitter_bssid, + const u8 *bss_bssid) { - ieee802_11_parse_elems_crc(start, len, action, elems, 0, 0, - transmitter_bssid, bss_bssid); + return ieee802_11_parse_elems_crc(start, len, action, 0, 0, + transmitter_bssid, bss_bssid); } diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 1e5e9fc45523..9a2145c8192b 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -462,6 +462,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do sdata_unlock(sdata); cancel_work_sync(&sdata->csa_finalize_work); + cancel_work_sync(&sdata->color_change_finalize_work); cancel_delayed_work_sync(&sdata->dfs_cac_timer_work); @@ -551,6 +552,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do */ ieee80211_free_keys(sdata, true); skb_queue_purge(&sdata->skb_queue); + skb_queue_purge(&sdata->status_queue); } spin_lock_irqsave(&local->queue_stop_reason_lock, flags); @@ -630,17 +632,46 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do ieee80211_add_virtual_monitor(local); } +static void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_sub_if_data *tx_sdata, *non_tx_sdata, *tmp_sdata; + struct ieee80211_vif *tx_vif = sdata->vif.mbssid_tx_vif; + + if (!tx_vif) + return; + + tx_sdata = vif_to_sdata(tx_vif); + sdata->vif.mbssid_tx_vif = NULL; + + list_for_each_entry_safe(non_tx_sdata, tmp_sdata, + &tx_sdata->local->interfaces, list) { + if (non_tx_sdata != sdata && non_tx_sdata != tx_sdata && + non_tx_sdata->vif.mbssid_tx_vif == tx_vif && + ieee80211_sdata_running(non_tx_sdata)) { + non_tx_sdata->vif.mbssid_tx_vif = NULL; + dev_close(non_tx_sdata->wdev.netdev); + } + } + + if (sdata != tx_sdata && ieee80211_sdata_running(tx_sdata)) { + tx_sdata->vif.mbssid_tx_vif = NULL; + dev_close(tx_sdata->wdev.netdev); + } +} + static int ieee80211_stop(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - /* close all dependent VLAN interfaces before locking wiphy */ + /* close dependent VLAN and MBSSID interfaces before locking wiphy */ if (sdata->vif.type == NL80211_IFTYPE_AP) { struct ieee80211_sub_if_data *vlan, *tmpsdata; list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans, u.vlan.list) dev_close(vlan->dev); + + ieee80211_stop_mbssid(sdata); } wiphy_lock(sdata->local->hw.wiphy); @@ -983,6 +1014,7 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) } skb_queue_head_init(&sdata->skb_queue); + skb_queue_head_init(&sdata->status_queue); INIT_WORK(&sdata->work, ieee80211_iface_work); return 0; @@ -1105,9 +1137,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) * this interface, if it has the special null one. */ if (dev && is_zero_ether_addr(dev->dev_addr)) { - memcpy(dev->dev_addr, - local->hw.wiphy->perm_addr, - ETH_ALEN); + eth_hw_addr_set(dev, local->hw.wiphy->perm_addr); memcpy(dev->perm_addr, dev->dev_addr, ETH_ALEN); if (!is_valid_ether_addr(dev->dev_addr)) { @@ -1381,6 +1411,16 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, WARN_ON(1); break; } + } else if (ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category == WLAN_CATEGORY_S1G) { + switch (mgmt->u.action.u.s1g.action_code) { + case WLAN_S1G_TWT_TEARDOWN: + case WLAN_S1G_TWT_SETUP: + ieee80211_s1g_rx_twt_action(sdata, skb); + break; + default: + break; + } } else if (ieee80211_is_ext(mgmt->frame_control)) { if (sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_sta_rx_queued_ext(sdata, skb); @@ -1436,6 +1476,24 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, } } +static void ieee80211_iface_process_status(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (void *)skb->data; + + if (ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category == WLAN_CATEGORY_S1G) { + switch (mgmt->u.action.u.s1g.action_code) { + case WLAN_S1G_TWT_TEARDOWN: + case WLAN_S1G_TWT_SETUP: + ieee80211_s1g_status_twt_action(sdata, skb); + break; + default: + break; + } + } +} + static void ieee80211_iface_work(struct work_struct *work) { struct ieee80211_sub_if_data *sdata = @@ -1465,6 +1523,16 @@ static void ieee80211_iface_work(struct work_struct *work) kcov_remote_stop(); } + /* process status queue */ + while ((skb = skb_dequeue(&sdata->status_queue))) { + kcov_remote_start_common(skb_get_kcov_handle(skb)); + + ieee80211_iface_process_status(sdata, skb); + kfree_skb(skb); + + kcov_remote_stop(); + } + /* then other type-dependent work */ switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: @@ -1528,9 +1596,11 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, } skb_queue_head_init(&sdata->skb_queue); + skb_queue_head_init(&sdata->status_queue); INIT_WORK(&sdata->work, ieee80211_iface_work); INIT_WORK(&sdata->recalc_smps, ieee80211_recalc_smps_work); INIT_WORK(&sdata->csa_finalize_work, ieee80211_csa_finalize_work); + INIT_WORK(&sdata->color_change_finalize_work, ieee80211_color_change_finalize_work); INIT_LIST_HEAD(&sdata->assigned_chanctx_list); INIT_LIST_HEAD(&sdata->reserved_chanctx_list); @@ -1921,9 +1991,9 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, ieee80211_assign_perm_addr(local, ndev->perm_addr, type); if (is_valid_ether_addr(params->macaddr)) - memcpy(ndev->dev_addr, params->macaddr, ETH_ALEN); + eth_hw_addr_set(ndev, params->macaddr); else - memcpy(ndev->dev_addr, ndev->perm_addr, ETH_ALEN); + eth_hw_addr_set(ndev, ndev->perm_addr); SET_NETDEV_DEV(ndev, wiphy_dev(local->hw.wiphy)); /* don't use IEEE80211_DEV_TO_SUB_IF -- it checks too much */ @@ -2001,9 +2071,16 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, netdev_set_default_ethtool_ops(ndev, &ieee80211_ethtool_ops); - /* MTU range: 256 - 2304 */ + /* MTU range is normally 256 - 2304, where the upper limit is + * the maximum MSDU size. Monitor interfaces send and receive + * MPDU and A-MSDU frames which may be much larger so we do + * not impose an upper limit in that case. + */ ndev->min_mtu = 256; - ndev->max_mtu = local->hw.max_mtu; + if (type == NL80211_IFTYPE_MONITOR) + ndev->max_mtu = 0; + else + ndev->max_mtu = local->hw.max_mtu; ret = cfg80211_register_netdevice(ndev); if (ret) { diff --git a/net/mac80211/main.c b/net/mac80211/main.c index fcae76ddd586..45fb517591ee 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1020,7 +1020,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) iftd = &sband->iftype_data[i]; - supp_he = supp_he || (iftd && iftd->he_cap.has_he); + supp_he = supp_he || iftd->he_cap.has_he; } /* HT, VHT, HE require QoS, thus >= 4 queues */ diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 97095b7c9c64..15ac08d111ea 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -672,7 +672,7 @@ ieee80211_mesh_update_bss_params(struct ieee80211_sub_if_data *sdata, u8 *ie, u8 ie_len) { struct ieee80211_supported_band *sband; - const u8 *cap; + const struct element *cap; const struct ieee80211_he_operation *he_oper = NULL; sband = ieee80211_get_sband(sdata); @@ -687,9 +687,10 @@ ieee80211_mesh_update_bss_params(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.he_support = true; - cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION, ie, ie_len); - if (cap && cap[1] >= ieee80211_he_oper_size(&cap[3])) - he_oper = (void *)(cap + 3); + cap = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, ie, ie_len); + if (cap && cap->datalen >= 1 + sizeof(*he_oper) && + cap->datalen >= 1 + ieee80211_he_oper_size(cap->data + 1)) + he_oper = (void *)(cap->data + 1); if (he_oper) sdata->vif.bss_conf.he_oper.params = @@ -1246,7 +1247,7 @@ ieee80211_mesh_rx_probe_req(struct ieee80211_sub_if_data *sdata, struct sk_buff *presp; struct beacon_data *bcn; struct ieee80211_mgmt *hdr; - struct ieee802_11_elems elems; + struct ieee802_11_elems *elems; size_t baselen; u8 *pos; @@ -1255,22 +1256,24 @@ ieee80211_mesh_rx_probe_req(struct ieee80211_sub_if_data *sdata, if (baselen > len) return; - ieee802_11_parse_elems(pos, len - baselen, false, &elems, mgmt->bssid, - NULL); - - if (!elems.mesh_id) + elems = ieee802_11_parse_elems(pos, len - baselen, false, mgmt->bssid, + NULL); + if (!elems) return; + if (!elems->mesh_id) + goto free; + /* 802.11-2012 10.1.4.3.2 */ if ((!ether_addr_equal(mgmt->da, sdata->vif.addr) && !is_broadcast_ether_addr(mgmt->da)) || - elems.ssid_len != 0) - return; + elems->ssid_len != 0) + goto free; - if (elems.mesh_id_len != 0 && - (elems.mesh_id_len != ifmsh->mesh_id_len || - memcmp(elems.mesh_id, ifmsh->mesh_id, ifmsh->mesh_id_len))) - return; + if (elems->mesh_id_len != 0 && + (elems->mesh_id_len != ifmsh->mesh_id_len || + memcmp(elems->mesh_id, ifmsh->mesh_id, ifmsh->mesh_id_len))) + goto free; rcu_read_lock(); bcn = rcu_dereference(ifmsh->beacon); @@ -1294,6 +1297,8 @@ ieee80211_mesh_rx_probe_req(struct ieee80211_sub_if_data *sdata, ieee80211_tx_skb(sdata, presp); out: rcu_read_unlock(); +free: + kfree(elems); } static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, @@ -1304,7 +1309,7 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - struct ieee802_11_elems elems; + struct ieee802_11_elems *elems; struct ieee80211_channel *channel; size_t baselen; int freq; @@ -1319,42 +1324,47 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, if (baselen > len) return; - ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen, - false, &elems, mgmt->bssid, NULL); + elems = ieee802_11_parse_elems(mgmt->u.probe_resp.variable, + len - baselen, + false, mgmt->bssid, NULL); + if (!elems) + return; /* ignore non-mesh or secure / unsecure mismatch */ - if ((!elems.mesh_id || !elems.mesh_config) || - (elems.rsn && sdata->u.mesh.security == IEEE80211_MESH_SEC_NONE) || - (!elems.rsn && sdata->u.mesh.security != IEEE80211_MESH_SEC_NONE)) - return; + if ((!elems->mesh_id || !elems->mesh_config) || + (elems->rsn && sdata->u.mesh.security == IEEE80211_MESH_SEC_NONE) || + (!elems->rsn && sdata->u.mesh.security != IEEE80211_MESH_SEC_NONE)) + goto free; - if (elems.ds_params) - freq = ieee80211_channel_to_frequency(elems.ds_params[0], band); + if (elems->ds_params) + freq = ieee80211_channel_to_frequency(elems->ds_params[0], band); else freq = rx_status->freq; channel = ieee80211_get_channel(local->hw.wiphy, freq); if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) - return; + goto free; - if (mesh_matches_local(sdata, &elems)) { + if (mesh_matches_local(sdata, elems)) { mpl_dbg(sdata, "rssi_threshold=%d,rx_status->signal=%d\n", sdata->u.mesh.mshcfg.rssi_threshold, rx_status->signal); if (!sdata->u.mesh.user_mpm || sdata->u.mesh.mshcfg.rssi_threshold == 0 || sdata->u.mesh.mshcfg.rssi_threshold < rx_status->signal) - mesh_neighbour_update(sdata, mgmt->sa, &elems, + mesh_neighbour_update(sdata, mgmt->sa, elems, rx_status); if (ifmsh->csa_role != IEEE80211_MESH_CSA_ROLE_INIT && !sdata->vif.csa_active) - ieee80211_mesh_process_chnswitch(sdata, &elems, true); + ieee80211_mesh_process_chnswitch(sdata, elems, true); } if (ifmsh->sync_ops) - ifmsh->sync_ops->rx_bcn_presp(sdata, - stype, mgmt, &elems, rx_status); + ifmsh->sync_ops->rx_bcn_presp(sdata, stype, mgmt, len, + elems->mesh_config, rx_status); +free: + kfree(elems); } int ieee80211_mesh_finish_csa(struct ieee80211_sub_if_data *sdata) @@ -1446,7 +1456,7 @@ static void mesh_rx_csa_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - struct ieee802_11_elems elems; + struct ieee802_11_elems *elems; u16 pre_value; bool fwd_csa = true; size_t baselen; @@ -1459,33 +1469,37 @@ static void mesh_rx_csa_frame(struct ieee80211_sub_if_data *sdata, pos = mgmt->u.action.u.chan_switch.variable; baselen = offsetof(struct ieee80211_mgmt, u.action.u.chan_switch.variable); - ieee802_11_parse_elems(pos, len - baselen, true, &elems, - mgmt->bssid, NULL); - - if (!mesh_matches_local(sdata, &elems)) + elems = ieee802_11_parse_elems(pos, len - baselen, true, + mgmt->bssid, NULL); + if (!elems) return; - ifmsh->chsw_ttl = elems.mesh_chansw_params_ie->mesh_ttl; + if (!mesh_matches_local(sdata, elems)) + goto free; + + ifmsh->chsw_ttl = elems->mesh_chansw_params_ie->mesh_ttl; if (!--ifmsh->chsw_ttl) fwd_csa = false; - pre_value = le16_to_cpu(elems.mesh_chansw_params_ie->mesh_pre_value); + pre_value = le16_to_cpu(elems->mesh_chansw_params_ie->mesh_pre_value); if (ifmsh->pre_value >= pre_value) - return; + goto free; ifmsh->pre_value = pre_value; if (!sdata->vif.csa_active && - !ieee80211_mesh_process_chnswitch(sdata, &elems, false)) { + !ieee80211_mesh_process_chnswitch(sdata, elems, false)) { mcsa_dbg(sdata, "Failed to process CSA action frame"); - return; + goto free; } /* forward or re-broadcast the CSA frame */ if (fwd_csa) { - if (mesh_fwd_csa_frame(sdata, mgmt, len, &elems) < 0) + if (mesh_fwd_csa_frame(sdata, mgmt, len, elems) < 0) mcsa_dbg(sdata, "Failed to forward the CSA frame"); } +free: + kfree(elems); } static void ieee80211_mesh_rx_mgmt_action(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index a05b615deb51..44a6fdb6efbd 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2019 Intel Corporation + * Copyright (C) 2019, 2021 Intel Corporation * Author: Luis Carlos Cobo <luisca@cozybit.com> */ @@ -908,7 +908,7 @@ static void hwmp_rann_frame_process(struct ieee80211_sub_if_data *sdata, void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len) { - struct ieee802_11_elems elems; + struct ieee802_11_elems *elems; size_t baselen; u32 path_metric; struct sta_info *sta; @@ -926,37 +926,41 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); baselen = (u8 *) mgmt->u.action.u.mesh_action.variable - (u8 *) mgmt; - ieee802_11_parse_elems(mgmt->u.action.u.mesh_action.variable, - len - baselen, false, &elems, mgmt->bssid, NULL); + elems = ieee802_11_parse_elems(mgmt->u.action.u.mesh_action.variable, + len - baselen, false, mgmt->bssid, NULL); + if (!elems) + return; - if (elems.preq) { - if (elems.preq_len != 37) + if (elems->preq) { + if (elems->preq_len != 37) /* Right now we support just 1 destination and no AE */ - return; - path_metric = hwmp_route_info_get(sdata, mgmt, elems.preq, + goto free; + path_metric = hwmp_route_info_get(sdata, mgmt, elems->preq, MPATH_PREQ); if (path_metric) - hwmp_preq_frame_process(sdata, mgmt, elems.preq, + hwmp_preq_frame_process(sdata, mgmt, elems->preq, path_metric); } - if (elems.prep) { - if (elems.prep_len != 31) + if (elems->prep) { + if (elems->prep_len != 31) /* Right now we support no AE */ - return; - path_metric = hwmp_route_info_get(sdata, mgmt, elems.prep, + goto free; + path_metric = hwmp_route_info_get(sdata, mgmt, elems->prep, MPATH_PREP); if (path_metric) - hwmp_prep_frame_process(sdata, mgmt, elems.prep, + hwmp_prep_frame_process(sdata, mgmt, elems->prep, path_metric); } - if (elems.perr) { - if (elems.perr_len != 15) + if (elems->perr) { + if (elems->perr_len != 15) /* Right now we support only one destination per PERR */ - return; - hwmp_perr_frame_process(sdata, mgmt, elems.perr); + goto free; + hwmp_perr_frame_process(sdata, mgmt, elems->perr); } - if (elems.rann) - hwmp_rann_frame_process(sdata, mgmt, elems.rann); + if (elems->rann) + hwmp_rann_frame_process(sdata, mgmt, elems->rann); +free: + kfree(elems); } /** diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index efbefcbac3ac..7cab1cf09bf1 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -60,7 +60,10 @@ static struct mesh_table *mesh_table_alloc(void) atomic_set(&newtbl->entries, 0); spin_lock_init(&newtbl->gates_lock); spin_lock_init(&newtbl->walk_lock); - rhashtable_init(&newtbl->rhead, &mesh_rht_params); + if (rhashtable_init(&newtbl->rhead, &mesh_rht_params)) { + kfree(newtbl); + return NULL; + } return newtbl; } diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index a6915847d78a..a829470dd59e 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2019 Intel Corporation + * Copyright (C) 2019, 2021 Intel Corporation * Author: Luis Carlos Cobo <luisca@cozybit.com> */ #include <linux/gfp.h> @@ -1200,7 +1200,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status) { - struct ieee802_11_elems elems; + struct ieee802_11_elems *elems; size_t baselen; u8 *baseaddr; @@ -1228,7 +1228,8 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, if (baselen > len) return; } - ieee802_11_parse_elems(baseaddr, len - baselen, true, &elems, - mgmt->bssid, NULL); - mesh_process_plink_frame(sdata, mgmt, &elems, rx_status); + elems = ieee802_11_parse_elems(baseaddr, len - baselen, true, + mgmt->bssid, NULL); + mesh_process_plink_frame(sdata, mgmt, elems, rx_status); + kfree(elems); } diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c index 204830a55240..3fbd0b9ff913 100644 --- a/net/mac80211/mesh_ps.c +++ b/net/mac80211/mesh_ps.c @@ -2,6 +2,7 @@ /* * Copyright 2012-2013, Marco Porsch <marco.porsch@s2005.tu-chemnitz.de> * Copyright 2012-2013, cozybit Inc. + * Copyright (C) 2021 Intel Corporation */ #include "mesh.h" @@ -588,7 +589,7 @@ void ieee80211_mps_frame_release(struct sta_info *sta, /* only transmit to PS STA with announced, non-zero awake window */ if (test_sta_flag(sta, WLAN_STA_PS_STA) && - (!elems->awake_window || !le16_to_cpu(*elems->awake_window))) + (!elems->awake_window || !get_unaligned_le16(elems->awake_window))) return; if (!test_sta_flag(sta, WLAN_STA_MPSP_OWNER)) diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c index fde93de2b80a..9e342cc2504c 100644 --- a/net/mac80211/mesh_sync.c +++ b/net/mac80211/mesh_sync.c @@ -3,6 +3,7 @@ * Copyright 2011-2012, Pavel Zubarev <pavel.zubarev@gmail.com> * Copyright 2011-2012, Marco Porsch <marco.porsch@s2005.tu-chemnitz.de> * Copyright 2011-2012, cozybit Inc. + * Copyright (C) 2021 Intel Corporation */ #include "ieee80211_i.h" @@ -35,12 +36,12 @@ struct sync_method { /** * mesh_peer_tbtt_adjusting - check if an mp is currently adjusting its TBTT * - * @ie: information elements of a management frame from the mesh peer + * @cfg: mesh config element from the mesh peer (or %NULL) */ -static bool mesh_peer_tbtt_adjusting(struct ieee802_11_elems *ie) +static bool mesh_peer_tbtt_adjusting(const struct ieee80211_meshconf_ie *cfg) { - return (ie->mesh_config->meshconf_cap & - IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING) != 0; + return cfg && + (cfg->meshconf_cap & IEEE80211_MESHCONF_CAPAB_TBTT_ADJUSTING); } void mesh_sync_adjust_tsf(struct ieee80211_sub_if_data *sdata) @@ -76,11 +77,11 @@ void mesh_sync_adjust_tsf(struct ieee80211_sub_if_data *sdata) } } -static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, - u16 stype, - struct ieee80211_mgmt *mgmt, - struct ieee802_11_elems *elems, - struct ieee80211_rx_status *rx_status) +static void +mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, u16 stype, + struct ieee80211_mgmt *mgmt, unsigned int len, + const struct ieee80211_meshconf_ie *mesh_cfg, + struct ieee80211_rx_status *rx_status) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee80211_local *local = sdata->local; @@ -101,10 +102,7 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, */ if (ieee80211_have_rx_timestamp(rx_status)) t_r = ieee80211_calculate_rx_timestamp(local, rx_status, - 24 + 12 + - elems->total_len + - FCS_LEN, - 24); + len + FCS_LEN, 24); else t_r = drv_get_tsf(local, sdata); @@ -119,7 +117,7 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, * dot11MeshNbrOffsetMaxNeighbor non-peer non-MBSS neighbors */ - if (elems->mesh_config && mesh_peer_tbtt_adjusting(elems)) { + if (mesh_peer_tbtt_adjusting(mesh_cfg)) { msync_dbg(sdata, "STA %pM : is adjusting TBTT\n", sta->sta.addr); goto no_sync; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index c0ea3b1aa9e1..54ab0e1ef6ca 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1490,6 +1490,7 @@ ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, fallthrough; case NL80211_BAND_2GHZ: case NL80211_BAND_60GHZ: + case NL80211_BAND_LC: chan_increment = 1; break; case NL80211_BAND_5GHZ: @@ -2258,6 +2259,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; + struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; u32 changed = 0; struct ieee80211_prep_tx_info info = { .subtype = stype, @@ -2407,6 +2409,10 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, cancel_delayed_work_sync(&ifmgd->tx_tspec_wk); sdata->encrypt_headroom = IEEE80211_ENCRYPT_HEADROOM; + + bss_conf->pwr_reduction = 0; + bss_conf->tx_pwr_env_num = 0; + memset(bss_conf->tx_pwr_env, 0, sizeof(bss_conf->tx_pwr_env)); } static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata) @@ -2509,7 +2515,7 @@ static void ieee80211_mlme_send_probe_req(struct ieee80211_sub_if_data *sdata, static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - const u8 *ssid; + const struct element *ssid; u8 *dst = ifmgd->associated->bssid; u8 unicast_limit = max(1, max_probe_tries - 3); struct sta_info *sta; @@ -2546,14 +2552,14 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) int ssid_len; rcu_read_lock(); - ssid = ieee80211_bss_get_ie(ifmgd->associated, WLAN_EID_SSID); + ssid = ieee80211_bss_get_elem(ifmgd->associated, WLAN_EID_SSID); if (WARN_ON_ONCE(ssid == NULL)) ssid_len = 0; else - ssid_len = ssid[1]; + ssid_len = ssid->datalen; ieee80211_mlme_send_probe_req(sdata, sdata->vif.addr, dst, - ssid + 2, ssid_len, + ssid->data, ssid_len, ifmgd->associated->channel); rcu_read_unlock(); } @@ -2583,6 +2589,13 @@ static void ieee80211_mgd_probe_ap(struct ieee80211_sub_if_data *sdata, goto out; } + if (sdata->local->suspending) { + /* reschedule after resume */ + mutex_unlock(&sdata->local->mtx); + ieee80211_reset_ap_probe(sdata); + goto out; + } + if (beacon) { mlme_dbg_ratelimited(sdata, "detected beacon loss from AP (missed %d beacons) - probing\n", @@ -2629,7 +2642,7 @@ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw, struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct cfg80211_bss *cbss; struct sk_buff *skb; - const u8 *ssid; + const struct element *ssid; int ssid_len; if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION)) @@ -2647,16 +2660,17 @@ struct sk_buff *ieee80211_ap_probereq_get(struct ieee80211_hw *hw, return NULL; rcu_read_lock(); - ssid = ieee80211_bss_get_ie(cbss, WLAN_EID_SSID); - if (WARN_ONCE(!ssid || ssid[1] > IEEE80211_MAX_SSID_LEN, - "invalid SSID element (len=%d)", ssid ? ssid[1] : -1)) + ssid = ieee80211_bss_get_elem(cbss, WLAN_EID_SSID); + if (WARN_ONCE(!ssid || ssid->datalen > IEEE80211_MAX_SSID_LEN, + "invalid SSID element (len=%d)", + ssid ? ssid->datalen : -1)) ssid_len = 0; else - ssid_len = ssid[1]; + ssid_len = ssid->datalen; skb = ieee80211_build_probe_req(sdata, sdata->vif.addr, cbss->bssid, (u32) -1, cbss->channel, - ssid + 2, ssid_len, + ssid->data, ssid_len, NULL, 0, IEEE80211_PROBE_FLAG_DIRECTED); rcu_read_unlock(); @@ -2870,17 +2884,17 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_mgd_auth_data *auth_data = sdata->u.mgd.auth_data; + const struct element *challenge; u8 *pos; - struct ieee802_11_elems elems; u32 tx_flags = 0; struct ieee80211_prep_tx_info info = { .subtype = IEEE80211_STYPE_AUTH, }; pos = mgmt->u.auth.variable; - ieee802_11_parse_elems(pos, len - (pos - (u8 *)mgmt), false, &elems, - mgmt->bssid, auth_data->bss->bssid); - if (!elems.challenge) + challenge = cfg80211_find_elem(WLAN_EID_CHALLENGE, pos, + len - (pos - (u8 *)mgmt)); + if (!challenge) return; auth_data->expected_transaction = 4; drv_mgd_prepare_tx(sdata->local, sdata, &info); @@ -2888,7 +2902,8 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, tx_flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; ieee80211_send_auth(sdata, 3, auth_data->algorithm, 0, - elems.challenge - 2, elems.challenge_len + 2, + (void *)challenge, + challenge->datalen + sizeof(*challenge), auth_data->bss->bssid, auth_data->bss->bssid, auth_data->key, auth_data->key_len, auth_data->key_idx, tx_flags); @@ -3290,8 +3305,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, aid = 0; /* TODO */ } capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); - ieee802_11_parse_elems(pos, len - (pos - (u8 *)mgmt), false, elems, - mgmt->bssid, assoc_data->bss->bssid); + elems = ieee802_11_parse_elems(pos, len - (pos - (u8 *)mgmt), false, + mgmt->bssid, assoc_data->bss->bssid); + + if (!elems) + return false; if (elems->aid_resp) aid = le16_to_cpu(elems->aid_resp->aid); @@ -3313,7 +3331,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (!is_s1g && !elems->supp_rates) { sdata_info(sdata, "no SuppRates element in AssocResp\n"); - return false; + ret = false; + goto out; } sdata->vif.bss_conf.aid = aid; @@ -3335,7 +3354,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && (!elems->vht_cap_elem || !elems->vht_operation)))) { const struct cfg80211_bss_ies *ies; - struct ieee802_11_elems bss_elems; + struct ieee802_11_elems *bss_elems; rcu_read_lock(); ies = rcu_dereference(cbss->ies); @@ -3343,16 +3362,22 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, bss_ies = kmemdup(ies, sizeof(*ies) + ies->len, GFP_ATOMIC); rcu_read_unlock(); - if (!bss_ies) - return false; + if (!bss_ies) { + ret = false; + goto out; + } + + bss_elems = ieee802_11_parse_elems(bss_ies->data, bss_ies->len, + false, mgmt->bssid, + assoc_data->bss->bssid); + if (!bss_elems) { + ret = false; + goto out; + } - ieee802_11_parse_elems(bss_ies->data, bss_ies->len, - false, &bss_elems, - mgmt->bssid, - assoc_data->bss->bssid); if (assoc_data->wmm && - !elems->wmm_param && bss_elems.wmm_param) { - elems->wmm_param = bss_elems.wmm_param; + !elems->wmm_param && bss_elems->wmm_param) { + elems->wmm_param = bss_elems->wmm_param; sdata_info(sdata, "AP bug: WMM param missing from AssocResp\n"); } @@ -3361,30 +3386,32 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * Also check if we requested HT/VHT, otherwise the AP doesn't * have to include the IEs in the (re)association response. */ - if (!elems->ht_cap_elem && bss_elems.ht_cap_elem && + if (!elems->ht_cap_elem && bss_elems->ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { - elems->ht_cap_elem = bss_elems.ht_cap_elem; + elems->ht_cap_elem = bss_elems->ht_cap_elem; sdata_info(sdata, "AP bug: HT capability missing from AssocResp\n"); } - if (!elems->ht_operation && bss_elems.ht_operation && + if (!elems->ht_operation && bss_elems->ht_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { - elems->ht_operation = bss_elems.ht_operation; + elems->ht_operation = bss_elems->ht_operation; sdata_info(sdata, "AP bug: HT operation missing from AssocResp\n"); } - if (!elems->vht_cap_elem && bss_elems.vht_cap_elem && + if (!elems->vht_cap_elem && bss_elems->vht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) { - elems->vht_cap_elem = bss_elems.vht_cap_elem; + elems->vht_cap_elem = bss_elems->vht_cap_elem; sdata_info(sdata, "AP bug: VHT capa missing from AssocResp\n"); } - if (!elems->vht_operation && bss_elems.vht_operation && + if (!elems->vht_operation && bss_elems->vht_operation && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) { - elems->vht_operation = bss_elems.vht_operation; + elems->vht_operation = bss_elems->vht_operation; sdata_info(sdata, "AP bug: VHT operation missing from AssocResp\n"); } + + kfree(bss_elems); } /* @@ -3629,6 +3656,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, ret = true; out: + kfree(elems); kfree(bss_ies); return ret; } @@ -3640,7 +3668,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; u16 capab_info, status_code, aid; - struct ieee802_11_elems elems; + struct ieee802_11_elems *elems; int ac, uapsd_queues = -1; u8 *pos; bool reassoc; @@ -3697,14 +3725,16 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, fils_decrypt_assoc_resp(sdata, (u8 *)mgmt, &len, assoc_data) < 0) return; - ieee802_11_parse_elems(pos, len - (pos - (u8 *)mgmt), false, &elems, - mgmt->bssid, assoc_data->bss->bssid); + elems = ieee802_11_parse_elems(pos, len - (pos - (u8 *)mgmt), false, + mgmt->bssid, assoc_data->bss->bssid); + if (!elems) + goto notify_driver; if (status_code == WLAN_STATUS_ASSOC_REJECTED_TEMPORARILY && - elems.timeout_int && - elems.timeout_int->type == WLAN_TIMEOUT_ASSOC_COMEBACK) { + elems->timeout_int && + elems->timeout_int->type == WLAN_TIMEOUT_ASSOC_COMEBACK) { u32 tu, ms; - tu = le32_to_cpu(elems.timeout_int->value); + tu = le32_to_cpu(elems->timeout_int->value); ms = tu * 1024 / 1000; sdata_info(sdata, "%pM rejected association temporarily; comeback duration %u TU (%u ms)\n", @@ -3724,7 +3754,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); } else { - if (!ieee80211_assoc_success(sdata, cbss, mgmt, len, &elems)) { + if (!ieee80211_assoc_success(sdata, cbss, mgmt, len, elems)) { /* oops -- internal error -- send timeout for now */ ieee80211_destroy_assoc_data(sdata, false, false); cfg80211_assoc_timeout(sdata->dev, cbss); @@ -3754,6 +3784,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, ifmgd->assoc_req_ies, ifmgd->assoc_req_ies_len); notify_driver: drv_mgd_complete_tx(sdata->local, sdata, &info); + kfree(elems); } static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, @@ -3958,7 +3989,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; struct ieee80211_mgmt *mgmt = (void *) hdr; size_t baselen; - struct ieee802_11_elems elems; + struct ieee802_11_elems *elems; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; struct ieee80211_channel *chan; @@ -4004,15 +4035,16 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, if (ifmgd->assoc_data && ifmgd->assoc_data->need_beacon && ieee80211_rx_our_beacon(bssid, ifmgd->assoc_data->bss)) { - ieee802_11_parse_elems(variable, - len - baselen, false, &elems, - bssid, - ifmgd->assoc_data->bss->bssid); + elems = ieee802_11_parse_elems(variable, len - baselen, false, + bssid, + ifmgd->assoc_data->bss->bssid); + if (!elems) + return; ieee80211_rx_bss_info(sdata, mgmt, len, rx_status); - if (elems.dtim_period) - ifmgd->dtim_period = elems.dtim_period; + if (elems->dtim_period) + ifmgd->dtim_period = elems->dtim_period; ifmgd->have_beacon = true; ifmgd->assoc_data->need_beacon = false; if (ieee80211_hw_check(&local->hw, TIMING_BEACON_ONLY)) { @@ -4020,17 +4052,17 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, le64_to_cpu(mgmt->u.beacon.timestamp); sdata->vif.bss_conf.sync_device_ts = rx_status->device_timestamp; - sdata->vif.bss_conf.sync_dtim_count = elems.dtim_count; + sdata->vif.bss_conf.sync_dtim_count = elems->dtim_count; } - if (elems.mbssid_config_ie) + if (elems->mbssid_config_ie) bss_conf->profile_periodicity = - elems.mbssid_config_ie->profile_periodicity; + elems->mbssid_config_ie->profile_periodicity; else bss_conf->profile_periodicity = 0; - if (elems.ext_capab_len >= 11 && - (elems.ext_capab[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) + if (elems->ext_capab_len >= 11 && + (elems->ext_capab[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) bss_conf->ema_ap = true; else bss_conf->ema_ap = false; @@ -4039,6 +4071,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, ifmgd->assoc_data->timeout = jiffies; ifmgd->assoc_data->timeout_started = true; run_again(sdata, ifmgd->assoc_data->timeout); + kfree(elems); return; } @@ -4070,13 +4103,15 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, */ if (!ieee80211_is_s1g_beacon(hdr->frame_control)) ncrc = crc32_be(0, (void *)&mgmt->u.beacon.beacon_int, 4); - ncrc = ieee802_11_parse_elems_crc(variable, - len - baselen, false, &elems, - care_about_ies, ncrc, - mgmt->bssid, bssid); + elems = ieee802_11_parse_elems_crc(variable, len - baselen, + false, care_about_ies, ncrc, + mgmt->bssid, bssid); + if (!elems) + return; + ncrc = elems->crc; if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && - ieee80211_check_tim(elems.tim, elems.tim_len, bss_conf->aid)) { + ieee80211_check_tim(elems->tim, elems->tim_len, bss_conf->aid)) { if (local->hw.conf.dynamic_ps_timeout > 0) { if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; @@ -4146,12 +4181,12 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, le64_to_cpu(mgmt->u.beacon.timestamp); sdata->vif.bss_conf.sync_device_ts = rx_status->device_timestamp; - sdata->vif.bss_conf.sync_dtim_count = elems.dtim_count; + sdata->vif.bss_conf.sync_dtim_count = elems->dtim_count; } if ((ncrc == ifmgd->beacon_crc && ifmgd->beacon_crc_valid) || ieee80211_is_s1g_short_beacon(mgmt->frame_control)) - return; + goto free; ifmgd->beacon_crc = ncrc; ifmgd->beacon_crc_valid = true; @@ -4159,12 +4194,12 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, ieee80211_sta_process_chanswitch(sdata, rx_status->mactime, rx_status->device_timestamp, - &elems, true); + elems, true); if (!(ifmgd->flags & IEEE80211_STA_DISABLE_WMM) && - ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, - elems.wmm_param_len, - elems.mu_edca_param_set)) + ieee80211_sta_wmm_params(local, sdata, elems->wmm_param, + elems->wmm_param_len, + elems->mu_edca_param_set)) changed |= BSS_CHANGED_QOS; /* @@ -4173,7 +4208,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, */ if (!ifmgd->have_beacon) { /* a few bogus AP send dtim_period = 0 or no TIM IE */ - bss_conf->dtim_period = elems.dtim_period ?: 1; + bss_conf->dtim_period = elems->dtim_period ?: 1; changed |= BSS_CHANGED_BEACON_INFO; ifmgd->have_beacon = true; @@ -4185,9 +4220,9 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, ieee80211_recalc_ps_vif(sdata); } - if (elems.erp_info) { + if (elems->erp_info) { erp_valid = true; - erp_value = elems.erp_info[0]; + erp_value = elems->erp_info[0]; } else { erp_valid = false; } @@ -4200,12 +4235,12 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, bssid); - changed |= ieee80211_recalc_twt_req(sdata, sta, &elems); + changed |= ieee80211_recalc_twt_req(sdata, sta, elems); - if (ieee80211_config_bw(sdata, sta, elems.ht_cap_elem, - elems.vht_cap_elem, elems.ht_operation, - elems.vht_operation, elems.he_operation, - elems.s1g_oper, bssid, &changed)) { + if (ieee80211_config_bw(sdata, sta, elems->ht_cap_elem, + elems->vht_cap_elem, elems->ht_operation, + elems->vht_operation, elems->he_operation, + elems->s1g_oper, bssid, &changed)) { mutex_unlock(&local->sta_mtx); sdata_info(sdata, "failed to follow AP %pM bandwidth change, disconnect\n", @@ -4217,21 +4252,23 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, sizeof(deauth_buf), true, WLAN_REASON_DEAUTH_LEAVING, false); - return; + goto free; } - if (sta && elems.opmode_notif) - ieee80211_vht_handle_opmode(sdata, sta, *elems.opmode_notif, + if (sta && elems->opmode_notif) + ieee80211_vht_handle_opmode(sdata, sta, *elems->opmode_notif, rx_status->band); mutex_unlock(&local->sta_mtx); changed |= ieee80211_handle_pwr_constr(sdata, chan, mgmt, - elems.country_elem, - elems.country_elem_len, - elems.pwr_constr_elem, - elems.cisco_dtpc_elem); + elems->country_elem, + elems->country_elem_len, + elems->pwr_constr_elem, + elems->cisco_dtpc_elem); ieee80211_bss_info_change_notify(sdata, changed); +free: + kfree(elems); } void ieee80211_sta_rx_queued_ext(struct ieee80211_sub_if_data *sdata, @@ -4260,7 +4297,6 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct ieee80211_rx_status *rx_status; struct ieee80211_mgmt *mgmt; u16 fc; - struct ieee802_11_elems elems; int ies_len; rx_status = (struct ieee80211_rx_status *) skb->cb; @@ -4292,6 +4328,8 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, break; case IEEE80211_STYPE_ACTION: if (mgmt->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT) { + struct ieee802_11_elems *elems; + ies_len = skb->len - offsetof(struct ieee80211_mgmt, u.action.u.chan_switch.variable); @@ -4300,18 +4338,19 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, break; /* CSA IE cannot be overridden, no need for BSSID */ - ieee802_11_parse_elems( - mgmt->u.action.u.chan_switch.variable, - ies_len, true, &elems, mgmt->bssid, NULL); - - if (elems.parse_error) - break; - - ieee80211_sta_process_chanswitch(sdata, - rx_status->mactime, - rx_status->device_timestamp, - &elems, false); + elems = ieee802_11_parse_elems( + mgmt->u.action.u.chan_switch.variable, + ies_len, true, mgmt->bssid, NULL); + + if (elems && !elems->parse_error) + ieee80211_sta_process_chanswitch(sdata, + rx_status->mactime, + rx_status->device_timestamp, + elems, false); + kfree(elems); } else if (mgmt->u.action.category == WLAN_CATEGORY_PUBLIC) { + struct ieee802_11_elems *elems; + ies_len = skb->len - offsetof(struct ieee80211_mgmt, u.action.u.ext_chan_switch.variable); @@ -4323,21 +4362,22 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, * extended CSA IE can't be overridden, no need for * BSSID */ - ieee802_11_parse_elems( - mgmt->u.action.u.ext_chan_switch.variable, - ies_len, true, &elems, mgmt->bssid, NULL); - - if (elems.parse_error) - break; - - /* for the handling code pretend this was also an IE */ - elems.ext_chansw_ie = - &mgmt->u.action.u.ext_chan_switch.data; + elems = ieee802_11_parse_elems( + mgmt->u.action.u.ext_chan_switch.variable, + ies_len, true, mgmt->bssid, NULL); + + if (elems && !elems->parse_error) { + /* for the handling code pretend it was an IE */ + elems->ext_chansw_ie = + &mgmt->u.action.u.ext_chan_switch.data; + + ieee80211_sta_process_chanswitch(sdata, + rx_status->mactime, + rx_status->device_timestamp, + elems, false); + } - ieee80211_sta_process_chanswitch(sdata, - rx_status->mactime, - rx_status->device_timestamp, - &elems, false); + kfree(elems); } break; } @@ -4972,10 +5012,22 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; bool is_5ghz = cbss->channel->band == NL80211_BAND_5GHZ; struct ieee80211_bss *bss = (void *)cbss->priv; + struct ieee802_11_elems *elems; + const struct cfg80211_bss_ies *ies; int ret; u32 i; bool have_80mhz; + rcu_read_lock(); + + ies = rcu_dereference(cbss->ies); + elems = ieee802_11_parse_elems(ies->data, ies->len, false, + NULL, NULL); + if (!elems) { + rcu_read_unlock(); + return -ENOMEM; + } + sband = local->hw.wiphy->bands[cbss->channel->band]; ifmgd->flags &= ~(IEEE80211_STA_DISABLE_40MHZ | @@ -4998,18 +5050,9 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ieee80211_vif_type_p2p(&sdata->vif))) ifmgd->flags |= IEEE80211_STA_DISABLE_HE; - rcu_read_lock(); - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && !is_6ghz) { - const u8 *ht_oper_ie, *ht_cap_ie; - - ht_oper_ie = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_OPERATION); - if (ht_oper_ie && ht_oper_ie[1] >= sizeof(*ht_oper)) - ht_oper = (void *)(ht_oper_ie + 2); - - ht_cap_ie = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_CAPABILITY); - if (ht_cap_ie && ht_cap_ie[1] >= sizeof(*ht_cap)) - ht_cap = (void *)(ht_cap_ie + 2); + ht_oper = elems->ht_operation; + ht_cap = elems->ht_cap_elem; if (!ht_cap) { ifmgd->flags |= IEEE80211_STA_DISABLE_HT; @@ -5018,12 +5061,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && !is_6ghz) { - const u8 *vht_oper_ie, *vht_cap; - - vht_oper_ie = ieee80211_bss_get_ie(cbss, - WLAN_EID_VHT_OPERATION); - if (vht_oper_ie && vht_oper_ie[1] >= sizeof(*vht_oper)) - vht_oper = (void *)(vht_oper_ie + 2); + vht_oper = elems->vht_operation; if (vht_oper && !ht_oper) { vht_oper = NULL; sdata_info(sdata, @@ -5033,25 +5071,38 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= IEEE80211_STA_DISABLE_HE; } - vht_cap = ieee80211_bss_get_ie(cbss, WLAN_EID_VHT_CAPABILITY); - if (!vht_cap || vht_cap[1] < sizeof(struct ieee80211_vht_cap)) { + if (!elems->vht_cap_elem) { ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; vht_oper = NULL; } } if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) { - const struct cfg80211_bss_ies *ies; - const u8 *he_oper_ie; + he_oper = elems->he_operation; - ies = rcu_dereference(cbss->ies); - he_oper_ie = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION, - ies->data, ies->len); - if (he_oper_ie && - he_oper_ie[1] >= ieee80211_he_oper_size(&he_oper_ie[3])) - he_oper = (void *)(he_oper_ie + 3); - else - he_oper = NULL; + if (is_6ghz) { + struct ieee80211_bss_conf *bss_conf; + u8 i, j = 0; + + bss_conf = &sdata->vif.bss_conf; + + if (elems->pwr_constr_elem) + bss_conf->pwr_reduction = *elems->pwr_constr_elem; + + BUILD_BUG_ON(ARRAY_SIZE(bss_conf->tx_pwr_env) != + ARRAY_SIZE(elems->tx_pwr_env)); + + for (i = 0; i < elems->tx_pwr_env_num; i++) { + if (elems->tx_pwr_env_len[i] > + sizeof(bss_conf->tx_pwr_env[j])) + continue; + + bss_conf->tx_pwr_env_num++; + memcpy(&bss_conf->tx_pwr_env[j], elems->tx_pwr_env[i], + elems->tx_pwr_env_len[i]); + j++; + } + } if (!ieee80211_verify_sta_he_mcs_support(sdata, sband, he_oper)) ifmgd->flags |= IEEE80211_STA_DISABLE_HE; @@ -5072,13 +5123,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; if (sband->band == NL80211_BAND_S1GHZ) { - const u8 *s1g_oper_ie; - - s1g_oper_ie = ieee80211_bss_get_ie(cbss, - WLAN_EID_S1G_OPERATION); - if (s1g_oper_ie && s1g_oper_ie[1] >= sizeof(*s1g_oper)) - s1g_oper = (void *)(s1g_oper_ie + 2); - else + s1g_oper = elems->s1g_oper; + if (!s1g_oper) sdata_info(sdata, "AP missing S1G operation element?\n"); } @@ -5094,6 +5140,9 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, local->rx_chains); rcu_read_unlock(); + /* the element data was RCU protected so no longer valid anyway */ + kfree(elems); + elems = NULL; if (ifmgd->flags & IEEE80211_STA_DISABLE_HE && is_6ghz) { sdata_info(sdata, "Rejecting non-HE 6/7 GHz connection"); @@ -5498,7 +5547,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, const struct cfg80211_bss_ies *beacon_ies; struct ieee80211_supported_band *sband; struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; - const u8 *ssidie, *ht_ie, *vht_ie; + const struct element *ssid_elem, *ht_elem, *vht_elem; int i, err; bool override = false; @@ -5507,14 +5556,14 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, return -ENOMEM; rcu_read_lock(); - ssidie = ieee80211_bss_get_ie(req->bss, WLAN_EID_SSID); - if (!ssidie || ssidie[1] > sizeof(assoc_data->ssid)) { + ssid_elem = ieee80211_bss_get_elem(req->bss, WLAN_EID_SSID); + if (!ssid_elem || ssid_elem->datalen > sizeof(assoc_data->ssid)) { rcu_read_unlock(); kfree(assoc_data); return -EINVAL; } - memcpy(assoc_data->ssid, ssidie + 2, ssidie[1]); - assoc_data->ssid_len = ssidie[1]; + memcpy(assoc_data->ssid, ssid_elem->data, ssid_elem->datalen); + assoc_data->ssid_len = ssid_elem->datalen; memcpy(bss_conf->ssid, assoc_data->ssid, assoc_data->ssid_len); bss_conf->ssid_len = assoc_data->ssid_len; rcu_read_unlock(); @@ -5628,15 +5677,15 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, assoc_data->supp_rates_len = bss->supp_rates_len; rcu_read_lock(); - ht_ie = ieee80211_bss_get_ie(req->bss, WLAN_EID_HT_OPERATION); - if (ht_ie && ht_ie[1] >= sizeof(struct ieee80211_ht_operation)) + ht_elem = ieee80211_bss_get_elem(req->bss, WLAN_EID_HT_OPERATION); + if (ht_elem && ht_elem->datalen >= sizeof(struct ieee80211_ht_operation)) assoc_data->ap_ht_param = - ((struct ieee80211_ht_operation *)(ht_ie + 2))->ht_param; + ((struct ieee80211_ht_operation *)(ht_elem->data))->ht_param; else if (!is_6ghz) ifmgd->flags |= IEEE80211_STA_DISABLE_HT; - vht_ie = ieee80211_bss_get_ie(req->bss, WLAN_EID_VHT_CAPABILITY); - if (vht_ie && vht_ie[1] >= sizeof(struct ieee80211_vht_cap)) - memcpy(&assoc_data->ap_vht_cap, vht_ie + 2, + vht_elem = ieee80211_bss_get_elem(req->bss, WLAN_EID_VHT_CAPABILITY); + if (vht_elem && vht_elem->datalen >= sizeof(struct ieee80211_vht_cap)) + memcpy(&assoc_data->ap_vht_cap, vht_elem->data, sizeof(struct ieee80211_vht_cap)); else if (is_5ghz) ifmgd->flags |= IEEE80211_STA_DISABLE_VHT | diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index 7809a906d7fe..0ccb5701c7f3 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -27,6 +27,9 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) if (!local->open_count) goto suspend; + local->suspending = true; + mb(); /* make suspending visible before any cancellation */ + ieee80211_scan_cancel(local); ieee80211_dfs_cac_cancel(local); @@ -176,6 +179,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) /* need suspended to be visible before quiescing is false */ barrier(); local->quiescing = false; + local->suspending = false; return 0; } diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index e5935e3d7a07..8c6416129d5b 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -392,10 +392,6 @@ static bool rate_control_send_low(struct ieee80211_sta *pubsta, int mcast_rate; bool use_basicrate = false; - if (ieee80211_is_tx_data(txrc->skb) && - info->flags & IEEE80211_TX_CTL_NO_ACK) - return false; - if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) { __rate_control_send_low(txrc->hw, sband, pubsta, info, txrc->rate_idx_mask); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 2563473b5cf1..fc5c608d02e2 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -359,7 +359,12 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, put_unaligned_le32(it_present_val, it_present); - pos = (void *)(it_present + 1); + /* This references through an offset into it_optional[] rather + * than via it_present otherwise later uses of pos will cause + * the compiler to think we have walked past the end of the + * struct member. + */ + pos = (void *)&rthdr->it_optional[it_present - rthdr->it_optional]; /* the order of the following fields is important */ @@ -372,7 +377,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, ieee80211_calculate_rx_timestamp(local, status, mpdulen, 0), pos); - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_TSFT); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_TSFT)); pos += 8; } @@ -396,7 +401,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, *pos = 0; } else { int shift = 0; - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_RATE)); if (status->bw == RATE_INFO_BW_10) shift = 1; else if (status->bw == RATE_INFO_BW_5) @@ -433,7 +438,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { *pos = status->signal; rthdr->it_present |= - cpu_to_le32(1 << IEEE80211_RADIOTAP_DBM_ANTSIGNAL); + cpu_to_le32(BIT(IEEE80211_RADIOTAP_DBM_ANTSIGNAL)); pos++; } @@ -459,7 +464,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, if (status->encoding == RX_ENC_HT) { unsigned int stbc; - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_MCS); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); *pos++ = local->hw.radiotap_mcs_details; *pos = 0; if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) @@ -483,7 +488,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, while ((pos - (u8 *)rthdr) & 3) pos++; rthdr->it_present |= - cpu_to_le32(1 << IEEE80211_RADIOTAP_AMPDU_STATUS); + cpu_to_le32(BIT(IEEE80211_RADIOTAP_AMPDU_STATUS)); put_unaligned_le32(status->ampdu_reference, pos); pos += 4; if (status->flag & RX_FLAG_AMPDU_LAST_KNOWN) @@ -510,7 +515,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, if (status->encoding == RX_ENC_VHT) { u16 known = local->hw.radiotap_vht_details; - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_VHT); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); put_unaligned_le16(known, pos); pos += 2; /* flags */ @@ -554,7 +559,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, u8 flags = IEEE80211_RADIOTAP_TIMESTAMP_FLAG_32BIT; rthdr->it_present |= - cpu_to_le32(1 << IEEE80211_RADIOTAP_TIMESTAMP); + cpu_to_le32(BIT(IEEE80211_RADIOTAP_TIMESTAMP)); /* ensure 8 byte alignment */ while ((pos - (u8 *)rthdr) & 7) @@ -642,7 +647,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, /* ensure 2 byte alignment */ while ((pos - (u8 *)rthdr) & 1) pos++; - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_HE)); memcpy(pos, &he, sizeof(he)); pos += sizeof(he); } @@ -652,14 +657,14 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, /* ensure 2 byte alignment */ while ((pos - (u8 *)rthdr) & 1) pos++; - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE_MU); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_HE_MU)); memcpy(pos, &he_mu, sizeof(he_mu)); pos += sizeof(he_mu); } if (status->flag & RX_FLAG_NO_PSDU) { rthdr->it_present |= - cpu_to_le32(1 << IEEE80211_RADIOTAP_ZERO_LEN_PSDU); + cpu_to_le32(BIT(IEEE80211_RADIOTAP_ZERO_LEN_PSDU)); *pos++ = status->zero_length_psdu_type; } @@ -667,7 +672,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, /* ensure 2 byte alignment */ while ((pos - (u8 *)rthdr) & 1) pos++; - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_LSIG); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_LSIG)); memcpy(pos, &lsig, sizeof(lsig)); pos += sizeof(lsig); } @@ -3207,6 +3212,58 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) return RX_CONTINUE; } +static bool +ieee80211_process_rx_twt_action(struct ieee80211_rx_data *rx) +{ + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)rx->skb->data; + struct ieee80211_sub_if_data *sdata = rx->sdata; + + /* TWT actions are only supported in AP for the moment */ + if (sdata->vif.type != NL80211_IFTYPE_AP) + return false; + + if (!rx->local->ops->add_twt_setup) + return false; + + if (!sdata->vif.bss_conf.twt_responder) + return false; + + if (!rx->sta) + return false; + + switch (mgmt->u.action.u.s1g.action_code) { + case WLAN_S1G_TWT_SETUP: { + struct ieee80211_twt_setup *twt; + + if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE + + 1 + /* action code */ + sizeof(struct ieee80211_twt_setup) + + 2 /* TWT req_type agrt */) + break; + + twt = (void *)mgmt->u.action.u.s1g.variable; + if (twt->element_id != WLAN_EID_S1G_TWT) + break; + + if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE + + 4 + /* action code + token + tlv */ + twt->length) + break; + + return true; /* queue the frame */ + } + case WLAN_S1G_TWT_TEARDOWN: + if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE + 2) + break; + + return true; /* queue the frame */ + default: + break; + } + + return false; +} + static ieee80211_rx_result debug_noinline ieee80211_rx_h_action(struct ieee80211_rx_data *rx) { @@ -3486,6 +3543,17 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) !mesh_path_sel_is_hwmp(sdata)) break; goto queue; + case WLAN_CATEGORY_S1G: + switch (mgmt->u.action.u.s1g.action_code) { + case WLAN_S1G_TWT_SETUP: + case WLAN_S1G_TWT_TEARDOWN: + if (ieee80211_process_rx_twt_action(rx)) + goto queue; + break; + default: + break; + } + break; } return RX_CONTINUE; @@ -4053,7 +4121,8 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) if (!bssid) return false; if (ether_addr_equal(sdata->vif.addr, hdr->addr2) || - ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2)) + ether_addr_equal(sdata->u.ibss.bssid, hdr->addr2) || + !is_valid_ether_addr(hdr->addr2)) return false; if (ieee80211_is_beacon(hdr->frame_control)) return true; diff --git a/net/mac80211/s1g.c b/net/mac80211/s1g.c index c33f332b049a..4141bc80cdfd 100644 --- a/net/mac80211/s1g.c +++ b/net/mac80211/s1g.c @@ -6,6 +6,7 @@ #include <linux/ieee80211.h> #include <net/mac80211.h> #include "ieee80211_i.h" +#include "driver-ops.h" void ieee80211_s1g_sta_rate_init(struct sta_info *sta) { @@ -14,3 +15,184 @@ void ieee80211_s1g_sta_rate_init(struct sta_info *sta) sta->rx_stats.last_rate = STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_S1G); } + +bool ieee80211_s1g_is_twt_setup(struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data; + + if (likely(!ieee80211_is_action(mgmt->frame_control))) + return false; + + if (likely(mgmt->u.action.category != WLAN_CATEGORY_S1G)) + return false; + + return mgmt->u.action.u.s1g.action_code == WLAN_S1G_TWT_SETUP; +} + +static void +ieee80211_s1g_send_twt_setup(struct ieee80211_sub_if_data *sdata, const u8 *da, + const u8 *bssid, struct ieee80211_twt_setup *twt) +{ + int len = IEEE80211_MIN_ACTION_SIZE + 4 + twt->length; + struct ieee80211_local *local = sdata->local; + struct ieee80211_mgmt *mgmt; + struct sk_buff *skb; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + len); + if (!skb) + return; + + skb_reserve(skb, local->hw.extra_tx_headroom); + mgmt = skb_put_zero(skb, len); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + memcpy(mgmt->da, da, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, bssid, ETH_ALEN); + + mgmt->u.action.category = WLAN_CATEGORY_S1G; + mgmt->u.action.u.s1g.action_code = WLAN_S1G_TWT_SETUP; + memcpy(mgmt->u.action.u.s1g.variable, twt, 3 + twt->length); + + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | + IEEE80211_TX_INTFL_MLME_CONN_TX | + IEEE80211_TX_CTL_REQ_TX_STATUS; + ieee80211_tx_skb(sdata, skb); +} + +static void +ieee80211_s1g_send_twt_teardown(struct ieee80211_sub_if_data *sdata, + const u8 *da, const u8 *bssid, u8 flowid) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_mgmt *mgmt; + struct sk_buff *skb; + u8 *id; + + skb = dev_alloc_skb(local->hw.extra_tx_headroom + + IEEE80211_MIN_ACTION_SIZE + 2); + if (!skb) + return; + + skb_reserve(skb, local->hw.extra_tx_headroom); + mgmt = skb_put_zero(skb, IEEE80211_MIN_ACTION_SIZE + 2); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + memcpy(mgmt->da, da, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, bssid, ETH_ALEN); + + mgmt->u.action.category = WLAN_CATEGORY_S1G; + mgmt->u.action.u.s1g.action_code = WLAN_S1G_TWT_TEARDOWN; + id = (u8 *)mgmt->u.action.u.s1g.variable; + *id = flowid; + + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT | + IEEE80211_TX_CTL_REQ_TX_STATUS; + ieee80211_tx_skb(sdata, skb); +} + +static void +ieee80211_s1g_rx_twt_setup(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (void *)skb->data; + struct ieee80211_twt_setup *twt = (void *)mgmt->u.action.u.s1g.variable; + struct ieee80211_twt_params *twt_agrt = (void *)twt->params; + + twt_agrt->req_type &= cpu_to_le16(~IEEE80211_TWT_REQTYPE_REQUEST); + + /* broadcast TWT not supported yet */ + if (twt->control & IEEE80211_TWT_CONTROL_NEG_TYPE_BROADCAST) { + twt_agrt->req_type &= + ~cpu_to_le16(IEEE80211_TWT_REQTYPE_SETUP_CMD); + twt_agrt->req_type |= + le16_encode_bits(TWT_SETUP_CMD_REJECT, + IEEE80211_TWT_REQTYPE_SETUP_CMD); + goto out; + } + + drv_add_twt_setup(sdata->local, sdata, &sta->sta, twt); +out: + ieee80211_s1g_send_twt_setup(sdata, mgmt->sa, sdata->vif.addr, twt); +} + +static void +ieee80211_s1g_rx_twt_teardown(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data; + + drv_twt_teardown_request(sdata->local, sdata, &sta->sta, + mgmt->u.action.u.s1g.variable[0]); +} + +static void +ieee80211_s1g_tx_twt_setup_fail(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data; + struct ieee80211_twt_setup *twt = (void *)mgmt->u.action.u.s1g.variable; + struct ieee80211_twt_params *twt_agrt = (void *)twt->params; + u8 flowid = le16_get_bits(twt_agrt->req_type, + IEEE80211_TWT_REQTYPE_FLOWID); + + drv_twt_teardown_request(sdata->local, sdata, &sta->sta, flowid); + + ieee80211_s1g_send_twt_teardown(sdata, mgmt->sa, sdata->vif.addr, + flowid); +} + +void ieee80211_s1g_rx_twt_action(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data; + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + mutex_lock(&local->sta_mtx); + + sta = sta_info_get_bss(sdata, mgmt->sa); + if (!sta) + goto out; + + switch (mgmt->u.action.u.s1g.action_code) { + case WLAN_S1G_TWT_SETUP: + ieee80211_s1g_rx_twt_setup(sdata, sta, skb); + break; + case WLAN_S1G_TWT_TEARDOWN: + ieee80211_s1g_rx_twt_teardown(sdata, sta, skb); + break; + default: + break; + } + +out: + mutex_unlock(&local->sta_mtx); +} + +void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)skb->data; + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + + mutex_lock(&local->sta_mtx); + + sta = sta_info_get_bss(sdata, mgmt->da); + if (!sta) + goto out; + + switch (mgmt->u.action.u.s1g.action_code) { + case WLAN_S1G_TWT_SETUP: + /* process failed twt setup frames */ + ieee80211_s1g_tx_twt_setup_fail(sdata, sta, skb); + break; + default: + break; + } + +out: + mutex_unlock(&local->sta_mtx); +} diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 6b50cb5e0e3c..5e6b275afc9e 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright 2016-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2020 Intel Corporation + * Copyright (C) 2018-2021 Intel Corporation */ #include <linux/if_arp.h> @@ -155,7 +155,7 @@ ieee80211_bss_info_update(struct ieee80211_local *local, }; bool signal_valid; struct ieee80211_sub_if_data *scan_sdata; - struct ieee802_11_elems elems; + struct ieee802_11_elems *elems; size_t baselen; u8 *elements; @@ -209,8 +209,10 @@ ieee80211_bss_info_update(struct ieee80211_local *local, if (baselen > len) return NULL; - ieee802_11_parse_elems(elements, len - baselen, false, &elems, - mgmt->bssid, cbss->bssid); + elems = ieee802_11_parse_elems(elements, len - baselen, false, + mgmt->bssid, cbss->bssid); + if (!elems) + return NULL; /* In case the signal is invalid update the status */ signal_valid = channel == cbss->channel; @@ -218,15 +220,17 @@ ieee80211_bss_info_update(struct ieee80211_local *local, rx_status->flag |= RX_FLAG_NO_SIGNAL_VAL; bss = (void *)cbss->priv; - ieee80211_update_bss_from_elems(local, bss, &elems, rx_status, beacon); + ieee80211_update_bss_from_elems(local, bss, elems, rx_status, beacon); list_for_each_entry(non_tx_cbss, &cbss->nontrans_list, nontrans_list) { non_tx_bss = (void *)non_tx_cbss->priv; - ieee80211_update_bss_from_elems(local, non_tx_bss, &elems, + ieee80211_update_bss_from_elems(local, non_tx_bss, elems, rx_status, beacon); } + kfree(elems); + return bss; } diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index a5505ee51229..51b49f0d3ad4 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -444,6 +444,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, switch (i) { case NL80211_BAND_2GHZ: + case NL80211_BAND_LC: /* * We use both here, even if we cannot really know for * sure the station will support both, but the only use @@ -513,6 +514,8 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->cparams.target = MS2TIME(20); sta->cparams.interval = MS2TIME(100); sta->cparams.ecn = true; + sta->cparams.ce_threshold_selector = 0; + sta->cparams.ce_threshold_mask = 0; sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr); @@ -543,7 +546,7 @@ static int sta_info_insert_check(struct sta_info *sta) return -ENETDOWN; if (WARN_ON(ether_addr_equal(sta->sta.addr, sdata->vif.addr) || - is_multicast_ether_addr(sta->sta.addr))) + !is_valid_ether_addr(sta->sta.addr))) return -EINVAL; /* The RCU read lock is required by rhashtable due to diff --git a/net/mac80211/status.c b/net/mac80211/status.c index bae321ff77f6..f6f63a0b1b72 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -305,8 +305,8 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, memset(rthdr, 0, rtap_len); rthdr->it_len = cpu_to_le16(rtap_len); rthdr->it_present = - cpu_to_le32((1 << IEEE80211_RADIOTAP_TX_FLAGS) | - (1 << IEEE80211_RADIOTAP_DATA_RETRIES)); + cpu_to_le32(BIT(IEEE80211_RADIOTAP_TX_FLAGS) | + BIT(IEEE80211_RADIOTAP_DATA_RETRIES)); pos = (unsigned char *)(rthdr + 1); /* @@ -331,7 +331,7 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, sband->bitrates[info->status.rates[0].idx].bitrate; if (legacy_rate) { - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_RATE)); *pos = DIV_ROUND_UP(legacy_rate, 5 * (1 << shift)); /* padding for tx flags */ pos += 2; @@ -358,7 +358,7 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, if (status && status->rate && (status->rate->flags & RATE_INFO_FLAGS_MCS)) { - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_MCS); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS | IEEE80211_RADIOTAP_MCS_HAVE_GI | IEEE80211_RADIOTAP_MCS_HAVE_BW; @@ -374,7 +374,7 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, (IEEE80211_RADIOTAP_VHT_KNOWN_GI | IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH); - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_VHT); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); @@ -419,7 +419,7 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, (status->rate->flags & RATE_INFO_FLAGS_HE_MCS)) { struct ieee80211_radiotap_he *he; - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_HE); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_HE)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); @@ -495,7 +495,7 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, /* IEEE80211_RADIOTAP_MCS * IEEE80211_RADIOTAP_VHT */ if (info->status.rates[0].flags & IEEE80211_TX_RC_MCS) { - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_MCS); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_MCS)); pos[0] = IEEE80211_RADIOTAP_MCS_HAVE_MCS | IEEE80211_RADIOTAP_MCS_HAVE_GI | IEEE80211_RADIOTAP_MCS_HAVE_BW; @@ -512,7 +512,7 @@ ieee80211_add_tx_radiotap_header(struct ieee80211_local *local, (IEEE80211_RADIOTAP_VHT_KNOWN_GI | IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH); - rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_VHT); + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); /* required alignment from rthdr */ pos = (u8 *)rthdr + ALIGN(pos - (u8 *)rthdr, 2); @@ -705,13 +705,26 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, /* Check to see if packet is a TDLS teardown packet */ if (ieee80211_is_data(hdr->frame_control) && (ieee80211_get_tdls_action(skb, hdr_size) == - WLAN_TDLS_TEARDOWN)) + WLAN_TDLS_TEARDOWN)) { ieee80211_tdls_td_tx_handle(local, sdata, skb, info->flags); - else + } else if (ieee80211_s1g_is_twt_setup(skb)) { + if (!acked) { + struct sk_buff *qskb; + + qskb = skb_clone(skb, GFP_ATOMIC); + if (qskb) { + skb_queue_tail(&sdata->status_queue, + qskb); + ieee80211_queue_work(&local->hw, + &sdata->work); + } + } + } else { ieee80211_mgd_conn_tx_status(sdata, hdr->frame_control, acked); + } } rcu_read_unlock(); diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 45e532ad1215..137be9ec94af 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -6,7 +6,7 @@ * Copyright 2014, Intel Corporation * Copyright 2014 Intel Mobile Communications GmbH * Copyright 2015 - 2016 Intel Deutschland GmbH - * Copyright (C) 2019 Intel Corporation + * Copyright (C) 2019, 2021 Intel Corporation */ #include <linux/ieee80211.h> @@ -1684,7 +1684,7 @@ ieee80211_process_tdls_channel_switch_resp(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; - struct ieee802_11_elems elems; + struct ieee802_11_elems *elems = NULL; struct sta_info *sta; struct ieee80211_tdls_data *tf = (void *)skb->data; bool local_initiator; @@ -1718,16 +1718,20 @@ ieee80211_process_tdls_channel_switch_resp(struct ieee80211_sub_if_data *sdata, goto call_drv; } - ieee802_11_parse_elems(tf->u.chan_switch_resp.variable, - skb->len - baselen, false, &elems, - NULL, NULL); - if (elems.parse_error) { + elems = ieee802_11_parse_elems(tf->u.chan_switch_resp.variable, + skb->len - baselen, false, NULL, NULL); + if (!elems) { + ret = -ENOMEM; + goto out; + } + + if (elems->parse_error) { tdls_dbg(sdata, "Invalid IEs in TDLS channel switch resp\n"); ret = -EINVAL; goto out; } - if (!elems.ch_sw_timing || !elems.lnk_id) { + if (!elems->ch_sw_timing || !elems->lnk_id) { tdls_dbg(sdata, "TDLS channel switch resp - missing IEs\n"); ret = -EINVAL; goto out; @@ -1735,15 +1739,15 @@ ieee80211_process_tdls_channel_switch_resp(struct ieee80211_sub_if_data *sdata, /* validate the initiator is set correctly */ local_initiator = - !memcmp(elems.lnk_id->init_sta, sdata->vif.addr, ETH_ALEN); + !memcmp(elems->lnk_id->init_sta, sdata->vif.addr, ETH_ALEN); if (local_initiator == sta->sta.tdls_initiator) { tdls_dbg(sdata, "TDLS chan switch invalid lnk-id initiator\n"); ret = -EINVAL; goto out; } - params.switch_time = le16_to_cpu(elems.ch_sw_timing->switch_time); - params.switch_timeout = le16_to_cpu(elems.ch_sw_timing->switch_timeout); + params.switch_time = le16_to_cpu(elems->ch_sw_timing->switch_time); + params.switch_timeout = le16_to_cpu(elems->ch_sw_timing->switch_timeout); params.tmpl_skb = ieee80211_tdls_ch_sw_resp_tmpl_get(sta, ¶ms.ch_sw_tm_ie); @@ -1763,6 +1767,7 @@ call_drv: out: mutex_unlock(&local->sta_mtx); dev_kfree_skb_any(params.tmpl_skb); + kfree(elems); return ret; } @@ -1771,7 +1776,7 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { struct ieee80211_local *local = sdata->local; - struct ieee802_11_elems elems; + struct ieee802_11_elems *elems; struct cfg80211_chan_def chandef; struct ieee80211_channel *chan; enum nl80211_channel_type chan_type; @@ -1831,22 +1836,27 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, return -EINVAL; } - ieee802_11_parse_elems(tf->u.chan_switch_req.variable, - skb->len - baselen, false, &elems, NULL, NULL); - if (elems.parse_error) { + elems = ieee802_11_parse_elems(tf->u.chan_switch_req.variable, + skb->len - baselen, false, NULL, NULL); + if (!elems) + return -ENOMEM; + + if (elems->parse_error) { tdls_dbg(sdata, "Invalid IEs in TDLS channel switch req\n"); - return -EINVAL; + ret = -EINVAL; + goto free; } - if (!elems.ch_sw_timing || !elems.lnk_id) { + if (!elems->ch_sw_timing || !elems->lnk_id) { tdls_dbg(sdata, "TDLS channel switch req - missing IEs\n"); - return -EINVAL; + ret = -EINVAL; + goto free; } - if (!elems.sec_chan_offs) { + if (!elems->sec_chan_offs) { chan_type = NL80211_CHAN_HT20; } else { - switch (elems.sec_chan_offs->sec_chan_offs) { + switch (elems->sec_chan_offs->sec_chan_offs) { case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: chan_type = NL80211_CHAN_HT40PLUS; break; @@ -1865,7 +1875,8 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, if (!cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &chandef, sdata->wdev.iftype)) { tdls_dbg(sdata, "TDLS chan switch to forbidden channel\n"); - return -EINVAL; + ret = -EINVAL; + goto free; } mutex_lock(&local->sta_mtx); @@ -1881,7 +1892,7 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, /* validate the initiator is set correctly */ local_initiator = - !memcmp(elems.lnk_id->init_sta, sdata->vif.addr, ETH_ALEN); + !memcmp(elems->lnk_id->init_sta, sdata->vif.addr, ETH_ALEN); if (local_initiator == sta->sta.tdls_initiator) { tdls_dbg(sdata, "TDLS chan switch invalid lnk-id initiator\n"); ret = -EINVAL; @@ -1889,16 +1900,16 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, } /* peer should have known better */ - if (!sta->sta.ht_cap.ht_supported && elems.sec_chan_offs && - elems.sec_chan_offs->sec_chan_offs) { + if (!sta->sta.ht_cap.ht_supported && elems->sec_chan_offs && + elems->sec_chan_offs->sec_chan_offs) { tdls_dbg(sdata, "TDLS chan switch - wide chan unsupported\n"); ret = -ENOTSUPP; goto out; } params.chandef = &chandef; - params.switch_time = le16_to_cpu(elems.ch_sw_timing->switch_time); - params.switch_timeout = le16_to_cpu(elems.ch_sw_timing->switch_timeout); + params.switch_time = le16_to_cpu(elems->ch_sw_timing->switch_time); + params.switch_timeout = le16_to_cpu(elems->ch_sw_timing->switch_timeout); params.tmpl_skb = ieee80211_tdls_ch_sw_resp_tmpl_get(sta, @@ -1917,6 +1928,8 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, out: mutex_unlock(&local->sta_mtx); dev_kfree_skb_any(params.tmpl_skb); +free: + kfree(elems); return ret; } diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index f6ef15366938..9e8381bef7ed 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -2825,6 +2825,73 @@ DEFINE_EVENT(sta_flag_evt, drv_sta_set_decap_offload, TP_ARGS(local, sdata, sta, enabled) ); +TRACE_EVENT(drv_add_twt_setup, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sta *sta, + struct ieee80211_twt_setup *twt, + struct ieee80211_twt_params *twt_agrt), + + TP_ARGS(local, sta, twt, twt_agrt), + + TP_STRUCT__entry( + LOCAL_ENTRY + STA_ENTRY + __field(u8, dialog_token) + __field(u8, control) + __field(__le16, req_type) + __field(__le64, twt) + __field(u8, duration) + __field(__le16, mantissa) + __field(u8, channel) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + STA_ASSIGN; + __entry->dialog_token = twt->dialog_token; + __entry->control = twt->control; + __entry->req_type = twt_agrt->req_type; + __entry->twt = twt_agrt->twt; + __entry->duration = twt_agrt->min_twt_dur; + __entry->mantissa = twt_agrt->mantissa; + __entry->channel = twt_agrt->channel; + ), + + TP_printk( + LOCAL_PR_FMT STA_PR_FMT + " token:%d control:0x%02x req_type:0x%04x" + " twt:%llu duration:%d mantissa:%d channel:%d", + LOCAL_PR_ARG, STA_PR_ARG, __entry->dialog_token, + __entry->control, le16_to_cpu(__entry->req_type), + le64_to_cpu(__entry->twt), __entry->duration, + le16_to_cpu(__entry->mantissa), __entry->channel + ) +); + +TRACE_EVENT(drv_twt_teardown_request, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sta *sta, u8 flowid), + + TP_ARGS(local, sta, flowid), + + TP_STRUCT__entry( + LOCAL_ENTRY + STA_ENTRY + __field(u8, flowid) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + STA_ASSIGN; + __entry->flowid = flowid; + ), + + TP_printk( + LOCAL_PR_FMT STA_PR_FMT " flowid:%d", + LOCAL_PR_ARG, STA_PR_ARG, __entry->flowid + ) +); + #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 8509778ff31f..a756a197c770 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -146,7 +146,8 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, rate = DIV_ROUND_UP(r->bitrate, 1 << shift); switch (sband->band) { - case NL80211_BAND_2GHZ: { + case NL80211_BAND_2GHZ: + case NL80211_BAND_LC: { u32 flag; if (tx->sdata->flags & IEEE80211_SDATA_OPERATING_GMODE) flag = IEEE80211_RATE_MANDATORY_G; @@ -2209,7 +2210,11 @@ bool ieee80211_parse_tx_radiotap(struct sk_buff *skb, } vht_mcs = iterator.this_arg[4] >> 4; + if (vht_mcs > 11) + vht_mcs = 0; vht_nss = iterator.this_arg[4] & 0xF; + if (!vht_nss || vht_nss > 8) + vht_nss = 1; break; /* @@ -3242,7 +3247,9 @@ static bool ieee80211_amsdu_prepare_head(struct ieee80211_sub_if_data *sdata, if (info->control.flags & IEEE80211_TX_CTRL_AMSDU) return true; - if (!ieee80211_amsdu_realloc_pad(local, skb, sizeof(*amsdu_hdr))) + if (!ieee80211_amsdu_realloc_pad(local, skb, + sizeof(*amsdu_hdr) + + local->hw.extra_tx_headroom)) return false; data = skb_push(skb, sizeof(*amsdu_hdr)); @@ -3378,6 +3385,14 @@ static bool ieee80211_amsdu_aggregate(struct ieee80211_sub_if_data *sdata, if (!ieee80211_amsdu_prepare_head(sdata, fast_tx, head)) goto out; + /* If n == 2, the "while (*frag_tail)" loop above didn't execute + * and frag_tail should be &skb_shinfo(head)->frag_list. + * However, ieee80211_amsdu_prepare_head() can reallocate it. + * Reload frag_tail to have it pointing to the correct place. + */ + if (n == 2) + frag_tail = &skb_shinfo(head)->frag_list; + /* * Pad out the previous subframe to a multiple of 4 by adding the * padding to the next one, that's being added. Note that head->len @@ -4782,11 +4797,11 @@ static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata, struct beacon_data *beacon) { + u8 *beacon_data, count, max_count = 1; struct probe_resp *resp; - u8 *beacon_data; size_t beacon_data_len; + u16 *bcn_offsets; int i; - u8 count = beacon->cntdwn_current_counter; switch (sdata->vif.type) { case NL80211_IFTYPE_AP: @@ -4806,21 +4821,27 @@ static void ieee80211_set_beacon_cntdwn(struct ieee80211_sub_if_data *sdata, } rcu_read_lock(); - for (i = 0; i < IEEE80211_MAX_CNTDWN_COUNTERS_NUM; ++i) { - resp = rcu_dereference(sdata->u.ap.probe_resp); + resp = rcu_dereference(sdata->u.ap.probe_resp); - if (beacon->cntdwn_counter_offsets[i]) { - if (WARN_ON_ONCE(beacon->cntdwn_counter_offsets[i] >= - beacon_data_len)) { + bcn_offsets = beacon->cntdwn_counter_offsets; + count = beacon->cntdwn_current_counter; + if (sdata->vif.csa_active) + max_count = IEEE80211_MAX_CNTDWN_COUNTERS_NUM; + + for (i = 0; i < max_count; ++i) { + if (bcn_offsets[i]) { + if (WARN_ON_ONCE(bcn_offsets[i] >= beacon_data_len)) { rcu_read_unlock(); return; } - - beacon_data[beacon->cntdwn_counter_offsets[i]] = count; + beacon_data[bcn_offsets[i]] = count; } - if (sdata->vif.type == NL80211_IFTYPE_AP && resp) - resp->data[resp->cntdwn_counter_offsets[i]] = count; + if (sdata->vif.type == NL80211_IFTYPE_AP && resp) { + u16 *resp_offsets = resp->cntdwn_counter_offsets; + + resp->data[resp_offsets[i]] = count; + } } rcu_read_unlock(); } @@ -4971,6 +4992,115 @@ static int ieee80211_beacon_protect(struct sk_buff *skb, return 0; } +static void +ieee80211_beacon_get_finish(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_mutable_offsets *offs, + struct beacon_data *beacon, + struct sk_buff *skb, + struct ieee80211_chanctx_conf *chanctx_conf, + u16 csa_off_base) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_tx_info *info; + enum nl80211_band band; + struct ieee80211_tx_rate_control txrc; + + /* CSA offsets */ + if (offs && beacon) { + u16 i; + + for (i = 0; i < IEEE80211_MAX_CNTDWN_COUNTERS_NUM; i++) { + u16 csa_off = beacon->cntdwn_counter_offsets[i]; + + if (!csa_off) + continue; + + offs->cntdwn_counter_offs[i] = csa_off_base + csa_off; + } + } + + band = chanctx_conf->def.chan->band; + info = IEEE80211_SKB_CB(skb); + info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + info->flags |= IEEE80211_TX_CTL_NO_ACK; + info->band = band; + + memset(&txrc, 0, sizeof(txrc)); + txrc.hw = hw; + txrc.sband = local->hw.wiphy->bands[band]; + txrc.bss_conf = &sdata->vif.bss_conf; + txrc.skb = skb; + txrc.reported_rate.idx = -1; + if (sdata->beacon_rate_set && sdata->beacon_rateidx_mask[band]) + txrc.rate_idx_mask = sdata->beacon_rateidx_mask[band]; + else + txrc.rate_idx_mask = sdata->rc_rateidx_mask[band]; + txrc.bss = true; + rate_control_get_rate(sdata, NULL, &txrc); + + info->control.vif = vif; + info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT | + IEEE80211_TX_CTL_ASSIGN_SEQ | + IEEE80211_TX_CTL_FIRST_FRAGMENT; +} + +static struct sk_buff * +ieee80211_beacon_get_ap(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_mutable_offsets *offs, + bool is_template, + struct beacon_data *beacon, + struct ieee80211_chanctx_conf *chanctx_conf) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_if_ap *ap = &sdata->u.ap; + struct sk_buff *skb = NULL; + u16 csa_off_base = 0; + + if (beacon->cntdwn_counter_offsets[0]) { + if (!is_template) + ieee80211_beacon_update_cntdwn(vif); + + ieee80211_set_beacon_cntdwn(sdata, beacon); + } + + /* headroom, head length, + * tail length and maximum TIM length + */ + skb = dev_alloc_skb(local->tx_headroom + beacon->head_len + + beacon->tail_len + 256 + + local->hw.extra_beacon_tailroom); + if (!skb) + return NULL; + + skb_reserve(skb, local->tx_headroom); + skb_put_data(skb, beacon->head, beacon->head_len); + + ieee80211_beacon_add_tim(sdata, &ap->ps, skb, is_template); + + if (offs) { + offs->tim_offset = beacon->head_len; + offs->tim_length = skb->len - beacon->head_len; + offs->cntdwn_counter_offs[0] = beacon->cntdwn_counter_offsets[0]; + + /* for AP the csa offsets are from tail */ + csa_off_base = skb->len; + } + + if (beacon->tail) + skb_put_data(skb, beacon->tail, beacon->tail_len); + + if (ieee80211_beacon_protect(skb, local, sdata) < 0) + return NULL; + + ieee80211_beacon_get_finish(hw, vif, offs, beacon, skb, chanctx_conf, + csa_off_base); + return skb; +} + static struct sk_buff * __ieee80211_beacon_get(struct ieee80211_hw *hw, struct ieee80211_vif *vif, @@ -4980,12 +5110,8 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, struct ieee80211_local *local = hw_to_local(hw); struct beacon_data *beacon = NULL; struct sk_buff *skb = NULL; - struct ieee80211_tx_info *info; struct ieee80211_sub_if_data *sdata = NULL; - enum nl80211_band band; - struct ieee80211_tx_rate_control txrc; struct ieee80211_chanctx_conf *chanctx_conf; - int csa_off_base = 0; rcu_read_lock(); @@ -5002,47 +5128,11 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, struct ieee80211_if_ap *ap = &sdata->u.ap; beacon = rcu_dereference(ap->beacon); - if (beacon) { - if (beacon->cntdwn_counter_offsets[0]) { - if (!is_template) - ieee80211_beacon_update_cntdwn(vif); - - ieee80211_set_beacon_cntdwn(sdata, beacon); - } - - /* - * headroom, head length, - * tail length and maximum TIM length - */ - skb = dev_alloc_skb(local->tx_headroom + - beacon->head_len + - beacon->tail_len + 256 + - local->hw.extra_beacon_tailroom); - if (!skb) - goto out; - - skb_reserve(skb, local->tx_headroom); - skb_put_data(skb, beacon->head, beacon->head_len); - - ieee80211_beacon_add_tim(sdata, &ap->ps, skb, - is_template); - - if (offs) { - offs->tim_offset = beacon->head_len; - offs->tim_length = skb->len - beacon->head_len; - - /* for AP the csa offsets are from tail */ - csa_off_base = skb->len; - } - - if (beacon->tail) - skb_put_data(skb, beacon->tail, - beacon->tail_len); - - if (ieee80211_beacon_protect(skb, local, sdata) < 0) - goto out; - } else + if (!beacon) goto out; + + skb = ieee80211_beacon_get_ap(hw, vif, offs, is_template, + beacon, chanctx_conf); } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { struct ieee80211_if_ibss *ifibss = &sdata->u.ibss; struct ieee80211_hdr *hdr; @@ -5068,6 +5158,9 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, hdr = (struct ieee80211_hdr *) skb->data; hdr->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON); + + ieee80211_beacon_get_finish(hw, vif, offs, beacon, skb, + chanctx_conf, 0); } else if (ieee80211_vif_is_mesh(&sdata->vif)) { struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; @@ -5107,51 +5200,13 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, } skb_put_data(skb, beacon->tail, beacon->tail_len); + ieee80211_beacon_get_finish(hw, vif, offs, beacon, skb, + chanctx_conf, 0); } else { WARN_ON(1); goto out; } - /* CSA offsets */ - if (offs && beacon) { - int i; - - for (i = 0; i < IEEE80211_MAX_CNTDWN_COUNTERS_NUM; i++) { - u16 csa_off = beacon->cntdwn_counter_offsets[i]; - - if (!csa_off) - continue; - - offs->cntdwn_counter_offs[i] = csa_off_base + csa_off; - } - } - - band = chanctx_conf->def.chan->band; - - info = IEEE80211_SKB_CB(skb); - - info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; - info->flags |= IEEE80211_TX_CTL_NO_ACK; - info->band = band; - - memset(&txrc, 0, sizeof(txrc)); - txrc.hw = hw; - txrc.sband = local->hw.wiphy->bands[band]; - txrc.bss_conf = &sdata->vif.bss_conf; - txrc.skb = skb; - txrc.reported_rate.idx = -1; - if (sdata->beacon_rate_set && sdata->beacon_rateidx_mask[band]) - txrc.rate_idx_mask = sdata->beacon_rateidx_mask[band]; - else - txrc.rate_idx_mask = sdata->rc_rateidx_mask[band]; - txrc.bss = true; - rate_control_get_rate(sdata, NULL, &txrc); - - info->control.vif = vif; - - info->flags |= IEEE80211_TX_CTL_CLEAR_PS_FILT | - IEEE80211_TX_CTL_ASSIGN_SEQ | - IEEE80211_TX_CTL_FIRST_FRAGMENT; out: rcu_read_unlock(); return skb; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 05e96212b104..39fa2a50385d 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1112,10 +1112,6 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, } else elem_parse_failed = true; break; - case WLAN_EID_CHALLENGE: - elems->challenge = pos; - elems->challenge_len = elen; - break; case WLAN_EID_VENDOR_SPECIFIC: if (elen >= 4 && pos[0] == 0x00 && pos[1] == 0x50 && pos[2] == 0xf2) { @@ -1336,6 +1332,18 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, elems->rsnx = pos; elems->rsnx_len = elen; break; + case WLAN_EID_TX_POWER_ENVELOPE: + if (elen < 1 || + elen > sizeof(struct ieee80211_tx_pwr_env)) + break; + + if (elems->tx_pwr_env_num >= ARRAY_SIZE(elems->tx_pwr_env)) + break; + + elems->tx_pwr_env[elems->tx_pwr_env_num] = (void *)pos; + elems->tx_pwr_env_len[elems->tx_pwr_env_num] = elen; + elems->tx_pwr_env_num++; + break; case WLAN_EID_EXTENSION: ieee80211_parse_extension_element(calc_crc ? &crc : NULL, @@ -1383,8 +1391,8 @@ _ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, static size_t ieee802_11_find_bssid_profile(const u8 *start, size_t len, struct ieee802_11_elems *elems, - u8 *transmitter_bssid, - u8 *bss_bssid, + const u8 *transmitter_bssid, + const u8 *bss_bssid, u8 *nontransmitted_profile) { const struct element *elem, *sub; @@ -1449,16 +1457,20 @@ static size_t ieee802_11_find_bssid_profile(const u8 *start, size_t len, return found ? profile_len : 0; } -u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, - struct ieee802_11_elems *elems, - u64 filter, u32 crc, u8 *transmitter_bssid, - u8 *bss_bssid) +struct ieee802_11_elems *ieee802_11_parse_elems_crc(const u8 *start, size_t len, + bool action, u64 filter, + u32 crc, + const u8 *transmitter_bssid, + const u8 *bss_bssid) { + struct ieee802_11_elems *elems; const struct element *non_inherit = NULL; u8 *nontransmitted_profile; int nontransmitted_profile_len = 0; - memset(elems, 0, sizeof(*elems)); + elems = kzalloc(sizeof(*elems), GFP_ATOMIC); + if (!elems) + return NULL; elems->ie_start = start; elems->total_len = len; @@ -1504,7 +1516,9 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, kfree(nontransmitted_profile); - return crc; + elems->crc = crc; + + return elems; } void ieee80211_regulatory_limit_wmm_params(struct ieee80211_sub_if_data *sdata, @@ -3371,6 +3385,7 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, const struct ieee80211_sta_he_cap *he_cap; struct cfg80211_chan_def he_chandef = *chandef; const struct ieee80211_he_6ghz_oper *he_6ghz_oper; + struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; bool support_80_80, support_160; u8 he_phy_cap; u32 freq; @@ -3415,6 +3430,19 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, he_chandef.chan = ieee80211_get_channel(sdata->local->hw.wiphy, freq); switch (u8_get_bits(he_6ghz_oper->control, + IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { + case IEEE80211_6GHZ_CTRL_REG_LPI_AP: + bss_conf->power_type = IEEE80211_REG_LPI_AP; + break; + case IEEE80211_6GHZ_CTRL_REG_SP_AP: + bss_conf->power_type = IEEE80211_REG_SP_AP; + break; + default: + bss_conf->power_type = IEEE80211_REG_UNSET_AP; + break; + } + + switch (u8_get_bits(he_6ghz_oper->control, IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH)) { case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ: he_chandef.width = NL80211_CHAN_WIDTH_20; diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index bca47fad5a16..4eed23e27610 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -520,6 +520,9 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, return RX_DROP_UNUSABLE; } + /* reload hdr - skb might have been reallocated */ + hdr = (void *)rx->skb->data; + data_len = skb->len - hdrlen - IEEE80211_CCMP_HDR_LEN - mic_len; if (!rx->sta || data_len < 0) return RX_DROP_UNUSABLE; @@ -749,6 +752,9 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) return RX_DROP_UNUSABLE; } + /* reload hdr - skb might have been reallocated */ + hdr = (void *)rx->skb->data; + data_len = skb->len - hdrlen - IEEE80211_GCMP_HDR_LEN - mic_len; if (!rx->sta || data_len < 0) return RX_DROP_UNUSABLE; diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 1cf5ac09edcb..500ed1b81250 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -129,15 +129,14 @@ static int mac802154_wpan_mac_addr(struct net_device *dev, void *p) if (!ieee802154_is_valid_extended_unicast_addr(extended_addr)) return -EINVAL; - memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + dev_addr_set(dev, addr->sa_data); sdata->wpan_dev.extended_addr = extended_addr; /* update lowpan interface mac address when * wpan mac has been changed */ if (sdata->wpan_dev.lowpan_dev) - memcpy(sdata->wpan_dev.lowpan_dev->dev_addr, dev->dev_addr, - dev->addr_len); + dev_addr_set(sdata->wpan_dev.lowpan_dev, dev->dev_addr); return mac802154_wpan_update_llsec(dev); } @@ -615,9 +614,10 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, unsigned char name_assign_type, enum nl802154_iftype type, __le64 extended_addr) { + u8 addr[IEEE802154_EXTENDED_ADDR_LEN]; struct net_device *ndev = NULL; struct ieee802154_sub_if_data *sdata = NULL; - int ret = -ENOMEM; + int ret; ASSERT_RTNL(); @@ -638,11 +638,12 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, switch (type) { case NL802154_IFTYPE_NODE: ndev->type = ARPHRD_IEEE802154; - if (ieee802154_is_valid_extended_unicast_addr(extended_addr)) - ieee802154_le64_to_be64(ndev->dev_addr, &extended_addr); - else - memcpy(ndev->dev_addr, ndev->perm_addr, - IEEE802154_EXTENDED_ADDR_LEN); + if (ieee802154_is_valid_extended_unicast_addr(extended_addr)) { + ieee802154_le64_to_be64(addr, &extended_addr); + dev_addr_set(ndev, addr); + } else { + dev_addr_set(ndev, ndev->perm_addr); + } break; case NL802154_IFTYPE_MONITOR: ndev->type = ARPHRD_IEEE802154_MONITOR; diff --git a/net/mctp/Kconfig b/net/mctp/Kconfig new file mode 100644 index 000000000000..3a5c0e70da77 --- /dev/null +++ b/net/mctp/Kconfig @@ -0,0 +1,23 @@ + +menuconfig MCTP + depends on NET + bool "MCTP core protocol support" + help + Management Component Transport Protocol (MCTP) is an in-system + protocol for communicating between management controllers and + their managed devices (peripherals, host processors, etc.). The + protocol is defined by DMTF specification DSP0236. + + This option enables core MCTP support. For communicating with other + devices, you'll want to enable a driver for a specific hardware + channel. + +config MCTP_TEST + bool "MCTP core tests" if !KUNIT_ALL_TESTS + depends on MCTP=y && KUNIT=y + default KUNIT_ALL_TESTS + +config MCTP_FLOWS + bool + depends on MCTP + select SKB_EXTENSIONS diff --git a/net/mctp/Makefile b/net/mctp/Makefile new file mode 100644 index 000000000000..6cd55233e685 --- /dev/null +++ b/net/mctp/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_MCTP) += mctp.o +mctp-objs := af_mctp.o device.o route.o neigh.o + +# tests +obj-$(CONFIG_MCTP_TEST) += test/utils.o diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c new file mode 100644 index 000000000000..d344b02a1cde --- /dev/null +++ b/net/mctp/af_mctp.c @@ -0,0 +1,511 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Management Component Transport Protocol (MCTP) + * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#include <linux/if_arp.h> +#include <linux/net.h> +#include <linux/mctp.h> +#include <linux/module.h> +#include <linux/socket.h> + +#include <net/mctp.h> +#include <net/mctpdevice.h> +#include <net/sock.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/mctp.h> + +/* socket implementation */ + +static int mctp_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk) { + sock->sk = NULL; + sk->sk_prot->close(sk, 0); + } + + return 0; +} + +static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) +{ + struct sock *sk = sock->sk; + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + struct sockaddr_mctp *smctp; + int rc; + + if (addrlen < sizeof(*smctp)) + return -EINVAL; + + if (addr->sa_family != AF_MCTP) + return -EAFNOSUPPORT; + + if (!capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + /* it's a valid sockaddr for MCTP, cast and do protocol checks */ + smctp = (struct sockaddr_mctp *)addr; + + lock_sock(sk); + + /* TODO: allow rebind */ + if (sk_hashed(sk)) { + rc = -EADDRINUSE; + goto out_release; + } + msk->bind_net = smctp->smctp_network; + msk->bind_addr = smctp->smctp_addr.s_addr; + msk->bind_type = smctp->smctp_type & 0x7f; /* ignore the IC bit */ + + rc = sk->sk_prot->hash(sk); + +out_release: + release_sock(sk); + + return rc; +} + +static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) +{ + DECLARE_SOCKADDR(struct sockaddr_mctp *, addr, msg->msg_name); + const int hlen = MCTP_HEADER_MAXLEN + sizeof(struct mctp_hdr); + int rc, addrlen = msg->msg_namelen; + struct sock *sk = sock->sk; + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + struct mctp_skb_cb *cb; + struct mctp_route *rt; + struct sk_buff *skb; + + if (addr) { + if (addrlen < sizeof(struct sockaddr_mctp)) + return -EINVAL; + if (addr->smctp_family != AF_MCTP) + return -EINVAL; + if (addr->smctp_tag & ~(MCTP_TAG_MASK | MCTP_TAG_OWNER)) + return -EINVAL; + + } else { + /* TODO: connect()ed sockets */ + return -EDESTADDRREQ; + } + + if (!capable(CAP_NET_RAW)) + return -EACCES; + + if (addr->smctp_network == MCTP_NET_ANY) + addr->smctp_network = mctp_default_net(sock_net(sk)); + + skb = sock_alloc_send_skb(sk, hlen + 1 + len, + msg->msg_flags & MSG_DONTWAIT, &rc); + if (!skb) + return rc; + + skb_reserve(skb, hlen); + + /* set type as fist byte in payload */ + *(u8 *)skb_put(skb, 1) = addr->smctp_type; + + rc = memcpy_from_msg((void *)skb_put(skb, len), msg, len); + if (rc < 0) + goto err_free; + + /* set up cb */ + cb = __mctp_cb(skb); + cb->net = addr->smctp_network; + + /* direct addressing */ + if (msk->addr_ext && addrlen >= sizeof(struct sockaddr_mctp_ext)) { + DECLARE_SOCKADDR(struct sockaddr_mctp_ext *, + extaddr, msg->msg_name); + + if (extaddr->smctp_halen > sizeof(cb->haddr)) { + rc = -EINVAL; + goto err_free; + } + + cb->ifindex = extaddr->smctp_ifindex; + cb->halen = extaddr->smctp_halen; + memcpy(cb->haddr, extaddr->smctp_haddr, cb->halen); + + rt = NULL; + } else { + rt = mctp_route_lookup(sock_net(sk), addr->smctp_network, + addr->smctp_addr.s_addr); + if (!rt) { + rc = -EHOSTUNREACH; + goto err_free; + } + } + + rc = mctp_local_output(sk, rt, skb, addr->smctp_addr.s_addr, + addr->smctp_tag); + + return rc ? : len; + +err_free: + kfree_skb(skb); + return rc; +} + +static int mctp_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + DECLARE_SOCKADDR(struct sockaddr_mctp *, addr, msg->msg_name); + struct sock *sk = sock->sk; + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + struct sk_buff *skb; + size_t msglen; + u8 type; + int rc; + + if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK)) + return -EOPNOTSUPP; + + skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &rc); + if (!skb) + return rc; + + if (!skb->len) { + rc = 0; + goto out_free; + } + + /* extract message type, remove from data */ + type = *((u8 *)skb->data); + msglen = skb->len - 1; + + if (len < msglen) + msg->msg_flags |= MSG_TRUNC; + else + len = msglen; + + rc = skb_copy_datagram_msg(skb, 1, msg, len); + if (rc < 0) + goto out_free; + + sock_recv_ts_and_drops(msg, sk, skb); + + if (addr) { + struct mctp_skb_cb *cb = mctp_cb(skb); + /* TODO: expand mctp_skb_cb for header fields? */ + struct mctp_hdr *hdr = mctp_hdr(skb); + + addr = msg->msg_name; + addr->smctp_family = AF_MCTP; + addr->smctp_network = cb->net; + addr->smctp_addr.s_addr = hdr->src; + addr->smctp_type = type; + addr->smctp_tag = hdr->flags_seq_tag & + (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); + msg->msg_namelen = sizeof(*addr); + + if (msk->addr_ext) { + DECLARE_SOCKADDR(struct sockaddr_mctp_ext *, ae, + msg->msg_name); + msg->msg_namelen = sizeof(*ae); + ae->smctp_ifindex = cb->ifindex; + ae->smctp_halen = cb->halen; + memset(ae->smctp_haddr, 0x0, sizeof(ae->smctp_haddr)); + memcpy(ae->smctp_haddr, cb->haddr, cb->halen); + } + } + + rc = len; + + if (flags & MSG_TRUNC) + rc = msglen; + +out_free: + skb_free_datagram(sk, skb); + return rc; +} + +static int mctp_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct mctp_sock *msk = container_of(sock->sk, struct mctp_sock, sk); + int val; + + if (level != SOL_MCTP) + return -EINVAL; + + if (optname == MCTP_OPT_ADDR_EXT) { + if (optlen != sizeof(int)) + return -EINVAL; + if (copy_from_sockptr(&val, optval, sizeof(int))) + return -EFAULT; + msk->addr_ext = val; + return 0; + } + + return -ENOPROTOOPT; +} + +static int mctp_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct mctp_sock *msk = container_of(sock->sk, struct mctp_sock, sk); + int len, val; + + if (level != SOL_MCTP) + return -EINVAL; + + if (get_user(len, optlen)) + return -EFAULT; + + if (optname == MCTP_OPT_ADDR_EXT) { + if (len != sizeof(int)) + return -EINVAL; + val = !!msk->addr_ext; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + return 0; + } + + return -EINVAL; +} + +static const struct proto_ops mctp_dgram_ops = { + .family = PF_MCTP, + .release = mctp_release, + .bind = mctp_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .poll = datagram_poll, + .ioctl = sock_no_ioctl, + .gettstamp = sock_gettstamp, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = mctp_setsockopt, + .getsockopt = mctp_getsockopt, + .sendmsg = mctp_sendmsg, + .recvmsg = mctp_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static void mctp_sk_expire_keys(struct timer_list *timer) +{ + struct mctp_sock *msk = container_of(timer, struct mctp_sock, + key_expiry); + struct net *net = sock_net(&msk->sk); + unsigned long next_expiry, flags; + struct mctp_sk_key *key; + struct hlist_node *tmp; + bool next_expiry_valid = false; + + spin_lock_irqsave(&net->mctp.keys_lock, flags); + + hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) { + spin_lock(&key->lock); + + if (!time_after_eq(key->expiry, jiffies)) { + trace_mctp_key_release(key, MCTP_TRACE_KEY_TIMEOUT); + key->valid = false; + hlist_del_rcu(&key->hlist); + hlist_del_rcu(&key->sklist); + spin_unlock(&key->lock); + mctp_key_unref(key); + continue; + } + + if (next_expiry_valid) { + if (time_before(key->expiry, next_expiry)) + next_expiry = key->expiry; + } else { + next_expiry = key->expiry; + next_expiry_valid = true; + } + spin_unlock(&key->lock); + } + + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + + if (next_expiry_valid) + mod_timer(timer, next_expiry); +} + +static int mctp_sk_init(struct sock *sk) +{ + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + + INIT_HLIST_HEAD(&msk->keys); + timer_setup(&msk->key_expiry, mctp_sk_expire_keys, 0); + return 0; +} + +static void mctp_sk_close(struct sock *sk, long timeout) +{ + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + + del_timer_sync(&msk->key_expiry); + sk_common_release(sk); +} + +static int mctp_sk_hash(struct sock *sk) +{ + struct net *net = sock_net(sk); + + mutex_lock(&net->mctp.bind_lock); + sk_add_node_rcu(sk, &net->mctp.binds); + mutex_unlock(&net->mctp.bind_lock); + + return 0; +} + +static void mctp_sk_unhash(struct sock *sk) +{ + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + struct net *net = sock_net(sk); + struct mctp_sk_key *key; + struct hlist_node *tmp; + unsigned long flags; + + /* remove from any type-based binds */ + mutex_lock(&net->mctp.bind_lock); + sk_del_node_init_rcu(sk); + mutex_unlock(&net->mctp.bind_lock); + + /* remove tag allocations */ + spin_lock_irqsave(&net->mctp.keys_lock, flags); + hlist_for_each_entry_safe(key, tmp, &msk->keys, sklist) { + hlist_del(&key->sklist); + hlist_del(&key->hlist); + + trace_mctp_key_release(key, MCTP_TRACE_KEY_CLOSED); + + spin_lock(&key->lock); + if (key->reasm_head) + kfree_skb(key->reasm_head); + key->reasm_head = NULL; + key->reasm_dead = true; + key->valid = false; + spin_unlock(&key->lock); + + /* key is no longer on the lookup lists, unref */ + mctp_key_unref(key); + } + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); +} + +static struct proto mctp_proto = { + .name = "MCTP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct mctp_sock), + .init = mctp_sk_init, + .close = mctp_sk_close, + .hash = mctp_sk_hash, + .unhash = mctp_sk_unhash, +}; + +static int mctp_pf_create(struct net *net, struct socket *sock, + int protocol, int kern) +{ + const struct proto_ops *ops; + struct proto *proto; + struct sock *sk; + int rc; + + if (protocol) + return -EPROTONOSUPPORT; + + /* only datagram sockets are supported */ + if (sock->type != SOCK_DGRAM) + return -ESOCKTNOSUPPORT; + + proto = &mctp_proto; + ops = &mctp_dgram_ops; + + sock->state = SS_UNCONNECTED; + sock->ops = ops; + + sk = sk_alloc(net, PF_MCTP, GFP_KERNEL, proto, kern); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + + rc = 0; + if (sk->sk_prot->init) + rc = sk->sk_prot->init(sk); + + if (rc) + goto err_sk_put; + + return 0; + +err_sk_put: + sock_orphan(sk); + sock_put(sk); + return rc; +} + +static struct net_proto_family mctp_pf = { + .family = PF_MCTP, + .create = mctp_pf_create, + .owner = THIS_MODULE, +}; + +static __init int mctp_init(void) +{ + int rc; + + /* ensure our uapi tag definitions match the header format */ + BUILD_BUG_ON(MCTP_TAG_OWNER != MCTP_HDR_FLAG_TO); + BUILD_BUG_ON(MCTP_TAG_MASK != MCTP_HDR_TAG_MASK); + + pr_info("mctp: management component transport protocol core\n"); + + rc = sock_register(&mctp_pf); + if (rc) + return rc; + + rc = proto_register(&mctp_proto, 0); + if (rc) + goto err_unreg_sock; + + rc = mctp_routes_init(); + if (rc) + goto err_unreg_proto; + + rc = mctp_neigh_init(); + if (rc) + goto err_unreg_proto; + + mctp_device_init(); + + return 0; + +err_unreg_proto: + proto_unregister(&mctp_proto); +err_unreg_sock: + sock_unregister(PF_MCTP); + + return rc; +} + +static __exit void mctp_exit(void) +{ + mctp_device_exit(); + mctp_neigh_exit(); + mctp_routes_exit(); + proto_unregister(&mctp_proto); + sock_unregister(PF_MCTP); +} + +subsys_initcall(mctp_init); +module_exit(mctp_exit); + +MODULE_DESCRIPTION("MCTP core"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Jeremy Kerr <jk@codeconstruct.com.au>"); + +MODULE_ALIAS_NETPROTO(PF_MCTP); diff --git a/net/mctp/device.c b/net/mctp/device.c new file mode 100644 index 000000000000..8799ee77e7b7 --- /dev/null +++ b/net/mctp/device.c @@ -0,0 +1,501 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Management Component Transport Protocol (MCTP) - device implementation. + * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#include <linux/if_link.h> +#include <linux/mctp.h> +#include <linux/netdevice.h> +#include <linux/rcupdate.h> +#include <linux/rtnetlink.h> + +#include <net/addrconf.h> +#include <net/netlink.h> +#include <net/mctp.h> +#include <net/mctpdevice.h> +#include <net/sock.h> + +struct mctp_dump_cb { + int h; + int idx; + size_t a_idx; +}; + +/* unlocked: caller must hold rcu_read_lock */ +struct mctp_dev *__mctp_dev_get(const struct net_device *dev) +{ + return rcu_dereference(dev->mctp_ptr); +} + +struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev) +{ + return rtnl_dereference(dev->mctp_ptr); +} + +static int mctp_fill_addrinfo(struct sk_buff *skb, struct netlink_callback *cb, + struct mctp_dev *mdev, mctp_eid_t eid) +{ + struct ifaddrmsg *hdr; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + RTM_NEWADDR, sizeof(*hdr), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + hdr = nlmsg_data(nlh); + hdr->ifa_family = AF_MCTP; + hdr->ifa_prefixlen = 0; + hdr->ifa_flags = 0; + hdr->ifa_scope = 0; + hdr->ifa_index = mdev->dev->ifindex; + + if (nla_put_u8(skb, IFA_LOCAL, eid)) + goto cancel; + + if (nla_put_u8(skb, IFA_ADDRESS, eid)) + goto cancel; + + nlmsg_end(skb, nlh); + + return 0; + +cancel: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mctp_dump_dev_addrinfo(struct mctp_dev *mdev, struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct mctp_dump_cb *mcb = (void *)cb->ctx; + int rc = 0; + + for (; mcb->a_idx < mdev->num_addrs; mcb->a_idx++) { + rc = mctp_fill_addrinfo(skb, cb, mdev, mdev->addrs[mcb->a_idx]); + if (rc < 0) + break; + } + + return rc; +} + +static int mctp_dump_addrinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct mctp_dump_cb *mcb = (void *)cb->ctx; + struct net *net = sock_net(skb->sk); + struct hlist_head *head; + struct net_device *dev; + struct ifaddrmsg *hdr; + struct mctp_dev *mdev; + int ifindex; + int idx, rc; + + hdr = nlmsg_data(cb->nlh); + // filter by ifindex if requested + ifindex = hdr->ifa_index; + + rcu_read_lock(); + for (; mcb->h < NETDEV_HASHENTRIES; mcb->h++, mcb->idx = 0) { + idx = 0; + head = &net->dev_index_head[mcb->h]; + hlist_for_each_entry_rcu(dev, head, index_hlist) { + if (idx >= mcb->idx && + (ifindex == 0 || ifindex == dev->ifindex)) { + mdev = __mctp_dev_get(dev); + if (mdev) { + rc = mctp_dump_dev_addrinfo(mdev, + skb, cb); + // Error indicates full buffer, this + // callback will get retried. + if (rc < 0) + goto out; + } + } + idx++; + // reset for next iteration + mcb->a_idx = 0; + } + } +out: + rcu_read_unlock(); + mcb->idx = idx; + + return skb->len; +} + +static const struct nla_policy ifa_mctp_policy[IFA_MAX + 1] = { + [IFA_ADDRESS] = { .type = NLA_U8 }, + [IFA_LOCAL] = { .type = NLA_U8 }, +}; + +static int mctp_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFA_MAX + 1]; + struct net_device *dev; + struct mctp_addr *addr; + struct mctp_dev *mdev; + struct ifaddrmsg *ifm; + unsigned long flags; + u8 *tmp_addrs; + int rc; + + rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_mctp_policy, + extack); + if (rc < 0) + return rc; + + ifm = nlmsg_data(nlh); + + if (tb[IFA_LOCAL]) + addr = nla_data(tb[IFA_LOCAL]); + else if (tb[IFA_ADDRESS]) + addr = nla_data(tb[IFA_ADDRESS]); + else + return -EINVAL; + + /* find device */ + dev = __dev_get_by_index(net, ifm->ifa_index); + if (!dev) + return -ENODEV; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODEV; + + if (!mctp_address_ok(addr->s_addr)) + return -EINVAL; + + /* Prevent duplicates. Under RTNL so don't need to lock for reading */ + if (memchr(mdev->addrs, addr->s_addr, mdev->num_addrs)) + return -EEXIST; + + tmp_addrs = kmalloc(mdev->num_addrs + 1, GFP_KERNEL); + if (!tmp_addrs) + return -ENOMEM; + memcpy(tmp_addrs, mdev->addrs, mdev->num_addrs); + tmp_addrs[mdev->num_addrs] = addr->s_addr; + + /* Lock to write */ + spin_lock_irqsave(&mdev->addrs_lock, flags); + mdev->num_addrs++; + swap(mdev->addrs, tmp_addrs); + spin_unlock_irqrestore(&mdev->addrs_lock, flags); + + kfree(tmp_addrs); + + mctp_route_add_local(mdev, addr->s_addr); + + return 0; +} + +static int mctp_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[IFA_MAX + 1]; + struct net_device *dev; + struct mctp_addr *addr; + struct mctp_dev *mdev; + struct ifaddrmsg *ifm; + unsigned long flags; + u8 *pos; + int rc; + + rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_mctp_policy, + extack); + if (rc < 0) + return rc; + + ifm = nlmsg_data(nlh); + + if (tb[IFA_LOCAL]) + addr = nla_data(tb[IFA_LOCAL]); + else if (tb[IFA_ADDRESS]) + addr = nla_data(tb[IFA_ADDRESS]); + else + return -EINVAL; + + /* find device */ + dev = __dev_get_by_index(net, ifm->ifa_index); + if (!dev) + return -ENODEV; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODEV; + + pos = memchr(mdev->addrs, addr->s_addr, mdev->num_addrs); + if (!pos) + return -ENOENT; + + rc = mctp_route_remove_local(mdev, addr->s_addr); + // we can ignore -ENOENT in the case a route was already removed + if (rc < 0 && rc != -ENOENT) + return rc; + + spin_lock_irqsave(&mdev->addrs_lock, flags); + memmove(pos, pos + 1, mdev->num_addrs - 1 - (pos - mdev->addrs)); + mdev->num_addrs--; + spin_unlock_irqrestore(&mdev->addrs_lock, flags); + + return 0; +} + +void mctp_dev_hold(struct mctp_dev *mdev) +{ + refcount_inc(&mdev->refs); +} + +void mctp_dev_put(struct mctp_dev *mdev) +{ + if (refcount_dec_and_test(&mdev->refs)) { + dev_put(mdev->dev); + kfree_rcu(mdev, rcu); + } +} + +void mctp_dev_release_key(struct mctp_dev *dev, struct mctp_sk_key *key) + __must_hold(&key->lock) +{ + if (!dev) + return; + if (dev->ops && dev->ops->release_flow) + dev->ops->release_flow(dev, key); + key->dev = NULL; + mctp_dev_put(dev); +} + +void mctp_dev_set_key(struct mctp_dev *dev, struct mctp_sk_key *key) + __must_hold(&key->lock) +{ + mctp_dev_hold(dev); + key->dev = dev; +} + +static struct mctp_dev *mctp_add_dev(struct net_device *dev) +{ + struct mctp_dev *mdev; + + ASSERT_RTNL(); + + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + if (!mdev) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&mdev->addrs_lock); + + mdev->net = mctp_default_net(dev_net(dev)); + + /* associate to net_device */ + refcount_set(&mdev->refs, 1); + rcu_assign_pointer(dev->mctp_ptr, mdev); + + dev_hold(dev); + mdev->dev = dev; + + return mdev; +} + +static int mctp_fill_link_af(struct sk_buff *skb, + const struct net_device *dev, u32 ext_filter_mask) +{ + struct mctp_dev *mdev; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODATA; + if (nla_put_u32(skb, IFLA_MCTP_NET, mdev->net)) + return -EMSGSIZE; + return 0; +} + +static size_t mctp_get_link_af_size(const struct net_device *dev, + u32 ext_filter_mask) +{ + struct mctp_dev *mdev; + unsigned int ret; + + /* caller holds RCU */ + mdev = __mctp_dev_get(dev); + if (!mdev) + return 0; + ret = nla_total_size(4); /* IFLA_MCTP_NET */ + return ret; +} + +static const struct nla_policy ifla_af_mctp_policy[IFLA_MCTP_MAX + 1] = { + [IFLA_MCTP_NET] = { .type = NLA_U32 }, +}; + +static int mctp_set_link_af(struct net_device *dev, const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_MCTP_MAX + 1]; + struct mctp_dev *mdev; + int rc; + + rc = nla_parse_nested(tb, IFLA_MCTP_MAX, attr, ifla_af_mctp_policy, + NULL); + if (rc) + return rc; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return 0; + + if (tb[IFLA_MCTP_NET]) + WRITE_ONCE(mdev->net, nla_get_u32(tb[IFLA_MCTP_NET])); + + return 0; +} + +/* Matches netdev types that should have MCTP handling */ +static bool mctp_known(struct net_device *dev) +{ + /* only register specific types (inc. NONE for TUN devices) */ + return dev->type == ARPHRD_MCTP || + dev->type == ARPHRD_LOOPBACK || + dev->type == ARPHRD_NONE; +} + +static void mctp_unregister(struct net_device *dev) +{ + struct mctp_dev *mdev; + + mdev = mctp_dev_get_rtnl(dev); + if (mctp_known(dev) != (bool)mdev) { + // Sanity check, should match what was set in mctp_register + netdev_warn(dev, "%s: mdev pointer %d but type (%d) match is %d", + __func__, (bool)mdev, mctp_known(dev), dev->type); + return; + } + if (!mdev) + return; + + RCU_INIT_POINTER(mdev->dev->mctp_ptr, NULL); + + mctp_route_remove_dev(mdev); + mctp_neigh_remove_dev(mdev); + kfree(mdev->addrs); + + mctp_dev_put(mdev); +} + +static int mctp_register(struct net_device *dev) +{ + struct mctp_dev *mdev; + + /* Already registered? */ + mdev = rtnl_dereference(dev->mctp_ptr); + + if (mdev) { + if (!mctp_known(dev)) + netdev_warn(dev, "%s: mctp_dev set for unknown type %d", + __func__, dev->type); + return 0; + } + + /* only register specific types */ + if (!mctp_known(dev)) + return 0; + + mdev = mctp_add_dev(dev); + if (IS_ERR(mdev)) + return PTR_ERR(mdev); + + return 0; +} + +static int mctp_dev_notify(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + int rc; + + switch (event) { + case NETDEV_REGISTER: + rc = mctp_register(dev); + if (rc) + return notifier_from_errno(rc); + break; + case NETDEV_UNREGISTER: + mctp_unregister(dev); + break; + } + + return NOTIFY_OK; +} + +static int mctp_register_netdevice(struct net_device *dev, + const struct mctp_netdev_ops *ops) +{ + struct mctp_dev *mdev; + + mdev = mctp_add_dev(dev); + if (IS_ERR(mdev)) + return PTR_ERR(mdev); + + mdev->ops = ops; + + return register_netdevice(dev); +} + +int mctp_register_netdev(struct net_device *dev, + const struct mctp_netdev_ops *ops) +{ + int rc; + + rtnl_lock(); + rc = mctp_register_netdevice(dev, ops); + rtnl_unlock(); + + return rc; +} +EXPORT_SYMBOL_GPL(mctp_register_netdev); + +void mctp_unregister_netdev(struct net_device *dev) +{ + unregister_netdev(dev); +} +EXPORT_SYMBOL_GPL(mctp_unregister_netdev); + +static struct rtnl_af_ops mctp_af_ops = { + .family = AF_MCTP, + .fill_link_af = mctp_fill_link_af, + .get_link_af_size = mctp_get_link_af_size, + .set_link_af = mctp_set_link_af, +}; + +static struct notifier_block mctp_dev_nb = { + .notifier_call = mctp_dev_notify, + .priority = ADDRCONF_NOTIFY_PRIORITY, +}; + +void __init mctp_device_init(void) +{ + register_netdevice_notifier(&mctp_dev_nb); + + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_GETADDR, + NULL, mctp_dump_addrinfo, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_NEWADDR, + mctp_rtm_newaddr, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_DELADDR, + mctp_rtm_deladdr, NULL, 0); + rtnl_af_register(&mctp_af_ops); +} + +void __exit mctp_device_exit(void) +{ + rtnl_af_unregister(&mctp_af_ops); + rtnl_unregister(PF_MCTP, RTM_DELADDR); + rtnl_unregister(PF_MCTP, RTM_NEWADDR); + rtnl_unregister(PF_MCTP, RTM_GETADDR); + + unregister_netdevice_notifier(&mctp_dev_nb); +} diff --git a/net/mctp/neigh.c b/net/mctp/neigh.c new file mode 100644 index 000000000000..5cc042121493 --- /dev/null +++ b/net/mctp/neigh.c @@ -0,0 +1,342 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Management Component Transport Protocol (MCTP) - routing + * implementation. + * + * This is currently based on a simple routing table, with no dst cache. The + * number of routes should stay fairly small, so the lookup cost is small. + * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#include <linux/idr.h> +#include <linux/mctp.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> + +#include <net/mctp.h> +#include <net/mctpdevice.h> +#include <net/netlink.h> +#include <net/sock.h> + +static int mctp_neigh_add(struct mctp_dev *mdev, mctp_eid_t eid, + enum mctp_neigh_source source, + size_t lladdr_len, const void *lladdr) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_neigh *neigh; + int rc; + + mutex_lock(&net->mctp.neigh_lock); + if (mctp_neigh_lookup(mdev, eid, NULL) == 0) { + rc = -EEXIST; + goto out; + } + + if (lladdr_len > sizeof(neigh->ha)) { + rc = -EINVAL; + goto out; + } + + neigh = kzalloc(sizeof(*neigh), GFP_KERNEL); + if (!neigh) { + rc = -ENOMEM; + goto out; + } + INIT_LIST_HEAD(&neigh->list); + neigh->dev = mdev; + mctp_dev_hold(neigh->dev); + neigh->eid = eid; + neigh->source = source; + memcpy(neigh->ha, lladdr, lladdr_len); + + list_add_rcu(&neigh->list, &net->mctp.neighbours); + rc = 0; +out: + mutex_unlock(&net->mctp.neigh_lock); + return rc; +} + +static void __mctp_neigh_free(struct rcu_head *rcu) +{ + struct mctp_neigh *neigh = container_of(rcu, struct mctp_neigh, rcu); + + mctp_dev_put(neigh->dev); + kfree(neigh); +} + +/* Removes all neighbour entries referring to a device */ +void mctp_neigh_remove_dev(struct mctp_dev *mdev) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_neigh *neigh, *tmp; + + mutex_lock(&net->mctp.neigh_lock); + list_for_each_entry_safe(neigh, tmp, &net->mctp.neighbours, list) { + if (neigh->dev == mdev) { + list_del_rcu(&neigh->list); + /* TODO: immediate RTM_DELNEIGH */ + call_rcu(&neigh->rcu, __mctp_neigh_free); + } + } + + mutex_unlock(&net->mctp.neigh_lock); +} + +// TODO: add a "source" flag so netlink can only delete static neighbours? +static int mctp_neigh_remove(struct mctp_dev *mdev, mctp_eid_t eid) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_neigh *neigh, *tmp; + bool dropped = false; + + mutex_lock(&net->mctp.neigh_lock); + list_for_each_entry_safe(neigh, tmp, &net->mctp.neighbours, list) { + if (neigh->dev == mdev && neigh->eid == eid) { + list_del_rcu(&neigh->list); + /* TODO: immediate RTM_DELNEIGH */ + call_rcu(&neigh->rcu, __mctp_neigh_free); + dropped = true; + } + } + + mutex_unlock(&net->mctp.neigh_lock); + return dropped ? 0 : -ENOENT; +} + +static const struct nla_policy nd_mctp_policy[NDA_MAX + 1] = { + [NDA_DST] = { .type = NLA_U8 }, + [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, +}; + +static int mctp_rtm_newneigh(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct net_device *dev; + struct mctp_dev *mdev; + struct ndmsg *ndm; + struct nlattr *tb[NDA_MAX + 1]; + int rc; + mctp_eid_t eid; + void *lladdr; + int lladdr_len; + + rc = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, nd_mctp_policy, + extack); + if (rc < 0) { + NL_SET_ERR_MSG(extack, "lladdr too large?"); + return rc; + } + + if (!tb[NDA_DST]) { + NL_SET_ERR_MSG(extack, "Neighbour EID must be specified"); + return -EINVAL; + } + + if (!tb[NDA_LLADDR]) { + NL_SET_ERR_MSG(extack, "Neighbour lladdr must be specified"); + return -EINVAL; + } + + eid = nla_get_u8(tb[NDA_DST]); + if (!mctp_address_ok(eid)) { + NL_SET_ERR_MSG(extack, "Invalid neighbour EID"); + return -EINVAL; + } + + lladdr = nla_data(tb[NDA_LLADDR]); + lladdr_len = nla_len(tb[NDA_LLADDR]); + + ndm = nlmsg_data(nlh); + + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (!dev) + return -ENODEV; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODEV; + + if (lladdr_len != dev->addr_len) { + NL_SET_ERR_MSG(extack, "Wrong lladdr length"); + return -EINVAL; + } + + return mctp_neigh_add(mdev, eid, MCTP_NEIGH_STATIC, + lladdr_len, lladdr); +} + +static int mctp_rtm_delneigh(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct net *net = sock_net(skb->sk); + struct nlattr *tb[NDA_MAX + 1]; + struct net_device *dev; + struct mctp_dev *mdev; + struct ndmsg *ndm; + int rc; + mctp_eid_t eid; + + rc = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, nd_mctp_policy, + extack); + if (rc < 0) { + NL_SET_ERR_MSG(extack, "incorrect format"); + return rc; + } + + if (!tb[NDA_DST]) { + NL_SET_ERR_MSG(extack, "Neighbour EID must be specified"); + return -EINVAL; + } + eid = nla_get_u8(tb[NDA_DST]); + + ndm = nlmsg_data(nlh); + dev = __dev_get_by_index(net, ndm->ndm_ifindex); + if (!dev) + return -ENODEV; + + mdev = mctp_dev_get_rtnl(dev); + if (!mdev) + return -ENODEV; + + return mctp_neigh_remove(mdev, eid); +} + +static int mctp_fill_neigh(struct sk_buff *skb, u32 portid, u32 seq, int event, + unsigned int flags, struct mctp_neigh *neigh) +{ + struct net_device *dev = neigh->dev->dev; + struct nlmsghdr *nlh; + struct ndmsg *hdr; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags); + if (!nlh) + return -EMSGSIZE; + + hdr = nlmsg_data(nlh); + hdr->ndm_family = AF_MCTP; + hdr->ndm_ifindex = dev->ifindex; + hdr->ndm_state = 0; // TODO other state bits? + if (neigh->source == MCTP_NEIGH_STATIC) + hdr->ndm_state |= NUD_PERMANENT; + hdr->ndm_flags = 0; + hdr->ndm_type = RTN_UNICAST; // TODO: is loopback RTN_LOCAL? + + if (nla_put_u8(skb, NDA_DST, neigh->eid)) + goto cancel; + + if (nla_put(skb, NDA_LLADDR, dev->addr_len, neigh->ha)) + goto cancel; + + nlmsg_end(skb, nlh); + + return 0; +cancel: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mctp_rtm_getneigh(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + int rc, idx, req_ifindex; + struct mctp_neigh *neigh; + struct ndmsg *ndmsg; + struct { + int idx; + } *cbctx = (void *)cb->ctx; + + ndmsg = nlmsg_data(cb->nlh); + req_ifindex = ndmsg->ndm_ifindex; + + idx = 0; + rcu_read_lock(); + list_for_each_entry_rcu(neigh, &net->mctp.neighbours, list) { + if (idx < cbctx->idx) + goto cont; + + rc = 0; + if (req_ifindex == 0 || req_ifindex == neigh->dev->dev->ifindex) + rc = mctp_fill_neigh(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, NLM_F_MULTI, neigh); + + if (rc) + break; +cont: + idx++; + } + rcu_read_unlock(); + + cbctx->idx = idx; + return skb->len; +} + +int mctp_neigh_lookup(struct mctp_dev *mdev, mctp_eid_t eid, void *ret_hwaddr) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_neigh *neigh; + int rc = -EHOSTUNREACH; // TODO: or ENOENT? + + rcu_read_lock(); + list_for_each_entry_rcu(neigh, &net->mctp.neighbours, list) { + if (mdev == neigh->dev && eid == neigh->eid) { + if (ret_hwaddr) + memcpy(ret_hwaddr, neigh->ha, + sizeof(neigh->ha)); + rc = 0; + break; + } + } + rcu_read_unlock(); + return rc; +} + +/* namespace registration */ +static int __net_init mctp_neigh_net_init(struct net *net) +{ + struct netns_mctp *ns = &net->mctp; + + INIT_LIST_HEAD(&ns->neighbours); + mutex_init(&ns->neigh_lock); + return 0; +} + +static void __net_exit mctp_neigh_net_exit(struct net *net) +{ + struct netns_mctp *ns = &net->mctp; + struct mctp_neigh *neigh; + + list_for_each_entry(neigh, &ns->neighbours, list) + call_rcu(&neigh->rcu, __mctp_neigh_free); +} + +/* net namespace implementation */ + +static struct pernet_operations mctp_net_ops = { + .init = mctp_neigh_net_init, + .exit = mctp_neigh_net_exit, +}; + +int __init mctp_neigh_init(void) +{ + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_NEWNEIGH, + mctp_rtm_newneigh, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_DELNEIGH, + mctp_rtm_delneigh, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_GETNEIGH, + NULL, mctp_rtm_getneigh, 0); + + return register_pernet_subsys(&mctp_net_ops); +} + +void __exit mctp_neigh_exit(void) +{ + unregister_pernet_subsys(&mctp_net_ops); + rtnl_unregister(PF_MCTP, RTM_GETNEIGH); + rtnl_unregister(PF_MCTP, RTM_DELNEIGH); + rtnl_unregister(PF_MCTP, RTM_NEWNEIGH); +} diff --git a/net/mctp/route.c b/net/mctp/route.c new file mode 100644 index 000000000000..46c44823edb7 --- /dev/null +++ b/net/mctp/route.c @@ -0,0 +1,1342 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Management Component Transport Protocol (MCTP) - routing + * implementation. + * + * This is currently based on a simple routing table, with no dst cache. The + * number of routes should stay fairly small, so the lookup cost is small. + * + * Copyright (c) 2021 Code Construct + * Copyright (c) 2021 Google + */ + +#include <linux/idr.h> +#include <linux/kconfig.h> +#include <linux/mctp.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff.h> + +#include <uapi/linux/if_arp.h> + +#include <net/mctp.h> +#include <net/mctpdevice.h> +#include <net/netlink.h> +#include <net/sock.h> + +#include <trace/events/mctp.h> + +static const unsigned int mctp_message_maxlen = 64 * 1024; +static const unsigned long mctp_key_lifetime = 6 * CONFIG_HZ; + +static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev); + +/* route output callbacks */ +static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} + +static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb) +{ + struct mctp_skb_cb *cb = mctp_cb(skb); + struct mctp_hdr *mh; + struct sock *sk; + u8 type; + + WARN_ON(!rcu_read_lock_held()); + + /* TODO: look up in skb->cb? */ + mh = mctp_hdr(skb); + + if (!skb_headlen(skb)) + return NULL; + + type = (*(u8 *)skb->data) & 0x7f; + + sk_for_each_rcu(sk, &net->mctp.binds) { + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + + if (msk->bind_net != MCTP_NET_ANY && msk->bind_net != cb->net) + continue; + + if (msk->bind_type != type) + continue; + + if (msk->bind_addr != MCTP_ADDR_ANY && + msk->bind_addr != mh->dest) + continue; + + return msk; + } + + return NULL; +} + +static bool mctp_key_match(struct mctp_sk_key *key, mctp_eid_t local, + mctp_eid_t peer, u8 tag) +{ + if (key->local_addr != local) + return false; + + if (key->peer_addr != peer) + return false; + + if (key->tag != tag) + return false; + + return true; +} + +/* returns a key (with key->lock held, and refcounted), or NULL if no such + * key exists. + */ +static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb, + mctp_eid_t peer, + unsigned long *irqflags) + __acquires(&key->lock) +{ + struct mctp_sk_key *key, *ret; + unsigned long flags; + struct mctp_hdr *mh; + u8 tag; + + mh = mctp_hdr(skb); + tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); + + ret = NULL; + spin_lock_irqsave(&net->mctp.keys_lock, flags); + + hlist_for_each_entry(key, &net->mctp.keys, hlist) { + if (!mctp_key_match(key, mh->dest, peer, tag)) + continue; + + spin_lock(&key->lock); + if (key->valid) { + refcount_inc(&key->refs); + ret = key; + break; + } + spin_unlock(&key->lock); + } + + if (ret) { + spin_unlock(&net->mctp.keys_lock); + *irqflags = flags; + } else { + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + } + + return ret; +} + +static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk, + mctp_eid_t local, mctp_eid_t peer, + u8 tag, gfp_t gfp) +{ + struct mctp_sk_key *key; + + key = kzalloc(sizeof(*key), gfp); + if (!key) + return NULL; + + key->peer_addr = peer; + key->local_addr = local; + key->tag = tag; + key->sk = &msk->sk; + key->valid = true; + spin_lock_init(&key->lock); + refcount_set(&key->refs, 1); + + return key; +} + +void mctp_key_unref(struct mctp_sk_key *key) +{ + unsigned long flags; + + if (!refcount_dec_and_test(&key->refs)) + return; + + /* even though no refs exist here, the lock allows us to stay + * consistent with the locking requirement of mctp_dev_release_key + */ + spin_lock_irqsave(&key->lock, flags); + mctp_dev_release_key(key->dev, key); + spin_unlock_irqrestore(&key->lock, flags); + + kfree(key); +} + +static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk) +{ + struct net *net = sock_net(&msk->sk); + struct mctp_sk_key *tmp; + unsigned long flags; + int rc = 0; + + spin_lock_irqsave(&net->mctp.keys_lock, flags); + + hlist_for_each_entry(tmp, &net->mctp.keys, hlist) { + if (mctp_key_match(tmp, key->local_addr, key->peer_addr, + key->tag)) { + spin_lock(&tmp->lock); + if (tmp->valid) + rc = -EEXIST; + spin_unlock(&tmp->lock); + if (rc) + break; + } + } + + if (!rc) { + refcount_inc(&key->refs); + key->expiry = jiffies + mctp_key_lifetime; + timer_reduce(&msk->key_expiry, key->expiry); + + hlist_add_head(&key->hlist, &net->mctp.keys); + hlist_add_head(&key->sklist, &msk->keys); + } + + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + + return rc; +} + +/* We're done with the key; unset valid and remove from lists. There may still + * be outstanding refs on the key though... + */ +static void __mctp_key_unlock_drop(struct mctp_sk_key *key, struct net *net, + unsigned long flags) + __releases(&key->lock) +{ + struct sk_buff *skb; + + skb = key->reasm_head; + key->reasm_head = NULL; + key->reasm_dead = true; + key->valid = false; + mctp_dev_release_key(key->dev, key); + spin_unlock_irqrestore(&key->lock, flags); + + spin_lock_irqsave(&net->mctp.keys_lock, flags); + hlist_del(&key->hlist); + hlist_del(&key->sklist); + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); + + /* one unref for the lists */ + mctp_key_unref(key); + + /* and one for the local reference */ + mctp_key_unref(key); + + if (skb) + kfree_skb(skb); + +} + +#ifdef CONFIG_MCTP_FLOWS +static void mctp_skb_set_flow(struct sk_buff *skb, struct mctp_sk_key *key) +{ + struct mctp_flow *flow; + + flow = skb_ext_add(skb, SKB_EXT_MCTP); + if (!flow) + return; + + refcount_inc(&key->refs); + flow->key = key; +} + +static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) +{ + struct mctp_sk_key *key; + struct mctp_flow *flow; + + flow = skb_ext_find(skb, SKB_EXT_MCTP); + if (!flow) + return; + + key = flow->key; + + if (WARN_ON(key->dev && key->dev != dev)) + return; + + mctp_dev_set_key(dev, key); +} +#else +static void mctp_skb_set_flow(struct sk_buff *skb, struct mctp_sk_key *key) {} +static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) {} +#endif + +static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) +{ + struct mctp_hdr *hdr = mctp_hdr(skb); + u8 exp_seq, this_seq; + + this_seq = (hdr->flags_seq_tag >> MCTP_HDR_SEQ_SHIFT) + & MCTP_HDR_SEQ_MASK; + + if (!key->reasm_head) { + key->reasm_head = skb; + key->reasm_tailp = &(skb_shinfo(skb)->frag_list); + key->last_seq = this_seq; + return 0; + } + + exp_seq = (key->last_seq + 1) & MCTP_HDR_SEQ_MASK; + + if (this_seq != exp_seq) + return -EINVAL; + + if (key->reasm_head->len + skb->len > mctp_message_maxlen) + return -EINVAL; + + skb->next = NULL; + skb->sk = NULL; + *key->reasm_tailp = skb; + key->reasm_tailp = &skb->next; + + key->last_seq = this_seq; + + key->reasm_head->data_len += skb->len; + key->reasm_head->len += skb->len; + key->reasm_head->truesize += skb->truesize; + + return 0; +} + +static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + struct mctp_sk_key *key; + struct mctp_sock *msk; + struct mctp_hdr *mh; + unsigned long f; + u8 tag, flags; + int rc; + + msk = NULL; + rc = -EINVAL; + + /* we may be receiving a locally-routed packet; drop source sk + * accounting + */ + skb_orphan(skb); + + /* ensure we have enough data for a header and a type */ + if (skb->len < sizeof(struct mctp_hdr) + 1) + goto out; + + /* grab header, advance data ptr */ + mh = mctp_hdr(skb); + skb_pull(skb, sizeof(struct mctp_hdr)); + + if (mh->ver != 1) + goto out; + + flags = mh->flags_seq_tag & (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM); + tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); + + rcu_read_lock(); + + /* lookup socket / reasm context, exactly matching (src,dest,tag). + * we hold a ref on the key, and key->lock held. + */ + key = mctp_lookup_key(net, skb, mh->src, &f); + + if (flags & MCTP_HDR_FLAG_SOM) { + if (key) { + msk = container_of(key->sk, struct mctp_sock, sk); + } else { + /* first response to a broadcast? do a more general + * key lookup to find the socket, but don't use this + * key for reassembly - we'll create a more specific + * one for future packets if required (ie, !EOM). + */ + key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY, &f); + if (key) { + msk = container_of(key->sk, + struct mctp_sock, sk); + spin_unlock_irqrestore(&key->lock, f); + mctp_key_unref(key); + key = NULL; + } + } + + if (!key && !msk && (tag & MCTP_HDR_FLAG_TO)) + msk = mctp_lookup_bind(net, skb); + + if (!msk) { + rc = -ENOENT; + goto out_unlock; + } + + /* single-packet message? deliver to socket, clean up any + * pending key. + */ + if (flags & MCTP_HDR_FLAG_EOM) { + sock_queue_rcv_skb(&msk->sk, skb); + if (key) { + /* we've hit a pending reassembly; not much we + * can do but drop it + */ + trace_mctp_key_release(key, + MCTP_TRACE_KEY_REPLIED); + __mctp_key_unlock_drop(key, net, f); + key = NULL; + } + rc = 0; + goto out_unlock; + } + + /* broadcast response or a bind() - create a key for further + * packets for this message + */ + if (!key) { + key = mctp_key_alloc(msk, mh->dest, mh->src, + tag, GFP_ATOMIC); + if (!key) { + rc = -ENOMEM; + goto out_unlock; + } + + /* we can queue without the key lock here, as the + * key isn't observable yet + */ + mctp_frag_queue(key, skb); + + /* if the key_add fails, we've raced with another + * SOM packet with the same src, dest and tag. There's + * no way to distinguish future packets, so all we + * can do is drop; we'll free the skb on exit from + * this function. + */ + rc = mctp_key_add(key, msk); + if (rc) + kfree(key); + + trace_mctp_key_acquire(key); + + /* we don't need to release key->lock on exit */ + mctp_key_unref(key); + key = NULL; + + } else { + if (key->reasm_head || key->reasm_dead) { + /* duplicate start? drop everything */ + trace_mctp_key_release(key, + MCTP_TRACE_KEY_INVALIDATED); + __mctp_key_unlock_drop(key, net, f); + rc = -EEXIST; + key = NULL; + } else { + rc = mctp_frag_queue(key, skb); + } + } + + } else if (key) { + /* this packet continues a previous message; reassemble + * using the message-specific key + */ + + /* we need to be continuing an existing reassembly... */ + if (!key->reasm_head) + rc = -EINVAL; + else + rc = mctp_frag_queue(key, skb); + + /* end of message? deliver to socket, and we're done with + * the reassembly/response key + */ + if (!rc && flags & MCTP_HDR_FLAG_EOM) { + sock_queue_rcv_skb(key->sk, key->reasm_head); + key->reasm_head = NULL; + trace_mctp_key_release(key, MCTP_TRACE_KEY_REPLIED); + __mctp_key_unlock_drop(key, net, f); + key = NULL; + } + + } else { + /* not a start, no matching key */ + rc = -ENOENT; + } + +out_unlock: + rcu_read_unlock(); + if (key) { + spin_unlock_irqrestore(&key->lock, f); + mctp_key_unref(key); + } +out: + if (rc) + kfree_skb(skb); + return rc; +} + +static unsigned int mctp_route_mtu(struct mctp_route *rt) +{ + return rt->mtu ?: READ_ONCE(rt->dev->dev->mtu); +} + +static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) +{ + struct mctp_skb_cb *cb = mctp_cb(skb); + struct mctp_hdr *hdr = mctp_hdr(skb); + char daddr_buf[MAX_ADDR_LEN]; + char *daddr = NULL; + unsigned int mtu; + int rc; + + skb->protocol = htons(ETH_P_MCTP); + + mtu = READ_ONCE(skb->dev->mtu); + if (skb->len > mtu) { + kfree_skb(skb); + return -EMSGSIZE; + } + + if (cb->ifindex) { + /* direct route; use the hwaddr we stashed in sendmsg */ + daddr = cb->haddr; + } else { + /* If lookup fails let the device handle daddr==NULL */ + if (mctp_neigh_lookup(route->dev, hdr->dest, daddr_buf) == 0) + daddr = daddr_buf; + } + + rc = dev_hard_header(skb, skb->dev, ntohs(skb->protocol), + daddr, skb->dev->dev_addr, skb->len); + if (rc) { + kfree_skb(skb); + return -EHOSTUNREACH; + } + + mctp_flow_prepare_output(skb, route->dev); + + rc = dev_queue_xmit(skb); + if (rc) + rc = net_xmit_errno(rc); + + return rc; +} + +/* route alloc/release */ +static void mctp_route_release(struct mctp_route *rt) +{ + if (refcount_dec_and_test(&rt->refs)) { + mctp_dev_put(rt->dev); + kfree_rcu(rt, rcu); + } +} + +/* returns a route with the refcount at 1 */ +static struct mctp_route *mctp_route_alloc(void) +{ + struct mctp_route *rt; + + rt = kzalloc(sizeof(*rt), GFP_KERNEL); + if (!rt) + return NULL; + + INIT_LIST_HEAD(&rt->list); + refcount_set(&rt->refs, 1); + rt->output = mctp_route_discard; + + return rt; +} + +unsigned int mctp_default_net(struct net *net) +{ + return READ_ONCE(net->mctp.default_net); +} + +int mctp_default_net_set(struct net *net, unsigned int index) +{ + if (index == 0) + return -EINVAL; + WRITE_ONCE(net->mctp.default_net, index); + return 0; +} + +/* tag management */ +static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key, + struct mctp_sock *msk) +{ + struct netns_mctp *mns = &net->mctp; + + lockdep_assert_held(&mns->keys_lock); + + key->expiry = jiffies + mctp_key_lifetime; + timer_reduce(&msk->key_expiry, key->expiry); + + /* we hold the net->key_lock here, allowing updates to both + * then net and sk + */ + hlist_add_head_rcu(&key->hlist, &mns->keys); + hlist_add_head_rcu(&key->sklist, &msk->keys); + refcount_inc(&key->refs); +} + +/* Allocate a locally-owned tag value for (saddr, daddr), and reserve + * it for the socket msk + */ +static struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk, + mctp_eid_t saddr, + mctp_eid_t daddr, u8 *tagp) +{ + struct net *net = sock_net(&msk->sk); + struct netns_mctp *mns = &net->mctp; + struct mctp_sk_key *key, *tmp; + unsigned long flags; + u8 tagbits; + + /* for NULL destination EIDs, we may get a response from any peer */ + if (daddr == MCTP_ADDR_NULL) + daddr = MCTP_ADDR_ANY; + + /* be optimistic, alloc now */ + key = mctp_key_alloc(msk, saddr, daddr, 0, GFP_KERNEL); + if (!key) + return ERR_PTR(-ENOMEM); + + /* 8 possible tag values */ + tagbits = 0xff; + + spin_lock_irqsave(&mns->keys_lock, flags); + + /* Walk through the existing keys, looking for potential conflicting + * tags. If we find a conflict, clear that bit from tagbits + */ + hlist_for_each_entry(tmp, &mns->keys, hlist) { + /* We can check the lookup fields (*_addr, tag) without the + * lock held, they don't change over the lifetime of the key. + */ + + /* if we don't own the tag, it can't conflict */ + if (tmp->tag & MCTP_HDR_FLAG_TO) + continue; + + if (!((tmp->peer_addr == daddr || + tmp->peer_addr == MCTP_ADDR_ANY) && + tmp->local_addr == saddr)) + continue; + + spin_lock(&tmp->lock); + /* key must still be valid. If we find a match, clear the + * potential tag value + */ + if (tmp->valid) + tagbits &= ~(1 << tmp->tag); + spin_unlock(&tmp->lock); + + if (!tagbits) + break; + } + + if (tagbits) { + key->tag = __ffs(tagbits); + mctp_reserve_tag(net, key, msk); + trace_mctp_key_acquire(key); + + *tagp = key->tag; + } + + spin_unlock_irqrestore(&mns->keys_lock, flags); + + if (!tagbits) { + kfree(key); + return ERR_PTR(-EBUSY); + } + + return key; +} + +/* routing lookups */ +static bool mctp_rt_match_eid(struct mctp_route *rt, + unsigned int net, mctp_eid_t eid) +{ + return READ_ONCE(rt->dev->net) == net && + rt->min <= eid && rt->max >= eid; +} + +/* compares match, used for duplicate prevention */ +static bool mctp_rt_compare_exact(struct mctp_route *rt1, + struct mctp_route *rt2) +{ + ASSERT_RTNL(); + return rt1->dev->net == rt2->dev->net && + rt1->min == rt2->min && + rt1->max == rt2->max; +} + +struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet, + mctp_eid_t daddr) +{ + struct mctp_route *tmp, *rt = NULL; + + list_for_each_entry_rcu(tmp, &net->mctp.routes, list) { + /* TODO: add metrics */ + if (mctp_rt_match_eid(tmp, dnet, daddr)) { + if (refcount_inc_not_zero(&tmp->refs)) { + rt = tmp; + break; + } + } + } + + return rt; +} + +static struct mctp_route *mctp_route_lookup_null(struct net *net, + struct net_device *dev) +{ + struct mctp_route *rt; + + list_for_each_entry_rcu(rt, &net->mctp.routes, list) { + if (rt->dev->dev == dev && rt->type == RTN_LOCAL && + refcount_inc_not_zero(&rt->refs)) + return rt; + } + + return NULL; +} + +static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, + unsigned int mtu, u8 tag) +{ + const unsigned int hlen = sizeof(struct mctp_hdr); + struct mctp_hdr *hdr, *hdr2; + unsigned int pos, size; + struct sk_buff *skb2; + int rc; + u8 seq; + + hdr = mctp_hdr(skb); + seq = 0; + rc = 0; + + if (mtu < hlen + 1) { + kfree_skb(skb); + return -EMSGSIZE; + } + + /* we've got the header */ + skb_pull(skb, hlen); + + for (pos = 0; pos < skb->len;) { + /* size of message payload */ + size = min(mtu - hlen, skb->len - pos); + + skb2 = alloc_skb(MCTP_HEADER_MAXLEN + hlen + size, GFP_KERNEL); + if (!skb2) { + rc = -ENOMEM; + break; + } + + /* generic skb copy */ + skb2->protocol = skb->protocol; + skb2->priority = skb->priority; + skb2->dev = skb->dev; + memcpy(skb2->cb, skb->cb, sizeof(skb2->cb)); + + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + + /* establish packet */ + skb_reserve(skb2, MCTP_HEADER_MAXLEN); + skb_reset_network_header(skb2); + skb_put(skb2, hlen + size); + skb2->transport_header = skb2->network_header + hlen; + + /* copy header fields, calculate SOM/EOM flags & seq */ + hdr2 = mctp_hdr(skb2); + hdr2->ver = hdr->ver; + hdr2->dest = hdr->dest; + hdr2->src = hdr->src; + hdr2->flags_seq_tag = tag & + (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO); + + if (pos == 0) + hdr2->flags_seq_tag |= MCTP_HDR_FLAG_SOM; + + if (pos + size == skb->len) + hdr2->flags_seq_tag |= MCTP_HDR_FLAG_EOM; + + hdr2->flags_seq_tag |= seq << MCTP_HDR_SEQ_SHIFT; + + /* copy message payload */ + skb_copy_bits(skb, pos, skb_transport_header(skb2), size); + + /* do route */ + rc = rt->output(rt, skb2); + if (rc) + break; + + seq = (seq + 1) & MCTP_HDR_SEQ_MASK; + pos += size; + } + + consume_skb(skb); + return rc; +} + +int mctp_local_output(struct sock *sk, struct mctp_route *rt, + struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag) +{ + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + struct mctp_skb_cb *cb = mctp_cb(skb); + struct mctp_route tmp_rt; + struct mctp_sk_key *key; + struct net_device *dev; + struct mctp_hdr *hdr; + unsigned long flags; + unsigned int mtu; + mctp_eid_t saddr; + bool ext_rt; + int rc; + u8 tag; + + rc = -ENODEV; + + if (rt) { + ext_rt = false; + dev = NULL; + + if (WARN_ON(!rt->dev)) + goto out_release; + + } else if (cb->ifindex) { + ext_rt = true; + rt = &tmp_rt; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), cb->ifindex); + if (!dev) { + rcu_read_unlock(); + return rc; + } + + rt->dev = __mctp_dev_get(dev); + rcu_read_unlock(); + + if (!rt->dev) + goto out_release; + + /* establish temporary route - we set up enough to keep + * mctp_route_output happy + */ + rt->output = mctp_route_output; + rt->mtu = 0; + + } else { + return -EINVAL; + } + + spin_lock_irqsave(&rt->dev->addrs_lock, flags); + if (rt->dev->num_addrs == 0) { + rc = -EHOSTUNREACH; + } else { + /* use the outbound interface's first address as our source */ + saddr = rt->dev->addrs[0]; + rc = 0; + } + spin_unlock_irqrestore(&rt->dev->addrs_lock, flags); + + if (rc) + goto out_release; + + if (req_tag & MCTP_HDR_FLAG_TO) { + key = mctp_alloc_local_tag(msk, saddr, daddr, &tag); + if (IS_ERR(key)) { + rc = PTR_ERR(key); + goto out_release; + } + mctp_skb_set_flow(skb, key); + /* done with the key in this scope */ + mctp_key_unref(key); + tag |= MCTP_HDR_FLAG_TO; + } else { + key = NULL; + tag = req_tag; + } + + skb->protocol = htons(ETH_P_MCTP); + skb->priority = 0; + skb_reset_transport_header(skb); + skb_push(skb, sizeof(struct mctp_hdr)); + skb_reset_network_header(skb); + skb->dev = rt->dev->dev; + + /* cb->net will have been set on initial ingress */ + cb->src = saddr; + + /* set up common header fields */ + hdr = mctp_hdr(skb); + hdr->ver = 1; + hdr->dest = daddr; + hdr->src = saddr; + + mtu = mctp_route_mtu(rt); + + if (skb->len + sizeof(struct mctp_hdr) <= mtu) { + hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | + MCTP_HDR_FLAG_EOM | tag; + rc = rt->output(rt, skb); + } else { + rc = mctp_do_fragment_route(rt, skb, mtu, tag); + } + +out_release: + if (!ext_rt) + mctp_route_release(rt); + + if (dev) + dev_put(dev); + + return rc; + +} + +/* route management */ +static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start, + unsigned int daddr_extent, unsigned int mtu, + unsigned char type) +{ + int (*rtfn)(struct mctp_route *rt, struct sk_buff *skb); + struct net *net = dev_net(mdev->dev); + struct mctp_route *rt, *ert; + + if (!mctp_address_ok(daddr_start)) + return -EINVAL; + + if (daddr_extent > 0xff || daddr_start + daddr_extent >= 255) + return -EINVAL; + + switch (type) { + case RTN_LOCAL: + rtfn = mctp_route_input; + break; + case RTN_UNICAST: + rtfn = mctp_route_output; + break; + default: + return -EINVAL; + } + + rt = mctp_route_alloc(); + if (!rt) + return -ENOMEM; + + rt->min = daddr_start; + rt->max = daddr_start + daddr_extent; + rt->mtu = mtu; + rt->dev = mdev; + mctp_dev_hold(rt->dev); + rt->type = type; + rt->output = rtfn; + + ASSERT_RTNL(); + /* Prevent duplicate identical routes. */ + list_for_each_entry(ert, &net->mctp.routes, list) { + if (mctp_rt_compare_exact(rt, ert)) { + mctp_route_release(rt); + return -EEXIST; + } + } + + list_add_rcu(&rt->list, &net->mctp.routes); + + return 0; +} + +static int mctp_route_remove(struct mctp_dev *mdev, mctp_eid_t daddr_start, + unsigned int daddr_extent) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_route *rt, *tmp; + mctp_eid_t daddr_end; + bool dropped; + + if (daddr_extent > 0xff || daddr_start + daddr_extent >= 255) + return -EINVAL; + + daddr_end = daddr_start + daddr_extent; + dropped = false; + + ASSERT_RTNL(); + + list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) { + if (rt->dev == mdev && + rt->min == daddr_start && rt->max == daddr_end) { + list_del_rcu(&rt->list); + /* TODO: immediate RTM_DELROUTE */ + mctp_route_release(rt); + dropped = true; + } + } + + return dropped ? 0 : -ENOENT; +} + +int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr) +{ + return mctp_route_add(mdev, addr, 0, 0, RTN_LOCAL); +} + +int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr) +{ + return mctp_route_remove(mdev, addr, 0); +} + +/* removes all entries for a given device */ +void mctp_route_remove_dev(struct mctp_dev *mdev) +{ + struct net *net = dev_net(mdev->dev); + struct mctp_route *rt, *tmp; + + ASSERT_RTNL(); + list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) { + if (rt->dev == mdev) { + list_del_rcu(&rt->list); + /* TODO: immediate RTM_DELROUTE */ + mctp_route_release(rt); + } + } +} + +/* Incoming packet-handling */ + +static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, + struct net_device *orig_dev) +{ + struct net *net = dev_net(dev); + struct mctp_dev *mdev; + struct mctp_skb_cb *cb; + struct mctp_route *rt; + struct mctp_hdr *mh; + + rcu_read_lock(); + mdev = __mctp_dev_get(dev); + rcu_read_unlock(); + if (!mdev) { + /* basic non-data sanity checks */ + goto err_drop; + } + + if (!pskb_may_pull(skb, sizeof(struct mctp_hdr))) + goto err_drop; + + skb_reset_transport_header(skb); + skb_reset_network_header(skb); + + /* We have enough for a header; decode and route */ + mh = mctp_hdr(skb); + if (mh->ver < MCTP_VER_MIN || mh->ver > MCTP_VER_MAX) + goto err_drop; + + /* MCTP drivers must populate halen/haddr */ + if (dev->type == ARPHRD_MCTP) { + cb = mctp_cb(skb); + } else { + cb = __mctp_cb(skb); + cb->halen = 0; + } + cb->net = READ_ONCE(mdev->net); + cb->ifindex = dev->ifindex; + + rt = mctp_route_lookup(net, cb->net, mh->dest); + + /* NULL EID, but addressed to our physical address */ + if (!rt && mh->dest == MCTP_ADDR_NULL && skb->pkt_type == PACKET_HOST) + rt = mctp_route_lookup_null(net, dev); + + if (!rt) + goto err_drop; + + rt->output(rt, skb); + mctp_route_release(rt); + + return NET_RX_SUCCESS; + +err_drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +static struct packet_type mctp_packet_type = { + .type = cpu_to_be16(ETH_P_MCTP), + .func = mctp_pkttype_receive, +}; + +/* netlink interface */ + +static const struct nla_policy rta_mctp_policy[RTA_MAX + 1] = { + [RTA_DST] = { .type = NLA_U8 }, + [RTA_METRICS] = { .type = NLA_NESTED }, + [RTA_OIF] = { .type = NLA_U32 }, +}; + +/* Common part for RTM_NEWROUTE and RTM_DELROUTE parsing. + * tb must hold RTA_MAX+1 elements. + */ +static int mctp_route_nlparse(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, + struct nlattr **tb, struct rtmsg **rtm, + struct mctp_dev **mdev, mctp_eid_t *daddr_start) +{ + struct net *net = sock_net(skb->sk); + struct net_device *dev; + unsigned int ifindex; + int rc; + + rc = nlmsg_parse(nlh, sizeof(struct rtmsg), tb, RTA_MAX, + rta_mctp_policy, extack); + if (rc < 0) { + NL_SET_ERR_MSG(extack, "incorrect format"); + return rc; + } + + if (!tb[RTA_DST]) { + NL_SET_ERR_MSG(extack, "dst EID missing"); + return -EINVAL; + } + *daddr_start = nla_get_u8(tb[RTA_DST]); + + if (!tb[RTA_OIF]) { + NL_SET_ERR_MSG(extack, "ifindex missing"); + return -EINVAL; + } + ifindex = nla_get_u32(tb[RTA_OIF]); + + *rtm = nlmsg_data(nlh); + if ((*rtm)->rtm_family != AF_MCTP) { + NL_SET_ERR_MSG(extack, "route family must be AF_MCTP"); + return -EINVAL; + } + + dev = __dev_get_by_index(net, ifindex); + if (!dev) { + NL_SET_ERR_MSG(extack, "bad ifindex"); + return -ENODEV; + } + *mdev = mctp_dev_get_rtnl(dev); + if (!*mdev) + return -ENODEV; + + if (dev->flags & IFF_LOOPBACK) { + NL_SET_ERR_MSG(extack, "no routes to loopback"); + return -EINVAL; + } + + return 0; +} + +static const struct nla_policy rta_metrics_policy[RTAX_MAX + 1] = { + [RTAX_MTU] = { .type = NLA_U32 }, +}; + +static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RTA_MAX + 1]; + struct nlattr *tbx[RTAX_MAX + 1]; + mctp_eid_t daddr_start; + struct mctp_dev *mdev; + struct rtmsg *rtm; + unsigned int mtu; + int rc; + + rc = mctp_route_nlparse(skb, nlh, extack, tb, + &rtm, &mdev, &daddr_start); + if (rc < 0) + return rc; + + if (rtm->rtm_type != RTN_UNICAST) { + NL_SET_ERR_MSG(extack, "rtm_type must be RTN_UNICAST"); + return -EINVAL; + } + + mtu = 0; + if (tb[RTA_METRICS]) { + rc = nla_parse_nested(tbx, RTAX_MAX, tb[RTA_METRICS], + rta_metrics_policy, NULL); + if (rc < 0) + return rc; + if (tbx[RTAX_MTU]) + mtu = nla_get_u32(tbx[RTAX_MTU]); + } + + if (rtm->rtm_type != RTN_UNICAST) + return -EINVAL; + + rc = mctp_route_add(mdev, daddr_start, rtm->rtm_dst_len, mtu, + rtm->rtm_type); + return rc; +} + +static int mctp_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RTA_MAX + 1]; + mctp_eid_t daddr_start; + struct mctp_dev *mdev; + struct rtmsg *rtm; + int rc; + + rc = mctp_route_nlparse(skb, nlh, extack, tb, + &rtm, &mdev, &daddr_start); + if (rc < 0) + return rc; + + /* we only have unicast routes */ + if (rtm->rtm_type != RTN_UNICAST) + return -EINVAL; + + rc = mctp_route_remove(mdev, daddr_start, rtm->rtm_dst_len); + return rc; +} + +static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt, + u32 portid, u32 seq, int event, unsigned int flags) +{ + struct nlmsghdr *nlh; + struct rtmsg *hdr; + void *metrics; + + nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags); + if (!nlh) + return -EMSGSIZE; + + hdr = nlmsg_data(nlh); + hdr->rtm_family = AF_MCTP; + + /* we use the _len fields as a number of EIDs, rather than + * a number of bits in the address + */ + hdr->rtm_dst_len = rt->max - rt->min; + hdr->rtm_src_len = 0; + hdr->rtm_tos = 0; + hdr->rtm_table = RT_TABLE_DEFAULT; + hdr->rtm_protocol = RTPROT_STATIC; /* everything is user-defined */ + hdr->rtm_scope = RT_SCOPE_LINK; /* TODO: scope in mctp_route? */ + hdr->rtm_type = rt->type; + + if (nla_put_u8(skb, RTA_DST, rt->min)) + goto cancel; + + metrics = nla_nest_start_noflag(skb, RTA_METRICS); + if (!metrics) + goto cancel; + + if (rt->mtu) { + if (nla_put_u32(skb, RTAX_MTU, rt->mtu)) + goto cancel; + } + + nla_nest_end(skb, metrics); + + if (rt->dev) { + if (nla_put_u32(skb, RTA_OIF, rt->dev->dev->ifindex)) + goto cancel; + } + + /* TODO: conditional neighbour physaddr? */ + + nlmsg_end(skb, nlh); + + return 0; + +cancel: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static int mctp_dump_rtinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net *net = sock_net(skb->sk); + struct mctp_route *rt; + int s_idx, idx; + + /* TODO: allow filtering on route data, possibly under + * cb->strict_check + */ + + /* TODO: change to struct overlay */ + s_idx = cb->args[0]; + idx = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(rt, &net->mctp.routes, list) { + if (idx++ < s_idx) + continue; + if (mctp_fill_rtinfo(skb, rt, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWROUTE, NLM_F_MULTI) < 0) + break; + } + + rcu_read_unlock(); + cb->args[0] = idx; + + return skb->len; +} + +/* net namespace implementation */ +static int __net_init mctp_routes_net_init(struct net *net) +{ + struct netns_mctp *ns = &net->mctp; + + INIT_LIST_HEAD(&ns->routes); + INIT_HLIST_HEAD(&ns->binds); + mutex_init(&ns->bind_lock); + INIT_HLIST_HEAD(&ns->keys); + spin_lock_init(&ns->keys_lock); + WARN_ON(mctp_default_net_set(net, MCTP_INITIAL_DEFAULT_NET)); + return 0; +} + +static void __net_exit mctp_routes_net_exit(struct net *net) +{ + struct mctp_route *rt; + + rcu_read_lock(); + list_for_each_entry_rcu(rt, &net->mctp.routes, list) + mctp_route_release(rt); + rcu_read_unlock(); +} + +static struct pernet_operations mctp_net_ops = { + .init = mctp_routes_net_init, + .exit = mctp_routes_net_exit, +}; + +int __init mctp_routes_init(void) +{ + dev_add_pack(&mctp_packet_type); + + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_GETROUTE, + NULL, mctp_dump_rtinfo, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_NEWROUTE, + mctp_newroute, NULL, 0); + rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_DELROUTE, + mctp_delroute, NULL, 0); + + return register_pernet_subsys(&mctp_net_ops); +} + +void __exit mctp_routes_exit(void) +{ + unregister_pernet_subsys(&mctp_net_ops); + rtnl_unregister(PF_MCTP, RTM_DELROUTE); + rtnl_unregister(PF_MCTP, RTM_NEWROUTE); + rtnl_unregister(PF_MCTP, RTM_GETROUTE); + dev_remove_pack(&mctp_packet_type); +} + +#if IS_ENABLED(CONFIG_MCTP_TEST) +#include "test/route-test.c" +#endif diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c new file mode 100644 index 000000000000..36fac3daf86a --- /dev/null +++ b/net/mctp/test/route-test.c @@ -0,0 +1,544 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <kunit/test.h> + +#include "utils.h" + +struct mctp_test_route { + struct mctp_route rt; + struct sk_buff_head pkts; +}; + +static int mctp_test_route_output(struct mctp_route *rt, struct sk_buff *skb) +{ + struct mctp_test_route *test_rt = container_of(rt, struct mctp_test_route, rt); + + skb_queue_tail(&test_rt->pkts, skb); + + return 0; +} + +/* local version of mctp_route_alloc() */ +static struct mctp_test_route *mctp_route_test_alloc(void) +{ + struct mctp_test_route *rt; + + rt = kzalloc(sizeof(*rt), GFP_KERNEL); + if (!rt) + return NULL; + + INIT_LIST_HEAD(&rt->rt.list); + refcount_set(&rt->rt.refs, 1); + rt->rt.output = mctp_test_route_output; + + skb_queue_head_init(&rt->pkts); + + return rt; +} + +static struct mctp_test_route *mctp_test_create_route(struct net *net, + struct mctp_dev *dev, + mctp_eid_t eid, + unsigned int mtu) +{ + struct mctp_test_route *rt; + + rt = mctp_route_test_alloc(); + if (!rt) + return NULL; + + rt->rt.min = eid; + rt->rt.max = eid; + rt->rt.mtu = mtu; + rt->rt.type = RTN_UNSPEC; + if (dev) + mctp_dev_hold(dev); + rt->rt.dev = dev; + + list_add_rcu(&rt->rt.list, &net->mctp.routes); + + return rt; +} + +static void mctp_test_route_destroy(struct kunit *test, + struct mctp_test_route *rt) +{ + unsigned int refs; + + rtnl_lock(); + list_del_rcu(&rt->rt.list); + rtnl_unlock(); + + skb_queue_purge(&rt->pkts); + if (rt->rt.dev) + mctp_dev_put(rt->rt.dev); + + refs = refcount_read(&rt->rt.refs); + KUNIT_ASSERT_EQ_MSG(test, refs, 1, "route ref imbalance"); + + kfree_rcu(&rt->rt, rcu); +} + +static struct sk_buff *mctp_test_create_skb(const struct mctp_hdr *hdr, + unsigned int data_len) +{ + size_t hdr_len = sizeof(*hdr); + struct sk_buff *skb; + unsigned int i; + u8 *buf; + + skb = alloc_skb(hdr_len + data_len, GFP_KERNEL); + if (!skb) + return NULL; + + memcpy(skb_put(skb, hdr_len), hdr, hdr_len); + + buf = skb_put(skb, data_len); + for (i = 0; i < data_len; i++) + buf[i] = i & 0xff; + + return skb; +} + +static struct sk_buff *__mctp_test_create_skb_data(const struct mctp_hdr *hdr, + const void *data, + size_t data_len) +{ + size_t hdr_len = sizeof(*hdr); + struct sk_buff *skb; + + skb = alloc_skb(hdr_len + data_len, GFP_KERNEL); + if (!skb) + return NULL; + + memcpy(skb_put(skb, hdr_len), hdr, hdr_len); + memcpy(skb_put(skb, data_len), data, data_len); + + return skb; +} + +#define mctp_test_create_skb_data(h, d) \ + __mctp_test_create_skb_data(h, d, sizeof(*d)) + +struct mctp_frag_test { + unsigned int mtu; + unsigned int msgsize; + unsigned int n_frags; +}; + +static void mctp_test_fragment(struct kunit *test) +{ + const struct mctp_frag_test *params; + int rc, i, n, mtu, msgsize; + struct mctp_test_route *rt; + struct sk_buff *skb; + struct mctp_hdr hdr; + u8 seq; + + params = test->param_value; + mtu = params->mtu; + msgsize = params->msgsize; + + hdr.ver = 1; + hdr.src = 8; + hdr.dest = 10; + hdr.flags_seq_tag = MCTP_HDR_FLAG_TO; + + skb = mctp_test_create_skb(&hdr, msgsize); + KUNIT_ASSERT_TRUE(test, skb); + + rt = mctp_test_create_route(&init_net, NULL, 10, mtu); + KUNIT_ASSERT_TRUE(test, rt); + + /* The refcount would usually be incremented as part of a route lookup, + * but we're setting the route directly here. + */ + refcount_inc(&rt->rt.refs); + + rc = mctp_do_fragment_route(&rt->rt, skb, mtu, MCTP_TAG_OWNER); + KUNIT_EXPECT_FALSE(test, rc); + + n = rt->pkts.qlen; + + KUNIT_EXPECT_EQ(test, n, params->n_frags); + + for (i = 0;; i++) { + struct mctp_hdr *hdr2; + struct sk_buff *skb2; + u8 tag_mask, seq2; + bool first, last; + + first = i == 0; + last = i == (n - 1); + + skb2 = skb_dequeue(&rt->pkts); + + if (!skb2) + break; + + hdr2 = mctp_hdr(skb2); + + tag_mask = MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO; + + KUNIT_EXPECT_EQ(test, hdr2->ver, hdr.ver); + KUNIT_EXPECT_EQ(test, hdr2->src, hdr.src); + KUNIT_EXPECT_EQ(test, hdr2->dest, hdr.dest); + KUNIT_EXPECT_EQ(test, hdr2->flags_seq_tag & tag_mask, + hdr.flags_seq_tag & tag_mask); + + KUNIT_EXPECT_EQ(test, + !!(hdr2->flags_seq_tag & MCTP_HDR_FLAG_SOM), first); + KUNIT_EXPECT_EQ(test, + !!(hdr2->flags_seq_tag & MCTP_HDR_FLAG_EOM), last); + + seq2 = (hdr2->flags_seq_tag >> MCTP_HDR_SEQ_SHIFT) & + MCTP_HDR_SEQ_MASK; + + if (first) { + seq = seq2; + } else { + seq++; + KUNIT_EXPECT_EQ(test, seq2, seq & MCTP_HDR_SEQ_MASK); + } + + if (!last) + KUNIT_EXPECT_EQ(test, skb2->len, mtu); + else + KUNIT_EXPECT_LE(test, skb2->len, mtu); + + kfree_skb(skb2); + } + + mctp_test_route_destroy(test, rt); +} + +static const struct mctp_frag_test mctp_frag_tests[] = { + {.mtu = 68, .msgsize = 63, .n_frags = 1}, + {.mtu = 68, .msgsize = 64, .n_frags = 1}, + {.mtu = 68, .msgsize = 65, .n_frags = 2}, + {.mtu = 68, .msgsize = 66, .n_frags = 2}, + {.mtu = 68, .msgsize = 127, .n_frags = 2}, + {.mtu = 68, .msgsize = 128, .n_frags = 2}, + {.mtu = 68, .msgsize = 129, .n_frags = 3}, + {.mtu = 68, .msgsize = 130, .n_frags = 3}, +}; + +static void mctp_frag_test_to_desc(const struct mctp_frag_test *t, char *desc) +{ + sprintf(desc, "mtu %d len %d -> %d frags", + t->msgsize, t->mtu, t->n_frags); +} + +KUNIT_ARRAY_PARAM(mctp_frag, mctp_frag_tests, mctp_frag_test_to_desc); + +struct mctp_rx_input_test { + struct mctp_hdr hdr; + bool input; +}; + +static void mctp_test_rx_input(struct kunit *test) +{ + const struct mctp_rx_input_test *params; + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct sk_buff *skb; + + params = test->param_value; + + dev = mctp_test_create_dev(); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); + + rt = mctp_test_create_route(&init_net, dev->mdev, 8, 68); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt); + + skb = mctp_test_create_skb(¶ms->hdr, 1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); + + __mctp_cb(skb); + + mctp_pkttype_receive(skb, dev->ndev, &mctp_packet_type, NULL); + + KUNIT_EXPECT_EQ(test, !!rt->pkts.qlen, params->input); + + mctp_test_route_destroy(test, rt); + mctp_test_destroy_dev(dev); +} + +#define RX_HDR(_ver, _src, _dest, _fst) \ + { .ver = _ver, .src = _src, .dest = _dest, .flags_seq_tag = _fst } + +/* we have a route for EID 8 only */ +static const struct mctp_rx_input_test mctp_rx_input_tests[] = { + { .hdr = RX_HDR(1, 10, 8, 0), .input = true }, + { .hdr = RX_HDR(1, 10, 9, 0), .input = false }, /* no input route */ + { .hdr = RX_HDR(2, 10, 8, 0), .input = false }, /* invalid version */ +}; + +static void mctp_rx_input_test_to_desc(const struct mctp_rx_input_test *t, + char *desc) +{ + sprintf(desc, "{%x,%x,%x,%x}", t->hdr.ver, t->hdr.src, t->hdr.dest, + t->hdr.flags_seq_tag); +} + +KUNIT_ARRAY_PARAM(mctp_rx_input, mctp_rx_input_tests, + mctp_rx_input_test_to_desc); + +/* set up a local dev, route on EID 8, and a socket listening on type 0 */ +static void __mctp_route_test_init(struct kunit *test, + struct mctp_test_dev **devp, + struct mctp_test_route **rtp, + struct socket **sockp) +{ + struct sockaddr_mctp addr; + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct socket *sock; + int rc; + + dev = mctp_test_create_dev(); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); + + rt = mctp_test_create_route(&init_net, dev->mdev, 8, 68); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt); + + rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock); + KUNIT_ASSERT_EQ(test, rc, 0); + + addr.smctp_family = AF_MCTP; + addr.smctp_network = MCTP_NET_ANY; + addr.smctp_addr.s_addr = 8; + addr.smctp_type = 0; + rc = kernel_bind(sock, (struct sockaddr *)&addr, sizeof(addr)); + KUNIT_ASSERT_EQ(test, rc, 0); + + *rtp = rt; + *devp = dev; + *sockp = sock; +} + +static void __mctp_route_test_fini(struct kunit *test, + struct mctp_test_dev *dev, + struct mctp_test_route *rt, + struct socket *sock) +{ + sock_release(sock); + mctp_test_route_destroy(test, rt); + mctp_test_destroy_dev(dev); +} + +struct mctp_route_input_sk_test { + struct mctp_hdr hdr; + u8 type; + bool deliver; +}; + +static void mctp_test_route_input_sk(struct kunit *test) +{ + const struct mctp_route_input_sk_test *params; + struct sk_buff *skb, *skb2; + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct socket *sock; + int rc; + + params = test->param_value; + + __mctp_route_test_init(test, &dev, &rt, &sock); + + skb = mctp_test_create_skb_data(¶ms->hdr, ¶ms->type); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); + + skb->dev = dev->ndev; + __mctp_cb(skb); + + rc = mctp_route_input(&rt->rt, skb); + + if (params->deliver) { + KUNIT_EXPECT_EQ(test, rc, 0); + + skb2 = skb_recv_datagram(sock->sk, 0, 1, &rc); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, skb2); + KUNIT_EXPECT_EQ(test, skb->len, 1); + + skb_free_datagram(sock->sk, skb2); + + } else { + KUNIT_EXPECT_NE(test, rc, 0); + skb2 = skb_recv_datagram(sock->sk, 0, 1, &rc); + KUNIT_EXPECT_PTR_EQ(test, skb2, NULL); + } + + __mctp_route_test_fini(test, dev, rt, sock); +} + +#define FL_S (MCTP_HDR_FLAG_SOM) +#define FL_E (MCTP_HDR_FLAG_EOM) +#define FL_T (MCTP_HDR_FLAG_TO) + +static const struct mctp_route_input_sk_test mctp_route_input_sk_tests[] = { + { .hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_T), .type = 0, .deliver = true }, + { .hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_T), .type = 1, .deliver = false }, + { .hdr = RX_HDR(1, 10, 8, FL_S | FL_E), .type = 0, .deliver = false }, + { .hdr = RX_HDR(1, 10, 8, FL_E | FL_T), .type = 0, .deliver = false }, + { .hdr = RX_HDR(1, 10, 8, FL_T), .type = 0, .deliver = false }, + { .hdr = RX_HDR(1, 10, 8, 0), .type = 0, .deliver = false }, +}; + +static void mctp_route_input_sk_to_desc(const struct mctp_route_input_sk_test *t, + char *desc) +{ + sprintf(desc, "{%x,%x,%x,%x} type %d", t->hdr.ver, t->hdr.src, + t->hdr.dest, t->hdr.flags_seq_tag, t->type); +} + +KUNIT_ARRAY_PARAM(mctp_route_input_sk, mctp_route_input_sk_tests, + mctp_route_input_sk_to_desc); + +struct mctp_route_input_sk_reasm_test { + const char *name; + struct mctp_hdr hdrs[4]; + int n_hdrs; + int rx_len; +}; + +static void mctp_test_route_input_sk_reasm(struct kunit *test) +{ + const struct mctp_route_input_sk_reasm_test *params; + struct sk_buff *skb, *skb2; + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct socket *sock; + int i, rc; + u8 c; + + params = test->param_value; + + __mctp_route_test_init(test, &dev, &rt, &sock); + + for (i = 0; i < params->n_hdrs; i++) { + c = i; + skb = mctp_test_create_skb_data(¶ms->hdrs[i], &c); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); + + skb->dev = dev->ndev; + __mctp_cb(skb); + + rc = mctp_route_input(&rt->rt, skb); + } + + skb2 = skb_recv_datagram(sock->sk, 0, 1, &rc); + + if (params->rx_len) { + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, skb2); + KUNIT_EXPECT_EQ(test, skb2->len, params->rx_len); + skb_free_datagram(sock->sk, skb2); + + } else { + KUNIT_EXPECT_PTR_EQ(test, skb2, NULL); + } + + __mctp_route_test_fini(test, dev, rt, sock); +} + +#define RX_FRAG(f, s) RX_HDR(1, 10, 8, FL_T | (f) | ((s) << MCTP_HDR_SEQ_SHIFT)) + +static const struct mctp_route_input_sk_reasm_test mctp_route_input_sk_reasm_tests[] = { + { + .name = "single packet", + .hdrs = { + RX_FRAG(FL_S | FL_E, 0), + }, + .n_hdrs = 1, + .rx_len = 1, + }, + { + .name = "single packet, offset seq", + .hdrs = { + RX_FRAG(FL_S | FL_E, 1), + }, + .n_hdrs = 1, + .rx_len = 1, + }, + { + .name = "start & end packets", + .hdrs = { + RX_FRAG(FL_S, 0), + RX_FRAG(FL_E, 1), + }, + .n_hdrs = 2, + .rx_len = 2, + }, + { + .name = "start & end packets, offset seq", + .hdrs = { + RX_FRAG(FL_S, 1), + RX_FRAG(FL_E, 2), + }, + .n_hdrs = 2, + .rx_len = 2, + }, + { + .name = "start & end packets, out of order", + .hdrs = { + RX_FRAG(FL_E, 1), + RX_FRAG(FL_S, 0), + }, + .n_hdrs = 2, + .rx_len = 0, + }, + { + .name = "start, middle & end packets", + .hdrs = { + RX_FRAG(FL_S, 0), + RX_FRAG(0, 1), + RX_FRAG(FL_E, 2), + }, + .n_hdrs = 3, + .rx_len = 3, + }, + { + .name = "missing seq", + .hdrs = { + RX_FRAG(FL_S, 0), + RX_FRAG(FL_E, 2), + }, + .n_hdrs = 2, + .rx_len = 0, + }, + { + .name = "seq wrap", + .hdrs = { + RX_FRAG(FL_S, 3), + RX_FRAG(FL_E, 0), + }, + .n_hdrs = 2, + .rx_len = 2, + }, +}; + +static void mctp_route_input_sk_reasm_to_desc( + const struct mctp_route_input_sk_reasm_test *t, + char *desc) +{ + sprintf(desc, "%s", t->name); +} + +KUNIT_ARRAY_PARAM(mctp_route_input_sk_reasm, mctp_route_input_sk_reasm_tests, + mctp_route_input_sk_reasm_to_desc); + +static struct kunit_case mctp_test_cases[] = { + KUNIT_CASE_PARAM(mctp_test_fragment, mctp_frag_gen_params), + KUNIT_CASE_PARAM(mctp_test_rx_input, mctp_rx_input_gen_params), + KUNIT_CASE_PARAM(mctp_test_route_input_sk, mctp_route_input_sk_gen_params), + KUNIT_CASE_PARAM(mctp_test_route_input_sk_reasm, + mctp_route_input_sk_reasm_gen_params), + {} +}; + +static struct kunit_suite mctp_test_suite = { + .name = "mctp", + .test_cases = mctp_test_cases, +}; + +kunit_test_suite(mctp_test_suite); diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c new file mode 100644 index 000000000000..cc6b8803aa9d --- /dev/null +++ b/net/mctp/test/utils.c @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/netdevice.h> +#include <linux/mctp.h> +#include <linux/if_arp.h> + +#include <net/mctpdevice.h> +#include <net/pkt_sched.h> + +#include "utils.h" + +static netdev_tx_t mctp_test_dev_tx(struct sk_buff *skb, + struct net_device *ndev) +{ + kfree(skb); + return NETDEV_TX_OK; +} + +static const struct net_device_ops mctp_test_netdev_ops = { + .ndo_start_xmit = mctp_test_dev_tx, +}; + +static void mctp_test_dev_setup(struct net_device *ndev) +{ + ndev->type = ARPHRD_MCTP; + ndev->mtu = MCTP_DEV_TEST_MTU; + ndev->hard_header_len = 0; + ndev->addr_len = 0; + ndev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; + ndev->flags = IFF_NOARP; + ndev->netdev_ops = &mctp_test_netdev_ops; + ndev->needs_free_netdev = true; +} + +struct mctp_test_dev *mctp_test_create_dev(void) +{ + struct mctp_test_dev *dev; + struct net_device *ndev; + int rc; + + ndev = alloc_netdev(sizeof(*dev), "mctptest%d", NET_NAME_ENUM, + mctp_test_dev_setup); + if (!ndev) + return NULL; + + dev = netdev_priv(ndev); + dev->ndev = ndev; + + rc = register_netdev(ndev); + if (rc) { + free_netdev(ndev); + return NULL; + } + + rcu_read_lock(); + dev->mdev = __mctp_dev_get(ndev); + mctp_dev_hold(dev->mdev); + rcu_read_unlock(); + + return dev; +} + +void mctp_test_destroy_dev(struct mctp_test_dev *dev) +{ + mctp_dev_put(dev->mdev); + unregister_netdev(dev->ndev); +} diff --git a/net/mctp/test/utils.h b/net/mctp/test/utils.h new file mode 100644 index 000000000000..df6aa1c03440 --- /dev/null +++ b/net/mctp/test/utils.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __NET_MCTP_TEST_UTILS_H +#define __NET_MCTP_TEST_UTILS_H + +#include <kunit/test.h> + +#define MCTP_DEV_TEST_MTU 68 + +struct mctp_test_dev { + struct net_device *ndev; + struct mctp_dev *mdev; +}; + +struct mctp_test_dev; + +struct mctp_test_dev *mctp_test_create_dev(void); +void mctp_test_destroy_dev(struct mctp_test_dev *dev); + +#endif /* __NET_MCTP_TEST_UTILS_H */ diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 05a21dd072df..ffeb2df8be7a 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -407,7 +407,6 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, /* Verify ttl is valid */ if (dec.ttl <= 1) goto err; - dec.ttl -= 1; /* Find the output device */ out_dev = rcu_dereference(nh->nh_dev); @@ -431,6 +430,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, skb->dev = out_dev; skb->protocol = htons(ETH_P_MPLS_UC); + dec.ttl -= 1; if (unlikely(!new_header_size && dec.bos)) { /* Penultimate hop popping */ if (!mpls_egress(dev_net(out_dev), rt, skb, dec)) diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 7d738bd06f2c..8b235468c88f 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -21,43 +21,50 @@ struct mptcp_pernet { struct ctl_table_header *ctl_table_hdr; #endif - u8 mptcp_enabled; unsigned int add_addr_timeout; + unsigned int stale_loss_cnt; + u8 mptcp_enabled; u8 checksum_enabled; u8 allow_join_initial_addr_port; }; -static struct mptcp_pernet *mptcp_get_pernet(struct net *net) +static struct mptcp_pernet *mptcp_get_pernet(const struct net *net) { return net_generic(net, mptcp_pernet_id); } -int mptcp_is_enabled(struct net *net) +int mptcp_is_enabled(const struct net *net) { return mptcp_get_pernet(net)->mptcp_enabled; } -unsigned int mptcp_get_add_addr_timeout(struct net *net) +unsigned int mptcp_get_add_addr_timeout(const struct net *net) { return mptcp_get_pernet(net)->add_addr_timeout; } -int mptcp_is_checksum_enabled(struct net *net) +int mptcp_is_checksum_enabled(const struct net *net) { return mptcp_get_pernet(net)->checksum_enabled; } -int mptcp_allow_join_id0(struct net *net) +int mptcp_allow_join_id0(const struct net *net) { return mptcp_get_pernet(net)->allow_join_initial_addr_port; } +unsigned int mptcp_stale_loss_cnt(const struct net *net) +{ + return mptcp_get_pernet(net)->stale_loss_cnt; +} + static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) { pernet->mptcp_enabled = 1; pernet->add_addr_timeout = TCP_RTO_MAX; pernet->checksum_enabled = 0; pernet->allow_join_initial_addr_port = 1; + pernet->stale_loss_cnt = 4; } #ifdef CONFIG_SYSCTL @@ -95,6 +102,12 @@ static struct ctl_table mptcp_sysctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, + { + .procname = "stale_loss_cnt", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + }, {} }; @@ -114,6 +127,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet) table[1].data = &pernet->add_addr_timeout; table[2].data = &pernet->checksum_enabled; table[3].data = &pernet->allow_join_initial_addr_port; + table[4].data = &pernet->stale_loss_cnt; hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table); if (!hdr) diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index ff2cc0e3273d..3240b72271a7 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -44,7 +44,11 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW), SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX), SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX), + SNMP_MIB_ITEM("MPFailTx", MPTCP_MIB_MPFAILTX), + SNMP_MIB_ITEM("MPFailRx", MPTCP_MIB_MPFAILRX), SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED), + SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE), + SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER), SNMP_MIB_SENTINEL }; @@ -68,6 +72,7 @@ bool mptcp_mib_alloc(struct net *net) void mptcp_seq_show(struct seq_file *seq) { + unsigned long sum[ARRAY_SIZE(mptcp_snmp_list) - 1]; struct net *net = seq->private; int i; @@ -77,17 +82,13 @@ void mptcp_seq_show(struct seq_file *seq) seq_puts(seq, "\nMPTcpExt:"); - if (!net->mib.mptcp_statistics) { - for (i = 0; mptcp_snmp_list[i].name; i++) - seq_puts(seq, " 0"); - - seq_putc(seq, '\n'); - return; - } + memset(sum, 0, sizeof(sum)); + if (net->mib.mptcp_statistics) + snmp_get_cpu_field_batch(sum, mptcp_snmp_list, + net->mib.mptcp_statistics); for (i = 0; mptcp_snmp_list[i].name; i++) - seq_printf(seq, " %lu", - snmp_fold_field(net->mib.mptcp_statistics, - mptcp_snmp_list[i].entry)); + seq_printf(seq, " %lu", sum[i]); + seq_putc(seq, '\n'); } diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 0663cb12b448..ecd3d8b117e0 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -37,7 +37,11 @@ enum linux_mptcp_mib_field { MPTCP_MIB_RMSUBFLOW, /* Remove a subflow */ MPTCP_MIB_MPPRIOTX, /* Transmit a MP_PRIO */ MPTCP_MIB_MPPRIORX, /* Received a MP_PRIO */ + MPTCP_MIB_MPFAILTX, /* Transmit a MP_FAIL */ + MPTCP_MIB_MPFAILRX, /* Received a MP_FAIL */ MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */ + MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */ + MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */ __MPTCP_MIB_MAX }; diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index f48eb6315bbb..f44125dd6697 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -36,7 +36,7 @@ static int mptcp_diag_dump_one(struct netlink_callback *cb, struct sock *sk; net = sock_net(in_skb->sk); - msk = mptcp_token_get_sock(req->id.idiag_cookie[0]); + msk = mptcp_token_get_sock(net, req->id.idiag_cookie[0]); if (!msk) goto out_nosk; @@ -113,37 +113,13 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_info *info = _info; - u32 flags = 0; - bool slow; - u8 val; r->idiag_rqueue = sk_rmem_alloc_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); if (!info) return; - slow = lock_sock_fast(sk); - info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); - info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); - info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); - info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); - info->mptcpi_subflows_max = mptcp_pm_get_subflows_max(msk); - val = mptcp_pm_get_add_addr_signal_max(msk); - info->mptcpi_add_addr_signal_max = val; - val = mptcp_pm_get_add_addr_accept_max(msk); - info->mptcpi_add_addr_accepted_max = val; - info->mptcpi_local_addr_max = mptcp_pm_get_local_addr_max(msk); - if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) - flags |= MPTCP_INFO_FLAG_FALLBACK; - if (READ_ONCE(msk->can_ack)) - flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; - info->mptcpi_flags = flags; - info->mptcpi_token = READ_ONCE(msk->token); - info->mptcpi_write_seq = READ_ONCE(msk->write_seq); - info->mptcpi_snd_una = READ_ONCE(msk->snd_una); - info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); - info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); - unlock_sock_fast(sk, slow); + mptcp_diag_fill_info(msk, info); } static const struct inet_diag_handler mptcp_diag_handler = { diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 7adcbc1f7d49..7c3420afb1a0 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -81,12 +81,11 @@ static void mptcp_parse_option(const struct sk_buff *skb, * is if both hosts in their SYNs set A=0." */ if (flags & MPTCP_CAP_CHECKSUM_REQD) - mp_opt->csum_reqd = 1; + mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; - if (flags & MPTCP_CAP_DENY_JOIN_ID0) - mp_opt->deny_join_id0 = 1; + mp_opt->deny_join_id0 = !!(flags & MPTCP_CAP_DENY_JOIN_ID0); - mp_opt->mp_capable = 1; + mp_opt->suboptions |= OPTIONS_MPTCP_MPC; if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { mp_opt->sndr_key = get_unaligned_be64(ptr); ptr += 8; @@ -101,7 +100,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, * equivalent to those in a DSS option and can be used * interchangeably." */ - mp_opt->dss = 1; + mp_opt->suboptions |= OPTION_MPTCP_DSS; mp_opt->use_map = 1; mp_opt->mpc_map = 1; mp_opt->data_len = get_unaligned_be16(ptr); @@ -109,7 +108,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, } if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM) { mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); - mp_opt->csum_reqd = 1; + mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; ptr += 2; } pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d csum=%u", @@ -118,7 +117,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, break; case MPTCPOPT_MP_JOIN: - mp_opt->mp_join = 1; + mp_opt->suboptions |= OPTIONS_MPTCP_MPJ; if (opsize == TCPOLEN_MPTCP_MPJ_SYN) { mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; mp_opt->join_id = *ptr++; @@ -144,7 +143,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN); pr_debug("MP_JOIN hmac"); } else { - mp_opt->mp_join = 0; + mp_opt->suboptions &= ~OPTIONS_MPTCP_MPJ; } break; @@ -192,8 +191,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) break; - mp_opt->dss = 1; - + mp_opt->suboptions |= OPTION_MPTCP_DSS; if (mp_opt->use_ack) { if (mp_opt->ack64) { mp_opt->data_ack = get_unaligned_be64(ptr); @@ -222,14 +220,15 @@ static void mptcp_parse_option(const struct sk_buff *skb, ptr += 2; if (opsize == expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) { - mp_opt->csum_reqd = 1; + mp_opt->suboptions |= OPTION_MPTCP_CSUMREQD; mp_opt->csum = (__force __sum16)get_unaligned_be16(ptr); ptr += 2; } pr_debug("data_seq=%llu subflow_seq=%u data_len=%u csum=%d:%u", mp_opt->data_seq, mp_opt->subflow_seq, - mp_opt->data_len, mp_opt->csum_reqd, mp_opt->csum); + mp_opt->data_len, !!(mp_opt->suboptions & OPTION_MPTCP_CSUMREQD), + mp_opt->csum); } break; @@ -260,8 +259,10 @@ static void mptcp_parse_option(const struct sk_buff *skb, break; } - mp_opt->add_addr = 1; + mp_opt->suboptions |= OPTION_MPTCP_ADD_ADDR; mp_opt->addr.id = *ptr++; + mp_opt->addr.port = 0; + mp_opt->ahmac = 0; if (mp_opt->addr.family == AF_INET) { memcpy((u8 *)&mp_opt->addr.addr.s_addr, (u8 *)ptr, 4); ptr += 4; @@ -298,7 +299,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, ptr++; - mp_opt->rm_addr = 1; + mp_opt->suboptions |= OPTION_MPTCP_RM_ADDR; mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE; for (i = 0; i < mp_opt->rm_list.nr; i++) mp_opt->rm_list.ids[i] = *ptr++; @@ -309,7 +310,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, if (opsize != TCPOLEN_MPTCP_PRIO) break; - mp_opt->mp_prio = 1; + mp_opt->suboptions |= OPTION_MPTCP_PRIO; mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP; pr_debug("MP_PRIO: prio=%d", mp_opt->backup); break; @@ -321,7 +322,7 @@ static void mptcp_parse_option(const struct sk_buff *skb, ptr += 2; mp_opt->rcvr_key = get_unaligned_be64(ptr); ptr += 8; - mp_opt->fastclose = 1; + mp_opt->suboptions |= OPTION_MPTCP_FASTCLOSE; break; case MPTCPOPT_RST: @@ -330,12 +331,23 @@ static void mptcp_parse_option(const struct sk_buff *skb, if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) break; - mp_opt->reset = 1; + + mp_opt->suboptions |= OPTION_MPTCP_RST; flags = *ptr++; mp_opt->reset_transient = flags & MPTCP_RST_TRANSIENT; mp_opt->reset_reason = *ptr; break; + case MPTCPOPT_MP_FAIL: + if (opsize != TCPOLEN_MPTCP_FAIL) + break; + + ptr += 2; + mp_opt->suboptions |= OPTION_MPTCP_FAIL; + mp_opt->fail_seq = get_unaligned_be64(ptr); + pr_debug("MP_FAIL: data_seq=%llu", mp_opt->fail_seq); + break; + default: break; } @@ -345,25 +357,12 @@ void mptcp_get_options(const struct sock *sk, const struct sk_buff *skb, struct mptcp_options_received *mp_opt) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - struct mptcp_sock *msk = mptcp_sk(subflow->conn); const struct tcphdr *th = tcp_hdr(skb); const unsigned char *ptr; int length; /* initialize option status */ - mp_opt->mp_capable = 0; - mp_opt->mp_join = 0; - mp_opt->add_addr = 0; - mp_opt->ahmac = 0; - mp_opt->fastclose = 0; - mp_opt->addr.port = 0; - mp_opt->rm_addr = 0; - mp_opt->dss = 0; - mp_opt->mp_prio = 0; - mp_opt->reset = 0; - mp_opt->csum_reqd = READ_ONCE(msk->csum_enabled); - mp_opt->deny_join_id0 = 0; + mp_opt->suboptions = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); @@ -486,11 +485,11 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, mpext = mptcp_get_ext(skb); data_len = mpext ? mpext->data_len : 0; - /* we will check ext_copy.data_len in mptcp_write_options() to + /* we will check ops->data_len in mptcp_write_options() to * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and * TCPOLEN_MPTCP_MPC_ACK */ - opts->ext_copy.data_len = data_len; + opts->data_len = data_len; opts->suboptions = OPTION_MPTCP_MPC_ACK; opts->sndr_key = subflow->local_key; opts->rcvr_key = subflow->remote_key; @@ -506,9 +505,9 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) { /* we need to propagate more info to csum the pseudo hdr */ - opts->ext_copy.data_seq = mpext->data_seq; - opts->ext_copy.subflow_seq = mpext->subflow_seq; - opts->ext_copy.csum = mpext->csum; + opts->data_seq = mpext->data_seq; + opts->subflow_seq = mpext->subflow_seq; + opts->csum = mpext->csum; len += TCPOLEN_MPTCP_DSS_CHECKSUM; } *size = ALIGN(len, 4); @@ -592,6 +591,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, dss_size = map_size; if (skb && snd_data_fin_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); + opts->suboptions = OPTION_MPTCP_DSS; ret = true; } @@ -615,6 +615,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, opts->ext_copy.ack64 = 0; } opts->ext_copy.use_ack = 1; + opts->suboptions = OPTION_MPTCP_DSS; WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk)); /* Add kind/length/subtype/flag overhead if mapping is not populated */ @@ -667,29 +668,34 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * bool port; int len; - if ((mptcp_pm_should_add_signal_ipv6(msk) || - mptcp_pm_should_add_signal_port(msk) || - mptcp_pm_should_add_signal_echo(msk)) && - skb && skb_is_tcp_pure_ack(skb)) { - pr_debug("drop other suboptions"); - opts->suboptions = 0; - opts->ext_copy.use_ack = 0; - opts->ext_copy.use_map = 0; - remaining += opt_size; - drop_other_suboptions = true; - } - + /* add addr will strip the existing options, be sure to avoid breaking + * MPC/MPJ handshakes + */ if (!mptcp_pm_should_add_signal(msk) || - !(mptcp_pm_add_addr_signal(msk, remaining, &opts->addr, &echo, &port))) + (opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) || + !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr, + &echo, &port, &drop_other_suboptions)) return false; + if (drop_other_suboptions) + remaining += opt_size; len = mptcp_add_addr_len(opts->addr.family, echo, port); if (remaining < len) return false; *size = len; - if (drop_other_suboptions) + if (drop_other_suboptions) { + pr_debug("drop other suboptions"); + opts->suboptions = 0; + + /* note that e.g. DSS could have written into the memory + * aliased by ahmac, we must reset the field here + * to avoid appending the hmac even for ADD_ADDR echo + * options + */ + opts->ahmac = 0; *size -= opt_size; + } opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { opts->ahmac = add_addr_generate_hmac(msk->local_key, @@ -739,7 +745,10 @@ static bool mptcp_established_options_mp_prio(struct sock *sk, { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - if (!subflow->send_mp_prio) + /* can't send MP_PRIO with MPC, as they share the same option space: + * 'backup'. Also it makes no sense at all + */ + if (!subflow->send_mp_prio || (opts->suboptions & OPTIONS_MPTCP_MPC)) return false; /* account for the trailing 'nop' option */ @@ -755,7 +764,7 @@ static bool mptcp_established_options_mp_prio(struct sock *sk, return true; } -static noinline void mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb, +static noinline bool mptcp_established_options_rst(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) @@ -763,12 +772,36 @@ static noinline void mptcp_established_options_rst(struct sock *sk, struct sk_bu const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); if (remaining < TCPOLEN_MPTCP_RST) - return; + return false; *size = TCPOLEN_MPTCP_RST; opts->suboptions |= OPTION_MPTCP_RST; opts->reset_transient = subflow->reset_transient; opts->reset_reason = subflow->reset_reason; + + return true; +} + +static bool mptcp_established_options_mp_fail(struct sock *sk, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + if (likely(!subflow->send_mp_fail)) + return false; + + if (remaining < TCPOLEN_MPTCP_FAIL) + return false; + + *size = TCPOLEN_MPTCP_FAIL; + opts->suboptions |= OPTION_MPTCP_FAIL; + opts->fail_seq = subflow->map_seq; + + pr_debug("MP_FAIL fail_seq=%llu", opts->fail_seq); + + return true; } bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, @@ -787,15 +820,28 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, return false; if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { - mptcp_established_options_rst(sk, skb, size, remaining, opts); + if (mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { + *size += opt_size; + remaining -= opt_size; + } + if (mptcp_established_options_rst(sk, skb, &opt_size, remaining, opts)) { + *size += opt_size; + remaining -= opt_size; + } return true; } snd_data_fin = mptcp_data_fin_enabled(msk); if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts)) ret = true; - else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts)) + else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts)) { ret = true; + if (mptcp_established_options_mp_fail(sk, &opt_size, remaining, opts)) { + *size += opt_size; + remaining -= opt_size; + return true; + } + } /* we reserved enough space for the above options, and exceeding the * TCP option space would be fatal @@ -868,7 +914,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, */ if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 && TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && - subflow->mp_join && mp_opt->mp_join && + subflow->mp_join && (mp_opt->suboptions & OPTIONS_MPTCP_MPJ) && READ_ONCE(msk->pm.server_side)) tcp_send_ack(ssk); goto fully_established; @@ -885,8 +931,8 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, return subflow->mp_capable; } - if ((mp_opt->dss && mp_opt->use_ack) || - (mp_opt->add_addr && !mp_opt->echo)) { + if (((mp_opt->suboptions & OPTION_MPTCP_DSS) && mp_opt->use_ack) || + ((mp_opt->suboptions & OPTION_MPTCP_ADD_ADDR) && !mp_opt->echo)) { /* subflows are fully established as soon as we get any * additional ack, including ADD_ADDR. */ @@ -899,7 +945,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, * then fallback to TCP. Fallback scenarios requires a reset for * MP_JOIN subflows. */ - if (!mp_opt->mp_capable) { + if (!(mp_opt->suboptions & OPTIONS_MPTCP_MPC)) { if (subflow->mp_join) goto reset; subflow->mp_capable = 0; @@ -971,8 +1017,8 @@ static void ack_update_msk(struct mptcp_sock *msk, old_snd_una = msk->snd_una; new_snd_una = mptcp_expand_seq(old_snd_una, mp_opt->data_ack, mp_opt->ack64); - /* ACK for data not even sent yet? Ignore. */ - if (after64(new_snd_una, snd_nxt)) + /* ACK for data not even sent yet? Ignore.*/ + if (unlikely(after64(new_snd_una, snd_nxt))) new_snd_una = old_snd_una; new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; @@ -1061,48 +1107,51 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) if (!check_fully_established(msk, sk, subflow, skb, &mp_opt)) return sk->sk_state != TCP_CLOSE; - if (mp_opt.fastclose && - msk->local_key == mp_opt.rcvr_key) { - WRITE_ONCE(msk->rcv_fastclose, true); - mptcp_schedule_work((struct sock *)msk); - } + if (unlikely(mp_opt.suboptions != OPTION_MPTCP_DSS)) { + if ((mp_opt.suboptions & OPTION_MPTCP_FASTCLOSE) && + msk->local_key == mp_opt.rcvr_key) { + WRITE_ONCE(msk->rcv_fastclose, true); + mptcp_schedule_work((struct sock *)msk); + } - if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) { - if (!mp_opt.echo) { - mptcp_pm_add_addr_received(msk, &mp_opt.addr); - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR); - } else { - mptcp_pm_add_addr_echoed(msk, &mp_opt.addr); - mptcp_pm_del_add_timer(msk, &mp_opt.addr, true); - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD); + if ((mp_opt.suboptions & OPTION_MPTCP_ADD_ADDR) && + add_addr_hmac_valid(msk, &mp_opt)) { + if (!mp_opt.echo) { + mptcp_pm_add_addr_received(msk, &mp_opt.addr); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR); + } else { + mptcp_pm_add_addr_echoed(msk, &mp_opt.addr); + mptcp_pm_del_add_timer(msk, &mp_opt.addr, true); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD); + } + + if (mp_opt.addr.port) + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD); } - if (mp_opt.addr.port) - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD); + if (mp_opt.suboptions & OPTION_MPTCP_RM_ADDR) + mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list); - mp_opt.add_addr = 0; - } + if (mp_opt.suboptions & OPTION_MPTCP_PRIO) { + mptcp_pm_mp_prio_received(sk, mp_opt.backup); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX); + } - if (mp_opt.rm_addr) { - mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list); - mp_opt.rm_addr = 0; - } + if (mp_opt.suboptions & OPTION_MPTCP_FAIL) { + mptcp_pm_mp_fail_received(sk, mp_opt.fail_seq); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFAILRX); + } - if (mp_opt.mp_prio) { - mptcp_pm_mp_prio_received(sk, mp_opt.backup); - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX); - mp_opt.mp_prio = 0; - } + if (mp_opt.suboptions & OPTION_MPTCP_RST) { + subflow->reset_seen = 1; + subflow->reset_reason = mp_opt.reset_reason; + subflow->reset_transient = mp_opt.reset_transient; + } - if (mp_opt.reset) { - subflow->reset_seen = 1; - subflow->reset_reason = mp_opt.reset_reason; - subflow->reset_transient = mp_opt.reset_transient; + if (!(mp_opt.suboptions & OPTION_MPTCP_DSS)) + return true; } - if (!mp_opt.dss) - return true; - /* we can't wait for recvmsg() to update the ack_seq, otherwise * monodirectional flows will stuck */ @@ -1129,7 +1178,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) memset(mpext, 0, sizeof(*mpext)); - if (mp_opt.use_map) { + if (likely(mp_opt.use_map)) { if (mp_opt.mpc_map) { /* this is an MP_CAPABLE carrying MPTCP data * we know this map the first chunk of data @@ -1149,7 +1198,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) } mpext->data_len = mp_opt.data_len; mpext->use_map = 1; - mpext->csum_reqd = mp_opt.csum_reqd; + mpext->csum_reqd = !!(mp_opt.suboptions & OPTION_MPTCP_CSUMREQD); if (mpext->csum_reqd) mpext->csum = mp_opt.csum; @@ -1174,7 +1223,7 @@ static void mptcp_set_rwin(const struct tcp_sock *tp) WRITE_ONCE(msk->rcv_wnd_sent, ack_seq); } -static u16 mptcp_make_csum(const struct mptcp_ext *mpext) +static u16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __sum16 sum) { struct csum_pseudo_header header; __wsum csum; @@ -1184,27 +1233,112 @@ static u16 mptcp_make_csum(const struct mptcp_ext *mpext) * always the 64-bit value, irrespective of what length is used in the * DSS option itself. */ - header.data_seq = cpu_to_be64(mpext->data_seq); - header.subflow_seq = htonl(mpext->subflow_seq); - header.data_len = htons(mpext->data_len); + header.data_seq = cpu_to_be64(data_seq); + header.subflow_seq = htonl(subflow_seq); + header.data_len = htons(data_len); header.csum = 0; - csum = csum_partial(&header, sizeof(header), ~csum_unfold(mpext->csum)); + csum = csum_partial(&header, sizeof(header), ~csum_unfold(sum)); return (__force u16)csum_fold(csum); } +static u16 mptcp_make_csum(const struct mptcp_ext *mpext) +{ + return __mptcp_make_csum(mpext->data_seq, mpext->subflow_seq, mpext->data_len, + mpext->csum); +} + void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, struct mptcp_out_options *opts) { - if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | - OPTION_MPTCP_MPC_ACK) & opts->suboptions) { + if (unlikely(OPTION_MPTCP_FAIL & opts->suboptions)) { + const struct sock *ssk = (const struct sock *)tp; + struct mptcp_subflow_context *subflow; + + subflow = mptcp_subflow_ctx(ssk); + subflow->send_mp_fail = 0; + + *ptr++ = mptcp_option(MPTCPOPT_MP_FAIL, + TCPOLEN_MPTCP_FAIL, + 0, 0); + put_unaligned_be64(opts->fail_seq, ptr); + ptr += 2; + } + + /* RST is mutually exclusive with everything else */ + if (unlikely(OPTION_MPTCP_RST & opts->suboptions)) { + *ptr++ = mptcp_option(MPTCPOPT_RST, + TCPOLEN_MPTCP_RST, + opts->reset_transient, + opts->reset_reason); + return; + } + + /* DSS, MPC, MPJ and ADD_ADDR are mutually exclusive, see + * mptcp_established_options*() + */ + if (likely(OPTION_MPTCP_DSS & opts->suboptions)) { + struct mptcp_ext *mpext = &opts->ext_copy; + u8 len = TCPOLEN_MPTCP_DSS_BASE; + u8 flags = 0; + + if (mpext->use_ack) { + flags = MPTCP_DSS_HAS_ACK; + if (mpext->ack64) { + len += TCPOLEN_MPTCP_DSS_ACK64; + flags |= MPTCP_DSS_ACK64; + } else { + len += TCPOLEN_MPTCP_DSS_ACK32; + } + } + + if (mpext->use_map) { + len += TCPOLEN_MPTCP_DSS_MAP64; + + /* Use only 64-bit mapping flags for now, add + * support for optional 32-bit mappings later. + */ + flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; + if (mpext->data_fin) + flags |= MPTCP_DSS_DATA_FIN; + + if (opts->csum_reqd) + len += TCPOLEN_MPTCP_DSS_CHECKSUM; + } + + *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); + + if (mpext->use_ack) { + if (mpext->ack64) { + put_unaligned_be64(mpext->data_ack, ptr); + ptr += 2; + } else { + put_unaligned_be32(mpext->data_ack32, ptr); + ptr += 1; + } + } + + if (mpext->use_map) { + put_unaligned_be64(mpext->data_seq, ptr); + ptr += 2; + put_unaligned_be32(mpext->subflow_seq, ptr); + ptr += 1; + if (opts->csum_reqd) { + put_unaligned_be32(mpext->data_len << 16 | + mptcp_make_csum(mpext), ptr); + } else { + put_unaligned_be32(mpext->data_len << 16 | + TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); + } + } + } else if (OPTIONS_MPTCP_MPC & opts->suboptions) { u8 len, flag = MPTCP_CAP_HMAC_SHA256; if (OPTION_MPTCP_MPC_SYN & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYN; } else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) { len = TCPOLEN_MPTCP_MPC_SYNACK; - } else if (opts->ext_copy.data_len) { + } else if (opts->data_len) { len = TCPOLEN_MPTCP_MPC_ACK_DATA; if (opts->csum_reqd) len += TCPOLEN_MPTCP_DSS_CHECKSUM; @@ -1233,21 +1367,45 @@ void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, put_unaligned_be64(opts->rcvr_key, ptr); ptr += 2; - if (!opts->ext_copy.data_len) + if (!opts->data_len) goto mp_capable_done; if (opts->csum_reqd) { - put_unaligned_be32(opts->ext_copy.data_len << 16 | - mptcp_make_csum(&opts->ext_copy), ptr); + put_unaligned_be32(opts->data_len << 16 | + __mptcp_make_csum(opts->data_seq, + opts->subflow_seq, + opts->data_len, + opts->csum), ptr); } else { - put_unaligned_be32(opts->ext_copy.data_len << 16 | + put_unaligned_be32(opts->data_len << 16 | TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); } ptr += 1; - } -mp_capable_done: - if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { + /* MPC is additionally mutually exclusive with MP_PRIO */ + goto mp_capable_done; + } else if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, + TCPOLEN_MPTCP_MPJ_SYN, + opts->backup, opts->join_id); + put_unaligned_be32(opts->token, ptr); + ptr += 1; + put_unaligned_be32(opts->nonce, ptr); + ptr += 1; + } else if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, + TCPOLEN_MPTCP_MPJ_SYNACK, + opts->backup, opts->join_id); + put_unaligned_be64(opts->thmac, ptr); + ptr += 2; + put_unaligned_be32(opts->nonce, ptr); + ptr += 1; + } else if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, + TCPOLEN_MPTCP_MPJ_ACK, 0, 0); + memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); + ptr += 5; + } else if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE; u8 echo = MPTCP_ADDR_ECHO; @@ -1305,6 +1463,19 @@ mp_capable_done: } } + if (OPTION_MPTCP_PRIO & opts->suboptions) { + const struct sock *ssk = (const struct sock *)tp; + struct mptcp_subflow_context *subflow; + + subflow = mptcp_subflow_ctx(ssk); + subflow->send_mp_prio = 0; + + *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO, + TCPOLEN_MPTCP_PRIO, + opts->backup, TCPOPT_NOP); + } + +mp_capable_done: if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { u8 i = 1; @@ -1325,107 +1496,6 @@ mp_capable_done: } } - if (OPTION_MPTCP_PRIO & opts->suboptions) { - const struct sock *ssk = (const struct sock *)tp; - struct mptcp_subflow_context *subflow; - - subflow = mptcp_subflow_ctx(ssk); - subflow->send_mp_prio = 0; - - *ptr++ = mptcp_option(MPTCPOPT_MP_PRIO, - TCPOLEN_MPTCP_PRIO, - opts->backup, TCPOPT_NOP); - } - - if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { - *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, - TCPOLEN_MPTCP_MPJ_SYN, - opts->backup, opts->join_id); - put_unaligned_be32(opts->token, ptr); - ptr += 1; - put_unaligned_be32(opts->nonce, ptr); - ptr += 1; - } - - if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { - *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, - TCPOLEN_MPTCP_MPJ_SYNACK, - opts->backup, opts->join_id); - put_unaligned_be64(opts->thmac, ptr); - ptr += 2; - put_unaligned_be32(opts->nonce, ptr); - ptr += 1; - } - - if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) { - *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, - TCPOLEN_MPTCP_MPJ_ACK, 0, 0); - memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); - ptr += 5; - } - - if (OPTION_MPTCP_RST & opts->suboptions) - *ptr++ = mptcp_option(MPTCPOPT_RST, - TCPOLEN_MPTCP_RST, - opts->reset_transient, - opts->reset_reason); - - if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { - struct mptcp_ext *mpext = &opts->ext_copy; - u8 len = TCPOLEN_MPTCP_DSS_BASE; - u8 flags = 0; - - if (mpext->use_ack) { - flags = MPTCP_DSS_HAS_ACK; - if (mpext->ack64) { - len += TCPOLEN_MPTCP_DSS_ACK64; - flags |= MPTCP_DSS_ACK64; - } else { - len += TCPOLEN_MPTCP_DSS_ACK32; - } - } - - if (mpext->use_map) { - len += TCPOLEN_MPTCP_DSS_MAP64; - - /* Use only 64-bit mapping flags for now, add - * support for optional 32-bit mappings later. - */ - flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; - if (mpext->data_fin) - flags |= MPTCP_DSS_DATA_FIN; - - if (opts->csum_reqd) - len += TCPOLEN_MPTCP_DSS_CHECKSUM; - } - - *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); - - if (mpext->use_ack) { - if (mpext->ack64) { - put_unaligned_be64(mpext->data_ack, ptr); - ptr += 2; - } else { - put_unaligned_be32(mpext->data_ack32, ptr); - ptr += 1; - } - } - - if (mpext->use_map) { - put_unaligned_be64(mpext->data_seq, ptr); - ptr += 2; - put_unaligned_be32(mpext->subflow_seq, ptr); - ptr += 1; - if (opts->csum_reqd) { - put_unaligned_be32(mpext->data_len << 16 | - mptcp_make_csum(mpext), ptr); - } else { - put_unaligned_be32(mpext->data_len << 16 | - TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); - } - } - } - if (tp) mptcp_set_rwin(tp); } diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 639271e09604..6ab386ff3294 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -10,6 +10,8 @@ #include <net/mptcp.h> #include "protocol.h" +#include "mib.h" + /* path manager command handlers */ int mptcp_pm_announce_addr(struct mptcp_sock *msk, @@ -18,23 +20,23 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk, { u8 add_addr = READ_ONCE(msk->pm.addr_signal); - pr_debug("msk=%p, local_id=%d", msk, addr->id); + pr_debug("msk=%p, local_id=%d, echo=%d", msk, addr->id, echo); lockdep_assert_held(&msk->pm.lock); - if (add_addr) { - pr_warn("addr_signal error, add_addr=%d", add_addr); + if (add_addr & + (echo ? BIT(MPTCP_ADD_ADDR_ECHO) : BIT(MPTCP_ADD_ADDR_SIGNAL))) { + pr_warn("addr_signal error, add_addr=%d, echo=%d", add_addr, echo); return -EINVAL; } - msk->pm.local = *addr; - add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL); - if (echo) + if (echo) { + msk->pm.remote = *addr; add_addr |= BIT(MPTCP_ADD_ADDR_ECHO); - if (addr->family == AF_INET6) - add_addr |= BIT(MPTCP_ADD_ADDR_IPV6); - if (addr->port) - add_addr |= BIT(MPTCP_ADD_ADDR_PORT); + } else { + msk->pm.local = *addr; + add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL); + } WRITE_ONCE(msk->pm.addr_signal, add_addr); return 0; } @@ -247,12 +249,21 @@ void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup) mptcp_event(MPTCP_EVENT_SUB_PRIORITY, mptcp_sk(subflow->conn), sk, GFP_ATOMIC); } +void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq) +{ + pr_debug("fail_seq=%llu", fail_seq); +} + /* path manager helpers */ -bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, - struct mptcp_addr_info *saddr, bool *echo, bool *port) +bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, struct sk_buff *skb, + unsigned int opt_size, unsigned int remaining, + struct mptcp_addr_info *addr, bool *echo, + bool *port, bool *drop_other_suboptions) { int ret = false; + u8 add_addr; + u8 family; spin_lock_bh(&msk->pm.lock); @@ -260,14 +271,30 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, if (!mptcp_pm_should_add_signal(msk)) goto out_unlock; + /* always drop every other options for pure ack ADD_ADDR; this is a + * plain dup-ack from TCP perspective. The other MPTCP-relevant info, + * if any, will be carried by the 'original' TCP ack + */ + if (skb && skb_is_tcp_pure_ack(skb)) { + remaining += opt_size; + *drop_other_suboptions = true; + } + *echo = mptcp_pm_should_add_signal_echo(msk); - *port = mptcp_pm_should_add_signal_port(msk); + *port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port); - if (remaining < mptcp_add_addr_len(msk->pm.local.family, *echo, *port)) + family = *echo ? msk->pm.remote.family : msk->pm.local.family; + if (remaining < mptcp_add_addr_len(family, *echo, *port)) goto out_unlock; - *saddr = msk->pm.local; - WRITE_ONCE(msk->pm.addr_signal, 0); + if (*echo) { + *addr = msk->pm.remote; + add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_ECHO); + } else { + *addr = msk->pm.local; + add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_SIGNAL); + } + WRITE_ONCE(msk->pm.addr_signal, add_addr); ret = true; out_unlock: @@ -279,6 +306,7 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list) { int ret = false, len; + u8 rm_addr; spin_lock_bh(&msk->pm.lock); @@ -286,16 +314,17 @@ bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, if (!mptcp_pm_should_rm_signal(msk)) goto out_unlock; + rm_addr = msk->pm.addr_signal & ~BIT(MPTCP_RM_ADDR_SIGNAL); len = mptcp_rm_addr_len(&msk->pm.rm_list_tx); if (len < 0) { - WRITE_ONCE(msk->pm.addr_signal, 0); + WRITE_ONCE(msk->pm.addr_signal, rm_addr); goto out_unlock; } if (remaining < len) goto out_unlock; *rm_list = msk->pm.rm_list_tx; - WRITE_ONCE(msk->pm.addr_signal, 0); + WRITE_ONCE(msk->pm.addr_signal, rm_addr); ret = true; out_unlock: @@ -308,6 +337,25 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) return mptcp_pm_nl_get_local_id(msk, skc); } +void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + u32 rcv_tstamp = READ_ONCE(tcp_sk(ssk)->rcv_tstamp); + + /* keep track of rtx periods with no progress */ + if (!subflow->stale_count) { + subflow->stale_rcv_tstamp = rcv_tstamp; + subflow->stale_count++; + } else if (subflow->stale_rcv_tstamp == rcv_tstamp) { + if (subflow->stale_count < U8_MAX) + subflow->stale_count++; + mptcp_pm_nl_subflow_chk_stale(msk, ssk); + } else { + subflow->stale_count = 0; + mptcp_subflow_set_active(subflow); + } +} + void mptcp_pm_data_init(struct mptcp_sock *msk) { msk->pm.add_addr_signaled = 0; diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 7b3794459783..7b96be1e9f14 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -46,6 +46,7 @@ struct pm_nl_pernet { spinlock_t lock; struct list_head local_addr_list; unsigned int addrs; + unsigned int stale_loss_cnt; unsigned int add_addr_signal_max; unsigned int add_addr_accept_max; unsigned int local_addr_max; @@ -316,14 +317,14 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (!entry->addr.id) return; - if (mptcp_pm_should_add_signal(msk)) { + if (mptcp_pm_should_add_signal_addr(msk)) { sk_reset_timer(sk, timer, jiffies + TCP_RTO_MAX / 8); goto out; } spin_lock_bh(&msk->pm.lock); - if (!mptcp_pm_should_add_signal(msk)) { + if (!mptcp_pm_should_add_signal_addr(msk)) { pr_debug("retransmit ADD_ADDR id=%d", entry->addr.id); mptcp_pm_announce_addr(msk, &entry->addr, false); mptcp_pm_add_addr_send_ack(msk); @@ -409,6 +410,55 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk) } } +static bool lookup_address_in_vec(struct mptcp_addr_info *addrs, unsigned int nr, + struct mptcp_addr_info *addr) +{ + int i; + + for (i = 0; i < nr; i++) { + if (addresses_equal(&addrs[i], addr, addr->port)) + return true; + } + + return false; +} + +/* Fill all the remote addresses into the array addrs[], + * and return the array size. + */ +static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullmesh, + struct mptcp_addr_info *addrs) +{ + struct sock *sk = (struct sock *)msk, *ssk; + struct mptcp_subflow_context *subflow; + struct mptcp_addr_info remote = { 0 }; + unsigned int subflows_max; + int i = 0; + + subflows_max = mptcp_pm_get_subflows_max(msk); + + /* Non-fullmesh endpoint, fill in the single entry + * corresponding to the primary MPC subflow remote address + */ + if (!fullmesh) { + remote_address((struct sock_common *)sk, &remote); + msk->pm.subflows++; + addrs[i++] = remote; + } else { + mptcp_for_each_subflow(msk, subflow) { + ssk = mptcp_subflow_tcp_sock(subflow); + remote_address((struct sock_common *)ssk, &remote); + if (!lookup_address_in_vec(addrs, i, &remote) && + msk->pm.subflows < subflows_max) { + msk->pm.subflows++; + addrs[i++] = remote; + } + } + } + + return i; +} + static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; @@ -454,15 +504,16 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) !READ_ONCE(msk->pm.remote_deny_join_id0)) { local = select_local_address(pernet, msk); if (local) { - struct mptcp_addr_info remote = { 0 }; + bool fullmesh = !!(local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH); + struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; + int i, nr; msk->pm.local_addr_used++; - msk->pm.subflows++; check_work_pending(msk); - remote_address((struct sock_common *)sk, &remote); + nr = fill_remote_addresses_vec(msk, fullmesh, addrs); spin_unlock_bh(&msk->pm.lock); - __mptcp_subflow_connect(sk, &local->addr, &remote, - local->flags, local->ifindex); + for (i = 0; i < nr; i++) + __mptcp_subflow_connect(sk, &local->addr, &addrs[i]); spin_lock_bh(&msk->pm.lock); return; } @@ -483,13 +534,67 @@ static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) mptcp_pm_create_subflow_or_signal_addr(msk); } +/* Fill all the local addresses into the array addrs[], + * and return the array size. + */ +static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, + struct mptcp_addr_info *addrs) +{ + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *entry; + struct mptcp_addr_info local; + struct pm_nl_pernet *pernet; + unsigned int subflows_max; + int i = 0; + + pernet = net_generic(sock_net(sk), pm_nl_pernet_id); + subflows_max = mptcp_pm_get_subflows_max(msk); + + rcu_read_lock(); + __mptcp_flush_join_list(msk); + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) + continue; + + if (entry->addr.family != sk->sk_family) { +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if ((entry->addr.family == AF_INET && + !ipv6_addr_v4mapped(&sk->sk_v6_daddr)) || + (sk->sk_family == AF_INET && + !ipv6_addr_v4mapped(&entry->addr.addr6))) +#endif + continue; + } + + if (msk->pm.subflows < subflows_max) { + msk->pm.subflows++; + addrs[i++] = entry->addr; + } + } + rcu_read_unlock(); + + /* If the array is empty, fill in the single + * 'IPADDRANY' local address + */ + if (!i) { + memset(&local, 0, sizeof(local)); + local.family = msk->pm.remote.family; + + msk->pm.subflows++; + addrs[i++] = local; + } + + return i; +} + static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) { + struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; struct sock *sk = (struct sock *)msk; unsigned int add_addr_accept_max; struct mptcp_addr_info remote; - struct mptcp_addr_info local; unsigned int subflows_max; + int i, nr; add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); subflows_max = mptcp_pm_get_subflows_max(msk); @@ -501,23 +606,22 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) if (lookup_subflow_by_daddr(&msk->conn_list, &msk->pm.remote)) goto add_addr_echo; - msk->pm.add_addr_accepted++; - msk->pm.subflows++; - if (msk->pm.add_addr_accepted >= add_addr_accept_max || - msk->pm.subflows >= subflows_max) - WRITE_ONCE(msk->pm.accept_addr, false); - /* connect to the specified remote address, using whatever * local address the routing configuration will pick. */ remote = msk->pm.remote; if (!remote.port) remote.port = sk->sk_dport; - memset(&local, 0, sizeof(local)); - local.family = remote.family; + nr = fill_local_addresses_vec(msk, addrs); + + msk->pm.add_addr_accepted++; + if (msk->pm.add_addr_accepted >= add_addr_accept_max || + msk->pm.subflows >= subflows_max) + WRITE_ONCE(msk->pm.accept_addr, false); spin_unlock_bh(&msk->pm.lock); - __mptcp_subflow_connect(sk, &local, &remote, 0, 0); + for (i = 0; i < nr; i++) + __mptcp_subflow_connect(sk, &addrs[i], &remote); spin_lock_bh(&msk->pm.lock); add_addr_echo: @@ -540,24 +644,19 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node); if (subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - bool slow; spin_unlock_bh(&msk->pm.lock); - pr_debug("send ack for %s%s%s", - mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr", - mptcp_pm_should_add_signal_ipv6(msk) ? " [ipv6]" : "", - mptcp_pm_should_add_signal_port(msk) ? " [port]" : ""); - - slow = lock_sock_fast(ssk); - tcp_send_ack(ssk); - unlock_sock_fast(ssk, slow); + pr_debug("send ack for %s", + mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr"); + + mptcp_subflow_send_ack(ssk); spin_lock_bh(&msk->pm.lock); } } -int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, - u8 bkup) +static int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + u8 bkup) { struct mptcp_subflow_context *subflow; @@ -567,7 +666,6 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct sock *sk = (struct sock *)msk; struct mptcp_addr_info local; - bool slow; local_address((struct sock_common *)ssk, &local); if (!addresses_equal(&local, addr, addr->port)) @@ -580,9 +678,7 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, spin_unlock_bh(&msk->pm.lock); pr_debug("send ack for mp_prio"); - slow = lock_sock_fast(ssk); - tcp_send_ack(ssk); - unlock_sock_fast(ssk, slow); + mptcp_subflow_send_ack(ssk); spin_lock_bh(&msk->pm.lock); return 0; @@ -899,6 +995,43 @@ static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = { [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, }, }; +void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) +{ + struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = (struct sock *)msk; + unsigned int active_max_loss_cnt; + struct net *net = sock_net(sk); + unsigned int stale_loss_cnt; + bool slow; + + stale_loss_cnt = mptcp_stale_loss_cnt(net); + if (subflow->stale || !stale_loss_cnt || subflow->stale_count <= stale_loss_cnt) + return; + + /* look for another available subflow not in loss state */ + active_max_loss_cnt = max_t(int, stale_loss_cnt - 1, 1); + mptcp_for_each_subflow(msk, iter) { + if (iter != subflow && mptcp_subflow_active(iter) && + iter->stale_count < active_max_loss_cnt) { + /* we have some alternatives, try to mark this subflow as idle ...*/ + slow = lock_sock_fast(ssk); + if (!tcp_rtx_and_write_queues_empty(ssk)) { + subflow->stale = 1; + __mptcp_retransmit_pending_data(sk); + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_SUBFLOWSTALE); + } + unlock_sock_fast(ssk, slow); + + /* always try to push the pending data regarless of re-injections: + * we can possibly use backup subflows now, and subflow selection + * is cheap under the msk socket lock + */ + __mptcp_push_pending(sk, 0); + return; + } + } +} + static int mptcp_pm_family_to_addr(int family) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -1067,6 +1200,27 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) return NULL; } +int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id, + u8 *flags, int *ifindex) +{ + struct mptcp_pm_addr_entry *entry; + + *flags = 0; + *ifindex = 0; + + if (id) { + rcu_read_lock(); + entry = __lookup_addr_by_id(net_generic(net, pm_nl_pernet_id), id); + if (entry) { + *flags = entry->flags; + *ifindex = entry->ifindex; + } + rcu_read_unlock(); + } + + return 0; +} + static bool remove_anno_list_by_saddr(struct mptcp_sock *msk, struct mptcp_addr_info *addr) { @@ -1564,9 +1718,7 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) list_for_each_entry(entry, &pernet->local_addr_list, list) { if (addresses_equal(&entry->addr, &addr.addr, true)) { - ret = mptcp_nl_addr_backup(net, &entry->addr, bkup); - if (ret) - return ret; + mptcp_nl_addr_backup(net, &entry->addr, bkup); if (bkup) entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; @@ -1900,7 +2052,11 @@ static int __net_init pm_nl_init_net(struct net *net) struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); INIT_LIST_HEAD_RCU(&pernet->local_addr_list); + + /* Cit. 2 subflows ought to be enough for anybody. */ + pernet->subflows_max = 2; pernet->next_id = 1; + pernet->stale_loss_cnt = 4; spin_lock_init(&pernet->lock); /* No need to initialize other pernet fields, the struct is zeroed at diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index a88924947815..b7e32e316738 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -126,6 +126,11 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); } +static void mptcp_rmem_charge(struct sock *sk, int size) +{ + mptcp_sk(sk)->rmem_fwd_alloc -= size; +} + static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from) { @@ -142,7 +147,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; kfree_skb_partial(from, fragstolen); atomic_add(delta, &sk->sk_rmem_alloc); - sk_mem_charge(sk, delta); + mptcp_rmem_charge(sk, delta); return true; } @@ -155,6 +160,44 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, return mptcp_try_coalesce((struct sock *)msk, to, from); } +static void __mptcp_rmem_reclaim(struct sock *sk, int amount) +{ + amount >>= SK_MEM_QUANTUM_SHIFT; + mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT; + __sk_mem_reduce_allocated(sk, amount); +} + +static void mptcp_rmem_uncharge(struct sock *sk, int size) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + int reclaimable; + + msk->rmem_fwd_alloc += size; + reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk); + + /* see sk_mem_uncharge() for the rationale behind the following schema */ + if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD)) + __mptcp_rmem_reclaim(sk, SK_RECLAIM_CHUNK); +} + +static void mptcp_rfree(struct sk_buff *skb) +{ + unsigned int len = skb->truesize; + struct sock *sk = skb->sk; + + atomic_sub(len, &sk->sk_rmem_alloc); + mptcp_rmem_uncharge(sk, len); +} + +static void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk) +{ + skb_orphan(skb); + skb->sk = sk; + skb->destructor = mptcp_rfree; + atomic_add(skb->truesize, &sk->sk_rmem_alloc); + mptcp_rmem_charge(sk, skb->truesize); +} + /* "inspired" by tcp_data_queue_ofo(), main differences: * - use mptcp seqs * - don't cope with sacks @@ -267,7 +310,29 @@ merge_right: end: skb_condense(skb); - skb_set_owner_r(skb, sk); + mptcp_set_owner_r(skb, sk); +} + +static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + int amt, amount; + + if (size < msk->rmem_fwd_alloc) + return true; + + amt = sk_mem_pages(size); + amount = amt << SK_MEM_QUANTUM_SHIFT; + msk->rmem_fwd_alloc += amount; + if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) { + if (ssk->sk_forward_alloc < amount) { + msk->rmem_fwd_alloc -= amount; + return false; + } + + ssk->sk_forward_alloc -= amount; + } + return true; } static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, @@ -285,15 +350,8 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, skb_orphan(skb); /* try to fetch required memory from subflow */ - if (!sk_rmem_schedule(sk, skb, skb->truesize)) { - int amount = sk_mem_pages(skb->truesize) << SK_MEM_QUANTUM_SHIFT; - - if (ssk->sk_forward_alloc < amount) - goto drop; - - ssk->sk_forward_alloc -= amount; - sk->sk_forward_alloc += amount; - } + if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) + goto drop; has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; @@ -313,7 +371,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, if (tail && mptcp_try_coalesce(sk, tail, skb)) return true; - skb_set_owner_r(skb, sk); + mptcp_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); return true; } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { @@ -411,35 +469,51 @@ static void mptcp_set_datafin_timeout(const struct sock *sk) TCP_RTO_MIN << icsk->icsk_retransmits); } -static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) +static void __mptcp_set_timeout(struct sock *sk, long tout) { - long tout = ssk && inet_csk(ssk)->icsk_pending ? - inet_csk(ssk)->icsk_timeout - jiffies : 0; - - if (tout <= 0) - tout = mptcp_sk(sk)->timer_ival; mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN; } +static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subflow) +{ + const struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + return inet_csk(ssk)->icsk_pending && !subflow->stale_count ? + inet_csk(ssk)->icsk_timeout - jiffies : 0; +} + +static void mptcp_set_timeout(struct sock *sk) +{ + struct mptcp_subflow_context *subflow; + long tout = 0; + + mptcp_for_each_subflow(mptcp_sk(sk), subflow) + tout = max(tout, mptcp_timeout_from_subflow(subflow)); + __mptcp_set_timeout(sk, tout); +} + static bool tcp_can_send_ack(const struct sock *ssk) { return !((1 << inet_sk_state_load(ssk)) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); } +void mptcp_subflow_send_ack(struct sock *ssk) +{ + bool slow; + + slow = lock_sock_fast(ssk); + if (tcp_can_send_ack(ssk)) + tcp_send_ack(ssk); + unlock_sock_fast(ssk, slow); +} + static void mptcp_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - bool slow; - - slow = lock_sock_fast(ssk); - if (tcp_can_send_ack(ssk)) - tcp_send_ack(ssk); - unlock_sock_fast(ssk, slow); - } + mptcp_for_each_subflow(msk, subflow) + mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow)); } static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) @@ -512,7 +586,6 @@ static bool mptcp_check_data_fin(struct sock *sk) sk->sk_shutdown |= RCV_SHUTDOWN; smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - set_bit(MPTCP_DATA_READY, &msk->flags); switch (sk->sk_state) { case TCP_ESTABLISHED: @@ -531,7 +604,6 @@ static bool mptcp_check_data_fin(struct sock *sk) } ret = true; - mptcp_set_timeout(sk, NULL); mptcp_send_ack(msk); mptcp_close_wake_up(sk); } @@ -727,10 +799,9 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) /* Wake-up the reader only for in-sequence data */ mptcp_data_lock(sk); - if (move_skbs_to_msk(msk, ssk)) { - set_bit(MPTCP_DATA_READY, &msk->flags); + if (move_skbs_to_msk(msk, ssk)) sk->sk_data_ready(sk); - } + mptcp_data_unlock(sk); } @@ -791,10 +862,7 @@ static void mptcp_reset_timer(struct sock *sk) if (unlikely(inet_sk_state_load(sk) == TCP_CLOSE)) return; - /* should never be called with mptcp level timer cleared */ - tout = READ_ONCE(mptcp_sk(sk)->timer_ival); - if (WARN_ON_ONCE(!tout)) - tout = TCP_RTO_MIN; + tout = mptcp_sk(sk)->timer_ival; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); } @@ -835,7 +903,6 @@ static void mptcp_check_for_eof(struct mptcp_sock *msk) sk->sk_shutdown |= RCV_SHUTDOWN; smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - set_bit(MPTCP_DATA_READY, &msk->flags); sk->sk_data_ready(sk); } @@ -899,117 +966,20 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, df->data_seq + df->data_len == msk->write_seq; } -static int mptcp_wmem_with_overhead(int size) -{ - return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT); -} - -static void __mptcp_wmem_reserve(struct sock *sk, int size) -{ - int amount = mptcp_wmem_with_overhead(size); - struct mptcp_sock *msk = mptcp_sk(sk); - - WARN_ON_ONCE(msk->wmem_reserved); - if (WARN_ON_ONCE(amount < 0)) - amount = 0; - - if (amount <= sk->sk_forward_alloc) - goto reserve; - - /* under memory pressure try to reserve at most a single page - * otherwise try to reserve the full estimate and fallback - * to a single page before entering the error path - */ - if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) || - !sk_wmem_schedule(sk, amount)) { - if (amount <= PAGE_SIZE) - goto nomem; - - amount = PAGE_SIZE; - if (!sk_wmem_schedule(sk, amount)) - goto nomem; - } - -reserve: - msk->wmem_reserved = amount; - sk->sk_forward_alloc -= amount; - return; - -nomem: - /* we will wait for memory on next allocation */ - msk->wmem_reserved = -1; -} - -static void __mptcp_update_wmem(struct sock *sk) +static void __mptcp_mem_reclaim_partial(struct sock *sk) { - struct mptcp_sock *msk = mptcp_sk(sk); - -#ifdef CONFIG_LOCKDEP - WARN_ON_ONCE(!lockdep_is_held(&sk->sk_lock.slock)); -#endif - - if (!msk->wmem_reserved) - return; - - if (msk->wmem_reserved < 0) - msk->wmem_reserved = 0; - if (msk->wmem_reserved > 0) { - sk->sk_forward_alloc += msk->wmem_reserved; - msk->wmem_reserved = 0; - } -} - -static bool mptcp_wmem_alloc(struct sock *sk, int size) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - - /* check for pre-existing error condition */ - if (msk->wmem_reserved < 0) - return false; - - if (msk->wmem_reserved >= size) - goto account; - - mptcp_data_lock(sk); - if (!sk_wmem_schedule(sk, size)) { - mptcp_data_unlock(sk); - return false; - } - - sk->sk_forward_alloc -= size; - msk->wmem_reserved += size; - mptcp_data_unlock(sk); - -account: - msk->wmem_reserved -= size; - return true; -} + int reclaimable = mptcp_sk(sk)->rmem_fwd_alloc - sk_unused_reserved_mem(sk); -static void mptcp_wmem_uncharge(struct sock *sk, int size) -{ - struct mptcp_sock *msk = mptcp_sk(sk); + lockdep_assert_held_once(&sk->sk_lock.slock); - if (msk->wmem_reserved < 0) - msk->wmem_reserved = 0; - msk->wmem_reserved += size; + __mptcp_rmem_reclaim(sk, reclaimable - 1); + sk_mem_reclaim_partial(sk); } static void mptcp_mem_reclaim_partial(struct sock *sk) { - struct mptcp_sock *msk = mptcp_sk(sk); - - /* if we are experiencing a transint allocation error, - * the forward allocation memory has been already - * released - */ - if (msk->wmem_reserved < 0) - return; - mptcp_data_lock(sk); - sk->sk_forward_alloc += msk->wmem_reserved; - sk_mem_reclaim_partial(sk); - msk->wmem_reserved = sk->sk_forward_alloc; - sk->sk_forward_alloc = 0; + __mptcp_mem_reclaim_partial(sk); mptcp_data_unlock(sk); } @@ -1046,8 +1016,14 @@ static void __mptcp_clean_una(struct sock *sk) if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) break; - if (WARN_ON_ONCE(dfrag == msk->first_pending)) - break; + if (unlikely(dfrag == msk->first_pending)) { + /* in recovery mode can see ack after the current snd head */ + if (WARN_ON_ONCE(!msk->recovery)) + break; + + WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + } + dfrag_clear(sk, dfrag); cleaned = true; } @@ -1056,8 +1032,14 @@ static void __mptcp_clean_una(struct sock *sk) if (dfrag && after64(snd_una, dfrag->data_seq)) { u64 delta = snd_una - dfrag->data_seq; - if (WARN_ON_ONCE(delta > dfrag->already_sent)) - goto out; + /* prevent wrap around in recovery mode */ + if (unlikely(delta > dfrag->already_sent)) { + if (WARN_ON_ONCE(!msk->recovery)) + goto out; + if (WARN_ON_ONCE(delta > dfrag->data_len)) + goto out; + dfrag->already_sent += delta - dfrag->already_sent; + } dfrag->data_seq += delta; dfrag->offset += delta; @@ -1068,16 +1050,17 @@ static void __mptcp_clean_una(struct sock *sk) cleaned = true; } + /* all retransmitted data acked, recovery completed */ + if (unlikely(msk->recovery) && after64(msk->snd_una, msk->recovery_snd_nxt)) + msk->recovery = false; + out: - if (cleaned) { - if (tcp_under_memory_pressure(sk)) { - __mptcp_update_wmem(sk); - sk_mem_reclaim_partial(sk); - } - } + if (cleaned && tcp_under_memory_pressure(sk)) + __mptcp_mem_reclaim_partial(sk); - if (snd_una == READ_ONCE(msk->snd_nxt)) { - if (msk->timer_ival && !mptcp_data_fin_enabled(msk)) + if (snd_una == READ_ONCE(msk->snd_nxt) && + snd_una == READ_ONCE(msk->write_seq)) { + if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) mptcp_stop_timer(sk); } else { mptcp_reset_timer(sk); @@ -1086,9 +1069,8 @@ out: static void __mptcp_clean_una_wakeup(struct sock *sk) { -#ifdef CONFIG_LOCKDEP - WARN_ON_ONCE(!lockdep_is_held(&sk->sk_lock.slock)); -#endif + lockdep_assert_held_once(&sk->sk_lock.slock); + __mptcp_clean_una(sk); mptcp_write_space(sk); } @@ -1154,6 +1136,7 @@ struct mptcp_sendmsg_info { u16 limit; u16 sent; unsigned int flags; + bool data_lock_held; }; static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq, @@ -1191,7 +1174,8 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) if (likely(skb)) { if (likely(__mptcp_add_ext(skb, gfp))) { skb_reserve(skb, MAX_TCP_HEADER); - skb->reserved_tailroom = skb->end - skb->tail; + skb->ip_summed = CHECKSUM_PARTIAL; + INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); @@ -1201,41 +1185,33 @@ static struct sk_buff *__mptcp_do_alloc_tx_skb(struct sock *sk, gfp_t gfp) return NULL; } -static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) +static struct sk_buff *__mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) { struct sk_buff *skb; - if (ssk->sk_tx_skb_cache) { - skb = ssk->sk_tx_skb_cache; - if (unlikely(!skb_ext_find(skb, SKB_EXT_MPTCP) && - !__mptcp_add_ext(skb, gfp))) - return false; - return true; - } - skb = __mptcp_do_alloc_tx_skb(sk, gfp); if (!skb) - return false; + return NULL; if (likely(sk_wmem_schedule(ssk, skb->truesize))) { - ssk->sk_tx_skb_cache = skb; - return true; + tcp_skb_entail(ssk, skb); + return skb; } kfree_skb(skb); - return false; + return NULL; } -static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk) +static struct sk_buff *mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held) { - return !ssk->sk_tx_skb_cache && - tcp_under_memory_pressure(sk); -} + gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation; -static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk) -{ - if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) - mptcp_mem_reclaim_partial(sk); - return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation); + if (unlikely(tcp_under_memory_pressure(sk))) { + if (data_lock_held) + __mptcp_mem_reclaim_partial(sk); + else + mptcp_mem_reclaim_partial(sk); + } + return __mptcp_alloc_tx_skb(sk, ssk, gfp); } /* note: this always recompute the csum on the whole skb, even @@ -1255,23 +1231,29 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, struct mptcp_sendmsg_info *info) { u64 data_seq = dfrag->data_seq + info->sent; + int offset = dfrag->offset + info->sent; struct mptcp_sock *msk = mptcp_sk(sk); bool zero_window_probe = false; struct mptcp_ext *mpext = NULL; - struct sk_buff *skb, *tail; - bool can_collapse = false; - int size_bias = 0; - int avail_size; - size_t ret = 0; + bool can_coalesce = false; + bool reuse_skb = true; + struct sk_buff *skb; + size_t copy; + int i; pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u", msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent); + if (WARN_ON_ONCE(info->sent > info->limit || + info->limit > dfrag->data_len)) + return 0; + /* compute send limit */ info->mss_now = tcp_send_mss(ssk, &info->size_goal, info->flags); - avail_size = info->size_goal; + copy = info->size_goal; + skb = tcp_write_queue_tail(ssk); - if (skb) { + if (skb && copy > skb->len) { /* Limit the write to the size available in the * current skb, if any, so that we create at most a new skb. * Explicitly tells TCP internals to avoid collapsing on later @@ -1279,60 +1261,84 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, * SSN association set here */ mpext = skb_ext_find(skb, SKB_EXT_MPTCP); - can_collapse = (info->size_goal - skb->len > 0) && - mptcp_skb_can_collapse_to(data_seq, skb, mpext); - if (!can_collapse) { + if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) { TCP_SKB_CB(skb)->eor = 1; - } else { - size_bias = skb->len; - avail_size = info->size_goal - skb->len; + goto alloc_skb; } + + i = skb_shinfo(skb)->nr_frags; + can_coalesce = skb_can_coalesce(skb, i, dfrag->page, offset); + if (!can_coalesce && i >= sysctl_max_skb_frags) { + tcp_mark_push(tcp_sk(ssk), skb); + goto alloc_skb; + } + + copy -= skb->len; + } else { +alloc_skb: + skb = mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held); + if (!skb) + return -ENOMEM; + + i = skb_shinfo(skb)->nr_frags; + reuse_skb = false; + mpext = skb_ext_find(skb, SKB_EXT_MPTCP); } /* Zero window and all data acked? Probe. */ - avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size); - if (avail_size == 0) { + copy = mptcp_check_allowed_size(msk, data_seq, copy); + if (copy == 0) { u64 snd_una = READ_ONCE(msk->snd_una); - if (skb || snd_una != msk->snd_nxt) + if (snd_una != msk->snd_nxt) { + tcp_remove_empty_skb(ssk); return 0; + } + zero_window_probe = true; data_seq = snd_una - 1; - avail_size = 1; - } + copy = 1; - if (WARN_ON_ONCE(info->sent > info->limit || - info->limit > dfrag->data_len)) - return 0; + /* all mptcp-level data is acked, no skbs should be present into the + * ssk write queue + */ + WARN_ON_ONCE(reuse_skb); + } - ret = info->limit - info->sent; - tail = tcp_build_frag(ssk, avail_size + size_bias, info->flags, - dfrag->page, dfrag->offset + info->sent, &ret); - if (!tail) { - tcp_remove_empty_skb(sk, tcp_write_queue_tail(ssk)); + copy = min_t(size_t, copy, info->limit - info->sent); + if (!sk_wmem_schedule(ssk, copy)) { + tcp_remove_empty_skb(ssk); return -ENOMEM; } - /* if the tail skb is still the cached one, collapsing really happened. - */ - if (skb == tail) { - TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH; - mpext->data_len += ret; - WARN_ON_ONCE(!can_collapse); + if (can_coalesce) { + skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); + } else { + get_page(dfrag->page); + skb_fill_page_desc(skb, i, dfrag->page, offset, copy); + } + + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; + sk_wmem_queued_add(ssk, copy); + sk_mem_charge(ssk, copy); + WRITE_ONCE(tcp_sk(ssk)->write_seq, tcp_sk(ssk)->write_seq + copy); + TCP_SKB_CB(skb)->end_seq += copy; + tcp_skb_pcount_set(skb, 0); + + /* on skb reuse we just need to update the DSS len */ + if (reuse_skb) { + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; + mpext->data_len += copy; WARN_ON_ONCE(zero_window_probe); goto out; } - mpext = skb_ext_find(tail, SKB_EXT_MPTCP); - if (WARN_ON_ONCE(!mpext)) { - /* should never reach here, stream corrupted */ - return -EINVAL; - } - memset(mpext, 0, sizeof(*mpext)); mpext->data_seq = data_seq; mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; - mpext->data_len = ret; + mpext->data_len = copy; mpext->use_map = 1; mpext->dsn64 = 1; @@ -1341,18 +1347,18 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext->dsn64); if (zero_window_probe) { - mptcp_subflow_ctx(ssk)->rel_write_seq += ret; + mptcp_subflow_ctx(ssk)->rel_write_seq += copy; mpext->frozen = 1; if (READ_ONCE(msk->csum_enabled)) - mptcp_update_data_checksum(tail, ret); + mptcp_update_data_checksum(skb, copy); tcp_push_pending_frames(ssk); return 0; } out: if (READ_ONCE(msk->csum_enabled)) - mptcp_update_data_checksum(tail, ret); - mptcp_subflow_ctx(ssk)->rel_write_seq += ret; - return ret; + mptcp_update_data_checksum(skb, copy); + mptcp_subflow_ctx(ssk)->rel_write_seq += copy; + return copy; } #define MPTCP_SEND_BURST_SIZE ((1 << 16) - \ @@ -1366,16 +1372,44 @@ struct subflow_send_info { u64 ratio; }; +void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow) +{ + if (!subflow->stale) + return; + + subflow->stale = 0; + MPTCP_INC_STATS(sock_net(mptcp_subflow_tcp_sock(subflow)), MPTCP_MIB_SUBFLOWRECOVER); +} + +bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) +{ + if (unlikely(subflow->stale)) { + u32 rcv_tstamp = READ_ONCE(tcp_sk(mptcp_subflow_tcp_sock(subflow))->rcv_tstamp); + + if (subflow->stale_rcv_tstamp == rcv_tstamp) + return false; + + mptcp_subflow_set_active(subflow); + } + return __mptcp_subflow_active(subflow); +} + +/* implement the mptcp packet scheduler; + * returns the subflow that will transmit the next DSS + * additionally updates the rtx timeout + */ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct subflow_send_info send_info[2]; struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; int i, nr_active = 0; struct sock *ssk; + long tout = 0; u64 ratio; u32 pace; - sock_owned_by_me((struct sock *)msk); + sock_owned_by_me(sk); if (__mptcp_check_fallback(msk)) { if (!msk->first) @@ -1386,8 +1420,10 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) /* re-use last subflow, if the burst allow that */ if (msk->last_snd && msk->snd_burst > 0 && sk_stream_memory_free(msk->last_snd) && - mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) + mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) { + mptcp_set_timeout(sk); return msk->last_snd; + } /* pick the subflow with the lower wmem/wspace ratio */ for (i = 0; i < 2; ++i) { @@ -1400,6 +1436,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) if (!mptcp_subflow_active(subflow)) continue; + tout = max(tout, mptcp_timeout_from_subflow(subflow)); nr_active += !subflow->backup; if (!sk_stream_memory_free(subflow->tcp_sock) || !tcp_sk(ssk)->snd_wnd) continue; @@ -1415,6 +1452,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) send_info[subflow->backup].ratio = ratio; } } + __mptcp_set_timeout(sk, tout); /* pick the best backup if no other subflow is active */ if (!nr_active) @@ -1430,15 +1468,45 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) return NULL; } -static void mptcp_push_release(struct sock *sk, struct sock *ssk, - struct mptcp_sendmsg_info *info) +static void mptcp_push_release(struct sock *ssk, struct mptcp_sendmsg_info *info) { - mptcp_set_timeout(sk, ssk); tcp_push(ssk, 0, info->mss_now, tcp_sk(ssk)->nonagle, info->size_goal); release_sock(ssk); } -static void __mptcp_push_pending(struct sock *sk, unsigned int flags) +static void mptcp_update_post_push(struct mptcp_sock *msk, + struct mptcp_data_frag *dfrag, + u32 sent) +{ + u64 snd_nxt_new = dfrag->data_seq; + + dfrag->already_sent += sent; + + msk->snd_burst -= sent; + + snd_nxt_new += dfrag->already_sent; + + /* snd_nxt_new can be smaller than snd_nxt in case mptcp + * is recovering after a failover. In that event, this re-sends + * old segments. + * + * Thus compute snd_nxt_new candidate based on + * the dfrag->data_seq that was sent and the data + * that has been handed to the subflow for transmission + * and skip update in case it was old dfrag. + */ + if (likely(after64(snd_nxt_new, msk->snd_nxt))) + msk->snd_nxt = snd_nxt_new; +} + +static void mptcp_check_and_set_pending(struct sock *sk) +{ + if (mptcp_send_head(sk) && + !test_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags)) + set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags); +} + +void __mptcp_push_pending(struct sock *sk, unsigned int flags) { struct sock *prev_ssk = NULL, *ssk = NULL; struct mptcp_sock *msk = mptcp_sk(sk); @@ -1459,60 +1527,54 @@ static void __mptcp_push_pending(struct sock *sk, unsigned int flags) mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); - /* try to keep the subflow socket lock across - * consecutive xmit on the same socket + /* First check. If the ssk has changed since + * the last round, release prev_ssk */ if (ssk != prev_ssk && prev_ssk) - mptcp_push_release(sk, prev_ssk, &info); + mptcp_push_release(prev_ssk, &info); if (!ssk) goto out; - if (ssk != prev_ssk || !prev_ssk) - lock_sock(ssk); - - /* keep it simple and always provide a new skb for the - * subflow, even if we will not use it when collapsing - * on the pending one + /* Need to lock the new subflow only if different + * from the previous one, otherwise we are still + * helding the relevant lock */ - if (!mptcp_alloc_tx_skb(sk, ssk)) { - mptcp_push_release(sk, ssk, &info); - goto out; - } + if (ssk != prev_ssk) + lock_sock(ssk); ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) { - mptcp_push_release(sk, ssk, &info); + mptcp_push_release(ssk, &info); goto out; } info.sent += ret; - dfrag->already_sent += ret; - msk->snd_nxt += ret; - msk->snd_burst -= ret; - msk->tx_pending_data -= ret; copied += ret; len -= ret; + + mptcp_update_post_push(msk, dfrag, ret); } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); } /* at this point we held the socket lock for the last subflow we used */ if (ssk) - mptcp_push_release(sk, ssk, &info); + mptcp_push_release(ssk, &info); out: - if (copied) { - /* start the timer, if it's not pending */ - if (!mptcp_timer_pending(sk)) - mptcp_reset_timer(sk); + /* ensure the rtx timer is running */ + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); + if (copied) __mptcp_check_send_data_fin(sk); - } } static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) { struct mptcp_sock *msk = mptcp_sk(sk); - struct mptcp_sendmsg_info info; + struct mptcp_sendmsg_info info = { + .data_lock_held = true, + }; struct mptcp_data_frag *dfrag; struct sock *xmit_ssk; int len, copied = 0; @@ -1538,25 +1600,16 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) goto out; } - if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) { - __mptcp_update_wmem(sk); - sk_mem_reclaim_partial(sk); - } - if (!__mptcp_alloc_tx_skb(sk, ssk, GFP_ATOMIC)) - goto out; - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) goto out; info.sent += ret; - dfrag->already_sent += ret; - msk->snd_nxt += ret; - msk->snd_burst -= ret; - msk->tx_pending_data -= ret; copied += ret; len -= ret; first = false; + + mptcp_update_post_push(msk, dfrag, ret); } WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); } @@ -1565,9 +1618,7 @@ out: /* __mptcp_alloc_tx_skb could have released some wmem and we are * not going to flush it via release_sock() */ - __mptcp_update_wmem(sk); if (copied) { - mptcp_set_timeout(sk, ssk); tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle, info.size_goal); if (!mptcp_timer_pending(sk)) @@ -1603,7 +1654,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* silently ignore everything else */ msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL; - mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len))); + lock_sock(sk); timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); @@ -1651,23 +1702,22 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) psize = min_t(size_t, psize, msg_data_left(msg)); total_ts = psize + frag_truesize; - if (!mptcp_wmem_alloc(sk, total_ts)) + if (!sk_wmem_schedule(sk, total_ts)) goto wait_for_memory; if (copy_page_from_iter(dfrag->page, offset, psize, &msg->msg_iter) != psize) { - mptcp_wmem_uncharge(sk, psize + frag_truesize); ret = -EFAULT; goto out; } /* data successfully copied into the write queue */ + sk->sk_forward_alloc -= total_ts; copied += psize; dfrag->data_len += psize; frag_truesize += psize; pfrag->offset += frag_truesize; WRITE_ONCE(msk->write_seq, msk->write_seq + psize); - msk->tx_pending_data += psize; /* charge data on mptcp pending queue to the msk socket * Note: we charge such data both to sk and ssk @@ -1701,21 +1751,6 @@ out: return copied ? : ret; } -static void mptcp_wait_data(struct sock *sk, long *timeo) -{ - DEFINE_WAIT_FUNC(wait, woken_wake_function); - struct mptcp_sock *msk = mptcp_sk(sk); - - add_wait_queue(sk_sleep(sk), &wait); - sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); - - sk_wait_event(sk, timeo, - test_bit(MPTCP_DATA_READY, &msk->flags), &wait); - - sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); - remove_wait_queue(sk_sleep(sk), &wait); -} - static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, size_t len, int flags, @@ -1874,7 +1909,7 @@ static void __mptcp_update_rmem(struct sock *sk) return; atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc); - sk_mem_uncharge(sk, msk->rmem_released); + mptcp_rmem_uncharge(sk, msk->rmem_released); WRITE_ONCE(msk->rmem_released, 0); } @@ -1942,7 +1977,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); - mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk)); + lock_sock(sk); if (unlikely(sk->sk_state == TCP_LISTEN)) { copied = -ENOTCONN; goto out_err; @@ -2019,19 +2054,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } pr_debug("block timeout %ld", timeo); - mptcp_wait_data(sk, &timeo); - } - - if (skb_queue_empty_lockless(&sk->sk_receive_queue) && - skb_queue_empty(&msk->receive_queue)) { - /* entire backlog drained, clear DATA_READY. */ - clear_bit(MPTCP_DATA_READY, &msk->flags); - - /* .. race-breaker: ssk might have gotten new data - * after last __mptcp_move_skbs() returned false. - */ - if (unlikely(__mptcp_move_skbs(msk))) - set_bit(MPTCP_DATA_READY, &msk->flags); + sk_wait_data(sk, &timeo, NULL); } out_err: @@ -2040,9 +2063,9 @@ out_err: tcp_recv_timestamp(msg, sk, &tss); } - pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d", - msk, test_bit(MPTCP_DATA_READY, &msk->flags), - skb_queue_empty_lockless(&sk->sk_receive_queue), copied); + pr_debug("msk=%p rx queue empty=%d:%d copied=%d", + msk, skb_queue_empty_lockless(&sk->sk_receive_queue), + skb_queue_empty(&msk->receive_queue), copied); if (!(flags & MSG_PEEK)) mptcp_rcv_space_adjust(msk, copied); @@ -2083,10 +2106,11 @@ static void mptcp_timeout_timer(struct timer_list *t) * * A backup subflow is returned only if that is the only kind available. */ -static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) +static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk) { + struct sock *backup = NULL, *pick = NULL; struct mptcp_subflow_context *subflow; - struct sock *backup = NULL; + int min_stale_count = INT_MAX; sock_owned_by_me((const struct sock *)msk); @@ -2096,14 +2120,14 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - if (!mptcp_subflow_active(subflow)) + if (!__mptcp_subflow_active(subflow)) continue; - /* still data outstanding at TCP level? Don't retransmit. */ - if (!tcp_write_queue_empty(ssk)) { - if (inet_csk(ssk)->icsk_ca_state >= TCP_CA_Loss) - continue; - return NULL; + /* still data outstanding at TCP level? skip this */ + if (!tcp_rtx_and_write_queues_empty(ssk)) { + mptcp_pm_subflow_chk_stale(msk, ssk); + min_stale_count = min_t(int, min_stale_count, subflow->stale_count); + continue; } if (subflow->backup) { @@ -2112,10 +2136,15 @@ static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) continue; } - return ssk; + if (!pick) + pick = ssk; } - return backup; + if (pick) + return pick; + + /* use backup only if there are no progresses anywhere */ + return min_stale_count > 1 ? backup : NULL; } static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk) @@ -2126,6 +2155,46 @@ static void mptcp_dispose_initial_subflow(struct mptcp_sock *msk) } } +bool __mptcp_retransmit_pending_data(struct sock *sk) +{ + struct mptcp_data_frag *cur, *rtx_head; + struct mptcp_sock *msk = mptcp_sk(sk); + + if (__mptcp_check_fallback(mptcp_sk(sk))) + return false; + + if (tcp_rtx_and_write_queues_empty(sk)) + return false; + + /* the closing socket has some data untransmitted and/or unacked: + * some data in the mptcp rtx queue has not really xmitted yet. + * keep it simple and re-inject the whole mptcp level rtx queue + */ + mptcp_data_lock(sk); + __mptcp_clean_una_wakeup(sk); + rtx_head = mptcp_rtx_head(sk); + if (!rtx_head) { + mptcp_data_unlock(sk); + return false; + } + + msk->recovery_snd_nxt = msk->snd_nxt; + msk->recovery = true; + mptcp_data_unlock(sk); + + msk->first_pending = rtx_head; + msk->snd_burst = 0; + + /* be sure to clear the "sent status" on all re-injected fragments */ + list_for_each_entry(cur, &msk->rtx_queue, list) { + if (!cur->already_sent) + break; + cur->already_sent = 0; + } + + return true; +} + /* subflow sockets can be either outgoing (connect) or incoming * (accept). * @@ -2138,6 +2207,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow) { struct mptcp_sock *msk = mptcp_sk(sk); + bool need_push; list_del(&subflow->node); @@ -2149,6 +2219,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (ssk->sk_socket) sock_orphan(ssk); + need_push = __mptcp_retransmit_pending_data(sk); subflow->disposable = 1; /* if ssk hit tcp_done(), tcp_cleanup_ulp() cleared the related ops @@ -2176,6 +2247,9 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (msk->subflow && ssk == msk->subflow->sk) mptcp_dispose_initial_subflow(msk); + + if (need_push) + __mptcp_push_pending(sk, 0); } void mptcp_close_ssk(struct sock *sk, struct sock *ssk, @@ -2255,7 +2329,6 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) inet_sk_state_store(sk, TCP_CLOSE); sk->sk_shutdown = SHUTDOWN_MASK; smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ - set_bit(MPTCP_DATA_READY, &msk->flags); set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); mptcp_close_wake_up(sk); @@ -2271,6 +2344,9 @@ static void __mptcp_retrans(struct sock *sk) int ret; mptcp_clean_una_wakeup(sk); + + /* first check ssk: need to kick "stale" logic */ + ssk = mptcp_subflow_get_retrans(msk); dfrag = mptcp_rtx_head(sk); if (!dfrag) { if (mptcp_data_fin_enabled(msk)) { @@ -2283,10 +2359,12 @@ static void __mptcp_retrans(struct sock *sk) goto reset_timer; } - return; + if (!mptcp_send_head(sk)) + return; + + goto reset_timer; } - ssk = mptcp_subflow_get_retrans(msk); if (!ssk) goto reset_timer; @@ -2296,9 +2374,6 @@ static void __mptcp_retrans(struct sock *sk) info.sent = 0; info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; while (info.sent < info.limit) { - if (!mptcp_alloc_tx_skb(sk, ssk)) - break; - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) break; @@ -2313,10 +2388,11 @@ static void __mptcp_retrans(struct sock *sk) info.size_goal); } - mptcp_set_timeout(sk, ssk); release_sock(ssk); reset_timer: + mptcp_check_and_set_pending(sk); + if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); } @@ -2381,13 +2457,14 @@ static int __mptcp_init_sock(struct sock *sk) __skb_queue_head_init(&msk->receive_queue); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; - msk->wmem_reserved = 0; + msk->rmem_fwd_alloc = 0; WRITE_ONCE(msk->rmem_released, 0); - msk->tx_pending_data = 0; + msk->timer_ival = TCP_RTO_MIN; msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); + msk->recovery = false; mptcp_pm_data_init(msk); @@ -2472,7 +2549,6 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how) tcp_shutdown(ssk, how); } else { pr_debug("Sending DATA_FIN on subflow %p", ssk); - mptcp_set_timeout(sk, ssk); tcp_send_ack(ssk); if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); @@ -2592,7 +2668,7 @@ static void __mptcp_destroy_sock(struct sock *sk) sk->sk_prot->destroy(sk); - WARN_ON_ONCE(msk->wmem_reserved); + WARN_ON_ONCE(msk->rmem_fwd_alloc); WARN_ON_ONCE(msk->rmem_released); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); @@ -2625,7 +2701,7 @@ cleanup: inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32; mptcp_for_each_subflow(mptcp_sk(sk), subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - bool slow = lock_sock_fast(ssk); + bool slow = lock_sock_fast_nested(ssk); sock_orphan(ssk); unlock_sock_fast(ssk, slow); @@ -2723,7 +2799,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->token = subflow_req->token; msk->subflow = NULL; WRITE_ONCE(msk->fully_established, false); - if (mp_opt->csum_reqd) + if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) WRITE_ONCE(msk->csum_enabled, true); msk->write_seq = subflow_req->idsn + 1; @@ -2732,7 +2808,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; - if (mp_opt->mp_capable) { + if (mp_opt->suboptions & OPTIONS_MPTCP_MPC) { msk->can_ack = true; msk->remote_key = mp_opt->sndr_key; mptcp_crypto_key_sha(msk->remote_key, NULL, &ack_seq); @@ -2825,8 +2901,14 @@ void mptcp_destroy_common(struct mptcp_sock *msk) /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); - + __skb_queue_purge(&sk->sk_receive_queue); skb_rbtree_purge(&msk->out_of_order_queue); + + /* move all the rx fwd alloc into the sk_mem_reclaim_final in + * inet_sock_destruct() will dispose it + */ + sk->sk_forward_alloc += msk->rmem_fwd_alloc; + msk->rmem_fwd_alloc = 0; mptcp_token_destroy(msk); mptcp_pm_free_anno_list(msk); } @@ -2908,10 +2990,6 @@ static void mptcp_release_cb(struct sock *sk) if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags)) __mptcp_error_report(sk); - /* push_pending may touch wmem_reserved, ensure we do the cleanup - * later - */ - __mptcp_update_wmem(sk); __mptcp_update_rmem(sk); } @@ -3061,6 +3139,11 @@ static void mptcp_shutdown(struct sock *sk, int how) __mptcp_wr_shutdown(sk); } +static int mptcp_forward_alloc_get(const struct sock *sk) +{ + return sk->sk_forward_alloc + mptcp_sk(sk)->rmem_fwd_alloc; +} + static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, @@ -3078,6 +3161,7 @@ static struct proto mptcp_prot = { .hash = mptcp_hash, .unhash = mptcp_unhash, .get_port = mptcp_get_port, + .forward_alloc_get = mptcp_forward_alloc_get, .sockets_allocated = &mptcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, @@ -3275,8 +3359,14 @@ unlock_fail: static __poll_t mptcp_check_readable(struct mptcp_sock *msk) { - return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : - 0; + /* Concurrent splices from sk_receive_queue into receive_queue will + * always show at least one non-empty queue when checked in this order. + */ + if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) && + skb_queue_empty_lockless(&msk->receive_queue)) + return 0; + + return EPOLLIN | EPOLLRDNORM; } static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) @@ -3311,7 +3401,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, state = inet_sk_state_load(sk); pr_debug("msk=%p state=%d flags=%lx", msk, state, msk->flags); if (state == TCP_LISTEN) - return mptcp_check_readable(msk); + return test_bit(MPTCP_DATA_READY, &msk->flags) ? EPOLLIN | EPOLLRDNORM : 0; if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 0f0c026c5f8b..67a61ac48b20 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -26,6 +26,15 @@ #define OPTION_MPTCP_FASTCLOSE BIT(8) #define OPTION_MPTCP_PRIO BIT(9) #define OPTION_MPTCP_RST BIT(10) +#define OPTION_MPTCP_DSS BIT(11) +#define OPTION_MPTCP_FAIL BIT(12) + +#define OPTION_MPTCP_CSUMREQD BIT(13) + +#define OPTIONS_MPTCP_MPC (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | \ + OPTION_MPTCP_MPC_ACK) +#define OPTIONS_MPTCP_MPJ (OPTION_MPTCP_MPJ_SYN | OPTION_MPTCP_MPJ_SYNACK | \ + OPTION_MPTCP_MPJ_ACK) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -67,6 +76,7 @@ #define TCPOLEN_MPTCP_PRIO_ALIGN 4 #define TCPOLEN_MPTCP_FASTCLOSE 12 #define TCPOLEN_MPTCP_RST 4 +#define TCPOLEN_MPTCP_FAIL 12 #define TCPOLEN_MPTCP_MPC_ACK_DATA_CSUM (TCPOLEN_MPTCP_DSS_CHECKSUM + TCPOLEN_MPTCP_MPC_ACK_DATA) @@ -129,35 +139,28 @@ struct mptcp_options_received { u32 subflow_seq; u16 data_len; __sum16 csum; - u16 mp_capable : 1, - mp_join : 1, - fastclose : 1, - reset : 1, - dss : 1, - add_addr : 1, - rm_addr : 1, - mp_prio : 1, - echo : 1, - csum_reqd : 1, - backup : 1, - deny_join_id0 : 1; + u16 suboptions; u32 token; u32 nonce; - u64 thmac; - u8 hmac[MPTCPOPT_HMAC_LEN]; - u8 join_id; - u8 use_map:1, + u16 use_map:1, dsn64:1, data_fin:1, use_ack:1, ack64:1, mpc_map:1, + reset_reason:4, + reset_transient:1, + echo:1, + backup:1, + deny_join_id0:1, __unused:2; + u8 join_id; + u64 thmac; + u8 hmac[MPTCPOPT_HMAC_LEN]; struct mptcp_addr_info addr; struct mptcp_rm_list rm_list; u64 ahmac; - u8 reset_reason:4; - u8 reset_transient:1; + u64 fail_seq; }; static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) @@ -178,8 +181,6 @@ enum mptcp_pm_status { enum mptcp_addr_signal_status { MPTCP_ADD_ADDR_SIGNAL, MPTCP_ADD_ADDR_ECHO, - MPTCP_ADD_ADDR_IPV6, - MPTCP_ADD_ADDR_PORT, MPTCP_RM_ADDR_SIGNAL, }; @@ -226,16 +227,21 @@ struct mptcp_sock { u64 ack_seq; u64 rcv_wnd_sent; u64 rcv_data_fin_seq; - int wmem_reserved; + int rmem_fwd_alloc; struct sock *last_snd; int snd_burst; int old_wspace; + u64 recovery_snd_nxt; /* in recovery mode accept up to this seq; + * recovery related fields are under data_lock + * protection + */ u64 snd_una; u64 wnd_end; unsigned long timer_ival; u32 token; int rmem_released; unsigned long flags; + bool recovery; /* closing subflow write queue reinjected */ bool can_ack; bool fully_established; bool rcv_data_fin; @@ -248,7 +254,6 @@ struct mptcp_sock { struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; struct sk_buff_head receive_queue; - int tx_pending_data; struct list_head conn_list; struct list_head rtx_queue; struct mptcp_data_frag *first_pending; @@ -267,19 +272,6 @@ struct mptcp_sock { char ca_name[TCP_CA_NAME_MAX]; }; -#define mptcp_lock_sock(___sk, cb) do { \ - struct sock *__sk = (___sk); /* silence macro reuse warning */ \ - might_sleep(); \ - spin_lock_bh(&__sk->sk_lock.slock); \ - if (__sk->sk_lock.owned) \ - __lock_sock(__sk); \ - cb; \ - __sk->sk_lock.owned = 1; \ - spin_unlock(&__sk->sk_lock.slock); \ - mutex_acquire(&__sk->sk_lock.dep_map, 0, 0, _RET_IP_); \ - local_bh_enable(); \ -} while (0) - #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) #define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock) @@ -425,9 +417,11 @@ struct mptcp_subflow_context { mpc_map : 1, backup : 1, send_mp_prio : 1, + send_mp_fail : 1, rx_eof : 1, can_ack : 1, /* only after processing the remote a key */ - disposable : 1; /* ctx can be free at ulp release time */ + disposable : 1, /* ctx can be free at ulp release time */ + stale : 1; /* unable to snd/rcv data, do not use for xmit */ enum mptcp_data_avail data_avail; u32 remote_nonce; u64 thmac; @@ -439,11 +433,13 @@ struct mptcp_subflow_context { u8 reset_seen:1; u8 reset_transient:1; u8 reset_reason:4; + u8 stale_count; long delegated_status; struct list_head delegated_node; /* link into delegated_action, protected by local BH */ - u32 setsockopt_seq; + u32 setsockopt_seq; + u32 stale_rcv_tstamp; struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ @@ -549,31 +545,34 @@ static inline void mptcp_subflow_delegated_done(struct mptcp_subflow_context *su clear_bit(MPTCP_DELEGATE_SEND, &subflow->delegated_status); } -int mptcp_is_enabled(struct net *net); -unsigned int mptcp_get_add_addr_timeout(struct net *net); -int mptcp_is_checksum_enabled(struct net *net); -int mptcp_allow_join_id0(struct net *net); +int mptcp_is_enabled(const struct net *net); +unsigned int mptcp_get_add_addr_timeout(const struct net *net); +int mptcp_is_checksum_enabled(const struct net *net); +int mptcp_allow_join_id0(const struct net *net); +unsigned int mptcp_stale_loss_cnt(const struct net *net); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt); +bool __mptcp_retransmit_pending_data(struct sock *sk); +void __mptcp_push_pending(struct sock *sk, unsigned int flags); bool mptcp_subflow_data_available(struct sock *sk); void __init mptcp_subflow_init(void); void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how); void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); +void mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, - const struct mptcp_addr_info *remote, - u8 flags, int ifindex); + const struct mptcp_addr_info *remote); int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); void mptcp_info2sockaddr(const struct mptcp_addr_info *info, struct sockaddr_storage *addr, unsigned short family); -static inline bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) +static inline bool __mptcp_subflow_active(struct mptcp_subflow_context *subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -585,6 +584,10 @@ static inline bool mptcp_subflow_active(struct mptcp_subflow_context *subflow) return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)); } +void mptcp_subflow_set_active(struct mptcp_subflow_context *subflow); + +bool mptcp_subflow_active(struct mptcp_subflow_context *subflow); + static inline void mptcp_subflow_tcp_fallback(struct sock *sk, struct mptcp_subflow_context *ctx) { @@ -596,6 +599,19 @@ static inline void mptcp_subflow_tcp_fallback(struct sock *sk, inet_csk(sk)->icsk_af_ops = ctx->icsk_af_ops; } +static inline bool mptcp_has_another_subflow(struct sock *ssk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk), *tmp; + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + + mptcp_for_each_subflow(msk, tmp) { + if (tmp != subflow) + return true; + } + + return false; +} + void __init mptcp_proto_init(void); #if IS_ENABLED(CONFIG_MPTCP_IPV6) int __init mptcp_proto_v6_init(void); @@ -679,7 +695,7 @@ int mptcp_token_new_connect(struct sock *sk); void mptcp_token_accept(struct mptcp_subflow_request_sock *r, struct mptcp_sock *msk); bool mptcp_token_exists(u32 token); -struct mptcp_sock *mptcp_token_get_sock(u32 token); +struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token); struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, long *s_num); void mptcp_token_destroy(struct mptcp_sock *msk); @@ -690,6 +706,8 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); void __init mptcp_pm_init(void); void mptcp_pm_data_init(struct mptcp_sock *msk); +void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); +void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); @@ -705,9 +723,7 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); -int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, - u8 bkup); +void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); bool mptcp_pm_sport_in_anno_list(struct mptcp_sock *msk, const struct sock *sk); struct mptcp_pm_add_entry * @@ -716,6 +732,8 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, struct mptcp_pm_add_entry * mptcp_lookup_anno_list_by_saddr(struct mptcp_sock *msk, struct mptcp_addr_info *addr); +int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id, + u8 *flags, int *ifindex); int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, @@ -730,22 +748,18 @@ void mptcp_event_addr_removed(const struct mptcp_sock *msk, u8 id); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL); + return READ_ONCE(msk->pm.addr_signal) & + (BIT(MPTCP_ADD_ADDR_SIGNAL) | BIT(MPTCP_ADD_ADDR_ECHO)); } -static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk) -{ - return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO); -} - -static inline bool mptcp_pm_should_add_signal_ipv6(struct mptcp_sock *msk) +static inline bool mptcp_pm_should_add_signal_addr(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_IPV6); + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_SIGNAL); } -static inline bool mptcp_pm_should_add_signal_port(struct mptcp_sock *msk) +static inline bool mptcp_pm_should_add_signal_echo(struct mptcp_sock *msk) { - return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_PORT); + return READ_ONCE(msk->pm.addr_signal) & BIT(MPTCP_ADD_ADDR_ECHO); } static inline bool mptcp_pm_should_rm_signal(struct mptcp_sock *msk) @@ -776,8 +790,10 @@ static inline int mptcp_rm_addr_len(const struct mptcp_rm_list *rm_list) return TCPOLEN_MPTCP_RM_ADDR_BASE + roundup(rm_list->nr - 1, 4) + 1; } -bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, unsigned int remaining, - struct mptcp_addr_info *saddr, bool *echo, bool *port); +bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, struct sk_buff *skb, + unsigned int opt_size, unsigned int remaining, + struct mptcp_addr_info *addr, bool *echo, + bool *port, bool *drop_other_suboptions); bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 8c03afac5ca0..0f1e661c2032 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -14,6 +14,8 @@ #include <net/mptcp.h> #include "protocol.h" +#define MIN_INFO_OPTLEN_SIZE 16 + static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk) { sock_owned_by_me((const struct sock *)msk); @@ -670,6 +672,266 @@ out: return ret; } +void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) +{ + struct sock *sk = &msk->sk.icsk_inet.sk; + u32 flags = 0; + bool slow; + u8 val; + + memset(info, 0, sizeof(*info)); + + slow = lock_sock_fast(sk); + + info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); + info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); + info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); + info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); + info->mptcpi_subflows_max = mptcp_pm_get_subflows_max(msk); + val = mptcp_pm_get_add_addr_signal_max(msk); + info->mptcpi_add_addr_signal_max = val; + val = mptcp_pm_get_add_addr_accept_max(msk); + info->mptcpi_add_addr_accepted_max = val; + info->mptcpi_local_addr_max = mptcp_pm_get_local_addr_max(msk); + if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) + flags |= MPTCP_INFO_FLAG_FALLBACK; + if (READ_ONCE(msk->can_ack)) + flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED; + info->mptcpi_flags = flags; + info->mptcpi_token = READ_ONCE(msk->token); + info->mptcpi_write_seq = READ_ONCE(msk->write_seq); + info->mptcpi_snd_una = READ_ONCE(msk->snd_una); + info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq); + info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); + + unlock_sock_fast(sk, slow); +} +EXPORT_SYMBOL_GPL(mptcp_diag_fill_info); + +static int mptcp_getsockopt_info(struct mptcp_sock *msk, char __user *optval, int __user *optlen) +{ + struct mptcp_info m_info; + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(struct mptcp_info)); + + mptcp_diag_fill_info(msk, &m_info); + + if (put_user(len, optlen)) + return -EFAULT; + + if (copy_to_user(optval, &m_info, len)) + return -EFAULT; + + return 0; +} + +static int mptcp_put_subflow_data(struct mptcp_subflow_data *sfd, + char __user *optval, + u32 copied, + int __user *optlen) +{ + u32 copylen = min_t(u32, sfd->size_subflow_data, sizeof(*sfd)); + + if (copied) + copied += sfd->size_subflow_data; + else + copied = copylen; + + if (put_user(copied, optlen)) + return -EFAULT; + + if (copy_to_user(optval, sfd, copylen)) + return -EFAULT; + + return 0; +} + +static int mptcp_get_subflow_data(struct mptcp_subflow_data *sfd, + char __user *optval, int __user *optlen) +{ + int len, copylen; + + if (get_user(len, optlen)) + return -EFAULT; + + /* if mptcp_subflow_data size is changed, need to adjust + * this function to deal with programs using old version. + */ + BUILD_BUG_ON(sizeof(*sfd) != MIN_INFO_OPTLEN_SIZE); + + if (len < MIN_INFO_OPTLEN_SIZE) + return -EINVAL; + + memset(sfd, 0, sizeof(*sfd)); + + copylen = min_t(unsigned int, len, sizeof(*sfd)); + if (copy_from_user(sfd, optval, copylen)) + return -EFAULT; + + /* size_subflow_data is u32, but len is signed */ + if (sfd->size_subflow_data > INT_MAX || + sfd->size_user > INT_MAX) + return -EINVAL; + + if (sfd->size_subflow_data < MIN_INFO_OPTLEN_SIZE || + sfd->size_subflow_data > len) + return -EINVAL; + + if (sfd->num_subflows || sfd->size_kernel) + return -EINVAL; + + return len - sfd->size_subflow_data; +} + +static int mptcp_getsockopt_tcpinfo(struct mptcp_sock *msk, char __user *optval, + int __user *optlen) +{ + struct mptcp_subflow_context *subflow; + struct sock *sk = &msk->sk.icsk_inet.sk; + unsigned int sfcount = 0, copied = 0; + struct mptcp_subflow_data sfd; + char __user *infoptr; + int len; + + len = mptcp_get_subflow_data(&sfd, optval, optlen); + if (len < 0) + return len; + + sfd.size_kernel = sizeof(struct tcp_info); + sfd.size_user = min_t(unsigned int, sfd.size_user, + sizeof(struct tcp_info)); + + infoptr = optval + sfd.size_subflow_data; + + lock_sock(sk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + ++sfcount; + + if (len && len >= sfd.size_user) { + struct tcp_info info; + + tcp_get_info(ssk, &info); + + if (copy_to_user(infoptr, &info, sfd.size_user)) { + release_sock(sk); + return -EFAULT; + } + + infoptr += sfd.size_user; + copied += sfd.size_user; + len -= sfd.size_user; + } + } + + release_sock(sk); + + sfd.num_subflows = sfcount; + + if (mptcp_put_subflow_data(&sfd, optval, copied, optlen)) + return -EFAULT; + + return 0; +} + +static void mptcp_get_sub_addrs(const struct sock *sk, struct mptcp_subflow_addrs *a) +{ + struct inet_sock *inet = inet_sk(sk); + + memset(a, 0, sizeof(*a)); + + if (sk->sk_family == AF_INET) { + a->sin_local.sin_family = AF_INET; + a->sin_local.sin_port = inet->inet_sport; + a->sin_local.sin_addr.s_addr = inet->inet_rcv_saddr; + + if (!a->sin_local.sin_addr.s_addr) + a->sin_local.sin_addr.s_addr = inet->inet_saddr; + + a->sin_remote.sin_family = AF_INET; + a->sin_remote.sin_port = inet->inet_dport; + a->sin_remote.sin_addr.s_addr = inet->inet_daddr; +#if IS_ENABLED(CONFIG_IPV6) + } else if (sk->sk_family == AF_INET6) { + const struct ipv6_pinfo *np = inet6_sk(sk); + + if (WARN_ON_ONCE(!np)) + return; + + a->sin6_local.sin6_family = AF_INET6; + a->sin6_local.sin6_port = inet->inet_sport; + + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) + a->sin6_local.sin6_addr = np->saddr; + else + a->sin6_local.sin6_addr = sk->sk_v6_rcv_saddr; + + a->sin6_remote.sin6_family = AF_INET6; + a->sin6_remote.sin6_port = inet->inet_dport; + a->sin6_remote.sin6_addr = sk->sk_v6_daddr; +#endif + } +} + +static int mptcp_getsockopt_subflow_addrs(struct mptcp_sock *msk, char __user *optval, + int __user *optlen) +{ + struct sock *sk = &msk->sk.icsk_inet.sk; + struct mptcp_subflow_context *subflow; + unsigned int sfcount = 0, copied = 0; + struct mptcp_subflow_data sfd; + char __user *addrptr; + int len; + + len = mptcp_get_subflow_data(&sfd, optval, optlen); + if (len < 0) + return len; + + sfd.size_kernel = sizeof(struct mptcp_subflow_addrs); + sfd.size_user = min_t(unsigned int, sfd.size_user, + sizeof(struct mptcp_subflow_addrs)); + + addrptr = optval + sfd.size_subflow_data; + + lock_sock(sk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + ++sfcount; + + if (len && len >= sfd.size_user) { + struct mptcp_subflow_addrs a; + + mptcp_get_sub_addrs(ssk, &a); + + if (copy_to_user(addrptr, &a, sfd.size_user)) { + release_sock(sk); + return -EFAULT; + } + + addrptr += sfd.size_user; + copied += sfd.size_user; + len -= sfd.size_user; + } + } + + release_sock(sk); + + sfd.num_subflows = sfcount; + + if (mptcp_put_subflow_data(&sfd, optval, copied, optlen)) + return -EFAULT; + + return 0; +} + static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, char __user *optval, int __user *optlen) { @@ -684,6 +946,21 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, return -EOPNOTSUPP; } +static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname, + char __user *optval, int __user *optlen) +{ + switch (optname) { + case MPTCP_INFO: + return mptcp_getsockopt_info(msk, optval, optlen); + case MPTCP_TCPINFO: + return mptcp_getsockopt_tcpinfo(msk, optval, optlen); + case MPTCP_SUBFLOW_ADDRS: + return mptcp_getsockopt_subflow_addrs(msk, optval, optlen); + } + + return -EOPNOTSUPP; +} + int mptcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *option) { @@ -706,6 +983,8 @@ int mptcp_getsockopt(struct sock *sk, int level, int optname, if (level == SOL_TCP) return mptcp_getsockopt_sol_tcp(msk, optname, optval, option); + if (level == SOL_MPTCP) + return mptcp_getsockopt_sol_mptcp(msk, optname, optval, option); return -EOPNOTSUPP; } diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 966f777d35ce..6172f380dfb7 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -86,7 +86,7 @@ static struct mptcp_sock *subflow_token_join_request(struct request_sock *req) struct mptcp_sock *msk; int local_id; - msk = mptcp_token_get_sock(subflow_req->token); + msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), subflow_req->token); if (!msk) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN); return NULL; @@ -141,6 +141,7 @@ static int subflow_check_req(struct request_sock *req, struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; + bool opt_mp_capable, opt_mp_join; pr_debug("subflow_req=%p, listener=%p", subflow_req, listener); @@ -154,16 +155,18 @@ static int subflow_check_req(struct request_sock *req, mptcp_get_options(sk_listener, skb, &mp_opt); - if (mp_opt.mp_capable) { + opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); + opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ); + if (opt_mp_capable) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); - if (mp_opt.mp_join) + if (opt_mp_join) return 0; - } else if (mp_opt.mp_join) { + } else if (opt_mp_join) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); } - if (mp_opt.mp_capable && listener->request_mptcp) { + if (opt_mp_capable && listener->request_mptcp) { int err, retries = MPTCP_TOKEN_MAX_RETRIES; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; @@ -194,7 +197,7 @@ again: else SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT); - } else if (mp_opt.mp_join && listener->request_mptcp) { + } else if (opt_mp_join && listener->request_mptcp) { subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; subflow_req->mp_join = 1; subflow_req->backup = mp_opt.backup; @@ -243,15 +246,18 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener); struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct mptcp_options_received mp_opt; + bool opt_mp_capable, opt_mp_join; int err; subflow_init_req(req, sk_listener); mptcp_get_options(sk_listener, skb, &mp_opt); - if (mp_opt.mp_capable && mp_opt.mp_join) + opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC); + opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ); + if (opt_mp_capable && opt_mp_join) return -EINVAL; - if (mp_opt.mp_capable && listener->request_mptcp) { + if (opt_mp_capable && listener->request_mptcp) { if (mp_opt.sndr_key == 0) return -EINVAL; @@ -262,7 +268,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req, subflow_req->mp_capable = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1; - } else if (mp_opt.mp_join && listener->request_mptcp) { + } else if (opt_mp_join && listener->request_mptcp) { if (!mptcp_token_join_cookie_init_state(subflow_req, skb)) return -EINVAL; @@ -394,7 +400,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) subflow->icsk_af_ops->sk_rx_dst_set(sk, skb); - /* be sure no special action on any packet other than syn-ack */ if (subflow->conn_finished) return; @@ -407,7 +412,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) mptcp_get_options(sk, skb, &mp_opt); if (subflow->request_mptcp) { - if (!mp_opt.mp_capable) { + if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); mptcp_do_fallback(sk); @@ -415,7 +420,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) goto fallback; } - if (mp_opt.csum_reqd) + if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD) WRITE_ONCE(mptcp_sk(parent)->csum_enabled, true); if (mp_opt.deny_join_id0) WRITE_ONCE(mptcp_sk(parent)->pm.remote_deny_join_id0, true); @@ -430,15 +435,17 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; - if (!mp_opt.mp_join) { + if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ)) { subflow->reset_reason = MPTCP_RST_EMPTCP; goto do_reset; } + subflow->backup = mp_opt.backup; subflow->thmac = mp_opt.thmac; subflow->remote_nonce = mp_opt.nonce; - pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, - subflow->thmac, subflow->remote_nonce); + pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d", + subflow, subflow->thmac, subflow->remote_nonce, + subflow->backup); if (!subflow_thmac_valid(subflow)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); @@ -634,10 +641,10 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); - /* After child creation we must look for 'mp_capable' even when options + /* After child creation we must look for MPC even when options * are not parsed */ - mp_opt.mp_capable = 0; + mp_opt.suboptions = 0; /* hopefully temporary handling for MP_JOIN+syncookie */ subflow_req = mptcp_subflow_rsk(req); @@ -657,7 +664,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, * options. */ mptcp_get_options(sk, skb, &mp_opt); - if (!mp_opt.mp_capable) { + if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) { fallback = true; goto create_child; } @@ -667,7 +674,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, fallback = true; } else if (subflow_req->mp_join) { mptcp_get_options(sk, skb, &mp_opt); - if (!mp_opt.mp_join || !subflow_hmac_valid(req, &mp_opt) || + if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ) || + !subflow_hmac_valid(req, &mp_opt) || !mptcp_can_accept_new_subflow(subflow_req->msk)) { SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); fallback = true; @@ -724,7 +732,7 @@ create_child: /* with OoO packets we can reach here without ingress * mpc option */ - if (mp_opt.mp_capable) + if (mp_opt.suboptions & OPTIONS_MPTCP_MPC) mptcp_subflow_fully_established(ctx, &mp_opt); } else if (ctx->mp_join) { struct mptcp_sock *owner; @@ -908,6 +916,8 @@ static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff * csum = csum_partial(&header, sizeof(header), subflow->map_data_csum); if (unlikely(csum_fold(csum))) { MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR); + subflow->send_mp_fail = 1; + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPFAILTX); return subflow->mp_join ? MAPPING_INVALID : MAPPING_DUMMY; } @@ -1155,6 +1165,20 @@ no_data: fallback: /* RFC 8684 section 3.7. */ + if (subflow->send_mp_fail) { + if (mptcp_has_another_subflow(ssk)) { + while ((skb = skb_peek(&ssk->sk_receive_queue))) + sk_eat_skb(ssk, skb); + } + ssk->sk_err = EBADMSG; + tcp_set_state(ssk, TCP_CLOSE); + subflow->reset_transient = 0; + subflow->reset_reason = MPTCP_RST_EMIDDLEBOX; + tcp_send_active_reset(ssk, GFP_ATOMIC); + WRITE_ONCE(subflow->data_avail, 0); + return true; + } + if (subflow->mp_join || subflow->fully_established) { /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers @@ -1353,8 +1377,7 @@ void mptcp_info2sockaddr(const struct mptcp_addr_info *info, } int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, - const struct mptcp_addr_info *remote, - u8 flags, int ifindex) + const struct mptcp_addr_info *remote) { struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_subflow_context *subflow; @@ -1365,6 +1388,8 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, struct sock *ssk; u32 remote_token; int addrlen; + int ifindex; + u8 flags; int err; if (!mptcp_is_fully_established(sk)) @@ -1388,6 +1413,8 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, local_id = err; } + mptcp_pm_get_flags_and_ifindex_by_id(sock_net(sk), local_id, + &flags, &ifindex); subflow->remote_key = msk->remote_key; subflow->local_key = msk->local_key; subflow->token = msk->token; diff --git a/net/mptcp/syncookies.c b/net/mptcp/syncookies.c index 37127781aee9..7f22526346a7 100644 --- a/net/mptcp/syncookies.c +++ b/net/mptcp/syncookies.c @@ -108,18 +108,12 @@ bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subfl e->valid = 0; - msk = mptcp_token_get_sock(e->token); + msk = mptcp_token_get_sock(net, e->token); if (!msk) { spin_unlock_bh(&join_entry_locks[i]); return false; } - /* If this fails, the token got re-used in the mean time by another - * mptcp socket in a different netns, i.e. entry is outdated. - */ - if (!net_eq(sock_net((struct sock *)msk), net)) - goto err_put; - subflow_req->remote_nonce = e->remote_nonce; subflow_req->local_nonce = e->local_nonce; subflow_req->backup = e->backup; @@ -128,11 +122,6 @@ bool mptcp_token_join_cookie_init_state(struct mptcp_subflow_request_sock *subfl subflow_req->msk = msk; spin_unlock_bh(&join_entry_locks[i]); return true; - -err_put: - spin_unlock_bh(&join_entry_locks[i]); - sock_put((struct sock *)msk); - return false; } void __init mptcp_join_cookie_init(void) diff --git a/net/mptcp/token.c b/net/mptcp/token.c index a98e554b034f..e581b341c5be 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -231,6 +231,7 @@ found: /** * mptcp_token_get_sock - retrieve mptcp connection sock using its token + * @net: restrict to this namespace * @token: token of the mptcp connection to retrieve * * This function returns the mptcp connection structure with the given token. @@ -238,7 +239,7 @@ found: * * returns NULL if no connection with the given token value exists. */ -struct mptcp_sock *mptcp_token_get_sock(u32 token) +struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token) { struct hlist_nulls_node *pos; struct token_bucket *bucket; @@ -251,11 +252,15 @@ struct mptcp_sock *mptcp_token_get_sock(u32 token) again: sk_nulls_for_each_rcu(sk, pos, &bucket->msk_chain) { msk = mptcp_sk(sk); - if (READ_ONCE(msk->token) != token) + if (READ_ONCE(msk->token) != token || + !net_eq(sock_net(sk), net)) continue; + if (!refcount_inc_not_zero(&sk->sk_refcnt)) goto not_found; - if (READ_ONCE(msk->token) != token) { + + if (READ_ONCE(msk->token) != token || + !net_eq(sock_net(sk), net)) { sock_put(sk); goto again; } diff --git a/net/mptcp/token_test.c b/net/mptcp/token_test.c index e1bd6f0a0676..5d984bec1cd8 100644 --- a/net/mptcp/token_test.c +++ b/net/mptcp/token_test.c @@ -11,6 +11,7 @@ static struct mptcp_subflow_request_sock *build_req_sock(struct kunit *test) GFP_USER); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, req); mptcp_token_init_request((struct request_sock *)req); + sock_net_set((struct sock *)req, &init_net); return req; } @@ -22,7 +23,7 @@ static void mptcp_token_test_req_basic(struct kunit *test) KUNIT_ASSERT_EQ(test, 0, mptcp_token_new_request((struct request_sock *)req)); KUNIT_EXPECT_NE(test, 0, (int)req->token); - KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(req->token)); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(&init_net, req->token)); /* cleanup */ mptcp_token_destroy_request((struct request_sock *)req); @@ -55,6 +56,7 @@ static struct mptcp_sock *build_msk(struct kunit *test) msk = kunit_kzalloc(test, sizeof(struct mptcp_sock), GFP_USER); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, msk); refcount_set(&((struct sock *)msk)->sk_refcnt, 1); + sock_net_set((struct sock *)msk, &init_net); return msk; } @@ -74,11 +76,11 @@ static void mptcp_token_test_msk_basic(struct kunit *test) mptcp_token_new_connect((struct sock *)icsk)); KUNIT_EXPECT_NE(test, 0, (int)ctx->token); KUNIT_EXPECT_EQ(test, ctx->token, msk->token); - KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(ctx->token)); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(&init_net, ctx->token)); KUNIT_EXPECT_EQ(test, 2, (int)refcount_read(&sk->sk_refcnt)); mptcp_token_destroy(msk); - KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(ctx->token)); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(&init_net, ctx->token)); } static void mptcp_token_test_accept(struct kunit *test) @@ -90,11 +92,11 @@ static void mptcp_token_test_accept(struct kunit *test) mptcp_token_new_request((struct request_sock *)req)); msk->token = req->token; mptcp_token_accept(req, msk); - KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token)); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(&init_net, msk->token)); /* this is now a no-op */ mptcp_token_destroy_request((struct request_sock *)req); - KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(msk->token)); + KUNIT_EXPECT_PTR_EQ(test, msk, mptcp_token_get_sock(&init_net, msk->token)); /* cleanup */ mptcp_token_destroy(msk); @@ -116,7 +118,7 @@ static void mptcp_token_test_destroyed(struct kunit *test) /* simulate race on removal */ refcount_set(&sk->sk_refcnt, 0); - KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(msk->token)); + KUNIT_EXPECT_PTR_EQ(test, null_msk, mptcp_token_get_sock(&init_net, msk->token)); /* cleanup */ mptcp_token_destroy(msk); diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 0b6cfd3b31e0..03757e76bb6b 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -80,6 +80,7 @@ enum { #define NCSI_OEM_MFR_BCM_ID 0x113d #define NCSI_OEM_MFR_INTEL_ID 0x157 /* Intel specific OEM command */ +#define NCSI_OEM_INTEL_CMD_GMA 0x06 /* CMD ID for Get MAC */ #define NCSI_OEM_INTEL_CMD_KEEP_PHY 0x20 /* CMD ID for Keep PHY up */ /* Broadcom specific OEM Command */ #define NCSI_OEM_BCM_CMD_GMA 0x01 /* CMD ID for Get MAC */ @@ -89,6 +90,7 @@ enum { #define NCSI_OEM_MLX_CMD_SMAF 0x01 /* CMD ID for Set MC Affinity */ #define NCSI_OEM_MLX_CMD_SMAF_PARAM 0x07 /* Parameter for SMAF */ /* OEM Command payload lengths*/ +#define NCSI_OEM_INTEL_CMD_GMA_LEN 5 #define NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN 7 #define NCSI_OEM_BCM_CMD_GMA_LEN 12 #define NCSI_OEM_MLX_CMD_GMA_LEN 8 @@ -99,6 +101,7 @@ enum { /* Mac address offset in OEM response */ #define BCM_MAC_ADDR_OFFSET 28 #define MLX_MAC_ADDR_OFFSET 8 +#define INTEL_MAC_ADDR_OFFSET 1 struct ncsi_channel_version { diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 89c7742cd72e..7121ce2a47c0 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -795,13 +795,36 @@ static int ncsi_oem_smaf_mlx(struct ncsi_cmd_arg *nca) return ret; } +static int ncsi_oem_gma_handler_intel(struct ncsi_cmd_arg *nca) +{ + unsigned char data[NCSI_OEM_INTEL_CMD_GMA_LEN]; + int ret = 0; + + nca->payload = NCSI_OEM_INTEL_CMD_GMA_LEN; + + memset(data, 0, NCSI_OEM_INTEL_CMD_GMA_LEN); + *(unsigned int *)data = ntohl((__force __be32)NCSI_OEM_MFR_INTEL_ID); + data[4] = NCSI_OEM_INTEL_CMD_GMA; + + nca->data = data; + + ret = ncsi_xmit_cmd(nca); + if (ret) + netdev_err(nca->ndp->ndev.dev, + "NCSI: Failed to transmit cmd 0x%x during configure\n", + nca->type); + + return ret; +} + /* OEM Command handlers initialization */ static struct ncsi_oem_gma_handler { unsigned int mfr_id; int (*handler)(struct ncsi_cmd_arg *nca); } ncsi_oem_gma_handlers[] = { { NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm }, - { NCSI_OEM_MFR_MLX_ID, ncsi_oem_gma_handler_mlx } + { NCSI_OEM_MFR_MLX_ID, ncsi_oem_gma_handler_mlx }, + { NCSI_OEM_MFR_INTEL_ID, ncsi_oem_gma_handler_intel } }; static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id) diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index 80938b338fee..ba66c7dc3a21 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -178,6 +178,12 @@ struct ncsi_rsp_oem_bcm_pkt { unsigned char data[]; /* Cmd specific Data */ }; +/* Intel Response Data */ +struct ncsi_rsp_oem_intel_pkt { + unsigned char cmd; /* OEM Command ID */ + unsigned char data[]; /* Cmd specific Data */ +}; + /* Get Link Status */ struct ncsi_rsp_gls_pkt { struct ncsi_rsp_pkt_hdr rsp; /* Response header */ diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index d48374894817..6447a09932f5 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -699,9 +699,51 @@ static int ncsi_rsp_handler_oem_bcm(struct ncsi_request *nr) return 0; } +/* Response handler for Intel command Get Mac Address */ +static int ncsi_rsp_handler_oem_intel_gma(struct ncsi_request *nr) +{ + struct ncsi_dev_priv *ndp = nr->ndp; + struct net_device *ndev = ndp->ndev.dev; + const struct net_device_ops *ops = ndev->netdev_ops; + struct ncsi_rsp_oem_pkt *rsp; + struct sockaddr saddr; + int ret = 0; + + /* Get the response header */ + rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); + + saddr.sa_family = ndev->type; + ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + memcpy(saddr.sa_data, &rsp->data[INTEL_MAC_ADDR_OFFSET], ETH_ALEN); + /* Increase mac address by 1 for BMC's address */ + eth_addr_inc((u8 *)saddr.sa_data); + if (!is_valid_ether_addr((const u8 *)saddr.sa_data)) + return -ENXIO; + + /* Set the flag for GMA command which should only be called once */ + ndp->gma_flag = 1; + + ret = ops->ndo_set_mac_address(ndev, &saddr); + if (ret < 0) + netdev_warn(ndev, + "NCSI: 'Writing mac address to device failed\n"); + + return ret; +} + /* Response handler for Intel card */ static int ncsi_rsp_handler_oem_intel(struct ncsi_request *nr) { + struct ncsi_rsp_oem_intel_pkt *intel; + struct ncsi_rsp_oem_pkt *rsp; + + /* Get the response header */ + rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); + intel = (struct ncsi_rsp_oem_intel_pkt *)(rsp->data); + + if (intel->cmd == NCSI_OEM_INTEL_CMD_GMA) + return ncsi_rsp_handler_oem_intel_gma(nr); + return 0; } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 54395266339d..3646fc195e7d 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -10,6 +10,17 @@ config NETFILTER_INGRESS This allows you to classify packets from ingress using the Netfilter infrastructure. +config NETFILTER_EGRESS + bool "Netfilter egress support" + default y + select NET_EGRESS + help + This allows you to classify packets before transmission using the + Netfilter infrastructure. + +config NETFILTER_SKIP_EGRESS + def_bool NETFILTER_EGRESS && (NET_CLS_ACT || IFB) + config NETFILTER_NETLINK tristate @@ -109,7 +120,7 @@ config NF_CONNTRACK_MARK config NF_CONNTRACK_SECMARK bool 'Connection tracking security mark support' depends on NETWORK_SECMARK - default m if NETFILTER_ADVANCED=n + default y if NETFILTER_ADVANCED=n help This option enables security markings to be applied to connections. Typically they are copied to connections from diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 049890e00a3d..aab20e575ecd 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -212,3 +212,6 @@ obj-$(CONFIG_IP_SET) += ipset/ # IPVS obj-$(CONFIG_IP_VS) += ipvs/ + +# lwtunnel +obj-$(CONFIG_LWTUNNEL) += nf_hooks_lwtunnel.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 63d032191e62..6dec9cd395f1 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -317,6 +317,12 @@ nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, return &dev->nf_hooks_ingress; } #endif +#ifdef CONFIG_NETFILTER_EGRESS + if (hooknum == NF_NETDEV_EGRESS) { + if (dev && dev_net(dev) == net) + return &dev->nf_hooks_egress; + } +#endif WARN_ON_ONCE(1); return NULL; } @@ -335,7 +341,8 @@ static int nf_ingress_check(struct net *net, const struct nf_hook_ops *reg, return 0; } -static inline bool nf_ingress_hook(const struct nf_hook_ops *reg, int pf) +static inline bool __maybe_unused nf_ingress_hook(const struct nf_hook_ops *reg, + int pf) { if ((pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) || (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS)) @@ -344,6 +351,12 @@ static inline bool nf_ingress_hook(const struct nf_hook_ops *reg, int pf) return false; } +static inline bool __maybe_unused nf_egress_hook(const struct nf_hook_ops *reg, + int pf) +{ + return pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS; +} + static void nf_static_key_inc(const struct nf_hook_ops *reg, int pf) { #ifdef CONFIG_JUMP_LABEL @@ -383,9 +396,18 @@ static int __nf_register_net_hook(struct net *net, int pf, switch (pf) { case NFPROTO_NETDEV: - err = nf_ingress_check(net, reg, NF_NETDEV_INGRESS); - if (err < 0) - return err; +#ifndef CONFIG_NETFILTER_INGRESS + if (reg->hooknum == NF_NETDEV_INGRESS) + return -EOPNOTSUPP; +#endif +#ifndef CONFIG_NETFILTER_EGRESS + if (reg->hooknum == NF_NETDEV_EGRESS) + return -EOPNOTSUPP; +#endif + if ((reg->hooknum != NF_NETDEV_INGRESS && + reg->hooknum != NF_NETDEV_EGRESS) || + !reg->dev || dev_net(reg->dev) != net) + return -EINVAL; break; case NFPROTO_INET: if (reg->hooknum != NF_INET_INGRESS) @@ -418,6 +440,10 @@ static int __nf_register_net_hook(struct net *net, int pf, if (nf_ingress_hook(reg, pf)) net_inc_ingress_queue(); #endif +#ifdef CONFIG_NETFILTER_EGRESS + if (nf_egress_hook(reg, pf)) + net_inc_egress_queue(); +#endif nf_static_key_inc(reg, pf); BUG_ON(p == new_hooks); @@ -475,6 +501,10 @@ static void __nf_unregister_net_hook(struct net *net, int pf, if (nf_ingress_hook(reg, pf)) net_dec_ingress_queue(); #endif +#ifdef CONFIG_NETFILTER_EGRESS + if (nf_egress_hook(reg, pf)) + net_dec_egress_queue(); +#endif nf_static_key_dec(reg, pf); } else { WARN_ONCE(1, "hook not found, pf %d num %d", pf, reg->hooknum); diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 6186358eac7c..6e391308431d 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -130,11 +130,11 @@ htable_size(u8 hbits) { size_t hsize; - /* We must fit both into u32 in jhash and size_t */ + /* We must fit both into u32 in jhash and INT_MAX in kvmalloc_node() */ if (hbits > 31) return 0; hsize = jhash_size(hbits); - if ((((size_t)-1) - sizeof(struct htable)) / sizeof(struct hbucket *) + if ((INT_MAX - sizeof(struct htable)) / sizeof(struct hbucket *) < hsize) return 0; diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index c100c6b112c8..2c467c422dc6 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1468,6 +1468,10 @@ int __init ip_vs_conn_init(void) int idx; /* Compute size and mask */ + if (ip_vs_conn_tab_bits < 8 || ip_vs_conn_tab_bits > 20) { + pr_info("conn_tab_bits not in [8, 20]. Using default value\n"); + ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; + } ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 128690c512df..e93c937a8bf0 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1330,12 +1330,15 @@ drop: * Check if outgoing packet belongs to the established ip_vs_conn. */ static unsigned int -ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) +ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { + struct netns_ipvs *ipvs = net_ipvs(state->net); + unsigned int hooknum = state->hook; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; + int af = state->pf; struct sock *sk; EnterFunction(11); @@ -1468,56 +1471,6 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in return NF_ACCEPT; } -/* - * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, - * used only for VS/NAT. - * Check if packet is reply for established ip_vs_conn. - */ -static unsigned int -ip_vs_reply4(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); -} - -/* - * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. - * Check if packet is reply for established ip_vs_conn. - */ -static unsigned int -ip_vs_local_reply4(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET); -} - -#ifdef CONFIG_IP_VS_IPV6 - -/* - * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, - * used only for VS/NAT. - * Check if packet is reply for established ip_vs_conn. - */ -static unsigned int -ip_vs_reply6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); -} - -/* - * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. - * Check if packet is reply for established ip_vs_conn. - */ -static unsigned int -ip_vs_local_reply6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_out(net_ipvs(state->net), state->hook, skb, AF_INET6); -} - -#endif - static unsigned int ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, @@ -1957,8 +1910,10 @@ out: * and send it on its way... */ static unsigned int -ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int af) +ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { + struct netns_ipvs *ipvs = net_ipvs(state->net); + unsigned int hooknum = state->hook; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; @@ -1966,6 +1921,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int int ret, pkts; int conn_reuse_mode; struct sock *sk; + int af = state->pf; /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) @@ -2138,55 +2094,6 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int } /* - * AF_INET handler in NF_INET_LOCAL_IN chain - * Schedule and forward packets from remote clients - */ -static unsigned int -ip_vs_remote_request4(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); -} - -/* - * AF_INET handler in NF_INET_LOCAL_OUT chain - * Schedule and forward packets from local clients - */ -static unsigned int -ip_vs_local_request4(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET); -} - -#ifdef CONFIG_IP_VS_IPV6 - -/* - * AF_INET6 handler in NF_INET_LOCAL_IN chain - * Schedule and forward packets from remote clients - */ -static unsigned int -ip_vs_remote_request6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); -} - -/* - * AF_INET6 handler in NF_INET_LOCAL_OUT chain - * Schedule and forward packets from local clients - */ -static unsigned int -ip_vs_local_request6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - return ip_vs_in(net_ipvs(state->net), state->hook, skb, AF_INET6); -} - -#endif - - -/* * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP * related packets destined for 0.0.0.0/0. * When fwmark-based virtual service is used, such as transparent @@ -2199,45 +2106,36 @@ static unsigned int ip_vs_forward_icmp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - int r; struct netns_ipvs *ipvs = net_ipvs(state->net); - - if (ip_hdr(skb)->protocol != IPPROTO_ICMP) - return NF_ACCEPT; + int r; /* ipvs enabled in this netns ? */ if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) return NF_ACCEPT; - return ip_vs_in_icmp(ipvs, skb, &r, state->hook); -} - + if (state->pf == NFPROTO_IPV4) { + if (ip_hdr(skb)->protocol != IPPROTO_ICMP) + return NF_ACCEPT; #ifdef CONFIG_IP_VS_IPV6 -static unsigned int -ip_vs_forward_icmp_v6(void *priv, struct sk_buff *skb, - const struct nf_hook_state *state) -{ - int r; - struct netns_ipvs *ipvs = net_ipvs(state->net); - struct ip_vs_iphdr iphdr; + } else { + struct ip_vs_iphdr iphdr; - ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); - if (iphdr.protocol != IPPROTO_ICMPV6) - return NF_ACCEPT; + ip_vs_fill_iph_skb(AF_INET6, skb, false, &iphdr); - /* ipvs enabled in this netns ? */ - if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) - return NF_ACCEPT; + if (iphdr.protocol != IPPROTO_ICMPV6) + return NF_ACCEPT; - return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); -} + return ip_vs_in_icmp_v6(ipvs, skb, &r, state->hook, &iphdr); #endif + } + return ip_vs_in_icmp(ipvs, skb, &r, state->hook); +} static const struct nf_hook_ops ip_vs_ops4[] = { /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_reply4, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 2, @@ -2246,21 +2144,21 @@ static const struct nf_hook_ops ip_vs_ops4[] = { * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. */ { - .hook = ip_vs_remote_request4, + .hook = ip_vs_in_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC - 1, }, /* Before ip_vs_in, change source only for VS/NAT */ { - .hook = ip_vs_local_reply4, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 1, }, /* After mangle, schedule and forward local requests */ { - .hook = ip_vs_local_request4, + .hook = ip_vs_in_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST + 2, @@ -2275,7 +2173,7 @@ static const struct nf_hook_ops ip_vs_ops4[] = { }, /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_reply4, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV4, .hooknum = NF_INET_FORWARD, .priority = 100, @@ -2286,7 +2184,7 @@ static const struct nf_hook_ops ip_vs_ops4[] = { static const struct nf_hook_ops ip_vs_ops6[] = { /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_reply6, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 2, @@ -2295,21 +2193,21 @@ static const struct nf_hook_ops ip_vs_ops6[] = { * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. */ { - .hook = ip_vs_remote_request6, + .hook = ip_vs_in_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC - 1, }, /* Before ip_vs_in, change source only for VS/NAT */ { - .hook = ip_vs_local_reply6, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 1, }, /* After mangle, schedule and forward local requests */ { - .hook = ip_vs_local_request6, + .hook = ip_vs_in_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST + 2, @@ -2317,14 +2215,14 @@ static const struct nf_hook_ops ip_vs_ops6[] = { /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { - .hook = ip_vs_forward_icmp_v6, + .hook = ip_vs_forward_icmp, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 99, }, /* After packet filtering, change source only for VS/NAT */ { - .hook = ip_vs_reply6, + .hook = ip_vs_out_hook, .pf = NFPROTO_IPV6, .hooknum = NF_INET_FORWARD, .priority = 100, diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c25097092a06..e62b40bd349e 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2017,6 +2017,12 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "run_estimation", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -4090,6 +4096,13 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode; tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; + ipvs->sysctl_run_estimation = 1; + tbl[idx++].data = &ipvs->sysctl_run_estimation; +#ifdef CONFIG_IP_VS_DEBUG + /* Global sysctls must be ro in non-init netns */ + if (!net_eq(net, &init_net)) + tbl[idx++].mode = 0444; +#endif ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); if (ipvs->sysctl_hdr == NULL) { diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 05b8112ffb37..9a1a7af6a186 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -100,6 +100,9 @@ static void estimation_timer(struct timer_list *t) u64 rate; struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer); + if (!sysctl_run_estimation(ipvs)) + goto skip; + spin_lock(&ipvs->est_lock); list_for_each_entry(e, &ipvs->est_list, list) { s = container_of(e, struct ip_vs_stats, est); @@ -131,6 +134,8 @@ static void estimation_timer(struct timer_list *t) spin_unlock(&s->lock); } spin_unlock(&ipvs->est_lock); + +skip: mod_timer(&ipvs->est_timer, jiffies + 2*HZ); } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index d31dbccbe7bd..770a63103c7a 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -21,7 +21,6 @@ #include <linux/stddef.h> #include <linux/slab.h> #include <linux/random.h> -#include <linux/jhash.h> #include <linux/siphash.h> #include <linux/err.h> #include <linux/percpu.h> @@ -75,9 +74,15 @@ static __read_mostly struct kmem_cache *nf_conntrack_cachep; static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; +/* serialize hash resizes and nf_ct_iterate_cleanup */ +static DEFINE_MUTEX(nf_conntrack_mutex); + #define GC_SCAN_INTERVAL (120u * HZ) #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) +#define MIN_CHAINLEN 8u +#define MAX_CHAINLEN (32u - MIN_CHAINLEN) + static struct conntrack_gc_work conntrack_gc_work; void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) @@ -184,25 +189,34 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_max); seqcount_spinlock_t nf_conntrack_generation __read_mostly; -static unsigned int nf_conntrack_hash_rnd __read_mostly; +static siphash_key_t nf_conntrack_hash_rnd __read_mostly; static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, + unsigned int zoneid, const struct net *net) { - unsigned int n; - u32 seed; + struct { + struct nf_conntrack_man src; + union nf_inet_addr dst_addr; + unsigned int zone; + u32 net_mix; + u16 dport; + u16 proto; + } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); - /* The direction must be ignored, so we hash everything up to the - * destination ports (which is a multiple of 4) and treat the last - * three bytes manually. - */ - seed = nf_conntrack_hash_rnd ^ net_hash_mix(net); - n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); - return jhash2((u32 *)tuple, n, seed ^ - (((__force __u16)tuple->dst.u.all << 16) | - tuple->dst.protonum)); + memset(&combined, 0, sizeof(combined)); + + /* The direction must be ignored, so handle usable members manually. */ + combined.src = tuple->src; + combined.dst_addr = tuple->dst.u3; + combined.zone = zoneid; + combined.net_mix = net_hash_mix(net); + combined.dport = (__force __u16)tuple->dst.u.all; + combined.proto = tuple->dst.protonum; + + return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd); } static u32 scale_hash(u32 hash) @@ -212,15 +226,17 @@ static u32 scale_hash(u32 hash) static u32 __hash_conntrack(const struct net *net, const struct nf_conntrack_tuple *tuple, + unsigned int zoneid, unsigned int size) { - return reciprocal_scale(hash_conntrack_raw(tuple, net), size); + return reciprocal_scale(hash_conntrack_raw(tuple, zoneid, net), size); } static u32 hash_conntrack(const struct net *net, - const struct nf_conntrack_tuple *tuple) + const struct nf_conntrack_tuple *tuple, + unsigned int zoneid) { - return scale_hash(hash_conntrack_raw(tuple, net)); + return scale_hash(hash_conntrack_raw(tuple, zoneid, net)); } static bool nf_ct_get_tuple_ports(const struct sk_buff *skb, @@ -643,9 +659,11 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); reply_hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); clean_from_lists(ct); @@ -812,8 +830,20 @@ struct nf_conntrack_tuple_hash * nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { - return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple, net)); + unsigned int rid, zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); + struct nf_conntrack_tuple_hash *thash; + + thash = __nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, zone_id, net)); + + if (thash) + return thash; + + rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); + if (rid != zone_id) + return __nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, rid, net)); + return thash; } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -835,7 +865,10 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; + unsigned int max_chainlen; + unsigned int chainlen = 0; unsigned int sequence; + int err = -EEXIST; zone = nf_ct_zone(ct); @@ -843,21 +876,34 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) do { sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_ORIGINAL)); reply_hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); + max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); + /* See if there's one in the list already, including reverse */ - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + if (chainlen++ > max_chainlen) + goto chaintoolong; + } + + chainlen = 0; + + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; + if (chainlen++ > max_chainlen) + goto chaintoolong; + } smp_wmb(); /* The caller holds a reference to this object */ @@ -867,11 +913,13 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) NF_CT_STAT_INC(net, insert); local_bh_enable(); return 0; - +chaintoolong: + NF_CT_STAT_INC(net, chaintoolong); + err = -ENOSPC; out: nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); - return -EEXIST; + return err; } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); @@ -1083,6 +1131,7 @@ drop: int __nf_conntrack_confirm(struct sk_buff *skb) { + unsigned int chainlen = 0, sequence, max_chainlen; const struct nf_conntrack_zone *zone; unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; @@ -1091,7 +1140,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) struct hlist_nulls_node *n; enum ip_conntrack_info ctinfo; struct net *net; - unsigned int sequence; int ret = NF_DROP; ct = nf_ct_get(skb, &ctinfo); @@ -1113,8 +1161,8 @@ __nf_conntrack_confirm(struct sk_buff *skb) hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; hash = scale_hash(hash); reply_hash = hash_conntrack(net, - &ct->tuplehash[IP_CT_DIR_REPLY].tuple); - + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + nf_ct_zone_id(nf_ct_zone(ct), IP_CT_DIR_REPLY)); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* We're not in hash table, and we refuse to set up related @@ -1148,18 +1196,32 @@ __nf_conntrack_confirm(struct sk_buff *skb) goto dying; } + max_chainlen = MIN_CHAINLEN + prandom_u32_max(MAX_CHAINLEN); /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; + if (chainlen++ > max_chainlen) + goto chaintoolong; + } - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + chainlen = 0; + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; + if (chainlen++ > max_chainlen) { +chaintoolong: + nf_ct_add_to_dying_list(ct); + NF_CT_STAT_INC(net, chaintoolong); + NF_CT_STAT_INC(net, insert_failed); + ret = NF_DROP; + goto dying; + } + } /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in @@ -1213,7 +1275,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, rcu_read_lock(); begin: nf_conntrack_get_ht(&ct_hash, &hsize); - hash = __hash_conntrack(net, tuple, hsize); + hash = __hash_conntrack(net, tuple, nf_ct_zone_id(zone, IP_CT_DIR_REPLY), hsize); hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); @@ -1654,8 +1716,8 @@ resolve_normal_ct(struct nf_conn *tmpl, struct nf_conntrack_tuple_hash *h; enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; + u32 hash, zone_id, rid; struct nf_conn *ct; - u32 hash; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, state->pf, protonum, state->net, @@ -1666,8 +1728,20 @@ resolve_normal_ct(struct nf_conn *tmpl, /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); - hash = hash_conntrack_raw(&tuple, state->net); + + zone_id = nf_ct_zone_id(zone, IP_CT_DIR_ORIGINAL); + hash = hash_conntrack_raw(&tuple, zone_id, state->net); h = __nf_conntrack_find_get(state->net, zone, &tuple, hash); + + if (!h) { + rid = nf_ct_zone_id(zone, IP_CT_DIR_REPLY); + if (zone_id != rid) { + u32 tmp = hash_conntrack_raw(&tuple, rid, state->net); + + h = __nf_conntrack_find_get(state->net, zone, &tuple, tmp); + } + } + if (!h) { h = init_conntrack(state->net, tmpl, &tuple, skb, dataoff, hash); @@ -2192,28 +2266,31 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data), spinlock_t *lockp; for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { + struct hlist_nulls_head *hslot = &nf_conntrack_hash[*bucket]; + + if (hlist_nulls_empty(hslot)) + continue; + lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; local_bh_disable(); nf_conntrack_lock(lockp); - if (*bucket < nf_conntrack_htable_size) { - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { - if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) - continue; - /* All nf_conn objects are added to hash table twice, one - * for original direction tuple, once for the reply tuple. - * - * Exception: In the IPS_NAT_CLASH case, only the reply - * tuple is added (the original tuple already existed for - * a different object). - * - * We only need to call the iterator once for each - * conntrack, so we just use the 'reply' direction - * tuple while iterating. - */ - ct = nf_ct_tuplehash_to_ctrack(h); - if (iter(ct, data)) - goto found; - } + hlist_nulls_for_each_entry(h, n, hslot, hnnode) { + if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) + continue; + /* All nf_conn objects are added to hash table twice, one + * for original direction tuple, once for the reply tuple. + * + * Exception: In the IPS_NAT_CLASH case, only the reply + * tuple is added (the original tuple already existed for + * a different object). + * + * We only need to call the iterator once for each + * conntrack, so we just use the 'reply' direction + * tuple while iterating. + */ + ct = nf_ct_tuplehash_to_ctrack(h); + if (iter(ct, data)) + goto found; } spin_unlock(lockp); local_bh_enable(); @@ -2231,26 +2308,20 @@ found: static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report) { - unsigned int bucket = 0, sequence; + unsigned int bucket = 0; struct nf_conn *ct; might_sleep(); - for (;;) { - sequence = read_seqcount_begin(&nf_conntrack_generation); - - while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { - /* Time to push up daises... */ + mutex_lock(&nf_conntrack_mutex); + while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { + /* Time to push up daises... */ - nf_ct_delete(ct, portid, report); - nf_ct_put(ct); - cond_resched(); - } - - if (!read_seqcount_retry(&nf_conntrack_generation, sequence)) - break; - bucket = 0; + nf_ct_delete(ct, portid, report); + nf_ct_put(ct); + cond_resched(); } + mutex_unlock(&nf_conntrack_mutex); } struct iter_data { @@ -2486,8 +2557,10 @@ int nf_conntrack_hash_resize(unsigned int hashsize) if (!hash) return -ENOMEM; + mutex_lock(&nf_conntrack_mutex); old_size = nf_conntrack_htable_size; if (old_size == hashsize) { + mutex_unlock(&nf_conntrack_mutex); kvfree(hash); return 0; } @@ -2504,12 +2577,16 @@ int nf_conntrack_hash_resize(unsigned int hashsize) for (i = 0; i < nf_conntrack_htable_size; i++) { while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { + unsigned int zone_id; + h = hlist_nulls_entry(nf_conntrack_hash[i].first, struct nf_conntrack_tuple_hash, hnnode); ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); + + zone_id = nf_ct_zone_id(nf_ct_zone(ct), NF_CT_DIRECTION(h)); bucket = __hash_conntrack(nf_ct_net(ct), - &h->tuple, hashsize); + &h->tuple, zone_id, hashsize); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } @@ -2523,6 +2600,8 @@ int nf_conntrack_hash_resize(unsigned int hashsize) nf_conntrack_all_unlock(); local_bh_enable(); + mutex_unlock(&nf_conntrack_mutex); + synchronize_net(); kvfree(old_hash); return 0; @@ -2594,26 +2673,24 @@ int nf_conntrack_init_start(void) spin_lock_init(&nf_conntrack_locks[i]); if (!nf_conntrack_htable_size) { - /* Idea from tcp.c: use 1/16384 of memory. - * On i386: 32MB machine has 512 buckets. - * >= 1GB machines have 16384 buckets. - * >= 4GB machines have 65536 buckets. - */ nf_conntrack_htable_size = (((nr_pages << PAGE_SHIFT) / 16384) / sizeof(struct hlist_head)); - if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) - nf_conntrack_htable_size = 65536; + if (BITS_PER_LONG >= 64 && + nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) + nf_conntrack_htable_size = 262144; else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) - nf_conntrack_htable_size = 16384; - if (nf_conntrack_htable_size < 32) - nf_conntrack_htable_size = 32; - - /* Use a max. factor of four by default to get the same max as - * with the old struct list_heads. When a table size is given - * we use the old value of 8 to avoid reducing the max. - * entries. */ - max_factor = 4; + nf_conntrack_htable_size = 65536; + + if (nf_conntrack_htable_size < 1024) + nf_conntrack_htable_size = 1024; + /* Use a max. factor of one by default to keep the average + * hash chain length at 2 entries. Each entry has to be added + * twice (once for original direction, once for reply). + * When a table size is given we use the old value of 8 to + * avoid implicit reduction of the max entries setting. + */ + max_factor = 1; } nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 296e4a171bd1..41768ff19464 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -130,58 +130,77 @@ static void ecache_work(struct work_struct *work) schedule_delayed_work(&cnet->ecache_dwork, delay); } -int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, - u32 portid, int report) +static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e, + const unsigned int events, + const unsigned long missed, + const struct nf_ct_event *item) { - int ret = 0; - struct net *net = nf_ct_net(ct); + struct nf_conn *ct = item->ct; + struct net *net = nf_ct_net(item->ct); struct nf_ct_event_notifier *notify; - struct nf_conntrack_ecache *e; + int ret; + + if (!((events | missed) & e->ctmask)) + return 0; rcu_read_lock(); + notify = rcu_dereference(net->ct.nf_conntrack_event_cb); - if (!notify) - goto out_unlock; + if (!notify) { + rcu_read_unlock(); + return 0; + } + + ret = notify->ct_event(events | missed, item); + rcu_read_unlock(); + + if (likely(ret >= 0 && missed == 0)) + return 0; + + spin_lock_bh(&ct->lock); + if (ret < 0) + e->missed |= events; + else + e->missed &= ~missed; + spin_unlock_bh(&ct->lock); + + return ret; +} + +int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct, + u32 portid, int report) +{ + struct nf_conntrack_ecache *e; + struct nf_ct_event item; + unsigned long missed; + int ret; + + if (!nf_ct_is_confirmed(ct)) + return 0; e = nf_ct_ecache_find(ct); if (!e) - goto out_unlock; + return 0; - if (nf_ct_is_confirmed(ct)) { - struct nf_ct_event item = { - .ct = ct, - .portid = e->portid ? e->portid : portid, - .report = report - }; - /* This is a resent of a destroy event? If so, skip missed */ - unsigned long missed = e->portid ? 0 : e->missed; - - if (!((eventmask | missed) & e->ctmask)) - goto out_unlock; - - ret = notify->fcn(eventmask | missed, &item); - if (unlikely(ret < 0 || missed)) { - spin_lock_bh(&ct->lock); - if (ret < 0) { - /* This is a destroy event that has been - * triggered by a process, we store the PORTID - * to include it in the retransmission. - */ - if (eventmask & (1 << IPCT_DESTROY)) { - if (e->portid == 0 && portid != 0) - e->portid = portid; - e->state = NFCT_ECACHE_DESTROY_FAIL; - } else { - e->missed |= eventmask; - } - } else { - e->missed &= ~missed; - } - spin_unlock_bh(&ct->lock); - } + memset(&item, 0, sizeof(item)); + + item.ct = ct; + item.portid = e->portid ? e->portid : portid; + item.report = report; + + /* This is a resent of a destroy event? If so, skip missed */ + missed = e->portid ? 0 : e->missed; + + ret = __nf_conntrack_eventmask_report(e, events, missed, &item); + if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) { + /* This is a destroy event that has been triggered by a process, + * we store the PORTID to include it in the retransmission. + */ + if (e->portid == 0 && portid != 0) + e->portid = portid; + e->state = NFCT_ECACHE_DESTROY_FAIL; } -out_unlock: - rcu_read_unlock(); + return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report); @@ -190,53 +209,28 @@ EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report); * disabled softirqs */ void nf_ct_deliver_cached_events(struct nf_conn *ct) { - struct net *net = nf_ct_net(ct); - unsigned long events, missed; - struct nf_ct_event_notifier *notify; struct nf_conntrack_ecache *e; struct nf_ct_event item; - int ret; - - rcu_read_lock(); - notify = rcu_dereference(net->ct.nf_conntrack_event_cb); - if (notify == NULL) - goto out_unlock; + unsigned long events; if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) - goto out_unlock; + return; e = nf_ct_ecache_find(ct); if (e == NULL) - goto out_unlock; + return; events = xchg(&e->cache, 0); - /* We make a copy of the missed event cache without taking - * the lock, thus we may send missed events twice. However, - * this does not harm and it happens very rarely. */ - missed = e->missed; - - if (!((events | missed) & e->ctmask)) - goto out_unlock; - item.ct = ct; item.portid = 0; item.report = 0; - ret = notify->fcn(events | missed, &item); - - if (likely(ret == 0 && !missed)) - goto out_unlock; - - spin_lock_bh(&ct->lock); - if (ret < 0) - e->missed |= events; - else - e->missed &= ~missed; - spin_unlock_bh(&ct->lock); - -out_unlock: - rcu_read_unlock(); + /* We make a copy of the missed event cache without taking + * the lock, thus we may send missed events twice. However, + * this does not harm and it happens very rarely. + */ + __nf_conntrack_eventmask_report(e, events, e->missed, &item); } EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); @@ -246,11 +240,11 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, { struct net *net = nf_ct_exp_net(exp); - struct nf_exp_event_notifier *notify; + struct nf_ct_event_notifier *notify; struct nf_conntrack_ecache *e; rcu_read_lock(); - notify = rcu_dereference(net->ct.nf_expect_event_cb); + notify = rcu_dereference(net->ct.nf_conntrack_event_cb); if (!notify) goto out_unlock; @@ -264,86 +258,35 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, .portid = portid, .report = report }; - notify->fcn(1 << event, &item); + notify->exp_event(1 << event, &item); } out_unlock: rcu_read_unlock(); } -int nf_conntrack_register_notifier(struct net *net, - struct nf_ct_event_notifier *new) +void nf_conntrack_register_notifier(struct net *net, + const struct nf_ct_event_notifier *new) { - int ret; struct nf_ct_event_notifier *notify; mutex_lock(&nf_ct_ecache_mutex); notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, lockdep_is_held(&nf_ct_ecache_mutex)); - if (notify != NULL) { - ret = -EBUSY; - goto out_unlock; - } + WARN_ON_ONCE(notify); rcu_assign_pointer(net->ct.nf_conntrack_event_cb, new); - ret = 0; - -out_unlock: mutex_unlock(&nf_ct_ecache_mutex); - return ret; } EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); -void nf_conntrack_unregister_notifier(struct net *net, - struct nf_ct_event_notifier *new) +void nf_conntrack_unregister_notifier(struct net *net) { - struct nf_ct_event_notifier *notify; - mutex_lock(&nf_ct_ecache_mutex); - notify = rcu_dereference_protected(net->ct.nf_conntrack_event_cb, - lockdep_is_held(&nf_ct_ecache_mutex)); - BUG_ON(notify != new); RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL); mutex_unlock(&nf_ct_ecache_mutex); - /* synchronize_rcu() is called from ctnetlink_exit. */ + /* synchronize_rcu() is called after netns pre_exit */ } EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); -int nf_ct_expect_register_notifier(struct net *net, - struct nf_exp_event_notifier *new) -{ - int ret; - struct nf_exp_event_notifier *notify; - - mutex_lock(&nf_ct_ecache_mutex); - notify = rcu_dereference_protected(net->ct.nf_expect_event_cb, - lockdep_is_held(&nf_ct_ecache_mutex)); - if (notify != NULL) { - ret = -EBUSY; - goto out_unlock; - } - rcu_assign_pointer(net->ct.nf_expect_event_cb, new); - ret = 0; - -out_unlock: - mutex_unlock(&nf_ct_ecache_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(nf_ct_expect_register_notifier); - -void nf_ct_expect_unregister_notifier(struct net *net, - struct nf_exp_event_notifier *new) -{ - struct nf_exp_event_notifier *notify; - - mutex_lock(&nf_ct_ecache_mutex); - notify = rcu_dereference_protected(net->ct.nf_expect_event_cb, - lockdep_is_held(&nf_ct_ecache_mutex)); - BUG_ON(notify != new); - RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL); - mutex_unlock(&nf_ct_ecache_mutex); - /* synchronize_rcu() is called from ctnetlink_exit. */ -} -EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); - void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 1e851bc2e61a..f562eeef4234 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -17,7 +17,7 @@ #include <linux/err.h> #include <linux/percpu.h> #include <linux/kernel.h> -#include <linux/jhash.h> +#include <linux/siphash.h> #include <linux/moduleparam.h> #include <linux/export.h> #include <net/net_namespace.h> @@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_hash); unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; -static unsigned int nf_ct_expect_hashrnd __read_mostly; +static siphash_key_t nf_ct_expect_hashrnd __read_mostly; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, @@ -81,15 +81,26 @@ static void nf_ct_expectation_timed_out(struct timer_list *t) static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple) { - unsigned int hash, seed; + struct { + union nf_inet_addr dst_addr; + u32 net_mix; + u16 dport; + u8 l3num; + u8 protonum; + } __aligned(SIPHASH_ALIGNMENT) combined; + u32 hash; get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); - seed = nf_ct_expect_hashrnd ^ net_hash_mix(n); + memset(&combined, 0, sizeof(combined)); - hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), - (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | - (__force __u16)tuple->dst.u.all) ^ seed); + combined.dst_addr = tuple->dst.u3; + combined.net_mix = net_hash_mix(n); + combined.dport = (__force __u16)tuple->dst.u.all; + combined.l3num = tuple->src.l3num; + combined.protonum = tuple->dst.protonum; + + hash = siphash(&combined, sizeof(combined), &nf_ct_expect_hashrnd); return reciprocal_scale(hash, nf_ct_expect_hsize); } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index e81af33b233b..f1e5443fe7c7 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -706,7 +706,7 @@ static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) } static int -ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) +ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item) { const struct nf_conntrack_zone *zone; struct net *net; @@ -852,6 +852,11 @@ static int ctnetlink_done(struct netlink_callback *cb) return 0; } +struct ctnetlink_filter_u32 { + u32 val; + u32 mask; +}; + struct ctnetlink_filter { u8 family; @@ -862,10 +867,8 @@ struct ctnetlink_filter { struct nf_conntrack_tuple reply; struct nf_conntrack_zone zone; - struct { - u_int32_t val; - u_int32_t mask; - } mark; + struct ctnetlink_filter_u32 mark; + struct ctnetlink_filter_u32 status; }; static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = { @@ -907,6 +910,46 @@ static int ctnetlink_parse_tuple_filter(const struct nlattr * const cda[], struct nf_conntrack_zone *zone, u_int32_t flags); +static int ctnetlink_filter_parse_mark(struct ctnetlink_filter_u32 *mark, + const struct nlattr * const cda[]) +{ +#ifdef CONFIG_NF_CONNTRACK_MARK + if (cda[CTA_MARK]) { + mark->val = ntohl(nla_get_be32(cda[CTA_MARK])); + + if (cda[CTA_MARK_MASK]) + mark->mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + else + mark->mask = 0xffffffff; + } else if (cda[CTA_MARK_MASK]) { + return -EINVAL; + } +#endif + return 0; +} + +static int ctnetlink_filter_parse_status(struct ctnetlink_filter_u32 *status, + const struct nlattr * const cda[]) +{ + if (cda[CTA_STATUS]) { + status->val = ntohl(nla_get_be32(cda[CTA_STATUS])); + if (cda[CTA_STATUS_MASK]) + status->mask = ntohl(nla_get_be32(cda[CTA_STATUS_MASK])); + else + status->mask = status->val; + + /* status->val == 0? always true, else always false. */ + if (status->mask == 0) + return -EINVAL; + } else if (cda[CTA_STATUS_MASK]) { + return -EINVAL; + } + + /* CTA_STATUS is NLA_U32, if this fires UAPI needs to be extended */ + BUILD_BUG_ON(__IPS_MAX_BIT >= 32); + return 0; +} + static struct ctnetlink_filter * ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) { @@ -924,18 +967,14 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) filter->family = family; -#ifdef CONFIG_NF_CONNTRACK_MARK - if (cda[CTA_MARK]) { - filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK])); - if (cda[CTA_MARK_MASK]) - filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); - else - filter->mark.mask = 0xffffffff; - } else if (cda[CTA_MARK_MASK]) { - err = -EINVAL; + err = ctnetlink_filter_parse_mark(&filter->mark, cda); + if (err) goto err_filter; - } -#endif + + err = ctnetlink_filter_parse_status(&filter->status, cda); + if (err) + goto err_filter; + if (!cda[CTA_FILTER]) return filter; @@ -989,7 +1028,7 @@ err_filter: static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda) { - return family || cda[CTA_MARK] || cda[CTA_FILTER]; + return family || cda[CTA_MARK] || cda[CTA_FILTER] || cda[CTA_STATUS]; } static int ctnetlink_start(struct netlink_callback *cb) @@ -1082,6 +1121,7 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) { struct ctnetlink_filter *filter = data; struct nf_conntrack_tuple *tuple; + u32 status; if (filter == NULL) goto out; @@ -1113,6 +1153,9 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) if ((ct->mark & filter->mark.mask) != filter->mark.val) goto ignore_entry; #endif + status = (u32)READ_ONCE(ct->status); + if ((status & filter->status.mask) != filter->status.val) + goto ignore_entry; out: return 1; @@ -1495,6 +1538,7 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { [CTA_LABELS_MASK] = { .type = NLA_BINARY, .len = NF_CT_LABELS_MAX_SIZE }, [CTA_FILTER] = { .type = NLA_NESTED }, + [CTA_STATUS_MASK] = { .type = NLA_U32 }, }; static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) @@ -2484,7 +2528,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, nla_put_be32(skb, CTA_STATS_SEARCH_RESTART, htonl(st->search_restart)) || nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE, - htonl(st->clash_resolve))) + htonl(st->clash_resolve)) || + nla_put_be32(skb, CTA_STATS_CHAIN_TOOLONG, + htonl(st->chaintoolong))) goto nla_put_failure; nlmsg_end(skb, nlh); @@ -2625,6 +2671,8 @@ ctnetlink_glue_build_size(const struct nf_conn *ct) + nla_total_size(0) /* CTA_HELP */ + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ + ctnetlink_secctx_size(ct) + + ctnetlink_acct_size(ct) + + ctnetlink_timestamp_size(ct) #if IS_ENABLED(CONFIG_NF_NAT) + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */ @@ -2682,6 +2730,10 @@ static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) if (ctnetlink_dump_protoinfo(skb, ct, false) < 0) goto nla_put_failure; + if (ctnetlink_dump_acct(skb, ct, IPCTNL_MSG_CT_GET) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0) + goto nla_put_failure; + if (ctnetlink_dump_helpinfo(skb, ct) < 0) goto nla_put_failure; @@ -3060,7 +3112,7 @@ nla_put_failure: #ifdef CONFIG_NF_CONNTRACK_EVENTS static int -ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) +ctnetlink_expect_event(unsigned int events, const struct nf_exp_event *item) { struct nf_conntrack_expect *exp = item->exp; struct net *net = nf_ct_exp_net(exp); @@ -3711,11 +3763,8 @@ static int ctnetlink_stat_exp_cpu(struct sk_buff *skb, #ifdef CONFIG_NF_CONNTRACK_EVENTS static struct nf_ct_event_notifier ctnl_notifier = { - .fcn = ctnetlink_conntrack_event, -}; - -static struct nf_exp_event_notifier ctnl_notifier_exp = { - .fcn = ctnetlink_expect_event, + .ct_event = ctnetlink_conntrack_event, + .exp_event = ctnetlink_expect_event, }; #endif @@ -3808,52 +3857,21 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_EXP); static int __net_init ctnetlink_net_init(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_EVENTS - int ret; - - ret = nf_conntrack_register_notifier(net, &ctnl_notifier); - if (ret < 0) { - pr_err("ctnetlink_init: cannot register notifier.\n"); - goto err_out; - } - - ret = nf_ct_expect_register_notifier(net, &ctnl_notifier_exp); - if (ret < 0) { - pr_err("ctnetlink_init: cannot expect register notifier.\n"); - goto err_unreg_notifier; - } + nf_conntrack_register_notifier(net, &ctnl_notifier); #endif return 0; - -#ifdef CONFIG_NF_CONNTRACK_EVENTS -err_unreg_notifier: - nf_conntrack_unregister_notifier(net, &ctnl_notifier); -err_out: - return ret; -#endif } -static void ctnetlink_net_exit(struct net *net) +static void ctnetlink_net_pre_exit(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_EVENTS - nf_ct_expect_unregister_notifier(net, &ctnl_notifier_exp); - nf_conntrack_unregister_notifier(net, &ctnl_notifier); + nf_conntrack_unregister_notifier(net); #endif } -static void __net_exit ctnetlink_net_exit_batch(struct list_head *net_exit_list) -{ - struct net *net; - - list_for_each_entry(net, net_exit_list, exit_list) - ctnetlink_net_exit(net); - - /* wait for other cpus until they are done with ctnl_notifiers */ - synchronize_rcu(); -} - static struct pernet_operations ctnetlink_net_ops = { .init = ctnetlink_net_init, - .exit_batch = ctnetlink_net_exit_batch, + .pre_exit = ctnetlink_net_pre_exit, }; static int __init ctnetlink_init(void) diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 8f7a9837349c..d1f2d3c8d2b1 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -155,6 +155,16 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, } EXPORT_SYMBOL_GPL(nf_confirm); +static bool in_vrf_postrouting(const struct nf_hook_state *state) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (state->hook == NF_INET_POST_ROUTING && + netif_is_l3_master(state->out)) + return true; +#endif + return false; +} + static unsigned int ipv4_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -166,6 +176,9 @@ static unsigned int ipv4_confirm(void *priv, if (!ct || ctinfo == IP_CT_RELATED_REPLY) return nf_conntrack_confirm(skb); + if (in_vrf_postrouting(state)) + return NF_ACCEPT; + return nf_confirm(skb, skb_network_offset(skb) + ip_hdrlen(skb), ct, ctinfo); @@ -374,6 +387,9 @@ static unsigned int ipv6_confirm(void *priv, if (!ct || ctinfo == IP_CT_RELATED_REPLY) return nf_conntrack_confirm(skb); + if (in_vrf_postrouting(state)) + return NF_ACCEPT; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off); if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index f8e3c0d2602f..3b516cffc779 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -104,10 +104,13 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { unsigned long extra = timeouts[UDP_CT_UNREPLIED]; + bool stream = false; /* Still active after two seconds? Extend timeout. */ - if (time_after(jiffies, ct->proto.udp.stream_ts)) + if (time_after(jiffies, ct->proto.udp.stream_ts)) { extra = timeouts[UDP_CT_REPLIED]; + stream = true; + } nf_ct_refresh_acct(ct, ctinfo, skb, extra); @@ -116,7 +119,7 @@ int nf_conntrack_udp_packet(struct nf_conn *ct, return NF_ACCEPT; /* Also, more likely to be important, and not a probe */ - if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) + if (stream && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_ASSURED, ct); } else { nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index e84b499b7bfa..80f675d884b2 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -22,6 +22,9 @@ #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_timestamp.h> +#ifdef CONFIG_LWTUNNEL +#include <net/netfilter/nf_hooks_lwtunnel.h> +#endif #include <linux/rculist_nulls.h> static bool enable_hooks __read_mostly; @@ -429,7 +432,7 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) unsigned int nr_conntracks; if (v == SEQ_START_TOKEN) { - seq_puts(seq, "entries clashres found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); + seq_puts(seq, "entries clashres found new invalid ignore delete chainlength insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); return 0; } @@ -444,7 +447,7 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) st->invalid, 0, 0, - 0, + st->chaintoolong, st->insert, st->insert_failed, st->drop, @@ -612,6 +615,9 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_GRE, NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM, #endif +#ifdef CONFIG_LWTUNNEL + NF_SYSCTL_CT_LWTUNNEL, +#endif __NF_SYSCTL_CT_LAST_SYSCTL, }; @@ -959,6 +965,15 @@ static struct ctl_table nf_ct_sysctl_table[] = { .proc_handler = proc_dointvec_jiffies, }, #endif +#ifdef CONFIG_LWTUNNEL + [NF_SYSCTL_CT_LWTUNNEL] = { + .procname = "nf_hooks_lwtunnel", + .data = NULL, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = nf_hooks_lwtunnel_sysctl_handler, + }, +#endif {} }; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 8788b519255e..87a7388b6c89 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -99,7 +99,7 @@ static int flow_offload_fill_route(struct flow_offload *flow, flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true); break; case NFPROTO_IPV6: - flow_tuple->mtu = ip6_dst_mtu_forward(dst); + flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true); break; } @@ -180,15 +180,10 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) { - const struct nf_conntrack_l4proto *l4proto; struct net *net = nf_ct_net(ct); int l4num = nf_ct_protonum(ct); s32 timeout; - l4proto = nf_ct_l4proto_find(l4num); - if (!l4proto) - return; - if (l4num == IPPROTO_TCP) { struct nf_tcp_net *tn = nf_tcp_pernet(net); @@ -278,15 +273,10 @@ static const struct rhashtable_params nf_flow_offload_rhash_params = { unsigned long flow_offload_get_timeout(struct flow_offload *flow) { - const struct nf_conntrack_l4proto *l4proto; unsigned long timeout = NF_FLOW_TIMEOUT; struct net *net = nf_ct_net(flow->ct); int l4num = nf_ct_protonum(flow->ct); - l4proto = nf_ct_l4proto_find(l4num); - if (!l4proto) - return timeout; - if (l4num == IPPROTO_TCP) { struct nf_tcp_net *tn = nf_tcp_pernet(net); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index f92006cec94c..d6bf1b2cd541 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -251,8 +251,7 @@ static int flow_offload_eth_src(struct net *net, flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 8, &val, &mask); - if (dev) - dev_put(dev); + dev_put(dev); return 0; } @@ -1097,6 +1096,7 @@ static void nf_flow_table_block_offload_init(struct flow_block_offload *bo, bo->command = cmd; bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; bo->extack = extack; + bo->cb_list_head = &flowtable->flow_block.cb_list; INIT_LIST_HEAD(&bo->cb_list); } diff --git a/net/netfilter/nf_hooks_lwtunnel.c b/net/netfilter/nf_hooks_lwtunnel.c new file mode 100644 index 000000000000..00e89ffd78f6 --- /dev/null +++ b/net/netfilter/nf_hooks_lwtunnel.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/sysctl.h> +#include <net/lwtunnel.h> +#include <net/netfilter/nf_hooks_lwtunnel.h> + +static inline int nf_hooks_lwtunnel_get(void) +{ + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) + return 1; + else + return 0; +} + +static inline int nf_hooks_lwtunnel_set(int enable) +{ + if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) { + if (!enable) + return -EBUSY; + } else if (enable) { + static_branch_enable(&nf_hooks_lwtunnel_enabled); + } + + return 0; +} + +#ifdef CONFIG_SYSCTL +int nf_hooks_lwtunnel_sysctl_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int proc_nf_hooks_lwtunnel_enabled = 0; + struct ctl_table tmp = { + .procname = table->procname, + .data = &proc_nf_hooks_lwtunnel_enabled, + .maxlen = sizeof(int), + .mode = table->mode, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + int ret; + + if (!write) + proc_nf_hooks_lwtunnel_enabled = nf_hooks_lwtunnel_get(); + + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + + if (write && ret == 0) + ret = nf_hooks_lwtunnel_set(proc_nf_hooks_lwtunnel_enabled); + + return ret; +} +EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_sysctl_handler); +#endif /* CONFIG_SYSCTL */ diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 7de595ead06a..4d50d51db796 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -13,7 +13,7 @@ #include <linux/skbuff.h> #include <linux/gfp.h> #include <net/xfrm.h> -#include <linux/jhash.h> +#include <linux/siphash.h> #include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack.h> @@ -34,7 +34,7 @@ static unsigned int nat_net_id __read_mostly; static struct hlist_head *nf_nat_bysource __read_mostly; static unsigned int nf_nat_htable_size __read_mostly; -static unsigned int nf_nat_hash_rnd __read_mostly; +static siphash_key_t nf_nat_hash_rnd __read_mostly; struct nf_nat_lookup_hook_priv { struct nf_hook_entries __rcu *entries; @@ -150,15 +150,32 @@ static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) /* We keep an extra hash for each conntrack, for fast searching. */ static unsigned int -hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *net, + const struct nf_conntrack_zone *zone, + const struct nf_conntrack_tuple *tuple) { unsigned int hash; + struct { + struct nf_conntrack_man src; + u32 net_mix; + u32 protonum; + u32 zone; + } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); + memset(&combined, 0, sizeof(combined)); + /* Original src, to ensure we map it consistently if poss. */ - hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); + combined.src = tuple->src; + combined.net_mix = net_hash_mix(net); + combined.protonum = tuple->dst.protonum; + + /* Zone ID can be used provided its valid for both directions */ + if (zone->dir == NF_CT_DEFAULT_ZONE_DIR) + combined.zone = zone->id; + + hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); return reciprocal_scale(hash, nf_nat_htable_size); } @@ -262,7 +279,7 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range2 *range) { - unsigned int h = hash_by_src(net, tuple); + unsigned int h = hash_by_src(net, zone, tuple); const struct nf_conn *ct; hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { @@ -609,7 +626,7 @@ nf_nat_setup_info(struct nf_conn *ct, unsigned int srchash; spinlock_t *lock; - srchash = hash_by_src(net, + srchash = hash_by_src(net, nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); @@ -682,6 +699,16 @@ unsigned int nf_nat_packet(struct nf_conn *ct, } EXPORT_SYMBOL_GPL(nf_nat_packet); +static bool in_vrf_postrouting(const struct nf_hook_state *state) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (state->hook == NF_INET_POST_ROUTING && + netif_is_l3_master(state->out)) + return true; +#endif + return false; +} + unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -698,7 +725,7 @@ nf_nat_inet_fn(void *priv, struct sk_buff *skb, * packet filter it out, or implement conntrack/NAT for that * protocol. 8) --RR */ - if (!ct) + if (!ct || in_vrf_postrouting(state)) return NF_ACCEPT; nat = nfct_nat(ct); @@ -778,7 +805,7 @@ static void __nf_nat_cleanup_conntrack(struct nf_conn *ct) { unsigned int h; - h = hash_by_src(nf_ct_net(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c index 8e8a65d46345..acd73f717a08 100644 --- a/net/netfilter/nf_nat_masquerade.c +++ b/net/netfilter/nf_nat_masquerade.c @@ -9,8 +9,19 @@ #include <net/netfilter/nf_nat_masquerade.h> +struct masq_dev_work { + struct work_struct work; + struct net *net; + union nf_inet_addr addr; + int ifindex; + int (*iter)(struct nf_conn *i, void *data); +}; + +#define MAX_MASQ_WORKER_COUNT 16 + static DEFINE_MUTEX(masq_mutex); static unsigned int masq_refcnt __read_mostly; +static atomic_t masq_worker_count __read_mostly; unsigned int nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, @@ -63,13 +74,71 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4); -static int device_cmp(struct nf_conn *i, void *ifindex) +static void iterate_cleanup_work(struct work_struct *work) +{ + struct masq_dev_work *w; + + w = container_of(work, struct masq_dev_work, work); + + nf_ct_iterate_cleanup_net(w->net, w->iter, (void *)w, 0, 0); + + put_net(w->net); + kfree(w); + atomic_dec(&masq_worker_count); + module_put(THIS_MODULE); +} + +/* Iterate conntrack table in the background and remove conntrack entries + * that use the device/address being removed. + * + * In case too many work items have been queued already or memory allocation + * fails iteration is skipped, conntrack entries will time out eventually. + */ +static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr, + int ifindex, + int (*iter)(struct nf_conn *i, void *data), + gfp_t gfp_flags) +{ + struct masq_dev_work *w; + + if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT) + return; + + net = maybe_get_net(net); + if (!net) + return; + + if (!try_module_get(THIS_MODULE)) + goto err_module; + + w = kzalloc(sizeof(*w), gfp_flags); + if (w) { + /* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */ + atomic_inc(&masq_worker_count); + + INIT_WORK(&w->work, iterate_cleanup_work); + w->ifindex = ifindex; + w->net = net; + w->iter = iter; + if (addr) + w->addr = *addr; + schedule_work(&w->work); + return; + } + + module_put(THIS_MODULE); + err_module: + put_net(net); +} + +static int device_cmp(struct nf_conn *i, void *arg) { const struct nf_conn_nat *nat = nfct_nat(i); + const struct masq_dev_work *w = arg; if (!nat) return 0; - return nat->masq_index == (int)(long)ifindex; + return nat->masq_index == w->ifindex; } static int masq_device_event(struct notifier_block *this, @@ -85,8 +154,8 @@ static int masq_device_event(struct notifier_block *this, * and forget them. */ - nf_ct_iterate_cleanup_net(net, device_cmp, - (void *)(long)dev->ifindex, 0, 0); + nf_nat_masq_schedule(net, NULL, dev->ifindex, + device_cmp, GFP_KERNEL); } return NOTIFY_DONE; @@ -94,35 +163,45 @@ static int masq_device_event(struct notifier_block *this, static int inet_cmp(struct nf_conn *ct, void *ptr) { - struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; - struct net_device *dev = ifa->ifa_dev->dev; struct nf_conntrack_tuple *tuple; + struct masq_dev_work *w = ptr; - if (!device_cmp(ct, (void *)(long)dev->ifindex)) + if (!device_cmp(ct, ptr)) return 0; tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - return ifa->ifa_address == tuple->dst.u3.ip; + return nf_inet_addr_cmp(&w->addr, &tuple->dst.u3); } static int masq_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { - struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev; - struct net *net = dev_net(idev->dev); + const struct in_ifaddr *ifa = ptr; + const struct in_device *idev; + const struct net_device *dev; + union nf_inet_addr addr; + + if (event != NETDEV_DOWN) + return NOTIFY_DONE; /* The masq_dev_notifier will catch the case of the device going * down. So if the inetdev is dead and being destroyed we have * no work to do. Otherwise this is an individual address removal * and we have to perform the flush. */ + idev = ifa->ifa_dev; if (idev->dead) return NOTIFY_DONE; - if (event == NETDEV_DOWN) - nf_ct_iterate_cleanup_net(net, inet_cmp, ptr, 0, 0); + memset(&addr, 0, sizeof(addr)); + + addr.ip = ifa->ifa_address; + + dev = idev->dev; + nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex, + inet_cmp, GFP_KERNEL); return NOTIFY_DONE; } @@ -136,8 +215,6 @@ static struct notifier_block masq_inet_notifier = { }; #if IS_ENABLED(CONFIG_IPV6) -static atomic_t v6_worker_count __read_mostly; - static int nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, @@ -187,40 +264,6 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6); -struct masq_dev_work { - struct work_struct work; - struct net *net; - struct in6_addr addr; - int ifindex; -}; - -static int inet6_cmp(struct nf_conn *ct, void *work) -{ - struct masq_dev_work *w = (struct masq_dev_work *)work; - struct nf_conntrack_tuple *tuple; - - if (!device_cmp(ct, (void *)(long)w->ifindex)) - return 0; - - tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - - return ipv6_addr_equal(&w->addr, &tuple->dst.u3.in6); -} - -static void iterate_cleanup_work(struct work_struct *work) -{ - struct masq_dev_work *w; - - w = container_of(work, struct masq_dev_work, work); - - nf_ct_iterate_cleanup_net(w->net, inet6_cmp, (void *)w, 0, 0); - - put_net(w->net); - kfree(w); - atomic_dec(&v6_worker_count); - module_put(THIS_MODULE); -} - /* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep). * * Defer it to the system workqueue. @@ -233,36 +276,19 @@ static int masq_inet6_event(struct notifier_block *this, { struct inet6_ifaddr *ifa = ptr; const struct net_device *dev; - struct masq_dev_work *w; - struct net *net; + union nf_inet_addr addr; - if (event != NETDEV_DOWN || atomic_read(&v6_worker_count) >= 16) + if (event != NETDEV_DOWN) return NOTIFY_DONE; dev = ifa->idev->dev; - net = maybe_get_net(dev_net(dev)); - if (!net) - return NOTIFY_DONE; - if (!try_module_get(THIS_MODULE)) - goto err_module; + memset(&addr, 0, sizeof(addr)); - w = kmalloc(sizeof(*w), GFP_ATOMIC); - if (w) { - atomic_inc(&v6_worker_count); - - INIT_WORK(&w->work, iterate_cleanup_work); - w->ifindex = dev->ifindex; - w->net = net; - w->addr = ifa->addr; - schedule_work(&w->work); + addr.in6 = ifa->addr; - return NOTIFY_DONE; - } - - module_put(THIS_MODULE); - err_module: - put_net(net); + nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp, + GFP_ATOMIC); return NOTIFY_DONE; } diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index bbd1209694b8..6d12afabfe8a 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -21,6 +21,8 @@ #include "nf_internals.h" +static const struct nf_queue_handler __rcu *nf_queue_handler; + /* * Hook for nfnetlink_queue to register its queue handler. * We do this so that most of the NFQUEUE code can be modular. @@ -29,20 +31,18 @@ * receives, no matter what. */ -/* return EBUSY when somebody else is registered, return EEXIST if the - * same handler is registered, return 0 in case of success. */ -void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh) +void nf_register_queue_handler(const struct nf_queue_handler *qh) { /* should never happen, we only have one queueing backend in kernel */ - WARN_ON(rcu_access_pointer(net->nf.queue_handler)); - rcu_assign_pointer(net->nf.queue_handler, qh); + WARN_ON(rcu_access_pointer(nf_queue_handler)); + rcu_assign_pointer(nf_queue_handler, qh); } EXPORT_SYMBOL(nf_register_queue_handler); /* The caller must flush their queue before this */ -void nf_unregister_queue_handler(struct net *net) +void nf_unregister_queue_handler(void) { - RCU_INIT_POINTER(net->nf.queue_handler, NULL); + RCU_INIT_POINTER(nf_queue_handler, NULL); } EXPORT_SYMBOL(nf_unregister_queue_handler); @@ -51,18 +51,14 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) struct nf_hook_state *state = &entry->state; /* Release those devices we held, or Alexey will kill me. */ - if (state->in) - dev_put(state->in); - if (state->out) - dev_put(state->out); + dev_put(state->in); + dev_put(state->out); if (state->sk) sock_put(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (entry->physin) - dev_put(entry->physin); - if (entry->physout) - dev_put(entry->physout); + dev_put(entry->physin); + dev_put(entry->physout); #endif } @@ -95,18 +91,14 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; - if (state->in) - dev_hold(state->in); - if (state->out) - dev_hold(state->out); + dev_hold(state->in); + dev_hold(state->out); if (state->sk) sock_hold(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (entry->physin) - dev_hold(entry->physin); - if (entry->physout) - dev_hold(entry->physout); + dev_hold(entry->physin); + dev_hold(entry->physout); #endif } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); @@ -116,7 +108,7 @@ void nf_queue_nf_hook_drop(struct net *net) const struct nf_queue_handler *qh; rcu_read_lock(); - qh = rcu_dereference(net->nf.queue_handler); + qh = rcu_dereference(nf_queue_handler); if (qh) qh->nf_hook_drop(net); rcu_read_unlock(); @@ -157,12 +149,11 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, { struct nf_queue_entry *entry = NULL; const struct nf_queue_handler *qh; - struct net *net = state->net; unsigned int route_key_size; int status; /* QUEUE == DROP if no one is waiting, to be safe. */ - qh = rcu_dereference(net->nf.queue_handler); + qh = rcu_dereference(nf_queue_handler); if (!qh) return -ESRCH; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 081437dd75b7..c0851fec11d4 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -780,6 +780,7 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) { struct nftables_pernet *nft_net; struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && @@ -790,8 +791,11 @@ static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->family, ctx->table); + event, flags, ctx->family, ctx->table); if (err < 0) { kfree_skb(skb); goto err; @@ -1563,6 +1567,7 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) { struct nftables_pernet *nft_net; struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && @@ -1573,8 +1578,11 @@ static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event) if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_chain_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->family, ctx->table, + event, flags, ctx->family, ctx->table, ctx->chain); if (err < 0) { kfree_skb(skb); @@ -2866,8 +2874,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, - const struct nft_rule *rule, - const struct nft_rule *prule) + const struct nft_rule *rule, u64 handle) { struct nlmsghdr *nlh; const struct nft_expr *expr, *next; @@ -2887,9 +2894,8 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, NFTA_RULE_PAD)) goto nla_put_failure; - if (event != NFT_MSG_DELRULE && prule) { - if (nla_put_be64(skb, NFTA_RULE_POSITION, - cpu_to_be64(prule->handle), + if (event != NFT_MSG_DELRULE && handle) { + if (nla_put_be64(skb, NFTA_RULE_POSITION, cpu_to_be64(handle), NFTA_RULE_PAD)) goto nla_put_failure; } @@ -2925,7 +2931,10 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, const struct nft_rule *rule, int event) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); + const struct nft_rule *prule; struct sk_buff *skb; + u64 handle = 0; + u16 flags = 0; int err; if (!ctx->report && @@ -2936,9 +2945,20 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; + if (event == NFT_MSG_NEWRULE && + !list_is_first(&rule->list, &ctx->chain->rules) && + !list_is_last(&rule->list, &ctx->chain->rules)) { + prule = list_prev_entry(rule, list); + handle = prule->handle; + } + if (ctx->flags & (NLM_F_APPEND | NLM_F_REPLACE)) + flags |= NLM_F_APPEND; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, - event, 0, ctx->family, ctx->table, - ctx->chain, rule, NULL); + event, flags, ctx->family, ctx->table, + ctx->chain, rule, handle); if (err < 0) { kfree_skb(skb); goto err; @@ -2964,6 +2984,7 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, struct net *net = sock_net(skb->sk); const struct nft_rule *rule, *prule; unsigned int s_idx = cb->args[0]; + u64 handle; prule = NULL; list_for_each_entry_rcu(rule, &chain->rules, list) { @@ -2975,12 +2996,17 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, memset(&cb->args[1], 0, sizeof(cb->args) - sizeof(cb->args[0])); } + if (prule) + handle = prule->handle; + else + handle = 0; + if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, table->family, - table, chain, rule, prule) < 0) + table, chain, rule, handle) < 0) return 1; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -3143,7 +3169,7 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, - family, table, chain, rule, NULL); + family, table, chain, rule, 0); if (err < 0) goto err_fill_rule_info; @@ -3403,17 +3429,15 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, } if (info->nlh->nlmsg_flags & NLM_F_REPLACE) { + err = nft_delrule(&ctx, old_rule); + if (err < 0) + goto err_destroy_flow_rule; + trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); if (trans == NULL) { err = -ENOMEM; goto err_destroy_flow_rule; } - err = nft_delrule(&ctx, old_rule); - if (err < 0) { - nft_trans_destroy(trans); - goto err_destroy_flow_rule; - } - list_add_tail_rcu(&rule->list, &old_rule->list); } else { trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule); @@ -3943,8 +3967,9 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, gfp_t gfp_flags) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); - struct sk_buff *skb; u32 portid = ctx->portid; + struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && @@ -3955,7 +3980,10 @@ static void nf_tables_set_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; - err = nf_tables_fill_set(skb, ctx, set, event, 0); + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + + err = nf_tables_fill_set(skb, ctx, set, event, flags); if (err < 0) { kfree_skb(skb); goto err; @@ -4336,7 +4364,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (ops->privsize != NULL) size = ops->privsize(nla, &desc); alloc_size = sizeof(*set) + size + udlen; - if (alloc_size < size) + if (alloc_size < size || alloc_size > INT_MAX) return -ENOMEM; set = kvzalloc(alloc_size, GFP_KERNEL); if (!set) @@ -5231,12 +5259,13 @@ static int nf_tables_getsetelem(struct sk_buff *skb, static void nf_tables_setelem_notify(const struct nft_ctx *ctx, const struct nft_set *set, const struct nft_set_elem *elem, - int event, u16 flags) + int event) { struct nftables_pernet *nft_net; struct net *net = ctx->net; u32 portid = ctx->portid; struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES)) @@ -5246,6 +5275,9 @@ static void nf_tables_setelem_notify(const struct nft_ctx *ctx, if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags, set, elem); if (err < 0) { @@ -6921,7 +6953,7 @@ static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info, void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, - int family, int report, gfp_t gfp) + u16 flags, int family, int report, gfp_t gfp) { struct nftables_pernet *nft_net = nft_pernet(net); struct sk_buff *skb; @@ -6946,8 +6978,9 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, if (skb == NULL) goto err; - err = nf_tables_fill_obj_info(skb, net, portid, seq, event, 0, family, - table, obj, false); + err = nf_tables_fill_obj_info(skb, net, portid, seq, event, + flags & (NLM_F_CREATE | NLM_F_EXCL), + family, table, obj, false); if (err < 0) { kfree_skb(skb); goto err; @@ -6964,7 +6997,7 @@ static void nf_tables_obj_notify(const struct nft_ctx *ctx, struct nft_object *obj, int event) { nft_obj_notify(ctx->net, ctx->table, obj, ctx->portid, ctx->seq, event, - ctx->family, ctx->report, GFP_KERNEL); + ctx->flags, ctx->family, ctx->report, GFP_KERNEL); } /* @@ -7745,6 +7778,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, { struct nftables_pernet *nft_net = nft_pernet(ctx->net); struct sk_buff *skb; + u16 flags = 0; int err; if (!ctx->report && @@ -7755,8 +7789,11 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, if (skb == NULL) goto err; + if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL)) + flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL); + err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid, - ctx->seq, event, 0, + ctx->seq, event, flags, ctx->family, flowtable, hook_list); if (err < 0) { kfree_skb(skb); @@ -8634,7 +8671,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_setelem_activate(net, te->set, &te->elem); nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, - NFT_MSG_NEWSETELEM, 0); + NFT_MSG_NEWSETELEM); nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: @@ -8642,7 +8679,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nf_tables_setelem_notify(&trans->ctx, te->set, &te->elem, - NFT_MSG_DELSETELEM, 0); + NFT_MSG_DELSETELEM); nft_setelem_remove(net, te->set, &te->elem); if (!nft_setelem_is_catchall(te->set, &te->elem)) { atomic_dec(&te->set->nelems); @@ -9599,7 +9636,6 @@ static void __nft_release_table(struct net *net, struct nft_table *table) table->use--; nf_tables_chain_destroy(&ctx); } - list_del(&table->list); nf_tables_table_destroy(&ctx); } @@ -9612,6 +9648,8 @@ static void __nft_release_tables(struct net *net) if (nft_table_has_owner(table)) continue; + list_del(&table->list); + __nft_release_table(net, table); } } @@ -9619,31 +9657,38 @@ static void __nft_release_tables(struct net *net) static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { + struct nft_table *table, *to_delete[8]; struct nftables_pernet *nft_net; struct netlink_notify *n = ptr; - struct nft_table *table, *nt; struct net *net = n->net; - bool release = false; + unsigned int deleted; + bool restart = false; if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER) return NOTIFY_DONE; nft_net = nft_pernet(net); + deleted = 0; mutex_lock(&nft_net->commit_mutex); +again: list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && n->portid == table->nlpid) { __nft_release_hook(net, table); - release = true; + list_del_rcu(&table->list); + to_delete[deleted++] = table; + if (deleted >= ARRAY_SIZE(to_delete)) + break; } } - if (release) { + if (deleted) { + restart = deleted >= ARRAY_SIZE(to_delete); synchronize_rcu(); - list_for_each_entry_safe(table, nt, &nft_net->tables, list) { - if (nft_table_has_owner(table) && - n->portid == table->nlpid) - __nft_release_table(net, table); - } + while (deleted) + __nft_release_table(net, to_delete[--deleted]); + + if (restart) + goto again; } mutex_unlock(&nft_net->commit_mutex); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 866cfba04d6c..adc348056076 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -79,7 +79,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, if (priv->base == NFT_PAYLOAD_NETWORK_HEADER) ptr = skb_network_header(skb); else { - if (!pkt->tprot_set) + if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) return false; ptr = skb_network_header(skb) + nft_thoff(pkt); } diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index b58d73a96523..9656c1646222 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -353,6 +353,7 @@ static void nft_flow_block_offload_init(struct flow_block_offload *bo, bo->command = cmd; bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; bo->extack = extack; + bo->cb_list_head = &basechain->flow_block.cb_list; INIT_LIST_HEAD(&bo->cb_list); } diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index e4fe2f0780eb..84a7dea46efa 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -113,13 +113,13 @@ static int nf_trace_fill_pkt_info(struct sk_buff *nlskb, int off = skb_network_offset(skb); unsigned int len, nh_end; - nh_end = pkt->tprot_set ? nft_thoff(pkt) : skb->len; + nh_end = pkt->flags & NFT_PKTINFO_L4PROTO ? nft_thoff(pkt) : skb->len; len = min_t(unsigned int, nh_end - skb_network_offset(skb), NFT_TRACETYPE_NETWORK_HSIZE); if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len)) return -1; - if (pkt->tprot_set) { + if (pkt->flags & NFT_PKTINFO_L4PROTO) { len = min_t(unsigned int, skb->len - nft_thoff(pkt), NFT_TRACETYPE_TRANSPORT_HSIZE); if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb, diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index f554e2ea32ee..d5c719c9e36c 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -185,7 +185,7 @@ static const struct nf_hook_entries * nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *dev) { const struct nf_hook_entries *hook_head = NULL; -#ifdef CONFIG_NETFILTER_INGRESS +#if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS) struct net_device *netdev; #endif @@ -221,9 +221,9 @@ nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *de hook_head = rcu_dereference(net->nf.hooks_decnet[hook]); break; #endif -#ifdef CONFIG_NETFILTER_INGRESS +#if defined(CONFIG_NETFILTER_INGRESS) || defined(CONFIG_NETFILTER_EGRESS) case NFPROTO_NETDEV: - if (hook != NF_NETDEV_INGRESS) + if (hook >= NF_NETDEV_NUMHOOKS) return ERR_PTR(-EOPNOTSUPP); if (!dev) @@ -233,7 +233,15 @@ nfnl_hook_entries_head(u8 pf, unsigned int hook, struct net *net, const char *de if (!netdev) return ERR_PTR(-ENODEV); - return rcu_dereference(netdev->nf_hooks_ingress); +#ifdef CONFIG_NETFILTER_INGRESS + if (hook == NF_NETDEV_INGRESS) + return rcu_dereference(netdev->nf_hooks_ingress); +#endif +#ifdef CONFIG_NETFILTER_EGRESS + if (hook == NF_NETDEV_EGRESS) + return rcu_dereference(netdev->nf_hooks_egress); +#endif + fallthrough; #endif default: return ERR_PTR(-EPROTONOSUPPORT); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index f774de0fc24f..4c3fbaaeb103 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -951,6 +951,16 @@ static void nfqnl_nf_hook_drop(struct net *net) struct nfnl_queue_net *q = nfnl_queue_pernet(net); int i; + /* This function is also called on net namespace error unwind, + * when pernet_ops->init() failed and ->exit() functions of the + * previous pernet_ops gets called. + * + * This may result in a call to nfqnl_nf_hook_drop() before + * struct nfnl_queue_net was allocated. + */ + if (!q) + return; + for (i = 0; i < INSTANCE_BUCKETS; i++) { struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; @@ -1502,7 +1512,6 @@ static int __net_init nfnl_queue_net_init(struct net *net) &nfqnl_seq_ops, sizeof(struct iter_state))) return -ENOMEM; #endif - nf_register_queue_handler(net, &nfqh); return 0; } @@ -1511,7 +1520,6 @@ static void __net_exit nfnl_queue_net_exit(struct net *net) struct nfnl_queue_net *q = nfnl_queue_pernet(net); unsigned int i; - nf_unregister_queue_handler(net); #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); #endif @@ -1555,6 +1563,8 @@ static int __init nfnetlink_queue_init(void) goto cleanup_netlink_subsys; } + nf_register_queue_handler(&nfqh); + return status; cleanup_netlink_subsys: @@ -1568,6 +1578,7 @@ out: static void __exit nfnetlink_queue_fini(void) { + nf_unregister_queue_handler(); unregister_netdevice_notifier(&nfqnl_dev_notifier); nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 5b02408a920b..c3563f0be269 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -310,9 +310,11 @@ static const struct nft_chain_type nft_chain_filter_netdev = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_NETDEV, - .hook_mask = (1 << NF_NETDEV_INGRESS), + .hook_mask = (1 << NF_NETDEV_INGRESS) | + (1 << NF_NETDEV_EGRESS), .hooks = { [NF_NETDEV_INGRESS] = nft_do_chain_netdev, + [NF_NETDEV_EGRESS] = nft_do_chain_netdev, }, }; @@ -342,12 +344,6 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, return; } - /* UNREGISTER events are also happening on netns exit. - * - * Although nf_tables core releases all tables/chains, only this event - * handler provides guarantee that hook->ops.dev is still accessible, - * so we cannot skip exiting net namespaces. - */ __nft_release_basechain(ctx); } @@ -366,6 +362,9 @@ static int nf_tables_netdev_event(struct notifier_block *this, event != NETDEV_CHANGENAME) return NOTIFY_DONE; + if (!check_net(ctx.net)) + return NOTIFY_DONE; + nft_net = nft_pernet(ctx.net); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 639c337c885b..f69cc73c5813 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -19,6 +19,7 @@ #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_arp/arp_tables.h> #include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_log.h> /* Used for matches where *info is larger than X byte */ #define NFT_MATCH_LARGE_THRESH 192 @@ -257,8 +258,22 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, nft_compat_wait_for_destructors(); ret = xt_check_target(&par, size, proto, inv); - if (ret < 0) + if (ret < 0) { + if (ret == -ENOENT) { + const char *modname = NULL; + + if (strcmp(target->name, "LOG") == 0) + modname = "nf_log_syslog"; + else if (strcmp(target->name, "NFLOG") == 0) + modname = "nfnetlink_log"; + + if (modname && + nft_request_module(ctx->net, "%s", modname) == -EAGAIN) + return -EAGAIN; + } + return ret; + } /* The standard target cannot be used */ if (!target->target) @@ -683,14 +698,12 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, goto out_put; } - ret = netlink_unicast(info->sk, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; + ret = nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); out_put: rcu_read_lock(); module_put(THIS_MODULE); - return ret == -EAGAIN ? -ENOBUFS : ret; + + return ret; } static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = { diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 337e22d8b40b..99b1de14ff7e 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -41,6 +41,7 @@ struct nft_ct_helper_obj { #ifdef CONFIG_NF_CONNTRACK_ZONES static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template); static unsigned int nft_ct_pcpu_template_refcnt __read_mostly; +static DEFINE_MUTEX(nft_ct_pcpu_mutex); #endif static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c, @@ -525,8 +526,10 @@ static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: + mutex_lock(&nft_ct_pcpu_mutex); if (--nft_ct_pcpu_template_refcnt == 0) nft_ct_tmpl_put_pcpu(); + mutex_unlock(&nft_ct_pcpu_mutex); break; #endif default: @@ -564,9 +567,13 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: - if (!nft_ct_tmpl_alloc_pcpu()) + mutex_lock(&nft_ct_pcpu_mutex); + if (!nft_ct_tmpl_alloc_pcpu()) { + mutex_unlock(&nft_ct_pcpu_mutex); return -ENOMEM; + } nft_ct_pcpu_template_refcnt++; + mutex_unlock(&nft_ct_pcpu_mutex); len = sizeof(u16); break; #endif diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 6ba3256fa844..87f3af4645d9 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -198,17 +198,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx, return -EBUSY; priv->op = ntohl(nla_get_be32(tb[NFTA_DYNSET_OP])); - switch (priv->op) { - case NFT_DYNSET_OP_ADD: - case NFT_DYNSET_OP_DELETE: - break; - case NFT_DYNSET_OP_UPDATE: - if (!(set->flags & NFT_SET_TIMEOUT)) - return -EOPNOTSUPP; - break; - default: + if (priv->op > NFT_DYNSET_OP_DELETE) return -EOPNOTSUPP; - } timeout = 0; if (tb[NFTA_DYNSET_TIMEOUT] != NULL) { diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index a7e01e9952f1..fe91ff5f8fbe 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -244,7 +244,11 @@ static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest, case NFT_META_OIF: nft_meta_store_ifindex(dest, nft_out(pkt)); break; - case NFT_META_IIFTYPE: + case NFT_META_IFTYPE: + if (!nft_meta_store_iftype(dest, pkt->skb->dev)) + return false; + break; + case __NFT_META_IIFTYPE: if (!nft_meta_store_iftype(dest, nft_in(pkt))) return false; break; @@ -329,7 +333,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, nft_reg_store8(dest, nft_pf(pkt)); break; case NFT_META_L4PROTO: - if (!pkt->tprot_set) + if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) goto err; nft_reg_store8(dest, pkt->tprot); break; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index a44b14f6c0dc..cbfe4e4a4ad7 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -22,6 +22,7 @@ #include <linux/icmpv6.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <linux/ip.h> #include <net/sctp/checksum.h> static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off, @@ -79,6 +80,45 @@ nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0; } +static int __nft_payload_inner_offset(struct nft_pktinfo *pkt) +{ + unsigned int thoff = nft_thoff(pkt); + + if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) + return -1; + + switch (pkt->tprot) { + case IPPROTO_UDP: + pkt->inneroff = thoff + sizeof(struct udphdr); + break; + case IPPROTO_TCP: { + struct tcphdr *th, _tcph; + + th = skb_header_pointer(pkt->skb, thoff, sizeof(_tcph), &_tcph); + if (!th) + return -1; + + pkt->inneroff = thoff + __tcp_hdrlen(th); + } + break; + default: + return -1; + } + + pkt->flags |= NFT_PKTINFO_INNER; + + return 0; +} + +static int nft_payload_inner_offset(const struct nft_pktinfo *pkt) +{ + if (!(pkt->flags & NFT_PKTINFO_INNER) && + __nft_payload_inner_offset((struct nft_pktinfo *)pkt) < 0) + return -1; + + return pkt->inneroff; +} + void nft_payload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -108,10 +148,15 @@ void nft_payload_eval(const struct nft_expr *expr, offset = skb_network_offset(skb); break; case NFT_PAYLOAD_TRANSPORT_HEADER: - if (!pkt->tprot_set) + if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) goto err; offset = nft_thoff(pkt); break; + case NFT_PAYLOAD_INNER_HEADER: + offset = nft_payload_inner_offset(pkt); + if (offset < 0) + goto err; + break; default: BUG(); } @@ -610,10 +655,15 @@ static void nft_payload_set_eval(const struct nft_expr *expr, offset = skb_network_offset(skb); break; case NFT_PAYLOAD_TRANSPORT_HEADER: - if (!pkt->tprot_set) + if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) goto err; offset = nft_thoff(pkt); break; + case NFT_PAYLOAD_INNER_HEADER: + offset = nft_payload_inner_offset(pkt); + if (offset < 0) + goto err; + break; default: BUG(); } @@ -622,7 +672,8 @@ static void nft_payload_set_eval(const struct nft_expr *expr, offset += priv->offset; if ((priv->csum_type == NFT_PAYLOAD_CSUM_INET || priv->csum_flags) && - (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER || + ((priv->base != NFT_PAYLOAD_TRANSPORT_HEADER && + priv->base != NFT_PAYLOAD_INNER_HEADER) || skb->ip_summed != CHECKSUM_PARTIAL)) { fsum = skb_checksum(skb, offset, priv->len, 0); tsum = csum_partial(src, priv->len, 0); @@ -741,6 +792,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, case NFT_PAYLOAD_LL_HEADER: case NFT_PAYLOAD_NETWORK_HEADER: case NFT_PAYLOAD_TRANSPORT_HEADER: + case NFT_PAYLOAD_INNER_HEADER: break; default: return ERR_PTR(-EOPNOTSUPP); @@ -759,7 +811,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); if (len <= 4 && is_power_of_2(len) && IS_ALIGNED(offset, len) && - base != NFT_PAYLOAD_LL_HEADER) + base != NFT_PAYLOAD_LL_HEADER && base != NFT_PAYLOAD_INNER_HEADER) return &nft_payload_fast_ops; else return &nft_payload_ops; diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 0363f533a42b..c4d1389f7185 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -60,7 +60,7 @@ static void nft_quota_obj_eval(struct nft_object *obj, if (overquota && !test_and_set_bit(NFT_QUOTA_DEPLETED_BIT, &priv->flags)) nft_obj_notify(nft_net(pkt), obj->key.table, obj, 0, 0, - NFT_MSG_NEWOBJ, nft_pf(pkt), 0, GFP_ATOMIC); + NFT_MSG_NEWOBJ, 0, nft_pf(pkt), 0, GFP_ATOMIC); } static int nft_quota_do_init(const struct nlattr * const tb[], diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 84e58ee501a4..25524e393349 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -39,6 +39,20 @@ MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module"); #define XT_PCPU_BLOCK_SIZE 4096 #define XT_MAX_TABLE_SIZE (512 * 1024 * 1024) +struct xt_template { + struct list_head list; + + /* called when table is needed in the given netns */ + int (*table_init)(struct net *net); + + struct module *me; + + /* A unique name... */ + char name[XT_TABLE_MAXNAMELEN]; +}; + +static struct list_head xt_templates[NFPROTO_NUMPROTO]; + struct xt_pernet { struct list_head tables[NFPROTO_NUMPROTO]; }; @@ -1221,48 +1235,43 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) { struct xt_pernet *xt_net = net_generic(net, xt_pernet_id); - struct xt_table *t, *found = NULL; + struct module *owner = NULL; + struct xt_template *tmpl; + struct xt_table *t; mutex_lock(&xt[af].mutex); list_for_each_entry(t, &xt_net->tables[af], list) if (strcmp(t->name, name) == 0 && try_module_get(t->me)) return t; - if (net == &init_net) - goto out; - - /* Table doesn't exist in this netns, re-try init */ - xt_net = net_generic(&init_net, xt_pernet_id); - list_for_each_entry(t, &xt_net->tables[af], list) { + /* Table doesn't exist in this netns, check larval list */ + list_for_each_entry(tmpl, &xt_templates[af], list) { int err; - if (strcmp(t->name, name)) + if (strcmp(tmpl->name, name)) continue; - if (!try_module_get(t->me)) + if (!try_module_get(tmpl->me)) goto out; + + owner = tmpl->me; + mutex_unlock(&xt[af].mutex); - err = t->table_init(net); + err = tmpl->table_init(net); if (err < 0) { - module_put(t->me); + module_put(owner); return ERR_PTR(err); } - found = t; - mutex_lock(&xt[af].mutex); break; } - if (!found) - goto out; - - xt_net = net_generic(net, xt_pernet_id); /* and once again: */ list_for_each_entry(t, &xt_net->tables[af], list) if (strcmp(t->name, name) == 0) return t; - module_put(found->me); + module_put(owner); out: mutex_unlock(&xt[af].mutex); return ERR_PTR(-ENOENT); @@ -1749,6 +1758,58 @@ xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn) } EXPORT_SYMBOL_GPL(xt_hook_ops_alloc); +int xt_register_template(const struct xt_table *table, + int (*table_init)(struct net *net)) +{ + int ret = -EEXIST, af = table->af; + struct xt_template *t; + + mutex_lock(&xt[af].mutex); + + list_for_each_entry(t, &xt_templates[af], list) { + if (WARN_ON_ONCE(strcmp(table->name, t->name) == 0)) + goto out_unlock; + } + + ret = -ENOMEM; + t = kzalloc(sizeof(*t), GFP_KERNEL); + if (!t) + goto out_unlock; + + BUILD_BUG_ON(sizeof(t->name) != sizeof(table->name)); + + strscpy(t->name, table->name, sizeof(t->name)); + t->table_init = table_init; + t->me = table->me; + list_add(&t->list, &xt_templates[af]); + ret = 0; +out_unlock: + mutex_unlock(&xt[af].mutex); + return ret; +} +EXPORT_SYMBOL_GPL(xt_register_template); + +void xt_unregister_template(const struct xt_table *table) +{ + struct xt_template *t; + int af = table->af; + + mutex_lock(&xt[af].mutex); + list_for_each_entry(t, &xt_templates[af], list) { + if (strcmp(table->name, t->name)) + continue; + + list_del(&t->list); + mutex_unlock(&xt[af].mutex); + kfree(t); + return; + } + + mutex_unlock(&xt[af].mutex); + WARN_ON_ONCE(1); +} +EXPORT_SYMBOL_GPL(xt_unregister_template); + int xt_proto_init(struct net *net, u_int8_t af) { #ifdef CONFIG_PROC_FS @@ -1937,6 +1998,7 @@ static int __init xt_init(void) #endif INIT_LIST_HEAD(&xt[i].target); INIT_LIST_HEAD(&xt[i].match); + INIT_LIST_HEAD(&xt_templates[i]); } rv = register_pernet_subsys(&xt_net_ops); if (rv < 0) diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 12404d221026..0a913ce07425 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -351,21 +351,10 @@ notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static int notrack_chk(const struct xt_tgchk_param *par) -{ - if (!par->net->xt.notrack_deprecated_warning) { - pr_info("netfilter: NOTRACK target is deprecated, " - "use CT instead or upgrade iptables\n"); - par->net->xt.notrack_deprecated_warning = true; - } - return 0; -} - static struct xt_target notrack_tg_reg __read_mostly = { .name = "NOTRACK", .revision = 0, .family = NFPROTO_UNSPEC, - .checkentry = notrack_chk, .target = notrack_tg, .table = "raw", .me = THIS_MODULE, diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 7b2f359bfce4..2f7cf5ecebf4 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -137,7 +137,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) { int ret; - info->timer = kmalloc(sizeof(*info->timer), GFP_KERNEL); + info->timer = kzalloc(sizeof(*info->timer), GFP_KERNEL); if (!info->timer) { ret = -ENOMEM; goto out; diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c index 2ff75f7637b0..f39244f9c0ed 100644 --- a/net/netfilter/xt_LOG.c +++ b/net/netfilter/xt_LOG.c @@ -44,6 +44,7 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par) static int log_tg_check(const struct xt_tgchk_param *par) { const struct xt_log_info *loginfo = par->targinfo; + int ret; if (par->family != NFPROTO_IPV4 && par->family != NFPROTO_IPV6) return -EINVAL; @@ -58,7 +59,14 @@ static int log_tg_check(const struct xt_tgchk_param *par) return -EINVAL; } - return nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); + if (ret != 0 && !par->nft_compat) { + request_module("%s", "nf_log_syslog"); + + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_LOG); + } + + return ret; } static void log_tg_destroy(const struct xt_tgdtor_param *par) diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index fb5793208059..e660c3710a10 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -42,13 +42,21 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par) static int nflog_tg_check(const struct xt_tgchk_param *par) { const struct xt_nflog_info *info = par->targinfo; + int ret; if (info->flags & ~XT_NFLOG_MASK) return -EINVAL; if (info->prefix[sizeof(info->prefix) - 1] != '\0') return -EINVAL; - return nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); + if (ret != 0 && !par->nft_compat) { + request_module("%s", "nfnetlink_log"); + + ret = nf_logger_find_get(par->family, NF_LOG_TYPE_ULOG); + } + + return ret; } static void nflog_tg_destroy(const struct xt_tgdtor_param *par) diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 0d5c422f8745..8aec1b529364 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -94,11 +94,11 @@ static unsigned int xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_rateest_target_info *info = par->targinfo; - struct gnet_stats_basic_packed *stats = &info->est->bstats; + struct gnet_stats_basic_sync *stats = &info->est->bstats; spin_lock_bh(&info->est->lock); - stats->bytes += skb->len; - stats->packets++; + u64_stats_add(&stats->bytes, skb->len); + u64_stats_inc(&stats->packets); spin_unlock_bh(&info->est->lock); return XT_CONTINUE; @@ -143,6 +143,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) if (!est) goto err1; + gnet_stats_basic_sync_init(&est->bstats); strlcpy(est->name, info->name, sizeof(est->name)); spin_lock_init(&est->lock); est->refcnt = 1; diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 13cf3f9b5938..849ac552a154 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -90,7 +90,7 @@ static bool bpf_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_bpf_info *info = par->matchinfo; - return BPF_PROG_RUN(info->filter, skb); + return bpf_prog_run(info->filter, skb); } static bool bpf_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index baf235721c43..894e6b8f1a86 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -144,8 +144,8 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, return -ENOMEM; doi_def->map.std = kzalloc(sizeof(*doi_def->map.std), GFP_KERNEL); if (doi_def->map.std == NULL) { - ret_val = -ENOMEM; - goto add_std_failure; + kfree(doi_def); + return -ENOMEM; } doi_def->type = CIPSO_V4_MAP_TRANS; @@ -187,14 +187,14 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, } doi_def->map.std->lvl.local = kcalloc(doi_def->map.std->lvl.local_size, sizeof(u32), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->lvl.local == NULL) { ret_val = -ENOMEM; goto add_std_failure; } doi_def->map.std->lvl.cipso = kcalloc(doi_def->map.std->lvl.cipso_size, sizeof(u32), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->lvl.cipso == NULL) { ret_val = -ENOMEM; goto add_std_failure; @@ -263,7 +263,7 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, doi_def->map.std->cat.local = kcalloc( doi_def->map.std->cat.local_size, sizeof(u32), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->cat.local == NULL) { ret_val = -ENOMEM; goto add_std_failure; @@ -271,7 +271,7 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, doi_def->map.std->cat.cipso = kcalloc( doi_def->map.std->cat.cipso_size, sizeof(u32), - GFP_KERNEL); + GFP_KERNEL | __GFP_NOWARN); if (doi_def->map.std->cat.cipso == NULL) { ret_val = -ENOMEM; goto add_std_failure; diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 2483df0bbd7c..566ba4397ee4 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -492,8 +492,7 @@ static int netlbl_unlhsh_remove_addr4(struct net *net, netlbl_af4list_audit_addr(audit_buf, 1, (dev != NULL ? dev->name : NULL), addr->s_addr, mask->s_addr); - if (dev != NULL) - dev_put(dev); + dev_put(dev); if (entry != NULL && security_secid_to_secctx(entry->secid, &secctx, &secctx_len) == 0) { @@ -553,8 +552,7 @@ static int netlbl_unlhsh_remove_addr6(struct net *net, netlbl_af6list_audit_addr(audit_buf, 1, (dev != NULL ? dev->name : NULL), addr, mask); - if (dev != NULL) - dev_put(dev); + dev_put(dev); if (entry != NULL && security_secid_to_secctx(entry->secid, &secctx, &secctx_len) == 0) { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 380f95aacdec..4c575324a985 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -594,7 +594,10 @@ static int netlink_insert(struct sock *sk, u32 portid) /* We need to ensure that the socket is hashed and visible. */ smp_wmb(); - nlk_sk(sk)->bound = portid; + /* Paired with lockless reads from netlink_bind(), + * netlink_connect() and netlink_sendmsg(). + */ + WRITE_ONCE(nlk_sk(sk)->bound, portid); err: release_sock(sk); @@ -1012,7 +1015,8 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr, if (nlk->ngroups < BITS_PER_LONG) groups &= (1UL << nlk->ngroups) - 1; - bound = nlk->bound; + /* Paired with WRITE_ONCE() in netlink_insert() */ + bound = READ_ONCE(nlk->bound); if (bound) { /* Ensure nlk->portid is up-to-date. */ smp_rmb(); @@ -1098,8 +1102,9 @@ static int netlink_connect(struct socket *sock, struct sockaddr *addr, /* No need for barriers here as we return to user-space without * using any of the bound attributes. + * Paired with WRITE_ONCE() in netlink_insert(). */ - if (!nlk->bound) + if (!READ_ONCE(nlk->bound)) err = netlink_autobind(sock); if (err == 0) { @@ -1407,8 +1412,6 @@ struct netlink_broadcast_data { int delivered; gfp_t allocation; struct sk_buff *skb, *skb2; - int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data); - void *tx_data; }; static void do_one_broadcast(struct sock *sk, @@ -1462,11 +1465,6 @@ static void do_one_broadcast(struct sock *sk, p->delivery_failure = 1; goto out; } - if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { - kfree_skb(p->skb2); - p->skb2 = NULL; - goto out; - } if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; @@ -1489,10 +1487,8 @@ out: sock_put(sk); } -int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid, - u32 group, gfp_t allocation, - int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data), - void *filter_data) +int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, + u32 group, gfp_t allocation) { struct net *net = sock_net(ssk); struct netlink_broadcast_data info; @@ -1511,8 +1507,6 @@ int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid info.allocation = allocation; info.skb = skb; info.skb2 = NULL; - info.tx_filter = filter; - info.tx_data = filter_data; /* While we sleep in clone, do not allow to change socket list */ @@ -1538,14 +1532,6 @@ int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid } return -ESRCH; } -EXPORT_SYMBOL(netlink_broadcast_filtered); - -int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, - u32 group, gfp_t allocation) -{ - return netlink_broadcast_filtered(ssk, skb, portid, group, allocation, - NULL, NULL); -} EXPORT_SYMBOL(netlink_broadcast); struct netlink_set_err_data { @@ -1888,7 +1874,8 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) dst_group = nlk->dst_group; } - if (!nlk->bound) { + /* Paired with WRITE_ONCE() in netlink_insert() */ + if (!READ_ONCE(nlk->bound)) { err = netlink_autobind(sock); if (err) goto out; @@ -2545,13 +2532,15 @@ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, /* errors reported via destination sk->sk_err, but propagate * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ err = nlmsg_multicast(sk, skb, exclude_portid, group, flags); + if (err == -ESRCH) + err = 0; } if (report) { int err2; err2 = nlmsg_unicast(sk, skb, portid); - if (!err || err == -ESRCH) + if (!err) err = err2; } diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 2d6fdf40df66..1afca2a6c2ac 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -40,14 +40,6 @@ void genl_unlock(void) } EXPORT_SYMBOL(genl_unlock); -#ifdef CONFIG_LOCKDEP -bool lockdep_genl_is_held(void) -{ - return lockdep_is_held(&genl_mutex); -} -EXPORT_SYMBOL(lockdep_genl_is_held); -#endif - static void genl_lock_all(void) { down_write(&cb_lock); @@ -1485,6 +1477,7 @@ int genlmsg_multicast_allns(const struct genl_family *family, { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; + group = family->mcgrp_offset + group; return genlmsg_mcast(skb, portid, group, flags); } @@ -1495,14 +1488,12 @@ void genl_notify(const struct genl_family *family, struct sk_buff *skb, { struct net *net = genl_info_net(info); struct sock *sk = net->genl_sock; - int report = 0; - - if (info->nlhdr) - report = nlmsg_report(info->nlhdr); if (WARN_ON_ONCE(group >= family->n_mcgrps)) return; + group = family->mcgrp_offset + group; - nlmsg_notify(sk, skb, info->snd_portid, group, report, flags); + nlmsg_notify(sk, skb, info->snd_portid, group, + nlmsg_report(info->nlhdr), flags); } EXPORT_SYMBOL(genl_notify); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 6d16e1ab1a8a..775064cdd0ee 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -633,7 +633,7 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr, struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); struct sockaddr_ax25 *addr = (struct sockaddr_ax25 *)uaddr; - ax25_address *source = NULL; + const ax25_address *source = NULL; ax25_uid_assoc *user; struct net_device *dev; int err = 0; @@ -673,7 +673,7 @@ static int nr_connect(struct socket *sock, struct sockaddr *uaddr, err = -ENETUNREACH; goto out_release; } - source = (ax25_address *)dev->dev_addr; + source = (const ax25_address *)dev->dev_addr; user = ax25_findbyuid(current_euid()); if (user) { diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c index 29e418c8c6c3..3aaac4a22b38 100644 --- a/net/netrom/nr_dev.c +++ b/net/netrom/nr_dev.c @@ -108,10 +108,10 @@ static int __must_check nr_set_mac_address(struct net_device *dev, void *addr) if (err) return err; - ax25_listen_release((ax25_address *)dev->dev_addr, NULL); + ax25_listen_release((const ax25_address *)dev->dev_addr, NULL); } - memcpy(dev->dev_addr, sa->sa_data, dev->addr_len); + dev_addr_set(dev, sa->sa_data); return 0; } @@ -120,7 +120,7 @@ static int nr_open(struct net_device *dev) { int err; - err = ax25_listen_register((ax25_address *)dev->dev_addr, NULL); + err = ax25_listen_register((const ax25_address *)dev->dev_addr, NULL); if (err) return err; @@ -131,7 +131,7 @@ static int nr_open(struct net_device *dev) static int nr_close(struct net_device *dev) { - ax25_listen_release((ax25_address *)dev->dev_addr, NULL); + ax25_listen_release((const ax25_address *)dev->dev_addr, NULL); netif_stop_queue(dev); return 0; } diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c index a880dd33e901..511819fbfa67 100644 --- a/net/netrom/nr_loopback.c +++ b/net/netrom/nr_loopback.c @@ -59,8 +59,7 @@ static void nr_loopback_timer(struct timer_list *unused) if (dev == NULL || nr_rx_frame(skb, dev) == 0) kfree_skb(skb); - if (dev != NULL) - dev_put(dev); + dev_put(dev); if (!skb_queue_empty(&loopback_queue) && !nr_loopback_running()) mod_timer(&loopback_timer, jiffies + 10); diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index de0456073dc0..baea3cbd76ca 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -582,8 +582,7 @@ struct net_device *nr_dev_first(void) if (first == NULL || strncmp(dev->name, first->name, 3) < 0) first = dev; } - if (first) - dev_hold(first); + dev_hold(first); rcu_read_unlock(); return first; @@ -599,7 +598,7 @@ struct net_device *nr_dev_get(ax25_address *addr) rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM && - ax25cmp(addr, (ax25_address *)dev->dev_addr) == 0) { + ax25cmp(addr, (const ax25_address *)dev->dev_addr) == 0) { dev_hold(dev); goto out; } @@ -826,7 +825,7 @@ int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) ax25s = nr_neigh->ax25; nr_neigh->ax25 = ax25_send_frame(skb, 256, - (ax25_address *)dev->dev_addr, + (const ax25_address *)dev->dev_addr, &nr_neigh->callsign, nr_neigh->digipeat, nr_neigh->dev); if (ax25s) diff --git a/net/nfc/af_nfc.c b/net/nfc/af_nfc.c index 4a9e72073564..dda323e0a473 100644 --- a/net/nfc/af_nfc.c +++ b/net/nfc/af_nfc.c @@ -60,6 +60,9 @@ int nfc_proto_register(const struct nfc_protocol *nfc_proto) proto_tab[nfc_proto->id] = nfc_proto; write_unlock(&proto_tab_lock); + if (rc) + proto_unregister(nfc_proto->proto); + return rc; } EXPORT_SYMBOL(nfc_proto_register); @@ -79,7 +82,7 @@ int __init af_nfc_init(void) return sock_register(&nfc_sock_family_ops); } -void af_nfc_exit(void) +void __exit af_nfc_exit(void) { sock_unregister(PF_NFC); } diff --git a/net/nfc/core.c b/net/nfc/core.c index 573c80c6ff7a..3c645c1d99c9 100644 --- a/net/nfc/core.c +++ b/net/nfc/core.c @@ -636,7 +636,7 @@ error: return rc; } -int nfc_set_remote_general_bytes(struct nfc_dev *dev, u8 *gb, u8 gb_len) +int nfc_set_remote_general_bytes(struct nfc_dev *dev, const u8 *gb, u8 gb_len) { pr_debug("dev_name=%s gb_len=%d\n", dev_name(&dev->dev), gb_len); @@ -665,7 +665,7 @@ int nfc_tm_data_received(struct nfc_dev *dev, struct sk_buff *skb) EXPORT_SYMBOL(nfc_tm_data_received); int nfc_tm_activated(struct nfc_dev *dev, u32 protocol, u8 comm_mode, - u8 *gb, size_t gb_len) + const u8 *gb, size_t gb_len) { int rc; @@ -824,7 +824,7 @@ EXPORT_SYMBOL(nfc_targets_found); */ int nfc_target_lost(struct nfc_dev *dev, u32 target_idx) { - struct nfc_target *tg; + const struct nfc_target *tg; int i; pr_debug("dev_name %s n_target %d\n", dev_name(&dev->dev), target_idx); @@ -1048,7 +1048,7 @@ struct nfc_dev *nfc_get_device(unsigned int idx) * @tx_headroom: reserved space at beginning of skb * @tx_tailroom: reserved space at end of skb */ -struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, +struct nfc_dev *nfc_allocate_device(const struct nfc_ops *ops, u32 supported_protocols, int tx_headroom, int tx_tailroom) { diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c index 5044c7db577e..d63d2e5dc60c 100644 --- a/net/nfc/digital_core.c +++ b/net/nfc/digital_core.c @@ -277,6 +277,7 @@ int digital_tg_configure_hw(struct nfc_digital_dev *ddev, int type, int param) static int digital_tg_listen_mdaa(struct nfc_digital_dev *ddev, u8 rf_tech) { struct digital_tg_mdaa_params *params; + int rc; params = kzalloc(sizeof(*params), GFP_KERNEL); if (!params) @@ -291,8 +292,12 @@ static int digital_tg_listen_mdaa(struct nfc_digital_dev *ddev, u8 rf_tech) get_random_bytes(params->nfcid2 + 2, NFC_NFCID2_MAXSIZE - 2); params->sc = DIGITAL_SENSF_FELICA_SC; - return digital_send_cmd(ddev, DIGITAL_CMD_TG_LISTEN_MDAA, NULL, params, - 500, digital_tg_recv_atr_req, NULL); + rc = digital_send_cmd(ddev, DIGITAL_CMD_TG_LISTEN_MDAA, NULL, params, + 500, digital_tg_recv_atr_req, NULL); + if (rc) + kfree(params); + + return rc; } static int digital_tg_listen_md(struct nfc_digital_dev *ddev, u8 rf_tech) @@ -732,7 +737,7 @@ exit: return rc; } -static struct nfc_ops digital_nfc_ops = { +static const struct nfc_ops digital_nfc_ops = { .dev_up = digital_dev_up, .dev_down = digital_dev_down, .start_poll = digital_start_poll, @@ -745,7 +750,7 @@ static struct nfc_ops digital_nfc_ops = { .im_transceive = digital_in_send, }; -struct nfc_digital_dev *nfc_digital_allocate_device(struct nfc_digital_ops *ops, +struct nfc_digital_dev *nfc_digital_allocate_device(const struct nfc_digital_ops *ops, __u32 supported_protocols, __u32 driver_capabilities, int tx_headroom, int tx_tailroom) diff --git a/net/nfc/digital_technology.c b/net/nfc/digital_technology.c index 84d2345c75a3..3adf4589852a 100644 --- a/net/nfc/digital_technology.c +++ b/net/nfc/digital_technology.c @@ -465,8 +465,12 @@ static int digital_in_send_sdd_req(struct nfc_digital_dev *ddev, skb_put_u8(skb, sel_cmd); skb_put_u8(skb, DIGITAL_SDD_REQ_SEL_PAR); - return digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sdd_res, - target); + rc = digital_in_send_cmd(ddev, skb, 30, digital_in_recv_sdd_res, + target); + if (rc) + kfree_skb(skb); + + return rc; } static void digital_in_recv_sens_res(struct nfc_digital_dev *ddev, void *arg, diff --git a/net/nfc/hci/command.c b/net/nfc/hci/command.c index 3a89bd9b89fc..af6bacb3ba98 100644 --- a/net/nfc/hci/command.c +++ b/net/nfc/hci/command.c @@ -114,8 +114,6 @@ int nfc_hci_send_cmd(struct nfc_hci_dev *hdev, u8 gate, u8 cmd, { u8 pipe; - pr_debug("\n"); - pipe = hdev->gate2pipe[gate]; if (pipe == NFC_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; @@ -130,8 +128,6 @@ int nfc_hci_send_cmd_async(struct nfc_hci_dev *hdev, u8 gate, u8 cmd, { u8 pipe; - pr_debug("\n"); - pipe = hdev->gate2pipe[gate]; if (pipe == NFC_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; @@ -205,8 +201,6 @@ static int nfc_hci_open_pipe(struct nfc_hci_dev *hdev, u8 pipe) static int nfc_hci_close_pipe(struct nfc_hci_dev *hdev, u8 pipe) { - pr_debug("\n"); - return nfc_hci_execute_cmd(hdev, pipe, NFC_HCI_ANY_CLOSE_PIPE, NULL, 0, NULL); } @@ -242,8 +236,6 @@ static u8 nfc_hci_create_pipe(struct nfc_hci_dev *hdev, u8 dest_host, static int nfc_hci_delete_pipe(struct nfc_hci_dev *hdev, u8 pipe) { - pr_debug("\n"); - return nfc_hci_execute_cmd(hdev, NFC_HCI_ADMIN_PIPE, NFC_HCI_ADM_DELETE_PIPE, &pipe, 1, NULL); } @@ -256,8 +248,6 @@ static int nfc_hci_clear_all_pipes(struct nfc_hci_dev *hdev) /* TODO: Find out what the identity reference data is * and fill param with it. HCI spec 6.1.3.5 */ - pr_debug("\n"); - if (test_bit(NFC_HCI_QUIRK_SHORT_CLEAR, &hdev->quirks)) param_len = 0; @@ -271,8 +261,6 @@ int nfc_hci_disconnect_gate(struct nfc_hci_dev *hdev, u8 gate) int r; u8 pipe = hdev->gate2pipe[gate]; - pr_debug("\n"); - if (pipe == NFC_HCI_INVALID_PIPE) return -EADDRNOTAVAIL; @@ -296,8 +284,6 @@ int nfc_hci_disconnect_all_gates(struct nfc_hci_dev *hdev) { int r; - pr_debug("\n"); - r = nfc_hci_clear_all_pipes(hdev); if (r < 0) return r; @@ -314,8 +300,6 @@ int nfc_hci_connect_gate(struct nfc_hci_dev *hdev, u8 dest_host, u8 dest_gate, bool pipe_created = false; int r; - pr_debug("\n"); - if (pipe == NFC_HCI_DO_NOT_CREATE_PIPE) return 0; diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 3481941be70b..ceb87db57cdb 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -128,7 +128,7 @@ static void nfc_hci_msg_rx_work(struct work_struct *work) struct nfc_hci_dev *hdev = container_of(work, struct nfc_hci_dev, msg_rx_work); struct sk_buff *skb; - struct hcp_message *message; + const struct hcp_message *message; u8 pipe; u8 type; u8 instruction; @@ -182,9 +182,9 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, struct sk_buff *skb) { u8 status = NFC_HCI_ANY_OK; - struct hci_create_pipe_resp *create_info; - struct hci_delete_pipe_noti *delete_info; - struct hci_all_pipe_cleared_noti *cleared_info; + const struct hci_create_pipe_resp *create_info; + const struct hci_delete_pipe_noti *delete_info; + const struct hci_all_pipe_cleared_noti *cleared_info; u8 gate; pr_debug("from pipe %x cmd %x\n", pipe, cmd); @@ -447,7 +447,7 @@ static void nfc_hci_cmd_timeout(struct timer_list *t) } static int hci_dev_connect_gates(struct nfc_hci_dev *hdev, u8 gate_count, - struct nfc_hci_gate *gates) + const struct nfc_hci_gate *gates) { int r; while (gate_count--) { @@ -928,7 +928,7 @@ static int hci_fw_download(struct nfc_dev *nfc_dev, const char *firmware_name) return hdev->ops->fw_download(hdev, firmware_name); } -static struct nfc_ops hci_nfc_ops = { +static const struct nfc_ops hci_nfc_ops = { .dev_up = hci_dev_up, .dev_down = hci_dev_down, .start_poll = hci_start_poll, @@ -947,7 +947,7 @@ static struct nfc_ops hci_nfc_ops = { .se_io = hci_se_io, }; -struct nfc_hci_dev *nfc_hci_allocate_device(struct nfc_hci_ops *ops, +struct nfc_hci_dev *nfc_hci_allocate_device(const struct nfc_hci_ops *ops, struct nfc_hci_init_data *init_data, unsigned long quirks, u32 protocols, diff --git a/net/nfc/hci/llc.c b/net/nfc/hci/llc.c index 6ab40ea17662..2140f6724644 100644 --- a/net/nfc/hci/llc.c +++ b/net/nfc/hci/llc.c @@ -11,7 +11,7 @@ static LIST_HEAD(llc_engines); -int nfc_llc_init(void) +int __init nfc_llc_init(void) { int r; @@ -41,7 +41,7 @@ void nfc_llc_exit(void) } } -int nfc_llc_register(const char *name, struct nfc_llc_ops *ops) +int nfc_llc_register(const char *name, const struct nfc_llc_ops *ops) { struct nfc_llc_engine *llc_engine; diff --git a/net/nfc/hci/llc.h b/net/nfc/hci/llc.h index 823ddb621e5d..d66271d211a5 100644 --- a/net/nfc/hci/llc.h +++ b/net/nfc/hci/llc.h @@ -26,20 +26,20 @@ struct nfc_llc_ops { struct nfc_llc_engine { const char *name; - struct nfc_llc_ops *ops; + const struct nfc_llc_ops *ops; struct list_head entry; }; struct nfc_llc { void *data; - struct nfc_llc_ops *ops; + const struct nfc_llc_ops *ops; int rx_headroom; int rx_tailroom; }; void *nfc_llc_get_data(struct nfc_llc *llc); -int nfc_llc_register(const char *name, struct nfc_llc_ops *ops); +int nfc_llc_register(const char *name, const struct nfc_llc_ops *ops); void nfc_llc_unregister(const char *name); int nfc_llc_nop_register(void); diff --git a/net/nfc/hci/llc_nop.c b/net/nfc/hci/llc_nop.c index a42852f36f2e..a58716f16954 100644 --- a/net/nfc/hci/llc_nop.c +++ b/net/nfc/hci/llc_nop.c @@ -71,7 +71,7 @@ static int llc_nop_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb) return llc_nop->xmit_to_drv(llc_nop->hdev, skb); } -static struct nfc_llc_ops llc_nop_ops = { +static const struct nfc_llc_ops llc_nop_ops = { .init = llc_nop_init, .deinit = llc_nop_deinit, .start = llc_nop_start, diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index 1e3a90049da9..e90f70385813 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -123,7 +123,7 @@ static bool llc_shdlc_x_lteq_y_lt_z(int x, int y, int z) return ((y >= x) || (y < z)) ? true : false; } -static struct sk_buff *llc_shdlc_alloc_skb(struct llc_shdlc *shdlc, +static struct sk_buff *llc_shdlc_alloc_skb(const struct llc_shdlc *shdlc, int payload_len) { struct sk_buff *skb; @@ -137,7 +137,7 @@ static struct sk_buff *llc_shdlc_alloc_skb(struct llc_shdlc *shdlc, } /* immediately sends an S frame. */ -static int llc_shdlc_send_s_frame(struct llc_shdlc *shdlc, +static int llc_shdlc_send_s_frame(const struct llc_shdlc *shdlc, enum sframe_type sframe_type, int nr) { int r; @@ -159,7 +159,7 @@ static int llc_shdlc_send_s_frame(struct llc_shdlc *shdlc, } /* immediately sends an U frame. skb may contain optional payload */ -static int llc_shdlc_send_u_frame(struct llc_shdlc *shdlc, +static int llc_shdlc_send_u_frame(const struct llc_shdlc *shdlc, struct sk_buff *skb, enum uframe_modifier uframe_modifier) { @@ -201,8 +201,7 @@ static void llc_shdlc_reset_t2(struct llc_shdlc *shdlc, int y_nr) del_timer_sync(&shdlc->t2_timer); shdlc->t2_active = false; - pr_debug - ("All sent frames acked. Stopped T2(retransmit)\n"); + pr_debug("All sent frames acked. Stopped T2(retransmit)\n"); } } else { skb = skb_peek(&shdlc->ack_pending_q); @@ -211,8 +210,7 @@ static void llc_shdlc_reset_t2(struct llc_shdlc *shdlc, int y_nr) msecs_to_jiffies(SHDLC_T2_VALUE_MS)); shdlc->t2_active = true; - pr_debug - ("Start T2(retransmit) for remaining unacked sent frames\n"); + pr_debug("Start T2(retransmit) for remaining unacked sent frames\n"); } } @@ -361,12 +359,10 @@ static void llc_shdlc_connect_complete(struct llc_shdlc *shdlc, int r) wake_up(shdlc->connect_wq); } -static int llc_shdlc_connect_initiate(struct llc_shdlc *shdlc) +static int llc_shdlc_connect_initiate(const struct llc_shdlc *shdlc) { struct sk_buff *skb; - pr_debug("\n"); - skb = llc_shdlc_alloc_skb(shdlc, 2); if (skb == NULL) return -ENOMEM; @@ -377,12 +373,10 @@ static int llc_shdlc_connect_initiate(struct llc_shdlc *shdlc) return llc_shdlc_send_u_frame(shdlc, skb, U_FRAME_RSET); } -static int llc_shdlc_connect_send_ua(struct llc_shdlc *shdlc) +static int llc_shdlc_connect_send_ua(const struct llc_shdlc *shdlc) { struct sk_buff *skb; - pr_debug("\n"); - skb = llc_shdlc_alloc_skb(shdlc, 0); if (skb == NULL) return -ENOMEM; @@ -522,12 +516,11 @@ static void llc_shdlc_handle_send_queue(struct llc_shdlc *shdlc) unsigned long time_sent; if (shdlc->send_q.qlen) - pr_debug - ("sendQlen=%d ns=%d dnr=%d rnr=%s w_room=%d unackQlen=%d\n", - shdlc->send_q.qlen, shdlc->ns, shdlc->dnr, - shdlc->rnr == false ? "false" : "true", - shdlc->w - llc_shdlc_w_used(shdlc->ns, shdlc->dnr), - shdlc->ack_pending_q.qlen); + pr_debug("sendQlen=%d ns=%d dnr=%d rnr=%s w_room=%d unackQlen=%d\n", + shdlc->send_q.qlen, shdlc->ns, shdlc->dnr, + shdlc->rnr == false ? "false" : "true", + shdlc->w - llc_shdlc_w_used(shdlc->ns, shdlc->dnr), + shdlc->ack_pending_q.qlen); while (shdlc->send_q.qlen && shdlc->ack_pending_q.qlen < shdlc->w && (shdlc->rnr == false)) { @@ -573,8 +566,6 @@ static void llc_shdlc_connect_timeout(struct timer_list *t) { struct llc_shdlc *shdlc = from_timer(shdlc, t, connect_timer); - pr_debug("\n"); - schedule_work(&shdlc->sm_work); } @@ -601,8 +592,6 @@ static void llc_shdlc_sm_work(struct work_struct *work) struct llc_shdlc *shdlc = container_of(work, struct llc_shdlc, sm_work); int r; - pr_debug("\n"); - mutex_lock(&shdlc->state_mutex); switch (shdlc->state) { @@ -649,8 +638,7 @@ static void llc_shdlc_sm_work(struct work_struct *work) llc_shdlc_handle_send_queue(shdlc); if (shdlc->t1_active && timer_pending(&shdlc->t1_timer) == 0) { - pr_debug - ("Handle T1(send ack) elapsed (T1 now inactive)\n"); + pr_debug("Handle T1(send ack) elapsed (T1 now inactive)\n"); shdlc->t1_active = false; r = llc_shdlc_send_s_frame(shdlc, S_FRAME_RR, @@ -660,8 +648,7 @@ static void llc_shdlc_sm_work(struct work_struct *work) } if (shdlc->t2_active && timer_pending(&shdlc->t2_timer) == 0) { - pr_debug - ("Handle T2(retransmit) elapsed (T2 inactive)\n"); + pr_debug("Handle T2(retransmit) elapsed (T2 inactive)\n"); shdlc->t2_active = false; @@ -686,8 +673,6 @@ static int llc_shdlc_connect(struct llc_shdlc *shdlc) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(connect_wq); - pr_debug("\n"); - mutex_lock(&shdlc->state_mutex); shdlc->state = SHDLC_CONNECTING; @@ -706,8 +691,6 @@ static int llc_shdlc_connect(struct llc_shdlc *shdlc) static void llc_shdlc_disconnect(struct llc_shdlc *shdlc) { - pr_debug("\n"); - mutex_lock(&shdlc->state_mutex); shdlc->state = SHDLC_DISCONNECTED; @@ -820,7 +803,7 @@ static int llc_shdlc_xmit_from_hci(struct nfc_llc *llc, struct sk_buff *skb) return 0; } -static struct nfc_llc_ops llc_shdlc_ops = { +static const struct nfc_llc_ops llc_shdlc_ops = { .init = llc_shdlc_init, .deinit = llc_shdlc_deinit, .start = llc_shdlc_start, diff --git a/net/nfc/llcp.h b/net/nfc/llcp.h index 97853c9cefc7..d49d4bf2e37c 100644 --- a/net/nfc/llcp.h +++ b/net/nfc/llcp.h @@ -221,15 +221,15 @@ struct sock *nfc_llcp_accept_dequeue(struct sock *sk, struct socket *newsock); /* TLV API */ int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local, - u8 *tlv_array, u16 tlv_array_len); + const u8 *tlv_array, u16 tlv_array_len); int nfc_llcp_parse_connection_tlv(struct nfc_llcp_sock *sock, - u8 *tlv_array, u16 tlv_array_len); + const u8 *tlv_array, u16 tlv_array_len); /* Commands API */ void nfc_llcp_recv(void *data, struct sk_buff *skb, int err); -u8 *nfc_llcp_build_tlv(u8 type, u8 *value, u8 value_length, u8 *tlv_length); +u8 *nfc_llcp_build_tlv(u8 type, const u8 *value, u8 value_length, u8 *tlv_length); struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdres_tlv(u8 tid, u8 sap); -struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, char *uri, +struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, const char *uri, size_t uri_len); void nfc_llcp_free_sdp_tlv(struct nfc_llcp_sdp_tlv *sdp); void nfc_llcp_free_sdp_tlv_list(struct hlist_head *sdp_head); diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c index 475061c79c44..41e3a20c8935 100644 --- a/net/nfc/llcp_commands.c +++ b/net/nfc/llcp_commands.c @@ -15,7 +15,7 @@ #include "nfc.h" #include "llcp.h" -static u8 llcp_tlv_length[LLCP_TLV_MAX] = { +static const u8 llcp_tlv_length[LLCP_TLV_MAX] = { 0, 1, /* VERSION */ 2, /* MIUX */ @@ -29,7 +29,7 @@ static u8 llcp_tlv_length[LLCP_TLV_MAX] = { }; -static u8 llcp_tlv8(u8 *tlv, u8 type) +static u8 llcp_tlv8(const u8 *tlv, u8 type) { if (tlv[0] != type || tlv[1] != llcp_tlv_length[tlv[0]]) return 0; @@ -37,7 +37,7 @@ static u8 llcp_tlv8(u8 *tlv, u8 type) return tlv[2]; } -static u16 llcp_tlv16(u8 *tlv, u8 type) +static u16 llcp_tlv16(const u8 *tlv, u8 type) { if (tlv[0] != type || tlv[1] != llcp_tlv_length[tlv[0]]) return 0; @@ -46,37 +46,37 @@ static u16 llcp_tlv16(u8 *tlv, u8 type) } -static u8 llcp_tlv_version(u8 *tlv) +static u8 llcp_tlv_version(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_VERSION); } -static u16 llcp_tlv_miux(u8 *tlv) +static u16 llcp_tlv_miux(const u8 *tlv) { return llcp_tlv16(tlv, LLCP_TLV_MIUX) & 0x7ff; } -static u16 llcp_tlv_wks(u8 *tlv) +static u16 llcp_tlv_wks(const u8 *tlv) { return llcp_tlv16(tlv, LLCP_TLV_WKS); } -static u16 llcp_tlv_lto(u8 *tlv) +static u16 llcp_tlv_lto(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_LTO); } -static u8 llcp_tlv_opt(u8 *tlv) +static u8 llcp_tlv_opt(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_OPT); } -static u8 llcp_tlv_rw(u8 *tlv) +static u8 llcp_tlv_rw(const u8 *tlv) { return llcp_tlv8(tlv, LLCP_TLV_RW) & 0xf; } -u8 *nfc_llcp_build_tlv(u8 type, u8 *value, u8 value_length, u8 *tlv_length) +u8 *nfc_llcp_build_tlv(u8 type, const u8 *value, u8 value_length, u8 *tlv_length) { u8 *tlv, length; @@ -130,7 +130,7 @@ struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdres_tlv(u8 tid, u8 sap) return sdres; } -struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, char *uri, +struct nfc_llcp_sdp_tlv *nfc_llcp_build_sdreq_tlv(u8 tid, const char *uri, size_t uri_len) { struct nfc_llcp_sdp_tlv *sdreq; @@ -190,9 +190,10 @@ void nfc_llcp_free_sdp_tlv_list(struct hlist_head *head) } int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local, - u8 *tlv_array, u16 tlv_array_len) + const u8 *tlv_array, u16 tlv_array_len) { - u8 *tlv = tlv_array, type, length, offset = 0; + const u8 *tlv = tlv_array; + u8 type, length, offset = 0; pr_debug("TLV array length %d\n", tlv_array_len); @@ -239,9 +240,10 @@ int nfc_llcp_parse_gb_tlv(struct nfc_llcp_local *local, } int nfc_llcp_parse_connection_tlv(struct nfc_llcp_sock *sock, - u8 *tlv_array, u16 tlv_array_len) + const u8 *tlv_array, u16 tlv_array_len) { - u8 *tlv = tlv_array, type, length, offset = 0; + const u8 *tlv = tlv_array; + u8 type, length, offset = 0; pr_debug("TLV array length %d\n", tlv_array_len); @@ -295,7 +297,7 @@ static struct sk_buff *llcp_add_header(struct sk_buff *pdu, return pdu; } -static struct sk_buff *llcp_add_tlv(struct sk_buff *pdu, u8 *tlv, +static struct sk_buff *llcp_add_tlv(struct sk_buff *pdu, const u8 *tlv, u8 tlv_length) { /* XXX Add an skb length check */ @@ -335,8 +337,6 @@ int nfc_llcp_send_disconnect(struct nfc_llcp_sock *sock) struct nfc_dev *dev; struct nfc_llcp_local *local; - pr_debug("Sending DISC\n"); - local = sock->local; if (local == NULL) return -ENODEV; @@ -360,8 +360,6 @@ int nfc_llcp_send_symm(struct nfc_dev *dev) struct nfc_llcp_local *local; u16 size = 0; - pr_debug("Sending SYMM\n"); - local = nfc_llcp_find_local(dev); if (local == NULL) return -ENODEV; @@ -389,15 +387,14 @@ int nfc_llcp_send_connect(struct nfc_llcp_sock *sock) { struct nfc_llcp_local *local; struct sk_buff *skb; - u8 *service_name_tlv = NULL, service_name_tlv_length; - u8 *miux_tlv = NULL, miux_tlv_length; - u8 *rw_tlv = NULL, rw_tlv_length, rw; + const u8 *service_name_tlv = NULL; + const u8 *miux_tlv = NULL; + const u8 *rw_tlv = NULL; + u8 service_name_tlv_length, miux_tlv_length, rw_tlv_length, rw; int err; u16 size = 0; __be16 miux; - pr_debug("Sending CONNECT\n"); - local = sock->local; if (local == NULL) return -ENODEV; @@ -465,14 +462,13 @@ int nfc_llcp_send_cc(struct nfc_llcp_sock *sock) { struct nfc_llcp_local *local; struct sk_buff *skb; - u8 *miux_tlv = NULL, miux_tlv_length; - u8 *rw_tlv = NULL, rw_tlv_length, rw; + const u8 *miux_tlv = NULL; + const u8 *rw_tlv = NULL; + u8 miux_tlv_length, rw_tlv_length, rw; int err; u16 size = 0; __be16 miux; - pr_debug("Sending CC\n"); - local = sock->local; if (local == NULL) return -ENODEV; diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c index cc997518f79d..5ad5157aa9c5 100644 --- a/net/nfc/llcp_core.c +++ b/net/nfc/llcp_core.c @@ -45,8 +45,6 @@ static void nfc_llcp_socket_purge(struct nfc_llcp_sock *sock) struct nfc_llcp_local *local = sock->local; struct sk_buff *s, *tmp; - pr_debug("%p\n", &sock->sk); - skb_queue_purge(&sock->tx_queue); skb_queue_purge(&sock->tx_pending_queue); @@ -301,7 +299,7 @@ static char *wks[] = { "urn:nfc:sn:snep", }; -static int nfc_llcp_wks_sap(char *service_name, size_t service_name_len) +static int nfc_llcp_wks_sap(const char *service_name, size_t service_name_len) { int sap, num_wks; @@ -325,7 +323,7 @@ static int nfc_llcp_wks_sap(char *service_name, size_t service_name_len) static struct nfc_llcp_sock *nfc_llcp_sock_from_sn(struct nfc_llcp_local *local, - u8 *sn, size_t sn_len) + const u8 *sn, size_t sn_len) { struct sock *sk; struct nfc_llcp_sock *llcp_sock, *tmp_sock; @@ -522,7 +520,7 @@ static int nfc_llcp_build_gb(struct nfc_llcp_local *local) { u8 *gb_cur, version, version_length; u8 lto_length, wks_length, miux_length; - u8 *version_tlv = NULL, *lto_tlv = NULL, + const u8 *version_tlv = NULL, *lto_tlv = NULL, *wks_tlv = NULL, *miux_tlv = NULL; __be16 wks = cpu_to_be16(local->local_wks); u8 gb_len = 0; @@ -612,7 +610,7 @@ u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *general_bytes_len) return local->gb; } -int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len) +int nfc_llcp_set_remote_gb(struct nfc_dev *dev, const u8 *gb, u8 gb_len) { struct nfc_llcp_local *local; @@ -639,27 +637,27 @@ int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len) local->remote_gb_len - 3); } -static u8 nfc_llcp_dsap(struct sk_buff *pdu) +static u8 nfc_llcp_dsap(const struct sk_buff *pdu) { return (pdu->data[0] & 0xfc) >> 2; } -static u8 nfc_llcp_ptype(struct sk_buff *pdu) +static u8 nfc_llcp_ptype(const struct sk_buff *pdu) { return ((pdu->data[0] & 0x03) << 2) | ((pdu->data[1] & 0xc0) >> 6); } -static u8 nfc_llcp_ssap(struct sk_buff *pdu) +static u8 nfc_llcp_ssap(const struct sk_buff *pdu) { return pdu->data[1] & 0x3f; } -static u8 nfc_llcp_ns(struct sk_buff *pdu) +static u8 nfc_llcp_ns(const struct sk_buff *pdu) { return pdu->data[2] >> 4; } -static u8 nfc_llcp_nr(struct sk_buff *pdu) +static u8 nfc_llcp_nr(const struct sk_buff *pdu) { return pdu->data[2] & 0xf; } @@ -801,7 +799,7 @@ out: } static struct nfc_llcp_sock *nfc_llcp_sock_get_sn(struct nfc_llcp_local *local, - u8 *sn, size_t sn_len) + const u8 *sn, size_t sn_len) { struct nfc_llcp_sock *llcp_sock; @@ -815,9 +813,10 @@ static struct nfc_llcp_sock *nfc_llcp_sock_get_sn(struct nfc_llcp_local *local, return llcp_sock; } -static u8 *nfc_llcp_connect_sn(struct sk_buff *skb, size_t *sn_len) +static const u8 *nfc_llcp_connect_sn(const struct sk_buff *skb, size_t *sn_len) { - u8 *tlv = &skb->data[2], type, length; + u8 type, length; + const u8 *tlv = &skb->data[2]; size_t tlv_array_len = skb->len - LLCP_HEADER_SIZE, offset = 0; while (offset < tlv_array_len) { @@ -875,7 +874,7 @@ static void nfc_llcp_recv_ui(struct nfc_llcp_local *local, } static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, - struct sk_buff *skb) + const struct sk_buff *skb) { struct sock *new_sk, *parent; struct nfc_llcp_sock *sock, *new_sock; @@ -893,7 +892,7 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, goto fail; } } else { - u8 *sn; + const u8 *sn; size_t sn_len; sn = nfc_llcp_connect_sn(skb, &sn_len); @@ -1112,7 +1111,7 @@ static void nfc_llcp_recv_hdlc(struct nfc_llcp_local *local, } static void nfc_llcp_recv_disc(struct nfc_llcp_local *local, - struct sk_buff *skb) + const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; @@ -1155,7 +1154,8 @@ static void nfc_llcp_recv_disc(struct nfc_llcp_local *local, nfc_llcp_sock_put(llcp_sock); } -static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, struct sk_buff *skb) +static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, + const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; @@ -1188,7 +1188,8 @@ static void nfc_llcp_recv_cc(struct nfc_llcp_local *local, struct sk_buff *skb) nfc_llcp_sock_put(llcp_sock); } -static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, struct sk_buff *skb) +static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, + const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; struct sock *sk; @@ -1226,12 +1227,13 @@ static void nfc_llcp_recv_dm(struct nfc_llcp_local *local, struct sk_buff *skb) } static void nfc_llcp_recv_snl(struct nfc_llcp_local *local, - struct sk_buff *skb) + const struct sk_buff *skb) { struct nfc_llcp_sock *llcp_sock; - u8 dsap, ssap, *tlv, type, length, tid, sap; + u8 dsap, ssap, type, length, tid, sap; + const u8 *tlv; u16 tlv_len, offset; - char *service_name; + const char *service_name; size_t service_name_len; struct nfc_llcp_sdp_tlv *sdp; HLIST_HEAD(llc_sdres_list); @@ -1501,9 +1503,8 @@ void nfc_llcp_recv(void *data, struct sk_buff *skb, int err) { struct nfc_llcp_local *local = (struct nfc_llcp_local *) data; - pr_debug("Received an LLCP PDU\n"); if (err < 0) { - pr_err("err %d\n", err); + pr_err("LLCP PDU receive err %d\n", err); return; } diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index da7fe9db1b00..6fd873aa86be 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -53,9 +53,9 @@ struct nci_conn_info *nci_get_conn_info_by_conn_id(struct nci_dev *ndev, } int nci_get_conn_info_by_dest_type_params(struct nci_dev *ndev, u8 dest_type, - struct dest_spec_params *params) + const struct dest_spec_params *params) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; list_for_each_entry(conn_info, &ndev->conn_info_list, list) { if (conn_info->dest_type == dest_type) { @@ -95,8 +95,8 @@ static void nci_req_cancel(struct nci_dev *ndev, int err) /* Execute request and wait for completion. */ static int __nci_request(struct nci_dev *ndev, - void (*req)(struct nci_dev *ndev, unsigned long opt), - unsigned long opt, __u32 timeout) + void (*req)(struct nci_dev *ndev, const void *opt), + const void *opt, __u32 timeout) { int rc = 0; long completion_rc; @@ -139,8 +139,8 @@ static int __nci_request(struct nci_dev *ndev, inline int nci_request(struct nci_dev *ndev, void (*req)(struct nci_dev *ndev, - unsigned long opt), - unsigned long opt, __u32 timeout) + const void *opt), + const void *opt, __u32 timeout) { int rc; @@ -155,7 +155,7 @@ inline int nci_request(struct nci_dev *ndev, return rc; } -static void nci_reset_req(struct nci_dev *ndev, unsigned long opt) +static void nci_reset_req(struct nci_dev *ndev, const void *opt) { struct nci_core_reset_cmd cmd; @@ -163,17 +163,17 @@ static void nci_reset_req(struct nci_dev *ndev, unsigned long opt) nci_send_cmd(ndev, NCI_OP_CORE_RESET_CMD, 1, &cmd); } -static void nci_init_req(struct nci_dev *ndev, unsigned long opt) +static void nci_init_req(struct nci_dev *ndev, const void *opt) { u8 plen = 0; if (opt) plen = sizeof(struct nci_core_init_v2_cmd); - nci_send_cmd(ndev, NCI_OP_CORE_INIT_CMD, plen, (void *)opt); + nci_send_cmd(ndev, NCI_OP_CORE_INIT_CMD, plen, opt); } -static void nci_init_complete_req(struct nci_dev *ndev, unsigned long opt) +static void nci_init_complete_req(struct nci_dev *ndev, const void *opt) { struct nci_rf_disc_map_cmd cmd; struct disc_map_config *cfg = cmd.mapping_configs; @@ -210,14 +210,14 @@ static void nci_init_complete_req(struct nci_dev *ndev, unsigned long opt) } struct nci_set_config_param { - __u8 id; - size_t len; - __u8 *val; + __u8 id; + size_t len; + const __u8 *val; }; -static void nci_set_config_req(struct nci_dev *ndev, unsigned long opt) +static void nci_set_config_req(struct nci_dev *ndev, const void *opt) { - struct nci_set_config_param *param = (struct nci_set_config_param *)opt; + const struct nci_set_config_param *param = opt; struct nci_core_set_config_cmd cmd; BUG_ON(param->len > NCI_MAX_PARAM_LEN); @@ -235,10 +235,9 @@ struct nci_rf_discover_param { __u32 tm_protocols; }; -static void nci_rf_discover_req(struct nci_dev *ndev, unsigned long opt) +static void nci_rf_discover_req(struct nci_dev *ndev, const void *opt) { - struct nci_rf_discover_param *param = - (struct nci_rf_discover_param *)opt; + const struct nci_rf_discover_param *param = opt; struct nci_rf_disc_cmd cmd; cmd.num_disc_configs = 0; @@ -301,10 +300,9 @@ struct nci_rf_discover_select_param { __u8 rf_protocol; }; -static void nci_rf_discover_select_req(struct nci_dev *ndev, unsigned long opt) +static void nci_rf_discover_select_req(struct nci_dev *ndev, const void *opt) { - struct nci_rf_discover_select_param *param = - (struct nci_rf_discover_select_param *)opt; + const struct nci_rf_discover_select_param *param = opt; struct nci_rf_discover_select_cmd cmd; cmd.rf_discovery_id = param->rf_discovery_id; @@ -328,11 +326,11 @@ static void nci_rf_discover_select_req(struct nci_dev *ndev, unsigned long opt) sizeof(struct nci_rf_discover_select_cmd), &cmd); } -static void nci_rf_deactivate_req(struct nci_dev *ndev, unsigned long opt) +static void nci_rf_deactivate_req(struct nci_dev *ndev, const void *opt) { struct nci_rf_deactivate_cmd cmd; - cmd.type = opt; + cmd.type = (unsigned long)opt; nci_send_cmd(ndev, NCI_OP_RF_DEACTIVATE_CMD, sizeof(struct nci_rf_deactivate_cmd), &cmd); @@ -341,18 +339,17 @@ static void nci_rf_deactivate_req(struct nci_dev *ndev, unsigned long opt) struct nci_cmd_param { __u16 opcode; size_t len; - __u8 *payload; + const __u8 *payload; }; -static void nci_generic_req(struct nci_dev *ndev, unsigned long opt) +static void nci_generic_req(struct nci_dev *ndev, const void *opt) { - struct nci_cmd_param *param = - (struct nci_cmd_param *)opt; + const struct nci_cmd_param *param = opt; nci_send_cmd(ndev, param->opcode, param->len, param->payload); } -int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload) +int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, const __u8 *payload) { struct nci_cmd_param param; @@ -360,12 +357,13 @@ int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload) param.len = len; param.payload = payload; - return __nci_request(ndev, nci_generic_req, (unsigned long)¶m, + return __nci_request(ndev, nci_generic_req, ¶m, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_prop_cmd); -int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len, __u8 *payload) +int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len, + const __u8 *payload) { struct nci_cmd_param param; @@ -373,21 +371,21 @@ int nci_core_cmd(struct nci_dev *ndev, __u16 opcode, size_t len, __u8 *payload) param.len = len; param.payload = payload; - return __nci_request(ndev, nci_generic_req, (unsigned long)¶m, + return __nci_request(ndev, nci_generic_req, ¶m, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_core_cmd); int nci_core_reset(struct nci_dev *ndev) { - return __nci_request(ndev, nci_reset_req, 0, + return __nci_request(ndev, nci_reset_req, (void *)0, msecs_to_jiffies(NCI_RESET_TIMEOUT)); } EXPORT_SYMBOL(nci_core_reset); int nci_core_init(struct nci_dev *ndev) { - return __nci_request(ndev, nci_init_req, 0, + return __nci_request(ndev, nci_init_req, (void *)0, msecs_to_jiffies(NCI_INIT_TIMEOUT)); } EXPORT_SYMBOL(nci_core_init); @@ -397,9 +395,9 @@ struct nci_loopback_data { struct sk_buff *data; }; -static void nci_send_data_req(struct nci_dev *ndev, unsigned long opt) +static void nci_send_data_req(struct nci_dev *ndev, const void *opt) { - struct nci_loopback_data *data = (struct nci_loopback_data *)opt; + const struct nci_loopback_data *data = opt; nci_send_data(ndev, data->conn_id, data->data); } @@ -407,7 +405,7 @@ static void nci_send_data_req(struct nci_dev *ndev, unsigned long opt) static void nci_nfcc_loopback_cb(void *context, struct sk_buff *skb, int err) { struct nci_dev *ndev = (struct nci_dev *)context; - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; conn_info = nci_get_conn_info_by_conn_id(ndev, ndev->cur_conn_id); if (!conn_info) { @@ -420,7 +418,7 @@ static void nci_nfcc_loopback_cb(void *context, struct sk_buff *skb, int err) nci_req_complete(ndev, NCI_STATUS_OK); } -int nci_nfcc_loopback(struct nci_dev *ndev, void *data, size_t data_len, +int nci_nfcc_loopback(struct nci_dev *ndev, const void *data, size_t data_len, struct sk_buff **resp) { int r; @@ -460,7 +458,7 @@ int nci_nfcc_loopback(struct nci_dev *ndev, void *data, size_t data_len, loopback_data.data = skb; ndev->cur_conn_id = conn_id; - r = nci_request(ndev, nci_send_data_req, (unsigned long)&loopback_data, + r = nci_request(ndev, nci_send_data_req, &loopback_data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK && resp) *resp = conn_info->rx_skb; @@ -493,7 +491,7 @@ static int nci_open_device(struct nci_dev *ndev) rc = ndev->ops->init(ndev); if (!rc) { - rc = __nci_request(ndev, nci_reset_req, 0, + rc = __nci_request(ndev, nci_reset_req, (void *)0, msecs_to_jiffies(NCI_RESET_TIMEOUT)); } @@ -506,10 +504,10 @@ static int nci_open_device(struct nci_dev *ndev) .feature1 = NCI_FEATURE_DISABLE, .feature2 = NCI_FEATURE_DISABLE }; - unsigned long opt = 0; + const void *opt = NULL; if (ndev->nci_ver & NCI_VER_2_MASK) - opt = (unsigned long)&nci_init_v2_cmd; + opt = &nci_init_v2_cmd; rc = __nci_request(ndev, nci_init_req, opt, msecs_to_jiffies(NCI_INIT_TIMEOUT)); @@ -519,7 +517,7 @@ static int nci_open_device(struct nci_dev *ndev) rc = ndev->ops->post_setup(ndev); if (!rc) { - rc = __nci_request(ndev, nci_init_complete_req, 0, + rc = __nci_request(ndev, nci_init_complete_req, (void *)0, msecs_to_jiffies(NCI_INIT_TIMEOUT)); } @@ -569,7 +567,7 @@ static int nci_close_device(struct nci_dev *ndev) atomic_set(&ndev->cmd_cnt, 1); set_bit(NCI_INIT, &ndev->flags); - __nci_request(ndev, nci_reset_req, 0, + __nci_request(ndev, nci_reset_req, (void *)0, msecs_to_jiffies(NCI_RESET_TIMEOUT)); /* After this point our queues are empty @@ -624,7 +622,7 @@ static int nci_dev_down(struct nfc_dev *nfc_dev) return nci_close_device(ndev); } -int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, __u8 *val) +int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, const __u8 *val) { struct nci_set_config_param param; @@ -635,15 +633,15 @@ int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, __u8 *val) param.len = len; param.val = val; - return __nci_request(ndev, nci_set_config_req, (unsigned long)¶m, + return __nci_request(ndev, nci_set_config_req, ¶m, msecs_to_jiffies(NCI_SET_CONFIG_TIMEOUT)); } EXPORT_SYMBOL(nci_set_config); -static void nci_nfcee_discover_req(struct nci_dev *ndev, unsigned long opt) +static void nci_nfcee_discover_req(struct nci_dev *ndev, const void *opt) { struct nci_nfcee_discover_cmd cmd; - __u8 action = opt; + __u8 action = (unsigned long)opt; cmd.discovery_action = action; @@ -652,15 +650,16 @@ static void nci_nfcee_discover_req(struct nci_dev *ndev, unsigned long opt) int nci_nfcee_discover(struct nci_dev *ndev, u8 action) { - return __nci_request(ndev, nci_nfcee_discover_req, action, + unsigned long opt = action; + + return __nci_request(ndev, nci_nfcee_discover_req, (void *)opt, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_nfcee_discover); -static void nci_nfcee_mode_set_req(struct nci_dev *ndev, unsigned long opt) +static void nci_nfcee_mode_set_req(struct nci_dev *ndev, const void *opt) { - struct nci_nfcee_mode_set_cmd *cmd = - (struct nci_nfcee_mode_set_cmd *)opt; + const struct nci_nfcee_mode_set_cmd *cmd = opt; nci_send_cmd(ndev, NCI_OP_NFCEE_MODE_SET_CMD, sizeof(struct nci_nfcee_mode_set_cmd), cmd); @@ -673,16 +672,14 @@ int nci_nfcee_mode_set(struct nci_dev *ndev, u8 nfcee_id, u8 nfcee_mode) cmd.nfcee_id = nfcee_id; cmd.nfcee_mode = nfcee_mode; - return __nci_request(ndev, nci_nfcee_mode_set_req, - (unsigned long)&cmd, + return __nci_request(ndev, nci_nfcee_mode_set_req, &cmd, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_nfcee_mode_set); -static void nci_core_conn_create_req(struct nci_dev *ndev, unsigned long opt) +static void nci_core_conn_create_req(struct nci_dev *ndev, const void *opt) { - struct core_conn_create_data *data = - (struct core_conn_create_data *)opt; + const struct core_conn_create_data *data = opt; nci_send_cmd(ndev, NCI_OP_CORE_CONN_CREATE_CMD, data->length, data->cmd); } @@ -690,7 +687,7 @@ static void nci_core_conn_create_req(struct nci_dev *ndev, unsigned long opt) int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type, u8 number_destination_params, size_t params_len, - struct core_conn_create_dest_spec_params *params) + const struct core_conn_create_dest_spec_params *params) { int r; struct nci_core_conn_create_cmd *cmd; @@ -719,24 +716,26 @@ int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type, } ndev->cur_dest_type = destination_type; - r = __nci_request(ndev, nci_core_conn_create_req, (unsigned long)&data, + r = __nci_request(ndev, nci_core_conn_create_req, &data, msecs_to_jiffies(NCI_CMD_TIMEOUT)); kfree(cmd); return r; } EXPORT_SYMBOL(nci_core_conn_create); -static void nci_core_conn_close_req(struct nci_dev *ndev, unsigned long opt) +static void nci_core_conn_close_req(struct nci_dev *ndev, const void *opt) { - __u8 conn_id = opt; + __u8 conn_id = (unsigned long)opt; nci_send_cmd(ndev, NCI_OP_CORE_CONN_CLOSE_CMD, 1, &conn_id); } int nci_core_conn_close(struct nci_dev *ndev, u8 conn_id) { + unsigned long opt = conn_id; + ndev->cur_conn_id = conn_id; - return __nci_request(ndev, nci_core_conn_close_req, conn_id, + return __nci_request(ndev, nci_core_conn_close_req, (void *)opt, msecs_to_jiffies(NCI_CMD_TIMEOUT)); } EXPORT_SYMBOL(nci_core_conn_close); @@ -756,14 +755,14 @@ static int nci_set_local_general_bytes(struct nfc_dev *nfc_dev) param.id = NCI_PN_ATR_REQ_GEN_BYTES; - rc = nci_request(ndev, nci_set_config_req, (unsigned long)¶m, + rc = nci_request(ndev, nci_set_config_req, ¶m, msecs_to_jiffies(NCI_SET_CONFIG_TIMEOUT)); if (rc) return rc; param.id = NCI_LN_ATR_RES_GEN_BYTES; - return nci_request(ndev, nci_set_config_req, (unsigned long)¶m, + return nci_request(ndev, nci_set_config_req, ¶m, msecs_to_jiffies(NCI_SET_CONFIG_TIMEOUT)); } @@ -813,7 +812,7 @@ static int nci_start_poll(struct nfc_dev *nfc_dev, pr_debug("target active or w4 select, implicitly deactivate\n"); rc = nci_request(ndev, nci_rf_deactivate_req, - NCI_DEACTIVATE_TYPE_IDLE_MODE, + (void *)NCI_DEACTIVATE_TYPE_IDLE_MODE, msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT)); if (rc) return -EBUSY; @@ -835,7 +834,7 @@ static int nci_start_poll(struct nfc_dev *nfc_dev, param.im_protocols = im_protocols; param.tm_protocols = tm_protocols; - rc = nci_request(ndev, nci_rf_discover_req, (unsigned long)¶m, + rc = nci_request(ndev, nci_rf_discover_req, ¶m, msecs_to_jiffies(NCI_RF_DISC_TIMEOUT)); if (!rc) @@ -854,7 +853,8 @@ static void nci_stop_poll(struct nfc_dev *nfc_dev) return; } - nci_request(ndev, nci_rf_deactivate_req, NCI_DEACTIVATE_TYPE_IDLE_MODE, + nci_request(ndev, nci_rf_deactivate_req, + (void *)NCI_DEACTIVATE_TYPE_IDLE_MODE, msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT)); } @@ -863,7 +863,7 @@ static int nci_activate_target(struct nfc_dev *nfc_dev, { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); struct nci_rf_discover_select_param param; - struct nfc_target *nci_target = NULL; + const struct nfc_target *nci_target = NULL; int i; int rc = 0; @@ -913,8 +913,7 @@ static int nci_activate_target(struct nfc_dev *nfc_dev, else param.rf_protocol = NCI_RF_PROTOCOL_NFC_DEP; - rc = nci_request(ndev, nci_rf_discover_select_req, - (unsigned long)¶m, + rc = nci_request(ndev, nci_rf_discover_select_req, ¶m, msecs_to_jiffies(NCI_RF_DISC_SELECT_TIMEOUT)); } @@ -929,9 +928,7 @@ static void nci_deactivate_target(struct nfc_dev *nfc_dev, __u8 mode) { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); - u8 nci_mode = NCI_DEACTIVATE_TYPE_IDLE_MODE; - - pr_debug("entry\n"); + unsigned long nci_mode = NCI_DEACTIVATE_TYPE_IDLE_MODE; if (!ndev->target_active_prot) { pr_err("unable to deactivate target, no active target\n"); @@ -947,7 +944,7 @@ static void nci_deactivate_target(struct nfc_dev *nfc_dev, } if (atomic_read(&ndev->state) == NCI_POLL_ACTIVE) { - nci_request(ndev, nci_rf_deactivate_req, nci_mode, + nci_request(ndev, nci_rf_deactivate_req, (void *)nci_mode, msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT)); } } @@ -978,15 +975,13 @@ static int nci_dep_link_down(struct nfc_dev *nfc_dev) struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); int rc; - pr_debug("entry\n"); - if (nfc_dev->rf_mode == NFC_RF_INITIATOR) { nci_deactivate_target(nfc_dev, NULL, NCI_DEACTIVATE_TYPE_IDLE_MODE); } else { if (atomic_read(&ndev->state) == NCI_LISTEN_ACTIVE || atomic_read(&ndev->state) == NCI_DISCOVERY) { - nci_request(ndev, nci_rf_deactivate_req, 0, - msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT)); + nci_request(ndev, nci_rf_deactivate_req, (void *)0, + msecs_to_jiffies(NCI_RF_DEACTIVATE_TIMEOUT)); } rc = nfc_tm_deactivated(nfc_dev); @@ -1004,7 +999,7 @@ static int nci_transceive(struct nfc_dev *nfc_dev, struct nfc_target *target, { struct nci_dev *ndev = nfc_get_drvdata(nfc_dev); int rc; - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; conn_info = ndev->rf_conn_info; if (!conn_info) @@ -1102,7 +1097,7 @@ static int nci_fw_download(struct nfc_dev *nfc_dev, const char *firmware_name) return ndev->ops->fw_download(ndev, firmware_name); } -static struct nfc_ops nci_nfc_ops = { +static const struct nfc_ops nci_nfc_ops = { .dev_up = nci_dev_up, .dev_down = nci_dev_down, .start_poll = nci_start_poll, @@ -1129,7 +1124,7 @@ static struct nfc_ops nci_nfc_ops = { * @tx_headroom: Reserved space at beginning of skb * @tx_tailroom: Reserved space at end of skb */ -struct nci_dev *nci_allocate_device(struct nci_ops *ops, +struct nci_dev *nci_allocate_device(const struct nci_ops *ops, __u32 supported_protocols, int tx_headroom, int tx_tailroom) { @@ -1152,8 +1147,7 @@ struct nci_dev *nci_allocate_device(struct nci_ops *ops, if (ops->n_prop_ops > NCI_MAX_PROPRIETARY_CMD) { pr_err("Too many proprietary commands: %zd\n", ops->n_prop_ops); - ops->prop_ops = NULL; - ops->n_prop_ops = 0; + goto free_nci; } ndev->tx_headroom = tx_headroom; @@ -1270,7 +1264,7 @@ EXPORT_SYMBOL(nci_register_device); */ void nci_unregister_device(struct nci_dev *ndev) { - struct nci_conn_info *conn_info, *n; + struct nci_conn_info *conn_info, *n; nci_close_device(ndev); @@ -1332,7 +1326,7 @@ int nci_send_frame(struct nci_dev *ndev, struct sk_buff *skb) EXPORT_SYMBOL(nci_send_frame); /* Send NCI command */ -int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, void *payload) +int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, const void *payload) { struct nci_ctrl_hdr *hdr; struct sk_buff *skb; @@ -1364,12 +1358,12 @@ int nci_send_cmd(struct nci_dev *ndev, __u16 opcode, __u8 plen, void *payload) EXPORT_SYMBOL(nci_send_cmd); /* Proprietary commands API */ -static struct nci_driver_ops *ops_cmd_lookup(struct nci_driver_ops *ops, - size_t n_ops, - __u16 opcode) +static const struct nci_driver_ops *ops_cmd_lookup(const struct nci_driver_ops *ops, + size_t n_ops, + __u16 opcode) { size_t i; - struct nci_driver_ops *op; + const struct nci_driver_ops *op; if (!ops || !n_ops) return NULL; @@ -1384,10 +1378,10 @@ static struct nci_driver_ops *ops_cmd_lookup(struct nci_driver_ops *ops, } static int nci_op_rsp_packet(struct nci_dev *ndev, __u16 rsp_opcode, - struct sk_buff *skb, struct nci_driver_ops *ops, + struct sk_buff *skb, const struct nci_driver_ops *ops, size_t n_ops) { - struct nci_driver_ops *op; + const struct nci_driver_ops *op; op = ops_cmd_lookup(ops, n_ops, rsp_opcode); if (!op || !op->rsp) @@ -1397,10 +1391,10 @@ static int nci_op_rsp_packet(struct nci_dev *ndev, __u16 rsp_opcode, } static int nci_op_ntf_packet(struct nci_dev *ndev, __u16 ntf_opcode, - struct sk_buff *skb, struct nci_driver_ops *ops, + struct sk_buff *skb, const struct nci_driver_ops *ops, size_t n_ops) { - struct nci_driver_ops *op; + const struct nci_driver_ops *op; op = ops_cmd_lookup(ops, n_ops, ntf_opcode); if (!op || !op->ntf) @@ -1442,7 +1436,7 @@ int nci_core_ntf_packet(struct nci_dev *ndev, __u16 opcode, static void nci_tx_work(struct work_struct *work) { struct nci_dev *ndev = container_of(work, struct nci_dev, tx_work); - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; struct sk_buff *skb; conn_info = nci_get_conn_info_by_conn_id(ndev, ndev->cur_conn_id); diff --git a/net/nfc/nci/data.c b/net/nfc/nci/data.c index ce3382be937f..6055dc9a82aa 100644 --- a/net/nfc/nci/data.c +++ b/net/nfc/nci/data.c @@ -26,7 +26,7 @@ void nci_data_exchange_complete(struct nci_dev *ndev, struct sk_buff *skb, __u8 conn_id, int err) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; data_exchange_cb_t cb; void *cb_context; @@ -80,7 +80,7 @@ static inline void nci_push_data_hdr(struct nci_dev *ndev, int nci_conn_max_data_pkt_payload_size(struct nci_dev *ndev, __u8 conn_id) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; conn_info = nci_get_conn_info_by_conn_id(ndev, conn_id); if (!conn_info) @@ -93,9 +93,9 @@ EXPORT_SYMBOL(nci_conn_max_data_pkt_payload_size); static int nci_queue_tx_data_frags(struct nci_dev *ndev, __u8 conn_id, struct sk_buff *skb) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; int total_len = skb->len; - unsigned char *data = skb->data; + const unsigned char *data = skb->data; unsigned long flags; struct sk_buff_head frags_q; struct sk_buff *skb_frag; @@ -166,7 +166,7 @@ exit: /* Send NCI data */ int nci_send_data(struct nci_dev *ndev, __u8 conn_id, struct sk_buff *skb) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; int rc = 0; pr_debug("conn_id 0x%x, plen %d\n", conn_id, skb->len); @@ -269,7 +269,7 @@ void nci_rx_data_packet(struct nci_dev *ndev, struct sk_buff *skb) __u8 pbf = nci_pbf(skb->data); __u8 status = 0; __u8 conn_id = nci_conn_id(skb->data); - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; pr_debug("len %d\n", skb->len); diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index d6732e5e8958..19703a649b5a 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -16,11 +16,11 @@ #include <linux/nfc.h> struct nci_data { - u8 conn_id; - u8 pipe; - u8 cmd; - const u8 *data; - u32 data_len; + u8 conn_id; + u8 pipe; + u8 cmd; + const u8 *data; + u32 data_len; } __packed; struct nci_hci_create_pipe_params { @@ -142,7 +142,7 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, const u8 data_type, const u8 *data, size_t data_len) { - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; struct sk_buff *skb; int len, i, r; u8 cb = pipe; @@ -195,9 +195,9 @@ static int nci_hci_send_data(struct nci_dev *ndev, u8 pipe, return i; } -static void nci_hci_send_data_req(struct nci_dev *ndev, unsigned long opt) +static void nci_hci_send_data_req(struct nci_dev *ndev, const void *opt) { - struct nci_data *data = (struct nci_data *)opt; + const struct nci_data *data = opt; nci_hci_send_data(ndev, data->pipe, data->cmd, data->data, data->data_len); @@ -221,8 +221,8 @@ int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, u8 cmd, const u8 *param, size_t param_len, struct sk_buff **skb) { - struct nci_hcp_message *message; - struct nci_conn_info *conn_info; + const struct nci_hcp_message *message; + const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 pipe = ndev->hci_dev->gate2pipe[gate]; @@ -240,7 +240,7 @@ int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, u8 cmd, data.data = param; data.data_len = param_len; - r = nci_request(ndev, nci_hci_send_data_req, (unsigned long)&data, + r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { message = (struct nci_hcp_message *)conn_info->rx_skb->data; @@ -363,7 +363,7 @@ exit: static void nci_hci_resp_received(struct nci_dev *ndev, u8 pipe, struct sk_buff *skb) { - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; conn_info = ndev->hci_dev->conn_info; if (!conn_info) @@ -406,7 +406,7 @@ static void nci_hci_msg_rx_work(struct work_struct *work) struct nci_hci_dev *hdev = container_of(work, struct nci_hci_dev, msg_rx_work); struct sk_buff *skb; - struct nci_hcp_message *message; + const struct nci_hcp_message *message; u8 pipe, type, instruction; while ((skb = skb_dequeue(&hdev->msg_rx_queue)) != NULL) { @@ -432,8 +432,6 @@ void nci_hci_data_received_cb(void *context, struct sk_buff *frag_skb; int msg_len; - pr_debug("\n"); - if (err) { nci_req_complete(ndev, err); return; @@ -498,7 +496,7 @@ void nci_hci_data_received_cb(void *context, int nci_hci_open_pipe(struct nci_dev *ndev, u8 pipe) { struct nci_data data; - struct nci_conn_info *conn_info; + const struct nci_conn_info *conn_info; conn_info = ndev->hci_dev->conn_info; if (!conn_info) @@ -511,9 +509,8 @@ int nci_hci_open_pipe(struct nci_dev *ndev, u8 pipe) data.data = NULL; data.data_len = 0; - return nci_request(ndev, nci_hci_send_data_req, - (unsigned long)&data, - msecs_to_jiffies(NCI_DATA_TIMEOUT)); + return nci_request(ndev, nci_hci_send_data_req, &data, + msecs_to_jiffies(NCI_DATA_TIMEOUT)); } EXPORT_SYMBOL(nci_hci_open_pipe); @@ -523,7 +520,7 @@ static u8 nci_hci_create_pipe(struct nci_dev *ndev, u8 dest_host, u8 pipe; struct sk_buff *skb; struct nci_hci_create_pipe_params params; - struct nci_hci_create_pipe_resp *resp; + const struct nci_hci_create_pipe_resp *resp; pr_debug("gate=%d\n", dest_gate); @@ -548,8 +545,6 @@ static u8 nci_hci_create_pipe(struct nci_dev *ndev, u8 dest_host, static int nci_hci_delete_pipe(struct nci_dev *ndev, u8 pipe) { - pr_debug("\n"); - return nci_hci_send_cmd(ndev, NCI_HCI_ADMIN_GATE, NCI_HCI_ADM_DELETE_PIPE, &pipe, 1, NULL); } @@ -557,8 +552,8 @@ static int nci_hci_delete_pipe(struct nci_dev *ndev, u8 pipe) int nci_hci_set_param(struct nci_dev *ndev, u8 gate, u8 idx, const u8 *param, size_t param_len) { - struct nci_hcp_message *message; - struct nci_conn_info *conn_info; + const struct nci_hcp_message *message; + const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 *tmp; @@ -587,8 +582,7 @@ int nci_hci_set_param(struct nci_dev *ndev, u8 gate, u8 idx, data.data = tmp; data.data_len = param_len + 1; - r = nci_request(ndev, nci_hci_send_data_req, - (unsigned long)&data, + r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { message = (struct nci_hcp_message *)conn_info->rx_skb->data; @@ -605,8 +599,8 @@ EXPORT_SYMBOL(nci_hci_set_param); int nci_hci_get_param(struct nci_dev *ndev, u8 gate, u8 idx, struct sk_buff **skb) { - struct nci_hcp_message *message; - struct nci_conn_info *conn_info; + const struct nci_hcp_message *message; + const struct nci_conn_info *conn_info; struct nci_data data; int r; u8 pipe = ndev->hci_dev->gate2pipe[gate]; @@ -627,7 +621,7 @@ int nci_hci_get_param(struct nci_dev *ndev, u8 gate, u8 idx, data.data = &idx; data.data_len = 1; - r = nci_request(ndev, nci_hci_send_data_req, (unsigned long)&data, + r = nci_request(ndev, nci_hci_send_data_req, &data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); if (r == NCI_STATUS_OK) { @@ -697,7 +691,7 @@ EXPORT_SYMBOL(nci_hci_connect_gate); static int nci_hci_dev_connect_gates(struct nci_dev *ndev, u8 gate_count, - struct nci_hci_gate *gates) + const struct nci_hci_gate *gates) { int r; @@ -714,7 +708,7 @@ static int nci_hci_dev_connect_gates(struct nci_dev *ndev, int nci_hci_dev_session_init(struct nci_dev *ndev) { - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; struct sk_buff *skb; int r; diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index 98af04c86b2c..282c51051dcc 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -28,10 +28,10 @@ /* Handle NCI Notification packets */ static void nci_core_reset_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { /* Handle NCI 2.x core reset notification */ - struct nci_core_reset_ntf *ntf = (void *)skb->data; + const struct nci_core_reset_ntf *ntf = (void *)skb->data; ndev->nci_ver = ntf->nci_ver; pr_debug("nci_ver 0x%x, config_status 0x%x\n", @@ -48,7 +48,7 @@ static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) { struct nci_core_conn_credit_ntf *ntf = (void *) skb->data; - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; int i; pr_debug("num_entries %d\n", ntf->num_entries); @@ -80,7 +80,7 @@ static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, } static void nci_core_generic_error_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; @@ -107,9 +107,10 @@ static void nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev, nci_data_exchange_complete(ndev, NULL, ntf->conn_id, -EIO); } -static __u8 *nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev, - struct rf_tech_specific_params_nfca_poll *nfca_poll, - __u8 *data) +static const __u8 * +nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev, + struct rf_tech_specific_params_nfca_poll *nfca_poll, + const __u8 *data) { nfca_poll->sens_res = __le16_to_cpu(*((__le16 *)data)); data += 2; @@ -134,9 +135,10 @@ static __u8 *nci_extract_rf_params_nfca_passive_poll(struct nci_dev *ndev, return data; } -static __u8 *nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev, - struct rf_tech_specific_params_nfcb_poll *nfcb_poll, - __u8 *data) +static const __u8 * +nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev, + struct rf_tech_specific_params_nfcb_poll *nfcb_poll, + const __u8 *data) { nfcb_poll->sensb_res_len = min_t(__u8, *data++, NFC_SENSB_RES_MAXSIZE); @@ -148,9 +150,10 @@ static __u8 *nci_extract_rf_params_nfcb_passive_poll(struct nci_dev *ndev, return data; } -static __u8 *nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev, - struct rf_tech_specific_params_nfcf_poll *nfcf_poll, - __u8 *data) +static const __u8 * +nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev, + struct rf_tech_specific_params_nfcf_poll *nfcf_poll, + const __u8 *data) { nfcf_poll->bit_rate = *data++; nfcf_poll->sensf_res_len = min_t(__u8, *data++, NFC_SENSF_RES_MAXSIZE); @@ -164,9 +167,10 @@ static __u8 *nci_extract_rf_params_nfcf_passive_poll(struct nci_dev *ndev, return data; } -static __u8 *nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, - struct rf_tech_specific_params_nfcv_poll *nfcv_poll, - __u8 *data) +static const __u8 * +nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, + struct rf_tech_specific_params_nfcv_poll *nfcv_poll, + const __u8 *data) { ++data; nfcv_poll->dsfid = *data++; @@ -175,9 +179,10 @@ static __u8 *nci_extract_rf_params_nfcv_passive_poll(struct nci_dev *ndev, return data; } -static __u8 *nci_extract_rf_params_nfcf_passive_listen(struct nci_dev *ndev, - struct rf_tech_specific_params_nfcf_listen *nfcf_listen, - __u8 *data) +static const __u8 * +nci_extract_rf_params_nfcf_passive_listen(struct nci_dev *ndev, + struct rf_tech_specific_params_nfcf_listen *nfcf_listen, + const __u8 *data) { nfcf_listen->local_nfcid2_len = min_t(__u8, *data++, NFC_NFCID2_MAXSIZE); @@ -198,12 +203,12 @@ static int nci_add_new_protocol(struct nci_dev *ndev, struct nfc_target *target, __u8 rf_protocol, __u8 rf_tech_and_mode, - void *params) + const void *params) { - struct rf_tech_specific_params_nfca_poll *nfca_poll; - struct rf_tech_specific_params_nfcb_poll *nfcb_poll; - struct rf_tech_specific_params_nfcf_poll *nfcf_poll; - struct rf_tech_specific_params_nfcv_poll *nfcv_poll; + const struct rf_tech_specific_params_nfca_poll *nfca_poll; + const struct rf_tech_specific_params_nfcb_poll *nfcb_poll; + const struct rf_tech_specific_params_nfcf_poll *nfcf_poll; + const struct rf_tech_specific_params_nfcv_poll *nfcv_poll; __u32 protocol; if (rf_protocol == NCI_RF_PROTOCOL_T1T) @@ -274,7 +279,7 @@ static int nci_add_new_protocol(struct nci_dev *ndev, } static void nci_add_new_target(struct nci_dev *ndev, - struct nci_rf_discover_ntf *ntf) + const struct nci_rf_discover_ntf *ntf) { struct nfc_target *target; int i, rc; @@ -319,10 +324,10 @@ void nci_clear_target_list(struct nci_dev *ndev) } static void nci_rf_discover_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { struct nci_rf_discover_ntf ntf; - __u8 *data = skb->data; + const __u8 *data = skb->data; bool add_target = true; ntf.rf_discovery_id = *data++; @@ -382,7 +387,8 @@ static void nci_rf_discover_ntf_packet(struct nci_dev *ndev, } static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev, - struct nci_rf_intf_activated_ntf *ntf, __u8 *data) + struct nci_rf_intf_activated_ntf *ntf, + const __u8 *data) { struct activation_params_nfca_poll_iso_dep *nfca_poll; struct activation_params_nfcb_poll_iso_dep *nfcb_poll; @@ -418,7 +424,8 @@ static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev, } static int nci_extract_activation_params_nfc_dep(struct nci_dev *ndev, - struct nci_rf_intf_activated_ntf *ntf, __u8 *data) + struct nci_rf_intf_activated_ntf *ntf, + const __u8 *data) { struct activation_params_poll_nfc_dep *poll; struct activation_params_listen_nfc_dep *listen; @@ -454,7 +461,7 @@ static int nci_extract_activation_params_nfc_dep(struct nci_dev *ndev, } static void nci_target_auto_activated(struct nci_dev *ndev, - struct nci_rf_intf_activated_ntf *ntf) + const struct nci_rf_intf_activated_ntf *ntf) { struct nfc_target *target; int rc; @@ -477,7 +484,7 @@ static void nci_target_auto_activated(struct nci_dev *ndev, } static int nci_store_general_bytes_nfc_dep(struct nci_dev *ndev, - struct nci_rf_intf_activated_ntf *ntf) + const struct nci_rf_intf_activated_ntf *ntf) { ndev->remote_gb_len = 0; @@ -519,11 +526,11 @@ static int nci_store_general_bytes_nfc_dep(struct nci_dev *ndev, } static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; struct nci_rf_intf_activated_ntf ntf; - __u8 *data = skb->data; + const __u8 *data = skb->data; int err = NCI_STATUS_OK; ntf.rf_discovery_id = *data++; @@ -681,10 +688,10 @@ listen: } static void nci_rf_deactivate_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { - struct nci_conn_info *conn_info; - struct nci_rf_deactivate_ntf *ntf = (void *) skb->data; + const struct nci_conn_info *conn_info; + const struct nci_rf_deactivate_ntf *ntf = (void *)skb->data; pr_debug("entry, type 0x%x, reason 0x%x\n", ntf->type, ntf->reason); @@ -725,14 +732,12 @@ static void nci_rf_deactivate_ntf_packet(struct nci_dev *ndev, } static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { u8 status = NCI_STATUS_OK; - struct nci_nfcee_discover_ntf *nfcee_ntf = + const struct nci_nfcee_discover_ntf *nfcee_ntf = (struct nci_nfcee_discover_ntf *)skb->data; - pr_debug("\n"); - /* NFCForum NCI 9.2.1 HCI Network Specific Handling * If the NFCC supports the HCI Network, it SHALL return one, * and only one, NFCEE_DISCOVER_NTF with a Protocol type of @@ -744,12 +749,6 @@ static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, nci_req_complete(ndev, status); } -static void nci_nfcee_action_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) -{ - pr_debug("\n"); -} - void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) { __u16 ntf_opcode = nci_opcode(skb->data); @@ -806,7 +805,6 @@ void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) break; case NCI_OP_RF_NFCEE_ACTION_NTF: - nci_nfcee_action_ntf_packet(ndev, skb); break; default: diff --git a/net/nfc/nci/rsp.c b/net/nfc/nci/rsp.c index e9605922a322..b911ab78bed9 100644 --- a/net/nfc/nci/rsp.c +++ b/net/nfc/nci/rsp.c @@ -25,9 +25,10 @@ /* Handle NCI Response packets */ -static void nci_core_reset_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) +static void nci_core_reset_rsp_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { - struct nci_core_reset_rsp *rsp = (void *) skb->data; + const struct nci_core_reset_rsp *rsp = (void *)skb->data; pr_debug("status 0x%x\n", rsp->status); @@ -43,10 +44,11 @@ static void nci_core_reset_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) } } -static u8 nci_core_init_rsp_packet_v1(struct nci_dev *ndev, struct sk_buff *skb) +static u8 nci_core_init_rsp_packet_v1(struct nci_dev *ndev, + const struct sk_buff *skb) { - struct nci_core_init_rsp_1 *rsp_1 = (void *) skb->data; - struct nci_core_init_rsp_2 *rsp_2; + const struct nci_core_init_rsp_1 *rsp_1 = (void *)skb->data; + const struct nci_core_init_rsp_2 *rsp_2; pr_debug("status 0x%x\n", rsp_1->status); @@ -81,10 +83,11 @@ static u8 nci_core_init_rsp_packet_v1(struct nci_dev *ndev, struct sk_buff *skb) return NCI_STATUS_OK; } -static u8 nci_core_init_rsp_packet_v2(struct nci_dev *ndev, struct sk_buff *skb) +static u8 nci_core_init_rsp_packet_v2(struct nci_dev *ndev, + const struct sk_buff *skb) { - struct nci_core_init_rsp_nci_ver2 *rsp = (void *)skb->data; - u8 *supported_rf_interface = rsp->supported_rf_interfaces; + const struct nci_core_init_rsp_nci_ver2 *rsp = (void *)skb->data; + const u8 *supported_rf_interface = rsp->supported_rf_interfaces; u8 rf_interface_idx = 0; u8 rf_extension_cnt = 0; @@ -118,7 +121,7 @@ static u8 nci_core_init_rsp_packet_v2(struct nci_dev *ndev, struct sk_buff *skb) return NCI_STATUS_OK; } -static void nci_core_init_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) +static void nci_core_init_rsp_packet(struct nci_dev *ndev, const struct sk_buff *skb) { u8 status = 0; @@ -160,9 +163,9 @@ exit: } static void nci_core_set_config_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { - struct nci_core_set_config_rsp *rsp = (void *) skb->data; + const struct nci_core_set_config_rsp *rsp = (void *)skb->data; pr_debug("status 0x%x\n", rsp->status); @@ -170,7 +173,7 @@ static void nci_core_set_config_rsp_packet(struct nci_dev *ndev, } static void nci_rf_disc_map_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; @@ -179,9 +182,10 @@ static void nci_rf_disc_map_rsp_packet(struct nci_dev *ndev, nci_req_complete(ndev, status); } -static void nci_rf_disc_rsp_packet(struct nci_dev *ndev, struct sk_buff *skb) +static void nci_rf_disc_rsp_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { - struct nci_conn_info *conn_info; + struct nci_conn_info *conn_info; __u8 status = skb->data[0]; pr_debug("status 0x%x\n", status); @@ -210,7 +214,7 @@ exit: } static void nci_rf_disc_select_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; @@ -222,7 +226,7 @@ static void nci_rf_disc_select_rsp_packet(struct nci_dev *ndev, } static void nci_rf_deactivate_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; @@ -238,9 +242,9 @@ static void nci_rf_deactivate_rsp_packet(struct nci_dev *ndev, } static void nci_nfcee_discover_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { - struct nci_nfcee_discover_rsp *discover_rsp; + const struct nci_nfcee_discover_rsp *discover_rsp; if (skb->len != 2) { nci_req_complete(ndev, NCI_STATUS_NFCEE_PROTOCOL_ERROR); @@ -255,7 +259,7 @@ static void nci_nfcee_discover_rsp_packet(struct nci_dev *ndev, } static void nci_nfcee_mode_set_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; @@ -264,11 +268,11 @@ static void nci_nfcee_mode_set_rsp_packet(struct nci_dev *ndev, } static void nci_core_conn_create_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { __u8 status = skb->data[0]; struct nci_conn_info *conn_info = NULL; - struct nci_core_conn_create_rsp *rsp; + const struct nci_core_conn_create_rsp *rsp; pr_debug("status 0x%x\n", status); @@ -319,7 +323,7 @@ exit: } static void nci_core_conn_close_rsp_packet(struct nci_dev *ndev, - struct sk_buff *skb) + const struct sk_buff *skb) { struct nci_conn_info *conn_info; __u8 status = skb->data[0]; @@ -330,6 +334,8 @@ static void nci_core_conn_close_rsp_packet(struct nci_dev *ndev, ndev->cur_conn_id); if (conn_info) { list_del(&conn_info->list); + if (conn_info == ndev->rf_conn_info) + ndev->rf_conn_info = NULL; devm_kfree(&ndev->nfc_dev->dev, conn_info); } } diff --git a/net/nfc/nci/spi.c b/net/nfc/nci/spi.c index 7d8e10e27c20..0935527d1d12 100644 --- a/net/nfc/nci/spi.c +++ b/net/nfc/nci/spi.c @@ -27,7 +27,7 @@ #define CRC_INIT 0xFFFF -static int __nci_spi_send(struct nci_spi *nspi, struct sk_buff *skb, +static int __nci_spi_send(struct nci_spi *nspi, const struct sk_buff *skb, int cs_change) { struct spi_message m; diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c index 502e7a3f8948..57500c262cc3 100644 --- a/net/nfc/nci/uart.c +++ b/net/nfc/nci/uart.c @@ -1,20 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2015, Marvell International Ltd. * - * This software file (the "File") is distributed by Marvell International - * Ltd. under the terms of the GNU General Public License Version 2, June 1991 - * (the "License"). You may use, redistribute and/or modify this File in - * accordance with the terms and conditions of the License, a copy of which - * is available on the worldwide web at - * http://www.gnu.org/licenses/old-licenses/gpl-2.0.txt. - * - * THE FILE IS DISTRIBUTED AS-IS, WITHOUT WARRANTY OF ANY KIND, AND THE - * IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE - * ARE EXPRESSLY DISCLAIMED. The License provides additional details about - * this warranty disclaimer. - */ - -/* Inspired (hugely) by HCI LDISC implementation in Bluetooth. + * Inspired (hugely) by HCI LDISC implementation in Bluetooth. * * Copyright (C) 2000-2001 Qualcomm Incorporated * Copyright (C) 2002-2003 Maxim Krasnyansky <maxk@qualcomm.com> diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 722f7ef891e1..49089c50872e 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -530,7 +530,7 @@ free_msg: int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx) { - struct nfc_se *se; + const struct nfc_se *se; struct sk_buff *msg; void *hdr; @@ -1531,7 +1531,7 @@ static int nfc_genl_vendor_cmd(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; - struct nfc_vendor_cmd *cmd; + const struct nfc_vendor_cmd *cmd; u32 dev_idx, vid, subcmd; u8 *data; size_t data_len; diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h index 889fefd64e56..de2ec66d7e83 100644 --- a/net/nfc/nfc.h +++ b/net/nfc/nfc.h @@ -48,7 +48,7 @@ void nfc_llcp_mac_is_up(struct nfc_dev *dev, u32 target_idx, u8 comm_mode, u8 rf_mode); int nfc_llcp_register_device(struct nfc_dev *dev); void nfc_llcp_unregister_device(struct nfc_dev *dev); -int nfc_llcp_set_remote_gb(struct nfc_dev *dev, u8 *gb, u8 gb_len); +int nfc_llcp_set_remote_gb(struct nfc_dev *dev, const u8 *gb, u8 gb_len); u8 *nfc_llcp_general_bytes(struct nfc_dev *dev, size_t *general_bytes_len); int nfc_llcp_data_received(struct nfc_dev *dev, struct sk_buff *skb); struct nfc_llcp_local *nfc_llcp_find_local(struct nfc_dev *dev); diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 5e39640becdb..0ca214ab5aef 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -140,7 +140,7 @@ static void rawsock_data_exchange_complete(void *context, struct sk_buff *skb, { struct sock *sk = (struct sock *) context; - BUG_ON(in_irq()); + BUG_ON(in_hardirq()); pr_debug("sk=%p err=%d\n", sk, err); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index ef15d9eb4774..076774034bb9 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -924,7 +924,13 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, break; case OVS_USERSPACE_ATTR_PID: - upcall.portid = nla_get_u32(a); + if (dp->user_features & + OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + upcall.portid = + ovs_dp_get_upcall_portid(dp, + smp_processor_id()); + else + upcall.portid = nla_get_u32(a); break; case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: { diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index bc164b35e67d..67ad08320886 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -133,6 +133,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, static void ovs_dp_masks_rebalance(struct work_struct *work); +static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *); + /* Must be called with rcu_read_lock or ovs_mutex. */ const char *ovs_dp_name(const struct datapath *dp) { @@ -166,6 +168,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) free_percpu(dp->stats_percpu); kfree(dp->ports); ovs_meters_exit(dp); + kfree(rcu_dereference_raw(dp->upcall_portids)); kfree(dp); } @@ -239,7 +242,13 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; - upcall.portid = ovs_vport_find_upcall_portid(p, skb); + + if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + upcall.portid = + ovs_dp_get_upcall_portid(dp, smp_processor_id()); + else + upcall.portid = ovs_vport_find_upcall_portid(p, skb); + upcall.mru = OVS_CB(skb)->mru; error = ovs_dp_upcall(dp, skb, key, &upcall, 0); if (unlikely(error)) @@ -1594,16 +1603,70 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb, DEFINE_STATIC_KEY_FALSE(tc_recirc_sharing_support); +static int ovs_dp_set_upcall_portids(struct datapath *dp, + const struct nlattr *ids) +{ + struct dp_nlsk_pids *old, *dp_nlsk_pids; + + if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) + return -EINVAL; + + old = ovsl_dereference(dp->upcall_portids); + + dp_nlsk_pids = kmalloc(sizeof(*dp_nlsk_pids) + nla_len(ids), + GFP_KERNEL); + if (!dp_nlsk_pids) + return -ENOMEM; + + dp_nlsk_pids->n_pids = nla_len(ids) / sizeof(u32); + nla_memcpy(dp_nlsk_pids->pids, ids, nla_len(ids)); + + rcu_assign_pointer(dp->upcall_portids, dp_nlsk_pids); + + kfree_rcu(old, rcu); + + return 0; +} + +u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id) +{ + struct dp_nlsk_pids *dp_nlsk_pids; + + dp_nlsk_pids = rcu_dereference(dp->upcall_portids); + + if (dp_nlsk_pids) { + if (cpu_id < dp_nlsk_pids->n_pids) { + return dp_nlsk_pids->pids[cpu_id]; + } else if (dp_nlsk_pids->n_pids > 0 && + cpu_id >= dp_nlsk_pids->n_pids) { + /* If the number of netlink PIDs is mismatched with + * the number of CPUs as seen by the kernel, log this + * and send the upcall to an arbitrary socket (0) in + * order to not drop packets + */ + pr_info_ratelimited("cpu_id mismatch with handler threads"); + return dp_nlsk_pids->pids[cpu_id % + dp_nlsk_pids->n_pids]; + } else { + return 0; + } + } else { + return 0; + } +} + static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) { u32 user_features = 0; + int err; if (a[OVS_DP_ATTR_USER_FEATURES]) { user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); if (user_features & ~(OVS_DP_F_VPORT_PIDS | OVS_DP_F_UNALIGNED | - OVS_DP_F_TC_RECIRC_SHARING)) + OVS_DP_F_TC_RECIRC_SHARING | + OVS_DP_F_DISPATCH_UPCALL_PER_CPU)) return -EOPNOTSUPP; #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) @@ -1624,6 +1687,15 @@ static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) dp->user_features = user_features; + if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && + a[OVS_DP_ATTR_PER_CPU_PIDS]) { + /* Upcall Netlink Port IDs have been updated */ + err = ovs_dp_set_upcall_portids(dp, + a[OVS_DP_ATTR_PER_CPU_PIDS]); + if (err) + return err; + } + if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) static_branch_enable(&tc_recirc_sharing_support); else diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 38f7d3e66ca6..fcfe6cb46441 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -51,6 +51,21 @@ struct dp_stats_percpu { }; /** + * struct dp_nlsk_pids - array of netlink portids of for a datapath. + * This is used when OVS_DP_F_DISPATCH_UPCALL_PER_CPU + * is enabled and must be protected by rcu. + * @rcu: RCU callback head for deferred destruction. + * @n_pids: Size of @pids array. + * @pids: Array storing the Netlink socket PIDs indexed by CPU ID for packets + * that miss the flow table. + */ +struct dp_nlsk_pids { + struct rcu_head rcu; + u32 n_pids; + u32 pids[]; +}; + +/** * struct datapath - datapath for flow-based packet switching * @rcu: RCU callback head for deferred destruction. * @list_node: Element in global 'dps' list. @@ -61,6 +76,7 @@ struct dp_stats_percpu { * @net: Reference to net namespace. * @max_headroom: the maximum headroom of all vports in this datapath; it will * be used by all the internal vports in this dp. + * @upcall_portids: RCU protected 'struct dp_nlsk_pids'. * * Context: See the comment on locking at the top of datapath.c for additional * locking information. @@ -87,6 +103,8 @@ struct datapath { /* Switch meters. */ struct dp_meter_table meter_tbl; + + struct dp_nlsk_pids __rcu *upcall_portids; }; /** @@ -243,6 +261,8 @@ int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct sw_flow_key *, const struct dp_upcall_info *, uint32_t cutlen); +u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id); + const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, u32 portid, u32 seq, u8 cmd); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 57a1971f29e5..46943a18a10d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -46,6 +46,8 @@ * Copyright (C) 2011, <lokec@ccs.neu.edu> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/ethtool.h> #include <linux/types.h> #include <linux/mm.h> @@ -89,6 +91,7 @@ #endif #include <linux/bpf.h> #include <net/compat.h> +#include <linux/netfilter_netdev.h> #include "internal.h" @@ -239,8 +242,42 @@ struct packet_skb_cb { static void __fanout_unlink(struct sock *sk, struct packet_sock *po); static void __fanout_link(struct sock *sk, struct packet_sock *po); +#ifdef CONFIG_NETFILTER_EGRESS +static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb) +{ + struct sk_buff *next, *head = NULL, *tail; + int rc; + + rcu_read_lock(); + for (; skb != NULL; skb = next) { + next = skb->next; + skb_mark_not_on_list(skb); + + if (!nf_hook_egress(skb, &rc, skb->dev)) + continue; + + if (!head) + head = skb; + else + tail->next = skb; + + tail = skb; + } + rcu_read_unlock(); + + return head; +} +#endif + static int packet_direct_xmit(struct sk_buff *skb) { +#ifdef CONFIG_NETFILTER_EGRESS + if (nf_hook_egress_active()) { + skb = nf_hook_direct_egress(skb); + if (!skb) + return NET_XMIT_DROP; + } +#endif return dev_direct_xmit(skb, packet_pick_tx_queue(skb)); } @@ -250,8 +287,7 @@ static struct net_device *packet_cached_dev_get(struct packet_sock *po) rcu_read_lock(); dev = rcu_dereference(po->cached_dev); - if (likely(dev)) - dev_hold(dev); + dev_hold(dev); rcu_read_unlock(); return dev; @@ -3024,8 +3060,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) out_free: kfree_skb(skb); out_unlock: - if (dev) - dev_put(dev); + dev_put(dev); out: return err; } @@ -3158,8 +3193,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, } } - if (dev) - dev_hold(dev); + dev_hold(dev); proto_curr = po->prot_hook.type; dev_curr = po->prot_hook.dev; @@ -3196,8 +3230,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, packet_cached_dev_assign(po, dev); } } - if (dev_curr) - dev_put(dev_curr); + dev_put(dev_curr); if (proto == 0 || !need_rehook) goto out_unlock; @@ -4109,8 +4142,7 @@ static int packet_notifier(struct notifier_block *this, if (msg == NETDEV_UNREGISTER) { packet_cached_dev_reset(po); WRITE_ONCE(po->ifindex, -1); - if (po->prot_hook.dev) - dev_put(po->prot_hook.dev); + dev_put(po->prot_hook.dev); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index ca6ae4c59433..65218b7ce9f9 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -275,8 +275,7 @@ int pn_skb_send(struct sock *sk, struct sk_buff *skb, drop: kfree_skb(skb); - if (dev) - dev_put(dev); + dev_put(dev); return err; } EXPORT_SYMBOL(pn_skb_send); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index ac0fae06cc15..cde671d29d5d 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -122,8 +122,7 @@ struct net_device *phonet_device_get(struct net *net) break; dev = NULL; } - if (dev) - dev_hold(dev); + dev_hold(dev); rcu_read_unlock(); return dev; } @@ -233,11 +232,11 @@ static int phonet_device_autoconf(struct net_device *dev) struct if_phonet_req req; int ret; - if (!dev->netdev_ops->ndo_do_ioctl) + if (!dev->netdev_ops->ndo_siocdevprivate) return -EOPNOTSUPP; - ret = dev->netdev_ops->ndo_do_ioctl(dev, (struct ifreq *)&req, - SIOCPNGAUTOCONF); + ret = dev->netdev_ops->ndo_siocdevprivate(dev, (struct ifreq *)&req, + NULL, SIOCPNGAUTOCONF); if (ret < 0) return ret; @@ -411,8 +410,7 @@ struct net_device *phonet_route_output(struct net *net, u8 daddr) daddr >>= 2; rcu_read_lock(); dev = rcu_dereference(routes->table[daddr]); - if (dev) - dev_hold(dev); + dev_hold(dev); rcu_read_unlock(); if (!dev) diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 2599235d592e..71e2caf6ab85 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -379,8 +379,7 @@ static int pn_socket_ioctl(struct socket *sock, unsigned int cmd, saddr = PN_NO_ADDR; release_sock(sk); - if (dev) - dev_put(dev); + dev_put(dev); if (saddr == PN_NO_ADDR) return -EHOSTUNREACH; diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile index 1b1411d158a7..8e0605f88a73 100644 --- a/net/qrtr/Makefile +++ b/net/qrtr/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_QRTR) := qrtr.o ns.o +obj-$(CONFIG_QRTR) += qrtr.o +qrtr-y := af_qrtr.o ns.o obj-$(CONFIG_QRTR_SMD) += qrtr-smd.o qrtr-smd-y := smd.o diff --git a/net/qrtr/qrtr.c b/net/qrtr/af_qrtr.c index 0c30908628ba..ec2322529727 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/af_qrtr.c @@ -506,8 +506,12 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) if (cb->type == QRTR_TYPE_NEW_SERVER) { /* Remote node endpoint can bridge other distant nodes */ - const struct qrtr_ctrl_pkt *pkt = data + hdrlen; + const struct qrtr_ctrl_pkt *pkt; + if (size < sizeof(*pkt)) + goto err; + + pkt = data + hdrlen; qrtr_node_assign(node, le32_to_cpu(pkt->server.node)); } @@ -1157,14 +1161,14 @@ static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) rc = put_user(len, (int __user *)argp); break; case SIOCGIFADDR: - if (copy_from_user(&ifr, argp, sizeof(ifr))) { + if (get_user_ifreq(&ifr, NULL, argp)) { rc = -EFAULT; break; } sq = (struct sockaddr_qrtr *)&ifr.ifr_addr; *sq = ipc->us; - if (copy_to_user(argp, &ifr, sizeof(ifr))) { + if (put_user_ifreq(&ifr, argp)) { rc = -EFAULT; break; } diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index cf7d974e0f61..30a1cf4c16c6 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -109,7 +109,7 @@ char *rose2asc(char *buf, const rose_address *addr) /* * Compare two ROSE addresses, 0 == equal. */ -int rosecmp(rose_address *addr1, rose_address *addr2) +int rosecmp(const rose_address *addr1, const rose_address *addr2) { int i; @@ -123,7 +123,8 @@ int rosecmp(rose_address *addr1, rose_address *addr2) /* * Compare two ROSE addresses for only mask digits, 0 == equal. */ -int rosecmpm(rose_address *addr1, rose_address *addr2, unsigned short mask) +int rosecmpm(const rose_address *addr1, const rose_address *addr2, + unsigned short mask) { unsigned int i, j; diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c index 051804fbee17..f1a76a5820f1 100644 --- a/net/rose/rose_dev.c +++ b/net/rose/rose_dev.c @@ -66,10 +66,10 @@ static int rose_set_mac_address(struct net_device *dev, void *addr) if (err) return err; - rose_del_loopback_node((rose_address *)dev->dev_addr); + rose_del_loopback_node((const rose_address *)dev->dev_addr); } - memcpy(dev->dev_addr, sa->sa_data, dev->addr_len); + dev_addr_set(dev, sa->sa_data); return 0; } @@ -78,7 +78,7 @@ static int rose_open(struct net_device *dev) { int err; - err = rose_add_loopback_node((rose_address *)dev->dev_addr); + err = rose_add_loopback_node((const rose_address *)dev->dev_addr); if (err) return err; @@ -90,7 +90,7 @@ static int rose_open(struct net_device *dev) static int rose_close(struct net_device *dev) { netif_stop_queue(dev); - rose_del_loopback_node((rose_address *)dev->dev_addr); + rose_del_loopback_node((const rose_address *)dev->dev_addr); return 0; } diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index f6102e6f5161..8b96a56d3a49 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -94,11 +94,11 @@ static void rose_t0timer_expiry(struct timer_list *t) */ static int rose_send_frame(struct sk_buff *skb, struct rose_neigh *neigh) { - ax25_address *rose_call; + const ax25_address *rose_call; ax25_cb *ax25s; if (ax25cmp(&rose_callsign, &null_ax25_address) == 0) - rose_call = (ax25_address *)neigh->dev->dev_addr; + rose_call = (const ax25_address *)neigh->dev->dev_addr; else rose_call = &rose_callsign; @@ -117,11 +117,11 @@ static int rose_send_frame(struct sk_buff *skb, struct rose_neigh *neigh) */ static int rose_link_up(struct rose_neigh *neigh) { - ax25_address *rose_call; + const ax25_address *rose_call; ax25_cb *ax25s; if (ax25cmp(&rose_callsign, &null_ax25_address) == 0) - rose_call = (ax25_address *)neigh->dev->dev_addr; + rose_call = (const ax25_address *)neigh->dev->dev_addr; else rose_call = &rose_callsign; diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index c0e04c261a15..e2e6b6b78578 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -401,7 +401,7 @@ void rose_add_loopback_neigh(void) /* * Add a loopback node. */ -int rose_add_loopback_node(rose_address *address) +int rose_add_loopback_node(const rose_address *address) { struct rose_node *rose_node; int err = 0; @@ -446,7 +446,7 @@ out: /* * Delete a loopback node. */ -void rose_del_loopback_node(rose_address *address) +void rose_del_loopback_node(const rose_address *address) { struct rose_node *rose_node; @@ -629,7 +629,8 @@ struct net_device *rose_dev_get(rose_address *addr) rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { - if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (rose_address *)dev->dev_addr) == 0) { + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && + rosecmp(addr, (const rose_address *)dev->dev_addr) == 0) { dev_hold(dev); goto out; } @@ -646,7 +647,8 @@ static int rose_dev_exists(rose_address *addr) rcu_read_lock(); for_each_netdev_rcu(&init_net, dev) { - if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (rose_address *)dev->dev_addr) == 0) + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && + rosecmp(addr, (const rose_address *)dev->dev_addr) == 0) goto out; } dev = NULL; diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index 0885b22e5c0e..accd35c05577 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -21,6 +21,8 @@ config AF_RXRPC See Documentation/networking/rxrpc.rst. +if AF_RXRPC + config AF_RXRPC_IPV6 bool "IPv6 support for RxRPC" depends on (IPV6 = m && AF_RXRPC = m) || (IPV6 = y && AF_RXRPC) @@ -30,7 +32,6 @@ config AF_RXRPC_IPV6 config AF_RXRPC_INJECT_LOSS bool "Inject packet loss into RxRPC packet stream" - depends on AF_RXRPC help Say Y here to inject packet loss by discarding some received and some transmitted packets. @@ -38,7 +39,6 @@ config AF_RXRPC_INJECT_LOSS config AF_RXRPC_DEBUG bool "RxRPC dynamic debugging" - depends on AF_RXRPC help Say Y here to make runtime controllable debugging messages appear. @@ -47,7 +47,6 @@ config AF_RXRPC_DEBUG config RXKAD bool "RxRPC Kerberos security" - depends on AF_RXRPC select CRYPTO select CRYPTO_MANAGER select CRYPTO_SKCIPHER @@ -58,3 +57,5 @@ config RXKAD through the use of the key retention service. See Documentation/networking/rxrpc.rst. + +endif diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c index 4e565eeab426..be61d6f5be8d 100644 --- a/net/rxrpc/rtt.c +++ b/net/rxrpc/rtt.c @@ -22,7 +22,7 @@ static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer) static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer) { - return _usecs_to_jiffies((peer->srtt_us >> 3) + peer->rttvar_us); + return usecs_to_jiffies((peer->srtt_us >> 3) + peer->rttvar_us); } static u32 rxrpc_bound_rto(u32 rto) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index d17a66aab8ee..3258da3d5bed 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -480,26 +480,28 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, atomic_set(&p->tcfa_bindcnt, 1); if (cpustats) { - p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!p->cpu_bstats) goto err1; - p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!p->cpu_bstats_hw) goto err2; p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); if (!p->cpu_qstats) goto err3; } + gnet_stats_basic_sync_init(&p->tcfa_bstats); + gnet_stats_basic_sync_init(&p->tcfa_bstats_hw); spin_lock_init(&p->tcfa_lock); p->tcfa_index = index; p->tcfa_tm.install = jiffies; p->tcfa_tm.lastuse = jiffies; p->tcfa_tm.firstuse = 0; - p->tcfa_flags = flags; + p->tcfa_flags = flags & TCA_ACT_FLAGS_USER_MASK; if (est) { err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, &p->tcfa_rate_est, - &p->tcfa_lock, NULL, est); + &p->tcfa_lock, false, est); if (err) goto err4; } @@ -941,7 +943,7 @@ void tcf_idr_insert_many(struct tc_action *actions[]) } } -struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, +struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, bool police, bool rtnl_held, struct netlink_ext_ack *extack) { @@ -951,7 +953,7 @@ struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, struct nlattr *kind; int err; - if (name == NULL) { + if (!police) { err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) @@ -967,7 +969,7 @@ struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, return ERR_PTR(err); } } else { - if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) { + if (strlcpy(act_name, "police", IFNAMSIZ) >= IFNAMSIZ) { NL_SET_ERR_MSG(extack, "TC action name too long"); return ERR_PTR(-EINVAL); } @@ -1004,12 +1006,11 @@ struct tc_action_ops *tc_action_load_ops(char *name, struct nlattr *nla, struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, struct nlattr *nla, struct nlattr *est, - char *name, int ovr, int bind, struct tc_action_ops *a_o, int *init_res, - bool rtnl_held, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { - struct nla_bitfield32 flags = { 0, 0 }; + bool police = flags & TCA_ACT_FLAGS_POLICE; + struct nla_bitfield32 userflags = { 0, 0 }; u8 hw_stats = TCA_ACT_HW_STATS_ANY; struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_cookie *cookie = NULL; @@ -1017,7 +1018,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, int err; /* backward compatibility for policer */ - if (name == NULL) { + if (!police) { err = nla_parse_nested_deprecated(tb, TCA_ACT_MAX, nla, tcf_action_policy, extack); if (err < 0) @@ -1032,22 +1033,22 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, } hw_stats = tcf_action_hw_stats_get(tb[TCA_ACT_HW_STATS]); if (tb[TCA_ACT_FLAGS]) - flags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); + userflags = nla_get_bitfield32(tb[TCA_ACT_FLAGS]); - err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, ovr, bind, - rtnl_held, tp, flags.value, extack); + err = a_o->init(net, tb[TCA_ACT_OPTIONS], est, &a, tp, + userflags.value | flags, extack); } else { - err = a_o->init(net, nla, est, &a, ovr, bind, rtnl_held, - tp, flags.value, extack); + err = a_o->init(net, nla, est, &a, tp, userflags.value | flags, + extack); } if (err < 0) goto err_out; *init_res = err; - if (!name && tb[TCA_ACT_COOKIE]) + if (!police && tb[TCA_ACT_COOKIE]) tcf_set_action_cookie(&a->act_cookie, cookie); - if (!name) + if (!police) a->hw_stats = hw_stats; return a; @@ -1063,9 +1064,9 @@ err_out: /* Returns numbers of initialized actions or negative error. */ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, - struct nlattr *est, char *name, int ovr, int bind, - struct tc_action *actions[], int init_res[], size_t *attr_size, - bool rtnl_held, struct netlink_ext_ack *extack) + struct nlattr *est, struct tc_action *actions[], + int init_res[], size_t *attr_size, u32 flags, + struct netlink_ext_ack *extack) { struct tc_action_ops *ops[TCA_ACT_MAX_PRIO] = {}; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; @@ -1082,7 +1083,9 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { struct tc_action_ops *a_o; - a_o = tc_action_load_ops(name, tb[i], rtnl_held, extack); + a_o = tc_action_load_ops(tb[i], flags & TCA_ACT_FLAGS_POLICE, + !(flags & TCA_ACT_FLAGS_NO_RTNL), + extack); if (IS_ERR(a_o)) { err = PTR_ERR(a_o); goto err_mod; @@ -1091,9 +1094,8 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, } for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) { - act = tcf_action_init_1(net, tp, tb[i], est, name, ovr, bind, - ops[i - 1], &init_res[i - 1], rtnl_held, - extack); + act = tcf_action_init_1(net, tp, tb[i], est, ops[i - 1], + &init_res[i - 1], flags, extack); if (IS_ERR(act)) { err = PTR_ERR(act); goto err; @@ -1113,7 +1115,7 @@ int tcf_action_init(struct net *net, struct tcf_proto *tp, struct nlattr *nla, goto err_mod; err: - tcf_action_destroy(actions, bind); + tcf_action_destroy(actions, flags & TCA_ACT_FLAGS_BIND); err_mod: for (i = 0; i < TCA_ACT_MAX_PRIO; i++) { if (ops[i]) @@ -1126,13 +1128,13 @@ void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, u64 drops, bool hw) { if (a->cpu_bstats) { - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); + _bstats_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); this_cpu_ptr(a->cpu_qstats)->drops += drops; if (hw) - _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), - bytes, packets); + _bstats_update(this_cpu_ptr(a->cpu_bstats_hw), + bytes, packets); return; } @@ -1171,9 +1173,10 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, if (err < 0) goto errout; - if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 || - gnet_stats_copy_basic_hw(NULL, &d, p->cpu_bstats_hw, - &p->tcfa_bstats_hw) < 0 || + if (gnet_stats_copy_basic(&d, p->cpu_bstats, + &p->tcfa_bstats, false) < 0 || + gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw, + &p->tcfa_bstats_hw, false) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, &p->tcfa_qstats, @@ -1351,8 +1354,6 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, module_put(ops->owner); err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); - if (err > 0) - return 0; if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send TC action flush notification"); @@ -1423,8 +1424,6 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], ret = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); - if (ret > 0) - return 0; return ret; } @@ -1481,7 +1480,6 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], u32 portid, size_t attr_size, struct netlink_ext_ack *extack) { struct sk_buff *skb; - int err = 0; skb = alloc_skb(attr_size <= NLMSG_GOODSIZE ? NLMSG_GOODSIZE : attr_size, GFP_KERNEL); @@ -1495,15 +1493,12 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], return -EINVAL; } - err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, - n->nlmsg_flags & NLM_F_ECHO); - if (err > 0) - err = 0; - return err; + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); } static int tcf_action_add(struct net *net, struct nlattr *nla, - struct nlmsghdr *n, u32 portid, int ovr, + struct nlmsghdr *n, u32 portid, u32 flags, struct netlink_ext_ack *extack) { size_t attr_size = 0; @@ -1512,8 +1507,8 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, int init_res[TCA_ACT_MAX_PRIO] = {}; for (loop = 0; loop < 10; loop++) { - ret = tcf_action_init(net, NULL, nla, NULL, NULL, ovr, 0, - actions, init_res, &attr_size, true, extack); + ret = tcf_action_init(net, NULL, nla, NULL, actions, init_res, + &attr_size, flags, extack); if (ret != -EAGAIN) break; } @@ -1543,7 +1538,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_ROOT_MAX + 1]; u32 portid = NETLINK_CB(skb).portid; - int ret = 0, ovr = 0; + u32 flags = 0; + int ret = 0; if ((n->nlmsg_type != RTM_GETACTION) && !netlink_capable(skb, CAP_NET_ADMIN)) @@ -1569,8 +1565,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, * is zero) then just set this */ if (n->nlmsg_flags & NLM_F_REPLACE) - ovr = 1; - ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, ovr, + flags = TCA_ACT_FLAGS_REPLACE; + ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, portid, flags, extack); break; case RTM_DELACTION: diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index e409a0005717..f2bf896331a5 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -41,17 +41,17 @@ static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, int action, filter_res; tcf_lastuse_update(&prog->tcf_tm); - bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(prog->common.cpu_bstats), skb); filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(filter, skb); + filter_res = bpf_prog_run(filter, skb); __skb_pull(skb, skb->mac_len); } else { bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(filter, skb); + filter_res = bpf_prog_run(filter, skb); } if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) skb_orphan(skb); @@ -275,11 +275,11 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **act, - int replace, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, bpf_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tcf_bpf_cfg cfg, old; @@ -317,7 +317,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (bind) return 0; - if (!replace) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*act, bind); return -EEXIST; } diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e19885d7fe2c..94e78ac7a748 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -96,12 +96,12 @@ static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, connmark_net_id); struct nlattr *tb[TCA_CONNMARK_MAX + 1]; + bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_chain *goto_ch = NULL; struct tcf_connmark_info *ci; struct tc_connmark *parm; @@ -144,7 +144,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, ci = to_connmark(*a); if (bind) return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 4fa4fcb842ba..a15ec95e69c3 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -41,11 +41,12 @@ static unsigned int csum_net_id; static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, int ovr, - int bind, bool rtnl_held, struct tcf_proto *tp, + struct nlattr *est, struct tc_action **a, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, csum_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_csum_params *params_new; struct nlattr *tb[TCA_CSUM_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -78,7 +79,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, } else if (err > 0) { if (bind)/* dont override defaults */ return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 1b4b3514c94f..90866ae45573 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -960,6 +960,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, tmpl = p->tmpl; tcf_lastuse_update(&c->tcf_tm); + tcf_action_update_bstats(&c->common, skb); if (clear) { qdisc_skb_cb(skb)->post_ct = false; @@ -1049,7 +1050,6 @@ out_push: qdisc_skb_cb(skb)->post_ct = true; out_clear: - tcf_action_update_bstats(&c->common, skb); if (defrag) qdisc_skb_cb(skb)->pkt_len = skb->len; return retval; @@ -1235,11 +1235,11 @@ static int tcf_ct_fill_params(struct net *net, static int tcf_ct_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int replace, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ct_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_ct_params *params = NULL; struct nlattr *tb[TCA_CT_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -1279,7 +1279,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, if (bind) return 0; - if (!replace) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index b20c8ce59905..549374a2d008 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -154,11 +154,11 @@ static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; u32 dscpmask = 0, dscpstatemask, index; struct nlattr *tb[TCA_CTINFO_MAX + 1]; struct tcf_ctinfo_params *cp_new; @@ -221,7 +221,7 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, } else if (err > 0) { if (bind) /* don't override defaults */ return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 73c3926358a0..d8dce173df37 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -52,11 +52,11 @@ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { static int tcf_gact_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gact_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_GACT_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_gact *parm; @@ -109,7 +109,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, } else if (err > 0) { if (bind)/* dont override defaults */ return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index a78cb7965718..7df72a4197a3 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -295,12 +295,12 @@ static void gate_setup_timer(struct tcf_gate *gact, u64 basetime, static int tcf_gate_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, gate_net_id); enum tk_offsets tk_offset = TK_OFFS_TAI; + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_GATE_MAX + 1]; struct tcf_chain *goto_ch = NULL; u64 cycletime = 0, basetime = 0; @@ -364,7 +364,7 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index a2ddea04183a..b757f90a2d58 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -479,11 +479,11 @@ static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb, static int tcf_ife_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, ife_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_IFE_MAX + 1]; struct nlattr *tb2[IFE_META_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -532,7 +532,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, kfree(p); return err; } - err = load_metalist(tb2, rtnl_held); + err = load_metalist(tb2, !(flags & TCA_ACT_FLAGS_NO_RTNL)); if (err) { kfree(p); return err; @@ -560,7 +560,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, return ret; } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); kfree(p); return -EEXIST; @@ -600,7 +600,8 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, } if (tb[TCA_IFE_METALST]) { - err = populate_metalist(ife, tb2, exists, rtnl_held); + err = populate_metalist(ife, tb2, exists, + !(flags & TCA_ACT_FLAGS_NO_RTNL)); if (err) goto metadata_parse_err; } else { @@ -717,7 +718,7 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, u8 *tlv_data; u16 metalen; - bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (skb_at_tc_ingress(skb)) @@ -805,7 +806,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, exceed_mtu = true; } - bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (!metalen) { /* no metadata to send */ diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index ac7297f42355..265b1443e252 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -94,10 +94,11 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - const struct tc_action_ops *ops, int ovr, int bind, + const struct tc_action_ops *ops, struct tcf_proto *tp, u32 flags) { struct tc_action_net *tn = net_generic(net, id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; struct xt_entry_target *td, *t; @@ -154,7 +155,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (bind)/* dont override defaults */ return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } @@ -201,21 +202,21 @@ err1: } static int tcf_ipt_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, int ovr, - int bind, bool rtnl_held, struct tcf_proto *tp, + struct nlattr *est, struct tc_action **a, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, - bind, tp, flags); + return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, + tp, flags); } static int tcf_xt_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, int ovr, - int bind, bool unlocked, struct tcf_proto *tp, + struct nlattr *est, struct tc_action **a, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, - bind, tp, flags); + return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, + tp, flags); } static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 2ef4cd2c848b..d64b0eeccbe4 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -78,8 +78,7 @@ static void tcf_mirred_release(struct tc_action *a) /* last reference to action, no need to lock */ dev = rcu_dereference_protected(m->tcfm_dev, 1); - if (dev) - dev_put(dev); + dev_put(dev); } static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { @@ -91,11 +90,11 @@ static struct tc_action_ops act_mirred_ops; static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mirred_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_MIRRED_MAX + 1]; struct tcf_chain *goto_ch = NULL; bool mac_header_xmit = false; @@ -155,7 +154,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, return ret; } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } @@ -180,8 +179,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, mac_header_xmit = dev_is_mac_header_xmit(dev); dev = rcu_replace_pointer(m->tcfm_dev, dev, lockdep_is_held(&m->tcf_lock)); - if (dev) - dev_put(dev); + dev_put(dev); m->tcfm_mac_header_xmit = mac_header_xmit; } goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index d1486ea496a2..8faa4c58305e 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -59,7 +59,7 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, int ret, mac_len; tcf_lastuse_update(&m->tcf_tm); - bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(m->common.cpu_bstats), skb); /* Ensure 'data' points at mac_header prior calling mpls manipulating * functions. @@ -152,11 +152,11 @@ static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = { static int tcf_mpls_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, mpls_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_MPLS_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tcf_mpls_params *p; @@ -255,7 +255,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 1ebd2a86d980..7dd6b586ba7f 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -34,11 +34,11 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { }; static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, - struct tc_action **a, int ovr, int bind, - bool rtnl_held, struct tcf_proto *tp, + struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, nat_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_NAT_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_nat *parm; @@ -70,7 +70,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, } else if (err > 0) { if (bind) return 0; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index b45304446e13..c6c862c459cc 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -136,11 +136,11 @@ nla_failure: static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_pedit_key *keys = NULL; @@ -198,7 +198,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } else if (err > 0) { if (bind) goto out_free; - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { ret = -EEXIST; goto out_release; } diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 0fab8de176d2..9e77ba8401e5 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -48,11 +48,11 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { static int tcf_police_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { int ret = 0, tcfp_result = TC_ACT_OK, err, size; + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_POLICE_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_police *parm; @@ -97,7 +97,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; spin_lock_init(&(to_police(*a)->tcfp_lock)); - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } @@ -125,7 +125,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, police->common.cpu_bstats, &police->tcf_rate_est, &police->tcf_lock, - NULL, est); + false, est); if (err) goto failure; } else if (tb[TCA_POLICE_AVRATE] && @@ -248,7 +248,7 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, int ret; tcf_lastuse_update(&police->tcf_tm); - bstats_cpu_update(this_cpu_ptr(police->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(police->common.cpu_bstats), skb); ret = READ_ONCE(police->tcf_action); p = rcu_dereference_bh(police->params); diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 6a0c16e4351d..ce859b0e0deb 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -34,11 +34,12 @@ static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { }; static int tcf_sample_init(struct net *net, struct nlattr *nla, - struct nlattr *est, struct tc_action **a, int ovr, - int bind, bool rtnl_held, struct tcf_proto *tp, + struct nlattr *est, struct tc_action **a, + struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, sample_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_SAMPLE_MAX + 1]; struct psample_group *psample_group; u32 psample_group_num, rate, index; @@ -75,7 +76,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, return ret; } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } @@ -162,7 +163,7 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, int retval; tcf_lastuse_update(&s->tcf_tm); - bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(s->common.cpu_bstats), skb); retval = READ_ONCE(s->tcf_action); psample_group = rcu_dereference_bh(s->psample_group); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 726cc956d06f..e617ab4505ca 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -36,7 +36,8 @@ static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, * then it would look like "hello_3" (without quotes) */ pr_info("simple: %s_%llu\n", - (char *)d->tcfd_defdata, d->tcf_bstats.packets); + (char *)d->tcfd_defdata, + u64_stats_read(&d->tcf_bstats.packets)); spin_unlock(&d->tcf_lock); return d->tcf_action; } @@ -85,11 +86,11 @@ static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, simp_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_DEF_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_defact *parm; @@ -147,7 +148,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, tcf_action_set_ctrlact(*a, parm->action, goto_ch); ret = ACT_P_CREATED; } else { - if (!ovr) { + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { err = -EEXIST; goto release_idr; } diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index e5f3fb8b00e3..d30ecbfc8f84 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -31,7 +31,7 @@ static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, int action; tcf_lastuse_update(&d->tcf_tm); - bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); params = rcu_dereference_bh(d->params); action = READ_ONCE(d->tcf_action); @@ -96,11 +96,11 @@ static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = { static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbedit_net_id); + bool bind = act_flags & TCA_ACT_FLAGS_BIND; struct tcf_skbedit_params *params_new; struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -186,7 +186,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, ret = ACT_P_CREATED; } else { d = to_skbedit(*a); - if (!ovr) { + if (!(act_flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 8d17a543cc9f..9b6b52c5e24e 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> +#include <net/inet_ecn.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> @@ -21,36 +22,49 @@ static unsigned int skbmod_net_id; static struct tc_action_ops act_skbmod_ops; -#define MAX_EDIT_LEN ETH_HLEN static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_skbmod *d = to_skbmod(a); - int action; + int action, max_edit_len, err; struct tcf_skbmod_params *p; u64 flags; - int err; tcf_lastuse_update(&d->tcf_tm); - bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); + bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); action = READ_ONCE(d->tcf_action); if (unlikely(action == TC_ACT_SHOT)) goto drop; - if (!skb->dev || skb->dev->type != ARPHRD_ETHER) - return action; + max_edit_len = skb_mac_header_len(skb); + p = rcu_dereference_bh(d->skbmod_p); + flags = p->flags; + + /* tcf_skbmod_init() guarantees "flags" to be one of the following: + * 1. a combination of SKBMOD_F_{DMAC,SMAC,ETYPE} + * 2. SKBMOD_F_SWAPMAC + * 3. SKBMOD_F_ECN + * SKBMOD_F_ECN only works with IP packets; all other flags only work with Ethernet + * packets. + */ + if (flags == SKBMOD_F_ECN) { + switch (skb_protocol(skb, true)) { + case cpu_to_be16(ETH_P_IP): + case cpu_to_be16(ETH_P_IPV6): + max_edit_len += skb_network_header_len(skb); + break; + default: + goto out; + } + } else if (!skb->dev || skb->dev->type != ARPHRD_ETHER) { + goto out; + } - /* XXX: if you are going to edit more fields beyond ethernet header - * (example when you add IP header replacement or vlan swap) - * then MAX_EDIT_LEN needs to change appropriately - */ - err = skb_ensure_writable(skb, MAX_EDIT_LEN); + err = skb_ensure_writable(skb, max_edit_len); if (unlikely(err)) /* best policy is to drop on the floor */ goto drop; - p = rcu_dereference_bh(d->skbmod_p); - flags = p->flags; if (flags & SKBMOD_F_DMAC) ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst); if (flags & SKBMOD_F_SMAC) @@ -66,6 +80,10 @@ static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, ether_addr_copy(eth_hdr(skb)->h_source, (u8 *)tmpaddr); } + if (flags & SKBMOD_F_ECN) + INET_ECN_set_ce(skb); + +out: return action; drop: @@ -82,11 +100,12 @@ static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = { static int tcf_skbmod_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, skbmod_net_id); + bool ovr = flags & TCA_ACT_FLAGS_REPLACE; + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_SKBMOD_MAX + 1]; struct tcf_skbmod_params *p, *p_old; struct tcf_chain *goto_ch = NULL; @@ -129,6 +148,8 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, index = parm->index; if (parm->flags & SKBMOD_F_SWAPMAC) lflags = SKBMOD_F_SWAPMAC; + if (parm->flags & SKBMOD_F_ECN) + lflags = SKBMOD_F_ECN; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 85c0d0d5b9da..d9cd174eecb7 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -355,11 +355,11 @@ static void tunnel_key_release_params(struct tcf_tunnel_key_params *p) static int tunnel_key_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + bool bind = act_flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; struct tcf_tunnel_key_params *params_new; struct metadata_dst *metadata = NULL; @@ -504,7 +504,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(act_flags & TCA_ACT_FLAGS_REPLACE)) { NL_SET_ERR_MSG(extack, "TC IDR already exists"); ret = -EEXIST; goto release_tun_meta; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 71f2015c70ca..e4dc5a555bd8 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -114,11 +114,11 @@ static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, - int ovr, int bind, bool rtnl_held, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, vlan_net_id); + bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_VLAN_MAX + 1]; struct tcf_chain *goto_ch = NULL; bool push_prio_exists = false; @@ -223,7 +223,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, } ret = ACT_P_CREATED; - } else if (!ovr) { + } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index e3e79e9bd706..2ef8f5a6205a 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -634,6 +634,7 @@ static void tcf_block_offload_init(struct flow_block_offload *bo, bo->block_shared = shared; bo->extack = extack; bo->sch = sch; + bo->cb_list_head = &flow_block->cb_list; INIT_LIST_HEAD(&bo->cb_list); } @@ -1577,21 +1578,11 @@ reset: #endif } -int tcf_classify(struct sk_buff *skb, const struct tcf_proto *tp, +int tcf_classify(struct sk_buff *skb, + const struct tcf_block *block, + const struct tcf_proto *tp, struct tcf_result *res, bool compat_mode) { - u32 last_executed_chain = 0; - - return __tcf_classify(skb, tp, tp, res, compat_mode, - &last_executed_chain); -} -EXPORT_SYMBOL(tcf_classify); - -int tcf_classify_ingress(struct sk_buff *skb, - const struct tcf_block *ingress_block, - const struct tcf_proto *tp, - struct tcf_result *res, bool compat_mode) -{ #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) u32 last_executed_chain = 0; @@ -1603,20 +1594,22 @@ int tcf_classify_ingress(struct sk_buff *skb, struct tc_skb_ext *ext; int ret; - ext = skb_ext_find(skb, TC_SKB_EXT); + if (block) { + ext = skb_ext_find(skb, TC_SKB_EXT); - if (ext && ext->chain) { - struct tcf_chain *fchain; + if (ext && ext->chain) { + struct tcf_chain *fchain; - fchain = tcf_chain_lookup_rcu(ingress_block, ext->chain); - if (!fchain) - return TC_ACT_SHOT; + fchain = tcf_chain_lookup_rcu(block, ext->chain); + if (!fchain) + return TC_ACT_SHOT; - /* Consume, so cloned/redirect skbs won't inherit ext */ - skb_ext_del(skb, TC_SKB_EXT); + /* Consume, so cloned/redirect skbs won't inherit ext */ + skb_ext_del(skb, TC_SKB_EXT); - tp = rcu_dereference_bh(fchain->filter_chain); - last_executed_chain = fchain->index; + tp = rcu_dereference_bh(fchain->filter_chain); + last_executed_chain = fchain->index; + } } ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode, @@ -1635,7 +1628,7 @@ int tcf_classify_ingress(struct sk_buff *skb, return ret; #endif } -EXPORT_SYMBOL(tcf_classify_ingress); +EXPORT_SYMBOL(tcf_classify); struct tcf_chain_info { struct tcf_proto __rcu **pprev; @@ -1870,13 +1863,10 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, } if (unicast) - err = netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + err = rtnl_unicast(skb, net, portid); else err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); - - if (err > 0) - err = 0; return err; } @@ -1909,15 +1899,13 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, } if (unicast) - err = netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + err = rtnl_unicast(skb, net, portid); else err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send filter delete notification"); - if (err > 0) - err = 0; return err; } @@ -1962,6 +1950,7 @@ static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, int err; int tp_created; bool rtnl_held = false; + u32 flags; if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -1982,6 +1971,7 @@ replay: tp = NULL; cl = 0; block = NULL; + flags = 0; if (prio == 0) { /* If no priority is provided by the user, @@ -2125,9 +2115,12 @@ replay: goto errout; } + if (!(n->nlmsg_flags & NLM_F_CREATE)) + flags |= TCA_ACT_FLAGS_REPLACE; + if (!rtnl_held) + flags |= TCA_ACT_FLAGS_NO_RTNL; err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, - n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE, - rtnl_held, extack); + flags, extack); if (err == 0) { tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, false, rtnl_held); @@ -2711,13 +2704,11 @@ static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, } if (unicast) - err = netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + err = rtnl_unicast(skb, net, portid); else err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); - if (err > 0) - err = 0; return err; } @@ -2741,7 +2732,7 @@ static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops, } if (unicast) - return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT); + return rtnl_unicast(skb, net, portid); return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO); } @@ -3035,8 +3026,8 @@ void tcf_exts_destroy(struct tcf_exts *exts) EXPORT_SYMBOL(tcf_exts_destroy); int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, - struct nlattr *rate_tlv, struct tcf_exts *exts, bool ovr, - bool rtnl_held, struct netlink_ext_ack *extack) + struct nlattr *rate_tlv, struct tcf_exts *exts, + u32 flags, struct netlink_ext_ack *extack) { #ifdef CONFIG_NET_CLS_ACT { @@ -3047,13 +3038,15 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, if (exts->police && tb[exts->police]) { struct tc_action_ops *a_o; - a_o = tc_action_load_ops("police", tb[exts->police], rtnl_held, extack); + a_o = tc_action_load_ops(tb[exts->police], true, + !(flags & TCA_ACT_FLAGS_NO_RTNL), + extack); if (IS_ERR(a_o)) return PTR_ERR(a_o); + flags |= TCA_ACT_FLAGS_POLICE | TCA_ACT_FLAGS_BIND; act = tcf_action_init_1(net, tp, tb[exts->police], - rate_tlv, "police", ovr, - TCA_ACT_BIND, a_o, init_res, - rtnl_held, extack); + rate_tlv, a_o, init_res, flags, + extack); module_put(a_o->owner); if (IS_ERR(act)) return PTR_ERR(act); @@ -3065,10 +3058,10 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp, struct nlattr **tb, } else if (exts->action && tb[exts->action]) { int err; + flags |= TCA_ACT_FLAGS_BIND; err = tcf_action_init(net, tp, tb[exts->action], - rate_tlv, NULL, ovr, TCA_ACT_BIND, - exts->actions, init_res, - &attr_size, rtnl_held, extack); + rate_tlv, exts->actions, init_res, + &attr_size, flags, extack); if (err < 0) return err; exts->nr_actions = err; @@ -3832,7 +3825,7 @@ struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, stru fl = rcu_dereference_bh(qe->filter_chain); - switch (tcf_classify(skb, fl, &cl_res, false)) { + switch (tcf_classify(skb, NULL, fl, &cl_res, false)) { case TC_ACT_SHOT: qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index f256a7c69093..8158fc9ee1ab 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -145,12 +145,12 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = { static int basic_set_parms(struct net *net, struct tcf_proto *tp, struct basic_filter *f, unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr, + struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, true, extack); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, flags, extack); if (err < 0) return err; @@ -169,8 +169,8 @@ static int basic_set_parms(struct net *net, struct tcf_proto *tp, static int basic_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr, - bool rtnl_held, struct netlink_ext_ack *extack) + struct nlattr **tca, void **arg, + u32 flags, struct netlink_ext_ack *extack) { int err; struct basic_head *head = rtnl_dereference(tp->root); @@ -216,7 +216,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, goto errout; } - err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr, + err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], flags, extack); if (err < 0) { if (!fold) diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index fa739efa59f4..df19a847829e 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -96,11 +96,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(prog->filter, skb); + filter_res = bpf_prog_run(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { bpf_compute_data_pointers(skb); - filter_res = BPF_PROG_RUN(prog->filter, skb); + filter_res = bpf_prog_run(prog->filter, skb); } if (prog->exts_integrated) { @@ -404,7 +404,7 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, struct cls_bpf_prog *prog, unsigned long base, - struct nlattr **tb, struct nlattr *est, bool ovr, + struct nlattr **tb, struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { bool is_bpf, is_ebpf, have_exts = false; @@ -416,7 +416,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) return -EINVAL; - ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, ovr, true, + ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, flags, extack); if (ret < 0) return ret; @@ -455,7 +455,7 @@ static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); @@ -500,7 +500,7 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, goto errout; prog->handle = handle; - ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], ovr, + ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], flags, extack); if (ret < 0) goto errout_idr; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index fb881144fa01..ed00001b528a 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -76,7 +76,7 @@ static void cls_cgroup_destroy_work(struct work_struct *work) static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_CGROUP_MAX + 1]; @@ -108,8 +108,8 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto errout; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, ovr, - true, extack); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &new->exts, flags, + extack); if (err < 0) goto errout; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 87398af2715a..972303aa8edd 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -387,7 +387,7 @@ static void flow_destroy_filter_work(struct work_struct *work) static int flow_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct flow_head *head = rtnl_dereference(tp->root); @@ -442,8 +442,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto err2; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, ovr, - true, extack); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &fnew->exts, flags, + extack); if (err < 0) goto err2; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index d7869a984881..aab13ba11767 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -329,7 +329,8 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, ARRAY_SIZE(fl_ct_info_to_flower_map), post_ct); skb_flow_dissect_hash(skb, &mask->dissector, &skb_key); - skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); + skb_flow_dissect(skb, &mask->dissector, &skb_key, + FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP); f = fl_mask_lookup(mask, &skb_key); if (f && !tc_skip_sw(f->flags)) { @@ -1915,23 +1916,22 @@ errout_cleanup: static int fl_set_parms(struct net *net, struct tcf_proto *tp, struct cls_fl_filter *f, struct fl_flow_mask *mask, unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr, - struct fl_flow_tmplt *tmplt, bool rtnl_held, + struct nlattr *est, + struct fl_flow_tmplt *tmplt, u32 flags, struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, rtnl_held, - extack); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, flags, extack); if (err < 0) return err; if (tb[TCA_FLOWER_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]); - if (!rtnl_held) + if (flags & TCA_ACT_FLAGS_NO_RTNL) rtnl_lock(); tcf_bind_filter(tp, &f->res, base); - if (!rtnl_held) + if (flags & TCA_ACT_FLAGS_NO_RTNL) rtnl_unlock(); } @@ -1975,10 +1975,11 @@ static int fl_ht_insert_unique(struct cls_fl_filter *fnew, static int fl_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct cls_fl_head *head = fl_head_dereference(tp); + bool rtnl_held = !(flags & TCA_ACT_FLAGS_NO_RTNL); struct cls_fl_filter *fold = *arg; struct cls_fl_filter *fnew; struct fl_flow_mask *mask; @@ -2034,8 +2035,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } } - err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE], ovr, - tp->chain->tmplt_priv, rtnl_held, extack); + err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE], + tp->chain->tmplt_priv, flags, extack); if (err) goto errout; @@ -2188,18 +2189,24 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg, arg->count = arg->skip; + rcu_read_lock(); idr_for_each_entry_continue_ul(&head->handle_idr, f, tmp, id) { /* don't return filters that are being deleted */ if (!refcount_inc_not_zero(&f->refcnt)) continue; + rcu_read_unlock(); + if (arg->fn(tp, f, arg) < 0) { __fl_put(f); arg->stop = 1; + rcu_read_lock(); break; } __fl_put(f); arg->count++; + rcu_read_lock(); } + rcu_read_unlock(); arg->cookie = id; } diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index ec945294626a..8654b0ce997c 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -198,15 +198,15 @@ static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { static int fw_set_parms(struct net *net, struct tcf_proto *tp, struct fw_filter *f, struct nlattr **tb, - struct nlattr **tca, unsigned long base, bool ovr, + struct nlattr **tca, unsigned long base, u32 flags, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); u32 mask; int err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, ovr, - true, extack); + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &f->exts, flags, + extack); if (err < 0) return err; @@ -237,8 +237,7 @@ static int fw_set_parms(struct net *net, struct tcf_proto *tp, static int fw_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, - bool ovr, bool rtnl_held, - struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = *arg; @@ -277,7 +276,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, return err; } - err = fw_set_parms(net, tp, fnew, tb, tca, base, ovr, extack); + err = fw_set_parms(net, tp, fnew, tb, tca, base, flags, extack); if (err < 0) { tcf_exts_destroy(&fnew->exts); kfree(fnew); @@ -326,7 +325,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, f->id = handle; f->tp = tp; - err = fw_set_parms(net, tp, f, tb, tca, base, ovr, extack); + err = fw_set_parms(net, tp, f, tb, tca, base, flags, extack); if (err < 0) goto errout; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index cafb84480bab..24f0046ce0b3 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -163,13 +163,12 @@ static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = { static int mall_set_parms(struct net *net, struct tcf_proto *tp, struct cls_mall_head *head, unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr, + struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &head->exts, ovr, true, - extack); + err = tcf_exts_validate(net, tp, tb, est, &head->exts, flags, extack); if (err < 0) return err; @@ -183,13 +182,13 @@ static int mall_set_parms(struct net *net, struct tcf_proto *tp, static int mall_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct cls_mall_head *head = rtnl_dereference(tp->root); struct nlattr *tb[TCA_MATCHALL_MAX + 1]; struct cls_mall_head *new; - u32 flags = 0; + u32 userflags = 0; int err; if (!tca[TCA_OPTIONS]) @@ -204,8 +203,8 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, return err; if (tb[TCA_MATCHALL_FLAGS]) { - flags = nla_get_u32(tb[TCA_MATCHALL_FLAGS]); - if (!tc_flags_valid(flags)) + userflags = nla_get_u32(tb[TCA_MATCHALL_FLAGS]); + if (!tc_flags_valid(userflags)) return -EINVAL; } @@ -220,14 +219,14 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (!handle) handle = 1; new->handle = handle; - new->flags = flags; + new->flags = userflags; new->pf = alloc_percpu(struct tc_matchall_pcnt); if (!new->pf) { err = -ENOMEM; goto err_alloc_percpu; } - err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr, + err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], flags, extack); if (err) goto err_set_parms; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 5efa3e7ace15..a35ab8c27866 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -382,7 +382,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct route4_filter *f, u32 handle, struct route4_head *head, struct nlattr **tb, struct nlattr *est, int new, - bool ovr, struct netlink_ext_ack *extack) + u32 flags, struct netlink_ext_ack *extack) { u32 id = 0, to = 0, nhandle = 0x8000; struct route4_filter *fp; @@ -390,7 +390,7 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, struct route4_bucket *b; int err; - err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr, true, extack); + err = tcf_exts_validate(net, tp, tb, est, &f->exts, flags, extack); if (err < 0) return err; @@ -464,8 +464,8 @@ static int route4_set_parms(struct net *net, struct tcf_proto *tp, static int route4_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr, - bool rtnl_held, struct netlink_ext_ack *extack) + struct nlattr **tca, void **arg, u32 flags, + struct netlink_ext_ack *extack) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter __rcu **fp; @@ -510,7 +510,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, } err = route4_set_parms(net, tp, base, f, handle, head, tb, - tca[TCA_RATE], new, ovr, extack); + tca[TCA_RATE], new, flags, extack); if (err < 0) goto errout; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 27a4b6dbcf57..5cd9d6b143c4 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -470,9 +470,8 @@ static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = { static int rsvp_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, - u32 handle, - struct nlattr **tca, - void **arg, bool ovr, bool rtnl_held, + u32 handle, struct nlattr **tca, + void **arg, u32 flags, struct netlink_ext_ack *extack) { struct rsvp_head *data = rtnl_dereference(tp->root); @@ -499,7 +498,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, err = tcf_exts_init(&e, net, TCA_RSVP_ACT, TCA_RSVP_POLICE); if (err < 0) return err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, ovr, true, + err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, flags, extack); if (err < 0) goto errout2; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index e9a8a2c86bbd..742c7d49a958 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -330,7 +330,7 @@ static int tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, u32 handle, struct tcindex_data *p, struct tcindex_filter_result *r, struct nlattr **tb, - struct nlattr *est, bool ovr, struct netlink_ext_ack *extack) + struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { struct tcindex_filter_result new_filter_result, *old_r = r; struct tcindex_data *cp = NULL, *oldp; @@ -342,7 +342,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, err = tcf_exts_init(&e, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); if (err < 0) return err; - err = tcf_exts_validate(net, tp, tb, est, &e, ovr, true, extack); + err = tcf_exts_validate(net, tp, tb, est, &e, flags, extack); if (err < 0) goto errout; @@ -529,8 +529,8 @@ errout: static int tcindex_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr, - bool rtnl_held, struct netlink_ext_ack *extack) + struct nlattr **tca, void **arg, u32 flags, + struct netlink_ext_ack *extack) { struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_TCINDEX_MAX + 1]; @@ -551,7 +551,7 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, return err; return tcindex_set_parms(net, tp, base, handle, p, r, tb, - tca[TCA_RATE], ovr, extack); + tca[TCA_RATE], flags, extack); } static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker, diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 6e1abe805448..4272814487f0 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -709,12 +709,12 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { static int u32_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, struct tc_u_knode *n, struct nlattr **tb, - struct nlattr *est, bool ovr, + struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) { int err; - err = tcf_exts_validate(net, tp, tb, est, &n->exts, ovr, true, extack); + err = tcf_exts_validate(net, tp, tb, est, &n->exts, flags, extack); if (err < 0) return err; @@ -840,7 +840,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, static int u32_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, bool ovr, bool rtnl_held, + struct nlattr **tca, void **arg, u32 flags, struct netlink_ext_ack *extack) { struct tc_u_common *tp_c = tp->data; @@ -849,7 +849,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, struct tc_u32_sel *s; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_U32_MAX + 1]; - u32 htid, flags = 0; + u32 htid, userflags = 0; size_t sel_size; int err; @@ -868,8 +868,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return err; if (tb[TCA_U32_FLAGS]) { - flags = nla_get_u32(tb[TCA_U32_FLAGS]); - if (!tc_flags_valid(flags)) { + userflags = nla_get_u32(tb[TCA_U32_FLAGS]); + if (!tc_flags_valid(userflags)) { NL_SET_ERR_MSG_MOD(extack, "Invalid filter flags"); return -EINVAL; } @@ -884,7 +884,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; } - if ((n->flags ^ flags) & + if ((n->flags ^ userflags) & ~(TCA_CLS_FLAGS_IN_HW | TCA_CLS_FLAGS_NOT_IN_HW)) { NL_SET_ERR_MSG_MOD(extack, "Key node flags do not match passed flags"); return -EINVAL; @@ -895,7 +895,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return -ENOMEM; err = u32_set_parms(net, tp, base, new, tb, - tca[TCA_RATE], ovr, extack); + tca[TCA_RATE], flags, extack); if (err) { u32_destroy_key(new, false); @@ -955,9 +955,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, ht->handle = handle; ht->prio = tp->prio; idr_init(&ht->handle_idr); - ht->flags = flags; + ht->flags = userflags; - err = u32_replace_hw_hnode(tp, ht, flags, extack); + err = u32_replace_hw_hnode(tp, ht, userflags, extack); if (err) { idr_remove(&tp_c->handle_idr, handle); kfree(ht); @@ -1038,7 +1038,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, RCU_INIT_POINTER(n->ht_up, ht); n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; - n->flags = flags; + n->flags = userflags; err = tcf_exts_init(&n->exts, net, TCA_U32_ACT, TCA_U32_POLICE); if (err < 0) @@ -1060,7 +1060,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, } #endif - err = u32_set_parms(net, tp, base, n, tb, tca[TCA_RATE], ovr, + err = u32_set_parms(net, tp, base, n, tb, tca[TCA_RATE], flags, extack); if (err == 0) { struct tc_u_knode __rcu **ins; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 46254968d390..0a04468b7314 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -457,7 +457,7 @@ META_COLLECTOR(int_sk_fwd_alloc) *err = -1; return; } - dst->value = sk->sk_forward_alloc; + dst->value = sk_forward_alloc_get(sk); } META_COLLECTOR(int_sk_sndbuf) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index f87d07736a14..efcd0b5e9a32 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -507,20 +507,27 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt, list_for_each_entry(stab, &qdisc_stab_list, list) { if (memcmp(&stab->szopts, s, sizeof(*s))) continue; - if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16))) + if (tsize > 0 && + memcmp(stab->data, tab, flex_array_size(stab, data, tsize))) continue; stab->refcnt++; return stab; } - stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL); + if (s->size_log > STAB_SIZE_LOG_MAX || + s->cell_log > STAB_SIZE_LOG_MAX) { + NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table"); + return ERR_PTR(-EINVAL); + } + + stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL); if (!stab) return ERR_PTR(-ENOMEM); stab->refcnt = 1; stab->szopts = *s; if (tsize > 0) - memcpy(stab->data, tab, tsize * sizeof(u16)); + memcpy(stab->data, tab, flex_array_size(stab, data, tsize)); list_add_tail(&stab->list, &qdisc_stab_list); @@ -878,7 +885,7 @@ static void qdisc_offload_graft_root(struct net_device *dev, static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, u32 portid, u32 seq, u16 flags, int event) { - struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; + struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL; struct gnet_stats_queue __percpu *cpu_qstats = NULL; struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -936,8 +943,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, cpu_qstats = q->cpu_qstats; } - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q), - &d, cpu_bstats, &q->bstats) < 0 || + if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 || gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) goto nla_put_failure; @@ -1258,26 +1264,17 @@ static struct Qdisc *qdisc_create(struct net_device *dev, rcu_assign_pointer(sch->stab, stab); } if (tca[TCA_RATE]) { - seqcount_t *running; - err = -EOPNOTSUPP; if (sch->flags & TCQ_F_MQROOT) { NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc"); goto err_out4; } - if (sch->parent != TC_H_ROOT && - !(sch->flags & TCQ_F_INGRESS) && - (!p || !(p->flags & TCQ_F_MQROOT))) - running = qdisc_root_sleeping_running(sch); - else - running = &sch->running; - err = gen_new_estimator(&sch->bstats, sch->cpu_bstats, &sch->rate_est, NULL, - running, + true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to generate new estimator"); @@ -1353,7 +1350,7 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca, sch->cpu_bstats, &sch->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); } out: @@ -1845,7 +1842,6 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; - int err = 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) @@ -1856,11 +1852,8 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, return -EINVAL; } - err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, - n->nlmsg_flags & NLM_F_ECHO); - if (err > 0) - err = 0; - return err; + return rtnetlink_send(skb, net, portid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); } static int tclass_del_notify(struct net *net, @@ -1894,8 +1887,6 @@ static int tclass_del_notify(struct net *net, err = rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); - if (err > 0) - err = 0; return err; } diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index d0c9a57398fc..4c8e994cf0a5 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -52,7 +52,7 @@ struct atm_flow_data { struct atm_qdisc_data *parent; /* parent qdisc */ struct socket *sock; /* for closing */ int ref; /* reference count */ - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct list_head list; struct atm_flow_data *excess; /* flow for excess traffic; @@ -394,7 +394,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, list_for_each_entry(flow, &p->flows, list) { fl = rcu_dereference_bh(flow->filter_list); if (fl) { - result = tcf_classify(skb, fl, &res, true); + result = tcf_classify(skb, NULL, fl, &res, true); if (result < 0) continue; flow = (struct atm_flow_data *)res.class; @@ -548,6 +548,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt, pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); INIT_LIST_HEAD(&p->flows); INIT_LIST_HEAD(&p->link.list); + gnet_stats_basic_sync_init(&p->link.bstats); list_add(&p->link.list, &p->flows); p->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle, extack); @@ -652,8 +653,7 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, { struct atm_flow_data *flow = (struct atm_flow_data *)arg; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &flow->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &flow->bstats, true) < 0 || gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0) return -1; diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 28af8b1e1bb1..3c2300d14468 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1665,7 +1665,7 @@ static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, goto hash; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tcf_classify(skb, filter, &res, false); + result = tcf_classify(skb, NULL, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index b79a7e27bb31..02d9f0dfe356 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -116,7 +116,7 @@ struct cbq_class { long avgidle; long deficit; /* Saved deficit for WRR */ psched_time_t penalized; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct tc_cbq_xstats xstats; @@ -228,7 +228,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) /* * Step 2+n. Apply classifier. */ - result = tcf_classify(skb, fl, &res, true); + result = tcf_classify(skb, NULL, fl, &res, true); if (!fl || result < 0) goto fallback; @@ -565,8 +565,7 @@ cbq_update(struct cbq_sched_data *q) long avgidle = cl->avgidle; long idle; - cl->bstats.packets++; - cl->bstats.bytes += len; + _bstats_update(&cl->bstats, len, 1); /* * (now - last) is total time between packet right edges. @@ -1384,8 +1383,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (cl->undertime != PSCHED_PASTPERFECT) cl->xstats.undertime = cl->undertime - q->now; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) return -1; @@ -1519,7 +1517,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator"); @@ -1611,17 +1609,16 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (cl == NULL) goto failure; + gnet_stats_basic_sync_init(&cl->bstats); err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); - return err; + goto failure; } if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), - tca[TCA_RATE]); + NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Couldn't create new estimator"); tcf_block_put(cl->block); diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index fc1e47069593..18e4f7a0b291 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -19,7 +19,7 @@ struct drr_class { struct Qdisc_class_common common; unsigned int filter_cnt; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct list_head alist; @@ -85,8 +85,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), + NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace estimator"); @@ -106,6 +105,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; + gnet_stats_basic_sync_init(&cl->bstats); cl->common.classid = classid; cl->quantum = quantum; cl->qdisc = qdisc_create_dflt(sch->dev_queue, @@ -118,9 +118,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), - tca[TCA_RATE]); + NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to replace estimator"); qdisc_put(cl->qdisc); @@ -267,8 +265,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, if (qlen) xstats.deficit = cl->deficit; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, cl_q->cpu_qstats, &cl_q->qstats, qlen) < 0) return -1; @@ -317,7 +314,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tcf_classify(skb, fl, &res, false); + result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index d320bcfb2da2..4c100d105269 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -242,7 +242,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, else { struct tcf_result res; struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result = tcf_classify(skb, fl, &res, false); + int result = tcf_classify(skb, NULL, fl, &res, false); pr_debug("result %d class 0x%04x\n", result, res.classid); diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index c76701ac35ab..0eae9ff5edf6 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -41,7 +41,7 @@ struct ets_class { struct Qdisc *qdisc; u32 quantum; u32 deficit; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; }; @@ -325,8 +325,7 @@ static int ets_class_dump_stats(struct Qdisc *sch, unsigned long arg, struct ets_class *cl = ets_class_from_arg(sch, arg); struct Qdisc *cl_q = cl->qdisc; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; @@ -390,7 +389,7 @@ static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { fl = rcu_dereference_bh(q->filter_list); - err = tcf_classify(skb, fl, &res, false); + err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: @@ -661,7 +660,6 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, q->nbands = nbands; for (i = nstrict; i < q->nstrict; i++) { - INIT_LIST_HEAD(&q->classes[i].alist); if (q->classes[i].qdisc->q.qlen) { list_add_tail(&q->classes[i].alist, &q->active); q->classes[i].deficit = quanta[i]; @@ -687,7 +685,11 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, ets_offload_change(sch); for (i = q->nbands; i < oldbands; i++) { qdisc_put(q->classes[i].qdisc); - memset(&q->classes[i], 0, sizeof(q->classes[i])); + q->classes[i].qdisc = NULL; + q->classes[i].quantum = 0; + q->classes[i].deficit = 0; + gnet_stats_basic_sync_init(&q->classes[i].bstats); + memset(&q->classes[i].qstats, 0, sizeof(q->classes[i].qstats)); } return 0; } @@ -696,7 +698,7 @@ static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct ets_sched *q = qdisc_priv(sch); - int err; + int err, i; if (!opt) return -EINVAL; @@ -706,6 +708,9 @@ static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt, return err; INIT_LIST_HEAD(&q->active); + for (i = 0; i < TCQ_ETS_MAX_BANDS; i++) + INIT_LIST_HEAD(&q->classes[i].alist); + return ets_qdisc_change(sch, opt, extack); } diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index a579a4131d22..e1040421b797 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -233,6 +233,9 @@ int fifo_set_limit(struct Qdisc *q, unsigned int limit) if (strncmp(q->ops->id + 1, "fifo", 4) != 0) return 0; + if (!q->ops->change) + return 0; + nla = kmalloc(nla_attr_size(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); if (nla) { nla->nla_type = RTM_NEWQDISC; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index bbd5f8753600..839e1235db05 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -91,7 +91,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, return fq_codel_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tcf_classify(skb, filter, &res, false); + result = tcf_classify(skb, NULL, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -362,6 +362,8 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { [TCA_FQ_CODEL_CE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_CODEL_DROP_BATCH_SIZE] = { .type = NLA_U32 }, [TCA_FQ_CODEL_MEMORY_LIMIT] = { .type = NLA_U32 }, + [TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR] = { .type = NLA_U8 }, + [TCA_FQ_CODEL_CE_THRESHOLD_MASK] = { .type = NLA_U8 }, }; static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, @@ -369,6 +371,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, { struct fq_codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; + u32 quantum = 0; int err; if (!opt) @@ -386,6 +389,13 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, q->flows_cnt > 65536) return -EINVAL; } + if (tb[TCA_FQ_CODEL_QUANTUM]) { + quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); + if (quantum > FQ_CODEL_QUANTUM_MAX) { + NL_SET_ERR_MSG(extack, "Invalid quantum"); + return -EINVAL; + } + } sch_tree_lock(sch); if (tb[TCA_FQ_CODEL_TARGET]) { @@ -400,6 +410,11 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, q->cparams.ce_threshold = (val * NSEC_PER_USEC) >> CODEL_SHIFT; } + if (tb[TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR]) + q->cparams.ce_threshold_selector = nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR]); + if (tb[TCA_FQ_CODEL_CE_THRESHOLD_MASK]) + q->cparams.ce_threshold_mask = nla_get_u8(tb[TCA_FQ_CODEL_CE_THRESHOLD_MASK]); + if (tb[TCA_FQ_CODEL_INTERVAL]) { u64 interval = nla_get_u32(tb[TCA_FQ_CODEL_INTERVAL]); @@ -412,8 +427,8 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_CODEL_ECN]) q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]); - if (tb[TCA_FQ_CODEL_QUANTUM]) - q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); + if (quantum) + q->quantum = quantum; if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]) q->drop_batch_size = max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])); @@ -536,10 +551,15 @@ static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) q->flows_cnt)) goto nla_put_failure; - if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD && - nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD, - codel_time_to_us(q->cparams.ce_threshold))) - goto nla_put_failure; + if (q->cparams.ce_threshold != CODEL_DISABLED_THRESHOLD) { + if (nla_put_u32(skb, TCA_FQ_CODEL_CE_THRESHOLD, + codel_time_to_us(q->cparams.ce_threshold))) + goto nla_put_failure; + if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_SELECTOR, q->cparams.ce_threshold_selector)) + goto nla_put_failure; + if (nla_put_u8(skb, TCA_FQ_CODEL_CE_THRESHOLD_MASK, q->cparams.ce_threshold_mask)) + goto nla_put_failure; + } return nla_nest_end(skb, opts); diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index cac684952edc..830f3559f727 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -94,7 +94,7 @@ static unsigned int fq_pie_classify(struct sk_buff *skb, struct Qdisc *sch, return fq_pie_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tcf_classify(skb, filter, &res, false); + result = tcf_classify(skb, NULL, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index a8dd06c74e31..3b0f62095803 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -304,8 +304,8 @@ trace: /* * Transmit possibly several skbs, and handle the return status as - * required. Owning running seqcount bit guarantees that - * only one CPU can execute this function. + * required. Owning qdisc running bit guarantees that only one CPU + * can execute this function. * * Returns to the caller: * false - hardware queue frozen backoff @@ -606,7 +606,6 @@ struct Qdisc noop_qdisc = { .ops = &noop_qdisc_ops, .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, - .running = SEQCNT_ZERO(noop_qdisc.running), .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), .gso_skb = { .next = (struct sk_buff *)&noop_qdisc.gso_skb, @@ -867,7 +866,6 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { EXPORT_SYMBOL(pfifo_fast_ops); static struct lock_class_key qdisc_tx_busylock; -static struct lock_class_key qdisc_running_key; struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, @@ -892,11 +890,12 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, __skb_queue_head_init(&sch->gso_skb); __skb_queue_head_init(&sch->skb_bad_txq); qdisc_skb_head_init(&sch->q); + gnet_stats_basic_sync_init(&sch->bstats); spin_lock_init(&sch->q.lock); if (ops->static_flags & TCQ_F_CPUSTATS) { sch->cpu_bstats = - netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); if (!sch->cpu_bstats) goto errout1; @@ -916,10 +915,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, lockdep_set_class(&sch->seqlock, dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); - seqcount_init(&sch->running); - lockdep_set_class(&sch->running, - dev->qdisc_running_key ?: &qdisc_running_key); - sch->ops = ops; sch->flags = ops->static_flags; sch->enqueue = ops->enqueue; @@ -1330,6 +1325,39 @@ static int qdisc_change_tx_queue_len(struct net_device *dev, return 0; } +void dev_qdisc_change_real_num_tx(struct net_device *dev, + unsigned int new_real_tx) +{ + struct Qdisc *qdisc = dev->qdisc; + + if (qdisc->ops->change_real_num_tx) + qdisc->ops->change_real_num_tx(qdisc, new_real_tx); +} + +void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx) +{ +#ifdef CONFIG_NET_SCHED + struct net_device *dev = qdisc_dev(sch); + struct Qdisc *qdisc; + unsigned int i; + + for (i = new_real_tx; i < dev->real_num_tx_queues; i++) { + qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping; + /* Only update the default qdiscs we created, + * qdiscs with handles are always hashed. + */ + if (qdisc != &noop_qdisc && !qdisc->handle) + qdisc_hash_del(qdisc); + } + for (i = dev->real_num_tx_queues; i < new_real_tx; i++) { + qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping; + if (qdisc != &noop_qdisc && !qdisc->handle) + qdisc_hash_add(qdisc, false); + } +#endif +} +EXPORT_SYMBOL(mq_change_real_num_tx); + int dev_qdisc_change_tx_queue_len(struct net_device *dev) { bool up = dev->flags & IFF_UP; @@ -1459,10 +1487,6 @@ void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64) } EXPORT_SYMBOL(psched_ppscfg_precompute); -static void mini_qdisc_rcu_func(struct rcu_head *head) -{ -} - void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, struct tcf_proto *tp_head) { @@ -1475,28 +1499,30 @@ void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp, if (!tp_head) { RCU_INIT_POINTER(*miniqp->p_miniq, NULL); - /* Wait for flying RCU callback before it is freed. */ - rcu_barrier(); - return; - } + } else { + miniq = miniq_old != &miniqp->miniq1 ? + &miniqp->miniq1 : &miniqp->miniq2; - miniq = !miniq_old || miniq_old == &miniqp->miniq2 ? - &miniqp->miniq1 : &miniqp->miniq2; + /* We need to make sure that readers won't see the miniq + * we are about to modify. So ensure that at least one RCU + * grace period has elapsed since the miniq was made + * inactive. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + cond_synchronize_rcu(miniq->rcu_state); + else if (!poll_state_synchronize_rcu(miniq->rcu_state)) + synchronize_rcu_expedited(); - /* We need to make sure that readers won't see the miniq - * we are about to modify. So wait until previous call_rcu callback - * is done. - */ - rcu_barrier(); - miniq->filter_list = tp_head; - rcu_assign_pointer(*miniqp->p_miniq, miniq); + miniq->filter_list = tp_head; + rcu_assign_pointer(*miniqp->p_miniq, miniq); + } if (miniq_old) - /* This is counterpart of the rcu barriers above. We need to + /* This is counterpart of the rcu sync above. We need to * block potential new user of miniq_old until all readers * are not seeing it. */ - call_rcu(&miniq_old->rcu, mini_qdisc_rcu_func); + miniq_old->rcu_state = start_poll_synchronize_rcu(); } EXPORT_SYMBOL(mini_qdisc_pair_swap); @@ -1515,6 +1541,8 @@ void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc, miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats; miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats; miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats; + miniqp->miniq1.rcu_state = get_state_synchronize_rcu(); + miniqp->miniq2.rcu_state = miniqp->miniq1.rcu_state; miniqp->p_miniq = p_miniq; } EXPORT_SYMBOL(mini_qdisc_pair_init); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 621dc6afde8f..1073c76d05c4 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -56,6 +56,7 @@ struct gred_sched { u32 DPs; u32 def; struct red_vars wred_set; + struct tc_gred_qopt_offload *opt; }; static inline int gred_wred_mode(struct gred_sched *table) @@ -311,48 +312,50 @@ static void gred_offload(struct Qdisc *sch, enum tc_gred_command command) { struct gred_sched *table = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - struct tc_gred_qopt_offload opt = { - .command = command, - .handle = sch->handle, - .parent = sch->parent, - }; + struct tc_gred_qopt_offload *opt = table->opt; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; + memset(opt, 0, sizeof(*opt)); + opt->command = command; + opt->handle = sch->handle; + opt->parent = sch->parent; + if (command == TC_GRED_REPLACE) { unsigned int i; - opt.set.grio_on = gred_rio_mode(table); - opt.set.wred_on = gred_wred_mode(table); - opt.set.dp_cnt = table->DPs; - opt.set.dp_def = table->def; + opt->set.grio_on = gred_rio_mode(table); + opt->set.wred_on = gred_wred_mode(table); + opt->set.dp_cnt = table->DPs; + opt->set.dp_def = table->def; for (i = 0; i < table->DPs; i++) { struct gred_sched_data *q = table->tab[i]; if (!q) continue; - opt.set.tab[i].present = true; - opt.set.tab[i].limit = q->limit; - opt.set.tab[i].prio = q->prio; - opt.set.tab[i].min = q->parms.qth_min >> q->parms.Wlog; - opt.set.tab[i].max = q->parms.qth_max >> q->parms.Wlog; - opt.set.tab[i].is_ecn = gred_use_ecn(q); - opt.set.tab[i].is_harddrop = gred_use_harddrop(q); - opt.set.tab[i].probability = q->parms.max_P; - opt.set.tab[i].backlog = &q->backlog; + opt->set.tab[i].present = true; + opt->set.tab[i].limit = q->limit; + opt->set.tab[i].prio = q->prio; + opt->set.tab[i].min = q->parms.qth_min >> q->parms.Wlog; + opt->set.tab[i].max = q->parms.qth_max >> q->parms.Wlog; + opt->set.tab[i].is_ecn = gred_use_ecn(q); + opt->set.tab[i].is_harddrop = gred_use_harddrop(q); + opt->set.tab[i].probability = q->parms.max_P; + opt->set.tab[i].backlog = &q->backlog; } - opt.set.qstats = &sch->qstats; + opt->set.qstats = &sch->qstats; } - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, &opt); + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, opt); } static int gred_offload_dump_stats(struct Qdisc *sch) { struct gred_sched *table = qdisc_priv(sch); struct tc_gred_qopt_offload *hw_stats; + u64 bytes = 0, packets = 0; unsigned int i; int ret; @@ -364,9 +367,11 @@ static int gred_offload_dump_stats(struct Qdisc *sch) hw_stats->handle = sch->handle; hw_stats->parent = sch->parent; - for (i = 0; i < MAX_DPs; i++) + for (i = 0; i < MAX_DPs; i++) { + gnet_stats_basic_sync_init(&hw_stats->stats.bstats[i]); if (table->tab[i]) hw_stats->stats.xstats[i] = &table->tab[i]->stats; + } ret = qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_GRED, hw_stats); /* Even if driver returns failure adjust the stats - in case offload @@ -375,19 +380,19 @@ static int gred_offload_dump_stats(struct Qdisc *sch) for (i = 0; i < MAX_DPs; i++) { if (!table->tab[i]) continue; - table->tab[i]->packetsin += hw_stats->stats.bstats[i].packets; - table->tab[i]->bytesin += hw_stats->stats.bstats[i].bytes; + table->tab[i]->packetsin += u64_stats_read(&hw_stats->stats.bstats[i].packets); + table->tab[i]->bytesin += u64_stats_read(&hw_stats->stats.bstats[i].bytes); table->tab[i]->backlog += hw_stats->stats.qstats[i].backlog; - _bstats_update(&sch->bstats, - hw_stats->stats.bstats[i].bytes, - hw_stats->stats.bstats[i].packets); + bytes += u64_stats_read(&hw_stats->stats.bstats[i].bytes); + packets += u64_stats_read(&hw_stats->stats.bstats[i].packets); sch->qstats.qlen += hw_stats->stats.qstats[i].qlen; sch->qstats.backlog += hw_stats->stats.qstats[i].backlog; sch->qstats.drops += hw_stats->stats.qstats[i].drops; sch->qstats.requeues += hw_stats->stats.qstats[i].requeues; sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits; } + _bstats_update(&sch->bstats, bytes, packets); kfree(hw_stats); return ret; @@ -728,6 +733,7 @@ err_unlock_free: static int gred_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + struct gred_sched *table = qdisc_priv(sch); struct nlattr *tb[TCA_GRED_MAX + 1]; int err; @@ -751,6 +757,12 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt, sch->limit = qdisc_dev(sch)->tx_queue_len * psched_mtu(qdisc_dev(sch)); + if (qdisc_dev(sch)->netdev_ops->ndo_setup_tc) { + table->opt = kzalloc(sizeof(*table->opt), GFP_KERNEL); + if (!table->opt) + return -ENOMEM; + } + return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack); } @@ -907,6 +919,7 @@ static void gred_destroy(struct Qdisc *sch) gred_destroy_vq(table->tab[i]); } gred_offload(sch, TC_GRED_DESTROY); + kfree(table->opt); } static struct Qdisc_ops gred_qdisc_ops __read_mostly = { diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index bf0034c66e35..d3979a6000e7 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -111,7 +111,7 @@ enum hfsc_class_flags { struct hfsc_class { struct Qdisc_class_common cl_common; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct tcf_proto __rcu *filter_list; /* filter list */ @@ -965,7 +965,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) return err; @@ -1033,9 +1033,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (tca[TCA_RATE]) { err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, - qdisc_root_sleeping_running(sch), - tca[TCA_RATE]); + NULL, true, tca[TCA_RATE]); if (err) { tcf_block_put(cl->block); kfree(cl); @@ -1130,7 +1128,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; head = &q->root; tcf = rcu_dereference_bh(q->root.filter_list); - while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) { + while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: @@ -1328,7 +1326,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, xstats.work = cl->cl_total; xstats.rtwork = cl->cl_cumul; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) return -1; @@ -1406,6 +1404,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt, if (err) return err; + gnet_stats_basic_sync_init(&q->root.bstats); q->root.cl_common.classid = sch->handle; q->root.sched = q; q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 5f7ac27a5264..9267922ea9c3 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -113,8 +113,8 @@ struct htb_class { /* * Written often fields */ - struct gnet_stats_basic_packed bstats; - struct gnet_stats_basic_packed bstats_bias; + struct gnet_stats_basic_sync bstats; + struct gnet_stats_basic_sync bstats_bias; struct tc_htb_xstats xstats; /* our special stats */ /* token bucket parameters */ @@ -125,6 +125,7 @@ struct htb_class { struct htb_class_leaf { int deficit[TC_HTB_MAXDEPTH]; struct Qdisc *q; + struct netdev_queue *offload_queue; } leaf; struct htb_class_inner { struct htb_prio clprio[TC_HTB_NUMPRIO]; @@ -238,7 +239,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - while (tcf && (result = tcf_classify(skb, tcf, &res, false)) >= 0) { + while (tcf && (result = tcf_classify(skb, NULL, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: @@ -1083,11 +1084,15 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt, offload = nla_get_flag(tb[TCA_HTB_OFFLOAD]); if (offload) { - if (sch->parent != TC_H_ROOT) + if (sch->parent != TC_H_ROOT) { + NL_SET_ERR_MSG(extack, "HTB must be the root qdisc to use offload"); return -EOPNOTSUPP; + } - if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) { + NL_SET_ERR_MSG(extack, "hw-tc-offload ethtool feature flag must be on"); return -EOPNOTSUPP; + } q->num_direct_qdiscs = dev->real_num_tx_queues; q->direct_qdiscs = kcalloc(q->num_direct_qdiscs, @@ -1307,10 +1312,11 @@ nla_put_failure: static void htb_offload_aggregate_stats(struct htb_sched *q, struct htb_class *cl) { + u64 bytes = 0, packets = 0; struct htb_class *c; unsigned int i; - memset(&cl->bstats, 0, sizeof(cl->bstats)); + gnet_stats_basic_sync_init(&cl->bstats); for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) { @@ -1322,14 +1328,15 @@ static void htb_offload_aggregate_stats(struct htb_sched *q, if (p != cl) continue; - cl->bstats.bytes += c->bstats_bias.bytes; - cl->bstats.packets += c->bstats_bias.packets; + bytes += u64_stats_read(&c->bstats_bias.bytes); + packets += u64_stats_read(&c->bstats_bias.packets); if (c->level == 0) { - cl->bstats.bytes += c->leaf.q->bstats.bytes; - cl->bstats.packets += c->leaf.q->bstats.packets; + bytes += u64_stats_read(&c->leaf.q->bstats.bytes); + packets += u64_stats_read(&c->leaf.q->bstats.packets); } } } + _bstats_update(&cl->bstats, bytes, packets); } static int @@ -1356,16 +1363,16 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) if (cl->leaf.q) cl->bstats = cl->leaf.q->bstats; else - memset(&cl->bstats, 0, sizeof(cl->bstats)); - cl->bstats.bytes += cl->bstats_bias.bytes; - cl->bstats.packets += cl->bstats_bias.packets; + gnet_stats_basic_sync_init(&cl->bstats); + _bstats_update(&cl->bstats, + u64_stats_read(&cl->bstats_bias.bytes), + u64_stats_read(&cl->bstats_bias.packets)); } else { htb_offload_aggregate_stats(q, cl); } } - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0) return -1; @@ -1411,24 +1418,47 @@ htb_graft_helper(struct netdev_queue *dev_queue, struct Qdisc *new_q) return old_q; } -static void htb_offload_move_qdisc(struct Qdisc *sch, u16 qid_old, u16 qid_new) +static struct netdev_queue *htb_offload_get_queue(struct htb_class *cl) +{ + struct netdev_queue *queue; + + queue = cl->leaf.offload_queue; + if (!(cl->leaf.q->flags & TCQ_F_BUILTIN)) + WARN_ON(cl->leaf.q->dev_queue != queue); + + return queue; +} + +static void htb_offload_move_qdisc(struct Qdisc *sch, struct htb_class *cl_old, + struct htb_class *cl_new, bool destroying) { struct netdev_queue *queue_old, *queue_new; struct net_device *dev = qdisc_dev(sch); - struct Qdisc *qdisc; - queue_old = netdev_get_tx_queue(dev, qid_old); - queue_new = netdev_get_tx_queue(dev, qid_new); + queue_old = htb_offload_get_queue(cl_old); + queue_new = htb_offload_get_queue(cl_new); - if (dev->flags & IFF_UP) - dev_deactivate(dev); - qdisc = dev_graft_qdisc(queue_old, NULL); - qdisc->dev_queue = queue_new; - qdisc = dev_graft_qdisc(queue_new, qdisc); - if (dev->flags & IFF_UP) - dev_activate(dev); + if (!destroying) { + struct Qdisc *qdisc; + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + qdisc = dev_graft_qdisc(queue_old, NULL); + WARN_ON(qdisc != cl_old->leaf.q); + } + + if (!(cl_old->leaf.q->flags & TCQ_F_BUILTIN)) + cl_old->leaf.q->dev_queue = queue_new; + cl_old->leaf.offload_queue = queue_new; - WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN)); + if (!destroying) { + struct Qdisc *qdisc; + + qdisc = dev_graft_qdisc(queue_new, cl_old->leaf.q); + if (dev->flags & IFF_UP) + dev_activate(dev); + WARN_ON(!(qdisc->flags & TCQ_F_BUILTIN)); + } } static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, @@ -1442,10 +1472,8 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (cl->level) return -EINVAL; - if (q->offload) { - dev_queue = new->dev_queue; - WARN_ON(dev_queue != cl->leaf.q->dev_queue); - } + if (q->offload) + dev_queue = htb_offload_get_queue(cl); if (!new) { new = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, @@ -1514,6 +1542,8 @@ static void htb_parent_to_leaf(struct Qdisc *sch, struct htb_class *cl, parent->ctokens = parent->cbuffer; parent->t_c = ktime_get_ns(); parent->cmode = HTB_CAN_SEND; + if (q->offload) + parent->leaf.offload_queue = cl->leaf.offload_queue; } static void htb_parent_to_leaf_offload(struct Qdisc *sch, @@ -1534,6 +1564,7 @@ static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl, struct netlink_ext_ack *extack) { struct tc_htb_qopt_offload offload_opt; + struct netdev_queue *dev_queue; struct Qdisc *q = cl->leaf.q; struct Qdisc *old = NULL; int err; @@ -1542,20 +1573,20 @@ static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl, return -EINVAL; WARN_ON(!q); - if (!destroying) { - /* On destroy of HTB, two cases are possible: - * 1. q is a normal qdisc, but q->dev_queue has noop qdisc. - * 2. q is a noop qdisc (for nodes that were inner), - * q->dev_queue is noop_netdev_queue. + dev_queue = htb_offload_get_queue(cl); + old = htb_graft_helper(dev_queue, NULL); + if (destroying) + /* Before HTB is destroyed, the kernel grafts noop_qdisc to + * all queues. */ - old = htb_graft_helper(q->dev_queue, NULL); - WARN_ON(!old); + WARN_ON(!(old->flags & TCQ_F_BUILTIN)); + else WARN_ON(old != q); - } if (cl->parent) { - cl->parent->bstats_bias.bytes += q->bstats.bytes; - cl->parent->bstats_bias.packets += q->bstats.packets; + _bstats_update(&cl->parent->bstats_bias, + u64_stats_read(&q->bstats.bytes), + u64_stats_read(&q->bstats.packets)); } offload_opt = (struct tc_htb_qopt_offload) { @@ -1570,18 +1601,17 @@ static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl, if (!err || destroying) qdisc_put(old); else - htb_graft_helper(q->dev_queue, old); + htb_graft_helper(dev_queue, old); if (last_child) return err; - if (!err && offload_opt.moved_qid != 0) { - if (destroying) - q->dev_queue = netdev_get_tx_queue(qdisc_dev(sch), - offload_opt.qid); - else - htb_offload_move_qdisc(sch, offload_opt.moved_qid, - offload_opt.qid); + if (!err && offload_opt.classid != TC_H_MIN(cl->common.classid)) { + u32 classid = TC_H_MAJ(sch->handle) | + TC_H_MIN(offload_opt.classid); + struct htb_class *moved_cl = htb_find(classid, sch); + + htb_offload_move_qdisc(sch, moved_cl, cl, destroying); } return err; @@ -1704,9 +1734,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, } if (last_child) { - struct netdev_queue *dev_queue; + struct netdev_queue *dev_queue = sch->dev_queue; + + if (q->offload) + dev_queue = htb_offload_get_queue(cl); - dev_queue = q->offload ? cl->leaf.q->dev_queue : sch->dev_queue; new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, cl->parent->common.classid, NULL); @@ -1824,6 +1856,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!cl) goto failure; + gnet_stats_basic_sync_init(&cl->bstats); + gnet_stats_basic_sync_init(&cl->bstats_bias); + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); @@ -1833,7 +1868,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE] ? : &est.nla); if (err) goto err_block_put; @@ -1878,7 +1913,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, } dev_queue = netdev_get_tx_queue(dev, offload_opt.qid); } else { /* First child. */ - dev_queue = parent->leaf.q->dev_queue; + dev_queue = htb_offload_get_queue(parent); old_q = htb_graft_helper(dev_queue, NULL); WARN_ON(old_q != parent->leaf.q); offload_opt = (struct tc_htb_qopt_offload) { @@ -1897,8 +1932,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, htb_graft_helper(dev_queue, old_q); goto err_kill_estimator; } - parent->bstats_bias.bytes += old_q->bstats.bytes; - parent->bstats_bias.packets += old_q->bstats.packets; + _bstats_update(&parent->bstats_bias, + u64_stats_read(&old_q->bstats.bytes), + u64_stats_read(&old_q->bstats.packets)); qdisc_put(old_q); } new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, @@ -1935,6 +1971,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* leaf (we) needs elementary qdisc */ cl->leaf.q = new_q ? new_q : &noop_qdisc; + if (q->offload) + cl->leaf.offload_queue = dev_queue; cl->parent = parent; @@ -1956,7 +1994,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) return err; diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index e79f1afe0cfd..83d2e54bf303 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -130,10 +130,9 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; unsigned int ntx; - __u32 qlen = 0; sch->q.qlen = 0; - memset(&sch->bstats, 0, sizeof(sch->bstats)); + gnet_stats_basic_sync_init(&sch->bstats); memset(&sch->qstats, 0, sizeof(sch->qstats)); /* MQ supports lockless qdiscs. However, statistics accounting needs @@ -145,25 +144,11 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); - if (qdisc_is_percpu_stats(qdisc)) { - qlen = qdisc_qlen_sum(qdisc); - __gnet_stats_copy_basic(NULL, &sch->bstats, - qdisc->cpu_bstats, - &qdisc->bstats); - __gnet_stats_copy_queue(&sch->qstats, - qdisc->cpu_qstats, - &qdisc->qstats, qlen); - sch->q.qlen += qlen; - } else { - sch->q.qlen += qdisc->q.qlen; - sch->bstats.bytes += qdisc->bstats.bytes; - sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.qlen += qdisc->qstats.qlen; - sch->qstats.backlog += qdisc->qstats.backlog; - sch->qstats.drops += qdisc->qstats.drops; - sch->qstats.requeues += qdisc->qstats.requeues; - sch->qstats.overlimits += qdisc->qstats.overlimits; - } + gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, + &qdisc->bstats, false); + gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, + &qdisc->qstats); + sch->q.qlen += qdisc_qlen(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } @@ -246,8 +231,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = mq_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(&sch->running, d, sch->cpu_bstats, - &sch->bstats) < 0 || + if (gnet_stats_copy_basic(d, sch->cpu_bstats, &sch->bstats, true) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; return 0; @@ -288,6 +272,7 @@ struct Qdisc_ops mq_qdisc_ops __read_mostly = { .init = mq_init, .destroy = mq_destroy, .attach = mq_attach, + .change_real_num_tx = mq_change_real_num_tx, .dump = mq_dump, .owner = THIS_MODULE, }; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 8766ab5b8788..b29f3453c6ea 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -390,7 +390,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) unsigned int ntx, tc; sch->q.qlen = 0; - memset(&sch->bstats, 0, sizeof(sch->bstats)); + gnet_stats_basic_sync_init(&sch->bstats); memset(&sch->qstats, 0, sizeof(sch->qstats)); /* MQ supports lockless qdiscs. However, statistics accounting needs @@ -402,25 +402,11 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); - if (qdisc_is_percpu_stats(qdisc)) { - __u32 qlen = qdisc_qlen_sum(qdisc); - - __gnet_stats_copy_basic(NULL, &sch->bstats, - qdisc->cpu_bstats, - &qdisc->bstats); - __gnet_stats_copy_queue(&sch->qstats, - qdisc->cpu_qstats, - &qdisc->qstats, qlen); - sch->q.qlen += qlen; - } else { - sch->q.qlen += qdisc->q.qlen; - sch->bstats.bytes += qdisc->bstats.bytes; - sch->bstats.packets += qdisc->bstats.packets; - sch->qstats.backlog += qdisc->qstats.backlog; - sch->qstats.drops += qdisc->qstats.drops; - sch->qstats.requeues += qdisc->qstats.requeues; - sch->qstats.overlimits += qdisc->qstats.overlimits; - } + gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, + &qdisc->bstats, false); + gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, + &qdisc->qstats); + sch->q.qlen += qdisc_qlen(qdisc); spin_unlock_bh(qdisc_lock(qdisc)); } @@ -512,12 +498,13 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, { if (cl >= TC_H_MIN_PRIORITY) { int i; - __u32 qlen = 0; + __u32 qlen; struct gnet_stats_queue qstats = {0}; - struct gnet_stats_basic_packed bstats = {0}; + struct gnet_stats_basic_sync bstats; struct net_device *dev = qdisc_dev(sch); struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK]; + gnet_stats_basic_sync_init(&bstats); /* Drop lock here it will be reclaimed before touching * statistics this is required because the d->lock we * hold here is the look on dev_queue->qdisc_sleeping @@ -529,37 +516,31 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, for (i = tc.offset; i < tc.offset + tc.count; i++) { struct netdev_queue *q = netdev_get_tx_queue(dev, i); struct Qdisc *qdisc = rtnl_dereference(q->qdisc); - struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; - struct gnet_stats_queue __percpu *cpu_qstats = NULL; spin_lock_bh(qdisc_lock(qdisc)); - if (qdisc_is_percpu_stats(qdisc)) { - cpu_bstats = qdisc->cpu_bstats; - cpu_qstats = qdisc->cpu_qstats; - } - qlen = qdisc_qlen_sum(qdisc); - __gnet_stats_copy_basic(NULL, &sch->bstats, - cpu_bstats, &qdisc->bstats); - __gnet_stats_copy_queue(&sch->qstats, - cpu_qstats, - &qdisc->qstats, - qlen); + gnet_stats_add_basic(&bstats, qdisc->cpu_bstats, + &qdisc->bstats, false); + gnet_stats_add_queue(&qstats, qdisc->cpu_qstats, + &qdisc->qstats); + sch->q.qlen += qdisc_qlen(qdisc); + spin_unlock_bh(qdisc_lock(qdisc)); } + qlen = qdisc_qlen(sch) + qstats.qlen; /* Reclaim root sleeping lock before completing stats */ if (d->lock) spin_lock_bh(d->lock); - if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &bstats, false) < 0 || gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0) return -1; } else { struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, - sch->cpu_bstats, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(d, sch->cpu_bstats, + &sch->bstats, true) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; } @@ -623,6 +604,7 @@ static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = { .init = mqprio_init, .destroy = mqprio_destroy, .attach = mqprio_attach, + .change_real_num_tx = mq_change_real_num_tx, .dump = mqprio_dump, .owner = THIS_MODULE, }; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 5c27b4270b90..cd8ab90c4765 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -36,7 +36,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - err = tcf_classify(skb, fl, &res, false); + err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: @@ -338,8 +338,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 0c345e43a09a..ecbb10db1111 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -785,7 +785,7 @@ static int get_dist_table(struct Qdisc *sch, struct disttable **tbl, if (!n || n > NETEM_DIST_MAX) return -EINVAL; - d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL); + d = kvmalloc(struct_size(d, table, n), GFP_KERNEL); if (!d) return -ENOMEM; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 3eabb871a1d5..3b8d7197c06b 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -39,7 +39,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { fl = rcu_dereference_bh(q->filter_list); - err = tcf_classify(skb, fl, &res, false); + err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: @@ -361,8 +361,8 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || + if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, + &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index b692a0de1ad5..0b7f9ba28deb 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -131,7 +131,7 @@ struct qfq_class { unsigned int filter_cnt; - struct gnet_stats_basic_packed bstats; + struct gnet_stats_basic_sync bstats; struct gnet_stats_queue qstats; struct net_rate_estimator __rcu *rate_est; struct Qdisc *qdisc; @@ -451,7 +451,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) return err; @@ -465,6 +465,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; + gnet_stats_basic_sync_init(&cl->bstats); cl->common.classid = classid; cl->deficit = lmax; @@ -477,7 +478,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, NULL, - qdisc_root_sleeping_running(sch), + true, tca[TCA_RATE]); if (err) goto destroy_class; @@ -639,8 +640,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, xstats.weight = cl->agg->class_weight; xstats.lmax = cl->agg->lmax; - if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), - d, NULL, &cl->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || qdisc_qstats_copy(d, cl->qdisc) < 0) return -1; @@ -690,7 +690,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tcf_classify(skb, fl, &res, false); + result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { @@ -1234,8 +1234,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, return err; } - cl->bstats.bytes += len; - cl->bstats.packets += gso_segs; + _bstats_update(&cl->bstats, len, gso_segs); sch->qstats.backlog += len; ++sch->q.qlen; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index dde829d4b9f8..3d061a13d7ed 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -257,7 +257,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, struct tcf_result res; int result; - result = tcf_classify(skb, fl, &res, false); + result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 066754a18569..f8e569f79f13 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -178,7 +178,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, return sfq_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tcf_classify(skb, fl, &res, false); + result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 9c79374457a0..9ab068fa2672 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1513,7 +1513,9 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, taprio_set_picos_per_byte(dev, q); if (mqprio) { - netdev_set_num_tc(dev, mqprio->num_tc); + err = netdev_set_num_tc(dev, mqprio->num_tc); + if (err) + goto free_sched; for (i = 0; i < mqprio->num_tc; i++) netdev_set_tc_queue(dev, i, mqprio->count[i], @@ -1639,6 +1641,10 @@ static void taprio_destroy(struct Qdisc *sch) list_del(&q->taprio_list); spin_unlock(&taprio_list_lock); + /* Note that taprio_reset() might not be called if an error + * happens in qdisc_create(), after taprio_init() has been called. + */ + hrtimer_cancel(&q->advance_timer); taprio_disable_offload(dev, q, NULL); @@ -1971,7 +1977,7 @@ static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; - if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || + if (gnet_stats_copy_basic(d, NULL, &sch->bstats, true) < 0 || qdisc_qstats_copy(d, sch) < 0) return -1; return 0; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 78e79029dc63..72102277449e 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -184,6 +184,20 @@ static int tbf_offload_dump(struct Qdisc *sch) return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_TBF, &qopt); } +static void tbf_offload_graft(struct Qdisc *sch, struct Qdisc *new, + struct Qdisc *old, struct netlink_ext_ack *extack) +{ + struct tc_tbf_qopt_offload graft_offload = { + .handle = sch->handle, + .parent = sch->parent, + .child_handle = new->handle, + .command = TC_TBF_GRAFT, + }; + + qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, old, + TC_SETUP_QDISC_TBF, &graft_offload, extack); +} + /* GSO packet is too big, segment it so that tbf can transmit * each segment in time */ @@ -547,6 +561,8 @@ static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, new = &noop_qdisc; *old = qdisc_replace(sch, new, &q->qdisc); + + tbf_offload_graft(sch, new, *old, extack); return 0; } diff --git a/net/sctp/input.c b/net/sctp/input.c index 5ef86fdb1176..1f1786021d9c 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -702,7 +702,7 @@ static int sctp_rcv_ootb(struct sk_buff *skb) ch = skb_header_pointer(skb, offset, sizeof(*ch), &_ch); /* Break out if chunk length is less then minimal. */ - if (ntohs(ch->length) < sizeof(_ch)) + if (!ch || ntohs(ch->length) < sizeof(_ch)) break; ch_end = offset + SCTP_PAD4(ntohs(ch->length)); diff --git a/net/sctp/output.c b/net/sctp/output.c index 4dfb5ea82b05..cdfdbd353c67 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -581,13 +581,16 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); sk = chunk->skb->sk; - /* check gso */ if (packet->size > tp->pathmtu && !packet->ipfragok && !chunk->pmtu_probe) { - if (!sk_can_gso(sk)) { - pr_err_once("Trying to GSO but underlying device doesn't support it."); - goto out; + if (tp->pl.state == SCTP_PL_ERROR) { /* do IP fragmentation if in Error state */ + packet->ipfragok = 1; + } else { + if (!sk_can_gso(sk)) { /* check gso */ + pr_err_once("Trying to GSO but underlying device doesn't support it."); + goto out; + } + gso = 1; } - gso = 1; } /* alloc head skb */ diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index b8fa8f1a7277..c7503fd64915 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3697,7 +3697,7 @@ struct sctp_chunk *sctp_make_strreset_req( outlen = (sizeof(outreq) + stream_len) * out; inlen = (sizeof(inreq) + stream_len) * in; - retval = sctp_make_reconf(asoc, outlen + inlen); + retval = sctp_make_reconf(asoc, SCTP_PAD4(outlen) + SCTP_PAD4(inlen)); if (!retval) return NULL; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 32df65f68c12..fb3da4d8f4a3 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -156,6 +156,12 @@ static enum sctp_disposition __sctp_sf_do_9_1_abort( void *arg, struct sctp_cmd_seq *commands); +static enum sctp_disposition +__sctp_sf_do_9_2_reshutack(struct net *net, const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, void *arg, + struct sctp_cmd_seq *commands); + /* Small helper function that checks if the chunk length * is of the appropriate length. The 'required_length' argument * is set to be the size of a specific chunk we are testing. @@ -337,6 +343,14 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net, if (!chunk->singleton) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* Make sure that the INIT chunk has a valid length. + * Normally, this would cause an ABORT with a Protocol Violation + * error, but since we don't have an association, we'll + * just discard the packet. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk))) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* If the packet is an OOTB packet which is temporarily on the * control endpoint, respond with an ABORT. */ @@ -351,14 +365,6 @@ enum sctp_disposition sctp_sf_do_5_1B_init(struct net *net, if (chunk->sctp_hdr->vtag != 0) return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands); - /* Make sure that the INIT chunk has a valid length. - * Normally, this would cause an ABORT with a Protocol Violation - * error, but since we don't have an association, we'll - * just discard the packet. - */ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk))) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - /* If the INIT is coming toward a closing socket, we'll send back * and ABORT. Essentially, this catches the race of INIT being * backloged to the socket at the same time as the user issues close(). @@ -704,6 +710,9 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, struct sock *sk; int error = 0; + if (asoc && !sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* If the packet is an OOTB packet which is temporarily on the * control endpoint, respond with an ABORT. */ @@ -718,7 +727,8 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, * in sctp_unpack_cookie(). */ if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, + commands); /* If the endpoint is not listening or if the number of associations * on the TCP-style socket exceed the max backlog, respond with an @@ -1524,20 +1534,16 @@ static enum sctp_disposition sctp_sf_do_unexpected_init( if (!chunk->singleton) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* Make sure that the INIT chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk))) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* 3.1 A packet containing an INIT chunk MUST have a zero Verification * Tag. */ if (chunk->sctp_hdr->vtag != 0) return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands); - /* Make sure that the INIT chunk has a valid length. - * In this case, we generate a protocol violation since we have - * an association established. - */ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk))) - return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, - commands); - if (SCTP_INPUT_CB(chunk->skb)->encap_port != chunk->transport->encap_port) return sctp_sf_new_encap_port(net, ep, asoc, type, arg, commands); @@ -1882,9 +1888,9 @@ static enum sctp_disposition sctp_sf_do_dupcook_a( * its peer. */ if (sctp_state(asoc, SHUTDOWN_ACK_SENT)) { - disposition = sctp_sf_do_9_2_reshutack(net, ep, asoc, - SCTP_ST_CHUNK(chunk->chunk_hdr->type), - chunk, commands); + disposition = __sctp_sf_do_9_2_reshutack(net, ep, asoc, + SCTP_ST_CHUNK(chunk->chunk_hdr->type), + chunk, commands); if (SCTP_DISPOSITION_NOMEM == disposition) goto nomem; @@ -2202,9 +2208,11 @@ enum sctp_disposition sctp_sf_do_5_2_4_dupcook( * enough for the chunk header. Cookie length verification is * done later. */ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) - return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, - commands); + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) { + if (!sctp_vtag_verify(chunk, asoc)) + asoc = NULL; + return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, commands); + } /* "Decode" the chunk. We have no optional parameters so we * are in good shape. @@ -2341,7 +2349,7 @@ enum sctp_disposition sctp_sf_shutdown_pending_abort( */ if (SCTP_ADDR_DEL == sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) - return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); if (!sctp_err_chunk_valid(chunk)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -2387,7 +2395,7 @@ enum sctp_disposition sctp_sf_shutdown_sent_abort( */ if (SCTP_ADDR_DEL == sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) - return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); if (!sctp_err_chunk_valid(chunk)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -2657,7 +2665,7 @@ enum sctp_disposition sctp_sf_do_9_1_abort( */ if (SCTP_ADDR_DEL == sctp_bind_addr_state(&asoc->base.bind_addr, &chunk->dest)) - return sctp_sf_discard_chunk(net, ep, asoc, type, arg, commands); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); if (!sctp_err_chunk_valid(chunk)) return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); @@ -2970,13 +2978,11 @@ enum sctp_disposition sctp_sf_do_9_2_shut_ctsn( * that belong to this association, it should discard the INIT chunk and * retransmit the SHUTDOWN ACK chunk. */ -enum sctp_disposition sctp_sf_do_9_2_reshutack( - struct net *net, - const struct sctp_endpoint *ep, - const struct sctp_association *asoc, - const union sctp_subtype type, - void *arg, - struct sctp_cmd_seq *commands) +static enum sctp_disposition +__sctp_sf_do_9_2_reshutack(struct net *net, const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, void *arg, + struct sctp_cmd_seq *commands) { struct sctp_chunk *chunk = arg; struct sctp_chunk *reply; @@ -3010,6 +3016,26 @@ nomem: return SCTP_DISPOSITION_NOMEM; } +enum sctp_disposition +sctp_sf_do_9_2_reshutack(struct net *net, const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const union sctp_subtype type, void *arg, + struct sctp_cmd_seq *commands) +{ + struct sctp_chunk *chunk = arg; + + if (!chunk->singleton) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk))) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + + if (chunk->sctp_hdr->vtag != 0) + return sctp_sf_tabort_8_4_8(net, ep, asoc, type, arg, commands); + + return __sctp_sf_do_9_2_reshutack(net, ep, asoc, type, arg, commands); +} + /* * sctp_sf_do_ecn_cwr * @@ -3662,6 +3688,9 @@ enum sctp_disposition sctp_sf_ootb(struct net *net, SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES); + if (asoc && !sctp_vtag_verify(chunk, asoc)) + asoc = NULL; + ch = (struct sctp_chunkhdr *)chunk->chunk_hdr; do { /* Report violation if the chunk is less then minimal */ @@ -3777,12 +3806,6 @@ static enum sctp_disposition sctp_sf_shut_8_4_5( SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS); - /* If the chunk length is invalid, we don't want to process - * the reset of the packet. - */ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); - /* We need to discard the rest of the packet to prevent * potential boomming attacks from additional bundled chunks. * This is documented in SCTP Threats ID. @@ -3810,6 +3833,9 @@ enum sctp_disposition sctp_sf_do_8_5_1_E_sa(struct net *net, { struct sctp_chunk *chunk = arg; + if (!sctp_vtag_verify(chunk, asoc)) + asoc = NULL; + /* Make sure that the SHUTDOWN_ACK chunk has a valid length. */ if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, @@ -3845,6 +3871,11 @@ enum sctp_disposition sctp_sf_do_asconf(struct net *net, return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } + /* Make sure that the ASCONF ADDIP chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_addip_chunk))) + return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, + commands); + /* ADD-IP: Section 4.1.1 * This chunk MUST be sent in an authenticated way by using * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk @@ -3853,13 +3884,7 @@ enum sctp_disposition sctp_sf_do_asconf(struct net *net, */ if (!asoc->peer.asconf_capable || (!net->sctp.addip_noauth && !chunk->auth)) - return sctp_sf_discard_chunk(net, ep, asoc, type, arg, - commands); - - /* Make sure that the ASCONF ADDIP chunk has a valid length. */ - if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_addip_chunk))) - return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, - commands); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); hdr = (struct sctp_addiphdr *)chunk->skb->data; serial = ntohl(hdr->serial); @@ -3988,6 +4013,12 @@ enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net, return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); } + /* Make sure that the ADDIP chunk has a valid length. */ + if (!sctp_chunk_length_valid(asconf_ack, + sizeof(struct sctp_addip_chunk))) + return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, + commands); + /* ADD-IP, Section 4.1.2: * This chunk MUST be sent in an authenticated way by using * the mechanism defined in [I-D.ietf-tsvwg-sctp-auth]. If this chunk @@ -3996,14 +4027,7 @@ enum sctp_disposition sctp_sf_do_asconf_ack(struct net *net, */ if (!asoc->peer.asconf_capable || (!net->sctp.addip_noauth && !asconf_ack->auth)) - return sctp_sf_discard_chunk(net, ep, asoc, type, arg, - commands); - - /* Make sure that the ADDIP chunk has a valid length. */ - if (!sctp_chunk_length_valid(asconf_ack, - sizeof(struct sctp_addip_chunk))) - return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, - commands); + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); addip_hdr = (struct sctp_addiphdr *)asconf_ack->skb->data; rcvd_serial = ntohl(addip_hdr->serial); @@ -4575,6 +4599,9 @@ enum sctp_disposition sctp_sf_discard_chunk(struct net *net, { struct sctp_chunk *chunk = arg; + if (asoc && !sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* Make sure that the chunk has a valid length. * Since we don't know the chunk type, we use a general * chunkhdr structure to make a comparison. @@ -4642,6 +4669,9 @@ enum sctp_disposition sctp_sf_violation(struct net *net, { struct sctp_chunk *chunk = arg; + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands); + /* Make sure that the chunk has a valid length. */ if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_chunkhdr))) return sctp_sf_violation_chunklen(net, ep, asoc, type, arg, @@ -6348,6 +6378,7 @@ static struct sctp_packet *sctp_ootb_pkt_new( * yet. */ switch (chunk->chunk_hdr->type) { + case SCTP_CID_INIT: case SCTP_CID_INIT_ACK: { struct sctp_initack_chunk *initack; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index a3d3ca6dd63d..133f1719bf1b 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -269,7 +269,7 @@ bool sctp_transport_pl_send(struct sctp_transport *t) if (t->pl.probe_size == SCTP_BASE_PLPMTU) { /* BASE_PLPMTU Confirmation Failed */ t->pl.state = SCTP_PL_ERROR; /* Base -> Error */ - t->pl.pmtu = SCTP_MIN_PLPMTU; + t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); } @@ -366,8 +366,9 @@ static bool sctp_transport_pl_toobig(struct sctp_transport *t, u32 pmtu) if (pmtu >= SCTP_MIN_PLPMTU && pmtu < SCTP_BASE_PLPMTU) { t->pl.state = SCTP_PL_ERROR; /* Base -> Error */ - t->pl.pmtu = SCTP_MIN_PLPMTU; + t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + return true; } } else if (t->pl.state == SCTP_PL_SEARCH) { if (pmtu >= SCTP_BASE_PLPMTU && pmtu < t->pl.pmtu) { @@ -378,11 +379,10 @@ static bool sctp_transport_pl_toobig(struct sctp_transport *t, u32 pmtu) t->pl.probe_high = 0; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + return true; } else if (pmtu > t->pl.pmtu && pmtu < t->pl.probe_size) { t->pl.probe_size = pmtu; t->pl.probe_count = 0; - - return false; } } else if (t->pl.state == SCTP_PL_COMPLETE) { if (pmtu >= SCTP_BASE_PLPMTU && pmtu < t->pl.pmtu) { @@ -393,10 +393,11 @@ static bool sctp_transport_pl_toobig(struct sctp_transport *t, u32 pmtu) t->pl.probe_high = 0; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + return true; } } - return true; + return false; } bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) diff --git a/net/smc/Makefile b/net/smc/Makefile index 99a0186cba5b..196fb6f01b14 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only +ccflags-y += -I$(src) obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o +smc-y += smc_tracepoint.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c038efc23ce3..0cf7ed2f5d41 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -50,6 +50,7 @@ #include "smc_rx.h" #include "smc_close.h" #include "smc_stats.h" +#include "smc_tracepoint.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -439,6 +440,47 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc) return 0; } +static bool smc_isascii(char *hostname) +{ + int i; + + for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++) + if (!isascii(hostname[i])) + return false; + return true; +} + +static void smc_conn_save_peer_info_fce(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *clc) +{ + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)clc; + struct smc_clc_first_contact_ext *fce; + int clc_v2_len; + + if (clc->hdr.version == SMC_V1 || + !(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) + return; + + if (smc->conn.lgr->is_smcd) { + memcpy(smc->conn.lgr->negotiated_eid, clc_v2->d1.eid, + SMC_MAX_EID_LEN); + clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2, + d1); + } else { + memcpy(smc->conn.lgr->negotiated_eid, clc_v2->r1.eid, + SMC_MAX_EID_LEN); + clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2, + r1); + } + fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc_v2) + clc_v2_len); + smc->conn.lgr->peer_os = fce->os_type; + smc->conn.lgr->peer_smc_release = fce->release; + if (smc_isascii(fce->hostname)) + memcpy(smc->conn.lgr->peer_hostname, fce->hostname, + SMC_MAX_HOSTNAME_LEN); +} + static void smcr_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { @@ -451,16 +493,6 @@ static void smcr_conn_save_peer_info(struct smc_sock *smc, smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1); } -static bool smc_isascii(char *hostname) -{ - int i; - - for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++) - if (!isascii(hostname[i])) - return false; - return true; -} - static void smcd_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { @@ -472,22 +504,6 @@ static void smcd_conn_save_peer_info(struct smc_sock *smc, smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg); atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx; - if (clc->hdr.version > SMC_V1 && - (clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) { - struct smc_clc_msg_accept_confirm_v2 *clc_v2 = - (struct smc_clc_msg_accept_confirm_v2 *)clc; - struct smc_clc_first_contact_ext *fce = - (struct smc_clc_first_contact_ext *) - (((u8 *)clc_v2) + sizeof(*clc_v2)); - - memcpy(smc->conn.lgr->negotiated_eid, clc_v2->eid, - SMC_MAX_EID_LEN); - smc->conn.lgr->peer_os = fce->os_type; - smc->conn.lgr->peer_smc_release = fce->release; - if (smc_isascii(fce->hostname)) - memcpy(smc->conn.lgr->peer_hostname, fce->hostname, - SMC_MAX_HOSTNAME_LEN); - } } static void smc_conn_save_peer_info(struct smc_sock *smc, @@ -497,14 +513,16 @@ static void smc_conn_save_peer_info(struct smc_sock *smc, smcd_conn_save_peer_info(smc, clc); else smcr_conn_save_peer_info(smc, clc); + smc_conn_save_peer_info_fce(smc, clc); } static void smc_link_save_peer_info(struct smc_link *link, - struct smc_clc_msg_accept_confirm *clc) + struct smc_clc_msg_accept_confirm *clc, + struct smc_init_info *ini) { link->peer_qpn = ntoh24(clc->r0.qpn); - memcpy(link->peer_gid, clc->r0.lcl.gid, SMC_GID_SIZE); - memcpy(link->peer_mac, clc->r0.lcl.mac, sizeof(link->peer_mac)); + memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE); + memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac)); link->peer_psn = ntoh24(clc->r0.psn); link->peer_mtu = clc->r0.qp_mtu; } @@ -547,6 +565,7 @@ static void smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->use_fallback = true; smc->fallback_rsn = reason_code; smc_stat_fallback(smc); + trace_smc_switch_to_fallback(smc, reason_code); if (smc->sk.sk_socket && smc->sk.sk_socket->file) { smc->clcsock->file = smc->sk.sk_socket->file; smc->clcsock->file->private_data = smc->clcsock; @@ -608,7 +627,9 @@ static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini) * used for the internal TCP socket */ smc_pnet_find_roce_resource(smc->clcsock->sk, ini); - if (!ini->ib_dev) + if (!ini->check_smcrv2 && !ini->ib_dev) + return SMC_CLC_DECL_NOSMCRDEV; + if (ini->check_smcrv2 && !ini->smcrv2.ib_dev_v2) return SMC_CLC_DECL_NOSMCRDEV; return 0; } @@ -692,27 +713,42 @@ static int smc_find_proposal_devices(struct smc_sock *smc, int rc = 0; /* check if there is an ism device available */ - if (ini->smcd_version & SMC_V1) { - if (smc_find_ism_device(smc, ini) || - smc_connect_ism_vlan_setup(smc, ini)) { - if (ini->smc_type_v1 == SMC_TYPE_B) - ini->smc_type_v1 = SMC_TYPE_R; - else - ini->smc_type_v1 = SMC_TYPE_N; - } /* else ISM V1 is supported for this connection */ - if (smc_find_rdma_device(smc, ini)) { - if (ini->smc_type_v1 == SMC_TYPE_B) - ini->smc_type_v1 = SMC_TYPE_D; - else - ini->smc_type_v1 = SMC_TYPE_N; - } /* else RDMA is supported for this connection */ - } - if (smc_ism_is_v2_capable() && smc_find_ism_v2_device_clnt(smc, ini)) - ini->smc_type_v2 = SMC_TYPE_N; + if (!(ini->smcd_version & SMC_V1) || + smc_find_ism_device(smc, ini) || + smc_connect_ism_vlan_setup(smc, ini)) + ini->smcd_version &= ~SMC_V1; + /* else ISM V1 is supported for this connection */ + + /* check if there is an rdma device available */ + if (!(ini->smcr_version & SMC_V1) || + smc_find_rdma_device(smc, ini)) + ini->smcr_version &= ~SMC_V1; + /* else RDMA is supported for this connection */ + + ini->smc_type_v1 = smc_indicated_type(ini->smcd_version & SMC_V1, + ini->smcr_version & SMC_V1); + + /* check if there is an ism v2 device available */ + if (!(ini->smcd_version & SMC_V2) || + !smc_ism_is_v2_capable() || + smc_find_ism_v2_device_clnt(smc, ini)) + ini->smcd_version &= ~SMC_V2; + + /* check if there is an rdma v2 device available */ + ini->check_smcrv2 = true; + ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr; + if (!(ini->smcr_version & SMC_V2) || + smc->clcsock->sk->sk_family != AF_INET || + !smc_clc_ueid_count() || + smc_find_rdma_device(smc, ini)) + ini->smcr_version &= ~SMC_V2; + ini->check_smcrv2 = false; + + ini->smc_type_v2 = smc_indicated_type(ini->smcd_version & SMC_V2, + ini->smcr_version & SMC_V2); /* if neither ISM nor RDMA are supported, fallback */ - if (!smcr_indicated(ini->smc_type_v1) && - ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N) + if (ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N) rc = SMC_CLC_DECL_NOSMCDEV; return rc; @@ -752,6 +788,64 @@ static int smc_connect_clc(struct smc_sock *smc, SMC_CLC_ACCEPT, CLC_WAIT_TIME); } +void smc_fill_gid_list(struct smc_link_group *lgr, + struct smc_gidlist *gidlist, + struct smc_ib_device *known_dev, u8 *known_gid) +{ + struct smc_init_info *alt_ini = NULL; + + memset(gidlist, 0, sizeof(*gidlist)); + memcpy(gidlist->list[gidlist->len++], known_gid, SMC_GID_SIZE); + + alt_ini = kzalloc(sizeof(*alt_ini), GFP_KERNEL); + if (!alt_ini) + goto out; + + alt_ini->vlan_id = lgr->vlan_id; + alt_ini->check_smcrv2 = true; + alt_ini->smcrv2.saddr = lgr->saddr; + smc_pnet_find_alt_roce(lgr, alt_ini, known_dev); + + if (!alt_ini->smcrv2.ib_dev_v2) + goto out; + + memcpy(gidlist->list[gidlist->len++], alt_ini->smcrv2.ib_gid_v2, + SMC_GID_SIZE); + +out: + kfree(alt_ini); +} + +static int smc_connect_rdma_v2_prepare(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *aclc, + struct smc_init_info *ini) +{ + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)aclc; + struct smc_clc_first_contact_ext *fce = + (struct smc_clc_first_contact_ext *) + (((u8 *)clc_v2) + sizeof(*clc_v2)); + + if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1) + return 0; + + if (fce->v2_direct) { + memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN); + ini->smcrv2.uses_gateway = false; + } else { + if (smc_ib_find_route(smc->clcsock->sk->sk_rcv_saddr, + smc_ib_gid_to_ipv4(aclc->r0.lcl.gid), + ini->smcrv2.nexthop_mac, + &ini->smcrv2.uses_gateway)) + return SMC_CLC_DECL_NOROUTE; + if (!ini->smcrv2.uses_gateway) { + /* mismatch: peer claims indirect, but its direct */ + return SMC_CLC_DECL_NOINDIRECT; + } + } + return 0; +} + /* setup for RDMA connection of client */ static int smc_connect_rdma(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, @@ -759,11 +853,18 @@ static int smc_connect_rdma(struct smc_sock *smc, { int i, reason_code = 0; struct smc_link *link; + u8 *eid = NULL; ini->is_smcd = false; - ini->ib_lcl = &aclc->r0.lcl; ini->ib_clcqpn = ntoh24(aclc->r0.qpn); ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK; + memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN); + memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE); + memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN); + + reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini); + if (reason_code) + return reason_code; mutex_lock(&smc_client_lgr_pending); reason_code = smc_conn_create(smc, ini); @@ -785,8 +886,9 @@ static int smc_connect_rdma(struct smc_sock *smc, if (l->peer_qpn == ntoh24(aclc->r0.qpn) && !memcmp(l->peer_gid, &aclc->r0.lcl.gid, SMC_GID_SIZE) && - !memcmp(l->peer_mac, &aclc->r0.lcl.mac, - sizeof(l->peer_mac))) { + (aclc->hdr.version > SMC_V1 || + !memcmp(l->peer_mac, &aclc->r0.lcl.mac, + sizeof(l->peer_mac)))) { link = l; break; } @@ -805,7 +907,7 @@ static int smc_connect_rdma(struct smc_sock *smc, } if (ini->first_contact_local) - smc_link_save_peer_info(link, aclc); + smc_link_save_peer_info(link, aclc, ini); if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) { reason_code = SMC_CLC_DECL_ERR_RTOK; @@ -828,8 +930,18 @@ static int smc_connect_rdma(struct smc_sock *smc, } smc_rmb_sync_sg_for_device(&smc->conn); + if (aclc->hdr.version > SMC_V1) { + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)aclc; + + eid = clc_v2->r1.eid; + if (ini->first_contact_local) + smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist, + link->smcibdev, link->gid); + } + reason_code = smc_clc_send_confirm(smc, ini->first_contact_local, - SMC_V1); + aclc->hdr.version, eid, ini); if (reason_code) goto connect_abort; @@ -869,7 +981,7 @@ smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc, int i; for (i = 0; i < ini->ism_offered_cnt + 1; i++) { - if (ini->ism_chid[i] == ntohs(aclc->chid)) { + if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) { ini->ism_selected = i; return 0; } @@ -883,6 +995,7 @@ static int smc_connect_ism(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { + u8 *eid = NULL; int rc = 0; ini->is_smcd = true; @@ -918,8 +1031,15 @@ static int smc_connect_ism(struct smc_sock *smc, smc_rx_init(smc); smc_tx_init(smc); + if (aclc->hdr.version > SMC_V1) { + struct smc_clc_msg_accept_confirm_v2 *clc_v2 = + (struct smc_clc_msg_accept_confirm_v2 *)aclc; + + eid = clc_v2->d1.eid; + } + rc = smc_clc_send_confirm(smc, ini->first_contact_local, - aclc->hdr.version); + aclc->hdr.version, eid, NULL); if (rc) goto connect_abort; mutex_unlock(&smc_server_lgr_pending); @@ -942,17 +1062,24 @@ connect_abort: static int smc_connect_check_aclc(struct smc_init_info *ini, struct smc_clc_msg_accept_confirm *aclc) { - if ((aclc->hdr.typev1 == SMC_TYPE_R && - !smcr_indicated(ini->smc_type_v1)) || - (aclc->hdr.typev1 == SMC_TYPE_D && - ((!smcd_indicated(ini->smc_type_v1) && - !smcd_indicated(ini->smc_type_v2)) || - (aclc->hdr.version == SMC_V1 && - !smcd_indicated(ini->smc_type_v1)) || - (aclc->hdr.version == SMC_V2 && - !smcd_indicated(ini->smc_type_v2))))) + if (aclc->hdr.typev1 != SMC_TYPE_R && + aclc->hdr.typev1 != SMC_TYPE_D) return SMC_CLC_DECL_MODEUNSUPP; + if (aclc->hdr.version >= SMC_V2) { + if ((aclc->hdr.typev1 == SMC_TYPE_R && + !smcr_indicated(ini->smc_type_v2)) || + (aclc->hdr.typev1 == SMC_TYPE_D && + !smcd_indicated(ini->smc_type_v2))) + return SMC_CLC_DECL_MODEUNSUPP; + } else { + if ((aclc->hdr.typev1 == SMC_TYPE_R && + !smcr_indicated(ini->smc_type_v1)) || + (aclc->hdr.typev1 == SMC_TYPE_D && + !smcd_indicated(ini->smc_type_v1))) + return SMC_CLC_DECL_MODEUNSUPP; + } + return 0; } @@ -983,14 +1110,15 @@ static int __smc_connect(struct smc_sock *smc) return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM, version); - ini->smcd_version = SMC_V1; - ini->smcd_version |= smc_ism_is_v2_capable() ? SMC_V2 : 0; + ini->smcd_version = SMC_V1 | SMC_V2; + ini->smcr_version = SMC_V1 | SMC_V2; ini->smc_type_v1 = SMC_TYPE_B; - ini->smc_type_v2 = smc_ism_is_v2_capable() ? SMC_TYPE_D : SMC_TYPE_N; + ini->smc_type_v2 = SMC_TYPE_B; /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(smc->clcsock, ini)) { ini->smcd_version &= ~SMC_V1; + ini->smcr_version = 0; ini->smc_type_v1 = SMC_TYPE_N; if (!ini->smcd_version) { rc = SMC_CLC_DECL_GETVLANERR; @@ -1018,15 +1146,17 @@ static int __smc_connect(struct smc_sock *smc) /* check if smc modes and versions of CLC proposal and accept match */ rc = smc_connect_check_aclc(ini, aclc); version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2; - ini->smcd_version = version; if (rc) goto vlan_cleanup; /* depending on previous steps, connect using rdma or ism */ - if (aclc->hdr.typev1 == SMC_TYPE_R) + if (aclc->hdr.typev1 == SMC_TYPE_R) { + ini->smcr_version = version; rc = smc_connect_rdma(smc, aclc, ini); - else if (aclc->hdr.typev1 == SMC_TYPE_D) + } else if (aclc->hdr.typev1 == SMC_TYPE_D) { + ini->smcd_version = version; rc = smc_connect_ism(smc, aclc, ini); + } if (rc) goto vlan_cleanup; @@ -1057,7 +1187,7 @@ static void smc_connect_work(struct work_struct *work) if (smc->clcsock->sk->sk_err) { smc->sk.sk_err = smc->clcsock->sk->sk_err; } else if ((1 << smc->clcsock->sk->sk_state) & - (TCPF_SYN_SENT | TCP_SYN_RECV)) { + (TCPF_SYN_SENT | TCPF_SYN_RECV)) { rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo); if ((rc == -EPIPE) && ((1 << smc->clcsock->sk->sk_state) & @@ -1307,7 +1437,7 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc) smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); /* initial contact - try to establish second link */ - smc_llc_srv_add_link(link); + smc_llc_srv_add_link(link, NULL); return 0; } @@ -1387,33 +1517,48 @@ static int smc_listen_v2_check(struct smc_sock *new_smc, ini->smc_type_v1 = pclc->hdr.typev1; ini->smc_type_v2 = pclc->hdr.typev2; - ini->smcd_version = ini->smc_type_v1 != SMC_TYPE_N ? SMC_V1 : 0; - if (pclc->hdr.version > SMC_V1) - ini->smcd_version |= - ini->smc_type_v2 != SMC_TYPE_N ? SMC_V2 : 0; - if (!(ini->smcd_version & SMC_V2)) { + ini->smcd_version = smcd_indicated(ini->smc_type_v1) ? SMC_V1 : 0; + ini->smcr_version = smcr_indicated(ini->smc_type_v1) ? SMC_V1 : 0; + if (pclc->hdr.version > SMC_V1) { + if (smcd_indicated(ini->smc_type_v2)) + ini->smcd_version |= SMC_V2; + if (smcr_indicated(ini->smc_type_v2)) + ini->smcr_version |= SMC_V2; + } + if (!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) { rc = SMC_CLC_DECL_PEERNOSMC; goto out; } - if (!smc_ism_is_v2_capable()) { - ini->smcd_version &= ~SMC_V2; - rc = SMC_CLC_DECL_NOISM2SUPP; - goto out; - } pclc_v2_ext = smc_get_clc_v2_ext(pclc); if (!pclc_v2_ext) { ini->smcd_version &= ~SMC_V2; + ini->smcr_version &= ~SMC_V2; rc = SMC_CLC_DECL_NOV2EXT; goto out; } pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext); - if (!pclc_smcd_v2_ext) { - ini->smcd_version &= ~SMC_V2; - rc = SMC_CLC_DECL_NOV2DEXT; + if (ini->smcd_version & SMC_V2) { + if (!smc_ism_is_v2_capable()) { + ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOISM2SUPP; + } else if (!pclc_smcd_v2_ext) { + ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOV2DEXT; + } else if (!pclc_v2_ext->hdr.eid_cnt && + !pclc_v2_ext->hdr.flag.seid) { + ini->smcd_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOUEID; + } + } + if (ini->smcr_version & SMC_V2) { + if (!pclc_v2_ext->hdr.eid_cnt) { + ini->smcr_version &= ~SMC_V2; + rc = SMC_CLC_DECL_NOUEID; + } } out: - if (!ini->smcd_version) + if (!ini->smcd_version && !ini->smcr_version) return rc; return 0; @@ -1533,11 +1678,6 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, pclc_smcd = smc_get_clc_msg_smcd(pclc); smc_v2_ext = smc_get_clc_v2_ext(pclc); smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); - if (!smcd_v2_ext || - !smc_v2_ext->hdr.flag.seid) { /* no system EID support for SMCD */ - smc_find_ism_store_rc(SMC_CLC_DECL_NOSEID, ini); - goto not_found; - } mutex_lock(&smcd_dev_list.mutex); if (pclc_smcd->ism.chid) @@ -1555,14 +1695,16 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, } mutex_unlock(&smcd_dev_list.mutex); - if (ini->ism_dev[0]) { - smc_ism_get_system_eid(ini->ism_dev[0], &eid); - if (memcmp(eid, smcd_v2_ext->system_eid, SMC_MAX_EID_LEN)) - goto not_found; - } else { + if (!ini->ism_dev[0]) { + smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini); goto not_found; } + smc_ism_get_system_eid(&eid); + if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, + smcd_v2_ext->system_eid, eid)) + goto not_found; + /* separate - outside the smcd_dev_list.lock */ smcd_version = ini->smcd_version; for (i = 0; i < matches; i++) { @@ -1579,6 +1721,7 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, } /* no V2 ISM device could be initialized */ ini->smcd_version = smcd_version; /* restore original value */ + ini->negotiated_eid[0] = 0; not_found: ini->smcd_version &= ~SMC_V2; @@ -1608,6 +1751,7 @@ static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, not_found: smc_find_ism_store_rc(rc, ini); + ini->smcd_version &= ~SMC_V1; ini->ism_dev[0] = NULL; ini->is_smcd = false; } @@ -1626,24 +1770,69 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first) return 0; } +static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_init_info *ini) +{ + struct smc_clc_v2_extension *smc_v2_ext; + u8 smcr_version; + int rc; + + if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2)) + goto not_found; + + smc_v2_ext = smc_get_clc_v2_ext(pclc); + if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) + goto not_found; + + /* prepare RDMA check */ + memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN); + memcpy(ini->peer_gid, smc_v2_ext->roce, SMC_GID_SIZE); + memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN); + ini->check_smcrv2 = true; + ini->smcrv2.clc_sk = new_smc->clcsock->sk; + ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr; + ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce); + rc = smc_find_rdma_device(new_smc, ini); + if (rc) { + smc_find_ism_store_rc(rc, ini); + goto not_found; + } + if (!ini->smcrv2.uses_gateway) + memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN); + + smcr_version = ini->smcr_version; + ini->smcr_version = SMC_V2; + rc = smc_listen_rdma_init(new_smc, ini); + if (!rc) + rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local); + if (!rc) + return; + ini->smcr_version = smcr_version; + smc_find_ism_store_rc(rc, ini); + +not_found: + ini->smcr_version &= ~SMC_V2; + ini->check_smcrv2 = false; +} + static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { int rc; - if (!smcr_indicated(ini->smc_type_v1)) + if (!(ini->smcr_version & SMC_V1) || !smcr_indicated(ini->smc_type_v1)) return SMC_CLC_DECL_NOSMCDEV; /* prepare RDMA check */ - ini->ib_lcl = &pclc->lcl; + memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN); + memcpy(ini->peer_gid, pclc->lcl.gid, SMC_GID_SIZE); + memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN); rc = smc_find_rdma_device(new_smc, ini); if (rc) { /* no RDMA device found */ - if (ini->smc_type_v1 == SMC_TYPE_B) - /* neither ISM nor RDMA device found */ - rc = SMC_CLC_DECL_NOSMCDEV; - return rc; + return SMC_CLC_DECL_NOSMCDEV; } rc = smc_listen_rdma_init(new_smc, ini); if (rc) @@ -1656,51 +1845,60 @@ static int smc_listen_find_device(struct smc_sock *new_smc, struct smc_clc_msg_proposal *pclc, struct smc_init_info *ini) { - int rc; + int prfx_rc; /* check for ISM device matching V2 proposed device */ smc_find_ism_v2_device_serv(new_smc, pclc, ini); if (ini->ism_dev[0]) return 0; - if (!(ini->smcd_version & SMC_V1)) - return ini->rc ?: SMC_CLC_DECL_NOSMCD2DEV; - - /* check for matching IP prefix and subnet length */ - rc = smc_listen_prfx_check(new_smc, pclc); - if (rc) - return ini->rc ?: rc; + /* check for matching IP prefix and subnet length (V1) */ + prfx_rc = smc_listen_prfx_check(new_smc, pclc); + if (prfx_rc) + smc_find_ism_store_rc(prfx_rc, ini); /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) return ini->rc ?: SMC_CLC_DECL_GETVLANERR; /* check for ISM device matching V1 proposed device */ - smc_find_ism_v1_device_serv(new_smc, pclc, ini); + if (!prfx_rc) + smc_find_ism_v1_device_serv(new_smc, pclc, ini); if (ini->ism_dev[0]) return 0; - if (pclc->hdr.typev1 == SMC_TYPE_D) + if (!smcr_indicated(pclc->hdr.typev1) && + !smcr_indicated(pclc->hdr.typev2)) /* skip RDMA and decline */ return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV; - /* check if RDMA is available */ - rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini); - smc_find_ism_store_rc(rc, ini); + /* check if RDMA V2 is available */ + smc_find_rdma_v2_device_serv(new_smc, pclc, ini); + if (ini->smcrv2.ib_dev_v2) + return 0; - return (!rc) ? 0 : ini->rc; + /* check if RDMA V1 is available */ + if (!prfx_rc) { + int rc; + + rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini); + smc_find_ism_store_rc(rc, ini); + return (!rc) ? 0 : ini->rc; + } + return SMC_CLC_DECL_NOSMCDEV; } /* listen worker: finish RDMA setup */ static int smc_listen_rdma_finish(struct smc_sock *new_smc, struct smc_clc_msg_accept_confirm *cclc, - bool local_first) + bool local_first, + struct smc_init_info *ini) { struct smc_link *link = new_smc->conn.lnk; int reason_code = 0; if (local_first) - smc_link_save_peer_info(link, cclc); + smc_link_save_peer_info(link, cclc, ini); if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) return SMC_CLC_DECL_ERR_RTOK; @@ -1721,12 +1919,13 @@ static void smc_listen_work(struct work_struct *work) { struct smc_sock *new_smc = container_of(work, struct smc_sock, smc_listen_work); - u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1; struct socket *newclcsock = new_smc->clcsock; struct smc_clc_msg_accept_confirm *cclc; struct smc_clc_msg_proposal_area *buf; struct smc_clc_msg_proposal *pclc; struct smc_init_info *ini = NULL; + u8 proposal_version = SMC_V1; + u8 accept_version; int rc = 0; if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN) @@ -1757,7 +1956,9 @@ static void smc_listen_work(struct work_struct *work) SMC_CLC_PROPOSAL, CLC_WAIT_TIME); if (rc) goto out_decl; - version = pclc->hdr.version == SMC_V1 ? SMC_V1 : version; + + if (pclc->hdr.version > SMC_V1) + proposal_version = SMC_V2; /* IPSec connections opt out of SMC optimizations */ if (using_ipsec(new_smc)) { @@ -1787,8 +1988,9 @@ static void smc_listen_work(struct work_struct *work) goto out_unlock; /* send SMC Accept CLC message */ + accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version; rc = smc_clc_send_accept(new_smc, ini->first_contact_local, - ini->smcd_version == SMC_V2 ? SMC_V2 : SMC_V1); + accept_version, ini->negotiated_eid); if (rc) goto out_unlock; @@ -1810,7 +2012,7 @@ static void smc_listen_work(struct work_struct *work) /* finish worker */ if (!ini->is_smcd) { rc = smc_listen_rdma_finish(new_smc, cclc, - ini->first_contact_local); + ini->first_contact_local, ini); if (rc) goto out_unlock; mutex_unlock(&smc_server_lgr_pending); @@ -1824,7 +2026,7 @@ out_unlock: mutex_unlock(&smc_server_lgr_pending); out_decl: smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0, - version); + proposal_version); out_free: kfree(ini); kfree(buf); @@ -2662,6 +2864,7 @@ static void __exit smc_exit(void) proto_unregister(&smc_proto); smc_pnet_exit(); smc_nl_exit(); + smc_clc_exit(); unregister_pernet_subsys(&smc_net_stat_ops); unregister_pernet_subsys(&smc_net_ops); rcu_barrier(); diff --git a/net/smc/smc.h b/net/smc/smc.h index d65e15f0c944..f4286ca1f228 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -29,9 +29,6 @@ * devices */ -#define SMC_MAX_HOSTNAME_LEN 32 -#define SMC_MAX_EID_LEN 32 - extern struct proto smc_proto; extern struct proto smc_proto6; @@ -59,7 +56,20 @@ enum smc_state { /* possible states of an SMC socket */ struct smc_link_group; struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */ - u8 type; + union { + u8 type; +#if defined(__BIG_ENDIAN_BITFIELD) + struct { + u8 llc_version:4, + llc_type:4; + }; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + struct { + u8 llc_type:4, + llc_version:4; + }; +#endif + }; } __aligned(1); struct smc_cdc_conn_state_flags { @@ -289,7 +299,12 @@ static inline bool using_ipsec(struct smc_sock *smc) } #endif +struct smc_gidlist; + struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock); void smc_close_non_accepted(struct sock *sk); +void smc_fill_gid_list(struct smc_link_group *lgr, + struct smc_gidlist *gidlist, + struct smc_ib_device *known_dev, u8 *known_gid); #endif /* __SMC_H */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index f23f558054a7..99acd337ba90 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -150,9 +150,11 @@ static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) again: link = conn->lnk; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend); if (rc) - return rc; + goto put_out; spin_lock_bh(&conn->send_lock); if (link != conn->lnk) { @@ -160,6 +162,7 @@ again: spin_unlock_bh(&conn->send_lock); smc_wr_tx_put_slot(link, (struct smc_wr_tx_pend_priv *)pend); + smc_wr_tx_link_put(link); if (again) return -ENOLINK; again = true; @@ -167,6 +170,8 @@ again: } rc = smc_cdc_msg_send(conn, wr_buf, pend); spin_unlock_bh(&conn->send_lock); +put_out: + smc_wr_tx_link_put(link); return rc; } diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index e286dafd6e88..8409ab71a5e4 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -26,10 +26,12 @@ #include "smc_clc.h" #include "smc_ib.h" #include "smc_ism.h" +#include "smc_netlink.h" #define SMCR_CLC_ACCEPT_CONFIRM_LEN 68 #define SMCD_CLC_ACCEPT_CONFIRM_LEN 48 #define SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 78 +#define SMCR_CLC_ACCEPT_CONFIRM_LEN_V2 108 #define SMC_CLC_RECV_BUF_LEN 100 /* eye catcher "SMCR" EBCDIC for CLC messages */ @@ -39,6 +41,297 @@ static const char SMCD_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xc4'}; static u8 smc_hostname[SMC_MAX_HOSTNAME_LEN]; +struct smc_clc_eid_table { + rwlock_t lock; + struct list_head list; + u8 ueid_cnt; + u8 seid_enabled; +}; + +static struct smc_clc_eid_table smc_clc_eid_table; + +struct smc_clc_eid_entry { + struct list_head list; + u8 eid[SMC_MAX_EID_LEN]; +}; + +/* The size of a user EID is 32 characters. + * Valid characters should be (single-byte character set) A-Z, 0-9, '.' and '-'. + * Blanks should only be used to pad to the expected size. + * First character must be alphanumeric. + */ +static bool smc_clc_ueid_valid(char *ueid) +{ + char *end = ueid + SMC_MAX_EID_LEN; + + while (--end >= ueid && isspace(*end)) + ; + if (end < ueid) + return false; + if (!isalnum(*ueid) || islower(*ueid)) + return false; + while (ueid <= end) { + if ((!isalnum(*ueid) || islower(*ueid)) && *ueid != '.' && + *ueid != '-') + return false; + ueid++; + } + return true; +} + +static int smc_clc_ueid_add(char *ueid) +{ + struct smc_clc_eid_entry *new_ueid, *tmp_ueid; + int rc; + + if (!smc_clc_ueid_valid(ueid)) + return -EINVAL; + + /* add a new ueid entry to the ueid table if there isn't one */ + new_ueid = kzalloc(sizeof(*new_ueid), GFP_KERNEL); + if (!new_ueid) + return -ENOMEM; + memcpy(new_ueid->eid, ueid, SMC_MAX_EID_LEN); + + write_lock(&smc_clc_eid_table.lock); + if (smc_clc_eid_table.ueid_cnt >= SMC_MAX_UEID) { + rc = -ERANGE; + goto err_out; + } + list_for_each_entry(tmp_ueid, &smc_clc_eid_table.list, list) { + if (!memcmp(tmp_ueid->eid, ueid, SMC_MAX_EID_LEN)) { + rc = -EEXIST; + goto err_out; + } + } + list_add_tail(&new_ueid->list, &smc_clc_eid_table.list); + smc_clc_eid_table.ueid_cnt++; + write_unlock(&smc_clc_eid_table.lock); + return 0; + +err_out: + write_unlock(&smc_clc_eid_table.lock); + kfree(new_ueid); + return rc; +} + +int smc_clc_ueid_count(void) +{ + int count; + + read_lock(&smc_clc_eid_table.lock); + count = smc_clc_eid_table.ueid_cnt; + read_unlock(&smc_clc_eid_table.lock); + + return count; +} + +int smc_nl_add_ueid(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *nla_ueid = info->attrs[SMC_NLA_EID_TABLE_ENTRY]; + char *ueid; + + if (!nla_ueid || nla_len(nla_ueid) != SMC_MAX_EID_LEN + 1) + return -EINVAL; + ueid = (char *)nla_data(nla_ueid); + + return smc_clc_ueid_add(ueid); +} + +/* remove one or all ueid entries from the table */ +static int smc_clc_ueid_remove(char *ueid) +{ + struct smc_clc_eid_entry *lst_ueid, *tmp_ueid; + int rc = -ENOENT; + + /* remove table entry */ + write_lock(&smc_clc_eid_table.lock); + list_for_each_entry_safe(lst_ueid, tmp_ueid, &smc_clc_eid_table.list, + list) { + if (!ueid || !memcmp(lst_ueid->eid, ueid, SMC_MAX_EID_LEN)) { + list_del(&lst_ueid->list); + smc_clc_eid_table.ueid_cnt--; + kfree(lst_ueid); + rc = 0; + } + } + if (!rc && !smc_clc_eid_table.ueid_cnt) { + smc_clc_eid_table.seid_enabled = 1; + rc = -EAGAIN; /* indicate success and enabling of seid */ + } + write_unlock(&smc_clc_eid_table.lock); + return rc; +} + +int smc_nl_remove_ueid(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *nla_ueid = info->attrs[SMC_NLA_EID_TABLE_ENTRY]; + char *ueid; + + if (!nla_ueid || nla_len(nla_ueid) != SMC_MAX_EID_LEN + 1) + return -EINVAL; + ueid = (char *)nla_data(nla_ueid); + + return smc_clc_ueid_remove(ueid); +} + +int smc_nl_flush_ueid(struct sk_buff *skb, struct genl_info *info) +{ + smc_clc_ueid_remove(NULL); + return 0; +} + +static int smc_nl_ueid_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq, + u32 flags, char *ueid) +{ + char ueid_str[SMC_MAX_EID_LEN + 1]; + void *hdr; + + hdr = genlmsg_put(skb, portid, seq, &smc_gen_nl_family, + flags, SMC_NETLINK_DUMP_UEID); + if (!hdr) + return -ENOMEM; + snprintf(ueid_str, sizeof(ueid_str), "%s", ueid); + if (nla_put_string(skb, SMC_NLA_EID_TABLE_ENTRY, ueid_str)) { + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; + } + genlmsg_end(skb, hdr); + return 0; +} + +static int _smc_nl_ueid_dump(struct sk_buff *skb, u32 portid, u32 seq, + int start_idx) +{ + struct smc_clc_eid_entry *lst_ueid; + int idx = 0; + + read_lock(&smc_clc_eid_table.lock); + list_for_each_entry(lst_ueid, &smc_clc_eid_table.list, list) { + if (idx++ < start_idx) + continue; + if (smc_nl_ueid_dumpinfo(skb, portid, seq, NLM_F_MULTI, + lst_ueid->eid)) { + --idx; + break; + } + } + read_unlock(&smc_clc_eid_table.lock); + return idx; +} + +int smc_nl_dump_ueid(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + int idx; + + idx = _smc_nl_ueid_dump(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, cb_ctx->pos[0]); + + cb_ctx->pos[0] = idx; + return skb->len; +} + +int smc_nl_dump_seid(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); + char seid_str[SMC_MAX_EID_LEN + 1]; + u8 seid_enabled; + void *hdr; + u8 *seid; + + if (cb_ctx->pos[0]) + return skb->len; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &smc_gen_nl_family, NLM_F_MULTI, + SMC_NETLINK_DUMP_SEID); + if (!hdr) + return -ENOMEM; + if (!smc_ism_is_v2_capable()) + goto end; + + smc_ism_get_system_eid(&seid); + snprintf(seid_str, sizeof(seid_str), "%s", seid); + if (nla_put_string(skb, SMC_NLA_SEID_ENTRY, seid_str)) + goto err; + read_lock(&smc_clc_eid_table.lock); + seid_enabled = smc_clc_eid_table.seid_enabled; + read_unlock(&smc_clc_eid_table.lock); + if (nla_put_u8(skb, SMC_NLA_SEID_ENABLED, seid_enabled)) + goto err; +end: + genlmsg_end(skb, hdr); + cb_ctx->pos[0]++; + return skb->len; +err: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +int smc_nl_enable_seid(struct sk_buff *skb, struct genl_info *info) +{ + write_lock(&smc_clc_eid_table.lock); + smc_clc_eid_table.seid_enabled = 1; + write_unlock(&smc_clc_eid_table.lock); + return 0; +} + +int smc_nl_disable_seid(struct sk_buff *skb, struct genl_info *info) +{ + int rc = 0; + + write_lock(&smc_clc_eid_table.lock); + if (!smc_clc_eid_table.ueid_cnt) + rc = -ENOENT; + else + smc_clc_eid_table.seid_enabled = 0; + write_unlock(&smc_clc_eid_table.lock); + return rc; +} + +static bool _smc_clc_match_ueid(u8 *peer_ueid) +{ + struct smc_clc_eid_entry *tmp_ueid; + + list_for_each_entry(tmp_ueid, &smc_clc_eid_table.list, list) { + if (!memcmp(tmp_ueid->eid, peer_ueid, SMC_MAX_EID_LEN)) + return true; + } + return false; +} + +bool smc_clc_match_eid(u8 *negotiated_eid, + struct smc_clc_v2_extension *smc_v2_ext, + u8 *peer_eid, u8 *local_eid) +{ + bool match = false; + int i; + + negotiated_eid[0] = 0; + read_lock(&smc_clc_eid_table.lock); + if (peer_eid && local_eid && + smc_clc_eid_table.seid_enabled && + smc_v2_ext->hdr.flag.seid && + !memcmp(peer_eid, local_eid, SMC_MAX_EID_LEN)) { + memcpy(negotiated_eid, peer_eid, SMC_MAX_EID_LEN); + match = true; + goto out; + } + + for (i = 0; i < smc_v2_ext->hdr.eid_cnt; i++) { + if (_smc_clc_match_ueid(smc_v2_ext->user_eids[i])) { + memcpy(negotiated_eid, smc_v2_ext->user_eids[i], + SMC_MAX_EID_LEN); + match = true; + goto out; + } + } +out: + read_unlock(&smc_clc_eid_table.lock); + return match; +} + /* check arriving CLC proposal */ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) { @@ -100,6 +393,27 @@ smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm_v2 *clc_v2) (ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 + sizeof(struct smc_clc_first_contact_ext))) return false; + if (hdr->typev1 == SMC_TYPE_R && + ntohs(hdr->length) < SMCR_CLC_ACCEPT_CONFIRM_LEN_V2) + return false; + } + return true; +} + +/* check arriving CLC decline */ +static bool +smc_clc_msg_decl_valid(struct smc_clc_msg_decline *dclc) +{ + struct smc_clc_msg_hdr *hdr = &dclc->hdr; + + if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D) + return false; + if (hdr->version == SMC_V1) { + if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline)) + return false; + } else { + if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline_v2)) + return false; } return true; } @@ -145,9 +459,9 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl) break; case SMC_CLC_DECLINE: dclc = (struct smc_clc_msg_decline *)clcm; - if (ntohs(dclc->hdr.length) != sizeof(*dclc)) + if (!smc_clc_msg_decl_valid(dclc)) return false; - trl = &dclc->trl; + check_trl = false; break; default: return false; @@ -230,7 +544,8 @@ static int smc_clc_prfx_set(struct socket *clcsock, goto out_rel; } /* get address to which the internal TCP socket is bound */ - kernel_getsockname(clcsock, (struct sockaddr *)&addrs); + if (kernel_getsockname(clcsock, (struct sockaddr *)&addrs) < 0) + goto out_rel; /* analyze IP specific data of net_device belonging to TCP socket */ addr6 = (struct sockaddr_in6 *)&addrs; rcu_read_lock(); @@ -445,15 +760,16 @@ out: /* send CLC DECLINE message across internal TCP socket */ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) { - struct smc_clc_msg_decline dclc; + struct smc_clc_msg_decline *dclc_v1; + struct smc_clc_msg_decline_v2 dclc; struct msghdr msg; + int len, send_len; struct kvec vec; - int len; + dclc_v1 = (struct smc_clc_msg_decline *)&dclc; memset(&dclc, 0, sizeof(dclc)); memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); dclc.hdr.type = SMC_CLC_DECLINE; - dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); dclc.hdr.version = version; dclc.os_type = version == SMC_V1 ? 0 : SMC_CLC_OS_LINUX; dclc.hdr.typev2 = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? @@ -463,14 +779,22 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version) memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); dclc.peer_diagnosis = htonl(peer_diag_info); - memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + if (version == SMC_V1) { + memcpy(dclc_v1->trl.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + send_len = sizeof(*dclc_v1); + } else { + memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, + sizeof(SMC_EYECATCHER)); + send_len = sizeof(dclc); + } + dclc.hdr.length = htons(send_len); memset(&msg, 0, sizeof(msg)); vec.iov_base = &dclc; - vec.iov_len = sizeof(struct smc_clc_msg_decline); - len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, - sizeof(struct smc_clc_msg_decline)); - if (len < 0 || len < sizeof(struct smc_clc_msg_decline)) + vec.iov_len = send_len; + len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, send_len); + if (len < 0 || len < send_len) len = -EPROTO; return len > 0 ? 0 : len; } @@ -550,9 +874,10 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) if (ini->smc_type_v2 == SMC_TYPE_N) { pclc_smcd->v2_ext_offset = 0; } else { + struct smc_clc_eid_entry *ueident; u16 v2_ext_offset; - u8 *eid = NULL; + v2_ext->hdr.flag.release = SMC_RELEASE; v2_ext_offset = sizeof(*pclc_smcd) - offsetofend(struct smc_clc_msg_smcd, v2_ext_offset); if (ini->smc_type_v1 != SMC_TYPE_N) @@ -560,21 +885,31 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) pclc_prfx->ipv6_prefixes_cnt * sizeof(ipv6_prfx[0]); pclc_smcd->v2_ext_offset = htons(v2_ext_offset); - v2_ext->hdr.eid_cnt = 0; + plen += sizeof(*v2_ext); + + read_lock(&smc_clc_eid_table.lock); + v2_ext->hdr.eid_cnt = smc_clc_eid_table.ueid_cnt; + plen += smc_clc_eid_table.ueid_cnt * SMC_MAX_EID_LEN; + i = 0; + list_for_each_entry(ueident, &smc_clc_eid_table.list, list) { + memcpy(v2_ext->user_eids[i++], ueident->eid, + sizeof(ueident->eid)); + } + read_unlock(&smc_clc_eid_table.lock); + } + if (smcd_indicated(ini->smc_type_v2)) { + u8 *eid = NULL; + + v2_ext->hdr.flag.seid = smc_clc_eid_table.seid_enabled; v2_ext->hdr.ism_gid_cnt = ini->ism_offered_cnt; - v2_ext->hdr.flag.release = SMC_RELEASE; - v2_ext->hdr.flag.seid = 1; v2_ext->hdr.smcd_v2_ext_offset = htons(sizeof(*v2_ext) - offsetofend(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset) + v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN); - if (ini->ism_dev[0]) - smc_ism_get_system_eid(ini->ism_dev[0], &eid); - else - smc_ism_get_system_eid(ini->ism_dev[1], &eid); - if (eid) + smc_ism_get_system_eid(&eid); + if (eid && v2_ext->hdr.flag.seid) memcpy(smcd_v2_ext->system_eid, eid, SMC_MAX_EID_LEN); - plen += sizeof(*v2_ext) + sizeof(*smcd_v2_ext); + plen += sizeof(*smcd_v2_ext); if (ini->ism_offered_cnt) { for (i = 1; i <= ini->ism_offered_cnt; i++) { gidchids[i - 1].gid = @@ -586,6 +921,9 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) sizeof(struct smc_clc_smcd_gid_chid); } } + if (smcr_indicated(ini->smc_type_v2)) + memcpy(v2_ext->roce, ini->smcrv2.ib_gid_v2, SMC_GID_SIZE); + pclc_base->hdr.length = htons(plen); memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); @@ -607,13 +945,16 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) } if (ini->smc_type_v2 != SMC_TYPE_N) { vec[i].iov_base = v2_ext; - vec[i++].iov_len = sizeof(*v2_ext); - vec[i].iov_base = smcd_v2_ext; - vec[i++].iov_len = sizeof(*smcd_v2_ext); - if (ini->ism_offered_cnt) { - vec[i].iov_base = gidchids; - vec[i++].iov_len = ini->ism_offered_cnt * + vec[i++].iov_len = sizeof(*v2_ext) + + (v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN); + if (smcd_indicated(ini->smc_type_v2)) { + vec[i].iov_base = smcd_v2_ext; + vec[i++].iov_len = sizeof(*smcd_v2_ext); + if (ini->ism_offered_cnt) { + vec[i].iov_base = gidchids; + vec[i++].iov_len = ini->ism_offered_cnt * sizeof(struct smc_clc_smcd_gid_chid); + } } } vec[i].iov_base = trl; @@ -635,13 +976,15 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) /* build and send CLC CONFIRM / ACCEPT message */ static int smc_clc_send_confirm_accept(struct smc_sock *smc, struct smc_clc_msg_accept_confirm_v2 *clc_v2, - int first_contact, u8 version) + int first_contact, u8 version, + u8 *eid, struct smc_init_info *ini) { struct smc_connection *conn = &smc->conn; struct smc_clc_msg_accept_confirm *clc; struct smc_clc_first_contact_ext fce; + struct smc_clc_fce_gid_ext gle; struct smc_clc_msg_trail trl; - struct kvec vec[3]; + struct kvec vec[5]; struct msghdr msg; int i, len; @@ -663,12 +1006,10 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, if (version == SMC_V1) { clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN); } else { - u8 *eid = NULL; - - clc_v2->chid = htons(smc_ism_get_chid(conn->lgr->smcd)); - smc_ism_get_system_eid(conn->lgr->smcd, &eid); - if (eid) - memcpy(clc_v2->eid, eid, SMC_MAX_EID_LEN); + clc_v2->d1.chid = + htons(smc_ism_get_chid(conn->lgr->smcd)); + if (eid && eid[0]) + memcpy(clc_v2->d1.eid, eid, SMC_MAX_EID_LEN); len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2; if (first_contact) smc_clc_fill_fce(&fce, &len); @@ -707,6 +1048,26 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, clc->r0.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(clc->r0.psn, link->psn_initial); + if (version == SMC_V1) { + clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN); + } else { + if (eid && eid[0]) + memcpy(clc_v2->r1.eid, eid, SMC_MAX_EID_LEN); + len = SMCR_CLC_ACCEPT_CONFIRM_LEN_V2; + if (first_contact) { + smc_clc_fill_fce(&fce, &len); + fce.v2_direct = !link->lgr->uses_gateway; + memset(&gle, 0, sizeof(gle)); + if (ini && clc->hdr.type == SMC_CLC_CONFIRM) { + gle.gid_cnt = ini->smcrv2.gidlist.len; + len += sizeof(gle); + len += gle.gid_cnt * sizeof(gle.gid[0]); + } else { + len += sizeof(gle.reserved); + } + } + clc_v2->hdr.length = htons(len); + } memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); } @@ -714,7 +1075,10 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, i = 0; vec[i].iov_base = clc_v2; if (version > SMC_V1) - vec[i++].iov_len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 - sizeof(trl); + vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ? + SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 : + SMCR_CLC_ACCEPT_CONFIRM_LEN_V2) - + sizeof(trl); else vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ? SMCD_CLC_ACCEPT_CONFIRM_LEN : @@ -723,6 +1087,18 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, if (version > SMC_V1 && first_contact) { vec[i].iov_base = &fce; vec[i++].iov_len = sizeof(fce); + if (!conn->lgr->is_smcd) { + if (clc->hdr.type == SMC_CLC_CONFIRM) { + vec[i].iov_base = &gle; + vec[i++].iov_len = sizeof(gle); + vec[i].iov_base = &ini->smcrv2.gidlist.list; + vec[i++].iov_len = gle.gid_cnt * + sizeof(gle.gid[0]); + } else { + vec[i].iov_base = &gle.reserved; + vec[i++].iov_len = sizeof(gle.reserved); + } + } } vec[i].iov_base = &trl; vec[i++].iov_len = sizeof(trl); @@ -732,7 +1108,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc, /* send CLC CONFIRM message across internal TCP socket */ int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, - u8 version) + u8 version, u8 *eid, struct smc_init_info *ini) { struct smc_clc_msg_accept_confirm_v2 cclc_v2; int reason_code = 0; @@ -742,7 +1118,7 @@ int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, memset(&cclc_v2, 0, sizeof(cclc_v2)); cclc_v2.hdr.type = SMC_CLC_CONFIRM; len = smc_clc_send_confirm_accept(smc, &cclc_v2, clnt_first_contact, - version); + version, eid, ini); if (len < ntohs(cclc_v2.hdr.length)) { if (len >= 0) { reason_code = -ENETUNREACH; @@ -757,7 +1133,7 @@ int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, /* send CLC ACCEPT message across internal TCP socket */ int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact, - u8 version) + u8 version, u8 *negotiated_eid) { struct smc_clc_msg_accept_confirm_v2 aclc_v2; int len; @@ -765,7 +1141,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact, memset(&aclc_v2, 0, sizeof(aclc_v2)); aclc_v2.hdr.type = SMC_CLC_ACCEPT; len = smc_clc_send_confirm_accept(new_smc, &aclc_v2, srv_first_contact, - version); + version, negotiated_eid, NULL); if (len < ntohs(aclc_v2.hdr.length)) len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err; @@ -785,4 +1161,14 @@ void __init smc_clc_init(void) u = utsname(); memcpy(smc_hostname, u->nodename, min_t(size_t, strlen(u->nodename), sizeof(smc_hostname))); + + INIT_LIST_HEAD(&smc_clc_eid_table.list); + rwlock_init(&smc_clc_eid_table.lock); + smc_clc_eid_table.ueid_cnt = 0; + smc_clc_eid_table.seid_enabled = 1; +} + +void smc_clc_exit(void) +{ + smc_clc_ueid_remove(NULL); } diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 32d37f7b70f2..83f02f131fc0 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -14,8 +14,10 @@ #define _SMC_CLC_H #include <rdma/ib_verbs.h> +#include <linux/smc.h> #include "smc.h" +#include "smc_netlink.h" #define SMC_CLC_PROPOSAL 0x01 #define SMC_CLC_ACCEPT 0x02 @@ -42,6 +44,7 @@ #define SMC_CLC_DECL_NOV2DEXT 0x03030005 /* peer sent no clc SMC-Dv2 ext. */ #define SMC_CLC_DECL_NOSEID 0x03030006 /* peer sent no SEID */ #define SMC_CLC_DECL_NOSMCD2DEV 0x03030007 /* no SMC-Dv2 device found */ +#define SMC_CLC_DECL_NOUEID 0x03030008 /* peer sent no UEID */ #define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/ #define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */ #define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */ @@ -52,6 +55,8 @@ #define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */ #define SMC_CLC_DECL_VERSMISMAT 0x030c0000 /* SMC version mismatch */ #define SMC_CLC_DECL_MAX_DMB 0x030d0000 /* SMC-D DMB limit exceeded */ +#define SMC_CLC_DECL_NOROUTE 0x030e0000 /* SMC-Rv2 conn. no route to peer */ +#define SMC_CLC_DECL_NOINDIRECT 0x030f0000 /* SMC-Rv2 conn. indirect mismatch*/ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ @@ -158,6 +163,7 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ } __aligned(4); #define SMC_CLC_MAX_V6_PREFIX 8 +#define SMC_CLC_MAX_UEID 8 struct smc_clc_msg_proposal_area { struct smc_clc_msg_proposal pclc_base; @@ -165,6 +171,7 @@ struct smc_clc_msg_proposal_area { struct smc_clc_msg_proposal_prefix pclc_prfx; struct smc_clc_ipv6_prefix pclc_prfx_ipv6[SMC_CLC_MAX_V6_PREFIX]; struct smc_clc_v2_extension pclc_v2_ext; + u8 user_eids[SMC_CLC_MAX_UEID][SMC_MAX_EID_LEN]; struct smc_clc_smcd_v2_extension pclc_smcd_v2_ext; struct smc_clc_smcd_gid_chid pclc_gidchids[SMC_MAX_ISM_DEVS]; struct smc_clc_msg_trail pclc_trl; @@ -209,11 +216,14 @@ struct smcd_clc_msg_accept_confirm_common { /* SMCD accept/confirm */ #define SMC_CLC_OS_AIX 3 struct smc_clc_first_contact_ext { - u8 reserved1; #if defined(__BIG_ENDIAN_BITFIELD) + u8 v2_direct : 1, + reserved : 7; u8 os_type : 4, release : 4; #elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 7, + v2_direct : 1; u8 release : 4, os_type : 4; #endif @@ -221,6 +231,13 @@ struct smc_clc_first_contact_ext { u8 hostname[SMC_MAX_HOSTNAME_LEN]; }; +struct smc_clc_fce_gid_ext { + u8 reserved[16]; + u8 gid_cnt; + u8 reserved2[3]; + u8 gid[][SMC_GID_SIZE]; +}; + struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ struct smc_clc_msg_hdr hdr; union { @@ -235,13 +252,17 @@ struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ struct smc_clc_msg_accept_confirm_v2 { /* clc accept / confirm message */ struct smc_clc_msg_hdr hdr; union { - struct smcr_clc_msg_accept_confirm r0; /* SMC-R */ + struct { /* SMC-R */ + struct smcr_clc_msg_accept_confirm r0; + u8 eid[SMC_MAX_EID_LEN]; + u8 reserved6[8]; + } r1; struct { /* SMC-D */ struct smcd_clc_msg_accept_confirm_common d0; __be16 chid; u8 eid[SMC_MAX_EID_LEN]; u8 reserved5[8]; - }; + } d1; }; }; @@ -260,6 +281,24 @@ struct smc_clc_msg_decline { /* clc decline message */ struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */ } __aligned(4); +#define SMC_DECL_DIAG_COUNT_V2 4 /* no. of additional peer diagnosis codes */ + +struct smc_clc_msg_decline_v2 { /* clc decline message */ + struct smc_clc_msg_hdr hdr; + u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */ + __be32 peer_diagnosis; /* diagnosis information */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 os_type : 4, + reserved : 4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 4, + os_type : 4; +#endif + u8 reserved2[3]; + __be32 peer_diagnosis_v2[SMC_DECL_DIAG_COUNT_V2]; + struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */ +} __aligned(4); + /* determine start of the prefix area within the proposal message */ static inline struct smc_clc_msg_proposal_prefix * smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) @@ -278,6 +317,17 @@ static inline bool smcd_indicated(int smc_type) return smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B; } +static inline u8 smc_indicated_type(int is_smcd, int is_smcr) +{ + if (is_smcd && is_smcr) + return SMC_TYPE_B; + if (is_smcd) + return SMC_TYPE_D; + if (is_smcr) + return SMC_TYPE_R; + return SMC_TYPE_N; +} + /* get SMC-D info from proposal message */ static inline struct smc_clc_msg_smcd * smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop) @@ -330,10 +380,22 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version); int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini); int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact, - u8 version); + u8 version, u8 *eid, struct smc_init_info *ini); int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact, - u8 version); + u8 version, u8 *negotiated_eid); void smc_clc_init(void) __init; +void smc_clc_exit(void); void smc_clc_get_hostname(u8 **host); +bool smc_clc_match_eid(u8 *negotiated_eid, + struct smc_clc_v2_extension *smc_v2_ext, + u8 *peer_eid, u8 *local_eid); +int smc_clc_ueid_count(void); +int smc_nl_dump_ueid(struct sk_buff *skb, struct netlink_callback *cb); +int smc_nl_add_ueid(struct sk_buff *skb, struct genl_info *info); +int smc_nl_remove_ueid(struct sk_buff *skb, struct genl_info *info); +int smc_nl_flush_ueid(struct sk_buff *skb, struct genl_info *info); +int smc_nl_dump_seid(struct sk_buff *skb, struct netlink_callback *cb); +int smc_nl_enable_seid(struct sk_buff *skb, struct genl_info *info); +int smc_nl_disable_seid(struct sk_buff *skb, struct genl_info *info); #endif diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index c160ff50c053..49b8ba3bb683 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -34,6 +34,7 @@ #include "smc_ism.h" #include "smc_netlink.h" #include "smc_stats.h" +#include "smc_tracepoint.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) @@ -223,7 +224,6 @@ int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); char hostname[SMC_MAX_HOSTNAME_LEN + 1]; char smc_seid[SMC_MAX_EID_LEN + 1]; - struct smcd_dev *smcd_dev; struct nlattr *attrs; u8 *seid = NULL; u8 *host = NULL; @@ -245,6 +245,8 @@ int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) goto errattr; if (nla_put_u8(skb, SMC_NLA_SYS_IS_ISM_V2, smc_ism_is_v2_capable())) goto errattr; + if (nla_put_u8(skb, SMC_NLA_SYS_IS_SMCR_V2, true)) + goto errattr; smc_clc_get_hostname(&host); if (host) { memcpy(hostname, host, SMC_MAX_HOSTNAME_LEN); @@ -252,13 +254,8 @@ int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb) if (nla_put_string(skb, SMC_NLA_SYS_LOCAL_HOST, hostname)) goto errattr; } - mutex_lock(&smcd_dev_list.mutex); - smcd_dev = list_first_entry_or_null(&smcd_dev_list.list, - struct smcd_dev, list); - if (smcd_dev) - smc_ism_get_system_eid(smcd_dev, &seid); - mutex_unlock(&smcd_dev_list.mutex); - if (seid && smc_ism_is_v2_capable()) { + if (smc_ism_is_v2_capable()) { + smc_ism_get_system_eid(&seid); memcpy(smc_seid, seid, SMC_MAX_EID_LEN); smc_seid[SMC_MAX_EID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_SYS_SEID, smc_seid)) @@ -277,12 +274,65 @@ errmsg: return skb->len; } +/* Fill SMC_NLA_LGR_D_V2_COMMON/SMC_NLA_LGR_R_V2_COMMON nested attributes */ +static int smc_nl_fill_lgr_v2_common(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb, + struct nlattr *v2_attrs) +{ + char smc_host[SMC_MAX_HOSTNAME_LEN + 1]; + char smc_eid[SMC_MAX_EID_LEN + 1]; + + if (nla_put_u8(skb, SMC_NLA_LGR_V2_VER, lgr->smc_version)) + goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_V2_REL, lgr->peer_smc_release)) + goto errv2attr; + if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os)) + goto errv2attr; + memcpy(smc_host, lgr->peer_hostname, SMC_MAX_HOSTNAME_LEN); + smc_host[SMC_MAX_HOSTNAME_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host)) + goto errv2attr; + memcpy(smc_eid, lgr->negotiated_eid, SMC_MAX_EID_LEN); + smc_eid[SMC_MAX_EID_LEN] = 0; + if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid)) + goto errv2attr; + + nla_nest_end(skb, v2_attrs); + return 0; + +errv2attr: + nla_nest_cancel(skb, v2_attrs); + return -EMSGSIZE; +} + +static int smc_nl_fill_smcr_lgr_v2(struct smc_link_group *lgr, + struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nlattr *v2_attrs; + + v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2); + if (!v2_attrs) + goto errattr; + if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_DIRECT, !lgr->uses_gateway)) + goto errv2attr; + + nla_nest_end(skb, v2_attrs); + return 0; + +errv2attr: + nla_nest_cancel(skb, v2_attrs); +errattr: + return -EMSGSIZE; +} + static int smc_nl_fill_lgr(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb) { char smc_target[SMC_MAX_PNETID_LEN + 1]; - struct nlattr *attrs; + struct nlattr *attrs, *v2_attrs; attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCR); if (!attrs) @@ -302,6 +352,15 @@ static int smc_nl_fill_lgr(struct smc_link_group *lgr, smc_target[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target)) goto errattr; + if (lgr->smc_version > SMC_V1) { + v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2_COMMON); + if (!v2_attrs) + goto errattr; + if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs)) + goto errattr; + if (smc_nl_fill_smcr_lgr_v2(lgr, skb, cb)) + goto errattr; + } nla_nest_end(skb, attrs); return 0; @@ -434,10 +493,7 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, struct sk_buff *skb, struct netlink_callback *cb) { - char smc_host[SMC_MAX_HOSTNAME_LEN + 1]; char smc_pnet[SMC_MAX_PNETID_LEN + 1]; - char smc_eid[SMC_MAX_EID_LEN + 1]; - struct nlattr *v2_attrs; struct nlattr *attrs; void *nlh; @@ -469,32 +525,19 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, smc_pnet[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet)) goto errattr; + if (lgr->smc_version > SMC_V1) { + struct nlattr *v2_attrs; - v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_V2); - if (!v2_attrs) - goto errattr; - if (nla_put_u8(skb, SMC_NLA_LGR_V2_VER, lgr->smc_version)) - goto errv2attr; - if (nla_put_u8(skb, SMC_NLA_LGR_V2_REL, lgr->peer_smc_release)) - goto errv2attr; - if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os)) - goto errv2attr; - memcpy(smc_host, lgr->peer_hostname, SMC_MAX_HOSTNAME_LEN); - smc_host[SMC_MAX_HOSTNAME_LEN] = 0; - if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host)) - goto errv2attr; - memcpy(smc_eid, lgr->negotiated_eid, SMC_MAX_EID_LEN); - smc_eid[SMC_MAX_EID_LEN] = 0; - if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid)) - goto errv2attr; - - nla_nest_end(skb, v2_attrs); + v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_D_V2_COMMON); + if (!v2_attrs) + goto errattr; + if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs)) + goto errattr; + } nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); return 0; -errv2attr: - nla_nest_cancel(skb, v2_attrs); errattr: nla_nest_cancel(skb, attrs); errout: @@ -690,24 +733,30 @@ static void smcr_copy_dev_info_to_link(struct smc_link *link) int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini) { + struct smc_ib_device *smcibdev; u8 rndvec[3]; int rc; - get_device(&ini->ib_dev->ibdev->dev); - atomic_inc(&ini->ib_dev->lnk_cnt); + if (lgr->smc_version == SMC_V2) { + lnk->smcibdev = ini->smcrv2.ib_dev_v2; + lnk->ibport = ini->smcrv2.ib_port_v2; + } else { + lnk->smcibdev = ini->ib_dev; + lnk->ibport = ini->ib_port; + } + get_device(&lnk->smcibdev->ibdev->dev); + atomic_inc(&lnk->smcibdev->lnk_cnt); + lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; lnk->link_idx = link_idx; - lnk->smcibdev = ini->ib_dev; - lnk->ibport = ini->ib_port; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); - lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; atomic_set(&lnk->conn_cnt, 0); smc_llc_link_set_uid(lnk); INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); - if (!ini->ib_dev->initialized) { - rc = (int)smc_ib_setup_per_ibdev(ini->ib_dev); + if (!lnk->smcibdev->initialized) { + rc = (int)smc_ib_setup_per_ibdev(lnk->smcibdev); if (rc) goto out; } @@ -715,7 +764,9 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, - ini->vlan_id, lnk->gid, &lnk->sgid_index); + ini->vlan_id, lnk->gid, &lnk->sgid_index, + lgr->smc_version == SMC_V2 ? + &ini->smcrv2 : NULL); if (rc) goto out; rc = smc_llc_link_init(lnk); @@ -746,11 +797,12 @@ clear_llc_lnk: smc_llc_link_clear(lnk, false); out: smc_ibdev_cnt_dec(lnk); - put_device(&ini->ib_dev->ibdev->dev); + put_device(&lnk->smcibdev->ibdev->dev); + smcibdev = lnk->smcibdev; memset(lnk, 0, sizeof(struct smc_link)); lnk->state = SMC_LNK_UNUSED; - if (!atomic_dec_return(&ini->ib_dev->lnk_cnt)) - wake_up(&ini->ib_dev->lnks_deleted); + if (!atomic_dec_return(&smcibdev->lnk_cnt)) + wake_up(&smcibdev->lnks_deleted); return rc; } @@ -814,18 +866,37 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) atomic_inc(&ini->ism_dev[ini->ism_selected]->lgr_cnt); } else { /* SMC-R specific settings */ + struct smc_ib_device *ibdev; + int ibport; + lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; - memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, + lgr->smc_version = ini->smcr_version; + memcpy(lgr->peer_systemid, ini->peer_systemid, SMC_SYSTEMID_LEN); - memcpy(lgr->pnet_id, ini->ib_dev->pnetid[ini->ib_port - 1], + if (lgr->smc_version == SMC_V2) { + ibdev = ini->smcrv2.ib_dev_v2; + ibport = ini->smcrv2.ib_port_v2; + lgr->saddr = ini->smcrv2.saddr; + lgr->uses_gateway = ini->smcrv2.uses_gateway; + memcpy(lgr->nexthop_mac, ini->smcrv2.nexthop_mac, + ETH_ALEN); + } else { + ibdev = ini->ib_dev; + ibport = ini->ib_port; + } + memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1], SMC_MAX_PNETID_LEN); + if (smc_wr_alloc_lgr_mem(lgr)) + goto free_wq; smc_llc_lgr_init(lgr, smc); link_idx = SMC_SINGLE_LINK; lnk = &lgr->lnk[link_idx]; rc = smcr_link_init(lgr, lnk, link_idx, ini); - if (rc) + if (rc) { + smc_wr_free_lgr_mem(lgr); goto free_wq; + } lgr_list = &smc_lgr_list.list; lgr_lock = &smc_lgr_list.lock; atomic_inc(&lgr_cnt); @@ -949,7 +1020,7 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, to_lnk = &lgr->lnk[i]; break; } - if (!to_lnk) { + if (!to_lnk || !smc_wr_tx_link_hold(to_lnk)) { smc_lgr_terminate_sched(lgr); return NULL; } @@ -981,24 +1052,26 @@ again: read_unlock_bh(&lgr->conns_lock); /* pre-fetch buffer outside of send_lock, might sleep */ rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); - if (rc) { - smcr_link_down_cond_sched(to_lnk); - return NULL; - } + if (rc) + goto err_out; /* avoid race with smcr_tx_sndbuf_nonempty() */ spin_lock_bh(&conn->send_lock); smc_switch_link_and_count(conn, to_lnk); rc = smc_switch_cursor(smc, pend, wr_buf); spin_unlock_bh(&conn->send_lock); sock_put(&smc->sk); - if (rc) { - smcr_link_down_cond_sched(to_lnk); - return NULL; - } + if (rc) + goto err_out; goto again; } read_unlock_bh(&lgr->conns_lock); + smc_wr_tx_link_put(to_lnk); return to_lnk; + +err_out: + smcr_link_down_cond_sched(to_lnk); + smc_wr_tx_link_put(to_lnk); + return NULL; } static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, @@ -1230,6 +1303,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { + smc_wr_free_lgr_mem(lgr); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } @@ -1474,7 +1548,9 @@ static void smc_conn_abort_work(struct work_struct *work) abort_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + lock_sock(&smc->sk); smc_conn_kill(conn, true); + release_sock(&smc->sk); sock_put(&smc->sk); /* sock_hold done by schedulers of abort_work */ } @@ -1545,15 +1621,19 @@ static void smcr_link_down(struct smc_link *lnk) /* must be called under lgr->llc_conf_mutex lock */ void smcr_link_down_cond(struct smc_link *lnk) { - if (smc_link_downing(&lnk->state)) + if (smc_link_downing(&lnk->state)) { + trace_smcr_link_down(lnk, __builtin_return_address(0)); smcr_link_down(lnk); + } } /* will get the lgr->llc_conf_mutex lock */ void smcr_link_down_cond_sched(struct smc_link *lnk) { - if (smc_link_downing(&lnk->state)) + if (smc_link_downing(&lnk->state)) { + trace_smcr_link_down(lnk, __builtin_return_address(0)); schedule_work(&lnk->link_down_wrk); + } } void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport) @@ -1638,13 +1718,15 @@ out: return rc; } -static bool smcr_lgr_match(struct smc_link_group *lgr, - struct smc_clc_msg_local *lcl, +static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version, + u8 peer_systemid[], + u8 peer_gid[], + u8 peer_mac_v1[], enum smc_lgr_role role, u32 clcqpn) { int i; - if (memcmp(lgr->peer_systemid, lcl->id_for_peer, SMC_SYSTEMID_LEN) || + if (memcmp(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN) || lgr->role != role) return false; @@ -1652,8 +1734,9 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, if (!smc_link_active(&lgr->lnk[i])) continue; if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) && - !memcmp(lgr->lnk[i].peer_gid, &lcl->gid, SMC_GID_SIZE) && - !memcmp(lgr->lnk[i].peer_mac, lcl->mac, sizeof(lcl->mac))) + !memcmp(lgr->lnk[i].peer_gid, peer_gid, SMC_GID_SIZE) && + (smcr_version == SMC_V2 || + !memcmp(lgr->lnk[i].peer_mac, peer_mac_v1, ETH_ALEN))) return true; } return false; @@ -1692,7 +1775,10 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) if ((ini->is_smcd ? smcd_lgr_match(lgr, ini->ism_dev[ini->ism_selected], ini->ism_peer_gid[ini->ism_selected]) : - smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) && + smcr_lgr_match(lgr, ini->smcr_version, + ini->peer_systemid, + ini->peer_gid, ini->peer_mac, role, + ini->ib_clcqpn)) && !lgr->sync_err && (ini->smcd_version == SMC_V2 || lgr->vlan_id == ini->vlan_id) && @@ -1752,21 +1838,30 @@ out: return rc; } -/* convert the RMB size into the compressed notation - minimum 16K. +#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ +#define SMCR_RMBE_SIZES 5 /* 0 -> 16KB, 1 -> 32KB, .. 5 -> 512KB */ + +/* convert the RMB size into the compressed notation (minimum 16K, see + * SMCD/R_DMBE_SIZES. * In contrast to plain ilog2, this rounds towards the next power of 2, * so the socket application gets at least its desired sndbuf / rcvbuf size. */ -static u8 smc_compress_bufsize(int size) +static u8 smc_compress_bufsize(int size, bool is_smcd, bool is_rmb) { + const unsigned int max_scat = SG_MAX_SINGLE_ALLOC * PAGE_SIZE; u8 compressed; if (size <= SMC_BUF_MIN_SIZE) return 0; - size = (size - 1) >> 14; - compressed = ilog2(size) + 1; - if (compressed >= SMC_RMBE_SIZES) - compressed = SMC_RMBE_SIZES - 1; + size = (size - 1) >> 14; /* convert to 16K multiple */ + compressed = min_t(u8, ilog2(size) + 1, + is_smcd ? SMCD_DMBE_SIZES : SMCR_RMBE_SIZES); + + if (!is_smcd && is_rmb) + /* RMBs are backed by & limited to max size of scatterlists */ + compressed = min_t(u8, compressed, ilog2(max_scat >> 14)); + return compressed; } @@ -1982,17 +2077,12 @@ out: return rc; } -#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ - static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, bool is_dmb, int bufsize) { struct smc_buf_desc *buf_desc; int rc; - if (smc_compress_bufsize(bufsize) > SMCD_DMBE_SIZES) - return ERR_PTR(-EAGAIN); - /* try to alloc a new DMB */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); if (!buf_desc) @@ -2041,9 +2131,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) /* use socket send buffer size (w/o overhead) as start value */ sk_buf_size = smc->sk.sk_sndbuf / 2; - for (bufsize_short = smc_compress_bufsize(sk_buf_size); + for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb); bufsize_short >= 0; bufsize_short--) { - if (is_rmb) { lock = &lgr->rmbs_lock; buf_list = &lgr->rmbs[bufsize_short]; @@ -2052,8 +2141,6 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) buf_list = &lgr->sndbufs[bufsize_short]; } bufsize = smc_uncompress_bufsize(bufsize_short); - if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC) - continue; /* check for reusable slot in the link group */ buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index c043ecdca5c4..59cef3b830d8 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -42,11 +42,16 @@ enum smc_link_state { /* possible states of a link */ }; #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ +#define SMC_WR_BUF_V2_SIZE 8192 /* size of v2 work request buffer */ struct smc_wr_buf { u8 raw[SMC_WR_BUF_SIZE]; }; +struct smc_wr_v2_buf { + u8 raw[SMC_WR_BUF_V2_SIZE]; +}; + #define SMC_WR_REG_MR_WAIT_TIME (5 * HZ)/* wait time for ib_wr_reg_mr result */ enum smc_wr_reg_state { @@ -92,7 +97,11 @@ struct smc_link { struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */ struct completion *wr_tx_compl; /* WR send CQE completion */ /* above four vectors have wr_tx_cnt elements and use the same index */ + struct ib_send_wr *wr_tx_v2_ib; /* WR send v2 meta data */ + struct ib_sge *wr_tx_v2_sge; /* WR send v2 gather meta data*/ + struct smc_wr_tx_pend *wr_tx_v2_pend; /* WR send v2 waiting for CQE */ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */ + dma_addr_t wr_tx_v2_dma_addr; /* DMA address of v2 tx buf*/ atomic_long_t wr_tx_id; /* seq # of last sent WR */ unsigned long *wr_tx_mask; /* bit mask of used indexes */ u32 wr_tx_cnt; /* number of WR send buffers */ @@ -104,6 +113,7 @@ struct smc_link { struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */ /* above three vectors have wr_rx_cnt elements and use the same index */ dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ + dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ u64 wr_rx_id; /* seq # of last recv WR */ u32 wr_rx_cnt; /* number of WR recv buffers */ unsigned long wr_rx_tstamp; /* jiffies when last buf rx */ @@ -208,6 +218,7 @@ enum smc_llc_flowtype { SMC_LLC_FLOW_NONE = 0, SMC_LLC_FLOW_ADD_LINK = 2, SMC_LLC_FLOW_DEL_LINK = 4, + SMC_LLC_FLOW_REQ_ADD_LINK = 5, SMC_LLC_FLOW_RKEY = 6, }; @@ -250,6 +261,10 @@ struct smc_link_group { /* client or server */ struct smc_link lnk[SMC_LINKS_PER_LGR_MAX]; /* smc link */ + struct smc_wr_v2_buf *wr_rx_buf_v2; + /* WR v2 recv payload buffer */ + struct smc_wr_v2_buf *wr_tx_buf_v2; + /* WR v2 send payload buffer */ char peer_systemid[SMC_SYSTEMID_LEN]; /* unique system_id of peer */ struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX] @@ -288,6 +303,9 @@ struct smc_link_group { /* link keep alive time */ u32 llc_termination_rsn; /* rsn code for termination */ + u8 nexthop_mac[ETH_ALEN]; + u8 uses_gateway; + __be32 saddr; }; struct { /* SMC-D */ u64 peer_gid; @@ -302,6 +320,31 @@ struct smc_link_group { struct smc_clc_msg_local; +#define GID_LIST_SIZE 2 + +struct smc_gidlist { + u8 len; + u8 list[GID_LIST_SIZE][SMC_GID_SIZE]; +}; + +struct smc_init_info_smcrv2 { + /* Input fields */ + __be32 saddr; + struct sock *clc_sk; + __be32 daddr; + + /* Output fields when saddr is set */ + struct smc_ib_device *ib_dev_v2; + u8 ib_port_v2; + u8 ib_gid_v2[SMC_GID_SIZE]; + + /* Additional output fields when clc_sk and daddr is set as well */ + u8 uses_gateway; + u8 nexthop_mac[ETH_ALEN]; + + struct smc_gidlist gidlist; +}; + struct smc_init_info { u8 is_smcd; u8 smc_type_v1; @@ -310,12 +353,18 @@ struct smc_init_info { u8 first_contact_local; unsigned short vlan_id; u32 rc; + u8 negotiated_eid[SMC_MAX_EID_LEN]; /* SMC-R */ - struct smc_clc_msg_local *ib_lcl; + u8 smcr_version; + u8 check_smcrv2; + u8 peer_gid[SMC_GID_SIZE]; + u8 peer_mac[ETH_ALEN]; + u8 peer_systemid[SMC_SYSTEMID_LEN]; struct smc_ib_device *ib_dev; u8 ib_gid[SMC_GID_SIZE]; u8 ib_port; u32 ib_clcqpn; + struct smc_init_info_smcrv2 smcrv2; /* SMC-D */ u64 ism_peer_gid[SMC_MAX_ISM_DEVS + 1]; struct smcd_dev *ism_dev[SMC_MAX_ISM_DEVS + 1]; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 7d7ba0320d5a..d93055ec17ae 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -17,6 +17,7 @@ #include <linux/scatterlist.h> #include <linux/wait.h> #include <linux/mutex.h> +#include <linux/inetdevice.h> #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> @@ -62,16 +63,23 @@ static int smc_ib_modify_qp_rtr(struct smc_link *lnk) IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER; struct ib_qp_attr qp_attr; + u8 hop_lim = 1; memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.qp_state = IB_QPS_RTR; qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu); qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport); - rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, 1, 0); + if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway) + hop_lim = IPV6_DEFAULT_HOPLIMIT; + rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, hop_lim, 0); rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid); - memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac, - sizeof(lnk->peer_mac)); + if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway) + memcpy(&qp_attr.ah_attr.roce.dmac, lnk->lgr->nexthop_mac, + sizeof(lnk->lgr->nexthop_mac)); + else + memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac, + sizeof(lnk->peer_mac)); qp_attr.dest_qp_num = lnk->peer_qpn; qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */ qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming @@ -183,9 +191,81 @@ bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport) return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE; } +int smc_ib_find_route(__be32 saddr, __be32 daddr, + u8 nexthop_mac[], u8 *uses_gateway) +{ + struct neighbour *neigh = NULL; + struct rtable *rt = NULL; + struct flowi4 fl4 = { + .saddr = saddr, + .daddr = daddr + }; + + if (daddr == cpu_to_be32(INADDR_NONE)) + goto out; + rt = ip_route_output_flow(&init_net, &fl4, NULL); + if (IS_ERR(rt)) + goto out; + if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET) + goto out; + neigh = rt->dst.ops->neigh_lookup(&rt->dst, NULL, &fl4.daddr); + if (neigh) { + memcpy(nexthop_mac, neigh->ha, ETH_ALEN); + *uses_gateway = rt->rt_uses_gateway; + return 0; + } +out: + return -ENOENT; +} + +static int smc_ib_determine_gid_rcu(const struct net_device *ndev, + const struct ib_gid_attr *attr, + u8 gid[], u8 *sgid_index, + struct smc_init_info_smcrv2 *smcrv2) +{ + if (!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) { + if (gid) + memcpy(gid, &attr->gid, SMC_GID_SIZE); + if (sgid_index) + *sgid_index = attr->index; + return 0; + } + if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP && + smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) { + struct in_device *in_dev = __in_dev_get_rcu(ndev); + const struct in_ifaddr *ifa; + bool subnet_match = false; + + if (!in_dev) + goto out; + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (!inet_ifa_match(smcrv2->saddr, ifa)) + continue; + subnet_match = true; + break; + } + if (!subnet_match) + goto out; + if (smcrv2->daddr && smc_ib_find_route(smcrv2->saddr, + smcrv2->daddr, + smcrv2->nexthop_mac, + &smcrv2->uses_gateway)) + goto out; + + if (gid) + memcpy(gid, &attr->gid, SMC_GID_SIZE); + if (sgid_index) + *sgid_index = attr->index; + return 0; + } +out: + return -ENODEV; +} + /* determine the gid for an ib-device port and vlan id */ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, - unsigned short vlan_id, u8 gid[], u8 *sgid_index) + unsigned short vlan_id, u8 gid[], u8 *sgid_index, + struct smc_init_info_smcrv2 *smcrv2) { const struct ib_gid_attr *attr; const struct net_device *ndev; @@ -201,15 +281,13 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, if (!IS_ERR(ndev) && ((!vlan_id && !is_vlan_dev(ndev)) || (vlan_id && is_vlan_dev(ndev) && - vlan_dev_vlan_id(ndev) == vlan_id)) && - attr->gid_type == IB_GID_TYPE_ROCE) { - rcu_read_unlock(); - if (gid) - memcpy(gid, &attr->gid, SMC_GID_SIZE); - if (sgid_index) - *sgid_index = attr->index; - rdma_put_gid_attr(attr); - return 0; + vlan_dev_vlan_id(ndev) == vlan_id))) { + if (!smc_ib_determine_gid_rcu(ndev, attr, gid, + sgid_index, smcrv2)) { + rcu_read_unlock(); + rdma_put_gid_attr(attr); + return 0; + } } rcu_read_unlock(); rdma_put_gid_attr(attr); @@ -217,6 +295,58 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, return -ENODEV; } +/* check if gid is still defined on smcibdev */ +static bool smc_ib_check_link_gid(u8 gid[SMC_GID_SIZE], bool smcrv2, + struct smc_ib_device *smcibdev, u8 ibport) +{ + const struct ib_gid_attr *attr; + bool rc = false; + int i; + + for (i = 0; !rc && i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) { + attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i); + if (IS_ERR(attr)) + continue; + + rcu_read_lock(); + if ((!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) || + (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP && + !(ipv6_addr_type((const struct in6_addr *)&attr->gid) + & IPV6_ADDR_LINKLOCAL))) + if (!memcmp(gid, &attr->gid, SMC_GID_SIZE)) + rc = true; + rcu_read_unlock(); + rdma_put_gid_attr(attr); + } + return rc; +} + +/* check all links if the gid is still defined on smcibdev */ +static void smc_ib_gid_check(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr; + int i; + + spin_lock_bh(&smc_lgr_list.lock); + list_for_each_entry(lgr, &smc_lgr_list.list, list) { + if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, + SMC_MAX_PNETID_LEN)) + continue; /* lgr is not affected */ + if (list_empty(&lgr->list)) + continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state == SMC_LNK_UNUSED || + lgr->lnk[i].smcibdev != smcibdev) + continue; + if (!smc_ib_check_link_gid(lgr->lnk[i].gid, + lgr->smc_version == SMC_V2, + smcibdev, ibport)) + smcr_port_err(smcibdev, ibport); + } + } + spin_unlock_bh(&smc_lgr_list.lock); +} + static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) { int rc; @@ -255,6 +385,7 @@ static void smc_ib_port_event_work(struct work_struct *work) } else { clear_bit(port_idx, smcibdev->ports_going_away); smcr_port_add(smcibdev, port_idx + 1); + smc_ib_gid_check(smcibdev, port_idx + 1); } } } @@ -523,6 +654,7 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk) /* create a queue pair within the protection domain for a link */ int smc_ib_create_queue_pair(struct smc_link *lnk) { + int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; struct ib_qp_init_attr qp_attr = { .event_handler = smc_ib_qp_event_handler, .qp_context = lnk, @@ -536,7 +668,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .max_send_wr = SMC_WR_BUF_CNT * 3, .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, - .max_recv_sge = 1, + .max_recv_sge = sges_per_buf, }, .sq_sig_type = IB_SIGNAL_REQ_WR, .qp_type = IB_QPT_RC, @@ -753,8 +885,7 @@ void smc_ib_ndev_change(struct net_device *ndev, unsigned long event) if (!libdev->ops.get_netdev) continue; lndev = libdev->ops.get_netdev(libdev, i + 1); - if (lndev) - dev_put(lndev); + dev_put(lndev); if (lndev != ndev) continue; if (event == NETDEV_REGISTER) diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 3085f5180da7..07585937370e 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -59,6 +59,17 @@ struct smc_ib_device { /* ib-device infos for smc */ int ndev_ifidx[SMC_MAX_PORTS]; /* ndev if indexes */ }; +static inline __be32 smc_ib_gid_to_ipv4(u8 gid[SMC_GID_SIZE]) +{ + struct in6_addr *addr6 = (struct in6_addr *)gid; + + if (ipv6_addr_v4mapped(addr6) || + !(addr6->s6_addr32[0] | addr6->s6_addr32[1] | addr6->s6_addr32[2])) + return addr6->s6_addr32[3]; + return cpu_to_be32(INADDR_NONE); +} + +struct smc_init_info_smcrv2; struct smc_buf_desc; struct smc_link; @@ -90,7 +101,10 @@ void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, - unsigned short vlan_id, u8 gid[], u8 *sgid_index); + unsigned short vlan_id, u8 gid[], u8 *sgid_index, + struct smc_init_info_smcrv2 *smcrv2); +int smc_ib_find_route(__be32 saddr, __be32 daddr, + u8 nexthop_mac[], u8 *uses_gateway); bool smc_ib_is_valid_local_systemid(void); int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb); #endif diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 9cb2df289963..fd28cc498b98 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -23,6 +23,7 @@ struct smcd_dev_list smcd_dev_list = { }; static bool smc_ism_v2_capable; +static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; /* Test if an ISM communication is possible - same CPC */ int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) @@ -42,9 +43,12 @@ int smc_ism_write(struct smcd_dev *smcd, const struct smc_ism_position *pos, return rc < 0 ? rc : 0; } -void smc_ism_get_system_eid(struct smcd_dev *smcd, u8 **eid) +void smc_ism_get_system_eid(u8 **eid) { - smcd->ops->get_system_eid(smcd, eid); + if (!smc_ism_v2_capable) + *eid = NULL; + else + *eid = smc_ism_v2_system_eid; } u16 smc_ism_get_chid(struct smcd_dev *smcd) @@ -435,9 +439,12 @@ int smcd_register_dev(struct smcd_dev *smcd) if (list_empty(&smcd_dev_list.list)) { u8 *system_eid = NULL; - smc_ism_get_system_eid(smcd, &system_eid); - if (system_eid[24] != '0' || system_eid[28] != '0') + smcd->ops->get_system_eid(smcd, &system_eid); + if (system_eid[24] != '0' || system_eid[28] != '0') { smc_ism_v2_capable = true; + memcpy(smc_ism_v2_system_eid, system_eid, + SMC_MAX_EID_LEN); + } } /* sort list: devices without pnetid before devices with pnetid */ if (smcd->pnetid[0]) @@ -533,4 +540,5 @@ EXPORT_SYMBOL_GPL(smcd_handle_irq); void __init smc_ism_init(void) { smc_ism_v2_capable = false; + memset(smc_ism_v2_system_eid, 0, SMC_MAX_EID_LEN); } diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 113efc7352ed..004b22a13ffa 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -48,7 +48,7 @@ int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); int smc_ism_write(struct smcd_dev *dev, const struct smc_ism_position *pos, void *data, size_t len); int smc_ism_signal_shutdown(struct smc_link_group *lgr); -void smc_ism_get_system_eid(struct smcd_dev *dev, u8 **eid); +void smc_ism_get_system_eid(u8 **eid); u16 smc_ism_get_chid(struct smcd_dev *dev); bool smc_ism_is_v2_capable(void); void smc_ism_init(void); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 2e7560eba981..b102680296b8 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -23,16 +23,24 @@ struct smc_llc_hdr { struct smc_wr_rx_hdr common; - u8 length; /* 44 */ -#if defined(__BIG_ENDIAN_BITFIELD) - u8 reserved:4, - add_link_rej_rsn:4; + union { + struct { + u8 length; /* 44 */ + #if defined(__BIG_ENDIAN_BITFIELD) + u8 reserved:4, + add_link_rej_rsn:4; #elif defined(__LITTLE_ENDIAN_BITFIELD) - u8 add_link_rej_rsn:4, - reserved:4; + u8 add_link_rej_rsn:4, + reserved:4; #endif + }; + u16 length_v2; /* 44 - 8192*/ + }; u8 flags; -}; +} __packed; /* format defined in + * IBM Shared Memory Communications Version 2 + * (https://www.ibm.com/support/pages/node/6326337) + */ #define SMC_LLC_FLAG_NO_RMBE_EYEC 0x03 @@ -76,6 +84,32 @@ struct smc_llc_msg_add_link_cont_rt { __be64 rmb_vaddr_new; }; +struct smc_llc_msg_add_link_v2_ext { +#if defined(__BIG_ENDIAN_BITFIELD) + u8 v2_direct : 1, + reserved : 7; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 reserved : 7, + v2_direct : 1; +#endif + u8 reserved2; + u8 client_target_gid[SMC_GID_SIZE]; + u8 reserved3[8]; + u16 num_rkeys; + struct smc_llc_msg_add_link_cont_rt rt[]; +} __packed; /* format defined in + * IBM Shared Memory Communications Version 2 + * (https://www.ibm.com/support/pages/node/6326337) + */ + +struct smc_llc_msg_req_add_link_v2 { + struct smc_llc_hdr hd; + u8 reserved[20]; + u8 gid_cnt; + u8 reserved2[3]; + u8 gid[][SMC_GID_SIZE]; +}; + #define SMC_LLC_RKEYS_PER_CONT_MSG 2 struct smc_llc_msg_add_link_cont { /* type 0x03 */ @@ -114,7 +148,8 @@ struct smc_rmb_rtoken { __be64 rmb_vaddr; } __packed; /* format defined in RFC7609 */ -#define SMC_LLC_RKEYS_PER_MSG 3 +#define SMC_LLC_RKEYS_PER_MSG 3 +#define SMC_LLC_RKEYS_PER_MSG_V2 255 struct smc_llc_msg_confirm_rkey { /* type 0x06 */ struct smc_llc_hdr hd; @@ -135,9 +170,18 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */ u8 reserved2[4]; }; +struct smc_llc_msg_delete_rkey_v2 { /* type 0x29 */ + struct smc_llc_hdr hd; + u8 num_rkeys; + u8 num_inval_rkeys; + u8 reserved[2]; + __be32 rkey[]; +}; + union smc_llc_msg { struct smc_llc_msg_confirm_link confirm_link; struct smc_llc_msg_add_link add_link; + struct smc_llc_msg_req_add_link_v2 req_add_link; struct smc_llc_msg_add_link_cont add_link_cont; struct smc_llc_msg_del_link delete_link; @@ -189,7 +233,7 @@ static inline void smc_llc_flow_qentry_set(struct smc_llc_flow *flow, static void smc_llc_flow_parallel(struct smc_link_group *lgr, u8 flow_type, struct smc_llc_qentry *qentry) { - u8 msg_type = qentry->msg.raw.hdr.common.type; + u8 msg_type = qentry->msg.raw.hdr.common.llc_type; if ((msg_type == SMC_LLC_ADD_LINK || msg_type == SMC_LLC_DELETE_LINK) && flow_type != msg_type && !lgr->delayed_event) { @@ -219,7 +263,7 @@ static bool smc_llc_flow_start(struct smc_llc_flow *flow, spin_unlock_bh(&lgr->llc_flow_lock); return false; } - switch (qentry->msg.raw.hdr.common.type) { + switch (qentry->msg.raw.hdr.common.llc_type) { case SMC_LLC_ADD_LINK: flow->type = SMC_LLC_FLOW_ADD_LINK; break; @@ -306,7 +350,7 @@ struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, smc_llc_flow_qentry_del(flow); goto out; } - rcv_msg = flow->qentry->msg.raw.hdr.common.type; + rcv_msg = flow->qentry->msg.raw.hdr.common.llc_type; if (exp_msg && rcv_msg != exp_msg) { if (exp_msg == SMC_LLC_ADD_LINK && rcv_msg == SMC_LLC_DELETE_LINK) { @@ -374,6 +418,30 @@ static int smc_llc_add_pending_send(struct smc_link *link, return 0; } +static int smc_llc_add_pending_send_v2(struct smc_link *link, + struct smc_wr_v2_buf **wr_buf, + struct smc_wr_tx_pend_priv **pend) +{ + int rc; + + rc = smc_wr_tx_get_v2_slot(link, smc_llc_tx_handler, wr_buf, pend); + if (rc < 0) + return rc; + return 0; +} + +static void smc_llc_init_msg_hdr(struct smc_llc_hdr *hdr, + struct smc_link_group *lgr, size_t len) +{ + if (lgr->smc_version == SMC_V2) { + hdr->common.llc_version = SMC_V2; + hdr->length_v2 = len; + } else { + hdr->common.llc_version = 0; + hdr->length = len; + } +} + /* high-level API to send LLC confirm link */ int smc_llc_send_confirm_link(struct smc_link *link, enum smc_llc_reqresp reqresp) @@ -383,13 +451,15 @@ int smc_llc_send_confirm_link(struct smc_link *link, struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; confllc = (struct smc_llc_msg_confirm_link *)wr_buf; memset(confllc, 0, sizeof(*confllc)); - confllc->hd.common.type = SMC_LLC_CONFIRM_LINK; - confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link); + confllc->hd.common.llc_type = SMC_LLC_CONFIRM_LINK; + smc_llc_init_msg_hdr(&confllc->hd, link->lgr, sizeof(*confllc)); confllc->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC; if (reqresp == SMC_LLC_RESP) confllc->hd.flags |= SMC_LLC_FLAG_RESP; @@ -402,6 +472,8 @@ int smc_llc_send_confirm_link(struct smc_link *link, confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -415,13 +487,15 @@ static int smc_llc_send_confirm_rkey(struct smc_link *send_link, struct smc_link *link; int i, rc, rtok_ix; + if (!smc_wr_tx_link_hold(send_link)) + return -ENOLINK; rc = smc_llc_add_pending_send(send_link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf; memset(rkeyllc, 0, sizeof(*rkeyllc)); - rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY; - rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey); + rkeyllc->hd.common.llc_type = SMC_LLC_CONFIRM_RKEY; + smc_llc_init_msg_hdr(&rkeyllc->hd, send_link->lgr, sizeof(*rkeyllc)); rtok_ix = 1; for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { @@ -444,6 +518,8 @@ static int smc_llc_send_confirm_rkey(struct smc_link *send_link, (u64)sg_dma_address(rmb_desc->sgt[send_link->link_idx].sgl)); /* send llc message */ rc = smc_wr_tx_send(send_link, pend); +put_out: + smc_wr_tx_link_put(send_link); return rc; } @@ -456,38 +532,134 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; rkeyllc = (struct smc_llc_msg_delete_rkey *)wr_buf; memset(rkeyllc, 0, sizeof(*rkeyllc)); - rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY; - rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey); + rkeyllc->hd.common.llc_type = SMC_LLC_DELETE_RKEY; + smc_llc_init_msg_hdr(&rkeyllc->hd, link->lgr, sizeof(*rkeyllc)); rkeyllc->num_rkeys = 1; rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey); /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } +/* return first buffer from any of the next buf lists */ +static struct smc_buf_desc *_smc_llc_get_next_rmb(struct smc_link_group *lgr, + int *buf_lst) +{ + struct smc_buf_desc *buf_pos; + + while (*buf_lst < SMC_RMBE_SIZES) { + buf_pos = list_first_entry_or_null(&lgr->rmbs[*buf_lst], + struct smc_buf_desc, list); + if (buf_pos) + return buf_pos; + (*buf_lst)++; + } + return NULL; +} + +/* return next rmb from buffer lists */ +static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr, + int *buf_lst, + struct smc_buf_desc *buf_pos) +{ + struct smc_buf_desc *buf_next; + + if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { + (*buf_lst)++; + return _smc_llc_get_next_rmb(lgr, buf_lst); + } + buf_next = list_next_entry(buf_pos, list); + return buf_next; +} + +static struct smc_buf_desc *smc_llc_get_first_rmb(struct smc_link_group *lgr, + int *buf_lst) +{ + *buf_lst = 0; + return smc_llc_get_next_rmb(lgr, buf_lst, NULL); +} + +static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext, + struct smc_link *link, struct smc_link *link_new) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_buf_desc *buf_pos; + int prim_lnk_idx, lnk_idx, i; + struct smc_buf_desc *rmb; + int len = sizeof(*ext); + int buf_lst; + + ext->v2_direct = !lgr->uses_gateway; + memcpy(ext->client_target_gid, link_new->gid, SMC_GID_SIZE); + + prim_lnk_idx = link->link_idx; + lnk_idx = link_new->link_idx; + mutex_lock(&lgr->rmbs_lock); + ext->num_rkeys = lgr->conns_num; + if (!ext->num_rkeys) + goto out; + buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); + for (i = 0; i < ext->num_rkeys; i++) { + if (!buf_pos) + break; + rmb = buf_pos; + ext->rt[i].rmb_key = htonl(rmb->mr_rx[prim_lnk_idx]->rkey); + ext->rt[i].rmb_key_new = htonl(rmb->mr_rx[lnk_idx]->rkey); + ext->rt[i].rmb_vaddr_new = + cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); + buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); + while (buf_pos && !(buf_pos)->used) + buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos); + } + len += i * sizeof(ext->rt[0]); +out: + mutex_unlock(&lgr->rmbs_lock); + return len; +} + /* send ADD LINK request or response */ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], struct smc_link *link_new, enum smc_llc_reqresp reqresp) { + struct smc_llc_msg_add_link_v2_ext *ext = NULL; struct smc_llc_msg_add_link *addllc; struct smc_wr_tx_pend_priv *pend; - struct smc_wr_buf *wr_buf; + int len = sizeof(*addllc); int rc; - rc = smc_llc_add_pending_send(link, &wr_buf, &pend); - if (rc) - return rc; - addllc = (struct smc_llc_msg_add_link *)wr_buf; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; + if (link->lgr->smc_version == SMC_V2) { + struct smc_wr_v2_buf *wr_buf; + + rc = smc_llc_add_pending_send_v2(link, &wr_buf, &pend); + if (rc) + goto put_out; + addllc = (struct smc_llc_msg_add_link *)wr_buf; + ext = (struct smc_llc_msg_add_link_v2_ext *) + &wr_buf->raw[sizeof(*addllc)]; + memset(ext, 0, SMC_WR_TX_SIZE); + } else { + struct smc_wr_buf *wr_buf; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + goto put_out; + addllc = (struct smc_llc_msg_add_link *)wr_buf; + } memset(addllc, 0, sizeof(*addllc)); - addllc->hd.common.type = SMC_LLC_ADD_LINK; - addllc->hd.length = sizeof(struct smc_llc_msg_add_link); + addllc->hd.common.llc_type = SMC_LLC_ADD_LINK; if (reqresp == SMC_LLC_RESP) addllc->hd.flags |= SMC_LLC_FLAG_RESP; memcpy(addllc->sender_mac, mac, ETH_ALEN); @@ -502,8 +674,16 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], addllc->qp_mtu = min(link_new->path_mtu, link_new->peer_mtu); } + if (ext && link_new) + len += smc_llc_fill_ext_v2(ext, link, link_new); + smc_llc_init_msg_hdr(&addllc->hd, link->lgr, len); /* send llc message */ - rc = smc_wr_tx_send(link, pend); + if (link->lgr->smc_version == SMC_V2) + rc = smc_wr_tx_v2_send(link, pend, len); + else + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -517,14 +697,16 @@ int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; delllc = (struct smc_llc_msg_del_link *)wr_buf; memset(delllc, 0, sizeof(*delllc)); - delllc->hd.common.type = SMC_LLC_DELETE_LINK; - delllc->hd.length = sizeof(struct smc_llc_msg_del_link); + delllc->hd.common.llc_type = SMC_LLC_DELETE_LINK; + smc_llc_init_msg_hdr(&delllc->hd, link->lgr, sizeof(*delllc)); if (reqresp == SMC_LLC_RESP) delllc->hd.flags |= SMC_LLC_FLAG_RESP; if (orderly) @@ -536,6 +718,8 @@ int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, delllc->reason = htonl(reason); /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -547,16 +731,20 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) struct smc_wr_buf *wr_buf; int rc; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; testllc = (struct smc_llc_msg_test_link *)wr_buf; memset(testllc, 0, sizeof(*testllc)); - testllc->hd.common.type = SMC_LLC_TEST_LINK; - testllc->hd.length = sizeof(struct smc_llc_msg_test_link); + testllc->hd.common.llc_type = SMC_LLC_TEST_LINK; + smc_llc_init_msg_hdr(&testllc->hd, link->lgr, sizeof(*testllc)); memcpy(testllc->user_data, user_data, sizeof(testllc->user_data)); /* send llc message */ rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); return rc; } @@ -567,13 +755,16 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf) struct smc_wr_buf *wr_buf; int rc; - if (!smc_link_usable(link)) + if (!smc_wr_tx_link_hold(link)) return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); - return smc_wr_tx_send(link, pend); + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); + return rc; } /* schedule an llc send on link, may wait for buffers, @@ -586,13 +777,16 @@ static int smc_llc_send_message_wait(struct smc_link *link, void *llcbuf) struct smc_wr_buf *wr_buf; int rc; - if (!smc_link_usable(link)) + if (!smc_wr_tx_link_hold(link)) return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); - return smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME); + rc = smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME); +put_out: + smc_wr_tx_link_put(link); + return rc; } /********************************* receive ***********************************/ @@ -621,44 +815,6 @@ static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, return -EMLINK; } -/* return first buffer from any of the next buf lists */ -static struct smc_buf_desc *_smc_llc_get_next_rmb(struct smc_link_group *lgr, - int *buf_lst) -{ - struct smc_buf_desc *buf_pos; - - while (*buf_lst < SMC_RMBE_SIZES) { - buf_pos = list_first_entry_or_null(&lgr->rmbs[*buf_lst], - struct smc_buf_desc, list); - if (buf_pos) - return buf_pos; - (*buf_lst)++; - } - return NULL; -} - -/* return next rmb from buffer lists */ -static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr, - int *buf_lst, - struct smc_buf_desc *buf_pos) -{ - struct smc_buf_desc *buf_next; - - if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { - (*buf_lst)++; - return _smc_llc_get_next_rmb(lgr, buf_lst); - } - buf_next = list_next_entry(buf_pos, list); - return buf_next; -} - -static struct smc_buf_desc *smc_llc_get_first_rmb(struct smc_link_group *lgr, - int *buf_lst) -{ - *buf_lst = 0; - return smc_llc_get_next_rmb(lgr, buf_lst, NULL); -} - /* send one add_link_continue msg */ static int smc_llc_add_link_cont(struct smc_link *link, struct smc_link *link_new, u8 *num_rkeys_todo, @@ -672,9 +828,11 @@ static int smc_llc_add_link_cont(struct smc_link *link, struct smc_buf_desc *rmb; u8 n; + if (!smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - return rc; + goto put_out; addc_llc = (struct smc_llc_msg_add_link_cont *)wr_buf; memset(addc_llc, 0, sizeof(*addc_llc)); @@ -702,11 +860,14 @@ static int smc_llc_add_link_cont(struct smc_link *link, while (*buf_pos && !(*buf_pos)->used) *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); } - addc_llc->hd.common.type = SMC_LLC_ADD_LINK_CONT; + addc_llc->hd.common.llc_type = SMC_LLC_ADD_LINK_CONT; addc_llc->hd.length = sizeof(struct smc_llc_msg_add_link_cont); if (lgr->role == SMC_CLNT) addc_llc->hd.flags |= SMC_LLC_FLAG_RESP; - return smc_wr_tx_send(link, pend); + rc = smc_wr_tx_send(link, pend); +put_out: + smc_wr_tx_link_put(link); + return rc; } static int smc_llc_cli_rkey_exchange(struct smc_link *link, @@ -758,6 +919,8 @@ static int smc_llc_cli_add_link_reject(struct smc_llc_qentry *qentry) qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_ADD_LNK_REJ; qentry->msg.raw.hdr.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH; + smc_llc_init_msg_hdr(&qentry->msg.raw.hdr, qentry->link->lgr, + sizeof(qentry->msg)); return smc_llc_send_message(qentry->link, &qentry->msg); } @@ -778,7 +941,7 @@ static int smc_llc_cli_conf_link(struct smc_link *link, SMC_LLC_DEL_LOST_PATH); return -ENOLINK; } - if (qentry->msg.raw.hdr.common.type != SMC_LLC_CONFIRM_LINK) { + if (qentry->msg.raw.hdr.common.llc_type != SMC_LLC_CONFIRM_LINK) { /* received DELETE_LINK instead */ qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; smc_llc_send_message(link, &qentry->msg); @@ -819,6 +982,26 @@ static int smc_llc_cli_conf_link(struct smc_link *link, return 0; } +static void smc_llc_save_add_link_rkeys(struct smc_link *link, + struct smc_link *link_new) +{ + struct smc_llc_msg_add_link_v2_ext *ext; + struct smc_link_group *lgr = link->lgr; + int max, i; + + ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)lgr->wr_rx_buf_v2 + + SMC_WR_TX_SIZE); + max = min_t(u8, ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); + mutex_lock(&lgr->rmbs_lock); + for (i = 0; i < max; i++) { + smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, + ext->rt[i].rmb_key, + ext->rt[i].rmb_vaddr_new, + ext->rt[i].rmb_key_new); + } + mutex_unlock(&lgr->rmbs_lock); +} + static void smc_llc_save_add_link_info(struct smc_link *link, struct smc_llc_msg_add_link *add_llc) { @@ -835,31 +1018,47 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) struct smc_llc_msg_add_link *llc = &qentry->msg.add_link; enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; struct smc_link_group *lgr = smc_get_lgr(link); + struct smc_init_info *ini = NULL; struct smc_link *lnk_new = NULL; - struct smc_init_info ini; int lnk_idx, rc = 0; if (!llc->qp_mtu) goto out_reject; - ini.vlan_id = lgr->vlan_id; - smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev); + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) { + rc = -ENOMEM; + goto out_reject; + } + + ini->vlan_id = lgr->vlan_id; + if (lgr->smc_version == SMC_V2) { + ini->check_smcrv2 = true; + ini->smcrv2.saddr = lgr->saddr; + ini->smcrv2.daddr = smc_ib_gid_to_ipv4(llc->sender_gid); + } + smc_pnet_find_alt_roce(lgr, ini, link->smcibdev); if (!memcmp(llc->sender_gid, link->peer_gid, SMC_GID_SIZE) && - !memcmp(llc->sender_mac, link->peer_mac, ETH_ALEN)) { - if (!ini.ib_dev) + (lgr->smc_version == SMC_V2 || + !memcmp(llc->sender_mac, link->peer_mac, ETH_ALEN))) { + if (!ini->ib_dev && !ini->smcrv2.ib_dev_v2) goto out_reject; lgr_new_t = SMC_LGR_ASYMMETRIC_PEER; } - if (!ini.ib_dev) { + if (lgr->smc_version == SMC_V2 && !ini->smcrv2.ib_dev_v2) { lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; - ini.ib_dev = link->smcibdev; - ini.ib_port = link->ibport; + ini->smcrv2.ib_dev_v2 = link->smcibdev; + ini->smcrv2.ib_port_v2 = link->ibport; + } else if (lgr->smc_version < SMC_V2 && !ini->ib_dev) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini->ib_dev = link->smcibdev; + ini->ib_port = link->ibport; } lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t); if (lnk_idx < 0) goto out_reject; lnk_new = &lgr->lnk[lnk_idx]; - rc = smcr_link_init(lgr, lnk_new, lnk_idx, &ini); + rc = smcr_link_init(lgr, lnk_new, lnk_idx, ini); if (rc) goto out_reject; smc_llc_save_add_link_info(lnk_new, llc); @@ -875,16 +1074,20 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) goto out_clear_lnk; rc = smc_llc_send_add_link(link, - lnk_new->smcibdev->mac[ini.ib_port - 1], + lnk_new->smcibdev->mac[lnk_new->ibport - 1], lnk_new->gid, lnk_new, SMC_LLC_RESP); if (rc) goto out_clear_lnk; - rc = smc_llc_cli_rkey_exchange(link, lnk_new); - if (rc) { - rc = 0; - goto out_clear_lnk; + if (lgr->smc_version == SMC_V2) { + smc_llc_save_add_link_rkeys(link, lnk_new); + } else { + rc = smc_llc_cli_rkey_exchange(link, lnk_new); + if (rc) { + rc = 0; + goto out_clear_lnk; + } } - rc = smc_llc_cli_conf_link(link, &ini, lnk_new, lgr_new_t); + rc = smc_llc_cli_conf_link(link, ini, lnk_new, lgr_new_t); if (!rc) goto out; out_clear_lnk: @@ -893,29 +1096,78 @@ out_clear_lnk: out_reject: smc_llc_cli_add_link_reject(qentry); out: + kfree(ini); kfree(qentry); return rc; } +static void smc_llc_send_request_add_link(struct smc_link *link) +{ + struct smc_llc_msg_req_add_link_v2 *llc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_v2_buf *wr_buf; + struct smc_gidlist gidlist; + int rc, len, i; + + if (!smc_wr_tx_link_hold(link)) + return; + if (link->lgr->type == SMC_LGR_SYMMETRIC || + link->lgr->type == SMC_LGR_ASYMMETRIC_PEER) + goto put_out; + + smc_fill_gid_list(link->lgr, &gidlist, link->smcibdev, link->gid); + if (gidlist.len <= 1) + goto put_out; + + rc = smc_llc_add_pending_send_v2(link, &wr_buf, &pend); + if (rc) + goto put_out; + llc = (struct smc_llc_msg_req_add_link_v2 *)wr_buf; + memset(llc, 0, SMC_WR_TX_SIZE); + + llc->hd.common.llc_type = SMC_LLC_REQ_ADD_LINK; + for (i = 0; i < gidlist.len; i++) + memcpy(llc->gid[i], gidlist.list[i], sizeof(gidlist.list[0])); + llc->gid_cnt = gidlist.len; + len = sizeof(*llc) + (gidlist.len * sizeof(gidlist.list[0])); + smc_llc_init_msg_hdr(&llc->hd, link->lgr, len); + rc = smc_wr_tx_v2_send(link, pend, len); + if (!rc) + /* set REQ_ADD_LINK flow and wait for response from peer */ + link->lgr->llc_flow_lcl.type = SMC_LLC_FLOW_REQ_ADD_LINK; +put_out: + smc_wr_tx_link_put(link); +} + /* as an SMC client, invite server to start the add_link processing */ static void smc_llc_cli_add_link_invite(struct smc_link *link, struct smc_llc_qentry *qentry) { struct smc_link_group *lgr = smc_get_lgr(link); - struct smc_init_info ini; + struct smc_init_info *ini = NULL; + + if (lgr->smc_version == SMC_V2) { + smc_llc_send_request_add_link(link); + goto out; + } if (lgr->type == SMC_LGR_SYMMETRIC || lgr->type == SMC_LGR_ASYMMETRIC_PEER) goto out; - ini.vlan_id = lgr->vlan_id; - smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev); - if (!ini.ib_dev) + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) goto out; - smc_llc_send_add_link(link, ini.ib_dev->mac[ini.ib_port - 1], - ini.ib_gid, NULL, SMC_LLC_REQ); + ini->vlan_id = lgr->vlan_id; + smc_pnet_find_alt_roce(lgr, ini, link->smcibdev); + if (!ini->ib_dev) + goto out; + + smc_llc_send_add_link(link, ini->ib_dev->mac[ini->ib_port - 1], + ini->ib_gid, NULL, SMC_LLC_REQ); out: + kfree(ini); kfree(qentry); } @@ -931,7 +1183,7 @@ static bool smc_llc_is_empty_llc_message(union smc_llc_msg *llc) static bool smc_llc_is_local_add_link(union smc_llc_msg *llc) { - if (llc->raw.hdr.common.type == SMC_LLC_ADD_LINK && + if (llc->raw.hdr.common.llc_type == SMC_LLC_ADD_LINK && smc_llc_is_empty_llc_message(llc)) return true; return false; @@ -1098,7 +1350,7 @@ static int smc_llc_srv_conf_link(struct smc_link *link, /* receive CONFIRM LINK response over the RoCE fabric */ qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_FIRST_TIME, 0); if (!qentry || - qentry->msg.raw.hdr.common.type != SMC_LLC_CONFIRM_LINK) { + qentry->msg.raw.hdr.common.llc_type != SMC_LLC_CONFIRM_LINK) { /* send DELETE LINK */ smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, false, SMC_LLC_DEL_LOST_PATH); @@ -1117,37 +1369,80 @@ static int smc_llc_srv_conf_link(struct smc_link *link, return 0; } -int smc_llc_srv_add_link(struct smc_link *link) +static void smc_llc_send_req_add_link_response(struct smc_llc_qentry *qentry) +{ + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; + smc_llc_init_msg_hdr(&qentry->msg.raw.hdr, qentry->link->lgr, + sizeof(qentry->msg)); + memset(&qentry->msg.raw.data, 0, sizeof(qentry->msg.raw.data)); + smc_llc_send_message(qentry->link, &qentry->msg); +} + +int smc_llc_srv_add_link(struct smc_link *link, + struct smc_llc_qentry *req_qentry) { enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; struct smc_link_group *lgr = link->lgr; struct smc_llc_msg_add_link *add_llc; struct smc_llc_qentry *qentry = NULL; - struct smc_link *link_new; - struct smc_init_info ini; + bool send_req_add_link_resp = false; + struct smc_link *link_new = NULL; + struct smc_init_info *ini = NULL; int lnk_idx, rc = 0; + if (req_qentry && + req_qentry->msg.raw.hdr.common.llc_type == SMC_LLC_REQ_ADD_LINK) + send_req_add_link_resp = true; + + ini = kzalloc(sizeof(*ini), GFP_KERNEL); + if (!ini) { + rc = -ENOMEM; + goto out; + } + /* ignore client add link recommendation, start new flow */ - ini.vlan_id = lgr->vlan_id; - smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev); - if (!ini.ib_dev) { + ini->vlan_id = lgr->vlan_id; + if (lgr->smc_version == SMC_V2) { + ini->check_smcrv2 = true; + ini->smcrv2.saddr = lgr->saddr; + if (send_req_add_link_resp) { + struct smc_llc_msg_req_add_link_v2 *req_add = + &req_qentry->msg.req_add_link; + + ini->smcrv2.daddr = smc_ib_gid_to_ipv4(req_add->gid[0]); + } + } + smc_pnet_find_alt_roce(lgr, ini, link->smcibdev); + if (lgr->smc_version == SMC_V2 && !ini->smcrv2.ib_dev_v2) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini->smcrv2.ib_dev_v2 = link->smcibdev; + ini->smcrv2.ib_port_v2 = link->ibport; + } else if (lgr->smc_version < SMC_V2 && !ini->ib_dev) { lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; - ini.ib_dev = link->smcibdev; - ini.ib_port = link->ibport; + ini->ib_dev = link->smcibdev; + ini->ib_port = link->ibport; } lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t); - if (lnk_idx < 0) - return 0; + if (lnk_idx < 0) { + rc = 0; + goto out; + } - rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, &ini); + rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, ini); if (rc) - return rc; + goto out; link_new = &lgr->lnk[lnk_idx]; + + rc = smcr_buf_map_lgr(link_new); + if (rc) + goto out_err; + rc = smc_llc_send_add_link(link, - link_new->smcibdev->mac[ini.ib_port - 1], + link_new->smcibdev->mac[link_new->ibport-1], link_new->gid, link_new, SMC_LLC_REQ); if (rc) goto out_err; + send_req_add_link_resp = false; /* receive ADD LINK response over the RoCE fabric */ qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, SMC_LLC_ADD_LINK); if (!qentry) { @@ -1162,7 +1457,8 @@ int smc_llc_srv_add_link(struct smc_link *link) } if (lgr->type == SMC_LGR_SINGLE && (!memcmp(add_llc->sender_gid, link->peer_gid, SMC_GID_SIZE) && - !memcmp(add_llc->sender_mac, link->peer_mac, ETH_ALEN))) { + (lgr->smc_version == SMC_V2 || + !memcmp(add_llc->sender_mac, link->peer_mac, ETH_ALEN)))) { lgr_new_t = SMC_LGR_ASYMMETRIC_PEER; } smc_llc_save_add_link_info(link_new, add_llc); @@ -1171,39 +1467,49 @@ int smc_llc_srv_add_link(struct smc_link *link) rc = smc_ib_ready_link(link_new); if (rc) goto out_err; - rc = smcr_buf_map_lgr(link_new); - if (rc) - goto out_err; rc = smcr_buf_reg_lgr(link_new); if (rc) goto out_err; - rc = smc_llc_srv_rkey_exchange(link, link_new); - if (rc) - goto out_err; + if (lgr->smc_version == SMC_V2) { + smc_llc_save_add_link_rkeys(link, link_new); + } else { + rc = smc_llc_srv_rkey_exchange(link, link_new); + if (rc) + goto out_err; + } rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t); if (rc) goto out_err; + kfree(ini); return 0; out_err: - link_new->state = SMC_LNK_INACTIVE; - smcr_link_clear(link_new, false); + if (link_new) { + link_new->state = SMC_LNK_INACTIVE; + smcr_link_clear(link_new, false); + } +out: + kfree(ini); + if (send_req_add_link_resp) + smc_llc_send_req_add_link_response(req_qentry); return rc; } static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) { struct smc_link *link = lgr->llc_flow_lcl.qentry->link; + struct smc_llc_qentry *qentry; int rc; - smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); mutex_lock(&lgr->llc_conf_mutex); - rc = smc_llc_srv_add_link(link); + rc = smc_llc_srv_add_link(link, qentry); if (!rc && lgr->type == SMC_LGR_SYMMETRIC) { /* delete any asymmetric link */ smc_llc_delete_asym_link(lgr); } mutex_unlock(&lgr->llc_conf_mutex); + kfree(qentry); } /* enqueue a local add_link req to trigger a new add_link flow */ @@ -1211,8 +1517,8 @@ void smc_llc_add_link_local(struct smc_link *link) { struct smc_llc_msg_add_link add_llc = {}; - add_llc.hd.length = sizeof(add_llc); - add_llc.hd.common.type = SMC_LLC_ADD_LINK; + add_llc.hd.common.llc_type = SMC_LLC_ADD_LINK; + smc_llc_init_msg_hdr(&add_llc.hd, link->lgr, sizeof(add_llc)); /* no dev and port needed */ smc_llc_enqueue(link, (union smc_llc_msg *)&add_llc); } @@ -1234,7 +1540,8 @@ static void smc_llc_add_link_work(struct work_struct *work) else smc_llc_process_srv_add_link(lgr); out: - smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_REQ_ADD_LINK) + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } /* enqueue a local del_link msg to trigger a new del_link flow, @@ -1244,8 +1551,8 @@ void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id) { struct smc_llc_msg_del_link del_llc = {}; - del_llc.hd.length = sizeof(del_llc); - del_llc.hd.common.type = SMC_LLC_DELETE_LINK; + del_llc.hd.common.llc_type = SMC_LLC_DELETE_LINK; + smc_llc_init_msg_hdr(&del_llc.hd, link->lgr, sizeof(del_llc)); del_llc.link_num = del_link_id; del_llc.reason = htonl(SMC_LLC_DEL_LOST_PATH); del_llc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; @@ -1315,8 +1622,8 @@ void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn) struct smc_llc_msg_del_link delllc = {}; int i; - delllc.hd.common.type = SMC_LLC_DELETE_LINK; - delllc.hd.length = sizeof(delllc); + delllc.hd.common.llc_type = SMC_LLC_DELETE_LINK; + smc_llc_init_msg_hdr(&delllc.hd, lgr, sizeof(delllc)); if (ord) delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; @@ -1432,6 +1739,8 @@ static void smc_llc_rmt_conf_rkey(struct smc_link_group *lgr) link = qentry->link; num_entries = llc->rtoken[0].num_rkeys; + if (num_entries > SMC_LLC_RKEYS_PER_MSG) + goto out_err; /* first rkey entry is for receiving link */ rk_idx = smc_rtoken_add(link, llc->rtoken[0].rmb_vaddr, @@ -1450,6 +1759,7 @@ out_err: llc->hd.flags |= SMC_LLC_FLAG_RKEY_RETRY; out: llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_init_msg_hdr(&llc->hd, link->lgr, sizeof(*llc)); smc_llc_send_message(link, &qentry->msg); smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); } @@ -1467,6 +1777,28 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) llc = &qentry->msg.delete_rkey; link = qentry->link; + if (lgr->smc_version == SMC_V2) { + struct smc_llc_msg_delete_rkey_v2 *llcv2; + + memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc)); + llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2; + llcv2->num_inval_rkeys = 0; + + max = min_t(u8, llcv2->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2); + for (i = 0; i < max; i++) { + if (smc_rtoken_delete(link, llcv2->rkey[i])) + llcv2->num_inval_rkeys++; + } + memset(&llc->rkey[0], 0, sizeof(llc->rkey)); + memset(&llc->reserved2, 0, sizeof(llc->reserved2)); + smc_llc_init_msg_hdr(&llc->hd, link->lgr, sizeof(*llc)); + if (llcv2->num_inval_rkeys) { + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->err_mask = llcv2->num_inval_rkeys; + } + goto finish; + } + max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); for (i = 0; i < max; i++) { if (smc_rtoken_delete(link, llc->rkey[i])) @@ -1476,6 +1808,7 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; llc->err_mask = err_mask; } +finish: llc->hd.flags |= SMC_LLC_FLAG_RESP; smc_llc_send_message(link, &qentry->msg); smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); @@ -1511,7 +1844,7 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) if (!smc_link_usable(link)) goto out; - switch (llc->raw.hdr.common.type) { + switch (llc->raw.hdr.common.llc_type) { case SMC_LLC_TEST_LINK: llc->test_link.hd.flags |= SMC_LLC_FLAG_RESP; smc_llc_send_message(link, llc); @@ -1536,8 +1869,18 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry); wake_up(&lgr->llc_msg_waiter); - } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, - qentry)) { + return; + } + if (lgr->llc_flow_lcl.type == + SMC_LLC_FLOW_REQ_ADD_LINK) { + /* server started add_link processing */ + lgr->llc_flow_lcl.type = SMC_LLC_FLOW_ADD_LINK; + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + schedule_work(&lgr->llc_add_link_work); + return; + } + if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { schedule_work(&lgr->llc_add_link_work); } } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { @@ -1585,6 +1928,23 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry) smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); } return; + case SMC_LLC_REQ_ADD_LINK: + /* handle response here, smc_llc_flow_stop() cannot be called + * in tasklet context + */ + if (lgr->role == SMC_CLNT && + lgr->llc_flow_lcl.type == SMC_LLC_FLOW_REQ_ADD_LINK && + (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP)) { + smc_llc_flow_stop(link->lgr, &lgr->llc_flow_lcl); + } else if (lgr->role == SMC_SERV) { + if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { + /* as smc server, handle client suggestion */ + lgr->llc_flow_lcl.type = SMC_LLC_FLOW_ADD_LINK; + schedule_work(&lgr->llc_add_link_work); + } + return; + } + break; default: smc_llc_protocol_violation(lgr, llc->raw.hdr.common.type); break; @@ -1628,7 +1988,7 @@ static void smc_llc_rx_response(struct smc_link *link, { enum smc_llc_flowtype flowtype = link->lgr->llc_flow_lcl.type; struct smc_llc_flow *flow = &link->lgr->llc_flow_lcl; - u8 llc_type = qentry->msg.raw.hdr.common.type; + u8 llc_type = qentry->msg.raw.hdr.common.llc_type; switch (llc_type) { case SMC_LLC_TEST_LINK: @@ -1654,7 +2014,8 @@ static void smc_llc_rx_response(struct smc_link *link, /* not used because max links is 3 */ break; default: - smc_llc_protocol_violation(link->lgr, llc_type); + smc_llc_protocol_violation(link->lgr, + qentry->msg.raw.hdr.common.type); break; } kfree(qentry); @@ -1679,7 +2040,8 @@ static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc) memcpy(&qentry->msg, llc, sizeof(union smc_llc_msg)); /* process responses immediately */ - if (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) { + if ((llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) && + llc->raw.hdr.common.llc_type != SMC_LLC_REQ_ADD_LINK) { smc_llc_rx_response(link, qentry); return; } @@ -1699,8 +2061,13 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) if (wc->byte_len < sizeof(*llc)) return; /* short message */ - if (llc->raw.hdr.length != sizeof(*llc)) - return; /* invalid message */ + if (!llc->raw.hdr.common.llc_version) { + if (llc->raw.hdr.length != sizeof(*llc)) + return; /* invalid message */ + } else { + if (llc->raw.hdr.length_v2 < sizeof(*llc)) + return; /* invalid message */ + } smc_llc_enqueue(link, llc); } @@ -1787,7 +2154,7 @@ void smc_llc_link_active(struct smc_link *link) link->smcibdev->ibdev->name, link->ibport); link->state = SMC_LNK_ACTIVE; if (link->lgr->llc_testlink_time) { - link->llc_testlink_time = link->lgr->llc_testlink_time * HZ; + link->llc_testlink_time = link->lgr->llc_testlink_time; schedule_delayed_work(&link->llc_testlink_wrk, link->llc_testlink_time); } @@ -1919,6 +2286,35 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_RKEY }, + /* V2 types */ + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_CONFIRM_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_TEST_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_ADD_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_DELETE_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_REQ_ADD_LINK_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_CONFIRM_RKEY_V2 + }, + { + .handler = smc_llc_rx_handler, + .type = SMC_LLC_DELETE_RKEY_V2 + }, { .handler = NULL, } diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index cc00a2ec4e92..4404e52b3346 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -30,10 +30,19 @@ enum smc_llc_msg_type { SMC_LLC_ADD_LINK = 0x02, SMC_LLC_ADD_LINK_CONT = 0x03, SMC_LLC_DELETE_LINK = 0x04, + SMC_LLC_REQ_ADD_LINK = 0x05, SMC_LLC_CONFIRM_RKEY = 0x06, SMC_LLC_TEST_LINK = 0x07, SMC_LLC_CONFIRM_RKEY_CONT = 0x08, SMC_LLC_DELETE_RKEY = 0x09, + /* V2 types */ + SMC_LLC_CONFIRM_LINK_V2 = 0x21, + SMC_LLC_ADD_LINK_V2 = 0x22, + SMC_LLC_DELETE_LINK_V2 = 0x24, + SMC_LLC_REQ_ADD_LINK_V2 = 0x25, + SMC_LLC_CONFIRM_RKEY_V2 = 0x26, + SMC_LLC_TEST_LINK_V2 = 0x27, + SMC_LLC_DELETE_RKEY_V2 = 0x29, }; #define smc_link_downing(state) \ @@ -102,7 +111,8 @@ void smc_llc_flow_qentry_del(struct smc_llc_flow *flow); void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn); int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry); -int smc_llc_srv_add_link(struct smc_link *link); +int smc_llc_srv_add_link(struct smc_link *link, + struct smc_llc_qentry *req_qentry); void smc_llc_add_link_local(struct smc_link *link); int smc_llc_init(void) __init; diff --git a/net/smc/smc_netlink.c b/net/smc/smc_netlink.c index 6fb6f96c1d17..f13ab0661ed5 100644 --- a/net/smc/smc_netlink.c +++ b/net/smc/smc_netlink.c @@ -19,11 +19,19 @@ #include "smc_core.h" #include "smc_ism.h" #include "smc_ib.h" +#include "smc_clc.h" #include "smc_stats.h" #include "smc_netlink.h" -#define SMC_CMD_MAX_ATTR 1 +const struct nla_policy +smc_gen_ueid_policy[SMC_NLA_EID_TABLE_MAX + 1] = { + [SMC_NLA_EID_TABLE_UNSPEC] = { .type = NLA_UNSPEC }, + [SMC_NLA_EID_TABLE_ENTRY] = { .type = NLA_STRING, + .len = SMC_MAX_EID_LEN, + }, +}; +#define SMC_CMD_MAX_ATTR 1 /* SMC_GENL generic netlink operation definition */ static const struct genl_ops smc_gen_nl_ops[] = { { @@ -66,6 +74,43 @@ static const struct genl_ops smc_gen_nl_ops[] = { /* can be retrieved by unprivileged users */ .dumpit = smc_nl_get_fback_stats, }, + { + .cmd = SMC_NETLINK_DUMP_UEID, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_dump_ueid, + }, + { + .cmd = SMC_NETLINK_ADD_UEID, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_add_ueid, + .policy = smc_gen_ueid_policy, + }, + { + .cmd = SMC_NETLINK_REMOVE_UEID, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_remove_ueid, + .policy = smc_gen_ueid_policy, + }, + { + .cmd = SMC_NETLINK_FLUSH_UEID, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_flush_ueid, + }, + { + .cmd = SMC_NETLINK_DUMP_SEID, + /* can be retrieved by unprivileged users */ + .dumpit = smc_nl_dump_seid, + }, + { + .cmd = SMC_NETLINK_ENABLE_SEID, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_enable_seid, + }, + { + .cmd = SMC_NETLINK_DISABLE_SEID, + .flags = GENL_ADMIN_PERM, + .doit = smc_nl_disable_seid, + }, }; static const struct nla_policy smc_gen_nl_policy[2] = { diff --git a/net/smc/smc_netlink.h b/net/smc/smc_netlink.h index 5ce2c0a89ccd..e8c6c3f0e98c 100644 --- a/net/smc/smc_netlink.h +++ b/net/smc/smc_netlink.h @@ -17,6 +17,8 @@ extern struct genl_family smc_gen_nl_family; +extern const struct nla_policy smc_gen_ueid_policy[]; + struct smc_nl_dmp_ctx { int pos[3]; }; diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 6f6d33edb135..67e9d9fde085 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -394,8 +394,7 @@ static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, return 0; out_put: - if (ndev) - dev_put(ndev); + dev_put(ndev); return rc; } @@ -954,6 +953,26 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, return rc; } +static int smc_pnet_determine_gid(struct smc_ib_device *ibdev, int i, + struct smc_init_info *ini) +{ + if (!ini->check_smcrv2 && + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL, + NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; + return 0; + } + if (ini->check_smcrv2 && + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->smcrv2.ib_gid_v2, + NULL, &ini->smcrv2)) { + ini->smcrv2.ib_dev_v2 = ibdev; + ini->smcrv2.ib_port_v2 = i; + return 0; + } + return -ENODEV; +} + /* find a roce device for the given pnetid */ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, struct smc_init_info *ini, @@ -962,7 +981,6 @@ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, struct smc_ib_device *ibdev; int i; - ini->ib_dev = NULL; mutex_lock(&smc_ib_devices.mutex); list_for_each_entry(ibdev, &smc_ib_devices.list, list) { if (ibdev == known_dev) @@ -972,12 +990,9 @@ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, continue; if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) && smc_ib_port_active(ibdev, i) && - !test_bit(i - 1, ibdev->ports_going_away) && - !smc_ib_determine_gid(ibdev, i, ini->vlan_id, - ini->ib_gid, NULL)) { - ini->ib_dev = ibdev; - ini->ib_port = i; - goto out; + !test_bit(i - 1, ibdev->ports_going_away)) { + if (!smc_pnet_determine_gid(ibdev, i, ini)) + goto out; } } } @@ -1017,12 +1032,9 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev, dev_put(ndev); if (netdev == ndev && smc_ib_port_active(ibdev, i) && - !test_bit(i - 1, ibdev->ports_going_away) && - !smc_ib_determine_gid(ibdev, i, ini->vlan_id, - ini->ib_gid, NULL)) { - ini->ib_dev = ibdev; - ini->ib_port = i; - break; + !test_bit(i - 1, ibdev->ports_going_away)) { + if (!smc_pnet_determine_gid(ibdev, i, ini)) + break; } } } @@ -1084,8 +1096,6 @@ void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini) { struct dst_entry *dst = sk_dst_get(sk); - ini->ib_dev = NULL; - ini->ib_port = 0; if (!dst) goto out; if (!dst->dev) diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index 170b733bc736..51e8eb2933ff 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -22,6 +22,7 @@ #include "smc_tx.h" /* smc_tx_consumer_update() */ #include "smc_rx.h" #include "smc_stats.h" +#include "smc_tracepoint.h" /* callback implementation to wakeup consumers blocked with smc_rx_wait(). * indirectly called by smc_cdc_msg_recv_action(). @@ -438,6 +439,8 @@ copy: if (msg && smc_rx_update_consumer(smc, cons, copylen)) goto out; } + + trace_smc_rx_recvmsg(smc, copylen); } while (read_remaining); out: return read_done; diff --git a/net/smc/smc_tracepoint.c b/net/smc/smc_tracepoint.c new file mode 100644 index 000000000000..8d47ced5a492 --- /dev/null +++ b/net/smc/smc_tracepoint.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define CREATE_TRACE_POINTS +#include "smc_tracepoint.h" + +EXPORT_TRACEPOINT_SYMBOL(smc_switch_to_fallback); +EXPORT_TRACEPOINT_SYMBOL(smc_tx_sendmsg); +EXPORT_TRACEPOINT_SYMBOL(smc_rx_recvmsg); +EXPORT_TRACEPOINT_SYMBOL(smcr_link_down); diff --git a/net/smc/smc_tracepoint.h b/net/smc/smc_tracepoint.h new file mode 100644 index 000000000000..b4c36795a928 --- /dev/null +++ b/net/smc/smc_tracepoint.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM smc + +#if !defined(_TRACE_SMC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SMC_H + +#include <linux/ipv6.h> +#include <linux/tcp.h> +#include <linux/tracepoint.h> +#include <net/ipv6.h> +#include "smc.h" +#include "smc_core.h" + +TRACE_EVENT(smc_switch_to_fallback, + + TP_PROTO(const struct smc_sock *smc, int fallback_rsn), + + TP_ARGS(smc, fallback_rsn), + + TP_STRUCT__entry( + __field(const void *, sk) + __field(const void *, clcsk) + __field(int, fallback_rsn) + ), + + TP_fast_assign( + const struct sock *sk = &smc->sk; + const struct sock *clcsk = smc->clcsock->sk; + + __entry->sk = sk; + __entry->clcsk = clcsk; + __entry->fallback_rsn = fallback_rsn; + ), + + TP_printk("sk=%p clcsk=%p fallback_rsn=%d", + __entry->sk, __entry->clcsk, __entry->fallback_rsn) +); + +DECLARE_EVENT_CLASS(smc_msg_event, + + TP_PROTO(const struct smc_sock *smc, size_t len), + + TP_ARGS(smc, len), + + TP_STRUCT__entry( + __field(const void *, smc) + __field(size_t, len) + __string(name, smc->conn.lnk->ibname) + ), + + TP_fast_assign( + __entry->smc = smc; + __entry->len = len; + __assign_str(name, smc->conn.lnk->ibname); + ), + + TP_printk("smc=%p len=%zu dev=%s", + __entry->smc, __entry->len, + __get_str(name)) +); + +DEFINE_EVENT(smc_msg_event, smc_tx_sendmsg, + + TP_PROTO(const struct smc_sock *smc, size_t len), + + TP_ARGS(smc, len) +); + +DEFINE_EVENT(smc_msg_event, smc_rx_recvmsg, + + TP_PROTO(const struct smc_sock *smc, size_t len), + + TP_ARGS(smc, len) +); + +TRACE_EVENT(smcr_link_down, + + TP_PROTO(const struct smc_link *lnk, void *location), + + TP_ARGS(lnk, location), + + TP_STRUCT__entry( + __field(const void *, lnk) + __field(const void *, lgr) + __field(int, state) + __string(name, lnk->ibname) + __field(void *, location) + ), + + TP_fast_assign( + const struct smc_link_group *lgr = lnk->lgr; + + __entry->lnk = lnk; + __entry->lgr = lgr; + __entry->state = lnk->state; + __assign_str(name, lnk->ibname); + __entry->location = location; + ), + + TP_printk("lnk=%p lgr=%p state=%d dev=%s location=%p", + __entry->lnk, __entry->lgr, + __entry->state, __get_str(name), + __entry->location) +); + +#endif /* _TRACE_SMC_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . + +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE smc_tracepoint + +#include <trace/define_trace.h> diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index c79361dfcdfb..be241d53020f 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -28,6 +28,7 @@ #include "smc_ism.h" #include "smc_tx.h" #include "smc_stats.h" +#include "smc_tracepoint.h" #define SMC_TX_WORK_DELAY 0 #define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */ @@ -245,6 +246,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) SMC_TX_CORK_DELAY); else smc_tx_sndbuf_nonempty(conn); + + trace_smc_tx_sendmsg(smc, copylen); } /* while (msg_data_left(msg)) */ return send_done; @@ -496,7 +499,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn, /* Wakeup sndbuf consumers from any context (IRQ or process) * since there is more data to transmit; usable snd_wnd as max transmit */ -static int _smcr_tx_sndbuf_nonempty(struct smc_connection *conn) +static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; struct smc_link *link = conn->lnk; @@ -505,8 +508,11 @@ static int _smcr_tx_sndbuf_nonempty(struct smc_connection *conn) struct smc_wr_buf *wr_buf; int rc; + if (!link || !smc_wr_tx_link_hold(link)) + return -ENOLINK; rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend); if (rc < 0) { + smc_wr_tx_link_put(link); if (rc == -EBUSY) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); @@ -547,22 +553,7 @@ static int _smcr_tx_sndbuf_nonempty(struct smc_connection *conn) out_unlock: spin_unlock_bh(&conn->send_lock); - return rc; -} - -static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) -{ - struct smc_link *link = conn->lnk; - int rc = -ENOLINK; - - if (!link) - return rc; - - atomic_inc(&link->wr_tx_refcnt); - if (smc_link_usable(link)) - rc = _smcr_tx_sndbuf_nonempty(conn); - if (atomic_dec_and_test(&link->wr_tx_refcnt)) - wake_up_all(&link->wr_tx_wait); + smc_wr_tx_link_put(link); return rc; } diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index a419e9af36b9..600ab5889227 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -101,19 +101,32 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) } pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id); - if (pnd_snd_idx == link->wr_tx_cnt) - return; - link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status; - if (link->wr_tx_pends[pnd_snd_idx].compl_requested) - complete(&link->wr_tx_compl[pnd_snd_idx]); - memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd)); - /* clear the full struct smc_wr_tx_pend including .priv */ - memset(&link->wr_tx_pends[pnd_snd_idx], 0, - sizeof(link->wr_tx_pends[pnd_snd_idx])); - memset(&link->wr_tx_bufs[pnd_snd_idx], 0, - sizeof(link->wr_tx_bufs[pnd_snd_idx])); - if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) - return; + if (pnd_snd_idx == link->wr_tx_cnt) { + if (link->lgr->smc_version != SMC_V2 || + link->wr_tx_v2_pend->wr_id != wc->wr_id) + return; + link->wr_tx_v2_pend->wc_status = wc->status; + memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd)); + /* clear the full struct smc_wr_tx_pend including .priv */ + memset(link->wr_tx_v2_pend, 0, + sizeof(*link->wr_tx_v2_pend)); + memset(link->lgr->wr_tx_buf_v2, 0, + sizeof(*link->lgr->wr_tx_buf_v2)); + } else { + link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status; + if (link->wr_tx_pends[pnd_snd_idx].compl_requested) + complete(&link->wr_tx_compl[pnd_snd_idx]); + memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], + sizeof(pnd_snd)); + /* clear the full struct smc_wr_tx_pend including .priv */ + memset(&link->wr_tx_pends[pnd_snd_idx], 0, + sizeof(link->wr_tx_pends[pnd_snd_idx])); + memset(&link->wr_tx_bufs[pnd_snd_idx], 0, + sizeof(link->wr_tx_bufs[pnd_snd_idx])); + if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask)) + return; + } + if (wc->status) { for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) { /* clear full struct smc_wr_tx_pend including .priv */ @@ -123,6 +136,12 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) sizeof(link->wr_tx_bufs[i])); clear_bit(i, link->wr_tx_mask); } + if (link->lgr->smc_version == SMC_V2) { + memset(link->wr_tx_v2_pend, 0, + sizeof(*link->wr_tx_v2_pend)); + memset(link->lgr->wr_tx_buf_v2, 0, + sizeof(*link->lgr->wr_tx_buf_v2)); + } /* terminate link */ smcr_link_down_cond_sched(link); } @@ -239,6 +258,33 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, return 0; } +int smc_wr_tx_get_v2_slot(struct smc_link *link, + smc_wr_tx_handler handler, + struct smc_wr_v2_buf **wr_buf, + struct smc_wr_tx_pend_priv **wr_pend_priv) +{ + struct smc_wr_tx_pend *wr_pend; + struct ib_send_wr *wr_ib; + u64 wr_id; + + if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt) + return -EBUSY; + + *wr_buf = NULL; + *wr_pend_priv = NULL; + wr_id = smc_wr_tx_get_next_wr_id(link); + wr_pend = link->wr_tx_v2_pend; + wr_pend->wr_id = wr_id; + wr_pend->handler = handler; + wr_pend->link = link; + wr_pend->idx = link->wr_tx_cnt; + wr_ib = link->wr_tx_v2_ib; + wr_ib->wr_id = wr_id; + *wr_buf = link->lgr->wr_tx_buf_v2; + *wr_pend_priv = &wr_pend->priv; + return 0; +} + int smc_wr_tx_put_slot(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv) { @@ -256,6 +302,14 @@ int smc_wr_tx_put_slot(struct smc_link *link, test_and_clear_bit(idx, link->wr_tx_mask); wake_up(&link->wr_tx_wait); return 1; + } else if (link->lgr->smc_version == SMC_V2 && + pend->idx == link->wr_tx_cnt) { + /* Large v2 buffer */ + memset(&link->wr_tx_v2_pend, 0, + sizeof(link->wr_tx_v2_pend)); + memset(&link->lgr->wr_tx_buf_v2, 0, + sizeof(link->lgr->wr_tx_buf_v2)); + return 1; } return 0; @@ -280,6 +334,22 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) return rc; } +int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, + int len) +{ + int rc; + + link->wr_tx_v2_ib->sg_list[0].length = len; + ib_req_notify_cq(link->smcibdev->roce_cq_send, + IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); + rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL); + if (rc) { + smc_wr_tx_put_slot(link, priv); + smcr_link_down_cond_sched(link); + } + return rc; +} + /* Send prepared WR slot via ib_post_send and wait for send completion * notification. * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer @@ -517,6 +587,7 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) static void smc_wr_init_sge(struct smc_link *lnk) { + int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; u32 i; for (i = 0; i < lnk->wr_tx_cnt; i++) { @@ -545,14 +616,44 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list = lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge; } + + if (lnk->lgr->smc_version == SMC_V2) { + lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr; + lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE; + lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey; + + lnk->wr_tx_v2_ib->next = NULL; + lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge; + lnk->wr_tx_v2_ib->num_sge = 1; + lnk->wr_tx_v2_ib->opcode = IB_WR_SEND; + lnk->wr_tx_v2_ib->send_flags = + IB_SEND_SIGNALED | IB_SEND_SOLICITED; + } + + /* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE. + * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer + * and the same buffer for all sges. When a larger message arrived then + * the content of the first small sge is copied to the beginning of + * the larger spillover buffer, allowing easy data mapping. + */ for (i = 0; i < lnk->wr_rx_cnt; i++) { - lnk->wr_rx_sges[i].addr = + int x = i * sges_per_buf; + + lnk->wr_rx_sges[x].addr = lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE; - lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE; - lnk->wr_rx_sges[i].lkey = lnk->roce_pd->local_dma_lkey; + lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE; + lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey; + if (lnk->lgr->smc_version == SMC_V2) { + lnk->wr_rx_sges[x + 1].addr = + lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE; + lnk->wr_rx_sges[x + 1].length = + SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE; + lnk->wr_rx_sges[x + 1].lkey = + lnk->roce_pd->local_dma_lkey; + } lnk->wr_rx_ibs[i].next = NULL; - lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i]; - lnk->wr_rx_ibs[i].num_sge = 1; + lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x]; + lnk->wr_rx_ibs[i].num_sge = sges_per_buf; } lnk->wr_reg.wr.next = NULL; lnk->wr_reg.wr.num_sge = 0; @@ -585,16 +686,45 @@ void smc_wr_free_link(struct smc_link *lnk) DMA_FROM_DEVICE); lnk->wr_rx_dma_addr = 0; } + if (lnk->wr_rx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_FROM_DEVICE); + lnk->wr_rx_v2_dma_addr = 0; + } if (lnk->wr_tx_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt, DMA_TO_DEVICE); lnk->wr_tx_dma_addr = 0; } + if (lnk->wr_tx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_TO_DEVICE); + lnk->wr_tx_v2_dma_addr = 0; + } +} + +void smc_wr_free_lgr_mem(struct smc_link_group *lgr) +{ + if (lgr->smc_version < SMC_V2) + return; + + kfree(lgr->wr_rx_buf_v2); + lgr->wr_rx_buf_v2 = NULL; + kfree(lgr->wr_tx_buf_v2); + lgr->wr_tx_buf_v2 = NULL; } void smc_wr_free_link_mem(struct smc_link *lnk) { + kfree(lnk->wr_tx_v2_ib); + lnk->wr_tx_v2_ib = NULL; + kfree(lnk->wr_tx_v2_sge); + lnk->wr_tx_v2_sge = NULL; + kfree(lnk->wr_tx_v2_pend); + lnk->wr_tx_v2_pend = NULL; kfree(lnk->wr_tx_compl); lnk->wr_tx_compl = NULL; kfree(lnk->wr_tx_pends); @@ -619,8 +749,26 @@ void smc_wr_free_link_mem(struct smc_link *lnk) lnk->wr_rx_bufs = NULL; } +int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr) +{ + if (lgr->smc_version < SMC_V2) + return 0; + + lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL); + if (!lgr->wr_rx_buf_v2) + return -ENOMEM; + lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL); + if (!lgr->wr_tx_buf_v2) { + kfree(lgr->wr_rx_buf_v2); + return -ENOMEM; + } + return 0; +} + int smc_wr_alloc_link_mem(struct smc_link *link) { + int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1; + /* allocate link related memory */ link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) @@ -653,7 +801,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link) if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, - sizeof(link->wr_rx_sges[0]), + sizeof(link->wr_rx_sges[0]) * sges_per_buf, GFP_KERNEL); if (!link->wr_rx_sges) goto no_mem_wr_tx_sges; @@ -672,8 +820,29 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_compl) goto no_mem_wr_tx_pends; + + if (link->lgr->smc_version == SMC_V2) { + link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib), + GFP_KERNEL); + if (!link->wr_tx_v2_ib) + goto no_mem_tx_compl; + link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge), + GFP_KERNEL); + if (!link->wr_tx_v2_sge) + goto no_mem_v2_ib; + link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend), + GFP_KERNEL); + if (!link->wr_tx_v2_pend) + goto no_mem_v2_sge; + } return 0; +no_mem_v2_sge: + kfree(link->wr_tx_v2_sge); +no_mem_v2_ib: + kfree(link->wr_tx_v2_ib); +no_mem_tx_compl: + kfree(link->wr_tx_compl); no_mem_wr_tx_pends: kfree(link->wr_tx_pends); no_mem_wr_tx_mask: @@ -725,6 +894,24 @@ int smc_wr_create_link(struct smc_link *lnk) rc = -EIO; goto out; } + if (lnk->lgr->smc_version == SMC_V2) { + lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev, + lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) { + lnk->wr_rx_v2_dma_addr = 0; + rc = -EIO; + goto dma_unmap; + } + lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev, + lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ibdev, lnk->wr_tx_v2_dma_addr)) { + lnk->wr_tx_v2_dma_addr = 0; + rc = -EIO; + goto dma_unmap; + } + } lnk->wr_tx_dma_addr = ib_dma_map_single( ibdev, lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt, DMA_TO_DEVICE); @@ -742,6 +929,18 @@ int smc_wr_create_link(struct smc_link *lnk) return rc; dma_unmap: + if (lnk->wr_rx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_FROM_DEVICE); + lnk->wr_rx_v2_dma_addr = 0; + } + if (lnk->wr_tx_v2_dma_addr) { + ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr, + SMC_WR_BUF_V2_SIZE, + DMA_TO_DEVICE); + lnk->wr_tx_v2_dma_addr = 0; + } ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, DMA_FROM_DEVICE); diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 423b8709f1c9..f353311e6f84 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -60,6 +60,20 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val) atomic_long_set(wr_tx_id, val); } +static inline bool smc_wr_tx_link_hold(struct smc_link *link) +{ + if (!smc_link_usable(link)) + return false; + atomic_inc(&link->wr_tx_refcnt); + return true; +} + +static inline void smc_wr_tx_link_put(struct smc_link *link) +{ + if (atomic_dec_and_test(&link->wr_tx_refcnt)) + wake_up_all(&link->wr_tx_wait); +} + static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk) { wake_up_all(&lnk->wr_tx_wait); @@ -87,8 +101,10 @@ static inline int smc_wr_rx_post(struct smc_link *link) int smc_wr_create_link(struct smc_link *lnk); int smc_wr_alloc_link_mem(struct smc_link *lnk); +int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr); void smc_wr_free_link(struct smc_link *lnk); void smc_wr_free_link_mem(struct smc_link *lnk); +void smc_wr_free_lgr_mem(struct smc_link_group *lgr); void smc_wr_remember_qp_attr(struct smc_link *lnk); void smc_wr_remove_dev(struct smc_ib_device *smcibdev); void smc_wr_add_dev(struct smc_ib_device *smcibdev); @@ -97,10 +113,16 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler, struct smc_wr_buf **wr_buf, struct smc_rdma_wr **wrs, struct smc_wr_tx_pend_priv **wr_pend_priv); +int smc_wr_tx_get_v2_slot(struct smc_link *link, + smc_wr_tx_handler handler, + struct smc_wr_v2_buf **wr_buf, + struct smc_wr_tx_pend_priv **wr_pend_priv); int smc_wr_tx_put_slot(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv); int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv); +int smc_wr_tx_v2_send(struct smc_link *link, + struct smc_wr_tx_pend_priv *priv, int len); int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, unsigned long timeout); void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); diff --git a/net/socket.c b/net/socket.c index 0b2dad3bdf7f..7f64a6eccf63 100644 --- a/net/socket.c +++ b/net/socket.c @@ -212,6 +212,7 @@ static const char * const pf_family_names[] = { [PF_QIPCRTR] = "PF_QIPCRTR", [PF_SMC] = "PF_SMC", [PF_XDP] = "PF_XDP", + [PF_MCTP] = "PF_MCTP", }; /* @@ -1064,9 +1065,13 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) */ static DEFINE_MUTEX(br_ioctl_mutex); -static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg); +static int (*br_ioctl_hook)(struct net *net, struct net_bridge *br, + unsigned int cmd, struct ifreq *ifr, + void __user *uarg); -void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *)) +void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br, + unsigned int cmd, struct ifreq *ifr, + void __user *uarg)) { mutex_lock(&br_ioctl_mutex); br_ioctl_hook = hook; @@ -1074,6 +1079,22 @@ void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *)) } EXPORT_SYMBOL(brioctl_set); +int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd, + struct ifreq *ifr, void __user *uarg) +{ + int err = -ENOPKG; + + if (!br_ioctl_hook) + request_module("bridge"); + + mutex_lock(&br_ioctl_mutex); + if (br_ioctl_hook) + err = br_ioctl_hook(net, br, cmd, ifr, uarg); + mutex_unlock(&br_ioctl_mutex); + + return err; +} + static DEFINE_MUTEX(vlan_ioctl_mutex); static int (*vlan_ioctl_hook) (struct net *, void __user *arg); @@ -1088,8 +1109,11 @@ EXPORT_SYMBOL(vlan_ioctl_set); static long sock_do_ioctl(struct net *net, struct socket *sock, unsigned int cmd, unsigned long arg) { + struct ifreq ifr; + bool need_copyout; int err; void __user *argp = (void __user *)arg; + void __user *data; err = sock->ops->ioctl(sock, cmd, arg); @@ -1100,25 +1124,16 @@ static long sock_do_ioctl(struct net *net, struct socket *sock, if (err != -ENOIOCTLCMD) return err; - if (cmd == SIOCGIFCONF) { - struct ifconf ifc; - if (copy_from_user(&ifc, argp, sizeof(struct ifconf))) - return -EFAULT; - rtnl_lock(); - err = dev_ifconf(net, &ifc, sizeof(struct ifreq)); - rtnl_unlock(); - if (!err && copy_to_user(argp, &ifc, sizeof(struct ifconf))) - err = -EFAULT; - } else { - struct ifreq ifr; - bool need_copyout; - if (copy_from_user(&ifr, argp, sizeof(struct ifreq))) + if (!is_socket_ioctl_cmd(cmd)) + return -ENOTTY; + + if (get_user_ifreq(&ifr, &data, argp)) + return -EFAULT; + err = dev_ioctl(net, cmd, &ifr, data, &need_copyout); + if (!err && need_copyout) + if (put_user_ifreq(&ifr, argp)) return -EFAULT; - err = dev_ioctl(net, cmd, &ifr, &need_copyout); - if (!err && need_copyout) - if (copy_to_user(argp, &ifr, sizeof(struct ifreq))) - return -EFAULT; - } + return err; } @@ -1140,12 +1155,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) net = sock_net(sk); if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) { struct ifreq ifr; + void __user *data; bool need_copyout; - if (copy_from_user(&ifr, argp, sizeof(struct ifreq))) + if (get_user_ifreq(&ifr, &data, argp)) return -EFAULT; - err = dev_ioctl(net, cmd, &ifr, &need_copyout); + err = dev_ioctl(net, cmd, &ifr, data, &need_copyout); if (!err && need_copyout) - if (copy_to_user(argp, &ifr, sizeof(struct ifreq))) + if (put_user_ifreq(&ifr, argp)) return -EFAULT; } else #ifdef CONFIG_WEXT_CORE @@ -1170,14 +1186,7 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) case SIOCSIFBR: case SIOCBRADDBR: case SIOCBRDELBR: - err = -ENOPKG; - if (!br_ioctl_hook) - request_module("bridge"); - - mutex_lock(&br_ioctl_mutex); - if (br_ioctl_hook) - err = br_ioctl_hook(net, cmd, argp); - mutex_unlock(&br_ioctl_mutex); + err = br_ioctl_call(net, NULL, cmd, NULL, argp); break; case SIOCGIFVLAN: case SIOCSIFVLAN: @@ -1217,6 +1226,11 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) cmd == SIOCGSTAMP_NEW, false); break; + + case SIOCGIFCONF: + err = dev_ifconf(net, argp); + break; + default: err = sock_do_ioctl(net, sock, cmd, arg); break; @@ -1722,32 +1736,22 @@ SYSCALL_DEFINE2(listen, int, fd, int, backlog) return __sys_listen(fd, backlog); } -int __sys_accept4_file(struct file *file, unsigned file_flags, +struct file *do_accept(struct file *file, unsigned file_flags, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, int flags, - unsigned long nofile) + int __user *upeer_addrlen, int flags) { struct socket *sock, *newsock; struct file *newfile; - int err, len, newfd; + int err, len; struct sockaddr_storage address; - if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) - return -EINVAL; - - if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) - flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - sock = sock_from_file(file); - if (!sock) { - err = -ENOTSOCK; - goto out; - } + if (!sock) + return ERR_PTR(-ENOTSOCK); - err = -ENFILE; newsock = sock_alloc(); if (!newsock) - goto out; + return ERR_PTR(-ENFILE); newsock->type = sock->type; newsock->ops = sock->ops; @@ -1758,18 +1762,9 @@ int __sys_accept4_file(struct file *file, unsigned file_flags, */ __module_get(newsock->ops->owner); - newfd = __get_unused_fd_flags(flags, nofile); - if (unlikely(newfd < 0)) { - err = newfd; - sock_release(newsock); - goto out; - } newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name); - if (IS_ERR(newfile)) { - err = PTR_ERR(newfile); - put_unused_fd(newfd); - goto out; - } + if (IS_ERR(newfile)) + return newfile; err = security_socket_accept(sock, newsock); if (err) @@ -1794,16 +1789,38 @@ int __sys_accept4_file(struct file *file, unsigned file_flags, } /* File flags are not inherited via accept() unlike another OSes. */ - - fd_install(newfd, newfile); - err = newfd; -out: - return err; + return newfile; out_fd: fput(newfile); - put_unused_fd(newfd); - goto out; + return ERR_PTR(err); +} + +int __sys_accept4_file(struct file *file, unsigned file_flags, + struct sockaddr __user *upeer_sockaddr, + int __user *upeer_addrlen, int flags, + unsigned long nofile) +{ + struct file *newfile; + int newfd; + + if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) + flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; + + newfd = __get_unused_fd_flags(flags, nofile); + if (unlikely(newfd < 0)) + return newfd; + + newfile = do_accept(file, file_flags, upeer_sockaddr, upeer_addrlen, + flags); + if (IS_ERR(newfile)) { + put_unused_fd(newfd); + return PTR_ERR(newfile); + } + fd_install(newfd, newfile); + return newfd; } /* @@ -3126,154 +3143,55 @@ void socket_seq_show(struct seq_file *seq) } #endif /* CONFIG_PROC_FS */ -#ifdef CONFIG_COMPAT -static int compat_dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32) +/* Handle the fact that while struct ifreq has the same *layout* on + * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data, + * which are handled elsewhere, it still has different *size* due to + * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit, + * resulting in struct ifreq being 32 and 40 bytes respectively). + * As a result, if the struct happens to be at the end of a page and + * the next page isn't readable/writable, we get a fault. To prevent + * that, copy back and forth to the full size. + */ +int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg) { - struct compat_ifconf ifc32; - struct ifconf ifc; - int err; + if (in_compat_syscall()) { + struct compat_ifreq *ifr32 = (struct compat_ifreq *)ifr; - if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf))) - return -EFAULT; + memset(ifr, 0, sizeof(*ifr)); + if (copy_from_user(ifr32, arg, sizeof(*ifr32))) + return -EFAULT; - ifc.ifc_len = ifc32.ifc_len; - ifc.ifc_req = compat_ptr(ifc32.ifcbuf); + if (ifrdata) + *ifrdata = compat_ptr(ifr32->ifr_data); - rtnl_lock(); - err = dev_ifconf(net, &ifc, sizeof(struct compat_ifreq)); - rtnl_unlock(); - if (err) - return err; + return 0; + } - ifc32.ifc_len = ifc.ifc_len; - if (copy_to_user(uifc32, &ifc32, sizeof(struct compat_ifconf))) + if (copy_from_user(ifr, arg, sizeof(*ifr))) return -EFAULT; + if (ifrdata) + *ifrdata = ifr->ifr_data; + return 0; } +EXPORT_SYMBOL(get_user_ifreq); -static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32) +int put_user_ifreq(struct ifreq *ifr, void __user *arg) { - struct compat_ethtool_rxnfc __user *compat_rxnfc; - bool convert_in = false, convert_out = false; - size_t buf_size = 0; - struct ethtool_rxnfc __user *rxnfc = NULL; - struct ifreq ifr; - u32 rule_cnt = 0, actual_rule_cnt; - u32 ethcmd; - u32 data; - int ret; + size_t size = sizeof(*ifr); - if (get_user(data, &ifr32->ifr_ifru.ifru_data)) - return -EFAULT; + if (in_compat_syscall()) + size = sizeof(struct compat_ifreq); - compat_rxnfc = compat_ptr(data); - - if (get_user(ethcmd, &compat_rxnfc->cmd)) + if (copy_to_user(arg, ifr, size)) return -EFAULT; - /* Most ethtool structures are defined without padding. - * Unfortunately struct ethtool_rxnfc is an exception. - */ - switch (ethcmd) { - default: - break; - case ETHTOOL_GRXCLSRLALL: - /* Buffer size is variable */ - if (get_user(rule_cnt, &compat_rxnfc->rule_cnt)) - return -EFAULT; - if (rule_cnt > KMALLOC_MAX_SIZE / sizeof(u32)) - return -ENOMEM; - buf_size += rule_cnt * sizeof(u32); - fallthrough; - case ETHTOOL_GRXRINGS: - case ETHTOOL_GRXCLSRLCNT: - case ETHTOOL_GRXCLSRULE: - case ETHTOOL_SRXCLSRLINS: - convert_out = true; - fallthrough; - case ETHTOOL_SRXCLSRLDEL: - buf_size += sizeof(struct ethtool_rxnfc); - convert_in = true; - rxnfc = compat_alloc_user_space(buf_size); - break; - } - - if (copy_from_user(&ifr.ifr_name, &ifr32->ifr_name, IFNAMSIZ)) - return -EFAULT; - - ifr.ifr_data = convert_in ? rxnfc : (void __user *)compat_rxnfc; - - if (convert_in) { - /* We expect there to be holes between fs.m_ext and - * fs.ring_cookie and at the end of fs, but nowhere else. - */ - BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) + - sizeof(compat_rxnfc->fs.m_ext) != - offsetof(struct ethtool_rxnfc, fs.m_ext) + - sizeof(rxnfc->fs.m_ext)); - BUILD_BUG_ON( - offsetof(struct compat_ethtool_rxnfc, fs.location) - - offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) != - offsetof(struct ethtool_rxnfc, fs.location) - - offsetof(struct ethtool_rxnfc, fs.ring_cookie)); - - if (copy_in_user(rxnfc, compat_rxnfc, - (void __user *)(&rxnfc->fs.m_ext + 1) - - (void __user *)rxnfc) || - copy_in_user(&rxnfc->fs.ring_cookie, - &compat_rxnfc->fs.ring_cookie, - (void __user *)(&rxnfc->fs.location + 1) - - (void __user *)&rxnfc->fs.ring_cookie)) - return -EFAULT; - if (ethcmd == ETHTOOL_GRXCLSRLALL) { - if (put_user(rule_cnt, &rxnfc->rule_cnt)) - return -EFAULT; - } else if (copy_in_user(&rxnfc->rule_cnt, - &compat_rxnfc->rule_cnt, - sizeof(rxnfc->rule_cnt))) - return -EFAULT; - } - - ret = dev_ioctl(net, SIOCETHTOOL, &ifr, NULL); - if (ret) - return ret; - - if (convert_out) { - if (copy_in_user(compat_rxnfc, rxnfc, - (const void __user *)(&rxnfc->fs.m_ext + 1) - - (const void __user *)rxnfc) || - copy_in_user(&compat_rxnfc->fs.ring_cookie, - &rxnfc->fs.ring_cookie, - (const void __user *)(&rxnfc->fs.location + 1) - - (const void __user *)&rxnfc->fs.ring_cookie) || - copy_in_user(&compat_rxnfc->rule_cnt, &rxnfc->rule_cnt, - sizeof(rxnfc->rule_cnt))) - return -EFAULT; - - if (ethcmd == ETHTOOL_GRXCLSRLALL) { - /* As an optimisation, we only copy the actual - * number of rules that the underlying - * function returned. Since Mallory might - * change the rule count in user memory, we - * check that it is less than the rule count - * originally given (as the user buffer size), - * which has been range-checked. - */ - if (get_user(actual_rule_cnt, &rxnfc->rule_cnt)) - return -EFAULT; - if (actual_rule_cnt < rule_cnt) - rule_cnt = actual_rule_cnt; - if (copy_in_user(&compat_rxnfc->rule_locs[0], - &rxnfc->rule_locs[0], - rule_cnt * sizeof(u32))) - return -EFAULT; - } - } - return 0; } +EXPORT_SYMBOL(put_user_ifreq); +#ifdef CONFIG_COMPAT static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32) { compat_uptr_t uptr32; @@ -3281,7 +3199,7 @@ static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32 void __user *saved; int err; - if (copy_from_user(&ifr, uifr32, sizeof(struct compat_ifreq))) + if (get_user_ifreq(&ifr, NULL, uifr32)) return -EFAULT; if (get_user(uptr32, &uifr32->ifr_settings.ifs_ifsu)) @@ -3290,10 +3208,10 @@ static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32 saved = ifr.ifr_settings.ifs_ifsu.raw_hdlc; ifr.ifr_settings.ifs_ifsu.raw_hdlc = compat_ptr(uptr32); - err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL); + err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL, NULL); if (!err) { ifr.ifr_settings.ifs_ifsu.raw_hdlc = saved; - if (copy_to_user(uifr32, &ifr, sizeof(struct compat_ifreq))) + if (put_user_ifreq(&ifr, uifr32)) err = -EFAULT; } return err; @@ -3304,97 +3222,15 @@ static int compat_ifr_data_ioctl(struct net *net, unsigned int cmd, struct compat_ifreq __user *u_ifreq32) { struct ifreq ifreq; - u32 data32; + void __user *data; - if (copy_from_user(ifreq.ifr_name, u_ifreq32->ifr_name, IFNAMSIZ)) - return -EFAULT; - if (get_user(data32, &u_ifreq32->ifr_data)) + if (!is_socket_ioctl_cmd(cmd)) + return -ENOTTY; + if (get_user_ifreq(&ifreq, &data, u_ifreq32)) return -EFAULT; - ifreq.ifr_data = compat_ptr(data32); + ifreq.ifr_data = data; - return dev_ioctl(net, cmd, &ifreq, NULL); -} - -static int compat_ifreq_ioctl(struct net *net, struct socket *sock, - unsigned int cmd, - struct compat_ifreq __user *uifr32) -{ - struct ifreq __user *uifr; - int err; - - /* Handle the fact that while struct ifreq has the same *layout* on - * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data, - * which are handled elsewhere, it still has different *size* due to - * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit, - * resulting in struct ifreq being 32 and 40 bytes respectively). - * As a result, if the struct happens to be at the end of a page and - * the next page isn't readable/writable, we get a fault. To prevent - * that, copy back and forth to the full size. - */ - - uifr = compat_alloc_user_space(sizeof(*uifr)); - if (copy_in_user(uifr, uifr32, sizeof(*uifr32))) - return -EFAULT; - - err = sock_do_ioctl(net, sock, cmd, (unsigned long)uifr); - - if (!err) { - switch (cmd) { - case SIOCGIFFLAGS: - case SIOCGIFMETRIC: - case SIOCGIFMTU: - case SIOCGIFMEM: - case SIOCGIFHWADDR: - case SIOCGIFINDEX: - case SIOCGIFADDR: - case SIOCGIFBRDADDR: - case SIOCGIFDSTADDR: - case SIOCGIFNETMASK: - case SIOCGIFPFLAGS: - case SIOCGIFTXQLEN: - case SIOCGMIIPHY: - case SIOCGMIIREG: - case SIOCGIFNAME: - if (copy_in_user(uifr32, uifr, sizeof(*uifr32))) - err = -EFAULT; - break; - } - } - return err; -} - -static int compat_sioc_ifmap(struct net *net, unsigned int cmd, - struct compat_ifreq __user *uifr32) -{ - struct ifreq ifr; - struct compat_ifmap __user *uifmap32; - int err; - - uifmap32 = &uifr32->ifr_ifru.ifru_map; - err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name)); - err |= get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); - err |= get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); - err |= get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); - err |= get_user(ifr.ifr_map.irq, &uifmap32->irq); - err |= get_user(ifr.ifr_map.dma, &uifmap32->dma); - err |= get_user(ifr.ifr_map.port, &uifmap32->port); - if (err) - return -EFAULT; - - err = dev_ioctl(net, cmd, &ifr, NULL); - - if (cmd == SIOCGIFMAP && !err) { - err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name)); - err |= put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); - err |= put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); - err |= put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); - err |= put_user(ifr.ifr_map.irq, &uifmap32->irq); - err |= put_user(ifr.ifr_map.dma, &uifmap32->dma); - err |= put_user(ifr.ifr_map.port, &uifmap32->port); - if (err) - err = -EFAULT; - } - return err; + return dev_ioctl(net, cmd, &ifreq, data, NULL); } /* Since old style bridge ioctl's endup using SIOCDEVPRIVATE @@ -3420,21 +3256,14 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, struct net *net = sock_net(sk); if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) - return compat_ifr_data_ioctl(net, cmd, argp); + return sock_ioctl(file, cmd, (unsigned long)argp); switch (cmd) { case SIOCSIFBR: case SIOCGIFBR: return old_bridge_ioctl(argp); - case SIOCGIFCONF: - return compat_dev_ifconf(net, argp); - case SIOCETHTOOL: - return ethtool_ioctl(net, argp); case SIOCWANDEV: return compat_siocwandev(net, argp); - case SIOCGIFMAP: - case SIOCSIFMAP: - return compat_sioc_ifmap(net, cmd, argp); case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: if (!sock->ops->gettstamp) @@ -3442,6 +3271,7 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, return sock->ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD, !COMPAT_USE_64BIT_TIME); + case SIOCETHTOOL: case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: case SIOCSHWTSTAMP: @@ -3459,10 +3289,13 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCGSKNS: case SIOCGSTAMP_NEW: case SIOCGSTAMPNS_NEW: + case SIOCGIFCONF: return sock_ioctl(file, cmd, arg); case SIOCGIFFLAGS: case SIOCSIFFLAGS: + case SIOCGIFMAP: + case SIOCSIFMAP: case SIOCGIFMETRIC: case SIOCSIFMETRIC: case SIOCGIFMTU: @@ -3499,8 +3332,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCBONDRELEASE: case SIOCBONDSETHWADDR: case SIOCBONDCHANGEACTIVE: - return compat_ifreq_ioctl(net, sock, cmd, argp); - case SIOCSARP: case SIOCGARP: case SIOCDARP: diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c index d1c003a25b0f..61c276bddaf2 100644 --- a/net/sunrpc/auth_gss/gss_rpc_upcall.c +++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c @@ -160,7 +160,7 @@ static struct rpc_clnt *get_gssp_clnt(struct sunrpc_net *sn) mutex_lock(&sn->gssp_lock); clnt = sn->gssp_clnt; if (clnt) - atomic_inc(&clnt->cl_count); + refcount_inc(&clnt->cl_count); mutex_unlock(&sn->gssp_lock); return clnt; } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index a81be45f40d9..1f2817195549 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -194,6 +194,8 @@ static void rsi_request(struct cache_detail *cd, qword_addhex(bpp, blen, rsii->in_handle.data, rsii->in_handle.len); qword_addhex(bpp, blen, rsii->in_token.data, rsii->in_token.len); (*bpp)[-1] = '\n'; + WARN_ONCE(*blen < 0, + "RPCSEC/GSS credential too large - please use gssproxy\n"); } static int rsi_parse(struct cache_detail *cd, @@ -643,7 +645,7 @@ static bool gss_check_seq_num(const struct svc_rqst *rqstp, struct rsc *rsci, } __set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win); goto ok; - } else if (seq_num <= sd->sd_max - GSS_SEQ_WIN) { + } else if (seq_num + GSS_SEQ_WIN <= sd->sd_max) { goto toolow; } if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win)) @@ -707,11 +709,11 @@ svc_safe_putnetobj(struct kvec *resv, struct xdr_netobj *o) /* * Verify the checksum on the header and return SVC_OK on success. * Otherwise, return SVC_DROP (in the case of a bad sequence number) - * or return SVC_DENIED and indicate error in authp. + * or return SVC_DENIED and indicate error in rqstp->rq_auth_stat. */ static int gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, - __be32 *rpcstart, struct rpc_gss_wire_cred *gc, __be32 *authp) + __be32 *rpcstart, struct rpc_gss_wire_cred *gc) { struct gss_ctx *ctx_id = rsci->mechctx; struct xdr_buf rpchdr; @@ -725,7 +727,7 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, iov.iov_len = (u8 *)argv->iov_base - (u8 *)rpcstart; xdr_buf_from_iov(&iov, &rpchdr); - *authp = rpc_autherr_badverf; + rqstp->rq_auth_stat = rpc_autherr_badverf; if (argv->iov_len < 4) return SVC_DENIED; flavor = svc_getnl(argv); @@ -737,13 +739,13 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, if (rqstp->rq_deferred) /* skip verification of revisited request */ return SVC_OK; if (gss_verify_mic(ctx_id, &rpchdr, &checksum) != GSS_S_COMPLETE) { - *authp = rpcsec_gsserr_credproblem; + rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; return SVC_DENIED; } if (gc->gc_seq > MAXSEQ) { trace_rpcgss_svc_seqno_large(rqstp, gc->gc_seq); - *authp = rpcsec_gsserr_ctxproblem; + rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem; return SVC_DENIED; } if (!gss_check_seq_num(rqstp, rsci, gc->gc_seq)) @@ -1038,6 +1040,8 @@ svcauth_gss_set_client(struct svc_rqst *rqstp) struct rpc_gss_wire_cred *gc = &svcdata->clcred; int stat; + rqstp->rq_auth_stat = rpc_autherr_badcred; + /* * A gss export can be specified either by: * export *(sec=krb5,rw) @@ -1053,6 +1057,8 @@ svcauth_gss_set_client(struct svc_rqst *rqstp) stat = svcauth_unix_set_client(rqstp); if (stat == SVC_DROP || stat == SVC_CLOSE) return stat; + + rqstp->rq_auth_stat = rpc_auth_ok; return SVC_OK; } @@ -1142,7 +1148,7 @@ static void gss_free_in_token_pages(struct gssp_in_token *in_token) } static int gss_read_proxy_verf(struct svc_rqst *rqstp, - struct rpc_gss_wire_cred *gc, __be32 *authp, + struct rpc_gss_wire_cred *gc, struct xdr_netobj *in_handle, struct gssp_in_token *in_token) { @@ -1151,7 +1157,7 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp, int pages, i, res, pgto, pgfrom; size_t inlen, to_offs, from_offs; - res = gss_read_common_verf(gc, argv, authp, in_handle); + res = gss_read_common_verf(gc, argv, &rqstp->rq_auth_stat, in_handle); if (res) return res; @@ -1227,7 +1233,7 @@ gss_write_resv(struct kvec *resv, size_t size_limit, * Otherwise, drop the request pending an answer to the upcall. */ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp, - struct rpc_gss_wire_cred *gc, __be32 *authp) + struct rpc_gss_wire_cred *gc) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; @@ -1236,7 +1242,7 @@ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp, struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); memset(&rsikey, 0, sizeof(rsikey)); - ret = gss_read_verf(gc, argv, authp, + ret = gss_read_verf(gc, argv, &rqstp->rq_auth_stat, &rsikey.in_handle, &rsikey.in_token); if (ret) return ret; @@ -1339,7 +1345,7 @@ out: } static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, - struct rpc_gss_wire_cred *gc, __be32 *authp) + struct rpc_gss_wire_cred *gc) { struct kvec *resv = &rqstp->rq_res.head[0]; struct xdr_netobj cli_handle; @@ -1351,8 +1357,7 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); memset(&ud, 0, sizeof(ud)); - ret = gss_read_proxy_verf(rqstp, gc, authp, - &ud.in_handle, &ud.in_token); + ret = gss_read_proxy_verf(rqstp, gc, &ud.in_handle, &ud.in_token); if (ret) return ret; @@ -1525,7 +1530,7 @@ static void destroy_use_gss_proxy_proc_entry(struct net *net) {} * response here and return SVC_COMPLETE. */ static int -svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) +svcauth_gss_accept(struct svc_rqst *rqstp) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; @@ -1538,7 +1543,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; if (!svcdata) svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL); if (!svcdata) @@ -1575,22 +1580,22 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) if ((gc->gc_proc != RPC_GSS_PROC_DATA) && (rqstp->rq_proc != 0)) goto auth_err; - *authp = rpc_autherr_badverf; + rqstp->rq_auth_stat = rpc_autherr_badverf; switch (gc->gc_proc) { case RPC_GSS_PROC_INIT: case RPC_GSS_PROC_CONTINUE_INIT: if (use_gss_proxy(SVC_NET(rqstp))) - return svcauth_gss_proxy_init(rqstp, gc, authp); + return svcauth_gss_proxy_init(rqstp, gc); else - return svcauth_gss_legacy_init(rqstp, gc, authp); + return svcauth_gss_legacy_init(rqstp, gc); case RPC_GSS_PROC_DATA: case RPC_GSS_PROC_DESTROY: /* Look up the context, and check the verifier: */ - *authp = rpcsec_gsserr_credproblem; + rqstp->rq_auth_stat = rpcsec_gsserr_credproblem; rsci = gss_svc_searchbyctx(sn->rsc_cache, &gc->gc_ctx); if (!rsci) goto auth_err; - switch (gss_verify_header(rqstp, rsci, rpcstart, gc, authp)) { + switch (gss_verify_header(rqstp, rsci, rpcstart, gc)) { case SVC_OK: break; case SVC_DENIED: @@ -1600,7 +1605,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) } break; default: - *authp = rpc_autherr_rejectedcred; + rqstp->rq_auth_stat = rpc_autherr_rejectedcred; goto auth_err; } @@ -1616,13 +1621,13 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) svc_putnl(resv, RPC_SUCCESS); goto complete; case RPC_GSS_PROC_DATA: - *authp = rpcsec_gsserr_ctxproblem; + rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem; svcdata->verf_start = resv->iov_base + resv->iov_len; if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; rqstp->rq_cred = rsci->cred; get_group_info(rsci->cred.cr_group_info); - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; switch (gc->gc_svc) { case RPC_GSS_SVC_NONE: break; @@ -1980,7 +1985,7 @@ gss_svc_init_net(struct net *net) goto out2; return 0; out2: - destroy_use_gss_proxy_proc_entry(net); + rsi_cache_destroy_net(net); out1: rsc_cache_destroy_net(net); return rv; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 1a2c1c44bb00..59641803472c 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -803,7 +803,7 @@ static int cache_request(struct cache_detail *detail, detail->cache_request(detail, crq->item, &bp, &len); if (len < 0) - return -EAGAIN; + return -E2BIG; return PAGE_SIZE - len; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8b4de70e8ead..f056ff931444 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -167,7 +167,7 @@ static int rpc_clnt_skip_event(struct rpc_clnt *clnt, unsigned long event) case RPC_PIPEFS_MOUNT: if (clnt->cl_pipedir_objects.pdh_dentry != NULL) return 1; - if (atomic_read(&clnt->cl_count) == 0) + if (refcount_read(&clnt->cl_count) == 0) return 1; break; case RPC_PIPEFS_UMOUNT: @@ -419,7 +419,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, clnt->cl_rtt = &clnt->cl_rtt_default; rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval); - atomic_set(&clnt->cl_count, 1); + refcount_set(&clnt->cl_count, 1); if (nodename == NULL) nodename = utsname()->nodename; @@ -431,7 +431,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, if (err) goto out_no_path; if (parent) - atomic_inc(&parent->cl_count); + refcount_inc(&parent->cl_count); trace_rpc_clnt_new(clnt, xprt, program->name, args->servername); return clnt; @@ -918,18 +918,16 @@ rpc_free_client(struct rpc_clnt *clnt) static struct rpc_clnt * rpc_free_auth(struct rpc_clnt *clnt) { - if (clnt->cl_auth == NULL) - return rpc_free_client(clnt); - /* * Note: RPCSEC_GSS may need to send NULL RPC calls in order to * release remaining GSS contexts. This mechanism ensures * that it can do so safely. */ - atomic_inc(&clnt->cl_count); - rpcauth_release(clnt->cl_auth); - clnt->cl_auth = NULL; - if (atomic_dec_and_test(&clnt->cl_count)) + if (clnt->cl_auth != NULL) { + rpcauth_release(clnt->cl_auth); + clnt->cl_auth = NULL; + } + if (refcount_dec_and_test(&clnt->cl_count)) return rpc_free_client(clnt); return NULL; } @@ -943,7 +941,7 @@ rpc_release_client(struct rpc_clnt *clnt) do { if (list_empty(&clnt->cl_tasks)) wake_up(&destroy_wait); - if (!atomic_dec_and_test(&clnt->cl_count)) + if (refcount_dec_not_one(&clnt->cl_count)) break; clnt = rpc_free_auth(clnt); } while (clnt != NULL); @@ -1082,7 +1080,7 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) if (clnt != NULL) { rpc_task_set_transport(task, clnt); task->tk_client = clnt; - atomic_inc(&clnt->cl_count); + refcount_inc(&clnt->cl_count); if (clnt->cl_softrtry) task->tk_flags |= RPC_TASK_SOFT; if (clnt->cl_softerr) @@ -2694,17 +2692,18 @@ static const struct rpc_procinfo rpcproc_null = { .p_decode = rpcproc_decode_null, }; -static int rpc_ping(struct rpc_clnt *clnt) +static void +rpc_null_call_prepare(struct rpc_task *task, void *data) { - struct rpc_message msg = { - .rpc_proc = &rpcproc_null, - }; - int err; - err = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN | - RPC_TASK_NULLCREDS); - return err; + task->tk_flags &= ~RPC_TASK_NO_RETRANS_TIMEOUT; + rpc_call_start(task); } +static const struct rpc_call_ops rpc_null_ops = { + .rpc_call_prepare = rpc_null_call_prepare, + .rpc_call_done = rpc_default_callback, +}; + static struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt, struct rpc_xprt *xprt, struct rpc_cred *cred, int flags, @@ -2718,7 +2717,7 @@ struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt, .rpc_xprt = xprt, .rpc_message = &msg, .rpc_op_cred = cred, - .callback_ops = (ops != NULL) ? ops : &rpc_default_ops, + .callback_ops = ops ?: &rpc_null_ops, .callback_data = data, .flags = flags | RPC_TASK_SOFT | RPC_TASK_SOFTCONN | RPC_TASK_NULLCREDS, @@ -2733,6 +2732,19 @@ struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int } EXPORT_SYMBOL_GPL(rpc_call_null); +static int rpc_ping(struct rpc_clnt *clnt) +{ + struct rpc_task *task; + int status; + + task = rpc_call_null_helper(clnt, NULL, NULL, 0, NULL, NULL); + if (IS_ERR(task)) + return PTR_ERR(task); + status = task->tk_status; + rpc_put_task(task); + return status; +} + struct rpc_cb_add_xprt_calldata { struct rpc_xprt_switch *xps; struct rpc_xprt *xprt; @@ -2756,6 +2768,7 @@ static void rpc_cb_add_xprt_release(void *calldata) } static const struct rpc_call_ops rpc_cb_add_xprt_call_ops = { + .rpc_call_prepare = rpc_null_call_prepare, .rpc_call_done = rpc_cb_add_xprt_done, .rpc_release = rpc_cb_add_xprt_release, }; @@ -2774,6 +2787,15 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, struct rpc_cb_add_xprt_calldata *data; struct rpc_task *task; + if (xps->xps_nunique_destaddr_xprts + 1 > clnt->cl_max_connect) { + rcu_read_lock(); + pr_warn("SUNRPC: reached max allowed number (%d) did not add " + "transport to server: %s\n", clnt->cl_max_connect, + rpc_peeraddr2str(clnt, RPC_DISPLAY_ADDR)); + rcu_read_unlock(); + return -EINVAL; + } + data = kmalloc(sizeof(*data), GFP_NOFS); if (!data) return -ENOMEM; @@ -2786,7 +2808,7 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, task = rpc_call_null_helper(clnt, xprt, NULL, RPC_TASK_ASYNC, &rpc_cb_add_xprt_call_ops, data); - + data->xps->xps_nunique_destaddr_xprts++; rpc_put_task(task); success: return 1; diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index 56029e3af6ff..7dc9cc929bfd 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -8,14 +8,14 @@ #include <linux/debugfs.h> #include <linux/sunrpc/sched.h> #include <linux/sunrpc/clnt.h> + #include "netns.h" +#include "fail.h" static struct dentry *topdir; static struct dentry *rpc_clnt_dir; static struct dentry *rpc_xprt_dir; -unsigned int rpc_inject_disconnect; - static int tasks_show(struct seq_file *f, void *v) { @@ -90,7 +90,7 @@ static int tasks_open(struct inode *inode, struct file *filp) struct seq_file *seq = filp->private_data; struct rpc_clnt *clnt = seq->private = inode->i_private; - if (!atomic_inc_not_zero(&clnt->cl_count)) { + if (!refcount_inc_not_zero(&clnt->cl_count)) { seq_release(inode, filp); ret = -EINVAL; } @@ -235,8 +235,6 @@ rpc_xprt_debugfs_register(struct rpc_xprt *xprt) /* make tasks file */ debugfs_create_file("info", S_IFREG | 0400, xprt->debugfs, xprt, &xprt_info_fops); - - atomic_set(&xprt->inject_disconnect, rpc_inject_disconnect); } void @@ -246,56 +244,30 @@ rpc_xprt_debugfs_unregister(struct rpc_xprt *xprt) xprt->debugfs = NULL; } -static int -fault_open(struct inode *inode, struct file *filp) -{ - filp->private_data = kmalloc(128, GFP_KERNEL); - if (!filp->private_data) - return -ENOMEM; - return 0; -} +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) +struct fail_sunrpc_attr fail_sunrpc = { + .attr = FAULT_ATTR_INITIALIZER, +}; +EXPORT_SYMBOL_GPL(fail_sunrpc); -static int -fault_release(struct inode *inode, struct file *filp) +static void fail_sunrpc_init(void) { - kfree(filp->private_data); - return 0; -} + struct dentry *dir; -static ssize_t -fault_disconnect_read(struct file *filp, char __user *user_buf, - size_t len, loff_t *offset) -{ - char *buffer = (char *)filp->private_data; - size_t size; + dir = fault_create_debugfs_attr("fail_sunrpc", NULL, + &fail_sunrpc.attr); - size = sprintf(buffer, "%u\n", rpc_inject_disconnect); - return simple_read_from_buffer(user_buf, len, offset, buffer, size); -} + debugfs_create_bool("ignore-client-disconnect", S_IFREG | 0600, dir, + &fail_sunrpc.ignore_client_disconnect); -static ssize_t -fault_disconnect_write(struct file *filp, const char __user *user_buf, - size_t len, loff_t *offset) + debugfs_create_bool("ignore-server-disconnect", S_IFREG | 0600, dir, + &fail_sunrpc.ignore_server_disconnect); +} +#else +static void fail_sunrpc_init(void) { - char buffer[16]; - - if (len >= sizeof(buffer)) - len = sizeof(buffer) - 1; - if (copy_from_user(buffer, user_buf, len)) - return -EFAULT; - buffer[len] = '\0'; - if (kstrtouint(buffer, 10, &rpc_inject_disconnect)) - return -EINVAL; - return len; } - -static const struct file_operations fault_disconnect_fops = { - .owner = THIS_MODULE, - .open = fault_open, - .read = fault_disconnect_read, - .write = fault_disconnect_write, - .release = fault_release, -}; +#endif void __exit sunrpc_debugfs_exit(void) @@ -309,16 +281,11 @@ sunrpc_debugfs_exit(void) void __init sunrpc_debugfs_init(void) { - struct dentry *rpc_fault_dir; - topdir = debugfs_create_dir("sunrpc", NULL); rpc_clnt_dir = debugfs_create_dir("rpc_clnt", topdir); rpc_xprt_dir = debugfs_create_dir("rpc_xprt", topdir); - rpc_fault_dir = debugfs_create_dir("inject_fault", topdir); - - debugfs_create_file("disconnect", S_IFREG | 0400, rpc_fault_dir, NULL, - &fault_disconnect_fops); + fail_sunrpc_init(); } diff --git a/net/sunrpc/fail.h b/net/sunrpc/fail.h new file mode 100644 index 000000000000..69dc30cc44b8 --- /dev/null +++ b/net/sunrpc/fail.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021, Oracle. All rights reserved. + */ + +#ifndef _NET_SUNRPC_FAIL_H_ +#define _NET_SUNRPC_FAIL_H_ + +#include <linux/fault-inject.h> + +#if IS_ENABLED(CONFIG_FAULT_INJECTION) + +struct fail_sunrpc_attr { + struct fault_attr attr; + + bool ignore_client_disconnect; + + bool ignore_server_disconnect; +}; + +extern struct fail_sunrpc_attr fail_sunrpc; + +#endif /* CONFIG_FAULT_INJECTION */ + +#endif /* _NET_SUNRPC_FAIL_H_ */ diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 09c000d490a1..ee5336d73fdd 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -423,7 +423,7 @@ rpc_info_open(struct inode *inode, struct file *file) spin_lock(&file->f_path.dentry->d_lock); if (!d_unhashed(file->f_path.dentry)) clnt = RPC_I(inode)->private; - if (clnt != NULL && atomic_inc_not_zero(&clnt->cl_count)) { + if (clnt != NULL && refcount_inc_not_zero(&clnt->cl_count)) { spin_unlock(&file->f_path.dentry->d_lock); m->private = clnt; } else { diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 0de918cb3d90..a3bbe5ce4570 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -31,6 +31,8 @@ #include <trace/events/sunrpc.h> +#include "fail.h" + #define RPCDBG_FACILITY RPCDBG_SVCDSP static void svc_unregister(const struct svc_serv *serv, struct net *net); @@ -838,6 +840,27 @@ svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrser } EXPORT_SYMBOL_GPL(svc_set_num_threads_sync); +/** + * svc_rqst_replace_page - Replace one page in rq_pages[] + * @rqstp: svc_rqst with pages to replace + * @page: replacement page + * + * When replacing a page in rq_pages, batch the release of the + * replaced pages to avoid hammering the page allocator. + */ +void svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page) +{ + if (*rqstp->rq_next_page) { + if (!pagevec_space(&rqstp->rq_pvec)) + __pagevec_release(&rqstp->rq_pvec); + pagevec_add(&rqstp->rq_pvec, *rqstp->rq_next_page); + } + + get_page(page); + *(rqstp->rq_next_page++) = page; +} +EXPORT_SYMBOL_GPL(svc_rqst_replace_page); + /* * Called from a server thread as it's exiting. Caller must hold the "service * mutex" for the service. @@ -1163,22 +1186,6 @@ void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) {} #endif -__be32 -svc_return_autherr(struct svc_rqst *rqstp, __be32 auth_err) -{ - set_bit(RQ_AUTHERR, &rqstp->rq_flags); - return auth_err; -} -EXPORT_SYMBOL_GPL(svc_return_autherr); - -static __be32 -svc_get_autherr(struct svc_rqst *rqstp, __be32 *statp) -{ - if (test_and_clear_bit(RQ_AUTHERR, &rqstp->rq_flags)) - return *statp; - return rpc_auth_ok; -} - static int svc_generic_dispatch(struct svc_rqst *rqstp, __be32 *statp) { @@ -1202,7 +1209,7 @@ svc_generic_dispatch(struct svc_rqst *rqstp, __be32 *statp) test_bit(RQ_DROPME, &rqstp->rq_flags)) return 0; - if (test_bit(RQ_AUTHERR, &rqstp->rq_flags)) + if (rqstp->rq_auth_stat != rpc_auth_ok) return 1; if (*statp != rpc_success) @@ -1283,7 +1290,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) struct svc_process_info process; __be32 *statp; u32 prog, vers; - __be32 auth_stat, rpc_stat; + __be32 rpc_stat; int auth_res; __be32 *reply_statp; @@ -1326,14 +1333,12 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) * We do this before anything else in order to get a decent * auth verifier. */ - auth_res = svc_authenticate(rqstp, &auth_stat); + auth_res = svc_authenticate(rqstp); /* Also give the program a chance to reject this call: */ - if (auth_res == SVC_OK && progp) { - auth_stat = rpc_autherr_badcred; + if (auth_res == SVC_OK && progp) auth_res = progp->pg_authenticate(rqstp); - } if (auth_res != SVC_OK) - trace_svc_authenticate(rqstp, auth_res, auth_stat); + trace_svc_authenticate(rqstp, auth_res); switch (auth_res) { case SVC_OK: break; @@ -1392,15 +1397,15 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) goto release_dropit; if (*statp == rpc_garbage_args) goto err_garbage; - auth_stat = svc_get_autherr(rqstp, statp); - if (auth_stat != rpc_auth_ok) - goto err_release_bad_auth; } else { dprintk("svc: calling dispatcher\n"); if (!process.dispatch(rqstp, statp)) goto release_dropit; /* Release reply info */ } + if (rqstp->rq_auth_stat != rpc_auth_ok) + goto err_release_bad_auth; + /* Check RPC status result */ if (*statp != rpc_success) resv->iov_len = ((void*)statp) - resv->iov_base + 4; @@ -1450,13 +1455,14 @@ err_release_bad_auth: if (procp->pc_release) procp->pc_release(rqstp); err_bad_auth: - dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat)); + dprintk("svc: authentication failed (%d)\n", + be32_to_cpu(rqstp->rq_auth_stat)); serv->sv_stats->rpcbadauth++; /* Restore write pointer to location of accept status: */ xdr_ressize_check(rqstp, reply_statp); svc_putnl(resv, 1); /* REJECT */ svc_putnl(resv, 1); /* AUTH_ERROR */ - svc_putnl(resv, ntohl(auth_stat)); /* status */ + svc_putu32(resv, rqstp->rq_auth_stat); /* status */ goto sendit; err_bad_prog: @@ -1503,6 +1509,12 @@ svc_process(struct svc_rqst *rqstp) struct svc_serv *serv = rqstp->rq_server; u32 dir; +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) + if (!fail_sunrpc.ignore_server_disconnect && + should_fail(&fail_sunrpc.attr, 1)) + svc_xprt_deferred_close(rqstp->rq_xprt); +#endif + /* * Setup response xdr_buf. * Initially it has just one page @@ -1630,6 +1642,21 @@ u32 svc_max_payload(const struct svc_rqst *rqstp) EXPORT_SYMBOL_GPL(svc_max_payload); /** + * svc_proc_name - Return RPC procedure name in string form + * @rqstp: svc_rqst to operate on + * + * Return value: + * Pointer to a NUL-terminated string + */ +const char *svc_proc_name(const struct svc_rqst *rqstp) +{ + if (rqstp && rqstp->rq_procinfo) + return rqstp->rq_procinfo->pc_name; + return "unknown"; +} + + +/** * svc_encode_result_payload - mark a range of bytes as a result payload * @rqstp: svc_rqst to operate on * @offset: payload's byte offset in rqstp->rq_res diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index dbb41821b1b8..6316bd2b8f37 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -539,6 +539,7 @@ static void svc_xprt_release(struct svc_rqst *rqstp) kfree(rqstp->rq_deferred); rqstp->rq_deferred = NULL; + pagevec_release(&rqstp->rq_pvec); svc_free_res_pages(rqstp); rqstp->rq_res.page_len = 0; rqstp->rq_res.page_base = 0; @@ -662,7 +663,9 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; struct xdr_buf *arg = &rqstp->rq_arg; - unsigned long pages, filled; + unsigned long pages, filled, ret; + + pagevec_init(&rqstp->rq_pvec); pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT; if (pages > RPCSVC_MAXPAGES) { @@ -672,11 +675,12 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) pages = RPCSVC_MAXPAGES; } - for (;;) { - filled = alloc_pages_bulk_array(GFP_KERNEL, pages, - rqstp->rq_pages); - if (filled == pages) - break; + for (filled = 0; filled < pages; filled = ret) { + ret = alloc_pages_bulk_array(GFP_KERNEL, pages, + rqstp->rq_pages); + if (ret > filled) + /* Made progress, don't sleep yet */ + continue; set_current_state(TASK_INTERRUPTIBLE); if (signalled() || kthread_should_stop()) { diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 998b196b6176..5a8b8e03fdd4 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -59,12 +59,12 @@ svc_put_auth_ops(struct auth_ops *aops) } int -svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) +svc_authenticate(struct svc_rqst *rqstp) { rpc_authflavor_t flavor; struct auth_ops *aops; - *authp = rpc_auth_ok; + rqstp->rq_auth_stat = rpc_auth_ok; flavor = svc_getnl(&rqstp->rq_arg.head[0]); @@ -72,7 +72,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) aops = svc_get_auth_ops(flavor); if (aops == NULL) { - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } @@ -80,7 +80,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp) init_svc_cred(&rqstp->rq_cred); rqstp->rq_authop = aops; - return aops->accept(rqstp, authp); + return aops->accept(rqstp); } EXPORT_SYMBOL_GPL(svc_authenticate); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 35b7966ac3b3..d7ed7d49115a 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -681,8 +681,9 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) rqstp->rq_client = NULL; if (rqstp->rq_proc == 0) - return SVC_OK; + goto out; + rqstp->rq_auth_stat = rpc_autherr_badcred; ipm = ip_map_cached_get(xprt); if (ipm == NULL) ipm = __ip_map_lookup(sn->ip_map_cache, rqstp->rq_server->sv_program->pg_class, @@ -719,13 +720,16 @@ svcauth_unix_set_client(struct svc_rqst *rqstp) put_group_info(cred->cr_group_info); cred->cr_group_info = gi; } + +out: + rqstp->rq_auth_stat = rpc_auth_ok; return SVC_OK; } EXPORT_SYMBOL_GPL(svcauth_unix_set_client); static int -svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) +svcauth_null_accept(struct svc_rqst *rqstp) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; @@ -736,12 +740,12 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp) if (svc_getu32(argv) != 0) { dprintk("svc: bad null cred\n"); - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { dprintk("svc: bad null verf\n"); - *authp = rpc_autherr_badverf; + rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } @@ -785,7 +789,7 @@ struct auth_ops svcauth_null = { static int -svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) +svcauth_unix_accept(struct svc_rqst *rqstp) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; @@ -827,7 +831,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) } groups_sort(cred->cr_group_info); if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { - *authp = rpc_autherr_badverf; + rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } @@ -839,7 +843,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) return SVC_OK; badcred: - *authp = rpc_autherr_badcred; + rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 64da3bfd28e6..9a6f17e18f73 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -100,6 +100,28 @@ static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj, return ret + 1; } +static ssize_t rpc_sysfs_xprt_srcaddr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + struct sockaddr_storage saddr; + struct sock_xprt *sock; + ssize_t ret = -1; + + if (!xprt) + return 0; + + sock = container_of(xprt, struct sock_xprt, xprt); + if (kernel_getsockname(sock->sock, (struct sockaddr *)&saddr) < 0) + goto out; + + ret = sprintf(buf, "%pISc\n", &saddr); +out: + xprt_put(xprt); + return ret + 1; +} + static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -114,14 +136,16 @@ static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, "max_num_slots=%u\nmin_num_slots=%u\nnum_reqs=%u\n" "binding_q_len=%u\nsending_q_len=%u\npending_q_len=%u\n" "backlog_q_len=%u\nmain_xprt=%d\nsrc_port=%u\n" - "tasks_queuelen=%ld\n", + "tasks_queuelen=%ld\ndst_port=%s\n", xprt->last_used, xprt->cong, xprt->cwnd, xprt->max_reqs, xprt->min_reqs, xprt->num_reqs, xprt->binding.qlen, xprt->sending.qlen, xprt->pending.qlen, xprt->backlog.qlen, xprt->main, (xprt->xprt_class->ident == XPRT_TRANSPORT_TCP) ? get_srcport(xprt) : 0, - atomic_long_read(&xprt->queuelen)); + atomic_long_read(&xprt->queuelen), + (xprt->xprt_class->ident == XPRT_TRANSPORT_TCP) ? + xprt->address_strings[RPC_DISPLAY_PORT] : "0"); xprt_put(xprt); return ret + 1; } @@ -183,8 +207,10 @@ static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj, if (!xprt_switch) return 0; - ret = sprintf(buf, "num_xprts=%u\nnum_active=%u\nqueue_len=%ld\n", + ret = sprintf(buf, "num_xprts=%u\nnum_active=%u\n" + "num_unique_destaddr=%u\nqueue_len=%ld\n", xprt_switch->xps_nxprts, xprt_switch->xps_nactive, + xprt_switch->xps_nunique_destaddr_xprts, atomic_long_read(&xprt_switch->xps_queuelen)); xprt_switch_put(xprt_switch); return ret + 1; @@ -376,6 +402,9 @@ static const void *rpc_sysfs_xprt_namespace(struct kobject *kobj) static struct kobj_attribute rpc_sysfs_xprt_dstaddr = __ATTR(dstaddr, 0644, rpc_sysfs_xprt_dstaddr_show, rpc_sysfs_xprt_dstaddr_store); +static struct kobj_attribute rpc_sysfs_xprt_srcaddr = __ATTR(srcaddr, + 0644, rpc_sysfs_xprt_srcaddr_show, NULL); + static struct kobj_attribute rpc_sysfs_xprt_info = __ATTR(xprt_info, 0444, rpc_sysfs_xprt_info_show, NULL); @@ -384,6 +413,7 @@ static struct kobj_attribute rpc_sysfs_xprt_change_state = __ATTR(xprt_state, static struct attribute *rpc_sysfs_xprt_attrs[] = { &rpc_sysfs_xprt_dstaddr.attr, + &rpc_sysfs_xprt_srcaddr.attr, &rpc_sysfs_xprt_info.attr, &rpc_sysfs_xprt_change_state.attr, NULL, diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index fb6db09725c7..cfd681700d1a 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -56,6 +56,7 @@ #include "sunrpc.h" #include "sysfs.h" +#include "fail.h" /* * Local variables @@ -761,6 +762,20 @@ void xprt_disconnect_done(struct rpc_xprt *xprt) EXPORT_SYMBOL_GPL(xprt_disconnect_done); /** + * xprt_schedule_autoclose_locked - Try to schedule an autoclose RPC call + * @xprt: transport to disconnect + */ +static void xprt_schedule_autoclose_locked(struct rpc_xprt *xprt) +{ + set_bit(XPRT_CLOSE_WAIT, &xprt->state); + if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) + queue_work(xprtiod_workqueue, &xprt->task_cleanup); + else if (xprt->snd_task && !test_bit(XPRT_SND_IS_COOKIE, &xprt->state)) + rpc_wake_up_queued_task_set_status(&xprt->pending, + xprt->snd_task, -ENOTCONN); +} + +/** * xprt_force_disconnect - force a transport to disconnect * @xprt: transport to disconnect * @@ -771,13 +786,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) /* Don't race with the test_bit() in xprt_clear_locked() */ spin_lock(&xprt->transport_lock); - set_bit(XPRT_CLOSE_WAIT, &xprt->state); - /* Try to schedule an autoclose RPC call */ - if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) - queue_work(xprtiod_workqueue, &xprt->task_cleanup); - else if (xprt->snd_task) - rpc_wake_up_queued_task_set_status(&xprt->pending, - xprt->snd_task, -ENOTCONN); + xprt_schedule_autoclose_locked(xprt); spin_unlock(&xprt->transport_lock); } EXPORT_SYMBOL_GPL(xprt_force_disconnect); @@ -817,11 +826,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie) goto out; if (test_bit(XPRT_CLOSING, &xprt->state)) goto out; - set_bit(XPRT_CLOSE_WAIT, &xprt->state); - /* Try to schedule an autoclose RPC call */ - if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) - queue_work(xprtiod_workqueue, &xprt->task_cleanup); - xprt_wake_pending_tasks(xprt, -EAGAIN); + xprt_schedule_autoclose_locked(xprt); out: spin_unlock(&xprt->transport_lock); } @@ -855,6 +860,19 @@ xprt_init_autodisconnect(struct timer_list *t) queue_work(xprtiod_workqueue, &xprt->task_cleanup); } +#if IS_ENABLED(CONFIG_FAIL_SUNRPC) +static void xprt_inject_disconnect(struct rpc_xprt *xprt) +{ + if (!fail_sunrpc.ignore_client_disconnect && + should_fail(&fail_sunrpc.attr, 1)) + xprt->ops->inject_disconnect(xprt); +} +#else +static inline void xprt_inject_disconnect(struct rpc_xprt *xprt) +{ +} +#endif + bool xprt_lock_connect(struct rpc_xprt *xprt, struct rpc_task *task, void *cookie) @@ -866,12 +884,14 @@ bool xprt_lock_connect(struct rpc_xprt *xprt, goto out; if (xprt->snd_task != task) goto out; + set_bit(XPRT_SND_IS_COOKIE, &xprt->state); xprt->snd_task = cookie; ret = true; out: spin_unlock(&xprt->transport_lock); return ret; } +EXPORT_SYMBOL_GPL(xprt_lock_connect); void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie) { @@ -881,12 +901,14 @@ void xprt_unlock_connect(struct rpc_xprt *xprt, void *cookie) if (!test_bit(XPRT_LOCKED, &xprt->state)) goto out; xprt->snd_task =NULL; + clear_bit(XPRT_SND_IS_COOKIE, &xprt->state); xprt->ops->release_xprt(xprt, NULL); xprt_schedule_autodisconnect(xprt); out: spin_unlock(&xprt->transport_lock); wake_up_bit(&xprt->state, XPRT_LOCKED); } +EXPORT_SYMBOL_GPL(xprt_unlock_connect); /** * xprt_connect - schedule a transport connect operation diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index c60820e45082..1693f81aae37 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -139,6 +139,7 @@ struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt, xps->xps_iter_ops = &rpc_xprt_iter_singular; rpc_sysfs_xprt_switch_setup(xps, xprt, gfp_flags); xprt_switch_add_xprt_locked(xps, xprt); + xps->xps_nunique_destaddr_xprts = 1; rpc_sysfs_xprt_setup(xps, xprt, gfp_flags); } diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 1151efd09b27..17f174d6ea3b 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -115,7 +115,7 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) if (rc < 0) goto failed_marshal; - if (rpcrdma_post_sends(r_xprt, req)) + if (frwr_send(r_xprt, req)) goto drop_connection; return 0; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 229fcc9a9064..f700b34a5bfd 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -394,6 +394,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) struct rpcrdma_ep *ep = r_xprt->rx_ep; struct rpcrdma_mr *mr; unsigned int num_wrs; + int ret; num_wrs = 1; post_wr = send_wr; @@ -420,7 +421,10 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) } trace_xprtrdma_post_send(req); - return ib_post_send(ep->re_id->qp, post_wr, NULL); + ret = ib_post_send(ep->re_id->qp, post_wr, NULL); + if (ret) + trace_xprtrdma_post_send_err(r_xprt, req, ret); + return ret; } /** @@ -557,6 +561,10 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* On error, the MRs get destroyed once the QP has drained. */ trace_xprtrdma_post_linv_err(req, rc); + + /* Force a connection loss to ensure complete recovery. + */ + rpcrdma_force_disconnect(ep); } /** @@ -653,4 +661,8 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * retransmission. */ rpcrdma_unpin_rqst(req->rl_reply); + + /* Force a connection loss to ensure complete recovery. + */ + rpcrdma_force_disconnect(ep); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 1e651447dc4e..e27433f08ca7 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -35,6 +35,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc); * controlling svcxprt_rdma is destroyed. */ struct svc_rdma_rw_ctxt { + struct llist_node rw_node; struct list_head rw_list; struct rdma_rw_ctx rw_ctx; unsigned int rw_nents; @@ -53,19 +54,19 @@ static struct svc_rdma_rw_ctxt * svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) { struct svc_rdma_rw_ctxt *ctxt; + struct llist_node *node; spin_lock(&rdma->sc_rw_ctxt_lock); - - ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts); - if (ctxt) { - list_del(&ctxt->rw_list); - spin_unlock(&rdma->sc_rw_ctxt_lock); + node = llist_del_first(&rdma->sc_rw_ctxts); + spin_unlock(&rdma->sc_rw_ctxt_lock); + if (node) { + ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); } else { - spin_unlock(&rdma->sc_rw_ctxt_lock); ctxt = kmalloc(struct_size(ctxt, rw_first_sgl, SG_CHUNK_SIZE), GFP_KERNEL); if (!ctxt) goto out_noctx; + INIT_LIST_HEAD(&ctxt->rw_list); } @@ -83,14 +84,18 @@ out_noctx: return NULL; } -static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, - struct svc_rdma_rw_ctxt *ctxt) +static void __svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt, + struct llist_head *list) { sg_free_table_chained(&ctxt->rw_sg_table, SG_CHUNK_SIZE); + llist_add(&ctxt->rw_node, list); +} - spin_lock(&rdma->sc_rw_ctxt_lock); - list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts); - spin_unlock(&rdma->sc_rw_ctxt_lock); +static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt) +{ + __svc_rdma_put_rw_ctxt(rdma, ctxt, &rdma->sc_rw_ctxts); } /** @@ -101,9 +106,10 @@ static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) { struct svc_rdma_rw_ctxt *ctxt; + struct llist_node *node; - while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) { - list_del(&ctxt->rw_list); + while ((node = llist_del_first(&rdma->sc_rw_ctxts)) != NULL) { + ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); kfree(ctxt); } } @@ -171,20 +177,35 @@ static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, cc->cc_sqecount = 0; } +/* + * The consumed rw_ctx's are cleaned and placed on a local llist so + * that only one atomic llist operation is needed to put them all + * back on the free list. + */ static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc, enum dma_data_direction dir) { struct svcxprt_rdma *rdma = cc->cc_rdma; + struct llist_node *first, *last; struct svc_rdma_rw_ctxt *ctxt; + LLIST_HEAD(free); + first = last = NULL; while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { list_del(&ctxt->rw_list); rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, ctxt->rw_sg_table.sgl, ctxt->rw_nents, dir); - svc_rdma_put_rw_ctxt(rdma, ctxt); + __svc_rdma_put_rw_ctxt(rdma, ctxt, &free); + + ctxt->rw_node.next = first; + first = &ctxt->rw_node; + if (!last) + last = first; } + if (first) + llist_add_batch(first, last, &rdma->sc_rw_ctxts); } /* State for sending a Write or Reply chunk. @@ -248,8 +269,7 @@ static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_write(wc, &cc->cc_cid); - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); if (unlikely(wc->status != IB_WC_SUCCESS)) svc_xprt_deferred_close(&rdma->sc_xprt); @@ -304,9 +324,7 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_read(wc, &cc->cc_cid); - atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); - + svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); cc->cc_status = wc->status; complete(&cc->cc_done); return; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index d6bbafb773e1..599021b2391d 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -113,13 +113,6 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc); -static inline struct svc_rdma_send_ctxt * -svc_rdma_next_send_ctxt(struct list_head *list) -{ - return list_first_entry_or_null(list, struct svc_rdma_send_ctxt, - sc_list); -} - static void svc_rdma_send_cid_init(struct svcxprt_rdma *rdma, struct rpc_rdma_cid *cid) { @@ -182,9 +175,10 @@ fail0: void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) { struct svc_rdma_send_ctxt *ctxt; + struct llist_node *node; - while ((ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts))) { - list_del(&ctxt->sc_list); + while ((node = llist_del_first(&rdma->sc_send_ctxts)) != NULL) { + ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); ib_dma_unmap_single(rdma->sc_pd->device, ctxt->sc_sges[0].addr, rdma->sc_max_req_size, @@ -204,12 +198,13 @@ void svc_rdma_send_ctxts_destroy(struct svcxprt_rdma *rdma) struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma) { struct svc_rdma_send_ctxt *ctxt; + struct llist_node *node; spin_lock(&rdma->sc_send_lock); - ctxt = svc_rdma_next_send_ctxt(&rdma->sc_send_ctxts); - if (!ctxt) + node = llist_del_first(&rdma->sc_send_ctxts); + if (!node) goto out_empty; - list_del(&ctxt->sc_list); + ctxt = llist_entry(node, struct svc_rdma_send_ctxt, sc_node); spin_unlock(&rdma->sc_send_lock); out: @@ -253,9 +248,21 @@ void svc_rdma_send_ctxt_put(struct svcxprt_rdma *rdma, ctxt->sc_sges[i].length); } - spin_lock(&rdma->sc_send_lock); - list_add(&ctxt->sc_list, &rdma->sc_send_ctxts); - spin_unlock(&rdma->sc_send_lock); + llist_add(&ctxt->sc_node, &rdma->sc_send_ctxts); +} + +/** + * svc_rdma_wake_send_waiters - manage Send Queue accounting + * @rdma: controlling transport + * @avail: Number of additional SQEs that are now available + * + */ +void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail) +{ + atomic_add(avail, &rdma->sc_sq_avail); + smp_mb__after_atomic(); + if (unlikely(waitqueue_active(&rdma->sc_send_wait))) + wake_up(&rdma->sc_send_wait); } /** @@ -275,11 +282,9 @@ static void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) trace_svcrdma_wc_send(wc, &ctxt->sc_cid); + svc_rdma_wake_send_waiters(rdma, 1); complete(&ctxt->sc_done); - atomic_inc(&rdma->sc_sq_avail); - wake_up(&rdma->sc_send_wait); - if (unlikely(wc->status != IB_WC_SUCCESS)) svc_xprt_deferred_close(&rdma->sc_xprt); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index d94b7759ada1..94b20fb47135 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -136,9 +136,9 @@ static struct svcxprt_rdma *svc_rdma_create_xprt(struct svc_serv *serv, svc_xprt_init(net, &svc_rdma_class, &cma_xprt->sc_xprt, serv); INIT_LIST_HEAD(&cma_xprt->sc_accept_q); INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); - INIT_LIST_HEAD(&cma_xprt->sc_send_ctxts); + init_llist_head(&cma_xprt->sc_send_ctxts); init_llist_head(&cma_xprt->sc_recv_ctxts); - INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); + init_llist_head(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); @@ -545,7 +545,6 @@ static void __svc_rdma_free(struct work_struct *work) { struct svcxprt_rdma *rdma = container_of(work, struct svcxprt_rdma, sc_work); - struct svc_xprt *xprt = &rdma->sc_xprt; /* This blocks until the Completion Queues are empty */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) @@ -553,12 +552,6 @@ static void __svc_rdma_free(struct work_struct *work) svc_rdma_flush_recv_queues(rdma); - /* Final put of backchannel client transport */ - if (xprt->xpt_bc_xprt) { - xprt_put(xprt->xpt_bc_xprt); - xprt->xpt_bc_xprt = NULL; - } - svc_rdma_destroy_rw_ctxts(rdma); svc_rdma_send_ctxts_destroy(rdma); svc_rdma_recv_ctxts_destroy(rdma); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 9c2ffc67c0fd..16e5696314a4 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -250,12 +250,9 @@ xprt_rdma_connect_worker(struct work_struct *work) xprt->stat.connect_start; xprt_set_connected(xprt); rc = -EAGAIN; - } else { - /* Force a call to xprt_rdma_close to clean up */ - spin_lock(&xprt->transport_lock); - set_bit(XPRT_CLOSE_WAIT, &xprt->state); - spin_unlock(&xprt->transport_lock); - } + } else + rpcrdma_xprt_disconnect(r_xprt); + xprt_unlock_connect(xprt, r_xprt); xprt_wake_pending_tasks(xprt, rc); } @@ -489,6 +486,8 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned long delay; + WARN_ON_ONCE(!xprt_lock_connect(xprt, task, r_xprt)); + delay = 0; if (ep && ep->re_connect_status != 0) { delay = xprt_reconnect_delay(xprt); @@ -661,7 +660,7 @@ xprt_rdma_send_request(struct rpc_rqst *rqst) goto drop_connection; rqst->rq_xtime = ktime_get(); - if (rpcrdma_post_sends(r_xprt, req)) + if (frwr_send(r_xprt, req)) goto drop_connection; rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 649c23518ec0..aaec3c9be8db 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -124,7 +124,7 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) * connection is closed or lost. (The important thing is it needs * to be invoked "at least" once). */ -static void rpcrdma_force_disconnect(struct rpcrdma_ep *ep) +void rpcrdma_force_disconnect(struct rpcrdma_ep *ep) { if (atomic_add_unless(&ep->re_force_disconnect, 1, 1)) xprt_force_disconnect(ep->re_xprt); @@ -1350,21 +1350,6 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb) } /** - * rpcrdma_post_sends - Post WRs to a transport's Send Queue - * @r_xprt: controlling transport instance - * @req: rpcrdma_req containing the Send WR to post - * - * Returns 0 if the post was successful, otherwise -ENOTCONN - * is returned. - */ -int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) -{ - if (frwr_send(r_xprt, req)) - return -ENOTCONN; - return 0; -} - -/** * rpcrdma_post_recvs - Refill the Receive Queue * @r_xprt: controlling transport instance * @needed: current credit grant @@ -1416,12 +1401,8 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp) rc = ib_post_recv(ep->re_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); - if (atomic_dec_return(&ep->re_receiving) > 0) - complete(&ep->re_done); - -out: - trace_xprtrdma_post_recvs(r_xprt, count, rc); if (rc) { + trace_xprtrdma_post_recvs_err(r_xprt, rc); for (wr = bad_wr; wr;) { struct rpcrdma_rep *rep; @@ -1431,6 +1412,11 @@ out: --count; } } + if (atomic_dec_return(&ep->re_receiving) > 0) + complete(&ep->re_done); + +out: + trace_xprtrdma_post_recvs(r_xprt, count); ep->re_receive_count += count; return; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 5d231d94e944..d91f54eae00b 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -454,11 +454,11 @@ extern unsigned int xprt_rdma_memreg_strategy; /* * Endpoint calls - xprtrdma/verbs.c */ +void rpcrdma_force_disconnect(struct rpcrdma_ep *ep); void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc); int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt); void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt); -int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp); /* diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e573dcecdd66..04f1b78bcbca 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1656,7 +1656,7 @@ static int xs_get_srcport(struct sock_xprt *transport) unsigned short get_srcport(struct rpc_xprt *xprt) { struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt); - return sock->srcport; + return xs_sock_getport(sock->sock); } EXPORT_SYMBOL(get_srcport); @@ -2099,13 +2099,20 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt) if (sock == NULL) return; + if (!xprt->reuseport) { + xs_close(xprt); + return; + } switch (skst) { - default: + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + break; + case TCP_ESTABLISHED: + case TCP_CLOSE_WAIT: kernel_sock_shutdown(sock, SHUT_RDWR); trace_rpc_socket_shutdown(xprt, sock); break; - case TCP_CLOSE: - case TCP_TIME_WAIT: + default: xs_reset_transport(transport); } } @@ -3149,24 +3156,6 @@ void cleanup_socket_xprt(void) xprt_unregister_transport(&xs_bc_tcp_transport); } -static int param_set_uint_minmax(const char *val, - const struct kernel_param *kp, - unsigned int min, unsigned int max) -{ - unsigned int num; - int ret; - - if (!val) - return -EINVAL; - ret = kstrtouint(val, 0, &num); - if (ret) - return ret; - if (num < min || num > max) - return -EINVAL; - *((unsigned int *)kp->arg) = num; - return 0; -} - static int param_set_portnr(const char *val, const struct kernel_param *kp) { return param_set_uint_minmax(val, kp, diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 070698dd19bc..83460470e883 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -378,6 +378,160 @@ int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev, } EXPORT_SYMBOL_GPL(call_switchdev_blocking_notifiers); +struct switchdev_nested_priv { + bool (*check_cb)(const struct net_device *dev); + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev); + const struct net_device *dev; + struct net_device *lower_dev; +}; + +static int switchdev_lower_dev_walk(struct net_device *lower_dev, + struct netdev_nested_priv *priv) +{ + struct switchdev_nested_priv *switchdev_priv = priv->data; + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev); + bool (*check_cb)(const struct net_device *dev); + const struct net_device *dev; + + check_cb = switchdev_priv->check_cb; + foreign_dev_check_cb = switchdev_priv->foreign_dev_check_cb; + dev = switchdev_priv->dev; + + if (check_cb(lower_dev) && !foreign_dev_check_cb(lower_dev, dev)) { + switchdev_priv->lower_dev = lower_dev; + return 1; + } + + return 0; +} + +static struct net_device * +switchdev_lower_dev_find(struct net_device *dev, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev)) +{ + struct switchdev_nested_priv switchdev_priv = { + .check_cb = check_cb, + .foreign_dev_check_cb = foreign_dev_check_cb, + .dev = dev, + .lower_dev = NULL, + }; + struct netdev_nested_priv priv = { + .data = &switchdev_priv, + }; + + netdev_walk_all_lower_dev_rcu(dev, switchdev_lower_dev_walk, &priv); + + return switchdev_priv.lower_dev; +} + +static int __switchdev_handle_fdb_event_to_device(struct net_device *dev, + struct net_device *orig_dev, unsigned long event, + const struct switchdev_notifier_fdb_info *fdb_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev, + unsigned long event, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info), + int (*lag_mod_cb)(struct net_device *dev, struct net_device *orig_dev, + unsigned long event, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info)) +{ + const struct switchdev_notifier_info *info = &fdb_info->info; + struct net_device *br, *lower_dev; + struct list_head *iter; + int err = -EOPNOTSUPP; + + if (check_cb(dev)) + return mod_cb(dev, orig_dev, event, info->ctx, fdb_info); + + if (netif_is_lag_master(dev)) { + if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb)) + goto maybe_bridged_with_us; + + /* This is a LAG interface that we offload */ + if (!lag_mod_cb) + return -EOPNOTSUPP; + + return lag_mod_cb(dev, orig_dev, event, info->ctx, fdb_info); + } + + /* Recurse through lower interfaces in case the FDB entry is pointing + * towards a bridge device. + */ + if (netif_is_bridge_master(dev)) { + if (!switchdev_lower_dev_find(dev, check_cb, foreign_dev_check_cb)) + return 0; + + /* This is a bridge interface that we offload */ + netdev_for_each_lower_dev(dev, lower_dev, iter) { + /* Do not propagate FDB entries across bridges */ + if (netif_is_bridge_master(lower_dev)) + continue; + + /* Bridge ports might be either us, or LAG interfaces + * that we offload. + */ + if (!check_cb(lower_dev) && + !switchdev_lower_dev_find(lower_dev, check_cb, + foreign_dev_check_cb)) + continue; + + err = __switchdev_handle_fdb_event_to_device(lower_dev, orig_dev, + event, fdb_info, check_cb, + foreign_dev_check_cb, + mod_cb, lag_mod_cb); + if (err && err != -EOPNOTSUPP) + return err; + } + + return 0; + } + +maybe_bridged_with_us: + /* Event is neither on a bridge nor a LAG. Check whether it is on an + * interface that is in a bridge with us. + */ + br = netdev_master_upper_dev_get_rcu(dev); + if (!br || !netif_is_bridge_master(br)) + return 0; + + if (!switchdev_lower_dev_find(br, check_cb, foreign_dev_check_cb)) + return 0; + + return __switchdev_handle_fdb_event_to_device(br, orig_dev, event, fdb_info, + check_cb, foreign_dev_check_cb, + mod_cb, lag_mod_cb); +} + +int switchdev_handle_fdb_event_to_device(struct net_device *dev, unsigned long event, + const struct switchdev_notifier_fdb_info *fdb_info, + bool (*check_cb)(const struct net_device *dev), + bool (*foreign_dev_check_cb)(const struct net_device *dev, + const struct net_device *foreign_dev), + int (*mod_cb)(struct net_device *dev, struct net_device *orig_dev, + unsigned long event, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info), + int (*lag_mod_cb)(struct net_device *dev, struct net_device *orig_dev, + unsigned long event, const void *ctx, + const struct switchdev_notifier_fdb_info *fdb_info)) +{ + int err; + + err = __switchdev_handle_fdb_event_to_device(dev, dev, event, fdb_info, + check_cb, foreign_dev_check_cb, + mod_cb, lag_mod_cb); + if (err == -EOPNOTSUPP) + err = 0; + + return err; +} +EXPORT_SYMBOL_GPL(switchdev_handle_fdb_event_to_device); + static int __switchdev_handle_port_obj_add(struct net_device *dev, struct switchdev_notifier_port_obj_info *port_obj_info, bool (*check_cb)(const struct net_device *dev), @@ -549,3 +703,51 @@ int switchdev_handle_port_attr_set(struct net_device *dev, return err; } EXPORT_SYMBOL_GPL(switchdev_handle_port_attr_set); + +int switchdev_bridge_port_offload(struct net_device *brport_dev, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + bool tx_fwd_offload, + struct netlink_ext_ack *extack) +{ + struct switchdev_notifier_brport_info brport_info = { + .brport = { + .dev = dev, + .ctx = ctx, + .atomic_nb = atomic_nb, + .blocking_nb = blocking_nb, + .tx_fwd_offload = tx_fwd_offload, + }, + }; + int err; + + ASSERT_RTNL(); + + err = call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_OFFLOADED, + brport_dev, &brport_info.info, + extack); + return notifier_to_errno(err); +} +EXPORT_SYMBOL_GPL(switchdev_bridge_port_offload); + +void switchdev_bridge_port_unoffload(struct net_device *brport_dev, + const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb) +{ + struct switchdev_notifier_brport_info brport_info = { + .brport = { + .ctx = ctx, + .atomic_nb = atomic_nb, + .blocking_nb = blocking_nb, + }, + }; + + ASSERT_RTNL(); + + call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_UNOFFLOADED, + brport_dev, &brport_info.info, + NULL); +} +EXPORT_SYMBOL_GPL(switchdev_bridge_port_unoffload); diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 443f8e5b9477..60bc74b76adc 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -462,7 +462,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b, b->bcast_addr.media_id = b->media->type_id; b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT; b->mtu = dev->mtu; - b->media->raw2addr(b, &b->addr, (char *)dev->dev_addr); + b->media->raw2addr(b, &b->addr, (const char *)dev->dev_addr); rcu_assign_pointer(dev->tipc_ptr, b); return 0; } @@ -703,7 +703,7 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt, break; case NETDEV_CHANGEADDR: b->media->raw2addr(b, &b->addr, - (char *)dev->dev_addr); + (const char *)dev->dev_addr); tipc_reset_bearer(net, b); break; case NETDEV_UNREGISTER: diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index 57c6a1a719e2..490ad6e5f7a3 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -117,7 +117,7 @@ struct tipc_media { char *msg); int (*raw2addr)(struct tipc_bearer *b, struct tipc_media_addr *addr, - char *raw); + const char *raw); u32 priority; u32 tolerance; u32 min_win; diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index c9391d38de85..dc60c32bb70d 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -2285,43 +2285,53 @@ static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr) u16 key_gen = msg_key_gen(hdr); u16 size = msg_data_sz(hdr); u8 *data = msg_data(hdr); + unsigned int keylen; + + /* Verify whether the size can exist in the packet */ + if (unlikely(size < sizeof(struct tipc_aead_key) + TIPC_AEAD_KEYLEN_MIN)) { + pr_debug("%s: message data size is too small\n", rx->name); + goto exit; + } + + keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME))); + + /* Verify the supplied size values */ + if (unlikely(size != keylen + sizeof(struct tipc_aead_key) || + keylen > TIPC_AEAD_KEY_SIZE_MAX)) { + pr_debug("%s: invalid MSG_CRYPTO key size\n", rx->name); + goto exit; + } spin_lock(&rx->lock); if (unlikely(rx->skey || (key_gen == rx->key_gen && rx->key.keys))) { pr_err("%s: key existed <%p>, gen %d vs %d\n", rx->name, rx->skey, key_gen, rx->key_gen); - goto exit; + goto exit_unlock; } /* Allocate memory for the key */ skey = kmalloc(size, GFP_ATOMIC); if (unlikely(!skey)) { pr_err("%s: unable to allocate memory for skey\n", rx->name); - goto exit; + goto exit_unlock; } /* Copy key from msg data */ - skey->keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME))); + skey->keylen = keylen; memcpy(skey->alg_name, data, TIPC_AEAD_ALG_NAME); memcpy(skey->key, data + TIPC_AEAD_ALG_NAME + sizeof(__be32), skey->keylen); - /* Sanity check */ - if (unlikely(size != tipc_aead_key_size(skey))) { - kfree(skey); - skey = NULL; - goto exit; - } - rx->key_gen = key_gen; rx->skey_mode = msg_key_mode(hdr); rx->skey = skey; rx->nokey = 0; mb(); /* for nokey flag */ -exit: +exit_unlock: spin_unlock(&rx->lock); +exit: /* Schedule the key attaching on this crypto */ if (likely(skey && queue_delayed_work(tx->wq, &rx->work, 0))) return true; diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index c68019697cfe..cb0d185e06af 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -60,7 +60,7 @@ static int tipc_eth_addr2msg(char *msg, struct tipc_media_addr *addr) /* Convert raw mac address format to media addr format */ static int tipc_eth_raw2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, - char *msg) + const char *msg) { memset(addr, 0, sizeof(*addr)); ether_addr_copy(addr->value, msg); diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c index 7aa9ff88458d..b9ad0434c3cd 100644 --- a/net/tipc/ib_media.c +++ b/net/tipc/ib_media.c @@ -67,7 +67,7 @@ static int tipc_ib_addr2msg(char *msg, struct tipc_media_addr *addr) /* Convert raw InfiniBand address format to media addr format */ static int tipc_ib_raw2addr(struct tipc_bearer *b, struct tipc_media_addr *addr, - char *msg) + const char *msg) { memset(addr, 0, sizeof(*addr)); memcpy(addr->value, msg, INFINIBAND_ALEN); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 8754bd885169..ad570c2450be 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1426,7 +1426,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (ua) { if (!tipc_uaddr_valid(ua, m->msg_namelen)) return -EINVAL; - atype = ua->addrtype; + atype = ua->addrtype; } /* If socket belongs to a communication group follow other paths */ @@ -1886,6 +1886,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, bool connected = !tipc_sk_type_connectionless(sk); struct tipc_sock *tsk = tipc_sk(sk); int rc, err, hlen, dlen, copy; + struct tipc_skb_cb *skb_cb; struct sk_buff_head xmitq; struct tipc_msg *hdr; struct sk_buff *skb; @@ -1909,6 +1910,7 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, if (unlikely(rc)) goto exit; skb = skb_peek(&sk->sk_receive_queue); + skb_cb = TIPC_SKB_CB(skb); hdr = buf_msg(skb); dlen = msg_data_sz(hdr); hlen = msg_hdr_sz(hdr); @@ -1928,18 +1930,33 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, /* Capture data if non-error msg, otherwise just set return value */ if (likely(!err)) { - copy = min_t(int, dlen, buflen); - if (unlikely(copy != dlen)) - m->msg_flags |= MSG_TRUNC; - rc = skb_copy_datagram_msg(skb, hlen, m, copy); + int offset = skb_cb->bytes_read; + + copy = min_t(int, dlen - offset, buflen); + rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy); + if (unlikely(rc)) + goto exit; + if (unlikely(offset + copy < dlen)) { + if (flags & MSG_EOR) { + if (!(flags & MSG_PEEK)) + skb_cb->bytes_read = offset + copy; + } else { + m->msg_flags |= MSG_TRUNC; + skb_cb->bytes_read = 0; + } + } else { + if (flags & MSG_EOR) + m->msg_flags |= MSG_EOR; + skb_cb->bytes_read = 0; + } } else { copy = 0; rc = 0; - if (err != TIPC_CONN_SHUTDOWN && connected && !m->msg_control) + if (err != TIPC_CONN_SHUTDOWN && connected && !m->msg_control) { rc = -ECONNRESET; + goto exit; + } } - if (unlikely(rc)) - goto exit; /* Mark message as group event if applicable */ if (unlikely(grp_evt)) { @@ -1962,6 +1979,9 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, tipc_node_distr_xmit(sock_net(sk), &xmitq); } + if (skb_cb->bytes_read) + goto exit; + tsk_advance_rx_queue(sk); if (likely(!connected)) @@ -2403,7 +2423,7 @@ static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, u32 dport, struct sk_buff_head *xmitq) { - unsigned long time_limit = jiffies + 2; + unsigned long time_limit = jiffies + usecs_to_jiffies(20000); struct sk_buff *skb; unsigned int lim; atomic_t *dcnt; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index fde56ff49163..acfba9f1ba72 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -421,6 +421,88 @@ static int do_tls_getsockopt_conf(struct sock *sk, char __user *optval, rc = -EFAULT; break; } + case TLS_CIPHER_AES_CCM_128: { + struct tls12_crypto_info_aes_ccm_128 *aes_ccm_128 = + container_of(crypto_info, + struct tls12_crypto_info_aes_ccm_128, info); + + if (len != sizeof(*aes_ccm_128)) { + rc = -EINVAL; + goto out; + } + lock_sock(sk); + memcpy(aes_ccm_128->iv, + cctx->iv + TLS_CIPHER_AES_CCM_128_SALT_SIZE, + TLS_CIPHER_AES_CCM_128_IV_SIZE); + memcpy(aes_ccm_128->rec_seq, cctx->rec_seq, + TLS_CIPHER_AES_CCM_128_REC_SEQ_SIZE); + release_sock(sk); + if (copy_to_user(optval, aes_ccm_128, sizeof(*aes_ccm_128))) + rc = -EFAULT; + break; + } + case TLS_CIPHER_CHACHA20_POLY1305: { + struct tls12_crypto_info_chacha20_poly1305 *chacha20_poly1305 = + container_of(crypto_info, + struct tls12_crypto_info_chacha20_poly1305, + info); + + if (len != sizeof(*chacha20_poly1305)) { + rc = -EINVAL; + goto out; + } + lock_sock(sk); + memcpy(chacha20_poly1305->iv, + cctx->iv + TLS_CIPHER_CHACHA20_POLY1305_SALT_SIZE, + TLS_CIPHER_CHACHA20_POLY1305_IV_SIZE); + memcpy(chacha20_poly1305->rec_seq, cctx->rec_seq, + TLS_CIPHER_CHACHA20_POLY1305_REC_SEQ_SIZE); + release_sock(sk); + if (copy_to_user(optval, chacha20_poly1305, + sizeof(*chacha20_poly1305))) + rc = -EFAULT; + break; + } + case TLS_CIPHER_SM4_GCM: { + struct tls12_crypto_info_sm4_gcm *sm4_gcm_info = + container_of(crypto_info, + struct tls12_crypto_info_sm4_gcm, info); + + if (len != sizeof(*sm4_gcm_info)) { + rc = -EINVAL; + goto out; + } + lock_sock(sk); + memcpy(sm4_gcm_info->iv, + cctx->iv + TLS_CIPHER_SM4_GCM_SALT_SIZE, + TLS_CIPHER_SM4_GCM_IV_SIZE); + memcpy(sm4_gcm_info->rec_seq, cctx->rec_seq, + TLS_CIPHER_SM4_GCM_REC_SEQ_SIZE); + release_sock(sk); + if (copy_to_user(optval, sm4_gcm_info, sizeof(*sm4_gcm_info))) + rc = -EFAULT; + break; + } + case TLS_CIPHER_SM4_CCM: { + struct tls12_crypto_info_sm4_ccm *sm4_ccm_info = + container_of(crypto_info, + struct tls12_crypto_info_sm4_ccm, info); + + if (len != sizeof(*sm4_ccm_info)) { + rc = -EINVAL; + goto out; + } + lock_sock(sk); + memcpy(sm4_ccm_info->iv, + cctx->iv + TLS_CIPHER_SM4_CCM_SALT_SIZE, + TLS_CIPHER_SM4_CCM_IV_SIZE); + memcpy(sm4_ccm_info->rec_seq, cctx->rec_seq, + TLS_CIPHER_SM4_CCM_REC_SEQ_SIZE); + release_sock(sk); + if (copy_to_user(optval, sm4_ccm_info, sizeof(*sm4_ccm_info))) + rc = -EFAULT; + break; + } default: rc = -EINVAL; } @@ -524,6 +606,12 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, case TLS_CIPHER_CHACHA20_POLY1305: optsize = sizeof(struct tls12_crypto_info_chacha20_poly1305); break; + case TLS_CIPHER_SM4_GCM: + optsize = sizeof(struct tls12_crypto_info_sm4_gcm); + break; + case TLS_CIPHER_SM4_CCM: + optsize = sizeof(struct tls12_crypto_info_sm4_ccm); + break; default: rc = -EINVAL; goto err_crypto_info; @@ -681,12 +769,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE]; prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg; - prot[TLS_BASE][TLS_SW].stream_memory_read = tls_sw_stream_read; + prot[TLS_BASE][TLS_SW].sock_is_readable = tls_sw_sock_is_readable; prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close; prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE]; prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg; - prot[TLS_SW][TLS_SW].stream_memory_read = tls_sw_stream_read; + prot[TLS_SW][TLS_SW].sock_is_readable = tls_sw_sock_is_readable; prot[TLS_SW][TLS_SW].close = tls_sk_proto_close; #ifdef CONFIG_TLS_DEVICE diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 4feb95e34b64..d81564078557 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -35,6 +35,7 @@ * SOFTWARE. */ +#include <linux/bug.h> #include <linux/sched/signal.h> #include <linux/module.h> #include <linux/splice.h> @@ -43,6 +44,14 @@ #include <net/strparser.h> #include <net/tls.h> +noinline void tls_err_abort(struct sock *sk, int err) +{ + WARN_ON_ONCE(err >= 0); + /* sk->sk_err should contain a positive error code. */ + sk->sk_err = -err; + sk_error_report(sk); +} + static int __skb_nsg(struct sk_buff *skb, int offset, int len, unsigned int recursion_level) { @@ -419,7 +428,7 @@ int tls_tx_records(struct sock *sk, int flags) tx_err: if (rc < 0 && rc != -EAGAIN) - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); return rc; } @@ -450,7 +459,7 @@ static void tls_encrypt_done(struct crypto_async_request *req, int err) /* If err is already set on socket, return the same code */ if (sk->sk_err) { - ctx->async_wait.err = sk->sk_err; + ctx->async_wait.err = -sk->sk_err; } else { ctx->async_wait.err = err; tls_err_abort(sk, err); @@ -498,9 +507,15 @@ static int tls_do_encryption(struct sock *sk, int rc, iv_offset = 0; /* For CCM based ciphers, first byte of IV is a constant */ - if (prot->cipher_type == TLS_CIPHER_AES_CCM_128) { + switch (prot->cipher_type) { + case TLS_CIPHER_AES_CCM_128: rec->iv_data[0] = TLS_AES_CCM_IV_B0_BYTE; iv_offset = 1; + break; + case TLS_CIPHER_SM4_CCM: + rec->iv_data[0] = TLS_SM4_CCM_IV_B0_BYTE; + iv_offset = 1; + break; } memcpy(&rec->iv_data[iv_offset], tls_ctx->tx.iv, @@ -763,7 +778,7 @@ static int tls_push_record(struct sock *sk, int flags, msg_pl->sg.size + prot->tail_size, i); if (rc < 0) { if (rc != -EINPROGRESS) { - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); if (split) { tls_ctx->pending_open_record_frags = true; tls_merge_open_record(sk, rec, tmp, orig_end); @@ -1457,10 +1472,16 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, aad = (u8 *)(sgout + n_sgout); iv = aad + prot->aad_size; - /* For CCM based ciphers, first byte of nonce+iv is always '2' */ - if (prot->cipher_type == TLS_CIPHER_AES_CCM_128) { - iv[0] = 2; + /* For CCM based ciphers, first byte of nonce+iv is a constant */ + switch (prot->cipher_type) { + case TLS_CIPHER_AES_CCM_128: + iv[0] = TLS_AES_CCM_IV_B0_BYTE; iv_offset = 1; + break; + case TLS_CIPHER_SM4_CCM: + iv[0] = TLS_SM4_CCM_IV_B0_BYTE; + iv_offset = 1; + break; } /* Prepare IV */ @@ -1827,7 +1848,7 @@ int tls_sw_recvmsg(struct sock *sk, err = decrypt_skb_update(sk, skb, &msg->msg_iter, &chunk, &zc, async_capable); if (err < 0 && err != -EINPROGRESS) { - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); goto recv_end; } @@ -2007,7 +2028,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos, } if (err < 0) { - tls_err_abort(sk, EBADMSG); + tls_err_abort(sk, -EBADMSG); goto splice_read_end; } ctx->decrypted = 1; @@ -2026,7 +2047,7 @@ splice_read_end: return copied ? : err; } -bool tls_sw_stream_read(const struct sock *sk) +bool tls_sw_sock_is_readable(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); @@ -2424,6 +2445,40 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) cipher_name = "rfc7539(chacha20,poly1305)"; break; } + case TLS_CIPHER_SM4_GCM: { + struct tls12_crypto_info_sm4_gcm *sm4_gcm_info; + + sm4_gcm_info = (void *)crypto_info; + nonce_size = TLS_CIPHER_SM4_GCM_IV_SIZE; + tag_size = TLS_CIPHER_SM4_GCM_TAG_SIZE; + iv_size = TLS_CIPHER_SM4_GCM_IV_SIZE; + iv = sm4_gcm_info->iv; + rec_seq_size = TLS_CIPHER_SM4_GCM_REC_SEQ_SIZE; + rec_seq = sm4_gcm_info->rec_seq; + keysize = TLS_CIPHER_SM4_GCM_KEY_SIZE; + key = sm4_gcm_info->key; + salt = sm4_gcm_info->salt; + salt_size = TLS_CIPHER_SM4_GCM_SALT_SIZE; + cipher_name = "gcm(sm4)"; + break; + } + case TLS_CIPHER_SM4_CCM: { + struct tls12_crypto_info_sm4_ccm *sm4_ccm_info; + + sm4_ccm_info = (void *)crypto_info; + nonce_size = TLS_CIPHER_SM4_CCM_IV_SIZE; + tag_size = TLS_CIPHER_SM4_CCM_TAG_SIZE; + iv_size = TLS_CIPHER_SM4_CCM_IV_SIZE; + iv = sm4_ccm_info->iv; + rec_seq_size = TLS_CIPHER_SM4_CCM_REC_SEQ_SIZE; + rec_seq = sm4_ccm_info->rec_seq; + keysize = TLS_CIPHER_SM4_CCM_KEY_SIZE; + key = sm4_ccm_info->key; + salt = sm4_ccm_info->salt; + salt_size = TLS_CIPHER_SM4_CCM_SALT_SIZE; + cipher_name = "ccm(sm4)"; + break; + } default: rc = -EINVAL; goto free_priv; diff --git a/net/unix/Kconfig b/net/unix/Kconfig index b6c4282899ec..b7f811216820 100644 --- a/net/unix/Kconfig +++ b/net/unix/Kconfig @@ -25,6 +25,11 @@ config UNIX_SCM depends on UNIX default y +config AF_UNIX_OOB + bool + depends on UNIX + default y + config UNIX_DIAG tristate "UNIX: socket monitoring interface" depends on UNIX diff --git a/net/unix/Makefile b/net/unix/Makefile index 54e58cc4f945..20491825b4d0 100644 --- a/net/unix/Makefile +++ b/net/unix/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_UNIX) += unix.o unix-y := af_unix.o garbage.o unix-$(CONFIG_SYSCTL) += sysctl_net_unix.o +unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o obj-$(CONFIG_UNIX_DIAG) += unix_diag.o unix_diag-y := diag.o diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ba7ced947e51..78e08e82c08c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -113,6 +113,7 @@ #include <linux/security.h> #include <linux/freezer.h> #include <linux/file.h> +#include <linux/btf_ids.h> #include "scm.h" @@ -494,6 +495,7 @@ static void unix_dgram_disconnected(struct sock *sk, struct sock *other) sk_error_report(other); } } + other->sk_state = TCP_CLOSE; } static void unix_sock_destructor(struct sock *sk) @@ -502,6 +504,12 @@ static void unix_sock_destructor(struct sock *sk) skb_queue_purge(&sk->sk_receive_queue); +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (u->oob_skb) { + kfree_skb(u->oob_skb); + u->oob_skb = NULL; + } +#endif WARN_ON(refcount_read(&sk->sk_wmem_alloc)); WARN_ON(!sk_unhashed(sk)); WARN_ON(sk->sk_socket); @@ -600,20 +608,42 @@ static void unix_release_sock(struct sock *sk, int embrion) static void init_peercred(struct sock *sk) { - put_pid(sk->sk_peer_pid); - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + const struct cred *old_cred; + struct pid *old_pid; + + spin_lock(&sk->sk_peer_lock); + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(task_tgid(current)); sk->sk_peer_cred = get_current_cred(); + spin_unlock(&sk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); } static void copy_peercred(struct sock *sk, struct sock *peersk) { - put_pid(sk->sk_peer_pid); - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + const struct cred *old_cred; + struct pid *old_pid; + + if (sk < peersk) { + spin_lock(&sk->sk_peer_lock); + spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&peersk->sk_peer_lock); + spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); + } + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); + + spin_unlock(&sk->sk_peer_lock); + spin_unlock(&peersk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); } static int unix_listen(struct socket *sock, int backlog) @@ -669,6 +699,10 @@ static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, unsigned int flags); static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); +static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor); +static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor); static int unix_dgram_connect(struct socket *, struct sockaddr *, int, int); static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); @@ -722,6 +756,7 @@ static const struct proto_ops unix_stream_ops = { .shutdown = unix_shutdown, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, + .read_sock = unix_stream_read_sock, .mmap = sock_no_mmap, .sendpage = unix_stream_sendpage, .splice_read = unix_stream_splice_read, @@ -746,6 +781,7 @@ static const struct proto_ops unix_dgram_ops = { .listen = sock_no_listen, .shutdown = unix_shutdown, .sendmsg = unix_dgram_sendmsg, + .read_sock = unix_read_sock, .recvmsg = unix_dgram_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, @@ -777,24 +813,62 @@ static const struct proto_ops unix_seqpacket_ops = { .show_fdinfo = unix_show_fdinfo, }; -static struct proto unix_proto = { +static void unix_close(struct sock *sk, long timeout) +{ + /* Nothing to do here, unix socket does not need a ->close(). + * This is merely for sockmap. + */ +} + +static void unix_unhash(struct sock *sk) +{ + /* Nothing to do here, unix socket does not need a ->unhash(). + * This is merely for sockmap. + */ +} + +struct proto unix_dgram_proto = { .name = "UNIX", .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), + .close = unix_close, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = unix_dgram_bpf_update_proto, +#endif +}; + +struct proto unix_stream_proto = { + .name = "UNIX-STREAM", + .owner = THIS_MODULE, + .obj_size = sizeof(struct unix_sock), + .close = unix_close, + .unhash = unix_unhash, +#ifdef CONFIG_BPF_SYSCALL + .psock_update_sk_prot = unix_stream_bpf_update_proto, +#endif }; -static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) +static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) { - struct sock *sk = NULL; struct unix_sock *u; + struct sock *sk; + int err; atomic_long_inc(&unix_nr_socks); - if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) - goto out; + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { + err = -ENFILE; + goto err; + } - sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern); - if (!sk) - goto out; + if (type == SOCK_STREAM) + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); + else /*dgram and seqpacket */ + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); + + if (!sk) { + err = -ENOMEM; + goto err; + } sock_init_data(sock, sk); @@ -814,20 +888,23 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); unix_insert_socket(unix_sockets_unbound(sk), sk); -out: - if (sk == NULL) - atomic_long_dec(&unix_nr_socks); - else { - local_bh_disable(); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - local_bh_enable(); - } + + local_bh_disable(); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + local_bh_enable(); + return sk; + +err: + atomic_long_dec(&unix_nr_socks); + return ERR_PTR(err); } static int unix_create(struct net *net, struct socket *sock, int protocol, int kern) { + struct sock *sk; + if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT; @@ -854,7 +931,11 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - return unix_create1(net, sock, kern) ? 0 : -ENOMEM; + sk = unix_create1(net, sock, kern, sock->type); + if (IS_ERR(sk)) + return PTR_ERR(sk); + + return 0; } static int unix_release(struct socket *sock) @@ -864,6 +945,7 @@ static int unix_release(struct socket *sock) if (!sk) return 0; + sk->sk_prot->close(sk, 0); unix_release_sock(sk, 0); sock->sk = NULL; @@ -1174,6 +1256,7 @@ restart: if (err) goto out_unlock; + sk->sk_state = other->sk_state = TCP_ESTABLISHED; } else { /* * 1003.1g breaking connected state with AF_UNSPEC @@ -1187,7 +1270,10 @@ restart: */ if (unix_peer(sk)) { struct sock *old_peer = unix_peer(sk); + unix_peer(sk) = other; + if (!other) + sk->sk_state = TCP_CLOSE; unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer); unix_state_double_unlock(sk, other); @@ -1199,6 +1285,7 @@ restart: unix_peer(sk) = other; unix_state_double_unlock(sk, other); } + return 0; out_unlock: @@ -1261,12 +1348,15 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, we will have to recheck all again in any case. */ - err = -ENOMEM; - /* create new sock for complete connection */ - newsk = unix_create1(sock_net(sk), NULL, 0); - if (newsk == NULL) + newsk = unix_create1(sock_net(sk), NULL, 0, sock->type); + if (IS_ERR(newsk)) { + err = PTR_ERR(newsk); + newsk = NULL; goto out; + } + + err = -ENOMEM; /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); @@ -1431,12 +1521,10 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb) init_peercred(ska); init_peercred(skb); - if (ska->sk_type != SOCK_DGRAM) { - ska->sk_state = TCP_ESTABLISHED; - skb->sk_state = TCP_ESTABLISHED; - socka->state = SS_CONNECTED; - sockb->state = SS_CONNECTED; - } + ska->sk_state = TCP_ESTABLISHED; + skb->sk_state = TCP_ESTABLISHED; + socka->state = SS_CONNECTED; + sockb->state = SS_CONNECTED; return 0; } @@ -1782,6 +1870,7 @@ restart_locked: unix_state_unlock(sk); + sk->sk_state = TCP_CLOSE; unix_dgram_disconnected(sk, other); sock_put(other); err = -ECONNREFUSED; @@ -1872,6 +1961,53 @@ out: */ #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) +#if (IS_ENABLED(CONFIG_AF_UNIX_OOB)) +static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other) +{ + struct unix_sock *ousk = unix_sk(other); + struct sk_buff *skb; + int err = 0; + + skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); + + if (!skb) + return err; + + skb_put(skb, 1); + err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); + + if (err) { + kfree_skb(skb); + return err; + } + + unix_state_lock(other); + + if (sock_flag(other, SOCK_DEAD) || + (other->sk_shutdown & RCV_SHUTDOWN)) { + unix_state_unlock(other); + kfree_skb(skb); + return -EPIPE; + } + + maybe_add_creds(skb, sock, other); + skb_get(skb); + + if (ousk->oob_skb) + consume_skb(ousk->oob_skb); + + ousk->oob_skb = skb; + + scm_stat_add(other, skb); + skb_queue_tail(&other->sk_receive_queue, skb); + sk_send_sigurg(other); + unix_state_unlock(other); + other->sk_data_ready(other); + + return err; +} +#endif + static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { @@ -1890,8 +2026,14 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, return err; err = -EOPNOTSUPP; - if (msg->msg_flags&MSG_OOB) - goto out_err; + if (msg->msg_flags & MSG_OOB) { +#if (IS_ENABLED(CONFIG_AF_UNIX_OOB)) + if (len) + len--; + else +#endif + goto out_err; + } if (msg->msg_namelen) { err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; @@ -1956,6 +2098,15 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, sent += size; } +#if (IS_ENABLED(CONFIG_AF_UNIX_OOB)) + if (msg->msg_flags & MSG_OOB) { + err = queue_oob(sock, msg, other); + if (err) + goto out_err; + sent++; + } +#endif + scm_destroy(&scm); return sent; @@ -2128,11 +2279,11 @@ static void unix_copy_addr(struct msghdr *msg, struct sock *sk) } } -static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, - size_t size, int flags) +int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, + int flags) { struct scm_cookie scm; - struct sock *sk = sock->sk; + struct socket *sock = sk->sk_socket; struct unix_sock *u = unix_sk(sk); struct sk_buff *skb, *last; long timeo; @@ -2235,6 +2386,55 @@ out: return err; } +static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, + int flags) +{ + struct sock *sk = sock->sk; + +#ifdef CONFIG_BPF_SYSCALL + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot != &unix_dgram_proto) + return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, NULL); +#endif + return __unix_dgram_recvmsg(sk, msg, size, flags); +} + +static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + int copied = 0; + + while (1) { + struct unix_sock *u = unix_sk(sk); + struct sk_buff *skb; + int used, err; + + mutex_lock(&u->iolock); + skb = skb_recv_datagram(sk, 0, 1, &err); + mutex_unlock(&u->iolock); + if (!skb) + return err; + + used = recv_actor(desc, skb, 0, skb->len); + if (used <= 0) { + if (!copied) + copied = used; + kfree_skb(skb); + break; + } else if (used <= skb->len) { + copied += used; + } + + kfree_skb(skb); + if (!desc->count) + break; + } + + return copied; +} + /* * Sleep until more data has arrived. But check for races.. */ @@ -2294,6 +2494,86 @@ struct unix_stream_read_state { unsigned int splice_flags; }; +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) +static int unix_stream_recv_urg(struct unix_stream_read_state *state) +{ + struct socket *sock = state->socket; + struct sock *sk = sock->sk; + struct unix_sock *u = unix_sk(sk); + int chunk = 1; + struct sk_buff *oob_skb; + + mutex_lock(&u->iolock); + unix_state_lock(sk); + + if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) { + unix_state_unlock(sk); + mutex_unlock(&u->iolock); + return -EINVAL; + } + + oob_skb = u->oob_skb; + + if (!(state->flags & MSG_PEEK)) { + u->oob_skb = NULL; + } + + unix_state_unlock(sk); + + chunk = state->recv_actor(oob_skb, 0, chunk, state); + + if (!(state->flags & MSG_PEEK)) { + UNIXCB(oob_skb).consumed += 1; + kfree_skb(oob_skb); + } + + mutex_unlock(&u->iolock); + + if (chunk < 0) + return -EFAULT; + + state->msg->msg_flags |= MSG_OOB; + return 1; +} + +static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, + int flags, int copied) +{ + struct unix_sock *u = unix_sk(sk); + + if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) { + skb_unlink(skb, &sk->sk_receive_queue); + consume_skb(skb); + skb = NULL; + } else { + if (skb == u->oob_skb) { + if (copied) { + skb = NULL; + } else if (sock_flag(sk, SOCK_URGINLINE)) { + if (!(flags & MSG_PEEK)) { + u->oob_skb = NULL; + consume_skb(skb); + } + } else if (!(flags & MSG_PEEK)) { + skb_unlink(skb, &sk->sk_receive_queue); + consume_skb(skb); + skb = skb_peek(&sk->sk_receive_queue); + } + } + } + return skb; +} +#endif + +static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + if (unlikely(sk->sk_state != TCP_ESTABLISHED)) + return -ENOTCONN; + + return unix_read_sock(sk, desc, recv_actor); +} + static int unix_stream_read_generic(struct unix_stream_read_state *state, bool freezable) { @@ -2319,6 +2599,9 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, if (unlikely(flags & MSG_OOB)) { err = -EOPNOTSUPP; +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + err = unix_stream_recv_urg(state); +#endif goto out; } @@ -2347,6 +2630,18 @@ redo: } last = skb = skb_peek(&sk->sk_receive_queue); last_len = last ? last->len : 0; + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (skb) { + skb = manage_oob(skb, sk, flags, copied); + if (!skb) { + unix_state_unlock(sk); + if (copied) + break; + goto redo; + } + } +#endif again: if (skb == NULL) { if (copied >= target) @@ -2504,6 +2799,20 @@ static int unix_stream_read_actor(struct sk_buff *skb, return ret ?: chunk; } +int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg, + size_t size, int flags) +{ + struct unix_stream_read_state state = { + .recv_actor = unix_stream_read_actor, + .socket = sk->sk_socket, + .msg = msg, + .size = size, + .flags = flags + }; + + return unix_stream_read_generic(&state, true); +} + static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { @@ -2515,6 +2824,14 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, .flags = flags }; +#ifdef CONFIG_BPF_SYSCALL + struct sock *sk = sock->sk; + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot != &unix_stream_proto) + return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, NULL); +#endif return unix_stream_read_generic(&state, true); } @@ -2565,6 +2882,9 @@ static int unix_shutdown(struct socket *sock, int mode) unix_state_lock(sk); sk->sk_shutdown |= mode; + if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && + mode == SHUTDOWN_MASK) + sk->sk_state = TCP_CLOSE; other = unix_peer(sk); if (other) sock_hold(other); @@ -2575,7 +2895,10 @@ static int unix_shutdown(struct socket *sock, int mode) (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { int peer_mode = 0; + const struct proto *prot = READ_ONCE(other->sk_prot); + if (prot->unhash) + prot->unhash(other); if (mode&RCV_SHUTDOWN) peer_mode |= SEND_SHUTDOWN; if (mode&SEND_SHUTDOWN) @@ -2682,6 +3005,20 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCUNIXFILE: err = unix_open_file(sk); break; +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + case SIOCATMARK: + { + struct sk_buff *skb; + struct unix_sock *u = unix_sk(sk); + int answ = 0; + + skb = skb_peek(&sk->sk_receive_queue); + if (skb && skb == u->oob_skb) + answ = 1; + err = put_user(answ, (int __user *)arg); + } + break; +#endif default: err = -ENOIOCTLCMD; break; @@ -2715,6 +3052,8 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; + if (sk_is_readable(sk)) + mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && @@ -2754,6 +3093,8 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, /* readable? */ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= EPOLLIN | EPOLLRDNORM; + if (sk_is_readable(sk)) + mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ if (sk->sk_type == SOCK_SEQPACKET) { @@ -2774,7 +3115,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, other = unix_peer(sk); if (other && unix_peer(other) != sk && - unix_recvq_full(other) && + unix_recvq_full_lockless(other) && unix_dgram_peer_wake_me(sk, other)) writable = 0; @@ -2918,6 +3259,64 @@ static const struct seq_operations unix_seq_ops = { .stop = unix_seq_stop, .show = unix_seq_show, }; + +#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) +struct bpf_iter__unix { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct unix_sock *, unix_sk); + uid_t uid __aligned(8); +}; + +static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, + struct unix_sock *unix_sk, uid_t uid) +{ + struct bpf_iter__unix ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.unix_sk = unix_sk; + ctx.uid = uid; + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + struct sock *sk = v; + uid_t uid; + + if (v == SEQ_START_TOKEN) + return 0; + + uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + return unix_prog_seq_show(prog, &meta, v, uid); +} + +static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)unix_prog_seq_show(prog, &meta, v, 0); + } + + unix_seq_stop(seq, v); +} + +static const struct seq_operations bpf_iter_unix_seq_ops = { + .start = unix_seq_start, + .next = unix_seq_next, + .stop = bpf_iter_unix_seq_stop, + .show = bpf_iter_unix_seq_show, +}; +#endif #endif static const struct net_proto_family unix_family_ops = { @@ -2958,13 +3357,48 @@ static struct pernet_operations unix_net_ops = { .exit = unix_net_exit, }; +#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, + struct unix_sock *unix_sk, uid_t uid) + +static const struct bpf_iter_seq_info unix_seq_info = { + .seq_ops = &bpf_iter_unix_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct seq_net_private), +}; + +static struct bpf_iter_reg unix_reg_info = { + .target = "unix", + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__unix, unix_sk), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &unix_seq_info, +}; + +static void __init bpf_iter_register(void) +{ + unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX]; + if (bpf_iter_reg_target(&unix_reg_info)) + pr_warn("Warning: could not register bpf iterator unix\n"); +} +#endif + static int __init af_unix_init(void) { int rc = -1; BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); - rc = proto_register(&unix_proto, 1); + rc = proto_register(&unix_dgram_proto, 1); + if (rc != 0) { + pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); + goto out; + } + + rc = proto_register(&unix_stream_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); goto out; @@ -2972,6 +3406,12 @@ static int __init af_unix_init(void) sock_register(&unix_family_ops); register_pernet_subsys(&unix_net_ops); + unix_bpf_build_proto(); + +#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + bpf_iter_register(); +#endif + out: return rc; } @@ -2979,7 +3419,8 @@ out: static void __exit af_unix_exit(void) { sock_unregister(PF_UNIX); - proto_unregister(&unix_proto); + proto_unregister(&unix_dgram_proto); + proto_unregister(&unix_stream_proto); unregister_pernet_subsys(&unix_net_ops); } diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c new file mode 100644 index 000000000000..452376c6f419 --- /dev/null +++ b/net/unix/unix_bpf.c @@ -0,0 +1,176 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Cong Wang <cong.wang@bytedance.com> */ + +#include <linux/skmsg.h> +#include <linux/bpf.h> +#include <net/sock.h> +#include <net/af_unix.h> + +#define unix_sk_has_data(__sk, __psock) \ + ({ !skb_queue_empty(&__sk->sk_receive_queue) || \ + !skb_queue_empty(&__psock->ingress_skb) || \ + !list_empty(&__psock->ingress_msg); \ + }) + +static int unix_msg_wait_data(struct sock *sk, struct sk_psock *psock, + long timeo) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + struct unix_sock *u = unix_sk(sk); + int ret = 0; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + + if (!timeo) + return ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + if (!unix_sk_has_data(sk, psock)) { + mutex_unlock(&u->iolock); + wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); + mutex_lock(&u->iolock); + ret = unix_sk_has_data(sk, psock); + } + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} + +static int __unix_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int flags) +{ + if (sk->sk_type == SOCK_DGRAM) + return __unix_dgram_recvmsg(sk, msg, len, flags); + else + return __unix_stream_recvmsg(sk, msg, len, flags); +} + +static int unix_bpf_recvmsg(struct sock *sk, struct msghdr *msg, + size_t len, int nonblock, int flags, + int *addr_len) +{ + struct unix_sock *u = unix_sk(sk); + struct sk_psock *psock; + int copied; + + psock = sk_psock_get(sk); + if (unlikely(!psock)) + return __unix_recvmsg(sk, msg, len, flags); + + mutex_lock(&u->iolock); + if (!skb_queue_empty(&sk->sk_receive_queue) && + sk_psock_queue_empty(psock)) { + mutex_unlock(&u->iolock); + sk_psock_put(sk, psock); + return __unix_recvmsg(sk, msg, len, flags); + } + +msg_bytes_ready: + copied = sk_msg_recvmsg(sk, psock, msg, len, flags); + if (!copied) { + long timeo; + int data; + + timeo = sock_rcvtimeo(sk, nonblock); + data = unix_msg_wait_data(sk, psock, timeo); + if (data) { + if (!sk_psock_queue_empty(psock)) + goto msg_bytes_ready; + mutex_unlock(&u->iolock); + sk_psock_put(sk, psock); + return __unix_recvmsg(sk, msg, len, flags); + } + copied = -EAGAIN; + } + mutex_unlock(&u->iolock); + sk_psock_put(sk, psock); + return copied; +} + +static struct proto *unix_dgram_prot_saved __read_mostly; +static DEFINE_SPINLOCK(unix_dgram_prot_lock); +static struct proto unix_dgram_bpf_prot; + +static struct proto *unix_stream_prot_saved __read_mostly; +static DEFINE_SPINLOCK(unix_stream_prot_lock); +static struct proto unix_stream_bpf_prot; + +static void unix_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto *base) +{ + *prot = *base; + prot->close = sock_map_close; + prot->recvmsg = unix_bpf_recvmsg; + prot->sock_is_readable = sk_msg_is_readable; +} + +static void unix_stream_bpf_rebuild_protos(struct proto *prot, + const struct proto *base) +{ + *prot = *base; + prot->close = sock_map_close; + prot->recvmsg = unix_bpf_recvmsg; + prot->sock_is_readable = sk_msg_is_readable; + prot->unhash = sock_map_unhash; +} + +static void unix_dgram_bpf_check_needs_rebuild(struct proto *ops) +{ + if (unlikely(ops != smp_load_acquire(&unix_dgram_prot_saved))) { + spin_lock_bh(&unix_dgram_prot_lock); + if (likely(ops != unix_dgram_prot_saved)) { + unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, ops); + smp_store_release(&unix_dgram_prot_saved, ops); + } + spin_unlock_bh(&unix_dgram_prot_lock); + } +} + +static void unix_stream_bpf_check_needs_rebuild(struct proto *ops) +{ + if (unlikely(ops != smp_load_acquire(&unix_stream_prot_saved))) { + spin_lock_bh(&unix_stream_prot_lock); + if (likely(ops != unix_stream_prot_saved)) { + unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, ops); + smp_store_release(&unix_stream_prot_saved, ops); + } + spin_unlock_bh(&unix_stream_prot_lock); + } +} + +int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) +{ + if (sk->sk_type != SOCK_DGRAM) + return -EOPNOTSUPP; + + if (restore) { + sk->sk_write_space = psock->saved_write_space; + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + return 0; + } + + unix_dgram_bpf_check_needs_rebuild(psock->sk_proto); + WRITE_ONCE(sk->sk_prot, &unix_dgram_bpf_prot); + return 0; +} + +int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) +{ + if (restore) { + sk->sk_write_space = psock->saved_write_space; + WRITE_ONCE(sk->sk_prot, psock->sk_proto); + return 0; + } + + unix_stream_bpf_check_needs_rebuild(psock->sk_proto); + WRITE_ONCE(sk->sk_prot, &unix_stream_bpf_prot); + return 0; +} + +void __init unix_bpf_build_proto(void) +{ + unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, &unix_dgram_proto); + unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, &unix_stream_proto); + +} diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 3e02cc3b24f8..7d851eb3a683 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1614,13 +1614,18 @@ static int vsock_connectible_setsockopt(struct socket *sock, vsock_update_buffer_size(vsk, transport, vsk->buffer_size); break; - case SO_VM_SOCKETS_CONNECT_TIMEOUT: { - struct __kernel_old_timeval tv; - COPY_IN(tv); + case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW: + case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD: { + struct __kernel_sock_timeval tv; + + err = sock_copy_user_timeval(&tv, optval, optlen, + optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD); + if (err) + break; if (tv.tv_sec >= 0 && tv.tv_usec < USEC_PER_SEC && tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)) { vsk->connect_timeout = tv.tv_sec * HZ + - DIV_ROUND_UP(tv.tv_usec, (1000000 / HZ)); + DIV_ROUND_UP((unsigned long)tv.tv_usec, (USEC_PER_SEC / HZ)); if (vsk->connect_timeout == 0) vsk->connect_timeout = VSOCK_DEFAULT_CONNECT_TIMEOUT; @@ -1648,68 +1653,59 @@ static int vsock_connectible_getsockopt(struct socket *sock, char __user *optval, int __user *optlen) { - int err; + struct sock *sk = sock->sk; + struct vsock_sock *vsk = vsock_sk(sk); + + union { + u64 val64; + struct old_timeval32 tm32; + struct __kernel_old_timeval tm; + struct __kernel_sock_timeval stm; + } v; + + int lv = sizeof(v.val64); int len; - struct sock *sk; - struct vsock_sock *vsk; - u64 val; if (level != AF_VSOCK) return -ENOPROTOOPT; - err = get_user(len, optlen); - if (err != 0) - return err; - -#define COPY_OUT(_v) \ - do { \ - if (len < sizeof(_v)) \ - return -EINVAL; \ - \ - len = sizeof(_v); \ - if (copy_to_user(optval, &_v, len) != 0) \ - return -EFAULT; \ - \ - } while (0) + if (get_user(len, optlen)) + return -EFAULT; - err = 0; - sk = sock->sk; - vsk = vsock_sk(sk); + memset(&v, 0, sizeof(v)); switch (optname) { case SO_VM_SOCKETS_BUFFER_SIZE: - val = vsk->buffer_size; - COPY_OUT(val); + v.val64 = vsk->buffer_size; break; case SO_VM_SOCKETS_BUFFER_MAX_SIZE: - val = vsk->buffer_max_size; - COPY_OUT(val); + v.val64 = vsk->buffer_max_size; break; case SO_VM_SOCKETS_BUFFER_MIN_SIZE: - val = vsk->buffer_min_size; - COPY_OUT(val); + v.val64 = vsk->buffer_min_size; break; - case SO_VM_SOCKETS_CONNECT_TIMEOUT: { - struct __kernel_old_timeval tv; - tv.tv_sec = vsk->connect_timeout / HZ; - tv.tv_usec = - (vsk->connect_timeout - - tv.tv_sec * HZ) * (1000000 / HZ); - COPY_OUT(tv); + case SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW: + case SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD: + lv = sock_get_timeout(vsk->connect_timeout, &v, + optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD); break; - } + default: return -ENOPROTOOPT; } - err = put_user(len, optlen); - if (err != 0) + if (len < lv) + return -EINVAL; + if (len > lv) + len = lv; + if (copy_to_user(optval, &v, len)) return -EFAULT; -#undef COPY_OUT + if (put_user(len, optlen)) + return -EFAULT; return 0; } @@ -2014,7 +2010,7 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg, { const struct vsock_transport *transport; struct vsock_sock *vsk; - ssize_t record_len; + ssize_t msg_len; long timeout; int err = 0; DEFINE_WAIT(wait); @@ -2028,9 +2024,9 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg, if (err <= 0) goto out; - record_len = transport->seqpacket_dequeue(vsk, msg, flags); + msg_len = transport->seqpacket_dequeue(vsk, msg, flags); - if (record_len < 0) { + if (msg_len < 0) { err = -ENOMEM; goto out; } @@ -2044,14 +2040,14 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, struct msghdr *msg, * packet. */ if (flags & MSG_TRUNC) - err = record_len; + err = msg_len; else err = len - msg_data_left(msg); /* Always set MSG_TRUNC if real length of packet is * bigger than user's buffer. */ - if (record_len > len) + if (msg_len > len) msg->msg_flags |= MSG_TRUNC; } diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 081e7ae93cb1..59ee1be5a6dd 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -76,8 +76,12 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, goto out; if (msg_data_left(info->msg) == 0 && - info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) - pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); + info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) { + pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); + + if (info->msg->msg_flags & MSG_EOR) + pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); + } } trace_virtio_transport_alloc_pkt(src_cid, src_port, @@ -457,9 +461,12 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, dequeued_len += pkt_len; } - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) { + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) { msg_ready = true; vvs->msg_count--; + + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) + msg->msg_flags |= MSG_EOR; } virtio_transport_dec_rx_pkt(vvs, pkt); @@ -1029,7 +1036,7 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, goto out; } - if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) + if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) vvs->msg_count++; /* Try to copy small packets into the buffer of last packet queued, @@ -1044,12 +1051,12 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, /* If there is space in the last packet queued, we copy the * new packet in its buffer. We avoid this if the last packet - * queued has VIRTIO_VSOCK_SEQ_EOR set, because this is - * delimiter of SEQPACKET record, so 'pkt' is the first packet - * of a new record. + * queued has VIRTIO_VSOCK_SEQ_EOM set, because this is + * delimiter of SEQPACKET message, so 'pkt' is the first packet + * of a new message. */ if ((pkt->len <= last_pkt->buf_len - last_pkt->len) && - !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR)) { + !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM)) { memcpy(last_pkt->buf + last_pkt->len, pkt->buf, pkt->len); last_pkt->len += pkt->len; diff --git a/net/wireless/Makefile b/net/wireless/Makefile index af590ae606b6..756e7de7e33f 100644 --- a/net/wireless/Makefile +++ b/net/wireless/Makefile @@ -26,7 +26,7 @@ endif $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.hex) @$(kecho) " GEN $@" - @(echo '#include "reg.h"'; \ + $(Q)(echo '#include "reg.h"'; \ echo 'const u8 shipped_regdb_certs[] = {'; \ echo | cat - $^ ; \ echo '};'; \ @@ -36,7 +36,7 @@ $(obj)/shipped-certs.c: $(wildcard $(srctree)/$(src)/certs/*.hex) $(obj)/extra-certs.c: $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%) \ $(wildcard $(CONFIG_CFG80211_EXTRA_REGDB_KEYDIR:"%"=%)/*.x509) @$(kecho) " GEN $@" - @(set -e; \ + $(Q)(set -e; \ allf=""; \ for f in $^ ; do \ test -f $$f || continue;\ diff --git a/net/wireless/core.c b/net/wireless/core.c index 03323121ca50..eb297e1015e0 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -524,6 +524,7 @@ use_default_name: INIT_WORK(&rdev->propagate_cac_done_wk, cfg80211_propagate_cac_done_wk); INIT_WORK(&rdev->mgmt_registrations_update_wk, cfg80211_mgmt_registrations_update_wk); + spin_lock_init(&rdev->mgmt_registrations_lock); #ifdef CONFIG_CFG80211_DEFAULT_PS rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT; @@ -1080,6 +1081,16 @@ void cfg80211_dev_free(struct cfg80211_registered_device *rdev) list_for_each_entry_safe(scan, tmp, &rdev->bss_list, list) cfg80211_put_bss(&rdev->wiphy, &scan->pub); mutex_destroy(&rdev->wiphy.mtx); + + /* + * The 'regd' can only be non-NULL if we never finished + * initializing the wiphy and thus never went through the + * unregister path - e.g. in failure scenarios. Thus, it + * cannot have been visible to anyone if non-NULL, so we + * can just free it here. + */ + kfree(rcu_dereference_raw(rdev->wiphy.regd)); + kfree(rdev); } @@ -1279,7 +1290,6 @@ void cfg80211_init_wdev(struct wireless_dev *wdev) INIT_LIST_HEAD(&wdev->event_list); spin_lock_init(&wdev->event_lock); INIT_LIST_HEAD(&wdev->mgmt_registrations); - spin_lock_init(&wdev->mgmt_registrations_lock); INIT_LIST_HEAD(&wdev->pmsr_list); spin_lock_init(&wdev->pmsr_lock); INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk); diff --git a/net/wireless/core.h b/net/wireless/core.h index b35d0db12f1d..1720abf36f92 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -100,6 +100,8 @@ struct cfg80211_registered_device { struct work_struct propagate_cac_done_wk; struct work_struct mgmt_registrations_update_wk; + /* lock for all wdev lists */ + spinlock_t mgmt_registrations_lock; /* must be last because of the way we do wiphy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 3aa69b375a10..783acd2c4211 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -452,9 +452,9 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) lockdep_assert_held(&rdev->wiphy.mtx); - spin_lock_bh(&wdev->mgmt_registrations_lock); + spin_lock_bh(&rdev->mgmt_registrations_lock); if (!wdev->mgmt_registrations_need_update) { - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); return; } @@ -479,7 +479,7 @@ static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) rcu_read_unlock(); wdev->mgmt_registrations_need_update = 0; - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); rdev_update_mgmt_frame_registrations(rdev, wdev, &upd); } @@ -503,6 +503,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, int match_len, bool multicast_rx, struct netlink_ext_ack *extack) { + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; u16 mgmt_type; @@ -548,7 +549,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, if (!nreg) return -ENOMEM; - spin_lock_bh(&wdev->mgmt_registrations_lock); + spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry(reg, &wdev->mgmt_registrations, list) { int mlen = min(match_len, reg->match_len); @@ -583,7 +584,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, list_add(&nreg->list, &wdev->mgmt_registrations); } wdev->mgmt_registrations_need_update = 1; - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); @@ -591,7 +592,7 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, out: kfree(nreg); - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); return err; } @@ -602,7 +603,7 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg, *tmp; - spin_lock_bh(&wdev->mgmt_registrations_lock); + spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { if (reg->nlportid != nlportid) @@ -615,7 +616,7 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) schedule_work(&rdev->mgmt_registrations_update_wk); } - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); if (nlportid && rdev->crit_proto_nlportid == nlportid) { rdev->crit_proto_nlportid = 0; @@ -628,15 +629,16 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) { + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_mgmt_registration *reg, *tmp; - spin_lock_bh(&wdev->mgmt_registrations_lock); + spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { list_del(®->list); kfree(reg); } wdev->mgmt_registrations_need_update = 1; - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); cfg80211_mgmt_registrations_update(wdev); } @@ -784,7 +786,7 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, data = buf + ieee80211_hdrlen(mgmt->frame_control); data_len = len - ieee80211_hdrlen(mgmt->frame_control); - spin_lock_bh(&wdev->mgmt_registrations_lock); + spin_lock_bh(&rdev->mgmt_registrations_lock); list_for_each_entry(reg, &wdev->mgmt_registrations, list) { if (reg->frame_type != ftype) @@ -808,7 +810,7 @@ bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, break; } - spin_unlock_bh(&wdev->mgmt_registrations_lock); + spin_unlock_bh(&rdev->mgmt_registrations_lock); trace_cfg80211_return_bool(result); return result; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 16c88beea48b..81232b73df8f 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -437,6 +437,16 @@ sar_policy[NL80211_SAR_ATTR_MAX + 1] = { [NL80211_SAR_ATTR_SPECS] = NLA_POLICY_NESTED_ARRAY(sar_specs_policy), }; +static const struct nla_policy +nl80211_mbssid_config_policy[NL80211_MBSSID_CONFIG_ATTR_MAX + 1] = { + [NL80211_MBSSID_CONFIG_ATTR_MAX_INTERFACES] = NLA_POLICY_MIN(NLA_U8, 2), + [NL80211_MBSSID_CONFIG_ATTR_MAX_EMA_PROFILE_PERIODICITY] = + NLA_POLICY_MIN(NLA_U8, 1), + [NL80211_MBSSID_CONFIG_ATTR_INDEX] = { .type = NLA_U8 }, + [NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX] = { .type = NLA_U32 }, + [NL80211_MBSSID_CONFIG_ATTR_EMA] = { .type = NLA_FLAG }, +}; + static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, @@ -759,6 +769,13 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_RECONNECT_REQUESTED] = { .type = NLA_REJECT }, [NL80211_ATTR_SAR_SPEC] = NLA_POLICY_NESTED(sar_policy), [NL80211_ATTR_DISABLE_HE] = { .type = NLA_FLAG }, + [NL80211_ATTR_OBSS_COLOR_BITMAP] = { .type = NLA_U64 }, + [NL80211_ATTR_COLOR_CHANGE_COUNT] = { .type = NLA_U8 }, + [NL80211_ATTR_COLOR_CHANGE_COLOR] = { .type = NLA_U8 }, + [NL80211_ATTR_COLOR_CHANGE_ELEMS] = NLA_POLICY_NESTED(nl80211_policy), + [NL80211_ATTR_MBSSID_CONFIG] = + NLA_POLICY_NESTED(nl80211_mbssid_config_policy), + [NL80211_ATTR_MBSSID_ELEMS] = { .type = NLA_NESTED }, }; /* policy for the key attributes */ @@ -849,6 +866,7 @@ nl80211_match_band_rssi_policy[NUM_NL80211_BANDS] = { [NL80211_BAND_5GHZ] = { .type = NLA_S32 }, [NL80211_BAND_6GHZ] = { .type = NLA_S32 }, [NL80211_BAND_60GHZ] = { .type = NLA_S32 }, + [NL80211_BAND_LC] = { .type = NLA_S32 }, }; static const struct nla_policy @@ -2203,6 +2221,35 @@ fail: return -ENOBUFS; } +static int nl80211_put_mbssid_support(struct wiphy *wiphy, struct sk_buff *msg) +{ + struct nlattr *config; + + if (!wiphy->mbssid_max_interfaces) + return 0; + + config = nla_nest_start(msg, NL80211_ATTR_MBSSID_CONFIG); + if (!config) + return -ENOBUFS; + + if (nla_put_u8(msg, NL80211_MBSSID_CONFIG_ATTR_MAX_INTERFACES, + wiphy->mbssid_max_interfaces)) + goto fail; + + if (wiphy->ema_max_profile_periodicity && + nla_put_u8(msg, + NL80211_MBSSID_CONFIG_ATTR_MAX_EMA_PROFILE_PERIODICITY, + wiphy->ema_max_profile_periodicity)) + goto fail; + + nla_nest_end(msg, config); + return 0; + +fail: + nla_nest_cancel(msg, config); + return -ENOBUFS; +} + struct nl80211_dump_wiphy_state { s64 filter_wiphy; long start; @@ -2788,6 +2835,9 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (nl80211_put_sar_specs(rdev, msg)) goto nla_put_failure; + if (nl80211_put_mbssid_support(&rdev->wiphy, msg)) + goto nla_put_failure; + /* done */ state->split_start = 0; break; @@ -4977,6 +5027,96 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, return 0; } +static int nl80211_parse_mbssid_config(struct wiphy *wiphy, + struct net_device *dev, + struct nlattr *attrs, + struct cfg80211_mbssid_config *config, + u8 num_elems) +{ + struct nlattr *tb[NL80211_MBSSID_CONFIG_ATTR_MAX + 1]; + + if (!wiphy->mbssid_max_interfaces) + return -EOPNOTSUPP; + + if (nla_parse_nested(tb, NL80211_MBSSID_CONFIG_ATTR_MAX, attrs, NULL, + NULL) || + !tb[NL80211_MBSSID_CONFIG_ATTR_INDEX]) + return -EINVAL; + + config->ema = nla_get_flag(tb[NL80211_MBSSID_CONFIG_ATTR_EMA]); + if (config->ema) { + if (!wiphy->ema_max_profile_periodicity) + return -EOPNOTSUPP; + + if (num_elems > wiphy->ema_max_profile_periodicity) + return -EINVAL; + } + + config->index = nla_get_u8(tb[NL80211_MBSSID_CONFIG_ATTR_INDEX]); + if (config->index >= wiphy->mbssid_max_interfaces || + (!config->index && !num_elems)) + return -EINVAL; + + if (tb[NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX]) { + u32 tx_ifindex = + nla_get_u32(tb[NL80211_MBSSID_CONFIG_ATTR_TX_IFINDEX]); + + if ((!config->index && tx_ifindex != dev->ifindex) || + (config->index && tx_ifindex == dev->ifindex)) + return -EINVAL; + + if (tx_ifindex != dev->ifindex) { + struct net_device *tx_netdev = + dev_get_by_index(wiphy_net(wiphy), tx_ifindex); + + if (!tx_netdev || !tx_netdev->ieee80211_ptr || + tx_netdev->ieee80211_ptr->wiphy != wiphy || + tx_netdev->ieee80211_ptr->iftype != + NL80211_IFTYPE_AP) { + dev_put(tx_netdev); + return -EINVAL; + } + + config->tx_wdev = tx_netdev->ieee80211_ptr; + } else { + config->tx_wdev = dev->ieee80211_ptr; + } + } else if (!config->index) { + config->tx_wdev = dev->ieee80211_ptr; + } else { + return -EINVAL; + } + + return 0; +} + +static struct cfg80211_mbssid_elems * +nl80211_parse_mbssid_elems(struct wiphy *wiphy, struct nlattr *attrs) +{ + struct nlattr *nl_elems; + struct cfg80211_mbssid_elems *elems; + int rem_elems; + u8 i = 0, num_elems = 0; + + if (!wiphy->mbssid_max_interfaces) + return ERR_PTR(-EINVAL); + + nla_for_each_nested(nl_elems, attrs, rem_elems) + num_elems++; + + elems = kzalloc(struct_size(elems, elem, num_elems), GFP_KERNEL); + if (!elems) + return ERR_PTR(-ENOMEM); + + nla_for_each_nested(nl_elems, attrs, rem_elems) { + elems->elem[i].data = nla_data(nl_elems); + elems->elem[i].len = nla_len(nl_elems); + i++; + } + elems->cnt = num_elems; + return elems; +} + static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev, struct nlattr *attrs[], struct cfg80211_beacon_data *bcn) @@ -5057,6 +5197,17 @@ static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev, bcn->ftm_responder = -1; } + if (attrs[NL80211_ATTR_MBSSID_ELEMS]) { + struct cfg80211_mbssid_elems *mbssid = + nl80211_parse_mbssid_elems(&rdev->wiphy, + attrs[NL80211_ATTR_MBSSID_ELEMS]); + + if (IS_ERR(mbssid)) + return PTR_ERR(mbssid); + + bcn->mbssid_ies = mbssid; + } + return 0; } @@ -5188,21 +5339,21 @@ nl80211_parse_unsol_bcast_probe_resp(struct cfg80211_registered_device *rdev, } static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, - const u8 *rates) + const struct element *rates) { int i; if (!rates) return; - for (i = 0; i < rates[1]; i++) { - if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HT_PHY) + for (i = 0; i < rates->datalen; i++) { + if (rates->data[i] == BSS_MEMBERSHIP_SELECTOR_HT_PHY) params->ht_required = true; - if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_VHT_PHY) + if (rates->data[i] == BSS_MEMBERSHIP_SELECTOR_VHT_PHY) params->vht_required = true; - if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HE_PHY) + if (rates->data[i] == BSS_MEMBERSHIP_SELECTOR_HE_PHY) params->he_required = true; - if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_SAE_H2E) + if (rates->data[i] == BSS_MEMBERSHIP_SELECTOR_SAE_H2E) params->sae_h2e_required = true; } } @@ -5217,27 +5368,27 @@ static void nl80211_calculate_ap_params(struct cfg80211_ap_settings *params) const struct cfg80211_beacon_data *bcn = ¶ms->beacon; size_t ies_len = bcn->tail_len; const u8 *ies = bcn->tail; - const u8 *rates; - const u8 *cap; + const struct element *rates; + const struct element *cap; - rates = cfg80211_find_ie(WLAN_EID_SUPP_RATES, ies, ies_len); + rates = cfg80211_find_elem(WLAN_EID_SUPP_RATES, ies, ies_len); nl80211_check_ap_rate_selectors(params, rates); - rates = cfg80211_find_ie(WLAN_EID_EXT_SUPP_RATES, ies, ies_len); + rates = cfg80211_find_elem(WLAN_EID_EXT_SUPP_RATES, ies, ies_len); nl80211_check_ap_rate_selectors(params, rates); - cap = cfg80211_find_ie(WLAN_EID_HT_CAPABILITY, ies, ies_len); - if (cap && cap[1] >= sizeof(*params->ht_cap)) - params->ht_cap = (void *)(cap + 2); - cap = cfg80211_find_ie(WLAN_EID_VHT_CAPABILITY, ies, ies_len); - if (cap && cap[1] >= sizeof(*params->vht_cap)) - params->vht_cap = (void *)(cap + 2); - cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_CAPABILITY, ies, ies_len); - if (cap && cap[1] >= sizeof(*params->he_cap) + 1) - params->he_cap = (void *)(cap + 3); - cap = cfg80211_find_ext_ie(WLAN_EID_EXT_HE_OPERATION, ies, ies_len); - if (cap && cap[1] >= sizeof(*params->he_oper) + 1) - params->he_oper = (void *)(cap + 3); + cap = cfg80211_find_elem(WLAN_EID_HT_CAPABILITY, ies, ies_len); + if (cap && cap->datalen >= sizeof(*params->ht_cap)) + params->ht_cap = (void *)cap->data; + cap = cfg80211_find_elem(WLAN_EID_VHT_CAPABILITY, ies, ies_len); + if (cap && cap->datalen >= sizeof(*params->vht_cap)) + params->vht_cap = (void *)cap->data; + cap = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY, ies, ies_len); + if (cap && cap->datalen >= sizeof(*params->he_cap) + 1) + params->he_cap = (void *)(cap->data + 1); + cap = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_OPERATION, ies, ies_len); + if (cap && cap->datalen >= sizeof(*params->he_oper) + 1) + params->he_oper = (void *)(cap->data + 1); } static bool nl80211_get_ap_channel(struct cfg80211_registered_device *rdev, @@ -5319,7 +5470,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_ap_settings params; + struct cfg80211_ap_settings *params; int err; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && @@ -5332,27 +5483,29 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (wdev->beacon_interval) return -EALREADY; - memset(¶ms, 0, sizeof(params)); - /* these are required for START_AP */ if (!info->attrs[NL80211_ATTR_BEACON_INTERVAL] || !info->attrs[NL80211_ATTR_DTIM_PERIOD] || !info->attrs[NL80211_ATTR_BEACON_HEAD]) return -EINVAL; - err = nl80211_parse_beacon(rdev, info->attrs, ¶ms.beacon); + params = kzalloc(sizeof(*params), GFP_KERNEL); + if (!params) + return -ENOMEM; + + err = nl80211_parse_beacon(rdev, info->attrs, ¶ms->beacon); if (err) - return err; + goto out; - params.beacon_interval = + params->beacon_interval = nla_get_u32(info->attrs[NL80211_ATTR_BEACON_INTERVAL]); - params.dtim_period = + params->dtim_period = nla_get_u32(info->attrs[NL80211_ATTR_DTIM_PERIOD]); err = cfg80211_validate_beacon_int(rdev, dev->ieee80211_ptr->iftype, - params.beacon_interval); + params->beacon_interval); if (err) - return err; + goto out; /* * In theory, some of these attributes should be required here @@ -5362,129 +5515,157 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) * additional information -- drivers must check! */ if (info->attrs[NL80211_ATTR_SSID]) { - params.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); - params.ssid_len = + params->ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); + params->ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); - if (params.ssid_len == 0) - return -EINVAL; + if (params->ssid_len == 0) { + err = -EINVAL; + goto out; + } } if (info->attrs[NL80211_ATTR_HIDDEN_SSID]) - params.hidden_ssid = nla_get_u32( + params->hidden_ssid = nla_get_u32( info->attrs[NL80211_ATTR_HIDDEN_SSID]); - params.privacy = !!info->attrs[NL80211_ATTR_PRIVACY]; + params->privacy = !!info->attrs[NL80211_ATTR_PRIVACY]; if (info->attrs[NL80211_ATTR_AUTH_TYPE]) { - params.auth_type = nla_get_u32( + params->auth_type = nla_get_u32( info->attrs[NL80211_ATTR_AUTH_TYPE]); - if (!nl80211_valid_auth_type(rdev, params.auth_type, - NL80211_CMD_START_AP)) - return -EINVAL; + if (!nl80211_valid_auth_type(rdev, params->auth_type, + NL80211_CMD_START_AP)) { + err = -EINVAL; + goto out; + } } else - params.auth_type = NL80211_AUTHTYPE_AUTOMATIC; + params->auth_type = NL80211_AUTHTYPE_AUTOMATIC; - err = nl80211_crypto_settings(rdev, info, ¶ms.crypto, + err = nl80211_crypto_settings(rdev, info, ¶ms->crypto, NL80211_MAX_NR_CIPHER_SUITES); if (err) - return err; + goto out; if (info->attrs[NL80211_ATTR_INACTIVITY_TIMEOUT]) { - if (!(rdev->wiphy.features & NL80211_FEATURE_INACTIVITY_TIMER)) - return -EOPNOTSUPP; - params.inactivity_timeout = nla_get_u16( + if (!(rdev->wiphy.features & NL80211_FEATURE_INACTIVITY_TIMER)) { + err = -EOPNOTSUPP; + goto out; + } + params->inactivity_timeout = nla_get_u16( info->attrs[NL80211_ATTR_INACTIVITY_TIMEOUT]); } if (info->attrs[NL80211_ATTR_P2P_CTWINDOW]) { - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) - return -EINVAL; - params.p2p_ctwindow = + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { + err = -EINVAL; + goto out; + } + params->p2p_ctwindow = nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]); - if (params.p2p_ctwindow != 0 && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN)) - return -EINVAL; + if (params->p2p_ctwindow != 0 && + !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN)) { + err = -EINVAL; + goto out; + } } if (info->attrs[NL80211_ATTR_P2P_OPPPS]) { u8 tmp; - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) - return -EINVAL; + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { + err = -EINVAL; + goto out; + } tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]); - params.p2p_opp_ps = tmp; - if (params.p2p_opp_ps != 0 && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS)) - return -EINVAL; + params->p2p_opp_ps = tmp; + if (params->p2p_opp_ps != 0 && + !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS)) { + err = -EINVAL; + goto out; + } } if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) { - err = nl80211_parse_chandef(rdev, info, ¶ms.chandef); + err = nl80211_parse_chandef(rdev, info, ¶ms->chandef); if (err) - return err; + goto out; } else if (wdev->preset_chandef.chan) { - params.chandef = wdev->preset_chandef; - } else if (!nl80211_get_ap_channel(rdev, ¶ms)) - return -EINVAL; + params->chandef = wdev->preset_chandef; + } else if (!nl80211_get_ap_channel(rdev, params)) { + err = -EINVAL; + goto out; + } - if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, ¶ms.chandef, - wdev->iftype)) - return -EINVAL; + if (!cfg80211_reg_can_beacon_relax(&rdev->wiphy, ¶ms->chandef, + wdev->iftype)) { + err = -EINVAL; + goto out; + } if (info->attrs[NL80211_ATTR_TX_RATES]) { err = nl80211_parse_tx_bitrate_mask(info, info->attrs, NL80211_ATTR_TX_RATES, - ¶ms.beacon_rate, + ¶ms->beacon_rate, dev, false); if (err) - return err; + goto out; - err = validate_beacon_tx_rate(rdev, params.chandef.chan->band, - ¶ms.beacon_rate); + err = validate_beacon_tx_rate(rdev, params->chandef.chan->band, + ¶ms->beacon_rate); if (err) - return err; + goto out; } if (info->attrs[NL80211_ATTR_SMPS_MODE]) { - params.smps_mode = + params->smps_mode = nla_get_u8(info->attrs[NL80211_ATTR_SMPS_MODE]); - switch (params.smps_mode) { + switch (params->smps_mode) { case NL80211_SMPS_OFF: break; case NL80211_SMPS_STATIC: if (!(rdev->wiphy.features & - NL80211_FEATURE_STATIC_SMPS)) - return -EINVAL; + NL80211_FEATURE_STATIC_SMPS)) { + err = -EINVAL; + goto out; + } break; case NL80211_SMPS_DYNAMIC: if (!(rdev->wiphy.features & - NL80211_FEATURE_DYNAMIC_SMPS)) - return -EINVAL; + NL80211_FEATURE_DYNAMIC_SMPS)) { + err = -EINVAL; + goto out; + } break; default: - return -EINVAL; + err = -EINVAL; + goto out; } } else { - params.smps_mode = NL80211_SMPS_OFF; + params->smps_mode = NL80211_SMPS_OFF; } - params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); - if (params.pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) - return -EOPNOTSUPP; + params->pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]); + if (params->pbss && !rdev->wiphy.bands[NL80211_BAND_60GHZ]) { + err = -EOPNOTSUPP; + goto out; + } if (info->attrs[NL80211_ATTR_ACL_POLICY]) { - params.acl = parse_acl_data(&rdev->wiphy, info); - if (IS_ERR(params.acl)) - return PTR_ERR(params.acl); + params->acl = parse_acl_data(&rdev->wiphy, info); + if (IS_ERR(params->acl)) { + err = PTR_ERR(params->acl); + params->acl = NULL; + goto out; + } } - params.twt_responder = + params->twt_responder = nla_get_flag(info->attrs[NL80211_ATTR_TWT_RESPONDER]); if (info->attrs[NL80211_ATTR_HE_OBSS_PD]) { err = nl80211_parse_he_obss_pd( info->attrs[NL80211_ATTR_HE_OBSS_PD], - ¶ms.he_obss_pd); + ¶ms->he_obss_pd); if (err) goto out; } @@ -5492,7 +5673,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_HE_BSS_COLOR]) { err = nl80211_parse_he_bss_color( info->attrs[NL80211_ATTR_HE_BSS_COLOR], - ¶ms.he_bss_color); + ¶ms->he_bss_color); if (err) goto out; } @@ -5500,7 +5681,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_FILS_DISCOVERY]) { err = nl80211_parse_fils_discovery(rdev, info->attrs[NL80211_ATTR_FILS_DISCOVERY], - ¶ms); + params); if (err) goto out; } @@ -5508,24 +5689,35 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP]) { err = nl80211_parse_unsol_bcast_probe_resp( rdev, info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP], - ¶ms); + params); if (err) goto out; } - nl80211_calculate_ap_params(¶ms); + if (info->attrs[NL80211_ATTR_MBSSID_CONFIG]) { + err = nl80211_parse_mbssid_config(&rdev->wiphy, dev, + info->attrs[NL80211_ATTR_MBSSID_CONFIG], + ¶ms->mbssid_config, + params->beacon.mbssid_ies ? + params->beacon.mbssid_ies->cnt : + 0); + if (err) + goto out; + } + + nl80211_calculate_ap_params(params); if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT]) - params.flags |= AP_SETTINGS_EXTERNAL_AUTH_SUPPORT; + params->flags |= AP_SETTINGS_EXTERNAL_AUTH_SUPPORT; wdev_lock(wdev); - err = rdev_start_ap(rdev, dev, ¶ms); + err = rdev_start_ap(rdev, dev, params); if (!err) { - wdev->preset_chandef = params.chandef; - wdev->beacon_interval = params.beacon_interval; - wdev->chandef = params.chandef; - wdev->ssid_len = params.ssid_len; - memcpy(wdev->ssid, params.ssid, wdev->ssid_len); + wdev->preset_chandef = params->chandef; + wdev->beacon_interval = params->beacon_interval; + wdev->chandef = params->chandef; + wdev->ssid_len = params->ssid_len; + memcpy(wdev->ssid, params->ssid, wdev->ssid_len); if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) wdev->conn_owner_nlportid = info->snd_portid; @@ -5533,7 +5725,13 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) wdev_unlock(wdev); out: - kfree(params.acl); + kfree(params->acl); + kfree(params->beacon.mbssid_ies); + if (params->mbssid_config.tx_wdev && + params->mbssid_config.tx_wdev->netdev && + params->mbssid_config.tx_wdev->netdev != dev) + dev_put(params->mbssid_config.tx_wdev->netdev); + kfree(params); return err; } @@ -5558,12 +5756,14 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info) err = nl80211_parse_beacon(rdev, info->attrs, ¶ms); if (err) - return err; + goto out; wdev_lock(wdev); err = rdev_change_beacon(rdev, dev, ¶ms); wdev_unlock(wdev); +out: + kfree(params.mbssid_ies); return err; } @@ -6527,8 +6727,7 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) err = rdev_change_station(rdev, dev, mac_addr, ¶ms); out_put_vlan: - if (params.vlan) - dev_put(params.vlan); + dev_put(params.vlan); return err; } @@ -6763,8 +6962,7 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) err = rdev_add_station(rdev, dev, mac_addr, ¶ms); - if (params.vlan) - dev_put(params.vlan); + dev_put(params.vlan); return err; } @@ -8489,8 +8687,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) goto out_free; nl80211_send_scan_start(rdev, wdev); - if (wdev->netdev) - dev_hold(wdev->netdev); + dev_hold(wdev->netdev); return 0; @@ -9243,12 +9440,14 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) err = nl80211_parse_beacon(rdev, info->attrs, ¶ms.beacon_after); if (err) - return err; + goto free; csa_attrs = kcalloc(NL80211_ATTR_MAX + 1, sizeof(*csa_attrs), GFP_KERNEL); - if (!csa_attrs) - return -ENOMEM; + if (!csa_attrs) { + err = -ENOMEM; + goto free; + } err = nla_parse_nested_deprecated(csa_attrs, NL80211_ATTR_MAX, info->attrs[NL80211_ATTR_CSA_IES], @@ -9366,6 +9565,8 @@ skip_beacons: wdev_unlock(wdev); free: + kfree(params.beacon_after.mbssid_ies); + kfree(params.beacon_csa.mbssid_ies); kfree(csa_attrs); return err; } @@ -11766,8 +11967,9 @@ static int nl80211_set_cqm_rssi(struct genl_info *info, if (n_thresholds) { struct cfg80211_cqm_config *cqm_config; - cqm_config = kzalloc(sizeof(struct cfg80211_cqm_config) + - n_thresholds * sizeof(s32), GFP_KERNEL); + cqm_config = kzalloc(struct_size(cqm_config, rssi_thresholds, + n_thresholds), + GFP_KERNEL); if (!cqm_config) { err = -ENOMEM; goto unlock; @@ -11776,7 +11978,8 @@ static int nl80211_set_cqm_rssi(struct genl_info *info, cqm_config->rssi_hyst = hysteresis; cqm_config->n_rssi_thresholds = n_thresholds; memcpy(cqm_config->rssi_thresholds, thresholds, - n_thresholds * sizeof(s32)); + flex_array_size(cqm_config, rssi_thresholds, + n_thresholds)); wdev->cqm_config = cqm_config; } @@ -14803,6 +15006,131 @@ bad_tid_conf: return ret; } +static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct cfg80211_color_change_settings params = {}; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct nlattr **tb; + u16 offset; + int err; + + if (!rdev->ops->color_change) + return -EOPNOTSUPP; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BSS_COLOR)) + return -EOPNOTSUPP; + + if (wdev->iftype != NL80211_IFTYPE_AP) + return -EOPNOTSUPP; + + if (!info->attrs[NL80211_ATTR_COLOR_CHANGE_COUNT] || + !info->attrs[NL80211_ATTR_COLOR_CHANGE_COLOR] || + !info->attrs[NL80211_ATTR_COLOR_CHANGE_ELEMS]) + return -EINVAL; + + params.count = nla_get_u8(info->attrs[NL80211_ATTR_COLOR_CHANGE_COUNT]); + params.color = nla_get_u8(info->attrs[NL80211_ATTR_COLOR_CHANGE_COLOR]); + + err = nl80211_parse_beacon(rdev, info->attrs, ¶ms.beacon_next); + if (err) + return err; + + tb = kcalloc(NL80211_ATTR_MAX + 1, sizeof(*tb), GFP_KERNEL); + if (!tb) + return -ENOMEM; + + err = nla_parse_nested(tb, NL80211_ATTR_MAX, + info->attrs[NL80211_ATTR_COLOR_CHANGE_ELEMS], + nl80211_policy, info->extack); + if (err) + goto out; + + err = nl80211_parse_beacon(rdev, tb, ¶ms.beacon_color_change); + if (err) + goto out; + + if (!tb[NL80211_ATTR_CNTDWN_OFFS_BEACON]) { + err = -EINVAL; + goto out; + } + + if (nla_len(tb[NL80211_ATTR_CNTDWN_OFFS_BEACON]) != sizeof(u16)) { + err = -EINVAL; + goto out; + } + + offset = nla_get_u16(tb[NL80211_ATTR_CNTDWN_OFFS_BEACON]); + if (offset >= params.beacon_color_change.tail_len) { + err = -EINVAL; + goto out; + } + + if (params.beacon_color_change.tail[offset] != params.count) { + err = -EINVAL; + goto out; + } + + params.counter_offset_beacon = offset; + + if (tb[NL80211_ATTR_CNTDWN_OFFS_PRESP]) { + if (nla_len(tb[NL80211_ATTR_CNTDWN_OFFS_PRESP]) != + sizeof(u16)) { + err = -EINVAL; + goto out; + } + + offset = nla_get_u16(tb[NL80211_ATTR_CNTDWN_OFFS_PRESP]); + if (offset >= params.beacon_color_change.probe_resp_len) { + err = -EINVAL; + goto out; + } + + if (params.beacon_color_change.probe_resp[offset] != + params.count) { + err = -EINVAL; + goto out; + } + + params.counter_offset_presp = offset; + } + + wdev_lock(wdev); + err = rdev_color_change(rdev, dev, ¶ms); + wdev_unlock(wdev); + +out: + kfree(params.beacon_next.mbssid_ies); + kfree(params.beacon_color_change.mbssid_ies); + kfree(tb); + return err; +} + +static int nl80211_set_fils_aad(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct cfg80211_fils_aad fils_aad = {}; + u8 *nonces; + + if (!info->attrs[NL80211_ATTR_MAC] || + !info->attrs[NL80211_ATTR_FILS_KEK] || + !info->attrs[NL80211_ATTR_FILS_NONCES]) + return -EINVAL; + + fils_aad.macaddr = nla_data(info->attrs[NL80211_ATTR_MAC]); + fils_aad.kek_len = nla_len(info->attrs[NL80211_ATTR_FILS_KEK]); + fils_aad.kek = nla_data(info->attrs[NL80211_ATTR_FILS_KEK]); + nonces = nla_data(info->attrs[NL80211_ATTR_FILS_NONCES]); + fils_aad.snonce = nonces; + fils_aad.anonce = nonces + FILS_NONCE_LEN; + + return rdev_set_fils_aad(rdev, dev, &fils_aad); +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -14860,9 +15188,7 @@ static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, return -ENETDOWN; } - if (dev) - dev_hold(dev); - + dev_hold(dev); info->user_ptr[0] = rdev; } @@ -14884,8 +15210,7 @@ static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, if (ops->internal_flags & NL80211_FLAG_NEED_WDEV) { struct wireless_dev *wdev = info->user_ptr[1]; - if (wdev->netdev) - dev_put(wdev->netdev); + dev_put(wdev->netdev); } else { dev_put(info->user_ptr[1]); } @@ -14983,9 +15308,7 @@ static int nl80211_set_sar_specs(struct sk_buff *skb, struct genl_info *info) if (specs > rdev->wiphy.sar_capa->num_freq_ranges) return -EINVAL; - sar_spec = kzalloc(sizeof(*sar_spec) + - specs * sizeof(struct cfg80211_sar_sub_specs), - GFP_KERNEL); + sar_spec = kzalloc(struct_size(sar_spec, sub_specs, specs), GFP_KERNEL); if (!sar_spec) return -ENOMEM; @@ -15801,6 +16124,21 @@ static const struct genl_small_ops nl80211_small_ops[] = { .internal_flags = NL80211_FLAG_NEED_WIPHY | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_COLOR_CHANGE_REQUEST, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = nl80211_color_change, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_NEED_RTNL, + }, + { + .cmd = NL80211_CMD_SET_FILS_AAD, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = nl80211_set_fils_aad, + .flags = GENL_UNS_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP, + }, }; static struct genl_family nl80211_fam __ro_after_init = { @@ -17430,6 +17768,51 @@ void cfg80211_ch_switch_started_notify(struct net_device *dev, } EXPORT_SYMBOL(cfg80211_ch_switch_started_notify); +int cfg80211_bss_color_notify(struct net_device *dev, gfp_t gfp, + enum nl80211_commands cmd, u8 count, + u64 color_bitmap) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + ASSERT_WDEV_LOCK(wdev); + + trace_cfg80211_bss_color_notify(dev, cmd, count, color_bitmap); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, 0, 0, 0, cmd); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + if (cmd == NL80211_CMD_COLOR_CHANGE_STARTED && + nla_put_u32(msg, NL80211_ATTR_COLOR_CHANGE_COUNT, count)) + goto nla_put_failure; + + if (cmd == NL80211_CMD_OBSS_COLOR_COLLISION && + nla_put_u64_64bit(msg, NL80211_ATTR_OBSS_COLOR_BITMAP, + color_bitmap, NL80211_ATTR_PAD)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + return genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), + msg, 0, NL80211_MCGRP_MLME, gfp); + +nla_put_failure: + nlmsg_free(msg); + return -EINVAL; +} +EXPORT_SYMBOL(cfg80211_bss_color_notify); + void nl80211_radar_notify(struct cfg80211_registered_device *rdev, const struct cfg80211_chan_def *chandef, diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c index 36f1b59a78bf..ae2e1a896461 100644 --- a/net/wireless/radiotap.c +++ b/net/wireless/radiotap.c @@ -115,23 +115,22 @@ int ieee80211_radiotap_iterator_init( iterator->_max_length = get_unaligned_le16(&radiotap_header->it_len); iterator->_arg_index = 0; iterator->_bitmap_shifter = get_unaligned_le32(&radiotap_header->it_present); - iterator->_arg = (uint8_t *)radiotap_header + sizeof(*radiotap_header); + iterator->_arg = (uint8_t *)radiotap_header->it_optional; iterator->_reset_on_ext = 0; - iterator->_next_bitmap = &radiotap_header->it_present; - iterator->_next_bitmap++; + iterator->_next_bitmap = radiotap_header->it_optional; iterator->_vns = vns; iterator->current_namespace = &radiotap_ns; iterator->is_radiotap_ns = 1; /* find payload start allowing for extended bitmap(s) */ - if (iterator->_bitmap_shifter & (1<<IEEE80211_RADIOTAP_EXT)) { + if (iterator->_bitmap_shifter & (BIT(IEEE80211_RADIOTAP_EXT))) { if ((unsigned long)iterator->_arg - (unsigned long)iterator->_rtheader + sizeof(uint32_t) > (unsigned long)iterator->_max_length) return -EINVAL; while (get_unaligned_le32(iterator->_arg) & - (1 << IEEE80211_RADIOTAP_EXT)) { + (BIT(IEEE80211_RADIOTAP_EXT))) { iterator->_arg += sizeof(uint32_t); /* diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index b1d37f582dc6..cc1efec4b27b 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -1368,4 +1368,31 @@ static inline int rdev_set_sar_specs(struct cfg80211_registered_device *rdev, return ret; } +static inline int rdev_color_change(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_color_change_settings *params) +{ + int ret; + + trace_rdev_color_change(&rdev->wiphy, dev, params); + ret = rdev->ops->color_change(&rdev->wiphy, dev, params); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + +static inline int +rdev_set_fils_aad(struct cfg80211_registered_device *rdev, + struct net_device *dev, struct cfg80211_fils_aad *fils_aad) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_set_fils_aad(&rdev->wiphy, dev, fils_aad); + if (rdev->ops->set_fils_aad) + ret = rdev->ops->set_fils_aad(&rdev->wiphy, dev, fils_aad); + trace_rdev_return_int(&rdev->wiphy, ret); + + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/reg.c b/net/wireless/reg.c index c2d0ff7f089f..df87c7f3a049 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -171,9 +171,11 @@ enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy) { const struct ieee80211_regdomain *regd = NULL; const struct ieee80211_regdomain *wiphy_regd = NULL; + enum nl80211_dfs_regions dfs_region; rcu_read_lock(); regd = get_cfg80211_regdom(); + dfs_region = regd->dfs_region; if (!wiphy) goto out; @@ -182,6 +184,11 @@ enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy) if (!wiphy_regd) goto out; + if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) { + dfs_region = wiphy_regd->dfs_region; + goto out; + } + if (wiphy_regd->dfs_region == regd->dfs_region) goto out; @@ -193,7 +200,7 @@ enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy) out: rcu_read_unlock(); - return regd->dfs_region; + return dfs_region; } static void rcu_free_regdom(const struct ieee80211_regdomain *r) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 7897b1478c3c..22e92be61938 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -383,7 +383,7 @@ static bool is_bss(struct cfg80211_bss *a, const u8 *bssid, const u8 *ssid, size_t ssid_len) { const struct cfg80211_bss_ies *ies; - const u8 *ssidie; + const struct element *ssid_elem; if (bssid && !ether_addr_equal(a->bssid, bssid)) return false; @@ -394,12 +394,12 @@ static bool is_bss(struct cfg80211_bss *a, const u8 *bssid, ies = rcu_access_pointer(a->ies); if (!ies) return false; - ssidie = cfg80211_find_ie(WLAN_EID_SSID, ies->data, ies->len); - if (!ssidie) + ssid_elem = cfg80211_find_elem(WLAN_EID_SSID, ies->data, ies->len); + if (!ssid_elem) return false; - if (ssidie[1] != ssid_len) + if (ssid_elem->datalen != ssid_len) return false; - return memcmp(ssidie + 2, ssid, ssid_len) == 0; + return memcmp(ssid_elem->data, ssid, ssid_len) == 0; } static int @@ -418,14 +418,17 @@ cfg80211_add_nontrans_list(struct cfg80211_bss *trans_bss, } ssid_len = ssid[1]; ssid = ssid + 2; - rcu_read_unlock(); /* check if nontrans_bss is in the list */ list_for_each_entry(bss, &trans_bss->nontrans_list, nontrans_list) { - if (is_bss(bss, nontrans_bss->bssid, ssid, ssid_len)) + if (is_bss(bss, nontrans_bss->bssid, ssid, ssid_len)) { + rcu_read_unlock(); return 0; + } } + rcu_read_unlock(); + /* add to the list */ list_add_tail(&nontrans_bss->nontrans_list, &trans_bss->nontrans_list); return 0; @@ -975,8 +978,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, } #endif - if (wdev->netdev) - dev_put(wdev->netdev); + dev_put(wdev->netdev); kfree(rdev->int_scan_req); rdev->int_scan_req = NULL; @@ -1792,25 +1794,13 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, return NULL; } -/* - * Update RX channel information based on the available frame payload - * information. This is mainly for the 2.4 GHz band where frames can be received - * from neighboring channels and the Beacon frames use the DSSS Parameter Set - * element to indicate the current (transmitting) channel, but this might also - * be needed on other bands if RX frequency does not match with the actual - * operating channel of a BSS. - */ -static struct ieee80211_channel * -cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, - struct ieee80211_channel *channel, - enum nl80211_bss_scan_width scan_width) +int cfg80211_get_ies_channel_number(const u8 *ie, size_t ielen, + enum nl80211_band band) { const u8 *tmp; - u32 freq; int channel_number = -1; - struct ieee80211_channel *alt_channel; - if (channel->band == NL80211_BAND_S1GHZ) { + if (band == NL80211_BAND_S1GHZ) { tmp = cfg80211_find_ie(WLAN_EID_S1G_OPERATION, ie, ielen); if (tmp && tmp[1] >= sizeof(struct ieee80211_s1g_oper_ie)) { struct ieee80211_s1g_oper_ie *s1gop = (void *)(tmp + 2); @@ -1831,6 +1821,29 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, } } + return channel_number; +} +EXPORT_SYMBOL(cfg80211_get_ies_channel_number); + +/* + * Update RX channel information based on the available frame payload + * information. This is mainly for the 2.4 GHz band where frames can be received + * from neighboring channels and the Beacon frames use the DSSS Parameter Set + * element to indicate the current (transmitting) channel, but this might also + * be needed on other bands if RX frequency does not match with the actual + * operating channel of a BSS. + */ +static struct ieee80211_channel * +cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, + struct ieee80211_channel *channel, + enum nl80211_bss_scan_width scan_width) +{ + u32 freq; + int channel_number; + struct ieee80211_channel *alt_channel; + + channel_number = cfg80211_get_ies_channel_number(ie, ielen, channel->band); + if (channel_number < 0) { /* No channel information in frame payload */ return channel; @@ -2073,12 +2086,12 @@ static void cfg80211_parse_mbssid_data(struct wiphy *wiphy, if (!non_tx_data) return; - if (!cfg80211_find_ie(WLAN_EID_MULTIPLE_BSSID, ie, ielen)) + if (!cfg80211_find_elem(WLAN_EID_MULTIPLE_BSSID, ie, ielen)) return; if (!wiphy->support_mbssid) return; if (wiphy->support_only_he_mbssid && - !cfg80211_find_ext_ie(WLAN_EID_EXT_HE_CAPABILITY, ie, ielen)) + !cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY, ie, ielen)) return; new_ie = kmalloc(IEEE80211_MAX_DATA_LEN, gfp); @@ -2445,10 +2458,10 @@ cfg80211_inform_bss_frame_data(struct wiphy *wiphy, res = cfg80211_inform_single_bss_frame_data(wiphy, data, mgmt, len, gfp); if (!res || !wiphy->support_mbssid || - !cfg80211_find_ie(WLAN_EID_MULTIPLE_BSSID, ie, ielen)) + !cfg80211_find_elem(WLAN_EID_MULTIPLE_BSSID, ie, ielen)) return res; if (wiphy->support_only_he_mbssid && - !cfg80211_find_ext_ie(WLAN_EID_EXT_HE_CAPABILITY, ie, ielen)) + !cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY, ie, ielen)) return res; non_tx_data.tx_bss = res; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 440bce5f0274..ad6c16a06bcb 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -167,6 +167,19 @@ __entry->center_freq1, __entry->freq1_offset, \ __entry->center_freq2 +#define FILS_AAD_ASSIGN(fa) \ + do { \ + if (fa) { \ + ether_addr_copy(__entry->macaddr, fa->macaddr); \ + __entry->kek_len = fa->kek_len; \ + } else { \ + eth_zero_addr(__entry->macaddr); \ + __entry->kek_len = 0; \ + } \ + } while (0) +#define FILS_AAD_PR_FMT \ + "macaddr: %pM, kek_len: %d" + #define SINFO_ENTRY __field(int, generation) \ __field(u32, connected_time) \ __field(u32, inactive_time) \ @@ -2614,6 +2627,24 @@ DEFINE_EVENT(wiphy_wdev_cookie_evt, rdev_abort_pmsr, TP_ARGS(wiphy, wdev, cookie) ); +TRACE_EVENT(rdev_set_fils_aad, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_fils_aad *fils_aad), + TP_ARGS(wiphy, netdev, fils_aad), + TP_STRUCT__entry(WIPHY_ENTRY + NETDEV_ENTRY + __array(u8, macaddr, ETH_ALEN) + __field(u8, kek_len) + ), + TP_fast_assign(WIPHY_ASSIGN; + NETDEV_ASSIGN; + FILS_AAD_ASSIGN(fils_aad); + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " FILS_AAD_PR_FMT, + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->macaddr, + __entry->kek_len) +); + /************************************************************* * cfg80211 exported functions traces * *************************************************************/ @@ -3597,6 +3628,52 @@ TRACE_EVENT(rdev_set_sar_specs, WIPHY_PR_ARG, __entry->type, __entry->num) ); +TRACE_EVENT(rdev_color_change, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct cfg80211_color_change_settings *params), + TP_ARGS(wiphy, netdev, params), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __field(u8, count) + __field(u16, bcn_ofs) + __field(u16, pres_ofs) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + __entry->count = params->count; + __entry->bcn_ofs = params->counter_offset_beacon; + __entry->pres_ofs = params->counter_offset_presp; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT + ", count: %u", + WIPHY_PR_ARG, NETDEV_PR_ARG, + __entry->count) +); + +TRACE_EVENT(cfg80211_bss_color_notify, + TP_PROTO(struct net_device *netdev, + enum nl80211_commands cmd, + u8 count, u64 color_bitmap), + TP_ARGS(netdev, cmd, count, color_bitmap), + TP_STRUCT__entry( + NETDEV_ENTRY + __field(u32, cmd) + __field(u8, count) + __field(u64, color_bitmap) + ), + TP_fast_assign( + NETDEV_ASSIGN; + __entry->cmd = cmd; + __entry->count = count; + __entry->color_bitmap = color_bitmap; + ), + TP_printk(NETDEV_PR_FMT ", cmd: %x, count: %u, bitmap: %llx", + NETDEV_PR_ARG, __entry->cmd, __entry->count, + __entry->color_bitmap) +); + #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/wireless/util.c b/net/wireless/util.c index 18dba3d7c638..5ff1f8726faf 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -80,6 +80,7 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) return 0; /* not supported */ switch (band) { case NL80211_BAND_2GHZ: + case NL80211_BAND_LC: if (chan == 14) return MHZ_TO_KHZ(2484); else if (chan < 14) @@ -209,6 +210,7 @@ static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) WARN_ON(want); break; case NL80211_BAND_2GHZ: + case NL80211_BAND_LC: want = 7; for (i = 0; i < sband->n_bitrates; i++) { switch (sband->bitrates[i].bitrate) { @@ -1028,14 +1030,14 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, !(rdev->wiphy.interface_modes & (1 << ntype))) return -EOPNOTSUPP; - /* if it's part of a bridge, reject changing type to station/ibss */ - if (netif_is_bridge_port(dev) && - (ntype == NL80211_IFTYPE_ADHOC || - ntype == NL80211_IFTYPE_STATION || - ntype == NL80211_IFTYPE_P2P_CLIENT)) - return -EBUSY; - if (ntype != otype) { + /* if it's part of a bridge, reject changing type to station/ibss */ + if (netif_is_bridge_port(dev) && + (ntype == NL80211_IFTYPE_ADHOC || + ntype == NL80211_IFTYPE_STATION || + ntype == NL80211_IFTYPE_P2P_CLIENT)) + return -EBUSY; + dev->ieee80211_ptr->use_4addr = false; dev->ieee80211_ptr->mesh_id_up_len = 0; wdev_lock(dev->ieee80211_ptr); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index d6b500dc4208..f16074eb53c7 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -134,21 +134,6 @@ int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, return 0; } -void xp_release(struct xdp_buff_xsk *xskb) -{ - xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; -} - -static u64 xp_get_handle(struct xdp_buff_xsk *xskb) -{ - u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; - - offset += xskb->pool->headroom; - if (!xskb->pool->unaligned) - return xskb->orig_addr + offset; - return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); -} - static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 8de01aaac4a0..90c4e1e819d3 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -44,12 +44,13 @@ void xp_destroy(struct xsk_buff_pool *pool) struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, struct xdp_umem *umem) { + bool unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; struct xsk_buff_pool *pool; struct xdp_buff_xsk *xskb; - u32 i; + u32 i, entries; - pool = kvzalloc(struct_size(pool, free_heads, umem->chunks), - GFP_KERNEL); + entries = unaligned ? umem->chunks : 0; + pool = kvzalloc(struct_size(pool, free_heads, entries), GFP_KERNEL); if (!pool) goto out; @@ -63,7 +64,8 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, pool->free_heads_cnt = umem->chunks; pool->headroom = umem->headroom; pool->chunk_size = umem->chunk_size; - pool->unaligned = umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; + pool->chunk_shift = ffs(umem->chunk_size) - 1; + pool->unaligned = unaligned; pool->frame_len = umem->chunk_size - umem->headroom - XDP_PACKET_HEADROOM; pool->umem = umem; @@ -81,7 +83,10 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, xskb = &pool->heads[i]; xskb->pool = pool; xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; - pool->free_heads[i] = xskb; + if (pool->unaligned) + pool->free_heads[i] = xskb; + else + xp_init_xskb_addr(xskb, pool, i * pool->chunk_size); } return pool; @@ -406,6 +411,12 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, if (pool->unaligned) xp_check_dma_contiguity(dma_map); + else + for (i = 0; i < pool->heads_cnt; i++) { + struct xdp_buff_xsk *xskb = &pool->heads[i]; + + xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr); + } err = xp_init_dma_info(pool, dma_map); if (err) { @@ -448,12 +459,9 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) if (pool->free_heads_cnt == 0) return NULL; - xskb = pool->free_heads[--pool->free_heads_cnt]; - for (;;) { if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { pool->fq->queue_empty_descs++; - xp_release(xskb); return NULL; } @@ -466,17 +474,17 @@ static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) } break; } - xskq_cons_release(pool->fq); - xskb->orig_addr = addr; - xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; - if (pool->dma_pages_cnt) { - xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] & - ~XSK_NEXT_PG_CONTIG_MASK) + - (addr & ~PAGE_MASK); - xskb->dma = xskb->frame_dma + pool->headroom + - XDP_PACKET_HEADROOM; + if (pool->unaligned) { + xskb = pool->free_heads[--pool->free_heads_cnt]; + xp_init_xskb_addr(xskb, pool, addr); + if (pool->dma_pages_cnt) + xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); + } else { + xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; } + + xskq_cons_release(pool->fq); return xskb; } @@ -507,6 +515,96 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) } EXPORT_SYMBOL(xp_alloc); +static u32 xp_alloc_new_from_fq(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) +{ + u32 i, cached_cons, nb_entries; + + if (max > pool->free_heads_cnt) + max = pool->free_heads_cnt; + max = xskq_cons_nb_entries(pool->fq, max); + + cached_cons = pool->fq->cached_cons; + nb_entries = max; + i = max; + while (i--) { + struct xdp_buff_xsk *xskb; + u64 addr; + bool ok; + + __xskq_cons_read_addr_unchecked(pool->fq, cached_cons++, &addr); + + ok = pool->unaligned ? xp_check_unaligned(pool, &addr) : + xp_check_aligned(pool, &addr); + if (unlikely(!ok)) { + pool->fq->invalid_descs++; + nb_entries--; + continue; + } + + if (pool->unaligned) { + xskb = pool->free_heads[--pool->free_heads_cnt]; + xp_init_xskb_addr(xskb, pool, addr); + if (pool->dma_pages_cnt) + xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); + } else { + xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; + } + + *xdp = &xskb->xdp; + xdp++; + } + + xskq_cons_release_n(pool->fq, max); + return nb_entries; +} + +static u32 xp_alloc_reused(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 nb_entries) +{ + struct xdp_buff_xsk *xskb; + u32 i; + + nb_entries = min_t(u32, nb_entries, pool->free_list_cnt); + + i = nb_entries; + while (i--) { + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node); + list_del(&xskb->free_list_node); + + *xdp = &xskb->xdp; + xdp++; + } + pool->free_list_cnt -= nb_entries; + + return nb_entries; +} + +u32 xp_alloc_batch(struct xsk_buff_pool *pool, struct xdp_buff **xdp, u32 max) +{ + u32 nb_entries1 = 0, nb_entries2; + + if (unlikely(pool->dma_need_sync)) { + /* Slow path */ + *xdp = xp_alloc(pool); + return !!*xdp; + } + + if (unlikely(pool->free_list_cnt)) { + nb_entries1 = xp_alloc_reused(pool, xdp, max); + if (nb_entries1 == max) + return nb_entries1; + + max -= nb_entries1; + xdp += nb_entries1; + } + + nb_entries2 = xp_alloc_new_from_fq(pool, xdp, max); + if (!nb_entries2) + pool->fq->queue_empty_descs++; + + return nb_entries1 + nb_entries2; +} +EXPORT_SYMBOL(xp_alloc_batch); + bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count) { if (pool->free_list_cnt >= count) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 9ae13cccfb28..e9aa2c236356 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -111,14 +111,18 @@ struct xsk_queue { /* Functions that read and validate content from consumer rings. */ -static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) +static inline void __xskq_cons_read_addr_unchecked(struct xsk_queue *q, u32 cached_cons, u64 *addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + u32 idx = cached_cons & q->ring_mask; - if (q->cached_cons != q->cached_prod) { - u32 idx = q->cached_cons & q->ring_mask; + *addr = ring->desc[idx]; +} - *addr = ring->desc[idx]; +static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) +{ + if (q->cached_cons != q->cached_prod) { + __xskq_cons_read_addr_unchecked(q, q->cached_cons, addr); return true; } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 3df0861d4390..70a8c36f0ba6 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -530,7 +530,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } - if ((err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) { + if (xfrm_parse_spi(skb, nexthdr, &spi, &seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } @@ -560,7 +560,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) } seq = 0; - if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) { + if (!spi && xfrm_parse_spi(skb, nexthdr, &spi, &seq)) { secpath_reset(skb); XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7f881f5a5897..1a06585022ab 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2486,9 +2486,7 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) xdst = dst_alloc(dst_ops, NULL, 1, DST_OBSOLETE_NONE, 0); if (likely(xdst)) { - struct dst_entry *dst = &xdst->u.dst; - - memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst)); + memset_after(xdst, 0, u.dst); } else xdst = ERR_PTR(-ENOBUFS); @@ -3157,6 +3155,11 @@ ok: return dst; nopol: + if (!(dst_orig->dev->flags & IFF_LOOPBACK) && + !xfrm_default_allow(net, dir)) { + err = -EPERM; + goto error; + } if (!(flags & XFRM_LOOKUP_ICMP)) { dst = dst_orig; goto ok; @@ -3545,6 +3548,11 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, } if (!pol) { + if (!xfrm_default_allow(net, dir)) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); + return 0; + } + if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) { xfrm_secpath_reject(xerr_idx, skb, &fl); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); @@ -3599,6 +3607,12 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, tpp[ti++] = &pols[pi]->xfrm_vec[i]; } xfrm_nr = ti; + + if (!xfrm_default_allow(net, dir) && !xfrm_nr) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); + goto reject; + } + if (npols > 1) { xfrm_tmpl_sort(stp, tpp, xfrm_nr, family); tpp = stp; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 7aff641c717d..7c36cc1f3d79 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1961,6 +1961,102 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, return skb; } +static int xfrm_notify_userpolicy(struct net *net) +{ + struct xfrm_userpolicy_default *up; + int len = NLMSG_ALIGN(sizeof(*up)); + struct nlmsghdr *nlh; + struct sk_buff *skb; + int err; + + skb = nlmsg_new(len, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + nlh = nlmsg_put(skb, 0, 0, XFRM_MSG_GETDEFAULT, sizeof(*up), 0); + if (nlh == NULL) { + kfree_skb(skb); + return -EMSGSIZE; + } + + up = nlmsg_data(nlh); + up->in = net->xfrm.policy_default & XFRM_POL_DEFAULT_IN ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + up->fwd = net->xfrm.policy_default & XFRM_POL_DEFAULT_FWD ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + up->out = net->xfrm.policy_default & XFRM_POL_DEFAULT_OUT ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + + nlmsg_end(skb, nlh); + + rcu_read_lock(); + err = xfrm_nlmsg_multicast(net, skb, 0, XFRMNLGRP_POLICY); + rcu_read_unlock(); + + return err; +} + +static int xfrm_set_default(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct net *net = sock_net(skb->sk); + struct xfrm_userpolicy_default *up = nlmsg_data(nlh); + + if (up->in == XFRM_USERPOLICY_BLOCK) + net->xfrm.policy_default |= XFRM_POL_DEFAULT_IN; + else if (up->in == XFRM_USERPOLICY_ACCEPT) + net->xfrm.policy_default &= ~XFRM_POL_DEFAULT_IN; + + if (up->fwd == XFRM_USERPOLICY_BLOCK) + net->xfrm.policy_default |= XFRM_POL_DEFAULT_FWD; + else if (up->fwd == XFRM_USERPOLICY_ACCEPT) + net->xfrm.policy_default &= ~XFRM_POL_DEFAULT_FWD; + + if (up->out == XFRM_USERPOLICY_BLOCK) + net->xfrm.policy_default |= XFRM_POL_DEFAULT_OUT; + else if (up->out == XFRM_USERPOLICY_ACCEPT) + net->xfrm.policy_default &= ~XFRM_POL_DEFAULT_OUT; + + rt_genid_bump_all(net); + + xfrm_notify_userpolicy(net); + return 0; +} + +static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh, + struct nlattr **attrs) +{ + struct sk_buff *r_skb; + struct nlmsghdr *r_nlh; + struct net *net = sock_net(skb->sk); + struct xfrm_userpolicy_default *r_up; + int len = NLMSG_ALIGN(sizeof(struct xfrm_userpolicy_default)); + u32 portid = NETLINK_CB(skb).portid; + u32 seq = nlh->nlmsg_seq; + + r_skb = nlmsg_new(len, GFP_ATOMIC); + if (!r_skb) + return -ENOMEM; + + r_nlh = nlmsg_put(r_skb, portid, seq, XFRM_MSG_GETDEFAULT, sizeof(*r_up), 0); + if (!r_nlh) { + kfree_skb(r_skb); + return -EMSGSIZE; + } + + r_up = nlmsg_data(r_nlh); + + r_up->in = net->xfrm.policy_default & XFRM_POL_DEFAULT_IN ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + r_up->fwd = net->xfrm.policy_default & XFRM_POL_DEFAULT_FWD ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + r_up->out = net->xfrm.policy_default & XFRM_POL_DEFAULT_OUT ? + XFRM_USERPOLICY_BLOCK : XFRM_USERPOLICY_ACCEPT; + nlmsg_end(r_skb, r_nlh); + + return nlmsg_unicast(net->xfrm.nlsk, r_skb, portid); +} + static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attrs) { @@ -2664,6 +2760,8 @@ const int xfrm_msg_min[XFRM_NR_MSGTYPES] = { [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_NEWSPDINFO - XFRM_MSG_BASE] = sizeof(u32), [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = sizeof(u32), + [XFRM_MSG_SETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default), + [XFRM_MSG_GETDEFAULT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_default), }; EXPORT_SYMBOL_GPL(xfrm_msg_min); @@ -2743,6 +2841,8 @@ static const struct xfrm_link { .nla_pol = xfrma_spd_policy, .nla_max = XFRMA_SPD_MAX }, [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo }, + [XFRM_MSG_SETDEFAULT - XFRM_MSG_BASE] = { .doit = xfrm_set_default }, + [XFRM_MSG_GETDEFAULT - XFRM_MSG_BASE] = { .doit = xfrm_get_default }, }; static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2855,7 +2955,7 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct copy_to_user_state(x, &ue->state); ue->hard = (c->data.hard != 0) ? 1 : 0; /* clear the padding bytes */ - memset(&ue->hard + 1, 0, sizeof(*ue) - offsetofend(typeof(*ue), hard)); + memset_after(ue, 0, hard); err = xfrm_mark_put(skb, &x->mark); if (err) |
