diff options
Diffstat (limited to 'net')
242 files changed, 4897 insertions, 3688 deletions
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 134419667d59..a000b1ef0520 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -135,11 +135,14 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], return 0; } -static int vlan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int vlan_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); struct vlan_dev_priv *vlan = vlan_dev_priv(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct net_device *real_dev; unsigned int max_mtu; __be16 proto; @@ -155,7 +158,7 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev, return -EINVAL; } - real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + real_dev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (!real_dev) { NL_SET_ERR_MSG_MOD(extack, "link does not exist"); return -ENODEV; diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 9fa0b246902b..05cbb3c227c5 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -432,49 +432,18 @@ static struct atalk_addr *__aarp_proxy_find(struct net_device *dev, return a ? sa : NULL; } -/* - * Probe a Phase 1 device or a device that requires its Net:Node to - * be set via an ioctl. - */ -static void aarp_send_probe_phase1(struct atalk_iface *iface) -{ - struct ifreq atreq; - struct sockaddr_at *sa = (struct sockaddr_at *)&atreq.ifr_addr; - const struct net_device_ops *ops = iface->dev->netdev_ops; - - sa->sat_addr.s_node = iface->address.s_node; - sa->sat_addr.s_net = ntohs(iface->address.s_net); - - /* We pass the Net:Node to the drivers/cards by a Device ioctl. */ - if (!(ops->ndo_do_ioctl(iface->dev, &atreq, SIOCSIFADDR))) { - ops->ndo_do_ioctl(iface->dev, &atreq, SIOCGIFADDR); - if (iface->address.s_net != htons(sa->sat_addr.s_net) || - iface->address.s_node != sa->sat_addr.s_node) - iface->status |= ATIF_PROBE_FAIL; - - iface->address.s_net = htons(sa->sat_addr.s_net); - iface->address.s_node = sa->sat_addr.s_node; - } -} - - void aarp_probe_network(struct atalk_iface *atif) { - if (atif->dev->type == ARPHRD_LOCALTLK || - atif->dev->type == ARPHRD_PPP) - aarp_send_probe_phase1(atif); - else { - unsigned int count; + unsigned int count; - for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) { - aarp_send_probe(atif->dev, &atif->address); + for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) { + aarp_send_probe(atif->dev, &atif->address); - /* Defer 1/10th */ - msleep(100); + /* Defer 1/10th */ + msleep(100); - if (atif->status & ATIF_PROBE_FAIL) - break; - } + if (atif->status & ATIF_PROBE_FAIL) + break; } } diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index aa6c714892ec..9f3b8b682adb 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -685,6 +685,15 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; } + if (ax25->ax25_dev) { + if (dev == ax25->ax25_dev->dev) { + rcu_read_unlock(); + break; + } + netdev_put(ax25->ax25_dev->dev, &ax25->dev_tracker); + ax25_dev_put(ax25->ax25_dev); + } + ax25->ax25_dev = ax25_dev_ax25dev(dev); if (!ax25->ax25_dev) { rcu_read_unlock(); @@ -692,6 +701,8 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname, break; } ax25_fillin_cb(ax25, ax25->ax25_dev); + netdev_hold(dev, &ax25->dev_tracker, GFP_ATOMIC); + ax25_dev_hold(ax25->ax25_dev); rcu_read_unlock(); break; diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index ac11f1f08db0..d35479c465e2 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -113,8 +113,6 @@ static void batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh) { ewma_throughput_init(&hardif_neigh->bat_v.throughput); - INIT_WORK(&hardif_neigh->bat_v.metric_work, - batadv_v_elp_throughput_metric_update); } /** diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 1d704574e6bf..b065578b4436 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -18,6 +18,7 @@ #include <linux/if_ether.h> #include <linux/jiffies.h> #include <linux/kref.h> +#include <linux/list.h> #include <linux/minmax.h> #include <linux/netdevice.h> #include <linux/nl80211.h> @@ -26,6 +27,7 @@ #include <linux/rcupdate.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> +#include <linux/slab.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> @@ -42,6 +44,18 @@ #include "send.h" /** + * struct batadv_v_metric_queue_entry - list of hardif neighbors which require + * and metric update + */ +struct batadv_v_metric_queue_entry { + /** @hardif_neigh: hardif neighbor scheduled for metric update */ + struct batadv_hardif_neigh_node *hardif_neigh; + + /** @list: list node for metric_queue */ + struct list_head list; +}; + +/** * batadv_v_elp_start_timer() - restart timer for ELP periodic work * @hard_iface: the interface for which the timer has to be reset */ @@ -59,25 +73,36 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) /** * batadv_v_elp_get_throughput() - get the throughput towards a neighbour * @neigh: the neighbour for which the throughput has to be obtained + * @pthroughput: calculated throughput towards the given neighbour in multiples + * of 100kpbs (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc). * - * Return: The throughput towards the given neighbour in multiples of 100kpbs - * (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc). + * Return: true when value behind @pthroughput was set */ -static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) +static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh, + u32 *pthroughput) { struct batadv_hard_iface *hard_iface = neigh->if_incoming; + struct net_device *soft_iface = hard_iface->soft_iface; struct ethtool_link_ksettings link_settings; struct net_device *real_netdev; struct station_info sinfo; u32 throughput; int ret; + /* don't query throughput when no longer associated with any + * batman-adv interface + */ + if (!soft_iface) + return false; + /* if the user specified a customised value for this interface, then * return it directly */ throughput = atomic_read(&hard_iface->bat_v.throughput_override); - if (throughput != 0) - return throughput; + if (throughput != 0) { + *pthroughput = throughput; + return true; + } /* if this is a wireless device, then ask its throughput through * cfg80211 API @@ -104,27 +129,39 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) * possible to delete this neighbor. For now set * the throughput metric to 0. */ - return 0; + *pthroughput = 0; + return true; } if (ret) goto default_throughput; - if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) - return sinfo.expected_throughput / 100; + if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) { + *pthroughput = sinfo.expected_throughput / 100; + return true; + } /* try to estimate the expected throughput based on reported tx * rates */ - if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) - return cfg80211_calculate_bitrate(&sinfo.txrate) / 3; + if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) { + *pthroughput = cfg80211_calculate_bitrate(&sinfo.txrate) / 3; + return true; + } goto default_throughput; } + /* only use rtnl_trylock because the elp worker will be cancelled while + * the rntl_lock is held. the cancel_delayed_work_sync() would otherwise + * wait forever when the elp work_item was started and it is then also + * trying to rtnl_lock + */ + if (!rtnl_trylock()) + return false; + /* if not a wifi interface, check if this device provides data via * ethtool (e.g. an Ethernet adapter) */ - rtnl_lock(); ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings); rtnl_unlock(); if (ret == 0) { @@ -135,13 +172,15 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX; throughput = link_settings.base.speed; - if (throughput && throughput != SPEED_UNKNOWN) - return throughput * 10; + if (throughput && throughput != SPEED_UNKNOWN) { + *pthroughput = throughput * 10; + return true; + } } default_throughput: if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT)) { - batadv_info(hard_iface->soft_iface, + batadv_info(soft_iface, "WiFi driver or ethtool info does not provide information about link speeds on interface %s, therefore defaulting to hardcoded throughput values of %u.%1u Mbps. Consider overriding the throughput manually or checking your driver.\n", hard_iface->net_dev->name, BATADV_THROUGHPUT_DEFAULT_VALUE / 10, @@ -150,31 +189,26 @@ default_throughput: } /* if none of the above cases apply, return the base_throughput */ - return BATADV_THROUGHPUT_DEFAULT_VALUE; + *pthroughput = BATADV_THROUGHPUT_DEFAULT_VALUE; + return true; } /** * batadv_v_elp_throughput_metric_update() - worker updating the throughput * metric of a single hop neighbour - * @work: the work queue item + * @neigh: the neighbour to probe */ -void batadv_v_elp_throughput_metric_update(struct work_struct *work) +static void +batadv_v_elp_throughput_metric_update(struct batadv_hardif_neigh_node *neigh) { - struct batadv_hardif_neigh_node_bat_v *neigh_bat_v; - struct batadv_hardif_neigh_node *neigh; - - neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v, - metric_work); - neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node, - bat_v); + u32 throughput; + bool valid; - ewma_throughput_add(&neigh->bat_v.throughput, - batadv_v_elp_get_throughput(neigh)); + valid = batadv_v_elp_get_throughput(neigh, &throughput); + if (!valid) + return; - /* decrement refcounter to balance increment performed before scheduling - * this task - */ - batadv_hardif_neigh_put(neigh); + ewma_throughput_add(&neigh->bat_v.throughput, throughput); } /** @@ -248,14 +282,16 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh) */ static void batadv_v_elp_periodic_work(struct work_struct *work) { + struct batadv_v_metric_queue_entry *metric_entry; + struct batadv_v_metric_queue_entry *metric_safe; struct batadv_hardif_neigh_node *hardif_neigh; struct batadv_hard_iface *hard_iface; struct batadv_hard_iface_bat_v *bat_v; struct batadv_elp_packet *elp_packet; + struct list_head metric_queue; struct batadv_priv *bat_priv; struct sk_buff *skb; u32 elp_interval; - bool ret; bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work); hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v); @@ -291,6 +327,8 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) atomic_inc(&hard_iface->bat_v.elp_seqno); + INIT_LIST_HEAD(&metric_queue); + /* The throughput metric is updated on each sent packet. This way, if a * node is dead and no longer sends packets, batman-adv is still able to * react timely to its death. @@ -315,16 +353,28 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) /* Reading the estimated throughput from cfg80211 is a task that * may sleep and that is not allowed in an rcu protected - * context. Therefore schedule a task for that. + * context. Therefore add it to metric_queue and process it + * outside rcu protected context. */ - ret = queue_work(batadv_event_workqueue, - &hardif_neigh->bat_v.metric_work); - - if (!ret) + metric_entry = kzalloc(sizeof(*metric_entry), GFP_ATOMIC); + if (!metric_entry) { batadv_hardif_neigh_put(hardif_neigh); + continue; + } + + metric_entry->hardif_neigh = hardif_neigh; + list_add(&metric_entry->list, &metric_queue); } rcu_read_unlock(); + list_for_each_entry_safe(metric_entry, metric_safe, &metric_queue, list) { + batadv_v_elp_throughput_metric_update(metric_entry->hardif_neigh); + + batadv_hardif_neigh_put(metric_entry->hardif_neigh); + list_del(&metric_entry->list); + kfree(metric_entry); + } + restart_timer: batadv_v_elp_start_timer(hard_iface); out: diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index 9e2740195fa2..c9cb0a307100 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -10,7 +10,6 @@ #include "main.h" #include <linux/skbuff.h> -#include <linux/workqueue.h> int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface); void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface); @@ -19,6 +18,5 @@ void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface, void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface); int batadv_v_elp_packet_recv(struct sk_buff *skb, struct batadv_hard_iface *if_incoming); -void batadv_v_elp_throughput_metric_update(struct work_struct *work); #endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */ diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 822d788a5f86..e190fa954f22 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1037,7 +1037,7 @@ static void batadv_softif_init_early(struct net_device *dev) dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; - dev->netns_local = true; + dev->netns_immutable = true; /* can't call min_mtu, because the needed variables * have not been initialized yet @@ -1077,19 +1077,18 @@ static int batadv_softif_validate(struct nlattr *tb[], struct nlattr *data[], /** * batadv_softif_newlink() - pre-initialize and register new batadv link - * @src_net: the applicable net namespace * @dev: network device to register - * @tb: IFLA_INFO_DATA netlink attributes - * @data: enum batadv_ifla_attrs attributes + * @params: rtnl newlink parameters * @extack: extended ACK report struct * * Return: 0 if successful or error otherwise. */ -static int batadv_softif_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int batadv_softif_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct batadv_priv *bat_priv = netdev_priv(dev); + struct nlattr **data = params->data; const char *algo_name; int err; diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 3c0a14a582e4..d4b71d34310f 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -3937,23 +3937,21 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_change *tt_change; struct batadv_tvlv_tt_data *tt_data; u16 num_entries, num_vlan; - size_t flex_size; + size_t tt_data_sz; if (tvlv_value_len < sizeof(*tt_data)) return; tt_data = tvlv_value; - tvlv_value_len -= sizeof(*tt_data); - num_vlan = ntohs(tt_data->num_vlan); - flex_size = flex_array_size(tt_data, vlan_data, num_vlan); - if (tvlv_value_len < flex_size) + tt_data_sz = struct_size(tt_data, vlan_data, num_vlan); + if (tvlv_value_len < tt_data_sz) return; tt_change = (struct batadv_tvlv_tt_change *)((void *)tt_data - + flex_size); - tvlv_value_len -= flex_size; + + tt_data_sz); + tvlv_value_len -= tt_data_sz; num_entries = batadv_tt_entries(tvlv_value_len); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index f491bff8c51b..fe89f08533fe 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -596,9 +596,6 @@ struct batadv_hardif_neigh_node_bat_v { * neighbor */ unsigned long last_unicast_tx; - - /** @metric_work: work queue callback item for metric update */ - struct work_struct metric_work; }; /** diff --git a/net/bluetooth/hidp/Kconfig b/net/bluetooth/hidp/Kconfig index 6746be07e222..e08aae35351a 100644 --- a/net/bluetooth/hidp/Kconfig +++ b/net/bluetooth/hidp/Kconfig @@ -1,8 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config BT_HIDP tristate "HIDP protocol support" - depends on BT_BREDR && INPUT && HID_SUPPORT - select HID + depends on BT_BREDR && HID help HIDP (Human Interface Device Protocol) is a transport layer for HID reports. HIDP is required for the Bluetooth Human diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 27b4c4a2ba1f..b22078b67972 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -119,7 +119,6 @@ static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn, { struct l2cap_chan *c; - mutex_lock(&conn->chan_lock); c = __l2cap_get_chan_by_scid(conn, cid); if (c) { /* Only lock if chan reference is not 0 */ @@ -127,7 +126,6 @@ static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn, if (c) l2cap_chan_lock(c); } - mutex_unlock(&conn->chan_lock); return c; } @@ -140,7 +138,6 @@ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn, { struct l2cap_chan *c; - mutex_lock(&conn->chan_lock); c = __l2cap_get_chan_by_dcid(conn, cid); if (c) { /* Only lock if chan reference is not 0 */ @@ -148,7 +145,6 @@ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn, if (c) l2cap_chan_lock(c); } - mutex_unlock(&conn->chan_lock); return c; } @@ -418,7 +414,7 @@ static void l2cap_chan_timeout(struct work_struct *work) if (!conn) return; - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); /* __set_chan_timer() calls l2cap_chan_hold(chan) while scheduling * this work. No need to call l2cap_chan_hold(chan) here again. */ @@ -439,7 +435,7 @@ static void l2cap_chan_timeout(struct work_struct *work) l2cap_chan_unlock(chan); l2cap_chan_put(chan); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } struct l2cap_chan *l2cap_chan_create(void) @@ -636,14 +632,15 @@ void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) test_bit(FLAG_HOLD_HCI_CONN, &chan->flags)) hci_conn_hold(conn->hcon); - list_add(&chan->list, &conn->chan_l); + /* Append to the list since the order matters for ECRED */ + list_add_tail(&chan->list, &conn->chan_l); } void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan) { - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); __l2cap_chan_add(conn, chan); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } void l2cap_chan_del(struct l2cap_chan *chan, int err) @@ -731,9 +728,9 @@ void l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func, if (!conn) return; - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); __l2cap_chan_list(conn, func, data); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } EXPORT_SYMBOL_GPL(l2cap_chan_list); @@ -745,7 +742,7 @@ static void l2cap_conn_update_id_addr(struct work_struct *work) struct hci_conn *hcon = conn->hcon; struct l2cap_chan *chan; - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); list_for_each_entry(chan, &conn->chan_l, list) { l2cap_chan_lock(chan); @@ -754,7 +751,7 @@ static void l2cap_conn_update_id_addr(struct work_struct *work) l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan) @@ -948,6 +945,16 @@ static u8 l2cap_get_ident(struct l2cap_conn *conn) return id; } +static void l2cap_send_acl(struct l2cap_conn *conn, struct sk_buff *skb, + u8 flags) +{ + /* Check if the hcon still valid before attempting to send */ + if (hci_conn_valid(conn->hcon->hdev, conn->hcon)) + hci_send_acl(conn->hchan, skb, flags); + else + kfree_skb(skb); +} + static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, void *data) { @@ -970,7 +977,7 @@ static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, bt_cb(skb)->force_active = BT_POWER_FORCE_ACTIVE_ON; skb->priority = HCI_PRIO_MAX; - hci_send_acl(conn->hchan, skb, flags); + l2cap_send_acl(conn, skb, flags); } static void l2cap_do_send(struct l2cap_chan *chan, struct sk_buff *skb) @@ -1497,8 +1504,6 @@ static void l2cap_conn_start(struct l2cap_conn *conn) BT_DBG("conn %p", conn); - mutex_lock(&conn->chan_lock); - list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { l2cap_chan_lock(chan); @@ -1567,8 +1572,6 @@ static void l2cap_conn_start(struct l2cap_conn *conn) l2cap_chan_unlock(chan); } - - mutex_unlock(&conn->chan_lock); } static void l2cap_le_conn_ready(struct l2cap_conn *conn) @@ -1614,7 +1617,7 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) if (hcon->type == ACL_LINK) l2cap_request_info(conn); - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); list_for_each_entry(chan, &conn->chan_l, list) { @@ -1632,7 +1635,7 @@ static void l2cap_conn_ready(struct l2cap_conn *conn) l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); if (hcon->type == LE_LINK) l2cap_le_conn_ready(conn); @@ -1647,14 +1650,10 @@ static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err) BT_DBG("conn %p", conn); - mutex_lock(&conn->chan_lock); - list_for_each_entry(chan, &conn->chan_l, list) { if (test_bit(FLAG_FORCE_RELIABLE, &chan->flags)) l2cap_chan_set_err(chan, err); } - - mutex_unlock(&conn->chan_lock); } static void l2cap_info_timeout(struct work_struct *work) @@ -1665,7 +1664,9 @@ static void l2cap_info_timeout(struct work_struct *work) conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; conn->info_ident = 0; + mutex_lock(&conn->lock); l2cap_conn_start(conn); + mutex_unlock(&conn->lock); } /* @@ -1757,6 +1758,8 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); + mutex_lock(&conn->lock); + kfree_skb(conn->rx_skb); skb_queue_purge(&conn->pending_rx); @@ -1775,8 +1778,6 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) /* Force the connection to be immediately dropped */ hcon->disc_timeout = 0; - mutex_lock(&conn->chan_lock); - /* Kill channels */ list_for_each_entry_safe(chan, l, &conn->chan_l, list) { l2cap_chan_hold(chan); @@ -1790,15 +1791,14 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) l2cap_chan_put(chan); } - mutex_unlock(&conn->chan_lock); - - hci_chan_del(conn->hchan); - if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) cancel_delayed_work_sync(&conn->info_timer); - hcon->l2cap_data = NULL; + hci_chan_del(conn->hchan); conn->hchan = NULL; + + hcon->l2cap_data = NULL; + mutex_unlock(&conn->lock); l2cap_conn_put(conn); } @@ -2916,8 +2916,6 @@ static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb) BT_DBG("conn %p", conn); - mutex_lock(&conn->chan_lock); - list_for_each_entry(chan, &conn->chan_l, list) { if (chan->chan_type != L2CAP_CHAN_RAW) continue; @@ -2932,8 +2930,6 @@ static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb) if (chan->ops->recv(chan, nskb)) kfree_skb(nskb); } - - mutex_unlock(&conn->chan_lock); } /* ---- L2CAP signalling commands ---- */ @@ -3776,7 +3772,11 @@ static void l2cap_ecred_rsp_defer(struct l2cap_chan *chan, void *data) struct l2cap_ecred_conn_rsp *rsp_flex = container_of(&rsp->pdu.rsp, struct l2cap_ecred_conn_rsp, hdr); - if (test_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags)) + /* Check if channel for outgoing connection or if it wasn't deferred + * since in those cases it must be skipped. + */ + if (test_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags) || + !test_and_clear_bit(FLAG_DEFER_SETUP, &chan->flags)) return; /* Reset ident so only one response is sent */ @@ -3952,7 +3952,6 @@ static void l2cap_connect(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, goto response; } - mutex_lock(&conn->chan_lock); l2cap_chan_lock(pchan); /* Check if the ACL is secure enough (if not SDP) */ @@ -4059,7 +4058,6 @@ response: } l2cap_chan_unlock(pchan); - mutex_unlock(&conn->chan_lock); l2cap_chan_put(pchan); } @@ -4098,27 +4096,19 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, BT_DBG("dcid 0x%4.4x scid 0x%4.4x result 0x%2.2x status 0x%2.2x", dcid, scid, result, status); - mutex_lock(&conn->chan_lock); - if (scid) { chan = __l2cap_get_chan_by_scid(conn, scid); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; } else { chan = __l2cap_get_chan_by_ident(conn, cmd->ident); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; } chan = l2cap_chan_hold_unless_zero(chan); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; err = 0; @@ -4156,9 +4146,6 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, l2cap_chan_unlock(chan); l2cap_chan_put(chan); -unlock: - mutex_unlock(&conn->chan_lock); - return err; } @@ -4446,11 +4433,7 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn, chan->ops->set_shutdown(chan); - l2cap_chan_unlock(chan); - mutex_lock(&conn->chan_lock); - l2cap_chan_lock(chan); l2cap_chan_del(chan, ECONNRESET); - mutex_unlock(&conn->chan_lock); chan->ops->close(chan); @@ -4487,11 +4470,7 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, return 0; } - l2cap_chan_unlock(chan); - mutex_lock(&conn->chan_lock); - l2cap_chan_lock(chan); l2cap_chan_del(chan, 0); - mutex_unlock(&conn->chan_lock); chan->ops->close(chan); @@ -4689,13 +4668,9 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, BT_DBG("dcid 0x%4.4x mtu %u mps %u credits %u result 0x%2.2x", dcid, mtu, mps, credits, result); - mutex_lock(&conn->chan_lock); - chan = __l2cap_get_chan_by_ident(conn, cmd->ident); - if (!chan) { - err = -EBADSLT; - goto unlock; - } + if (!chan) + return -EBADSLT; err = 0; @@ -4743,9 +4718,6 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, l2cap_chan_unlock(chan); -unlock: - mutex_unlock(&conn->chan_lock); - return err; } @@ -4857,7 +4829,6 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, goto response; } - mutex_lock(&conn->chan_lock); l2cap_chan_lock(pchan); if (!smp_sufficient_security(conn->hcon, pchan->sec_level, @@ -4923,7 +4894,6 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, response_unlock: l2cap_chan_unlock(pchan); - mutex_unlock(&conn->chan_lock); l2cap_chan_put(pchan); if (result == L2CAP_CR_PEND) @@ -5057,7 +5027,6 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, goto response; } - mutex_lock(&conn->chan_lock); l2cap_chan_lock(pchan); if (!smp_sufficient_security(conn->hcon, pchan->sec_level, @@ -5132,7 +5101,6 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, unlock: l2cap_chan_unlock(pchan); - mutex_unlock(&conn->chan_lock); l2cap_chan_put(pchan); response: @@ -5169,8 +5137,6 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, BT_DBG("mtu %u mps %u credits %u result 0x%4.4x", mtu, mps, credits, result); - mutex_lock(&conn->chan_lock); - cmd_len -= sizeof(*rsp); list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) { @@ -5256,8 +5222,6 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn, l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); - return err; } @@ -5370,8 +5334,6 @@ static inline int l2cap_le_command_rej(struct l2cap_conn *conn, if (cmd_len < sizeof(*rej)) return -EPROTO; - mutex_lock(&conn->chan_lock); - chan = __l2cap_get_chan_by_ident(conn, cmd->ident); if (!chan) goto done; @@ -5386,7 +5348,6 @@ static inline int l2cap_le_command_rej(struct l2cap_conn *conn, l2cap_chan_put(chan); done: - mutex_unlock(&conn->chan_lock); return 0; } @@ -6841,8 +6802,12 @@ static void process_pending_rx(struct work_struct *work) BT_DBG(""); + mutex_lock(&conn->lock); + while ((skb = skb_dequeue(&conn->pending_rx))) l2cap_recv_frame(conn, skb); + + mutex_unlock(&conn->lock); } static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) @@ -6881,7 +6846,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) conn->local_fixed_chan |= L2CAP_FC_SMP_BREDR; mutex_init(&conn->ident_lock); - mutex_init(&conn->chan_lock); + mutex_init(&conn->lock); INIT_LIST_HEAD(&conn->chan_l); INIT_LIST_HEAD(&conn->users); @@ -7072,7 +7037,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, } } - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); l2cap_chan_lock(chan); if (cid && __l2cap_get_chan_by_dcid(conn, cid)) { @@ -7113,7 +7078,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, chan_unlock: l2cap_chan_unlock(chan); - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); done: hci_dev_unlock(hdev); hci_dev_put(hdev); @@ -7325,7 +7290,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) BT_DBG("conn %p status 0x%2.2x encrypt %u", conn, status, encrypt); - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); list_for_each_entry(chan, &conn->chan_l, list) { l2cap_chan_lock(chan); @@ -7399,7 +7364,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) l2cap_chan_unlock(chan); } - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); } /* Append fragment into frame respecting the maximum len of rx_skb */ @@ -7466,19 +7431,45 @@ static void l2cap_recv_reset(struct l2cap_conn *conn) conn->rx_len = 0; } +struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *c) +{ + if (!c) + return NULL; + + BT_DBG("conn %p orig refcnt %u", c, kref_read(&c->ref)); + + if (!kref_get_unless_zero(&c->ref)) + return NULL; + + return c; +} + void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) { - struct l2cap_conn *conn = hcon->l2cap_data; + struct l2cap_conn *conn; int len; + /* Lock hdev to access l2cap_data to avoid race with l2cap_conn_del */ + hci_dev_lock(hcon->hdev); + + conn = hcon->l2cap_data; + if (!conn) conn = l2cap_conn_add(hcon); - if (!conn) - goto drop; + conn = l2cap_conn_hold_unless_zero(conn); + + hci_dev_unlock(hcon->hdev); + + if (!conn) { + kfree_skb(skb); + return; + } BT_DBG("conn %p len %u flags 0x%x", conn, skb->len, flags); + mutex_lock(&conn->lock); + switch (flags) { case ACL_START: case ACL_START_NO_FLUSH: @@ -7503,7 +7494,7 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) if (len == skb->len) { /* Complete frame received */ l2cap_recv_frame(conn, skb); - return; + goto unlock; } BT_DBG("Start: total len %d, frag len %u", len, skb->len); @@ -7567,6 +7558,9 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) drop: kfree_skb(skb); +unlock: + mutex_unlock(&conn->lock); + l2cap_conn_put(conn); } static struct hci_cb l2cap_cb = { diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 49f97d4138ea..acd11b268b98 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -710,12 +710,12 @@ static bool l2cap_valid_mtu(struct l2cap_chan *chan, u16 mtu) { switch (chan->scid) { case L2CAP_CID_ATT: - if (mtu < L2CAP_LE_MIN_MTU) + if (mtu && mtu < L2CAP_LE_MIN_MTU) return false; break; default: - if (mtu < L2CAP_DEFAULT_MIN_MTU) + if (mtu && mtu < L2CAP_DEFAULT_MIN_MTU) return false; } @@ -1326,9 +1326,10 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) /* prevent sk structure from being freed whilst unlocked */ sock_hold(sk); - chan = l2cap_pi(sk)->chan; /* prevent chan structure from being freed whilst unlocked */ - l2cap_chan_hold(chan); + chan = l2cap_chan_hold_unless_zero(l2cap_pi(sk)->chan); + if (!chan) + goto shutdown_already; BT_DBG("chan %p state %s", chan, state_to_string(chan->state)); @@ -1358,22 +1359,20 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) release_sock(sk); l2cap_chan_lock(chan); - conn = chan->conn; - if (conn) - /* prevent conn structure from being freed */ - l2cap_conn_get(conn); + /* prevent conn structure from being freed */ + conn = l2cap_conn_hold_unless_zero(chan->conn); l2cap_chan_unlock(chan); if (conn) /* mutex lock must be taken before l2cap_chan_lock() */ - mutex_lock(&conn->chan_lock); + mutex_lock(&conn->lock); l2cap_chan_lock(chan); l2cap_chan_close(chan, 0); l2cap_chan_unlock(chan); if (conn) { - mutex_unlock(&conn->chan_lock); + mutex_unlock(&conn->lock); l2cap_conn_put(conn); } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 09bac3c9c2d5..f53304cb09db 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -210,7 +210,7 @@ static const u16 mgmt_untrusted_events[] = { MGMT_EV_EXP_FEATURE_CHANGED, }; -#define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) +#define CACHE_TIMEOUT secs_to_jiffies(2) #define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00" diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 9ae2a7f1738b..7cb192cbd65f 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -660,12 +660,9 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, void __user *data_in = u64_to_user_ptr(kattr->test.data_in); void *data; - if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom) + if (user_size < ETH_HLEN || user_size > PAGE_SIZE - headroom - tailroom) return ERR_PTR(-EINVAL); - if (user_size > size) - return ERR_PTR(-EMSGSIZE); - size = SKB_DATA_ALIGN(size); data = kzalloc(size + headroom + tailroom, GFP_USER); if (!data) @@ -1018,6 +1015,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_CGROUP_SKB: is_direct_pkt_access = true; break; default: diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 0ab4613aa07a..9d8c72ed01ab 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -488,7 +488,7 @@ void br_dev_setup(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &br_type); dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE; dev->lltx = true; - dev->netns_local = true; + dev->netns_immutable = true; dev->features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 1a52a0bca086..7e1ad229e133 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -1040,7 +1040,7 @@ static int br_mdb_add_group(const struct br_mdb_config *cfg, /* host join */ if (!port) { - if (mp->host_joined) { + if (mp->host_joined && !(cfg->nlflags & NLM_F_REPLACE)) { NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host"); return -EEXIST; } diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 451e45b9a6a5..94cbe967d1c1 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -393,38 +393,10 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ reason = ip_route_input(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), dev); if (reason) { - struct in_device *in_dev = __in_dev_get_rcu(dev); - - /* If err equals -EHOSTUNREACH the error is due to a - * martian destination or due to the fact that - * forwarding is disabled. For most martian packets, - * ip_route_output_key() will fail. It won't fail for 2 types of - * martian destinations: loopback destinations and destination - * 0.0.0.0. In both cases the packet will be dropped because the - * destination is the loopback device and not the bridge. */ - if (reason != SKB_DROP_REASON_IP_INADDRERRORS || !in_dev || - IN_DEV_FORWARD(in_dev)) - goto free_skb; - - rt = ip_route_output(net, iph->daddr, 0, - ip4h_dscp(iph), 0, - RT_SCOPE_UNIVERSE); - if (!IS_ERR(rt)) { - /* - Bridged-and-DNAT'ed traffic doesn't - * require ip_forwarding. */ - if (rt->dst.dev == dev) { - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - goto bridged_dnat; - } - ip_rt_put(rt); - } -free_skb: - kfree_skb(skb); + kfree_skb_reason(skb, reason); return 0; } else { if (skb_dst(skb)->dev == dev) { -bridged_dnat: skb->dev = br_indev; nf_bridge_update_protocol(skb); nf_bridge_push_encap_header(skb); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 3e0f47203f2a..6e337937d0d7 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1553,11 +1553,13 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], return 0; } -static int br_dev_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int br_dev_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net_bridge *br = netdev_priv(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; int err; err = register_netdevice(dev); diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 94ad09e36df2..fa6a3c2634a8 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -438,10 +438,11 @@ static void caif_netlink_parms(struct nlattr *data[], } } -static int ipcaif_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipcaif_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; int ret; struct chnl_net *caifdev; ASSERT_RTNL(); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 305dd72c844c..17226b2341d0 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -1132,7 +1132,7 @@ static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, todo_size = size; - while (todo_size) { + do { struct j1939_sk_buff_cb *skcb; segment_size = min_t(size_t, J1939_MAX_TP_PACKET_SIZE, @@ -1177,7 +1177,7 @@ static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, todo_size -= segment_size; session->total_queued_size += segment_size; - } + } while (todo_size); switch (ret) { case 0: /* OK */ diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 95f7a7e65a73..9b72d118d756 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -382,8 +382,9 @@ sk_buff *j1939_session_skb_get_by_offset(struct j1939_session *session, skb_queue_walk(&session->skb_queue, do_skb) { do_skcb = j1939_skb_to_cb(do_skb); - if (offset_start >= do_skcb->offset && - offset_start < (do_skcb->offset + do_skb->len)) { + if ((offset_start >= do_skcb->offset && + offset_start < (do_skcb->offset + do_skb->len)) || + (offset_start == 0 && do_skcb->offset == 0 && do_skb->len == 0)) { skb = do_skb; } } diff --git a/net/can/raw.c b/net/can/raw.c index 46e8ed9d64da..9b1d5f036f57 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -963,7 +963,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) skb->dev = dev; skb->priority = sockc.priority; - skb->mark = READ_ONCE(sk->sk_mark); + skb->mark = sockc.mark; skb->tstamp = sockc.transmit_time; skb_setup_tx_timestamp(skb, &sockc); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 2f4ed83a75ae..2e538399757f 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -50,15 +50,16 @@ void bpf_sk_storage_free(struct sock *sk) { struct bpf_local_storage *sk_storage; + migrate_disable(); rcu_read_lock(); sk_storage = rcu_dereference(sk->sk_bpf_storage); - if (!sk_storage) { - rcu_read_unlock(); - return; - } + if (!sk_storage) + goto out; bpf_local_storage_destroy(sk_storage); +out: rcu_read_unlock(); + migrate_enable(); } static void bpf_sk_storage_map_free(struct bpf_map *map) @@ -160,6 +161,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); + migrate_disable(); rcu_read_lock(); sk_storage = rcu_dereference(sk->sk_bpf_storage); @@ -212,6 +214,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) out: rcu_read_unlock(); + migrate_enable(); /* In case of an error, don't free anything explicitly here, the * caller is responsible to call bpf_sk_storage_free. @@ -352,11 +355,6 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) { - const struct btf *btf_vmlinux; - const struct btf_type *t; - const char *tname; - u32 btf_id; - if (prog->aux->dst_prog) return false; @@ -371,13 +369,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) return true; case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: - btf_vmlinux = bpf_get_btf_vmlinux(); - if (IS_ERR_OR_NULL(btf_vmlinux)) - return false; - btf_id = prog->aux->attach_btf_id; - t = btf_type_by_id(btf_vmlinux, btf_id); - tname = btf_name_by_offset(btf_vmlinux, t->name_off); - return !!strncmp(tname, "bpf_sk_storage", + return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage", strlen("bpf_sk_storage")); default: return false; diff --git a/net/core/dev.c b/net/core/dev.c index d7cbe6ff5249..5c9d2bd29e15 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -106,6 +106,7 @@ #include <net/dst.h> #include <net/dst_metadata.h> #include <net/gro.h> +#include <net/netdev_queues.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/checksum.h> @@ -158,6 +159,7 @@ #include <net/netdev_rx_queue.h> #include <net/page_pool/types.h> #include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> #include <net/rps.h> #include <linux/phy_link_topology.h> @@ -1006,7 +1008,7 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id) WARN_ON_ONCE(!rcu_read_lock_held()); - if (napi_id < MIN_NAPI_ID) + if (!napi_id_valid(napi_id)) return NULL; napi = napi_by_id(napi_id); @@ -1120,6 +1122,12 @@ out: return ret; } +static bool dev_addr_cmp(struct net_device *dev, unsigned short type, + const char *ha) +{ + return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len); +} + /** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace @@ -1128,7 +1136,7 @@ out: * * Search for an interface by MAC address. Returns NULL if the device * is not found or a pointer to the device. - * The caller must hold RCU or RTNL. + * The caller must hold RCU. * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * @@ -1140,14 +1148,39 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, struct net_device *dev; for_each_netdev_rcu(net, dev) - if (dev->type == type && - !memcmp(dev->dev_addr, ha, dev->addr_len)) + if (dev_addr_cmp(dev, type, ha)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); +/** + * dev_getbyhwaddr() - find a device by its hardware address + * @net: the applicable net namespace + * @type: media type of device + * @ha: hardware address + * + * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold + * rtnl_lock. + * + * Context: rtnl_lock() must be held. + * Return: pointer to the net_device, or NULL if not found + */ +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, + const char *ha) +{ + struct net_device *dev; + + ASSERT_RTNL(); + for_each_netdev(net, dev) + if (dev_addr_cmp(dev, type, ha)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_getbyhwaddr); + struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; @@ -2069,20 +2102,55 @@ static void __move_netdevice_notifier_net(struct net *src_net, __register_netdevice_notifier_net(dst_net, nb, true); } +static void rtnl_net_dev_lock(struct net_device *dev) +{ + bool again; + + do { + struct net *net; + + again = false; + + /* netns might be being dismantled. */ + rcu_read_lock(); + net = dev_net_rcu(dev); + net_passive_inc(net); + rcu_read_unlock(); + + rtnl_net_lock(net); + +#ifdef CONFIG_NET_NS + /* dev might have been moved to another netns. */ + if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) { + rtnl_net_unlock(net); + net_passive_dec(net); + again = true; + } +#endif + } while (again); +} + +static void rtnl_net_dev_unlock(struct net_device *dev) +{ + struct net *net = dev_net(dev); + + rtnl_net_unlock(net); + net_passive_dec(net); +} + int register_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { - struct net *net = dev_net(dev); int err; - rtnl_net_lock(net); - err = __register_netdevice_notifier_net(net, nb, false); + rtnl_net_dev_lock(dev); + err = __register_netdevice_notifier_net(dev_net(dev), nb, false); if (!err) { nn->nb = nb; list_add(&nn->list, &dev->net_notifier_list); } - rtnl_net_unlock(net); + rtnl_net_dev_unlock(dev); return err; } @@ -2092,13 +2160,12 @@ int unregister_netdevice_notifier_dev_net(struct net_device *dev, struct notifier_block *nb, struct netdev_net_notifier *nn) { - struct net *net = dev_net(dev); int err; - rtnl_net_lock(net); + rtnl_net_dev_lock(dev); list_del(&nn->list); - err = __unregister_netdevice_notifier_net(net, nb); - rtnl_net_unlock(net); + err = __unregister_netdevice_notifier_net(dev_net(dev), nb); + rtnl_net_dev_unlock(dev); return err; } @@ -2248,8 +2315,8 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif #ifdef CONFIG_NET_CLS_ACT -DEFINE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key); -EXPORT_SYMBOL(tcf_bypass_check_needed_key); +DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key); +EXPORT_SYMBOL(tcf_sw_enabled_key); #endif DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); @@ -4144,10 +4211,13 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, if (!miniq) return ret; - if (static_branch_unlikely(&tcf_bypass_check_needed_key)) { - if (tcf_block_bypass_sw(miniq->block)) - return ret; - } + /* Global bypass */ + if (!static_branch_likely(&tcf_sw_enabled_key)) + return ret; + + /* Block-wise bypass */ + if (tcf_block_bypass_sw(miniq->block)) + return ret; tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; @@ -4496,7 +4566,8 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_reset_mac_header(skb); skb_assert_len(skb); - if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) + if (unlikely(skb_shinfo(skb)->tx_flags & + (SKBTX_SCHED_TSTAMP | SKBTX_BPF))) __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED); /* Disable soft irqs for various locks below. Also @@ -4688,7 +4759,7 @@ use_local_napi: * we have to raise NET_RX_SOFTIRQ. */ if (!sd->in_net_rx_action) - __raise_softirq_irqoff(NET_RX_SOFTIRQ); + raise_softirq_irqoff(NET_RX_SOFTIRQ); } #ifdef CONFIG_RPS @@ -6115,16 +6186,18 @@ EXPORT_SYMBOL(netif_receive_skb_list); static void flush_backlog(struct work_struct *work) { struct sk_buff *skb, *tmp; + struct sk_buff_head list; struct softnet_data *sd; + __skb_queue_head_init(&list); local_bh_disable(); sd = this_cpu_ptr(&softnet_data); backlog_lock_irq_disable(sd); skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { - if (skb->dev->reg_state == NETREG_UNREGISTERING) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->input_pkt_queue); - dev_kfree_skb_irq(skb); + __skb_queue_tail(&list, skb); rps_input_queue_head_incr(sd); } } @@ -6132,14 +6205,16 @@ static void flush_backlog(struct work_struct *work) local_lock_nested_bh(&softnet_data.process_queue_bh_lock); skb_queue_walk_safe(&sd->process_queue, skb, tmp) { - if (skb->dev->reg_state == NETREG_UNREGISTERING) { + if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) { __skb_unlink(skb, &sd->process_queue); - kfree_skb(skb); + __skb_queue_tail(&list, skb); rps_input_queue_head_incr(sd); } } local_unlock_nested_bh(&softnet_data.process_queue_bh_lock); local_bh_enable(); + + __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY); } static bool flush_required(int cpu) @@ -6403,7 +6478,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) return false; if (work_done) { - if (n->gro_bitmask) + if (n->gro.bitmask) timeout = napi_get_gro_flush_timeout(n); n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n); } @@ -6413,15 +6488,14 @@ bool napi_complete_done(struct napi_struct *n, int work_done) if (timeout) ret = false; } - if (n->gro_bitmask) { - /* When the NAPI instance uses a timeout and keeps postponing - * it, we need to bound somehow the time packets are kept in - * the GRO layer - */ - napi_gro_flush(n, !!timeout); - } - gro_normal_list(n); + /* + * When the NAPI instance uses a timeout and keeps postponing + * it, we need to bound somehow the time packets are kept in + * the GRO layer. + */ + gro_flush(&n->gro, !!timeout); + gro_normal_list(&n->gro); if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ @@ -6485,19 +6559,15 @@ static void skb_defer_free_flush(struct softnet_data *sd) static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) { if (!skip_schedule) { - gro_normal_list(napi); + gro_normal_list(&napi->gro); __napi_schedule(napi); return; } - if (napi->gro_bitmask) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - napi_gro_flush(napi, HZ >= 1000); - } + /* Flush too old packets. If HZ < 1000, flush all packets */ + gro_flush(&napi->gro, HZ >= 1000); + gro_normal_list(&napi->gro); - gro_normal_list(napi); clear_bit(NAPI_STATE_SCHED, &napi->state); } @@ -6604,7 +6674,7 @@ restart: } work = napi_poll(napi, budget); trace_napi_poll(napi, work, budget); - gro_normal_list(napi); + gro_normal_list(&napi->gro); count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), @@ -6704,7 +6774,9 @@ void napi_resume_irqs(unsigned int napi_id) static void __napi_hash_add_with_id(struct napi_struct *napi, unsigned int napi_id) { - napi->napi_id = napi_id; + napi->gro.cached_napi_id = napi_id; + + WRITE_ONCE(napi->napi_id, napi_id); hlist_add_head_rcu(&napi->napi_hash_node, &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); } @@ -6731,7 +6803,7 @@ static void napi_hash_add(struct napi_struct *napi) /* 0..NR_CPUS range is reserved for sender_cpu use */ do { - if (unlikely(++napi_gen_id < MIN_NAPI_ID)) + if (unlikely(!napi_id_valid(++napi_gen_id))) napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); @@ -6772,17 +6844,6 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void init_gro_hash(struct napi_struct *napi) -{ - int i; - - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - INIT_LIST_HEAD(&napi->gro_hash[i].list); - napi->gro_hash[i].count = 0; - } - napi->gro_bitmask = 0; -} - int dev_set_threaded(struct net_device *dev, bool threaded) { struct napi_struct *napi; @@ -6862,11 +6923,175 @@ void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index, } EXPORT_SYMBOL(netif_queue_set_napi); +static void +netif_napi_irq_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) +{ + struct napi_struct *napi = + container_of(notify, struct napi_struct, notify); +#ifdef CONFIG_RFS_ACCEL + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; + int err; +#endif + + if (napi->config && napi->dev->irq_affinity_auto) + cpumask_copy(&napi->config->affinity_mask, mask); + +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); + if (err) + netdev_warn(napi->dev, "RMAP update failed (%d)\n", + err); + } +#endif +} + +#ifdef CONFIG_RFS_ACCEL +static void netif_napi_affinity_release(struct kref *ref) +{ + struct napi_struct *napi = + container_of(ref, struct napi_struct, notify.kref); + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; + + netdev_assert_locked(napi->dev); + WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, + &napi->state)); + + if (!napi->dev->rx_cpu_rmap_auto) + return; + rmap->obj[napi->napi_rmap_idx] = NULL; + napi->napi_rmap_idx = -1; + cpu_rmap_put(rmap); +} + +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) +{ + if (dev->rx_cpu_rmap_auto) + return 0; + + dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs); + if (!dev->rx_cpu_rmap) + return -ENOMEM; + + dev->rx_cpu_rmap_auto = true; + return 0; +} +EXPORT_SYMBOL(netif_enable_cpu_rmap); + +static void netif_del_cpu_rmap(struct net_device *dev) +{ + struct cpu_rmap *rmap = dev->rx_cpu_rmap; + + if (!dev->rx_cpu_rmap_auto) + return; + + /* Free the rmap */ + cpu_rmap_put(rmap); + dev->rx_cpu_rmap = NULL; + dev->rx_cpu_rmap_auto = false; +} + +#else +static void netif_napi_affinity_release(struct kref *ref) +{ +} + +int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) +{ + return 0; +} +EXPORT_SYMBOL(netif_enable_cpu_rmap); + +static void netif_del_cpu_rmap(struct net_device *dev) +{ +} +#endif + +void netif_set_affinity_auto(struct net_device *dev) +{ + unsigned int i, maxqs, numa; + + maxqs = max(dev->num_tx_queues, dev->num_rx_queues); + numa = dev_to_node(&dev->dev); + + for (i = 0; i < maxqs; i++) + cpumask_set_cpu(cpumask_local_spread(i, numa), + &dev->napi_config[i].affinity_mask); + + dev->irq_affinity_auto = true; +} +EXPORT_SYMBOL(netif_set_affinity_auto); + +void netif_napi_set_irq_locked(struct napi_struct *napi, int irq) +{ + int rc; + + netdev_assert_locked_or_invisible(napi->dev); + + if (napi->irq == irq) + return; + + /* Remove existing resources */ + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) + irq_set_affinity_notifier(napi->irq, NULL); + + napi->irq = irq; + if (irq < 0 || + (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto)) + return; + + /* Abort for buggy drivers */ + if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config)) + return; + +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi); + if (rc < 0) + return; + + cpu_rmap_get(napi->dev->rx_cpu_rmap); + napi->napi_rmap_idx = rc; + } +#endif + + /* Use core IRQ notifier */ + napi->notify.notify = netif_napi_irq_notify; + napi->notify.release = netif_napi_affinity_release; + rc = irq_set_affinity_notifier(irq, &napi->notify); + if (rc) { + netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n", + rc); + goto put_rmap; + } + + set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state); + return; + +put_rmap: +#ifdef CONFIG_RFS_ACCEL + if (napi->dev->rx_cpu_rmap_auto) { + cpu_rmap_put(napi->dev->rx_cpu_rmap); + napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL; + napi->napi_rmap_idx = -1; + } +#endif + napi->notify.notify = NULL; + napi->notify.release = NULL; +} +EXPORT_SYMBOL(netif_napi_set_irq_locked); + static void napi_restore_config(struct napi_struct *n) { n->defer_hard_irqs = n->config->defer_hard_irqs; n->gro_flush_timeout = n->config->gro_flush_timeout; n->irq_suspend_timeout = n->config->irq_suspend_timeout; + + if (n->dev->irq_affinity_auto && + test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state)) + irq_set_affinity(n->irq, &n->config->affinity_mask); + /* a NAPI ID might be stored in the config, if so use it. if not, use * napi_hash_add to generate one for us. */ @@ -6902,7 +7127,7 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) higher = &dev->napi_list; list_for_each_entry(pos, &dev->napi_list, dev_list) { - if (pos->napi_id >= MIN_NAPI_ID) + if (napi_id_valid(pos->napi_id)) pos_id = pos->napi_id; else if (pos->config) pos_id = pos->config->napi_id; @@ -6916,6 +7141,23 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) list_add_rcu(&napi->dev_list, higher); /* adds after higher */ } +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. + */ +static void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + void netif_napi_add_weight_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), @@ -6929,10 +7171,8 @@ void netif_napi_add_weight_locked(struct net_device *dev, INIT_HLIST_NODE(&napi->napi_hash_node); hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); napi->timer.function = napi_watchdog; - init_gro_hash(napi); + gro_init(&napi->gro); napi->skb = NULL; - INIT_LIST_HEAD(&napi->rx_list); - napi->rx_count = 0; napi->poll = poll; if (weight > NAPI_POLL_WEIGHT) netdev_err_once(dev, "%s() called with weight %d\n", __func__, @@ -7046,19 +7286,6 @@ void napi_enable(struct napi_struct *n) } EXPORT_SYMBOL(napi_enable); -static void flush_gro_hash(struct napi_struct *napi) -{ - int i; - - for (i = 0; i < GRO_HASH_BUCKETS; i++) { - struct sk_buff *skb, *n; - - list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list) - kfree_skb(skb); - napi->gro_hash[i].count = 0; - } -} - /* Must be called in process context */ void __netif_napi_del_locked(struct napi_struct *napi) { @@ -7067,6 +7294,12 @@ void __netif_napi_del_locked(struct napi_struct *napi) if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state)) return; + /* Make sure NAPI is disabled (or was never enabled). */ + WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); + + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) + irq_set_affinity_notifier(napi->irq, NULL); + if (napi->config) { napi->index = -1; napi->config = NULL; @@ -7075,8 +7308,7 @@ void __netif_napi_del_locked(struct napi_struct *napi) list_del_rcu(&napi->dev_list); napi_free_frags(napi); - flush_gro_hash(napi); - napi->gro_bitmask = 0; + gro_cleanup(&napi->gro); if (napi->thread) { kthread_stop(napi->thread); @@ -7135,14 +7367,9 @@ static int __napi_poll(struct napi_struct *n, bool *repoll) return work; } - if (n->gro_bitmask) { - /* flush too old packets - * If HZ < 1000, flush all packets. - */ - napi_gro_flush(n, HZ >= 1000); - } - - gro_normal_list(n); + /* Flush too old packets. If HZ < 1000, flush all packets */ + gro_flush(&n->gro, HZ >= 1000); + gro_normal_list(&n->gro); /* Some drivers may have called napi_schedule * prior to exhausting their budget. @@ -9170,7 +9397,7 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, if ((flags ^ dev->gflags) & IFF_PROMISC) { int inc = (flags & IFF_PROMISC) ? 1 : -1; - unsigned int old_flags = dev->flags; + old_flags = dev->flags; dev->gflags ^= IFF_PROMISC; @@ -9716,7 +9943,7 @@ int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) if (!dev->netdev_ops->ndo_bpf) return -EOPNOTSUPP; - if (dev->ethtool->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && bpf->command == XDP_SETUP_PROG && bpf->prog && !bpf->prog->aux->xdp_has_frags) { NL_SET_ERR_MSG(bpf->extack, @@ -9761,7 +9988,7 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, struct netdev_bpf xdp; int err; - if (dev->ethtool->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && prog && !prog->aux->xdp_has_frags) { NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split"); return -EBUSY; @@ -9920,6 +10147,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack NL_SET_ERR_MSG(extack, "Program bound to different device"); return -EINVAL; } + if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) { + NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode"); + return -EINVAL; + } if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); return -EINVAL; @@ -10256,37 +10487,14 @@ static bool from_cleanup_net(void) #endif } -static void rtnl_drop_if_cleanup_net(void) -{ - if (from_cleanup_net()) - __rtnl_unlock(); -} - -static void rtnl_acquire_if_cleanup_net(void) -{ - if (from_cleanup_net()) - rtnl_lock(); -} - /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); -static LIST_HEAD(net_todo_list_for_cleanup_net); - -/* TODO: net_todo_list/net_todo_list_for_cleanup_net should probably - * be provided by callers, instead of being static, rtnl protected. - */ -static struct list_head *todo_list(void) -{ - return from_cleanup_net() ? &net_todo_list_for_cleanup_net : - &net_todo_list; -} - DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); atomic_t dev_unreg_count = ATOMIC_INIT(0); static void net_set_todo(struct net_device *dev) { - list_add_tail(&dev->todo_list, todo_list()); + list_add_tail(&dev->todo_list, &net_todo_list); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, @@ -11127,16 +11335,15 @@ void netdev_run_todo(void) list_replace_init(&net_unlink_list, &unlink_list); while (!list_empty(&unlink_list)) { - struct net_device *dev = list_first_entry(&unlink_list, - struct net_device, - unlink_list); + dev = list_first_entry(&unlink_list, struct net_device, + unlink_list); list_del_init(&dev->unlink_list); dev->nested_level = dev->lower_level - 1; } #endif /* Snapshot list, allow later requests */ - list_replace_init(todo_list(), &list); + list_replace_init(&net_todo_list, &list); __rtnl_unlock(); @@ -11301,6 +11508,20 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, const struct net_device_ops *ops = dev->netdev_ops; const struct net_device_core_stats __percpu *p; + /* + * IPv{4,6} and udp tunnels share common stat helpers and use + * different stat type (NETDEV_PCPU_STAT_TSTATS vs + * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent. + */ + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) != + offsetof(struct pcpu_dstats, rx_bytes)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) != + offsetof(struct pcpu_dstats, rx_packets)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) != + offsetof(struct pcpu_dstats, tx_bytes)); + BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) != + offsetof(struct pcpu_dstats, tx_packets)); + if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); ops->ndo_get_stats64(dev, storage); @@ -11539,6 +11760,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (!dev->ethtool) goto free_all; + dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT); + if (!dev->cfg) + goto free_all; + dev->cfg_pending = dev->cfg; + napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config)); dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); if (!dev->napi_config) @@ -11607,6 +11833,8 @@ void free_netdev(struct net_device *dev) return; } + WARN_ON(dev->cfg != dev->cfg_pending); + kfree(dev->cfg); kfree(dev->ethtool); netif_free_tx_queues(dev); netif_free_rx_queues(dev); @@ -11618,6 +11846,8 @@ void free_netdev(struct net_device *dev) netdev_napi_exit(dev); + netif_del_cpu_rmap(dev); + ref_tracker_dir_exit(&dev->refcnt_tracker); #ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); @@ -11732,6 +11962,19 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) } EXPORT_SYMBOL(unregister_netdevice_queue); +static void dev_memory_provider_uninstall(struct net_device *dev) +{ + unsigned int i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct netdev_rx_queue *rxq = &dev->_rx[i]; + struct pp_memory_provider_params *p = &rxq->mp_params; + + if (p->mp_ops && p->mp_ops->uninstall) + p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq); + } +} + void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh) { @@ -11774,11 +12017,9 @@ void unregister_netdevice_many_notify(struct list_head *head, WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); netdev_unlock(dev); } - - rtnl_drop_if_cleanup_net(); flush_all_backlogs(); + synchronize_net(); - rtnl_acquire_if_cleanup_net(); list_for_each_entry(dev, head, unreg_list) { struct sk_buff *skb = NULL; @@ -11788,7 +12029,7 @@ void unregister_netdevice_many_notify(struct list_head *head, dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); bpf_dev_bound_netdev_unregister(dev); - dev_dmabuf_uninstall(dev); + dev_memory_provider_uninstall(dev); netdev_offload_xstats_disable_all(dev); @@ -11838,9 +12079,7 @@ void unregister_netdevice_many_notify(struct list_head *head, #endif } - rtnl_drop_if_cleanup_net(); synchronize_net(); - rtnl_acquire_if_cleanup_net(); list_for_each_entry(dev, head, unreg_list) { netdev_put(dev, &dev->dev_registered_tracker); @@ -11878,11 +12117,9 @@ EXPORT_SYMBOL(unregister_netdevice_many); */ void unregister_netdev(struct net_device *dev) { - struct net *net = dev_net(dev); - - rtnl_net_lock(net); + rtnl_net_dev_lock(dev); unregister_netdevice(dev); - rtnl_net_unlock(net); + rtnl_net_dev_unlock(dev); } EXPORT_SYMBOL(unregister_netdev); @@ -11894,6 +12131,7 @@ EXPORT_SYMBOL(unregister_netdev); * is already taken in the destination network namespace. * @new_ifindex: If not zero, specifies device index in the target * namespace. + * @extack: netlink extended ack * * This function shuts down a device interface and moves it * to a new network namespace. On success 0 is returned, on @@ -11903,7 +12141,8 @@ EXPORT_SYMBOL(unregister_netdev); */ int __dev_change_net_namespace(struct net_device *dev, struct net *net, - const char *pat, int new_ifindex) + const char *pat, int new_ifindex, + struct netlink_ext_ack *extack) { struct netdev_name_node *name_node; struct net *net_old = dev_net(dev); @@ -11914,12 +12153,16 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, /* Don't allow namespace local devices to be moved. */ err = -EINVAL; - if (dev->netns_local) + if (dev->netns_immutable) { + NL_SET_ERR_MSG(extack, "The interface netns is immutable"); goto out; + } /* Ensure the device has been registered */ - if (dev->reg_state != NETREG_REGISTERED) + if (dev->reg_state != NETREG_REGISTERED) { + NL_SET_ERR_MSG(extack, "The interface isn't registered"); goto out; + } /* Get out if there is nothing todo */ err = 0; @@ -11932,30 +12175,49 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, err = -EEXIST; if (netdev_name_in_use(net, dev->name)) { /* We get here if we can't use the current device name */ - if (!pat) + if (!pat) { + NL_SET_ERR_MSG(extack, + "An interface with the same name exists in the target netns"); goto out; + } err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG_FMT(extack, + "Unable to use '%s' for the new interface name in the target netns", + pat); goto out; + } } /* Check that none of the altnames conflicts. */ err = -EEXIST; - netdev_for_each_altname(dev, name_node) - if (netdev_name_in_use(net, name_node->name)) + netdev_for_each_altname(dev, name_node) { + if (netdev_name_in_use(net, name_node->name)) { + NL_SET_ERR_MSG_FMT(extack, + "An interface with the altname %s exists in the target netns", + name_node->name); goto out; + } + } /* Check that new_ifindex isn't used yet. */ if (new_ifindex) { err = dev_index_reserve(net, new_ifindex); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG_FMT(extack, + "The ifindex %d is not available in the target netns", + new_ifindex); goto out; + } } else { /* If there is an ifindex conflict assign a new one */ err = dev_index_reserve(net, dev->ifindex); if (err == -EBUSY) err = dev_index_reserve(net, 0); - if (err < 0) + if (err < 0) { + NL_SET_ERR_MSG(extack, + "Unable to allocate a new ifindex in the target netns"); goto out; + } new_ifindex = err; } @@ -12161,7 +12423,7 @@ static struct hlist_head * __net_init netdev_create_hash(void) static int __net_init netdev_init(struct net *net) { BUILD_BUG_ON(GRO_HASH_BUCKETS > - 8 * sizeof_field(struct napi_struct, gro_bitmask)); + BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask)); INIT_LIST_HEAD(&net->dev_base_head); @@ -12296,7 +12558,7 @@ static void __net_exit default_device_exit_net(struct net *net) char fb_name[IFNAMSIZ]; /* Ignore unmoveable devices (i.e. loopback) */ - if (dev->netns_local) + if (dev->netns_immutable) continue; /* Leave virtual devices for the generic cleanup */ @@ -12526,7 +12788,7 @@ static int __init net_dev_init(void) INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); spin_lock_init(&sd->defer_lock); - init_gro_hash(&sd->backlog); + gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; INIT_LIST_HEAD(&sd->backlog.poll_list); diff --git a/net/core/dev.h b/net/core/dev.h index a5b166bbd169..caa13e431a6b 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -299,6 +299,18 @@ void xdp_do_check_flushed(struct napi_struct *napi); static inline void xdp_do_check_flushed(struct napi_struct *napi) { } #endif +/* Best effort check that NAPI is not idle (can't be scheduled to run) */ +static inline void napi_assert_will_not_race(const struct napi_struct *napi) +{ + /* uninitialized instance, can't race */ + if (!napi->poll_list.next) + return; + + /* SCHED bit is set on disabled instances */ + WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); + WARN_ON(READ_ONCE(napi->list_owner) != -1); +} + void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); #define XMIT_RECURSION_LIMIT 8 diff --git a/net/core/devmem.c b/net/core/devmem.c index c971b8aceac8..7c6e0b5b6acb 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -16,6 +16,7 @@ #include <net/netdev_queues.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> #include <trace/events/page_pool.h> #include "devmem.h" @@ -27,20 +28,28 @@ /* Protected by rtnl_lock() */ static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); +static const struct memory_provider_ops dmabuf_devmem_ops; + +bool net_is_devmem_iov(struct net_iov *niov) +{ + return niov->pp->mp_ops == &dmabuf_devmem_ops; +} + static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, struct gen_pool_chunk *chunk, void *not_used) { struct dmabuf_genpool_chunk_owner *owner = chunk->owner; - kvfree(owner->niovs); + kvfree(owner->area.niovs); kfree(owner); } static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) { - struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov); + struct dmabuf_genpool_chunk_owner *owner; + owner = net_devmem_iov_to_chunk_owner(niov); return owner->base_dma_addr + ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); } @@ -83,7 +92,7 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) offset = dma_addr - owner->base_dma_addr; index = offset / PAGE_SIZE; - niov = &owner->niovs[index]; + niov = &owner->area.niovs[index]; niov->pp_magic = 0; niov->pp = NULL; @@ -94,7 +103,7 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) void net_devmem_free_dmabuf(struct net_iov *niov) { - struct net_devmem_dmabuf_binding *binding = net_iov_binding(niov); + struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov); unsigned long dma_addr = net_devmem_get_dma_addr(niov); if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr, @@ -117,6 +126,7 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) WARN_ON(rxq->mp_params.mp_priv != binding); rxq->mp_params.mp_priv = NULL; + rxq->mp_params.mp_ops = NULL; rxq_idx = get_netdev_rx_queue_index(rxq); @@ -141,18 +151,18 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, return -ERANGE; } - if (dev->ethtool->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { + if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); return -EINVAL; } - if (dev->ethtool->hds_thresh) { + if (dev->cfg->hds_thresh) { NL_SET_ERR_MSG(extack, "hds-thresh is not zero"); return -EINVAL; } rxq = __netif_get_rx_queue(dev, rxq_idx); - if (rxq->mp_params.mp_priv) { + if (rxq->mp_params.mp_ops) { NL_SET_ERR_MSG(extack, "designated queue already memory provider bound"); return -EEXIST; } @@ -170,6 +180,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, return err; rxq->mp_params.mp_priv = binding; + rxq->mp_params.mp_ops = &dmabuf_devmem_ops; err = netdev_rx_queue_restart(dev, rxq_idx); if (err) @@ -179,6 +190,7 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, err_xa_erase: rxq->mp_params.mp_priv = NULL; + rxq->mp_params.mp_ops = NULL; xa_erase(&binding->bound_rxqs, xa_idx); return err; @@ -261,9 +273,9 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, goto err_free_chunks; } - owner->base_virtual = virtual; + owner->area.base_virtual = virtual; owner->base_dma_addr = dma_addr; - owner->num_niovs = len / PAGE_SIZE; + owner->area.num_niovs = len / PAGE_SIZE; owner->binding = binding; err = gen_pool_add_owner(binding->chunk_pool, dma_addr, @@ -275,17 +287,17 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned int dmabuf_fd, goto err_free_chunks; } - owner->niovs = kvmalloc_array(owner->num_niovs, - sizeof(*owner->niovs), - GFP_KERNEL); - if (!owner->niovs) { + owner->area.niovs = kvmalloc_array(owner->area.num_niovs, + sizeof(*owner->area.niovs), + GFP_KERNEL); + if (!owner->area.niovs) { err = -ENOMEM; goto err_free_chunks; } - for (i = 0; i < owner->num_niovs; i++) { - niov = &owner->niovs[i]; - niov->owner = owner; + for (i = 0; i < owner->area.num_niovs; i++) { + niov = &owner->area.niovs[i]; + niov->owner = &owner->area; page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), net_devmem_get_dma_addr(niov)); } @@ -313,26 +325,6 @@ err_put_dmabuf: return ERR_PTR(err); } -void dev_dmabuf_uninstall(struct net_device *dev) -{ - struct net_devmem_dmabuf_binding *binding; - struct netdev_rx_queue *rxq; - unsigned long xa_idx; - unsigned int i; - - for (i = 0; i < dev->real_num_rx_queues; i++) { - binding = dev->_rx[i].mp_params.mp_priv; - if (!binding) - continue; - - xa_for_each(&binding->bound_rxqs, xa_idx, rxq) - if (rxq == &dev->_rx[i]) { - xa_erase(&binding->bound_rxqs, xa_idx); - break; - } - } -} - /*** "Dmabuf devmem memory provider" ***/ int mp_dmabuf_devmem_init(struct page_pool *pool) @@ -398,3 +390,36 @@ bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem) /* We don't want the page pool put_page()ing our net_iovs. */ return false; } + +static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq) +{ + const struct net_devmem_dmabuf_binding *binding = mp_priv; + int type = rxq ? NETDEV_A_QUEUE_DMABUF : NETDEV_A_PAGE_POOL_DMABUF; + + return nla_put_u32(rsp, type, binding->id); +} + +static void mp_dmabuf_devmem_uninstall(void *mp_priv, + struct netdev_rx_queue *rxq) +{ + struct net_devmem_dmabuf_binding *binding = mp_priv; + struct netdev_rx_queue *bound_rxq; + unsigned long xa_idx; + + xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) { + if (bound_rxq == rxq) { + xa_erase(&binding->bound_rxqs, xa_idx); + break; + } + } +} + +static const struct memory_provider_ops dmabuf_devmem_ops = { + .init = mp_dmabuf_devmem_init, + .destroy = mp_dmabuf_devmem_destroy, + .alloc_netmems = mp_dmabuf_devmem_alloc_netmems, + .release_netmem = mp_dmabuf_devmem_release_page, + .nl_fill = mp_dmabuf_devmem_nl_fill, + .uninstall = mp_dmabuf_devmem_uninstall, +}; diff --git a/net/core/devmem.h b/net/core/devmem.h index 76099ef9c482..7fc158d52729 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -10,6 +10,8 @@ #ifndef _NET_DEVMEM_H #define _NET_DEVMEM_H +#include <net/netmem.h> + struct netlink_ext_ack; struct net_devmem_dmabuf_binding { @@ -51,17 +53,11 @@ struct net_devmem_dmabuf_binding { * allocations from this chunk. */ struct dmabuf_genpool_chunk_owner { - /* Offset into the dma-buf where this chunk starts. */ - unsigned long base_virtual; + struct net_iov_area area; + struct net_devmem_dmabuf_binding *binding; /* dma_addr of the start of the chunk. */ dma_addr_t base_dma_addr; - - /* Array of net_iovs for this chunk. */ - struct net_iov *niovs; - size_t num_niovs; - - struct net_devmem_dmabuf_binding *binding; }; void __net_devmem_dmabuf_binding_free(struct net_devmem_dmabuf_binding *binding); @@ -72,38 +68,34 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, struct net_devmem_dmabuf_binding *binding, struct netlink_ext_ack *extack); -void dev_dmabuf_uninstall(struct net_device *dev); static inline struct dmabuf_genpool_chunk_owner * -net_iov_owner(const struct net_iov *niov) +net_devmem_iov_to_chunk_owner(const struct net_iov *niov) { - return niov->owner; + struct net_iov_area *owner = net_iov_owner(niov); + + return container_of(owner, struct dmabuf_genpool_chunk_owner, area); } -static inline unsigned int net_iov_idx(const struct net_iov *niov) +static inline struct net_devmem_dmabuf_binding * +net_devmem_iov_binding(const struct net_iov *niov) { - return niov - net_iov_owner(niov)->niovs; + return net_devmem_iov_to_chunk_owner(niov)->binding; } -static inline struct net_devmem_dmabuf_binding * -net_iov_binding(const struct net_iov *niov) +static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) { - return net_iov_owner(niov)->binding; + return net_devmem_iov_binding(niov)->id; } static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) { - struct dmabuf_genpool_chunk_owner *owner = net_iov_owner(niov); + struct net_iov_area *owner = net_iov_owner(niov); return owner->base_virtual + ((unsigned long)net_iov_idx(niov) << PAGE_SHIFT); } -static inline u32 net_iov_binding_id(const struct net_iov *niov) -{ - return net_iov_owner(niov)->binding->id; -} - static inline void net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) { @@ -123,6 +115,8 @@ struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void net_devmem_free_dmabuf(struct net_iov *ppiov); +bool net_is_devmem_iov(struct net_iov *niov); + #else struct net_devmem_dmabuf_binding; @@ -152,10 +146,6 @@ net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, return -EOPNOTSUPP; } -static inline void dev_dmabuf_uninstall(struct net_device *dev) -{ -} - static inline struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) { @@ -171,10 +161,15 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) return 0; } -static inline u32 net_iov_binding_id(const struct net_iov *niov) +static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) { return 0; } + +static inline bool net_is_devmem_iov(struct net_iov *niov) +{ + return false; +} #endif #endif /* _NET_DEVMEM_H */ diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 6efd4cccc9dd..212f0a048cab 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -1734,30 +1734,30 @@ static int __init init_net_drop_monitor(void) return -ENOSPC; } - rc = genl_register_family(&net_drop_monitor_family); - if (rc) { - pr_err("Could not create drop monitor netlink family\n"); - return rc; + for_each_possible_cpu(cpu) { + net_dm_cpu_data_init(cpu); + net_dm_hw_cpu_data_init(cpu); } - WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = register_netdevice_notifier(&dropmon_net_notifier); if (rc < 0) { pr_crit("Failed to register netdevice notifier\n"); + return rc; + } + + rc = genl_register_family(&net_drop_monitor_family); + if (rc) { + pr_err("Could not create drop monitor netlink family\n"); goto out_unreg; } + WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT); rc = 0; - for_each_possible_cpu(cpu) { - net_dm_cpu_data_init(cpu); - net_dm_hw_cpu_data_init(cpu); - } - goto out; out_unreg: - genl_unregister_family(&net_drop_monitor_family); + WARN_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); out: return rc; } @@ -1766,19 +1766,18 @@ static void exit_net_drop_monitor(void) { int cpu; - BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); - /* * Because of the module_get/put we do in the trace state change path * we are guaranteed not to have any current users when we get here */ + BUG_ON(genl_unregister_family(&net_drop_monitor_family)); + + BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier)); for_each_possible_cpu(cpu) { net_dm_hw_cpu_data_fini(cpu); net_dm_cpu_data_fini(cpu); } - - BUG_ON(genl_unregister_family(&net_drop_monitor_family)); } module_init(init_net_drop_monitor); diff --git a/net/core/dst.c b/net/core/dst.c index 9552a90d4772..c99b95cf9cbb 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -286,7 +286,8 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, { struct metadata_dst *md_dst; - md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); + md_dst = kmalloc(struct_size(md_dst, u.tun_info.options, optslen), + flags); if (!md_dst) return NULL; @@ -314,7 +315,8 @@ metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) int cpu; struct metadata_dst __percpu *md_dst; - md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen, + md_dst = __alloc_percpu_gfp(struct_size(md_dst, u.tun_info.options, + optslen), __alignof__(struct metadata_dst), flags); if (!md_dst) return NULL; diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index e684ba3ebb38..4bc64d912a1c 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -37,8 +37,8 @@ static const struct fib_kuid_range fib_kuid_range_unset = { bool fib_rule_matchall(const struct fib_rule *rule) { - if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id || - rule->flags) + if (READ_ONCE(rule->iifindex) || READ_ONCE(rule->oifindex) || + rule->mark || rule->tun_id || rule->flags) return false; if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) return false; @@ -261,12 +261,14 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, struct flowi *fl, int flags, struct fib_lookup_arg *arg) { - int ret = 0; + int iifindex, oifindex, ret = 0; - if (rule->iifindex && (rule->iifindex != fl->flowi_iif)) + iifindex = READ_ONCE(rule->iifindex); + if (iifindex && (iifindex != fl->flowi_iif)) goto out; - if (rule->oifindex && (rule->oifindex != fl->flowi_oif)) + oifindex = READ_ONCE(rule->oifindex); + if (oifindex && (oifindex != fl->flowi_oif)) goto out; if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) @@ -371,7 +373,8 @@ static int call_fib_rule_notifiers(struct net *net, .rule = rule, }; - ASSERT_RTNL(); + ASSERT_RTNL_NET(net); + /* Paired with READ_ONCE() in fib_rules_seq() */ WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1); return call_fib_notifiers(net, event_type, &info.info); @@ -459,9 +462,6 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops, if (rule->tun_id && r->tun_id != rule->tun_id) continue; - if (r->fr_net != rule->fr_net) - continue; - if (rule->l3mdev && r->l3mdev != rule->l3mdev) continue; @@ -481,11 +481,17 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops, &rule->sport_range)) continue; + if (rule->sport_mask && r->sport_mask != rule->sport_mask) + continue; + if (fib_rule_port_range_set(&rule->dport_range) && !fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; + if (rule->dport_mask && r->dport_mask != rule->dport_mask) + continue; + if (!ops->compare(r, frh, tb)) continue; return r; @@ -515,14 +521,40 @@ static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule, } #endif -static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, +static int fib_nl2rule_port_mask(const struct nlattr *mask_attr, + const struct fib_rule_port_range *range, + u16 *port_mask, + struct netlink_ext_ack *extack) +{ + if (!fib_rule_port_range_valid(range)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, + "Cannot specify port mask without port value"); + return -EINVAL; + } + + if (fib_rule_port_is_range(range)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, + "Cannot specify port mask for port range"); + return -EINVAL; + } + + if (range->start & ~nla_get_u16(mask_attr)) { + NL_SET_ERR_MSG_ATTR(extack, mask_attr, "Invalid port mask"); + return -EINVAL; + } + + *port_mask = nla_get_u16(mask_attr); + + return 0; +} + +static int fib_nl2rule(struct net *net, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct fib_rules_ops *ops, struct nlattr *tb[], struct fib_rule **rule, bool *user_priority) { - struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rule *nlrule = NULL; int err = -EINVAL; @@ -554,30 +586,18 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[FRA_PRIORITY]) { nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]); *user_priority = true; - } else { - nlrule->pref = fib_default_rule_pref(ops); } nlrule->proto = nla_get_u8_default(tb[FRA_PROTOCOL], RTPROT_UNSPEC); if (tb[FRA_IIFNAME]) { - struct net_device *dev; - nlrule->iifindex = -1; nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, nlrule->iifname); - if (dev) - nlrule->iifindex = dev->ifindex; } if (tb[FRA_OIFNAME]) { - struct net_device *dev; - nlrule->oifindex = -1; nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); - dev = __dev_get_by_name(net, nlrule->oifname); - if (dev) - nlrule->oifindex = dev->ifindex; } if (tb[FRA_FWMARK]) { @@ -619,11 +639,6 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, } nlrule->target = nla_get_u32(tb[FRA_GOTO]); - /* Backward jumps are prohibited to avoid endless loops */ - if (nlrule->target <= nlrule->pref) { - NL_SET_ERR_MSG(extack, "Backward goto not supported"); - goto errout_free; - } } else if (nlrule->action == FR_ACT_GOTO) { NL_SET_ERR_MSG(extack, "Missing goto target for action goto"); goto errout_free; @@ -662,6 +677,16 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, NL_SET_ERR_MSG(extack, "Invalid sport range"); goto errout_free; } + if (!fib_rule_port_is_range(&nlrule->sport_range)) + nlrule->sport_mask = U16_MAX; + } + + if (tb[FRA_SPORT_MASK]) { + err = fib_nl2rule_port_mask(tb[FRA_SPORT_MASK], + &nlrule->sport_range, + &nlrule->sport_mask, extack); + if (err) + goto errout_free; } if (tb[FRA_DPORT_RANGE]) { @@ -671,6 +696,16 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, NL_SET_ERR_MSG(extack, "Invalid dport range"); goto errout_free; } + if (!fib_rule_port_is_range(&nlrule->dport_range)) + nlrule->dport_mask = U16_MAX; + } + + if (tb[FRA_DPORT_MASK]) { + err = fib_nl2rule_port_mask(tb[FRA_DPORT_MASK], + &nlrule->dport_range, + &nlrule->dport_mask, extack); + if (err) + goto errout_free; } *rule = nlrule; @@ -683,6 +718,39 @@ errout: return err; } +static int fib_nl2rule_rtnl(struct fib_rule *nlrule, + struct fib_rules_ops *ops, + struct nlattr *tb[], + struct netlink_ext_ack *extack) +{ + if (!tb[FRA_PRIORITY]) + nlrule->pref = fib_default_rule_pref(ops); + + /* Backward jumps are prohibited to avoid endless loops */ + if (tb[FRA_GOTO] && nlrule->target <= nlrule->pref) { + NL_SET_ERR_MSG(extack, "Backward goto not supported"); + return -EINVAL; + } + + if (tb[FRA_IIFNAME]) { + struct net_device *dev; + + dev = __dev_get_by_name(nlrule->fr_net, nlrule->iifname); + if (dev) + nlrule->iifindex = dev->ifindex; + } + + if (tb[FRA_OIFNAME]) { + struct net_device *dev; + + dev = __dev_get_by_name(nlrule->fr_net, nlrule->oifname); + if (dev) + nlrule->oifindex = dev->ifindex; + } + + return 0; +} + static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, struct nlattr **tb, struct fib_rule *rule) { @@ -719,9 +787,6 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, if (r->tun_id != rule->tun_id) continue; - if (r->fr_net != rule->fr_net) - continue; - if (r->l3mdev != rule->l3mdev) continue; @@ -739,10 +804,16 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, &rule->sport_range)) continue; + if (r->sport_mask != rule->sport_mask) + continue; + if (!fib_rule_port_range_compare(&r->dport_range, &rule->dport_range)) continue; + if (r->dport_mask != rule->dport_mask) + continue; + if (!ops->compare(r, frh, tb)) continue; return 1; @@ -772,17 +843,19 @@ static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = { [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2), [FRA_FLOWLABEL] = { .type = NLA_BE32 }, [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 }, + [FRA_SPORT_MASK] = { .type = NLA_U16 }, + [FRA_DPORT_MASK] = { .type = NLA_U16 }, + [FRA_DSCP_MASK] = NLA_POLICY_MASK(NLA_U8, INET_DSCP_MASK >> 2), }; -int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held) { - struct net *net = sock_net(skb->sk); + struct fib_rule *rule = NULL, *r, *last = NULL; struct fib_rule_hdr *frh = nlmsg_data(nlh); + int err = -EINVAL, unresolved = 0; struct fib_rules_ops *ops = NULL; - struct fib_rule *rule = NULL, *r, *last = NULL; struct nlattr *tb[FRA_MAX + 1]; - int err = -EINVAL, unresolved = 0; bool user_priority = false; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { @@ -804,10 +877,17 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority); + err = fib_nl2rule(net, nlh, extack, ops, tb, &rule, &user_priority); if (err) goto errout; + if (!rtnl_held) + rtnl_net_lock(net); + + err = fib_nl2rule_rtnl(rule, ops, tb, extack); + if (err) + goto errout_free; + if ((nlh->nlmsg_flags & NLM_F_EXCL) && rule_exists(ops, frh, tb, rule)) { err = -EEXIST; @@ -869,29 +949,42 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, if (rule->tun_id) ip_tunnel_need_metadata(); + fib_rule_get(rule); + + if (!rtnl_held) + rtnl_net_unlock(net); + notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); + fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); return 0; errout_free: + if (!rtnl_held) + rtnl_net_unlock(net); kfree(rule); errout: rules_ops_put(ops); return err; } -EXPORT_SYMBOL_GPL(fib_nl_newrule); +EXPORT_SYMBOL_GPL(fib_newrule); -int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, - struct netlink_ext_ack *extack) +static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - struct net *net = sock_net(skb->sk); + return fib_newrule(sock_net(skb->sk), skb, nlh, extack, false); +} + +int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack, bool rtnl_held) +{ + struct fib_rule *rule = NULL, *nlrule = NULL; struct fib_rule_hdr *frh = nlmsg_data(nlh); struct fib_rules_ops *ops = NULL; - struct fib_rule *rule = NULL, *r, *nlrule = NULL; struct nlattr *tb[FRA_MAX+1]; - int err = -EINVAL; bool user_priority = false; + int err = -EINVAL; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) { NL_SET_ERR_MSG(extack, "Invalid msg length"); @@ -912,25 +1005,32 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; } - err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority); + err = fib_nl2rule(net, nlh, extack, ops, tb, &nlrule, &user_priority); if (err) goto errout; + if (!rtnl_held) + rtnl_net_lock(net); + + err = fib_nl2rule_rtnl(nlrule, ops, tb, extack); + if (err) + goto errout_free; + rule = rule_find(ops, frh, tb, nlrule, user_priority); if (!rule) { err = -ENOENT; - goto errout; + goto errout_free; } if (rule->flags & FIB_RULE_PERMANENT) { err = -EPERM; - goto errout; + goto errout_free; } if (ops->delete) { err = ops->delete(rule); if (err) - goto errout; + goto errout_free; } if (rule->tun_id) @@ -952,7 +1052,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, * current if it is goto rule, have actually been added. */ if (ops->nr_goto_rules > 0) { - struct fib_rule *n; + struct fib_rule *n, *r; n = list_next_entry(rule, list); if (&n->list == &ops->rules_list || n->pref != rule->pref) @@ -966,22 +1066,33 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, } } - call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, - NULL); - notify_rule_change(RTM_DELRULE, rule, ops, nlh, - NETLINK_CB(skb).portid); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL); + + if (!rtnl_held) + rtnl_net_unlock(net); + + notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid); fib_rule_put(rule); flush_route_cache(ops); rules_ops_put(ops); kfree(nlrule); return 0; -errout: +errout_free: + if (!rtnl_held) + rtnl_net_unlock(net); kfree(nlrule); +errout: rules_ops_put(ops); return err; } -EXPORT_SYMBOL_GPL(fib_nl_delrule); +EXPORT_SYMBOL_GPL(fib_delrule); + +static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + return fib_delrule(sock_net(skb->sk), skb, nlh, extack, false); +} static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, struct fib_rule *rule) @@ -1000,7 +1111,9 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(1) /* FRA_PROTOCOL */ + nla_total_size(1) /* FRA_IP_PROTO */ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */ - + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */ + + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_DPORT_RANGE */ + + nla_total_size(2) /* FRA_SPORT_MASK */ + + nla_total_size(2); /* FRA_DPORT_MASK */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -1041,14 +1154,14 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, if (rule->iifname[0]) { if (nla_put_string(skb, FRA_IIFNAME, rule->iifname)) goto nla_put_failure; - if (rule->iifindex == -1) + if (READ_ONCE(rule->iifindex) == -1) frh->flags |= FIB_RULE_IIF_DETACHED; } if (rule->oifname[0]) { if (nla_put_string(skb, FRA_OIFNAME, rule->oifname)) goto nla_put_failure; - if (rule->oifindex == -1) + if (READ_ONCE(rule->oifindex) == -1) frh->flags |= FIB_RULE_OIF_DETACHED; } @@ -1068,8 +1181,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, nla_put_uid_range(skb, &rule->uid_range)) || (fib_rule_port_range_set(&rule->sport_range) && nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) || + (rule->sport_mask && nla_put_u16(skb, FRA_SPORT_MASK, + rule->sport_mask)) || (fib_rule_port_range_set(&rule->dport_range) && nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) || + (rule->dport_mask && nla_put_u16(skb, FRA_DPORT_MASK, + rule->dport_mask)) || (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))) goto nla_put_failure; @@ -1220,10 +1337,10 @@ static void attach_rules(struct list_head *rules, struct net_device *dev) list_for_each_entry(rule, rules, list) { if (rule->iifindex == -1 && strcmp(dev->name, rule->iifname) == 0) - rule->iifindex = dev->ifindex; + WRITE_ONCE(rule->iifindex, dev->ifindex); if (rule->oifindex == -1 && strcmp(dev->name, rule->oifname) == 0) - rule->oifindex = dev->ifindex; + WRITE_ONCE(rule->oifindex, dev->ifindex); } } @@ -1233,9 +1350,9 @@ static void detach_rules(struct list_head *rules, struct net_device *dev) list_for_each_entry(rule, rules, list) { if (rule->iifindex == dev->ifindex) - rule->iifindex = -1; + WRITE_ONCE(rule->iifindex, -1); if (rule->oifindex == dev->ifindex) - rule->oifindex = -1; + WRITE_ONCE(rule->oifindex, -1); } } @@ -1293,8 +1410,10 @@ static struct pernet_operations fib_rules_net_ops = { }; static const struct rtnl_msg_handler fib_rules_rtnl_msg_handlers[] __initconst = { - {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule}, - {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule}, + {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule, + .flags = RTNL_FLAG_DOIT_PERNET}, + {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule, + .flags = RTNL_FLAG_DOIT_PERNET}, {.msgtype = RTM_GETRULE, .dumpit = fib_nl_dumprule, .flags = RTNL_FLAG_DUMP_UNLOCKED}, }; diff --git a/net/core/filter.c b/net/core/filter.c index 5b5996901ccc..a0867c5b32b3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5222,6 +5222,25 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt) +{ + u32 sk_bpf_cb_flags; + + if (getopt) { + *(u32 *)optval = sk->sk_bpf_cb_flags; + return 0; + } + + sk_bpf_cb_flags = *(u32 *)optval; + + if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK) + return -EINVAL; + + sk->sk_bpf_cb_flags = sk_bpf_cb_flags; + + return 0; +} + static int sol_socket_sockopt(struct sock *sk, int optname, char *optval, int *optlen, bool getopt) @@ -5238,6 +5257,7 @@ static int sol_socket_sockopt(struct sock *sk, int optname, case SO_MAX_PACING_RATE: case SO_BINDTOIFINDEX: case SO_TXREHASH: + case SK_BPF_CB_FLAGS: if (*optlen != sizeof(int)) return -EINVAL; break; @@ -5247,6 +5267,9 @@ static int sol_socket_sockopt(struct sock *sk, int optname, return -EINVAL; } + if (optname == SK_BPF_CB_FLAGS) + return sk_bpf_set_get_cb_flags(sk, optval, getopt); + if (getopt) { if (optname == SO_BINDTODEVICE) return -EINVAL; @@ -5382,6 +5405,7 @@ static int sol_tcp_sockopt(struct sock *sk, int optname, case TCP_USER_TIMEOUT: case TCP_NOTSENT_LOWAT: case TCP_SAVE_SYN: + case TCP_RTO_MAX_MS: if (*optlen != sizeof(int)) return -EINVAL; break; @@ -5500,6 +5524,11 @@ static int __bpf_setsockopt(struct sock *sk, int level, int optname, return -EINVAL; } +static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock) +{ + return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB; +} + static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen) { @@ -5650,6 +5679,9 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen); } @@ -5735,6 +5767,9 @@ static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP && optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) { int ret, copy_len = 0; @@ -5777,6 +5812,9 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, struct sock *sk = bpf_sock->sk; int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk)) return -EINVAL; @@ -7586,6 +7624,9 @@ BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock, u8 search_kind, search_len, copy_len, magic_len; int ret; + if (!is_locked_tcp_sock_ops(bpf_sock)) + return -EOPNOTSUPP; + /* 2 byte is the minimal option len except TCPOPT_NOP and * TCPOPT_EOL which are useless for the bpf prog to learn * and this helper disallow loading them also. @@ -7651,7 +7692,7 @@ static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, .arg3_type = ARG_CONST_SIZE, .arg4_type = ARG_ANYTHING, }; @@ -10358,10 +10399,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, } \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ - is_fullsock), \ + is_locked_tcp_sock), \ fullsock_reg, si->src_reg, \ offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ + is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \ if (si->dst_reg == si->src_reg) \ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \ @@ -10446,10 +10487,10 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, temp)); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, \ - is_fullsock), \ + is_locked_tcp_sock), \ reg, si->dst_reg, \ offsetof(struct bpf_sock_ops_kern, \ - is_fullsock)); \ + is_locked_tcp_sock)); \ *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ struct bpf_sock_ops_kern, sk),\ @@ -12062,6 +12103,25 @@ __bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk, #endif } +__bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, + u64 flags) +{ + struct sk_buff *skb; + + if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB) + return -EOPNOTSUPP; + + if (flags) + return -EINVAL; + + skb = skops->skb; + skb_shinfo(skb)->tx_flags |= SKBTX_BPF; + TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF; + skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; + + return 0; +} + __bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, @@ -12095,6 +12155,10 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk) BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk) +BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops) +BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops) + static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_skb, @@ -12115,6 +12179,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = { .set = &bpf_kfunc_check_set_tcp_reqsk, }; +static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_sock_ops, +}; + static int __init bpf_kfunc_init(void) { int ret; @@ -12133,7 +12202,8 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); - return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); + return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops); } late_initcall(bpf_kfunc_init); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0e638a37aa09..1b61bb25ba0e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -106,7 +106,7 @@ int flow_dissector_bpf_prog_attach_check(struct net *net, #endif /* CONFIG_BPF_SYSCALL */ /** - * __skb_flow_get_ports - extract the upper layer ports and return them + * skb_flow_get_ports - extract the upper layer ports and return them * @skb: sk_buff to extract the ports from * @thoff: transport header offset * @ip_proto: protocol for which to get port offset @@ -116,8 +116,8 @@ int flow_dissector_bpf_prog_attach_check(struct net *net, * The function will try to retrieve the ports at offset thoff + poff where poff * is the protocol port offset returned from proto_ports_offset */ -__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, - const void *data, int hlen) +__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, + const void *data, int hlen) { int poff = proto_ports_offset(ip_proto); @@ -137,7 +137,7 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, return 0; } -EXPORT_SYMBOL(__skb_flow_get_ports); +EXPORT_SYMBOL(skb_flow_get_ports); static bool icmp_has_id(u8 type) { @@ -853,23 +853,30 @@ __skb_flow_dissect_ports(const struct sk_buff *skb, void *target_container, const void *data, int nhoff, u8 ip_proto, int hlen) { - enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX; - struct flow_dissector_key_ports *key_ports; + struct flow_dissector_key_ports_range *key_ports_range = NULL; + struct flow_dissector_key_ports *key_ports = NULL; + __be32 ports; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) - dissector_ports = FLOW_DISSECTOR_KEY_PORTS; - else if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE)) - dissector_ports = FLOW_DISSECTOR_KEY_PORTS_RANGE; + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE)) + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); - if (dissector_ports == FLOW_DISSECTOR_KEY_MAX) + if (!key_ports && !key_ports_range) return; - key_ports = skb_flow_dissector_target(flow_dissector, - dissector_ports, - target_container); - key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto, - data, hlen); + ports = skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen); + + if (key_ports) + key_ports->ports = ports; + + if (key_ports_range) + key_ports_range->tp.ports = ports; } static void @@ -924,6 +931,7 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, struct flow_dissector *flow_dissector, void *target_container) { + struct flow_dissector_key_ports_range *key_ports_range = NULL; struct flow_dissector_key_ports *key_ports = NULL; struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; @@ -968,20 +976,21 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); - else if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE)) - key_ports = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS_RANGE, - target_container); - - if (key_ports) { key_ports->src = flow_keys->sport; key_ports->dst = flow_keys->dport; } + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE)) { + key_ports_range = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS_RANGE, + target_container); + key_ports_range->tp.src = flow_keys->sport; + key_ports_range->tp.dst = flow_keys->dport; + } if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL)) { @@ -1108,10 +1117,12 @@ bool __skb_flow_dissect(const struct net *net, FLOW_DISSECTOR_KEY_BASIC, target_container); + rcu_read_lock(); + if (skb) { if (!net) { if (skb->dev) - net = dev_net(skb->dev); + net = dev_net_rcu(skb->dev); else if (skb->sk) net = sock_net(skb->sk); } @@ -1122,7 +1133,6 @@ bool __skb_flow_dissect(const struct net *net, enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog_array *run_array; - rcu_read_lock(); run_array = rcu_dereference(init_net.bpf.run_array[type]); if (!run_array) run_array = rcu_dereference(net->bpf.run_array[type]); @@ -1150,17 +1160,17 @@ bool __skb_flow_dissect(const struct net *net, prog = READ_ONCE(run_array->items[0].prog); result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, hlen, flags); - if (result == BPF_FLOW_DISSECTOR_CONTINUE) - goto dissect_continue; - __skb_flow_bpf_to_target(&flow_keys, flow_dissector, - target_container); - rcu_read_unlock(); - return result == BPF_OK; + if (result != BPF_FLOW_DISSECTOR_CONTINUE) { + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, + target_container); + rcu_read_unlock(); + return result == BPF_OK; + } } -dissect_continue: - rcu_read_unlock(); } + rcu_read_unlock(); + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); diff --git a/net/core/gro.c b/net/core/gro.c index d1f44084e978..b350e5b69549 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -7,9 +7,6 @@ #define MAX_GRO_SKBS 8 -/* This should be increased if a protocol with a bigger head is added. */ -#define GRO_MAX_HEAD (MAX_HEADER + 128) - static DEFINE_SPINLOCK(offload_lock); /** @@ -253,8 +250,7 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) return 0; } - -static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) +static void gro_complete(struct gro_node *gro, struct sk_buff *skb) { struct list_head *head = &net_hotdata.offload_base; struct packet_offload *ptype; @@ -287,43 +283,43 @@ static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) } out: - gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count); + gro_normal_one(gro, skb, NAPI_GRO_CB(skb)->count); } -static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index, - bool flush_old) +static void __gro_flush_chain(struct gro_node *gro, u32 index, bool flush_old) { - struct list_head *head = &napi->gro_hash[index].list; + struct list_head *head = &gro->hash[index].list; struct sk_buff *skb, *p; list_for_each_entry_safe_reverse(skb, p, head, list) { if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) return; skb_list_del_init(skb); - napi_gro_complete(napi, skb); - napi->gro_hash[index].count--; + gro_complete(gro, skb); + gro->hash[index].count--; } - if (!napi->gro_hash[index].count) - __clear_bit(index, &napi->gro_bitmask); + if (!gro->hash[index].count) + __clear_bit(index, &gro->bitmask); } -/* napi->gro_hash[].list contains packets ordered by age. +/* + * gro->hash[].list contains packets ordered by age. * youngest packets at the head of it. * Complete skbs in reverse order to reduce latencies. */ -void napi_gro_flush(struct napi_struct *napi, bool flush_old) +void __gro_flush(struct gro_node *gro, bool flush_old) { - unsigned long bitmask = napi->gro_bitmask; + unsigned long bitmask = gro->bitmask; unsigned int i, base = ~0U; while ((i = ffs(bitmask)) != 0) { bitmask >>= i; base += i; - __napi_gro_flush_chain(napi, base, flush_old); + __gro_flush_chain(gro, base, flush_old); } } -EXPORT_SYMBOL(napi_gro_flush); +EXPORT_SYMBOL(__gro_flush); static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb, const struct sk_buff *p, @@ -442,7 +438,7 @@ static void gro_try_pull_from_frag0(struct sk_buff *skb) gro_pull_from_frag0(skb, grow); } -static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) +static void gro_flush_oldest(struct gro_node *gro, struct list_head *head) { struct sk_buff *oldest; @@ -458,14 +454,15 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) * SKB to the chain. */ skb_list_del_init(oldest); - napi_gro_complete(napi, oldest); + gro_complete(gro, oldest); } -static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +static enum gro_result dev_gro_receive(struct gro_node *gro, + struct sk_buff *skb) { u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); - struct gro_list *gro_list = &napi->gro_hash[bucket]; struct list_head *head = &net_hotdata.offload_base; + struct gro_list *gro_list = &gro->hash[bucket]; struct packet_offload *ptype; __be16 type = skb->protocol; struct sk_buff *pp = NULL; @@ -529,7 +526,7 @@ found_ptype: if (pp) { skb_list_del_init(pp); - napi_gro_complete(napi, pp); + gro_complete(gro, pp); gro_list->count--; } @@ -540,7 +537,7 @@ found_ptype: goto normal; if (unlikely(gro_list->count >= MAX_GRO_SKBS)) - gro_flush_oldest(napi, &gro_list->list); + gro_flush_oldest(gro, &gro_list->list); else gro_list->count++; @@ -554,10 +551,10 @@ found_ptype: ret = GRO_HELD; ok: if (gro_list->count) { - if (!test_bit(bucket, &napi->gro_bitmask)) - __set_bit(bucket, &napi->gro_bitmask); - } else if (test_bit(bucket, &napi->gro_bitmask)) { - __clear_bit(bucket, &napi->gro_bitmask); + if (!test_bit(bucket, &gro->bitmask)) + __set_bit(bucket, &gro->bitmask); + } else if (test_bit(bucket, &gro->bitmask)) { + __clear_bit(bucket, &gro->bitmask); } return ret; @@ -596,13 +593,12 @@ struct packet_offload *gro_find_complete_by_type(__be16 type) } EXPORT_SYMBOL(gro_find_complete_by_type); -static gro_result_t napi_skb_finish(struct napi_struct *napi, - struct sk_buff *skb, - gro_result_t ret) +static gro_result_t gro_skb_finish(struct gro_node *gro, struct sk_buff *skb, + gro_result_t ret) { switch (ret) { case GRO_NORMAL: - gro_normal_one(napi, skb, 1); + gro_normal_one(gro, skb, 1); break; case GRO_MERGED_FREE: @@ -623,21 +619,21 @@ static gro_result_t napi_skb_finish(struct napi_struct *napi, return ret; } -gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) +gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb) { gro_result_t ret; - skb_mark_napi_id(skb, napi); + __skb_mark_napi_id(skb, gro); trace_napi_gro_receive_entry(skb); skb_gro_reset_offset(skb, 0); - ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); + ret = gro_skb_finish(gro, skb, dev_gro_receive(gro, skb)); trace_napi_gro_receive_exit(ret); return ret; } -EXPORT_SYMBOL(napi_gro_receive); +EXPORT_SYMBOL(gro_receive_skb); static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { @@ -656,6 +652,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->pkt_type = PACKET_HOST; skb->encapsulation = 0; + skb->ip_summed = CHECKSUM_NONE; skb_shinfo(skb)->gso_type = 0; skb_shinfo(skb)->gso_size = 0; if (unlikely(skb->slow_gro)) { @@ -693,7 +690,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi, __skb_push(skb, ETH_HLEN); skb->protocol = eth_type_trans(skb, skb->dev); if (ret == GRO_NORMAL) - gro_normal_one(napi, skb, 1); + gro_normal_one(&napi->gro, skb, 1); break; case GRO_MERGED_FREE: @@ -762,7 +759,7 @@ gro_result_t napi_gro_frags(struct napi_struct *napi) trace_napi_gro_frags_entry(skb); - ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); + ret = napi_frags_finish(napi, skb, dev_gro_receive(&napi->gro, skb)); trace_napi_gro_frags_exit(ret); return ret; @@ -794,3 +791,37 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) return sum; } EXPORT_SYMBOL(__skb_gro_checksum_complete); + +void gro_init(struct gro_node *gro) +{ + for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) { + INIT_LIST_HEAD(&gro->hash[i].list); + gro->hash[i].count = 0; + } + + gro->bitmask = 0; + gro->cached_napi_id = 0; + + INIT_LIST_HEAD(&gro->rx_list); + gro->rx_count = 0; +} + +void gro_cleanup(struct gro_node *gro) +{ + struct sk_buff *skb, *n; + + for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) { + list_for_each_entry_safe(skb, n, &gro->hash[i].list, list) + kfree_skb(skb); + + gro->hash[i].count = 0; + } + + gro->bitmask = 0; + gro->cached_napi_id = 0; + + list_for_each_entry_safe(skb, n, &gro->rx_list, list) + kfree_skb(skb); + + gro->rx_count = 0; +} diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 89656d180bc6..344c9cd168ec 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -518,7 +518,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift) if (!ret) return NULL; - hash_heads = kvzalloc(size, GFP_ATOMIC); + hash_heads = kzalloc(size, GFP_ATOMIC); if (!hash_heads) { kfree(ret); return NULL; @@ -536,7 +536,7 @@ static void neigh_hash_free_rcu(struct rcu_head *head) struct neigh_hash_table, rcu); - kvfree(nht->hash_heads); + kfree(nht->hash_heads); kfree(nht); } @@ -832,12 +832,10 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, return -ENOENT; } -static void neigh_parms_destroy(struct neigh_parms *parms); - static inline void neigh_parms_put(struct neigh_parms *parms) { if (refcount_dec_and_test(&parms->refcnt)) - neigh_parms_destroy(parms); + kfree(parms); } /* @@ -1713,11 +1711,6 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) } EXPORT_SYMBOL(neigh_parms_release); -static void neigh_parms_destroy(struct neigh_parms *parms) -{ - kfree(parms); -} - static struct lock_class_key neigh_table_proxy_queue_class; static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly; @@ -3447,10 +3440,12 @@ static const struct seq_operations neigh_stat_seq_ops = { static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid) { - struct net *net = dev_net(n->dev); struct sk_buff *skb; int err = -ENOBUFS; + struct net *net; + rcu_read_lock(); + net = dev_net_rcu(n->dev); skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC); if (skb == NULL) goto errout; @@ -3463,9 +3458,11 @@ static void __neigh_notify(struct neighbour *n, int type, int flags, goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); - return; + goto out; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); +out: + rcu_read_unlock(); } void neigh_app_ns(struct neighbour *n) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 07cb99b114bd..8d9dc048a548 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -42,6 +42,87 @@ static inline int dev_isalive(const struct net_device *dev) return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED; } +/* There is a possible ABBA deadlock between rtnl_lock and kernfs_node->active, + * when unregistering a net device and accessing associated sysfs files. The + * potential deadlock is as follow: + * + * CPU 0 CPU 1 + * + * rtnl_lock vfs_read + * unregister_netdevice_many kernfs_seq_start + * device_del / kobject_put kernfs_get_active (kn->active++) + * kernfs_drain sysfs_kf_seq_show + * wait_event( rtnl_lock + * kn->active == KN_DEACTIVATED_BIAS) -> waits on CPU 0 to release + * -> waits on CPU 1 to decrease kn->active the rtnl lock. + * + * The historical fix was to use rtnl_trylock with restart_syscall to bail out + * of sysfs operations when the lock couldn't be taken. This fixed the above + * issue as it allowed CPU 1 to bail out of the ABBA situation. + * + * But it came with performances issues, as syscalls are being restarted in + * loops when there was contention on the rtnl lock, with huge slow downs in + * specific scenarios (e.g. lots of virtual interfaces created and userspace + * daemons querying their attributes). + * + * The idea below is to bail out of the active kernfs_node protection + * (kn->active) while trying to take the rtnl lock. + * + * This replaces rtnl_lock() and still has to be used with rtnl_unlock(). The + * net device is guaranteed to be alive if this returns successfully. + */ +static int sysfs_rtnl_lock(struct kobject *kobj, struct attribute *attr, + struct net_device *ndev) +{ + struct kernfs_node *kn; + int ret = 0; + + /* First, we hold a reference to the net device as the unregistration + * path might run in parallel. This will ensure the net device and the + * associated sysfs objects won't be freed while we try to take the rtnl + * lock. + */ + dev_hold(ndev); + /* sysfs_break_active_protection was introduced to allow self-removal of + * devices and their associated sysfs files by bailing out of the + * sysfs/kernfs protection. We do this here to allow the unregistration + * path to complete in parallel. The following takes a reference on the + * kobject and the kernfs_node being accessed. + * + * This works because we hold a reference onto the net device and the + * unregistration path will wait for us eventually in netdev_run_todo + * (outside an rtnl lock section). + */ + kn = sysfs_break_active_protection(kobj, attr); + /* We can now try to take the rtnl lock. This can't deadlock us as the + * unregistration path is able to drain sysfs files (kernfs_node) thanks + * to the above dance. + */ + if (rtnl_lock_interruptible()) { + ret = -ERESTARTSYS; + goto unbreak; + } + /* Check dismantle on the device hasn't started, otherwise deny the + * operation. + */ + if (!dev_isalive(ndev)) { + rtnl_unlock(); + ret = -ENODEV; + goto unbreak; + } + /* We are now sure the device dismantle hasn't started nor that it can + * start before we exit the locking section as we hold the rtnl lock. + * There's no need to keep unbreaking the sysfs protection nor to hold + * a net device reference from that point; that was only needed to take + * the rtnl lock. + */ +unbreak: + sysfs_unbreak_active_protection(kn); + dev_put(ndev); + + return ret; +} + /* use same locking rules as GIF* ioctl's */ static ssize_t netdev_show(const struct device *dev, struct device_attribute *attr, char *buf, @@ -95,14 +176,14 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, if (ret) goto err; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + goto err; + + ret = (*set)(netdev, new); + if (ret == 0) + ret = len; - if (dev_isalive(netdev)) { - ret = (*set)(netdev, new); - if (ret == 0) - ret = len; - } rtnl_unlock(); err: return ret; @@ -220,7 +301,7 @@ static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, struct net_device *netdev = to_net_dev(dev); /* The check is also done in change_carrier; this helps returning early - * without hitting the trylock/restart in netdev_store. + * without hitting the locking section in netdev_store. */ if (!netdev->netdev_ops->ndo_change_carrier) return -EOPNOTSUPP; @@ -232,11 +313,13 @@ static ssize_t carrier_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); - int ret = -EINVAL; + int ret; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; + ret = -EINVAL; if (netif_running(netdev)) { /* Synchronize carrier state with link watch, * see also rtnl_getlink(). @@ -245,8 +328,8 @@ static ssize_t carrier_show(struct device *dev, ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); } - rtnl_unlock(); + rtnl_unlock(); return ret; } static DEVICE_ATTR_RW(carrier); @@ -258,14 +341,16 @@ static ssize_t speed_show(struct device *dev, int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps - * returning early without hitting the trylock/restart below. + * returning early without hitting the locking section below. */ if (!netdev->ethtool_ops->get_link_ksettings) return ret; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; + ret = -EINVAL; if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; @@ -284,14 +369,16 @@ static ssize_t duplex_show(struct device *dev, int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps - * returning early without hitting the trylock/restart below. + * returning early without hitting the locking section below. */ if (!netdev->ethtool_ops->get_link_ksettings) return ret; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; + ret = -EINVAL; if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; @@ -481,7 +568,7 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); size_t count = len; - ssize_t ret = 0; + ssize_t ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -490,16 +577,15 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, if (len > 0 && buf[len - 1] == '\n') --count; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - ret = dev_set_alias(netdev, buf, count); - if (ret < 0) - goto err; - ret = len; - netdev_state_change(netdev); - } + ret = dev_set_alias(netdev, buf, count); + if (ret < 0) + goto err; + ret = len; + netdev_state_change(netdev); err: rtnl_unlock(); @@ -511,7 +597,7 @@ static ssize_t ifalias_show(struct device *dev, { const struct net_device *netdev = to_net_dev(dev); char tmp[IFALIASZ]; - ssize_t ret = 0; + ssize_t ret; ret = dev_get_alias(netdev, tmp, sizeof(tmp)); if (ret > 0) @@ -551,24 +637,23 @@ static ssize_t phys_port_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); - ssize_t ret = -EINVAL; + struct netdev_phys_item_id ppid; + ssize_t ret; /* The check is also done in dev_get_phys_port_id; this helps returning - * early without hitting the trylock/restart below. + * early without hitting the locking section below. */ if (!netdev->netdev_ops->ndo_get_phys_port_id) return -EOPNOTSUPP; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - struct netdev_phys_item_id ppid; + ret = dev_get_phys_port_id(netdev, &ppid); + if (!ret) + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - ret = dev_get_phys_port_id(netdev, &ppid); - if (!ret) - ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - } rtnl_unlock(); return ret; @@ -579,25 +664,24 @@ static ssize_t phys_port_name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); - ssize_t ret = -EINVAL; + char name[IFNAMSIZ]; + ssize_t ret; /* The checks are also done in dev_get_phys_port_name; this helps - * returning early without hitting the trylock/restart below. + * returning early without hitting the locking section below. */ if (!netdev->netdev_ops->ndo_get_phys_port_name && !netdev->devlink_port) return -EOPNOTSUPP; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - char name[IFNAMSIZ]; + ret = dev_get_phys_port_name(netdev, name, sizeof(name)); + if (!ret) + ret = sysfs_emit(buf, "%s\n", name); - ret = dev_get_phys_port_name(netdev, name, sizeof(name)); - if (!ret) - ret = sysfs_emit(buf, "%s\n", name); - } rtnl_unlock(); return ret; @@ -608,26 +692,25 @@ static ssize_t phys_switch_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); - ssize_t ret = -EINVAL; + struct netdev_phys_item_id ppid = { }; + ssize_t ret; /* The checks are also done in dev_get_phys_port_name; this helps - * returning early without hitting the trylock/restart below. This works + * returning early without hitting the locking section below. This works * because recurse is false when calling dev_get_port_parent_id. */ if (!netdev->netdev_ops->ndo_get_port_parent_id && !netdev->devlink_port) return -EOPNOTSUPP; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); + if (ret) + return ret; - if (dev_isalive(netdev)) { - struct netdev_phys_item_id ppid = { }; + ret = dev_get_port_parent_id(netdev, &ppid, false); + if (!ret) + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - ret = dev_get_port_parent_id(netdev, &ppid, false); - if (!ret) - ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); - } rtnl_unlock(); return ret; @@ -1108,7 +1191,6 @@ static void rx_queue_get_ownership(const struct kobject *kobj, static const struct kobj_type rx_queue_ktype = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, - .default_groups = rx_queue_default_groups, .namespace = rx_queue_namespace, .get_ownership = rx_queue_get_ownership, }; @@ -1131,6 +1213,22 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) struct kobject *kobj = &queue->kobj; int error = 0; + /* Rx queues are cleared in rx_queue_release to allow later + * re-registration. This is triggered when their kobj refcount is + * dropped. + * + * If a queue is removed while both a read (or write) operation and a + * the re-addition of the same queue are pending (waiting on rntl_lock) + * it might happen that the re-addition will execute before the read, + * making the initial removal to never happen (queue's kobj refcount + * won't drop enough because of the pending read). In such rare case, + * return to allow the removal operation to complete. + */ + if (unlikely(kobj->state_initialized)) { + netdev_warn_once(dev, "Cannot re-add rx queues before their removal completed"); + return -EAGAIN; + } + /* Kobject_put later will trigger rx_queue_release call which * decreases dev refcount: Take that reference here */ @@ -1142,20 +1240,27 @@ static int rx_queue_add_kobject(struct net_device *dev, int index) if (error) goto err; + queue->groups = rx_queue_default_groups; + error = sysfs_create_groups(kobj, queue->groups); + if (error) + goto err; + if (dev->sysfs_rx_queue_group) { error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); if (error) - goto err; + goto err_default_groups; } error = rx_queue_default_mask(dev, queue); if (error) - goto err; + goto err_default_groups; kobject_uevent(kobj, KOBJ_ADD); return error; +err_default_groups: + sysfs_remove_groups(kobj, queue->groups); err: kobject_put(kobj); return error; @@ -1200,12 +1305,14 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) } while (--i >= new_num) { - struct kobject *kobj = &dev->_rx[i].kobj; + struct netdev_rx_queue *queue = &dev->_rx[i]; + struct kobject *kobj = &queue->kobj; if (!refcount_read(&dev_net(dev)->ns.count)) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); + sysfs_remove_groups(kobj, queue->groups); kobject_put(kobj); } @@ -1244,9 +1351,11 @@ static int net_rx_queue_change_owner(struct net_device *dev, int num, */ struct netdev_queue_attribute { struct attribute attr; - ssize_t (*show)(struct netdev_queue *queue, char *buf); - ssize_t (*store)(struct netdev_queue *queue, - const char *buf, size_t len); + ssize_t (*show)(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf); + ssize_t (*store)(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len); }; #define to_netdev_queue_attr(_attr) \ container_of(_attr, struct netdev_queue_attribute, attr) @@ -1263,7 +1372,7 @@ static ssize_t netdev_queue_attr_show(struct kobject *kobj, if (!attribute->show) return -EIO; - return attribute->show(queue, buf); + return attribute->show(kobj, attr, queue, buf); } static ssize_t netdev_queue_attr_store(struct kobject *kobj, @@ -1277,7 +1386,7 @@ static ssize_t netdev_queue_attr_store(struct kobject *kobj, if (!attribute->store) return -EIO; - return attribute->store(queue, buf, count); + return attribute->store(kobj, attr, queue, buf, count); } static const struct sysfs_ops netdev_queue_sysfs_ops = { @@ -1285,7 +1394,8 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = { .store = netdev_queue_attr_store, }; -static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf) +static ssize_t tx_timeout_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout); @@ -1303,18 +1413,18 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue) return i; } -static ssize_t traffic_class_show(struct netdev_queue *queue, - char *buf) +static ssize_t traffic_class_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; - int num_tc, tc; - int index; + int num_tc, tc, index, ret; if (!netif_is_multiqueue(dev)) return -ENOENT; - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, queue->dev); + if (ret) + return ret; index = get_netdev_queue_index(queue); @@ -1341,24 +1451,25 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, } #ifdef CONFIG_XPS -static ssize_t tx_maxrate_show(struct netdev_queue *queue, - char *buf) +static ssize_t tx_maxrate_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { return sysfs_emit(buf, "%lu\n", queue->tx_maxrate); } -static ssize_t tx_maxrate_store(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t tx_maxrate_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { - struct net_device *dev = queue->dev; int err, index = get_netdev_queue_index(queue); + struct net_device *dev = queue->dev; u32 rate = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; /* The check is also done later; this helps returning early without - * hitting the trylock/restart below. + * hitting the locking section below. */ if (!dev->netdev_ops->ndo_set_tx_maxrate) return -EOPNOTSUPP; @@ -1367,18 +1478,21 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue, if (err < 0) return err; - if (!rtnl_trylock()) - return restart_syscall(); + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) + return err; err = -EOPNOTSUPP; if (dev->netdev_ops->ndo_set_tx_maxrate) err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate); - rtnl_unlock(); if (!err) { queue->tx_maxrate = rate; + rtnl_unlock(); return len; } + + rtnl_unlock(); return err; } @@ -1422,16 +1536,17 @@ static ssize_t bql_set(const char *buf, const size_t count, return count; } -static ssize_t bql_show_hold_time(struct netdev_queue *queue, - char *buf) +static ssize_t bql_show_hold_time(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); } -static ssize_t bql_set_hold_time(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t bql_set_hold_time(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct dql *dql = &queue->dql; unsigned int value; @@ -1450,15 +1565,17 @@ static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init = __ATTR(hold_time, 0644, bql_show_hold_time, bql_set_hold_time); -static ssize_t bql_show_stall_thrs(struct netdev_queue *queue, char *buf) +static ssize_t bql_show_stall_thrs(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs)); } -static ssize_t bql_set_stall_thrs(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t bql_set_stall_thrs(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct dql *dql = &queue->dql; unsigned int value; @@ -1484,13 +1601,15 @@ static ssize_t bql_set_stall_thrs(struct netdev_queue *queue, static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init = __ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs); -static ssize_t bql_show_stall_max(struct netdev_queue *queue, char *buf) +static ssize_t bql_show_stall_max(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max)); } -static ssize_t bql_set_stall_max(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t bql_set_stall_max(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { WRITE_ONCE(queue->dql.stall_max, 0); return len; @@ -1499,7 +1618,8 @@ static ssize_t bql_set_stall_max(struct netdev_queue *queue, static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init = __ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max); -static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf) +static ssize_t bql_show_stall_cnt(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; @@ -1509,8 +1629,8 @@ static ssize_t bql_show_stall_cnt(struct netdev_queue *queue, char *buf) static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init = __ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL); -static ssize_t bql_show_inflight(struct netdev_queue *queue, - char *buf) +static ssize_t bql_show_inflight(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; @@ -1521,13 +1641,16 @@ static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = __ATTR(inflight, 0444, bql_show_inflight, NULL); #define BQL_ATTR(NAME, FIELD) \ -static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ - char *buf) \ +static ssize_t bql_show_ ## NAME(struct kobject *kobj, \ + struct attribute *attr, \ + struct netdev_queue *queue, char *buf) \ { \ return bql_show(buf, queue->dql.FIELD); \ } \ \ -static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ +static ssize_t bql_set_ ## NAME(struct kobject *kobj, \ + struct attribute *attr, \ + struct netdev_queue *queue, \ const char *buf, size_t len) \ { \ return bql_set(buf, len, &queue->dql.FIELD); \ @@ -1613,19 +1736,21 @@ out_no_maps: return len < PAGE_SIZE ? len : -EINVAL; } -static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) +static ssize_t xps_cpus_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; - int len, tc; + int len, tc, ret; if (!netif_is_multiqueue(dev)) return -ENOENT; index = get_netdev_queue_index(queue); - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, queue->dev); + if (ret) + return ret; /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; @@ -1636,18 +1761,21 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) return -EINVAL; } - /* Make sure the subordinate device can't be freed */ - get_device(&dev->dev); + /* Increase the net device refcnt to make sure it won't be freed while + * xps_queue_show is running. + */ + dev_hold(dev); rtnl_unlock(); len = xps_queue_show(dev, index, tc, buf, XPS_CPUS); - put_device(&dev->dev); + dev_put(dev); return len; } -static ssize_t xps_cpus_store(struct netdev_queue *queue, - const char *buf, size_t len) +static ssize_t xps_cpus_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, + size_t len) { struct net_device *dev = queue->dev; unsigned int index; @@ -1671,9 +1799,10 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, return err; } - if (!rtnl_trylock()) { + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) { free_cpumask_var(mask); - return restart_syscall(); + return err; } err = netif_set_xps_queue(dev, mask, index); @@ -1687,26 +1816,34 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init = __ATTR_RW(xps_cpus); -static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) +static ssize_t xps_rxqs_show(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; - int tc; + int tc, ret; index = get_netdev_queue_index(queue); - if (!rtnl_trylock()) - return restart_syscall(); + ret = sysfs_rtnl_lock(kobj, attr, dev); + if (ret) + return ret; tc = netdev_txq_to_tc(dev, index); + + /* Increase the net device refcnt to make sure it won't be freed while + * xps_queue_show is running. + */ + dev_hold(dev); rtnl_unlock(); - if (tc < 0) - return -EINVAL; - return xps_queue_show(dev, index, tc, buf, XPS_RXQS); + ret = tc >= 0 ? xps_queue_show(dev, index, tc, buf, XPS_RXQS) : -EINVAL; + dev_put(dev); + return ret; } -static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, +static ssize_t xps_rxqs_store(struct kobject *kobj, struct attribute *attr, + struct netdev_queue *queue, const char *buf, size_t len) { struct net_device *dev = queue->dev; @@ -1730,9 +1867,10 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, return err; } - if (!rtnl_trylock()) { + err = sysfs_rtnl_lock(kobj, attr, dev); + if (err) { bitmap_free(mask); - return restart_syscall(); + return err; } cpus_read_lock(); @@ -1792,7 +1930,6 @@ static void netdev_queue_get_ownership(const struct kobject *kobj, static const struct kobj_type netdev_queue_ktype = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, - .default_groups = netdev_queue_default_groups, .namespace = netdev_queue_namespace, .get_ownership = netdev_queue_get_ownership, }; @@ -1811,6 +1948,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) struct kobject *kobj = &queue->kobj; int error = 0; + /* Tx queues are cleared in netdev_queue_release to allow later + * re-registration. This is triggered when their kobj refcount is + * dropped. + * + * If a queue is removed while both a read (or write) operation and a + * the re-addition of the same queue are pending (waiting on rntl_lock) + * it might happen that the re-addition will execute before the read, + * making the initial removal to never happen (queue's kobj refcount + * won't drop enough because of the pending read). In such rare case, + * return to allow the removal operation to complete. + */ + if (unlikely(kobj->state_initialized)) { + netdev_warn_once(dev, "Cannot re-add tx queues before their removal completed"); + return -EAGAIN; + } + /* Kobject_put later will trigger netdev_queue_release call * which decreases dev refcount: Take that reference here */ @@ -1822,15 +1975,22 @@ static int netdev_queue_add_kobject(struct net_device *dev, int index) if (error) goto err; + queue->groups = netdev_queue_default_groups; + error = sysfs_create_groups(kobj, queue->groups); + if (error) + goto err; + if (netdev_uses_bql(dev)) { error = sysfs_create_group(kobj, &dql_group); if (error) - goto err; + goto err_default_groups; } kobject_uevent(kobj, KOBJ_ADD); return 0; +err_default_groups: + sysfs_remove_groups(kobj, queue->groups); err: kobject_put(kobj); return error; @@ -1885,6 +2045,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) if (netdev_uses_bql(dev)) sysfs_remove_group(&queue->kobj, &dql_group); + sysfs_remove_groups(&queue->kobj, queue->groups); kobject_put(&queue->kobj); } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index cb39a12b2f82..4303f2a49262 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -464,7 +464,7 @@ static void net_complete_free(void) } -static void net_free(struct net *net) +void net_passive_dec(struct net *net) { if (refcount_dec_and_test(&net->passive)) { kfree(rcu_access_pointer(net->gen)); @@ -482,7 +482,7 @@ void net_drop_ns(void *p) struct net *net = (struct net *)p; if (net) - net_free(net); + net_passive_dec(net); } struct net *copy_net_ns(unsigned long flags, @@ -523,7 +523,7 @@ put_userns: key_remove_domain(net->key_domain); #endif put_user_ns(user_ns); - net_free(net); + net_passive_dec(net); dec_ucounts: dec_net_namespaces(ucounts); return ERR_PTR(rv); @@ -672,7 +672,7 @@ static void cleanup_net(struct work_struct *work) key_remove_domain(net->key_domain); #endif put_user_ns(net->user_ns); - net_free(net); + net_passive_dec(net); } cleanup_net_task = NULL; } diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 715f85c6b62e..2b774183d31c 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -10,6 +10,7 @@ #include <net/sock.h> #include <net/xdp.h> #include <net/xdp_sock.h> +#include <net/page_pool/memory_provider.h> #include "dev.h" #include "devmem.h" @@ -52,6 +53,8 @@ XDP_METADATA_KFUNC_xxx xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP; if (netdev->xsk_tx_metadata_ops->tmo_request_checksum) xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM; + if (netdev->xsk_tx_metadata_ops->tmo_request_launch_time) + xsk_features |= NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO; } if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) || @@ -266,7 +269,7 @@ netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, prev_id = UINT_MAX; list_for_each_entry(napi, &netdev->napi_list, dev_list) { - if (napi->napi_id < MIN_NAPI_ID) + if (!napi_id_valid(napi->napi_id)) continue; /* Dump continuation below depends on the list being sorted */ @@ -364,11 +367,18 @@ int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) return err; } +static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi) +{ + if (napi && napi_id_valid(napi->napi_id)) + return nla_put_u32(skb, NETDEV_A_QUEUE_NAPI_ID, napi->napi_id); + return 0; +} + static int netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { - struct net_devmem_dmabuf_binding *binding; + struct pp_memory_provider_params *params; struct netdev_rx_queue *rxq; struct netdev_queue *txq; void *hdr; @@ -385,21 +395,30 @@ netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, switch (q_type) { case NETDEV_QUEUE_TYPE_RX: rxq = __netif_get_rx_queue(netdev, q_idx); - if (rxq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, - rxq->napi->napi_id)) + if (nla_put_napi_id(rsp, rxq->napi)) goto nla_put_failure; - binding = rxq->mp_params.mp_priv; - if (binding && - nla_put_u32(rsp, NETDEV_A_QUEUE_DMABUF, binding->id)) + params = &rxq->mp_params; + if (params->mp_ops && + params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) goto nla_put_failure; +#ifdef CONFIG_XDP_SOCKETS + if (rxq->pool) + if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) + goto nla_put_failure; +#endif break; case NETDEV_QUEUE_TYPE_TX: txq = netdev_get_tx_queue(netdev, q_idx); - if (txq->napi && nla_put_u32(rsp, NETDEV_A_QUEUE_NAPI_ID, - txq->napi->napi_id)) + if (nla_put_napi_id(rsp, txq->napi)) goto nla_put_failure; +#ifdef CONFIG_XDP_SOCKETS + if (txq->pool) + if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) + goto nla_put_failure; +#endif + break; } genlmsg_end(rsp, hdr); @@ -576,6 +595,7 @@ netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx) netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) || + netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_COMPLETE, rx->csum_complete) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) || netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) || diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index db82786fa0c4..ddd54e1e7289 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -3,34 +3,34 @@ #include <linux/netdevice.h> #include <net/netdev_queues.h> #include <net/netdev_rx_queue.h> +#include <net/page_pool/memory_provider.h> #include "page_pool_priv.h" int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) { struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx); + const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops; void *new_mem, *old_mem; int err; - if (!dev->queue_mgmt_ops || !dev->queue_mgmt_ops->ndo_queue_stop || - !dev->queue_mgmt_ops->ndo_queue_mem_free || - !dev->queue_mgmt_ops->ndo_queue_mem_alloc || - !dev->queue_mgmt_ops->ndo_queue_start) + if (!qops || !qops->ndo_queue_stop || !qops->ndo_queue_mem_free || + !qops->ndo_queue_mem_alloc || !qops->ndo_queue_start) return -EOPNOTSUPP; ASSERT_RTNL(); - new_mem = kvzalloc(dev->queue_mgmt_ops->ndo_queue_mem_size, GFP_KERNEL); + new_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL); if (!new_mem) return -ENOMEM; - old_mem = kvzalloc(dev->queue_mgmt_ops->ndo_queue_mem_size, GFP_KERNEL); + old_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL); if (!old_mem) { err = -ENOMEM; goto err_free_new_mem; } - err = dev->queue_mgmt_ops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx); + err = qops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx); if (err) goto err_free_old_mem; @@ -38,15 +38,19 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) if (err) goto err_free_new_queue_mem; - err = dev->queue_mgmt_ops->ndo_queue_stop(dev, old_mem, rxq_idx); - if (err) - goto err_free_new_queue_mem; + if (netif_running(dev)) { + err = qops->ndo_queue_stop(dev, old_mem, rxq_idx); + if (err) + goto err_free_new_queue_mem; - err = dev->queue_mgmt_ops->ndo_queue_start(dev, new_mem, rxq_idx); - if (err) - goto err_start_queue; + err = qops->ndo_queue_start(dev, new_mem, rxq_idx); + if (err) + goto err_start_queue; + } else { + swap(new_mem, old_mem); + } - dev->queue_mgmt_ops->ndo_queue_mem_free(dev, old_mem); + qops->ndo_queue_mem_free(dev, old_mem); kvfree(old_mem); kvfree(new_mem); @@ -61,15 +65,15 @@ err_start_queue: * WARN if we fail to recover the old rx queue, and at least free * old_mem so we don't also leak that. */ - if (dev->queue_mgmt_ops->ndo_queue_start(dev, old_mem, rxq_idx)) { + if (qops->ndo_queue_start(dev, old_mem, rxq_idx)) { WARN(1, "Failed to restart old queue in error path. RX queue %d may be unhealthy.", rxq_idx); - dev->queue_mgmt_ops->ndo_queue_mem_free(dev, old_mem); + qops->ndo_queue_mem_free(dev, old_mem); } err_free_new_queue_mem: - dev->queue_mgmt_ops->ndo_queue_mem_free(dev, new_mem); + qops->ndo_queue_mem_free(dev, new_mem); err_free_old_mem: kvfree(old_mem); @@ -80,3 +84,71 @@ err_free_new_mem: return err; } EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL"); + +static int __net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *p) +{ + struct netdev_rx_queue *rxq; + int ret; + + if (ifq_idx >= dev->real_num_rx_queues) + return -EINVAL; + ifq_idx = array_index_nospec(ifq_idx, dev->real_num_rx_queues); + + rxq = __netif_get_rx_queue(dev, ifq_idx); + if (rxq->mp_params.mp_ops) + return -EEXIST; + + rxq->mp_params = *p; + ret = netdev_rx_queue_restart(dev, ifq_idx); + if (ret) { + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + } + return ret; +} + +int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *p) +{ + int ret; + + rtnl_lock(); + ret = __net_mp_open_rxq(dev, ifq_idx, p); + rtnl_unlock(); + return ret; +} + +static void __net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p) +{ + struct netdev_rx_queue *rxq; + + if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) + return; + + rxq = __netif_get_rx_queue(dev, ifq_idx); + + /* Callers holding a netdev ref may get here after we already + * went thru shutdown via dev_memory_provider_uninstall(). + */ + if (dev->reg_state > NETREG_REGISTERED && + !rxq->mp_params.mp_ops) + return; + + if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops || + rxq->mp_params.mp_priv != old_p->mp_priv)) + return; + + rxq->mp_params.mp_ops = NULL; + rxq->mp_params.mp_priv = NULL; + WARN_ON(netdev_rx_queue_restart(dev, ifq_idx)); +} + +void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, + struct pp_memory_provider_params *old_p) +{ + rtnl_lock(); + __net_mp_close_rxq(dev, ifq_idx, old_p); + rtnl_unlock(); +} diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a3de752c5178..acef1fcd8ddc 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -13,6 +13,7 @@ #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> #include <net/xdp.h> #include <linux/dma-direction.h> @@ -25,6 +26,7 @@ #include <trace/events/page_pool.h> +#include "dev.h" #include "mp_dmabuf_devmem.h" #include "netmem_priv.h" #include "page_pool_priv.h" @@ -285,13 +287,19 @@ static int page_pool_init(struct page_pool *pool, rxq = __netif_get_rx_queue(pool->slow.netdev, pool->slow.queue_idx); pool->mp_priv = rxq->mp_params.mp_priv; + pool->mp_ops = rxq->mp_params.mp_ops; } - if (pool->mp_priv) { + if (pool->mp_ops) { if (!pool->dma_map || !pool->dma_sync) return -EOPNOTSUPP; - err = mp_dmabuf_devmem_init(pool); + if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { + err = -EFAULT; + goto free_ptr_ring; + } + + err = pool->mp_ops->init(pool); if (err) { pr_warn("%s() mem-provider init failed %d\n", __func__, err); @@ -536,12 +544,11 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, if (unlikely(pool->alloc.count > 0)) return pool->alloc.cache[--pool->alloc.count]; - /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */ + /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); - nr_pages = alloc_pages_bulk_array_node(gfp, - pool->p.nid, bulk, - (struct page **)pool->alloc.cache); + nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, + (struct page **)pool->alloc.cache); if (unlikely(!nr_pages)) return 0; @@ -588,8 +595,8 @@ netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) return netmem; /* Slow-path: cache empty, do real allocation */ - if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) - netmem = mp_dmabuf_devmem_alloc_netmems(pool, gfp); + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + netmem = pool->mp_ops->alloc_netmems(pool, gfp); else netmem = __page_pool_alloc_pages_slow(pool, gfp); return netmem; @@ -680,8 +687,8 @@ void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) bool put; put = true; - if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_priv) - put = mp_dmabuf_devmem_release_page(pool, netmem); + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + put = pool->mp_ops->release_netmem(pool, netmem); else __page_pool_release_page_dma(pool, netmem); @@ -1049,8 +1056,8 @@ static void __page_pool_destroy(struct page_pool *pool) page_pool_unlist(pool); page_pool_uninit(pool); - if (pool->mp_priv) { - mp_dmabuf_devmem_destroy(pool); + if (pool->mp_ops) { + pool->mp_ops->destroy(pool); static_branch_dec(&page_pool_mem_providers); } @@ -1105,7 +1112,13 @@ static void page_pool_release_retry(struct work_struct *wq) int inflight; inflight = page_pool_release(pool); - if (!inflight) + /* In rare cases, a driver bug may cause inflight to go negative. + * Don't reschedule release if inflight is 0 or negative. + * - If 0, the page_pool has been destroyed + * - if negative, we will never recover + * in both cases no reschedule is necessary. + */ + if (inflight <= 0) return; /* Periodic warning for page pools the user can't see */ @@ -1141,13 +1154,11 @@ void page_pool_disable_direct_recycling(struct page_pool *pool) if (!pool->p.napi) return; - /* To avoid races with recycling and additional barriers make sure - * pool and NAPI are unlinked when NAPI is disabled. - */ - WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state)); - WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1); + napi_assert_will_not_race(pool->p.napi); + mutex_lock(&page_pools_lock); WRITE_ONCE(pool->p.napi, NULL); + mutex_unlock(&page_pools_lock); } EXPORT_SYMBOL(page_pool_disable_direct_recycling); @@ -1189,3 +1200,31 @@ void page_pool_update_nid(struct page_pool *pool, int new_nid) } } EXPORT_SYMBOL(page_pool_update_nid); + +bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr) +{ + return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr); +} + +/* Associate a niov with a page pool. Should follow with a matching + * net_mp_niov_clear_page_pool() + */ +void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_set_pp_info(pool, netmem); + + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); +} + +/* Disassociate a niov from a page pool. Should only be used in the + * ->release_netmem() path. + */ +void net_mp_niov_clear_page_pool(struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_clear_pp_info(netmem); +} diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h index 57439787b9c2..2fb06d5f6d55 100644 --- a/net/core/page_pool_priv.h +++ b/net/core/page_pool_priv.h @@ -7,6 +7,8 @@ #include "netmem_priv.h" +extern struct mutex page_pools_lock; + s32 page_pool_inflight(const struct page_pool *pool, bool strict); int page_pool_list(struct page_pool *pool); diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c index 48335766c1bf..c82a95beceff 100644 --- a/net/core/page_pool_user.c +++ b/net/core/page_pool_user.c @@ -3,21 +3,23 @@ #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/xarray.h> +#include <net/busy_poll.h> #include <net/net_debug.h> #include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> #include <net/page_pool/types.h> +#include <net/page_pool/memory_provider.h> #include <net/sock.h> -#include "devmem.h" #include "page_pool_priv.h" #include "netdev-genl-gen.h" static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1); -/* Protects: page_pools, netdevice->page_pools, pool->slow.netdev, pool->user. +/* Protects: page_pools, netdevice->page_pools, pool->p.napi, pool->slow.netdev, + * pool->user. * Ordering: inside rtnl_lock */ -static DEFINE_MUTEX(page_pools_lock); +DEFINE_MUTEX(page_pools_lock); /* Page pools are only reachable from user space (via netlink) if they are * linked to a netdev at creation time. Following page pool "visibility" @@ -214,8 +216,8 @@ static int page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info) { - struct net_devmem_dmabuf_binding *binding = pool->mp_priv; size_t inflight, refsz; + unsigned int napi_id; void *hdr; hdr = genlmsg_iput(rsp, info); @@ -229,8 +231,10 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, pool->slow.netdev->ifindex)) goto err_cancel; - if (pool->user.napi_id && - nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, pool->user.napi_id)) + + napi_id = pool->p.napi ? READ_ONCE(pool->p.napi->napi_id) : 0; + if (napi_id_valid(napi_id) && + nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, napi_id)) goto err_cancel; inflight = page_pool_inflight(pool, false); @@ -244,7 +248,7 @@ page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, pool->user.detach_time)) goto err_cancel; - if (binding && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_DMABUF, binding->id)) + if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL)) goto err_cancel; genlmsg_end(rsp, hdr); @@ -319,8 +323,6 @@ int page_pool_list(struct page_pool *pool) if (pool->slow.netdev) { hlist_add_head(&pool->user.list, &pool->slow.netdev->page_pools); - pool->user.napi_id = pool->p.napi ? pool->p.napi->napi_id : 0; - netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF); } @@ -353,7 +355,7 @@ void page_pool_unlist(struct page_pool *pool) int page_pool_check_memory_provider(struct net_device *dev, struct netdev_rx_queue *rxq) { - struct net_devmem_dmabuf_binding *binding = rxq->mp_params.mp_priv; + void *binding = rxq->mp_params.mp_priv; struct page_pool *pool; struct hlist_node *n; diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 82b6a2c3c141..e850598db3e7 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -158,9 +158,7 @@ #include <net/udp.h> #include <net/ip6_checksum.h> #include <net/addrconf.h> -#ifdef CONFIG_XFRM #include <net/xfrm.h> -#endif #include <net/netns/generic.h> #include <asm/byteorder.h> #include <linux/rcupdate.h> @@ -517,21 +515,23 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { char data[128]; + size_t max; struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id); if (!capable(CAP_NET_ADMIN)) return -EPERM; - if (count == 0) + if (count < 1) return -EINVAL; - if (count > sizeof(data)) - count = sizeof(data); - - if (copy_from_user(data, buf, count)) + max = min(count, sizeof(data) - 1); + if (copy_from_user(data, buf, max)) return -EFAULT; - data[count - 1] = 0; /* Strip trailing '\n' and terminate string */ + if (data[max - 1] == '\n') + data[max - 1] = 0; /* strip trailing '\n', terminate string */ + else + data[max] = 0; /* terminate string */ if (!strcmp(data, "stop")) pktgen_stop_all_threads(pn); @@ -744,31 +744,32 @@ static int pktgen_if_show(struct seq_file *seq, void *v) } -static int hex32_arg(const char __user *user_buffer, unsigned long maxlen, - __u32 *num) +static ssize_t hex32_arg(const char __user *user_buffer, size_t maxlen, + __u32 *num) { - int i = 0; + size_t i = 0; + *num = 0; for (; i < maxlen; i++) { int value; char c; - *num <<= 4; if (get_user(c, &user_buffer[i])) return -EFAULT; value = hex_to_bin(c); - if (value >= 0) + if (value >= 0) { + *num <<= 4; *num |= value; - else + } else { break; + } } return i; } -static int count_trail_chars(const char __user * user_buffer, - unsigned int maxlen) +static ssize_t count_trail_chars(const char __user *user_buffer, size_t maxlen) { - int i; + size_t i; for (i = 0; i < maxlen; i++) { char c; @@ -790,10 +791,10 @@ done: return i; } -static long num_arg(const char __user *user_buffer, unsigned long maxlen, - unsigned long *num) +static ssize_t num_arg(const char __user *user_buffer, size_t maxlen, + unsigned long *num) { - int i; + size_t i; *num = 0; for (i = 0; i < maxlen; i++) { @@ -809,9 +810,9 @@ static long num_arg(const char __user *user_buffer, unsigned long maxlen, return i; } -static int strn_len(const char __user * user_buffer, unsigned int maxlen) +static ssize_t strn_len(const char __user *user_buffer, size_t maxlen) { - int i; + size_t i; for (i = 0; i < maxlen; i++) { char c; @@ -823,6 +824,7 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen) case '\r': case '\t': case ' ': + case '=': goto done_str; default: break; @@ -838,11 +840,11 @@ done_str: * "size1,weight_1 size2,weight_2 ... size_n,weight_n" for example. */ static ssize_t get_imix_entries(const char __user *buffer, + size_t maxlen, struct pktgen_dev *pkt_dev) { - const int max_digits = 10; - int i = 0; - long len; + size_t i = 0, max; + ssize_t len; char c; pkt_dev->n_imix_entries = 0; @@ -854,10 +856,13 @@ static ssize_t get_imix_entries(const char __user *buffer, if (pkt_dev->n_imix_entries >= MAX_IMIX_ENTRIES) return -E2BIG; - len = num_arg(&buffer[i], max_digits, &size); + max = min(10, maxlen - i); + len = num_arg(&buffer[i], max, &size); if (len < 0) return len; i += len; + if (i >= maxlen) + return -EINVAL; if (get_user(c, &buffer[i])) return -EFAULT; /* Check for comma between size_i and weight_i */ @@ -868,7 +873,8 @@ static ssize_t get_imix_entries(const char __user *buffer, if (size < 14 + 20 + 8) size = 14 + 20 + 8; - len = num_arg(&buffer[i], max_digits, &weight); + max = min(10, maxlen - i); + len = num_arg(&buffer[i], max, &weight); if (len < 0) return len; if (weight <= 0) @@ -878,39 +884,52 @@ static ssize_t get_imix_entries(const char __user *buffer, pkt_dev->imix_entries[pkt_dev->n_imix_entries].weight = weight; i += len; + pkt_dev->n_imix_entries++; + + if (i >= maxlen) + break; if (get_user(c, &buffer[i])) return -EFAULT; - i++; - pkt_dev->n_imix_entries++; } while (c == ' '); return i; } -static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) +static ssize_t get_labels(const char __user *buffer, + size_t maxlen, struct pktgen_dev *pkt_dev) { unsigned int n = 0; + size_t i = 0, max; + ssize_t len; char c; - ssize_t i = 0; - int len; pkt_dev->nr_labels = 0; do { __u32 tmp; - len = hex32_arg(&buffer[i], 8, &tmp); - if (len <= 0) + + if (n >= MAX_MPLS_LABELS) + return -E2BIG; + + max = min(8, maxlen - i); + len = hex32_arg(&buffer[i], max, &tmp); + if (len < 0) return len; + + /* return empty list in case of invalid input or zero value */ + if (len == 0 || tmp == 0) + return maxlen; + pkt_dev->labels[n] = htonl(tmp); if (pkt_dev->labels[n] & MPLS_STACK_BOTTOM) pkt_dev->flags |= F_MPLS_RND; i += len; + n++; + if (i >= maxlen) + break; if (get_user(c, &buffer[i])) return -EFAULT; i++; - n++; - if (n >= MAX_MPLS_LABELS) - return -E2BIG; } while (c == ','); pkt_dev->nr_labels = n; @@ -952,11 +971,11 @@ static ssize_t pktgen_if_write(struct file *file, { struct seq_file *seq = file->private_data; struct pktgen_dev *pkt_dev = seq->private; - int i, max, len; + size_t i, max; + ssize_t len; char name[16], valstr[32]; unsigned long value = 0; char *pg_result = NULL; - int tmp = 0; char buf[128]; pg_result = &(pkt_dev->result[0]); @@ -967,16 +986,16 @@ static ssize_t pktgen_if_write(struct file *file, } max = count; - tmp = count_trail_chars(user_buffer, max); - if (tmp < 0) { + len = count_trail_chars(user_buffer, max); + if (len < 0) { pr_warn("illegal format\n"); - return tmp; + return len; } - i = tmp; + i = len; /* Read variable name */ - - len = strn_len(&user_buffer[i], sizeof(name) - 1); + max = min(sizeof(name) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1004,11 +1023,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "min_pkt_size")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; if (value != pkt_dev->min_pkt_size) { @@ -1021,11 +1040,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "max_pkt_size")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; if (value != pkt_dev->max_pkt_size) { @@ -1040,11 +1059,11 @@ static ssize_t pktgen_if_write(struct file *file, /* Shortcut for min = max */ if (!strcmp(name, "pkt_size")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value < 14 + 20 + 8) value = 14 + 20 + 8; if (value != pkt_dev->min_pkt_size) { @@ -1060,43 +1079,43 @@ static ssize_t pktgen_if_write(struct file *file, if (pkt_dev->clone_skb > 0) return -EINVAL; - len = get_imix_entries(&user_buffer[i], pkt_dev); + max = count - i; + len = get_imix_entries(&user_buffer[i], max, pkt_dev); if (len < 0) return len; fill_imix_distribution(pkt_dev); - i += len; return count; } if (!strcmp(name, "debug")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; debug = value; sprintf(pg_result, "OK: debug=%u", debug); return count; } if (!strcmp(name, "frags")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->nfrags = value; sprintf(pg_result, "OK: frags=%d", pkt_dev->nfrags); return count; } if (!strcmp(name, "delay")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value == 0x7FFFFFFF) pkt_dev->delay = ULLONG_MAX; else @@ -1107,13 +1126,13 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "rate")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (!value) - return len; + return -EINVAL; pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value; if (debug) pr_info("Delay set at: %llu ns\n", pkt_dev->delay); @@ -1122,13 +1141,13 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "ratep")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (!value) - return len; + return -EINVAL; pkt_dev->delay = NSEC_PER_SEC/value; if (debug) pr_info("Delay set at: %llu ns\n", pkt_dev->delay); @@ -1137,11 +1156,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_src_min")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_src_min) { pkt_dev->udp_src_min = value; pkt_dev->cur_udp_src = value; @@ -1150,11 +1169,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_dst_min")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_dst_min) { pkt_dev->udp_dst_min = value; pkt_dev->cur_udp_dst = value; @@ -1163,11 +1182,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_src_max")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_src_max) { pkt_dev->udp_src_max = value; pkt_dev->cur_udp_src = value; @@ -1176,11 +1195,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "udp_dst_max")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value != pkt_dev->udp_dst_max) { pkt_dev->udp_dst_max = value; pkt_dev->cur_udp_dst = value; @@ -1189,7 +1208,8 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "clone_skb")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; /* clone_skb is not supported for netif_receive xmit_mode and @@ -1198,34 +1218,33 @@ static ssize_t pktgen_if_write(struct file *file, if ((value > 0) && ((pkt_dev->xmit_mode == M_NETIF_RECEIVE) || !(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))) - return -ENOTSUPP; + return -EOPNOTSUPP; if (value > 0 && (pkt_dev->n_imix_entries > 0 || !(pkt_dev->flags & F_SHARED))) return -EINVAL; - i += len; pkt_dev->clone_skb = value; sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb); return count; } if (!strcmp(name, "count")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->count = value; sprintf(pg_result, "OK: count=%llu", (unsigned long long)pkt_dev->count); return count; } if (!strcmp(name, "src_mac_count")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (pkt_dev->src_mac_count != value) { pkt_dev->src_mac_count = value; pkt_dev->cur_src_mac_offset = 0; @@ -1235,11 +1254,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "dst_mac_count")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (pkt_dev->dst_mac_count != value) { pkt_dev->dst_mac_count = value; pkt_dev->cur_dst_mac_offset = 0; @@ -1249,16 +1268,16 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "burst")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value > 1) && ((pkt_dev->xmit_mode == M_QUEUE_XMIT) || ((pkt_dev->xmit_mode == M_START_XMIT) && (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING))))) - return -ENOTSUPP; + return -EOPNOTSUPP; if (value > 1 && !(pkt_dev->flags & F_SHARED)) return -EINVAL; @@ -1268,12 +1287,11 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "node")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; - if (node_possible(value)) { pkt_dev->node = value; sprintf(pg_result, "OK: node=%d", pkt_dev->node); @@ -1289,21 +1307,21 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "xmit_mode")) { char f[32]; - memset(f, 0, 32); - len = strn_len(&user_buffer[i], sizeof(f) - 1); + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; + memset(f, 0, sizeof(f)); if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; - i += len; if (strcmp(f, "start_xmit") == 0) { pkt_dev->xmit_mode = M_START_XMIT; } else if (strcmp(f, "netif_receive") == 0) { /* clone_skb set earlier, not supported in this mode */ if (pkt_dev->clone_skb > 0) - return -ENOTSUPP; + return -EOPNOTSUPP; pkt_dev->xmit_mode = M_NETIF_RECEIVE; @@ -1329,14 +1347,14 @@ static ssize_t pktgen_if_write(struct file *file, char f[32]; char *end; - memset(f, 0, 32); - len = strn_len(&user_buffer[i], sizeof(f) - 1); + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; + memset(f, 0, 32); if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; - i += len; flag = pktgen_read_flag(f, &disable); if (flag) { @@ -1378,7 +1396,8 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1); + max = min(sizeof(pkt_dev->dst_min) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1393,12 +1412,13 @@ static ssize_t pktgen_if_write(struct file *file, } if (debug) pr_debug("dst_min set to: %s\n", pkt_dev->dst_min); - i += len; + sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min); return count; } if (!strcmp(name, "dst_max")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1); + max = min(sizeof(pkt_dev->dst_max) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1413,12 +1433,13 @@ static ssize_t pktgen_if_write(struct file *file, } if (debug) pr_debug("dst_max set to: %s\n", pkt_dev->dst_max); - i += len; + sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max); return count; } if (!strcmp(name, "dst6")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1436,12 +1457,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("dst6 set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: dst6=%s", buf); return count; } if (!strcmp(name, "dst6_min")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1458,12 +1479,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("dst6_min set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: dst6_min=%s", buf); return count; } if (!strcmp(name, "dst6_max")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1479,12 +1500,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("dst6_max set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: dst6_max=%s", buf); return count; } if (!strcmp(name, "src6")) { - len = strn_len(&user_buffer[i], sizeof(buf) - 1); + max = min(sizeof(buf) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1502,12 +1523,12 @@ static ssize_t pktgen_if_write(struct file *file, if (debug) pr_debug("src6 set to: %s\n", buf); - i += len; sprintf(pg_result, "OK: src6=%s", buf); return count; } if (!strcmp(name, "src_min")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1); + max = min(sizeof(pkt_dev->src_min) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1522,12 +1543,13 @@ static ssize_t pktgen_if_write(struct file *file, } if (debug) pr_debug("src_min set to: %s\n", pkt_dev->src_min); - i += len; + sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min); return count; } if (!strcmp(name, "src_max")) { - len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1); + max = min(sizeof(pkt_dev->src_max) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1542,12 +1564,13 @@ static ssize_t pktgen_if_write(struct file *file, } if (debug) pr_debug("src_max set to: %s\n", pkt_dev->src_max); - i += len; + sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max); return count; } if (!strcmp(name, "dst_mac")) { - len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + max = min(sizeof(valstr) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1564,7 +1587,8 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "src_mac")) { - len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + max = min(sizeof(valstr) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1588,11 +1612,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "flows")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value > MAX_CFLOWS) value = MAX_CFLOWS; @@ -1602,44 +1626,44 @@ static ssize_t pktgen_if_write(struct file *file, } #ifdef CONFIG_XFRM if (!strcmp(name, "spi")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->spi = value; sprintf(pg_result, "OK: spi=%u", pkt_dev->spi); return count; } #endif if (!strcmp(name, "flowlen")) { - len = num_arg(&user_buffer[i], 10, &value); + max = min(10, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->lflow = value; sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow); return count; } if (!strcmp(name, "queue_map_min")) { - len = num_arg(&user_buffer[i], 5, &value); + max = min(5, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->queue_map_min = value; sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min); return count; } if (!strcmp(name, "queue_map_max")) { - len = num_arg(&user_buffer[i], 5, &value); + max = min(5, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->queue_map_max = value; sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max); return count; @@ -1648,10 +1672,11 @@ static ssize_t pktgen_if_write(struct file *file, if (!strcmp(name, "mpls")) { unsigned int n, cnt; - len = get_labels(&user_buffer[i], pkt_dev); + max = count - i; + len = get_labels(&user_buffer[i], max, pkt_dev); if (len < 0) return len; - i += len; + cnt = sprintf(pg_result, "OK: mpls="); for (n = 0; n < pkt_dev->nr_labels; n++) cnt += sprintf(pg_result + cnt, @@ -1669,11 +1694,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "vlan_id")) { - len = num_arg(&user_buffer[i], 4, &value); + max = min(4, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if (value <= 4095) { pkt_dev->vlan_id = value; /* turn on VLAN */ @@ -1696,11 +1721,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "vlan_p")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) { pkt_dev->vlan_p = value; sprintf(pg_result, "OK: vlan_p=%u", pkt_dev->vlan_p); @@ -1711,11 +1736,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "vlan_cfi")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) { pkt_dev->vlan_cfi = value; sprintf(pg_result, "OK: vlan_cfi=%u", pkt_dev->vlan_cfi); @@ -1726,11 +1751,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "svlan_id")) { - len = num_arg(&user_buffer[i], 4, &value); + max = min(4, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) { pkt_dev->svlan_id = value; /* turn on SVLAN */ @@ -1753,11 +1778,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "svlan_p")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) { pkt_dev->svlan_p = value; sprintf(pg_result, "OK: svlan_p=%u", pkt_dev->svlan_p); @@ -1768,11 +1793,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "svlan_cfi")) { - len = num_arg(&user_buffer[i], 1, &value); + max = min(1, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) { pkt_dev->svlan_cfi = value; sprintf(pg_result, "OK: svlan_cfi=%u", pkt_dev->svlan_cfi); @@ -1783,12 +1808,13 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "tos")) { - __u32 tmp_value = 0; - len = hex32_arg(&user_buffer[i], 2, &tmp_value); + __u32 tmp_value; + + max = min(2, count - i); + len = hex32_arg(&user_buffer[i], max, &tmp_value); if (len < 0) return len; - i += len; if (len == 2) { pkt_dev->tos = tmp_value; sprintf(pg_result, "OK: tos=0x%02x", pkt_dev->tos); @@ -1799,12 +1825,13 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "traffic_class")) { - __u32 tmp_value = 0; - len = hex32_arg(&user_buffer[i], 2, &tmp_value); + __u32 tmp_value; + + max = min(2, count - i); + len = hex32_arg(&user_buffer[i], max, &tmp_value); if (len < 0) return len; - i += len; if (len == 2) { pkt_dev->traffic_class = tmp_value; sprintf(pg_result, "OK: traffic_class=0x%02x", pkt_dev->traffic_class); @@ -1815,11 +1842,11 @@ static ssize_t pktgen_if_write(struct file *file, } if (!strcmp(name, "skb_priority")) { - len = num_arg(&user_buffer[i], 9, &value); + max = min(9, count - i); + len = num_arg(&user_buffer[i], max, &value); if (len < 0) return len; - i += len; pkt_dev->skb_priority = value; sprintf(pg_result, "OK: skb_priority=%i", pkt_dev->skb_priority); @@ -1879,7 +1906,8 @@ static ssize_t pktgen_thread_write(struct file *file, { struct seq_file *seq = file->private_data; struct pktgen_thread *t = seq->private; - int i, max, len, ret; + size_t i, max; + ssize_t len, ret; char name[40]; char *pg_result; @@ -1896,8 +1924,8 @@ static ssize_t pktgen_thread_write(struct file *file, i = len; /* Read variable name */ - - len = strn_len(&user_buffer[i], sizeof(name) - 1); + max = min(sizeof(name) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1927,14 +1955,15 @@ static ssize_t pktgen_thread_write(struct file *file, if (!strcmp(name, "add_device")) { char f[32]; memset(f, 0, 32); - len = strn_len(&user_buffer[i], sizeof(f) - 1); + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) { ret = len; goto out; } if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; - i += len; + mutex_lock(&pktgen_thread_lock); ret = pktgen_add_device(t, f); mutex_unlock(&pktgen_thread_lock); @@ -2358,13 +2387,13 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) } -#ifdef CONFIG_XFRM /* If there was already an IPSEC SA, we keep it as is, else * we go look for it ... */ #define DUMMY_MARK 0 static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) { +#ifdef CONFIG_XFRM struct xfrm_state *x = pkt_dev->flows[flow].x; struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id); if (!x) { @@ -2390,11 +2419,10 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow) } } -} #endif +} static void set_cur_queue_map(struct pktgen_dev *pkt_dev) { - if (pkt_dev->flags & F_QUEUE_MAP_CPU) pkt_dev->cur_queue_map = smp_processor_id(); @@ -2569,10 +2597,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->flows[flow].flags |= F_INIT; pkt_dev->flows[flow].cur_daddr = pkt_dev->cur_daddr; -#ifdef CONFIG_XFRM if (pkt_dev->flags & F_IPSEC) get_ipsec_sa(pkt_dev, flow); -#endif pkt_dev->nflows++; } } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1f4d4b5570ab..b4612d305970 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -80,6 +80,11 @@ void rtnl_lock(void) } EXPORT_SYMBOL(rtnl_lock); +int rtnl_lock_interruptible(void) +{ + return mutex_lock_interruptible(&rtnl_mutex); +} + int rtnl_lock_killable(void) { return mutex_lock_killable(&rtnl_mutex); @@ -1287,6 +1292,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + + nla_total_size(1) /* IFLA_NETNS_IMMUTABLE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ + nla_total_size(4) /* IFLA_LINK_NETNSID */ + nla_total_size(4) /* IFLA_GROUP */ @@ -2041,6 +2047,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN) || nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) || + nla_put_u8(skb, IFLA_NETNS_IMMUTABLE, dev->netns_immutable) || nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) || nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) || nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) || @@ -2229,6 +2236,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_ALLMULTI] = { .type = NLA_REJECT }, [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, + [IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -3020,7 +3028,7 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, new_ifindex = nla_get_s32_default(tb[IFLA_NEW_IFINDEX], 0); - err = __dev_change_net_namespace(dev, tgt_net, pat, new_ifindex); + err = __dev_change_net_namespace(dev, tgt_net, pat, new_ifindex, extack); if (err) goto errout; @@ -3432,6 +3440,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, err = -ENODEV; rtnl_nets_unlock(&rtnl_nets); + rtnl_nets_destroy(&rtnl_nets); errout: return err; } @@ -3761,7 +3770,13 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, struct netlink_ext_ack *extack) { unsigned char name_assign_type = NET_NAME_USER; - struct net *net = sock_net(skb->sk); + struct rtnl_newlink_params params = { + .src_net = sock_net(skb->sk), + .link_net = link_net, + .peer_net = peer_net, + .tb = tb, + .data = data, + }; u32 portid = NETLINK_CB(skb).portid; struct net_device *dev; char ifname[IFNAMSIZ]; @@ -3777,8 +3792,8 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, name_assign_type = NET_NAME_ENUM; } - dev = rtnl_create_link(link_net ? : tgt_net, ifname, - name_assign_type, ops, tb, extack); + dev = rtnl_create_link(tgt_net, ifname, name_assign_type, ops, tb, + extack); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out; @@ -3786,13 +3801,8 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, dev->ifindex = ifm->ifi_index; - if (link_net) - net = link_net; - if (peer_net) - net = peer_net; - if (ops->newlink) - err = ops->newlink(net, dev, tb, data, extack); + err = ops->newlink(dev, ¶ms, extack); else err = register_netdevice(dev); if (err < 0) { @@ -3803,11 +3813,6 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, err = rtnl_configure_link(dev, ifm, portid, nlh); if (err < 0) goto out_unregister; - if (link_net) { - err = dev_change_net_namespace(dev, tgt_net, ifname); - if (err < 0) - goto out_unregister; - } if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) @@ -3861,20 +3866,26 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, { struct nlattr ** const tb = tbs->tb; struct net *net = sock_net(skb->sk); + struct net *device_net; struct net_device *dev; struct ifinfomsg *ifm; bool link_specified; + /* When creating, lookup for existing device in target net namespace */ + device_net = (nlh->nlmsg_flags & NLM_F_CREATE) && + (nlh->nlmsg_flags & NLM_F_EXCL) ? + tgt_net : net; + ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) { link_specified = true; - dev = __dev_get_by_index(net, ifm->ifi_index); + dev = __dev_get_by_index(device_net, ifm->ifi_index); } else if (ifm->ifi_index < 0) { NL_SET_ERR_MSG(extack, "ifindex can't be negative"); return -EINVAL; } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) { link_specified = true; - dev = rtnl_dev_get(net, tb); + dev = rtnl_dev_get(device_net, tb); } else { link_specified = false; dev = NULL; diff --git a/net/core/scm.c b/net/core/scm.c index 4f6a14babe5a..733c0cbd393d 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -282,6 +282,16 @@ efault: } EXPORT_SYMBOL(put_cmsg); +int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len, + void *data) +{ + /* Don't produce truncated CMSGs */ + if (!msg->msg_control || msg->msg_controllen < CMSG_LEN(len)) + return -ETOOSMALL; + + return put_cmsg(msg, level, type, len, data); +} + void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) { struct scm_timestamping64 tss; diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index b0ff6153be62..568779d5a0ef 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -71,7 +71,7 @@ u32 secure_tcpv6_ts_off(const struct net *net, return siphash(&combined, offsetofend(typeof(combined), daddr), &ts_secret); } -EXPORT_SYMBOL(secure_tcpv6_ts_off); +EXPORT_IPV6_MOD(secure_tcpv6_ts_off); u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a441613a1e6c..ab8acb737b93 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -69,6 +69,7 @@ #include <net/dst.h> #include <net/sock.h> #include <net/checksum.h> +#include <net/gro.h> #include <net/gso.h> #include <net/hotdata.h> #include <net/ip6_checksum.h> @@ -95,7 +96,9 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init; #endif -#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER) +#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN) +#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \ + GRO_MAX_HEAD_PAD)) /* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two. * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique @@ -220,67 +223,9 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) #define NAPI_SKB_CACHE_BULK 16 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) -#if PAGE_SIZE == SZ_4K - -#define NAPI_HAS_SMALL_PAGE_FRAG 1 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) - -/* specialized page frag allocator using a single order 0 page - * and slicing it into 1K sized fragment. Constrained to systems - * with a very limited amount of 1K fragments fitting a single - * page - to avoid excessive truesize underestimation - */ - -struct page_frag_1k { - void *va; - u16 offset; - bool pfmemalloc; -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) -{ - struct page *page; - int offset; - - offset = nc->offset - SZ_1K; - if (likely(offset >= 0)) - goto use_frag; - - page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); - if (!page) - return NULL; - - nc->va = page_address(page); - nc->pfmemalloc = page_is_pfmemalloc(page); - offset = PAGE_SIZE - SZ_1K; - page_ref_add(page, offset / SZ_1K); - -use_frag: - nc->offset = offset; - return nc->va + offset; -} -#else - -/* the small page is actually unused in this build; add dummy helpers - * to please the compiler and avoid later preprocessor's conditionals - */ -#define NAPI_HAS_SMALL_PAGE_FRAG 0 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false - -struct page_frag_1k { -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) -{ - return NULL; -} - -#endif - struct napi_alloc_cache { local_lock_t bh_lock; struct page_frag_cache page; - struct page_frag_1k page_small; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; @@ -290,23 +235,6 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; -/* Double check that napi_get_frags() allocates skbs with - * skb->head being backed by slab, not a page fragment. - * This is to make sure bug fixed in 3226b158e67c - * ("net: avoid 32 x truesize under-estimation for tiny skbs") - * does not accidentally come back. - */ -void napi_get_frags_check(struct napi_struct *napi) -{ - struct sk_buff *skb; - - local_bh_disable(); - skb = napi_get_frags(napi); - WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); - napi_free_frags(napi); - local_bh_enable(); -} - void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); @@ -367,6 +295,68 @@ static struct sk_buff *napi_skb_cache_get(void) return skb; } +/** + * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache + * @skbs: pointer to an at least @n-sized array to fill with skb pointers + * @n: number of entries to provide + * + * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes + * the pointers into the provided array @skbs. If there are less entries + * available, tries to replenish the cache and bulk-allocates the diff from + * the MM layer if needed. + * The heads are being zeroed with either memset() or %__GFP_ZERO, so they are + * ready for {,__}build_skb_around() and don't have any data buffers attached. + * Must be called *only* from the BH context. + * + * Return: number of successfully allocated skbs (@n if no actual allocation + * needed or kmem_cache_alloc_bulk() didn't fail). + */ +u32 napi_skb_cache_get_bulk(void **skbs, u32 n) +{ + struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); + u32 bulk, total = n; + + local_lock_nested_bh(&napi_alloc_cache.bh_lock); + + if (nc->skb_count >= n) + goto get; + + /* No enough cached skbs. Try refilling the cache first */ + bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK); + nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_NOWARN, bulk, + &nc->skb_cache[nc->skb_count]); + if (likely(nc->skb_count >= n)) + goto get; + + /* Still not enough. Bulk-allocate the missing part directly, zeroed */ + n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN, + n - nc->skb_count, &skbs[nc->skb_count]); + if (likely(nc->skb_count >= n)) + goto get; + + /* kmem_cache didn't allocate the number we need, limit the output */ + total -= n - nc->skb_count; + n = nc->skb_count; + +get: + for (u32 base = nc->skb_count - n, i = 0; i < n; i++) { + u32 cache_size = kmem_cache_size(net_hotdata.skbuff_cache); + + skbs[i] = nc->skb_cache[base + i]; + + kasan_mempool_unpoison_object(skbs[i], cache_size); + memset(skbs[i], 0, offsetof(struct sk_buff, tail)); + } + + nc->skb_count -= n; + local_unlock_nested_bh(&napi_alloc_cache.bh_lock); + + return total; +} +EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk); + static inline void __finalize_skb_around(struct sk_buff *skb, void *data, unsigned int size) { @@ -736,7 +726,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. */ - if (len <= SKB_WITH_OVERHEAD(1024) || + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); @@ -813,10 +803,8 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. - * When the small frag allocator is available, prefer it over kmalloc - * for small fragments */ - if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, @@ -826,32 +814,16 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) goto skb_success; } + len = SKB_HEAD_ALIGN(len); + if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache); - if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { - /* we are artificially inflating the allocation size, but - * that is not as bad as it may look like, as: - * - 'len' less than GRO_MAX_HEAD makes little sense - * - On most systems, larger 'len' values lead to fragment - * size above 512 bytes - * - kmalloc would use the kmalloc-1k slab for such values - * - Builds with smaller GRO_MAX_HEAD will very likely do - * little networking, as that implies no WiFi and no - * tunnels support, and 32 bits arches. - */ - len = SZ_1K; - - data = page_frag_alloc_1k(&nc->page_small, gfp_mask); - pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); - } else { - len = SKB_HEAD_ALIGN(len); - data = page_frag_alloc(&nc->page, len, gfp_mask); - pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); - } + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!data)) @@ -5539,6 +5511,52 @@ err: } EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); +static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps, + int tstype) +{ + switch (tstype) { + case SCM_TSTAMP_SCHED: + return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP; + case SCM_TSTAMP_SND: + return skb_shinfo(skb)->tx_flags & (hwtstamps ? SKBTX_HW_TSTAMP_NOBPF : + SKBTX_SW_TSTAMP); + case SCM_TSTAMP_ACK: + return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK; + } + + return false; +} + +static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb, + struct skb_shared_hwtstamps *hwtstamps, + struct sock *sk, + int tstype) +{ + int op; + + switch (tstype) { + case SCM_TSTAMP_SCHED: + op = BPF_SOCK_OPS_TSTAMP_SCHED_CB; + break; + case SCM_TSTAMP_SND: + if (hwtstamps) { + op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB; + *skb_hwtstamps(skb) = *hwtstamps; + } else { + op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB; + } + break; + case SCM_TSTAMP_ACK: + op = BPF_SOCK_OPS_TSTAMP_ACK_CB; + break; + default: + return; + } + + bpf_skops_tx_timestamping(sk, skb, op); +} + void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb, struct skb_shared_hwtstamps *hwtstamps, @@ -5551,6 +5569,13 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, if (!sk) return; + if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF) + skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps, + sk, tstype); + + if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype)) + return; + tsflags = READ_ONCE(sk->sk_tsflags); if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) @@ -6123,11 +6148,11 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->offload_fwd_mark = 0; skb->offload_l3_fwd_mark = 0; #endif + ipvs_reset(skb); if (!xnet) return; - ipvs_reset(skb); skb->mark = 0; skb_clear_tstamp(skb); } diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 61f3f3d4e528..0ddc4c718833 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -549,6 +549,9 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, return num_sge; } +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) + psock->ingress_bytes += len; +#endif copied = len; msg->sg.start = 0; msg->sg.size = copied; @@ -1144,6 +1147,10 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) if (!ret) sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED); + if (sk_is_tcp(sk)) { + psock->strp.cb.read_sock = tcp_bpf_strp_read_sock; + psock->copied_seq = tcp_sk(sk)->copied_seq; + } return ret; } diff --git a/net/core/sock.c b/net/core/sock.c index eae2ae70a2e0..a0598518ce89 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -938,6 +938,7 @@ int sock_set_timestamping(struct sock *sk, int optname, WRITE_ONCE(sk->sk_tsflags, val); sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); + sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY)); if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, @@ -948,6 +949,20 @@ int sock_set_timestamping(struct sock *sk, int optname, return 0; } +#if defined(CONFIG_CGROUP_BPF) +void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) +{ + struct bpf_sock_ops_kern sock_ops; + + memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); + sock_ops.op = op; + sock_ops.is_fullsock = 1; + sock_ops.sk = sk; + bpf_skops_init_skb(&sock_ops, skb, 0); + __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS); +} +#endif + void sock_set_keepalive(struct sock *sk) { lock_sock(sk); @@ -2041,7 +2056,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, v.val = READ_ONCE(sk->sk_napi_id); /* aggregate non-NAPI IDs down to 0 */ - if (v.val < MIN_NAPI_ID) + if (!napi_id_valid(v.val)) v.val = 0; break; @@ -2246,6 +2261,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, get_net_track(net, &sk->ns_tracker, priority); sock_inuse_add(net, 1); } else { + net_passive_inc(net); __netns_tracker_alloc(net, &sk->ns_tracker, false, priority); } @@ -2270,6 +2286,7 @@ EXPORT_SYMBOL(sk_alloc); static void __sk_destruct(struct rcu_head *head) { struct sock *sk = container_of(head, struct sock, sk_rcu); + struct net *net = sock_net(sk); struct sk_filter *filter; if (sk->sk_destruct) @@ -2301,14 +2318,28 @@ static void __sk_destruct(struct rcu_head *head) put_cred(sk->sk_peer_cred); put_pid(sk->sk_peer_pid); - if (likely(sk->sk_net_refcnt)) - put_net_track(sock_net(sk), &sk->ns_tracker); - else - __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); - + if (likely(sk->sk_net_refcnt)) { + put_net_track(net, &sk->ns_tracker); + } else { + __netns_tracker_free(net, &sk->ns_tracker, false); + net_passive_dec(net); + } sk_prot_free(sk->sk_prot_creator, sk); } +void sk_net_refcnt_upgrade(struct sock *sk) +{ + struct net *net = sock_net(sk); + + WARN_ON_ONCE(sk->sk_net_refcnt); + __netns_tracker_free(net, &sk->ns_tracker, false); + net_passive_dec(net); + sk->sk_net_refcnt = 1; + get_net_track(net, &sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); +} +EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade); + void sk_destruct(struct sock *sk) { bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); @@ -2405,6 +2436,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) * is not properly dismantling its kernel sockets at netns * destroy time. */ + net_passive_inc(sock_net(newsk)); __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, false, priority); } @@ -2804,6 +2836,22 @@ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) } EXPORT_SYMBOL(sock_kmalloc); +/* + * Duplicate the input "src" memory block using the socket's + * option memory buffer. + */ +void *sock_kmemdup(struct sock *sk, const void *src, + int size, gfp_t priority) +{ + void *mem; + + mem = sock_kmalloc(sk, size, priority); + if (mem) + memcpy(mem, src, size); + return mem; +} +EXPORT_SYMBOL(sock_kmemdup); + /* Free an option memory block. Note, we actually want the inline * here as this allows gcc to detect the nullify and fold away the * condition entirely. @@ -3881,7 +3929,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); - mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk); + mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc); mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index f1b9b3958792..82a14f131d00 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -303,7 +303,10 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) write_lock_bh(&sk->sk_callback_lock); if (stream_parser && stream_verdict && !psock->saved_data_ready) { - ret = sk_psock_init_strp(sk, psock); + if (sk_is_tcp(sk)) + ret = sk_psock_init_strp(sk, psock); + else + ret = -EOPNOTSUPP; if (ret) { write_unlock_bh(&sk->sk_callback_lock); sk_psock_put(sk, psock); @@ -541,6 +544,9 @@ static bool sock_map_sk_state_allowed(const struct sock *sk) return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN); if (sk_is_stream_unix(sk)) return (1 << sk->sk_state) & TCPF_ESTABLISHED; + if (sk_is_vsock(sk) && + (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) + return (1 << sk->sk_state) & TCPF_ESTABLISHED; return true; } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index cb8d32e5c14e..c7769ee0d9c5 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -34,6 +34,7 @@ static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE; +static int netdev_budget_usecs_min = 2 * USEC_PER_SEC / HZ; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -319,7 +320,7 @@ static int proc_do_dev_weight(const struct ctl_table *table, int write, int ret, weight; mutex_lock(&dev_weight_mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { weight = READ_ONCE(weight_p); WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias); @@ -412,6 +413,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_rx_bias", @@ -419,6 +421,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "dev_weight_tx_bias", @@ -426,6 +429,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, + .extra1 = SYSCTL_ONE, }, { .procname = "netdev_max_backlog", @@ -584,7 +588,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .extra1 = &netdev_budget_usecs_min, }, { .procname = "fb_tunnels_only_for_init_net", diff --git a/net/core/xdp.c b/net/core/xdp.c index 2c6ab6fb452f..f86eedad586a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -618,16 +618,6 @@ void xdp_warn(const char *msg, const char *func, const int line) }; EXPORT_SYMBOL_GPL(xdp_warn); -int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp) -{ - n_skb = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, n_skb, skbs); - if (unlikely(!n_skb)) - return -ENOMEM; - - return 0; -} -EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk); - /** * xdp_build_skb_from_buff - create an skb from &xdp_buff * @xdp: &xdp_buff to convert to an skb diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index be515ba821e2..bfa529a54aca 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -426,9 +426,6 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, newinet = inet_sk(newsk); ireq = inet_rsk(req); - sk_daddr_set(newsk, ireq->ir_rmt_addr); - sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); - newinet->inet_saddr = ireq->ir_loc_addr; RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt)); newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index d6649246188d..39ae9d89d7d4 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -365,6 +365,9 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + ireq->ir_rmt_addr = LOOPBACK4_IPV6; + ireq->ir_loc_addr = LOOPBACK4_IPV6; + ireq->ireq_family = AF_INET6; ireq->ir_mark = inet_request_mark(sk, skb); @@ -504,10 +507,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; newnp->saddr = ireq->ir_v6_loc_addr; - newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; - newsk->sk_bound_dev_if = ireq->ir_iif; /* Now IPv6 options... @@ -546,9 +546,6 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, dccp_sync_mss(newsk, dst_mtu(dst)); - newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; - newinet->inet_rcv_saddr = LOOPBACK4_IPV6; - if (__inet_inherit_port(sk, newsk) < 0) { inet_csk_prepare_forced_close(newsk); dccp_done(newsk); diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index 3fc474d6e57d..b15845fd6300 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -11,10 +11,6 @@ #include "dccp.h" #include "feat.h" -#ifndef CONFIG_SYSCTL -#error This file should not be compiled without CONFIG_SYSCTL defined -#endif - /* Boundary values */ static int u8_max = 0xFF; static unsigned long seqw_min = DCCPF_SEQ_WMIN, diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 5a7c0e565a89..e827775baf2e 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -1367,7 +1367,7 @@ static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn) return dsa_switch_parse_ports_of(ds, dn); } -static int dev_is_class(struct device *dev, void *class) +static int dev_is_class(struct device *dev, const void *class) { if (dev->class != NULL && !strcmp(dev->class->name, class)) return 1; diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 281bbac5539d..c33d4bf17929 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -66,7 +66,7 @@ static int ksz_connect(struct dsa_switch *ds) if (!priv) return -ENOMEM; - xmit_worker = kthread_create_worker(0, "dsa%d:%d_xmit", + xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit", ds->dst->index, ds->index); if (IS_ERR(xmit_worker)) { ret = PTR_ERR(xmit_worker); diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 11ea8cfd6266..3929584791e4 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -110,7 +110,7 @@ static int ocelot_connect(struct dsa_switch *ds) if (!priv) return -ENOMEM; - priv->xmit_worker = kthread_create_worker(0, "felix_xmit"); + priv->xmit_worker = kthread_run_worker(0, "felix_xmit"); if (IS_ERR(priv->xmit_worker)) { err = PTR_ERR(priv->xmit_worker); kfree(priv); diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 3e902af7eea6..02adec693811 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -707,7 +707,7 @@ static int sja1105_connect(struct dsa_switch *ds) spin_lock_init(&priv->meta_lock); - xmit_worker = kthread_create_worker(0, "dsa%d:%d_xmit", + xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit", ds->dst->index, ds->index); if (IS_ERR(xmit_worker)) { err = PTR_ERR(xmit_worker); diff --git a/net/dsa/user.c b/net/dsa/user.c index 291ab1b4acc4..804dc7dac4f2 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -897,7 +897,7 @@ static void dsa_skb_tx_timestamp(struct dsa_user_priv *p, { struct dsa_switch *ds = p->dp->ds; - if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) + if (!(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NOBPF)) return; if (!ds->ops->port_txtstamp) @@ -1243,16 +1243,25 @@ static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e) if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index)) return -EOPNOTSUPP; - /* Port's PHY and MAC both need to be EEE capable */ - if (!dev->phydev) - return -ENODEV; + /* If the port is using phylink managed EEE, then an unimplemented + * set_mac_eee() is permissible. + */ + if (!phylink_mac_implements_lpi(ds->phylink_mac_ops)) { + /* Port's PHY and MAC both need to be EEE capable */ + if (!dev->phydev) + return -ENODEV; - if (!ds->ops->set_mac_eee) - return -EOPNOTSUPP; + if (!ds->ops->set_mac_eee) + return -EOPNOTSUPP; - ret = ds->ops->set_mac_eee(ds, dp->index, e); - if (ret) - return ret; + ret = ds->ops->set_mac_eee(ds, dp->index, e); + if (ret) + return ret; + } else if (ds->ops->set_mac_eee) { + ret = ds->ops->set_mac_eee(ds, dp->index, e); + if (ret) + return ret; + } return phylink_ethtool_set_eee(dp->pl, e); } diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 2bd77c94f9f1..ac8b6107863e 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -6,6 +6,7 @@ #include <linux/rtnetlink.h> #include <linux/ptp_clock_kernel.h> #include <linux/phy_link_topology.h> +#include <net/netdev_queues.h> #include "netlink.h" #include "common.h" @@ -213,6 +214,24 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(10, T1S, Half), __DEFINE_LINK_MODE_NAME(10, T1S_P2MP, Half), __DEFINE_LINK_MODE_NAME(10, T1BRR, Full), + __DEFINE_LINK_MODE_NAME(200000, CR, Full), + __DEFINE_LINK_MODE_NAME(200000, KR, Full), + __DEFINE_LINK_MODE_NAME(200000, DR, Full), + __DEFINE_LINK_MODE_NAME(200000, DR_2, Full), + __DEFINE_LINK_MODE_NAME(200000, SR, Full), + __DEFINE_LINK_MODE_NAME(200000, VR, Full), + __DEFINE_LINK_MODE_NAME(400000, CR2, Full), + __DEFINE_LINK_MODE_NAME(400000, KR2, Full), + __DEFINE_LINK_MODE_NAME(400000, DR2, Full), + __DEFINE_LINK_MODE_NAME(400000, DR2_2, Full), + __DEFINE_LINK_MODE_NAME(400000, SR2, Full), + __DEFINE_LINK_MODE_NAME(400000, VR2, Full), + __DEFINE_LINK_MODE_NAME(800000, CR4, Full), + __DEFINE_LINK_MODE_NAME(800000, KR4, Full), + __DEFINE_LINK_MODE_NAME(800000, DR4, Full), + __DEFINE_LINK_MODE_NAME(800000, DR4_2, Full), + __DEFINE_LINK_MODE_NAME(800000, SR4, Full), + __DEFINE_LINK_MODE_NAME(800000, VR4, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -221,8 +240,11 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); #define __LINK_MODE_LANES_CR4 4 #define __LINK_MODE_LANES_CR8 8 #define __LINK_MODE_LANES_DR 1 +#define __LINK_MODE_LANES_DR_2 1 #define __LINK_MODE_LANES_DR2 2 +#define __LINK_MODE_LANES_DR2_2 2 #define __LINK_MODE_LANES_DR4 4 +#define __LINK_MODE_LANES_DR4_2 4 #define __LINK_MODE_LANES_DR8 8 #define __LINK_MODE_LANES_KR 1 #define __LINK_MODE_LANES_KR2 2 @@ -251,6 +273,9 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); #define __LINK_MODE_LANES_T1L 1 #define __LINK_MODE_LANES_T1S 1 #define __LINK_MODE_LANES_T1S_P2MP 1 +#define __LINK_MODE_LANES_VR 1 +#define __LINK_MODE_LANES_VR2 2 +#define __LINK_MODE_LANES_VR4 4 #define __LINK_MODE_LANES_VR8 8 #define __LINK_MODE_LANES_DR8_2 8 #define __LINK_MODE_LANES_T1BRR 1 @@ -378,6 +403,24 @@ const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(10, T1S, Half), __DEFINE_LINK_MODE_PARAMS(10, T1S_P2MP, Half), __DEFINE_LINK_MODE_PARAMS(10, T1BRR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, CR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, DR_2, Full), + __DEFINE_LINK_MODE_PARAMS(200000, SR, Full), + __DEFINE_LINK_MODE_PARAMS(200000, VR, Full), + __DEFINE_LINK_MODE_PARAMS(400000, CR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, KR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, DR2_2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, SR2, Full), + __DEFINE_LINK_MODE_PARAMS(400000, VR2, Full), + __DEFINE_LINK_MODE_PARAMS(800000, CR4, Full), + __DEFINE_LINK_MODE_PARAMS(800000, KR4, Full), + __DEFINE_LINK_MODE_PARAMS(800000, DR4, Full), + __DEFINE_LINK_MODE_PARAMS(800000, DR4_2, Full), + __DEFINE_LINK_MODE_PARAMS(800000, SR4, Full), + __DEFINE_LINK_MODE_PARAMS(800000, VR4, Full), }; static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -462,6 +505,11 @@ const char ts_rx_filter_names[][ETH_GSTRING_LEN] = { }; static_assert(ARRAY_SIZE(ts_rx_filter_names) == __HWTSTAMP_FILTER_CNT); +const char ts_flags_names[][ETH_GSTRING_LEN] = { + [const_ilog2(HWTSTAMP_FLAG_BONDED_PHC_INDEX)] = "bonded-phc-index", +}; +static_assert(ARRAY_SIZE(ts_flags_names) == __HWTSTAMP_FLAG_CNT); + const char udp_tunnel_type_names[][ETH_GSTRING_LEN] = { [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN] = "vxlan", [ETHTOOL_UDP_TUNNEL_TYPE_GENEVE] = "geneve", @@ -766,6 +814,21 @@ int ethtool_check_ops(const struct ethtool_ops *ops) return 0; } +void ethtool_ringparam_get_cfg(struct net_device *dev, + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kparam, + struct netlink_ext_ack *extack) +{ + memset(param, 0, sizeof(*param)); + memset(kparam, 0, sizeof(*kparam)); + + param->cmd = ETHTOOL_GRINGPARAM; + dev->ethtool_ops->get_ringparam(dev, param, kparam, extack); + + /* Driver gives us current state, we want to return current config */ + kparam->tcp_data_split = dev->cfg->hds_config; +} + static void ethtool_init_tsinfo(struct kernel_ethtool_ts_info *info) { memset(info, 0, sizeof(*info)); diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 850eadde4bfc..a1088c2441d0 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -13,6 +13,7 @@ ETHTOOL_LINK_MODE_ ## speed ## base ## type ## _ ## duplex ## _BIT #define __SOF_TIMESTAMPING_CNT (const_ilog2(SOF_TIMESTAMPING_LAST) + 1) +#define __HWTSTAMP_FLAG_CNT (const_ilog2(HWTSTAMP_FLAG_LAST) + 1) struct link_mode_info { int speed; @@ -38,6 +39,7 @@ extern const char wol_mode_names[][ETH_GSTRING_LEN]; extern const char sof_timestamping_names[][ETH_GSTRING_LEN]; extern const char ts_tx_type_names[][ETH_GSTRING_LEN]; extern const char ts_rx_filter_names[][ETH_GSTRING_LEN]; +extern const char ts_flags_names[][ETH_GSTRING_LEN]; extern const char udp_tunnel_type_names[][ETH_GSTRING_LEN]; int __ethtool_get_link(struct net_device *dev); @@ -49,6 +51,12 @@ int ethtool_check_max_channel(struct net_device *dev, struct ethtool_channels channels, struct genl_info *info); int ethtool_check_rss_ctx_busy(struct net_device *dev, u32 rss_context); + +void ethtool_ringparam_get_cfg(struct net_device *dev, + struct ethtool_ringparam *param, + struct kernel_ethtool_ringparam *kparam, + struct netlink_ext_ack *extack); + int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info); int ethtool_get_ts_info_by_phc(struct net_device *dev, struct kernel_ethtool_ts_info *info, diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 7bb94875a7ec..e95b41f40cac 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -992,24 +992,30 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, if (rc) return rc; - /* Nonzero ring with RSS only makes sense if NIC adds them together */ - if (cmd == ETHTOOL_SRXCLSRLINS && info.flow_type & FLOW_RSS && - !ops->cap_rss_rxnfc_adds && - ethtool_get_flow_spec_ring(info.fs.ring_cookie)) - return -EINVAL; + if (cmd == ETHTOOL_SRXCLSRLINS && info.fs.flow_type & FLOW_RSS) { + /* Nonzero ring with RSS only makes sense + * if NIC adds them together + */ + if (!ops->cap_rss_rxnfc_adds && + ethtool_get_flow_spec_ring(info.fs.ring_cookie)) + return -EINVAL; + + if (!xa_load(&dev->ethtool->rss_ctx, info.rss_context)) + return -EINVAL; + } - if (ops->get_rxfh) { + if (cmd == ETHTOOL_SRXFH && ops->get_rxfh) { struct ethtool_rxfh_param rxfh = {}; rc = ops->get_rxfh(dev, &rxfh); if (rc) return rc; - /* Sanity check: if symmetric-xor is set, then: + /* Sanity check: if symmetric-xor/symmetric-or-xor is set, then: * 1 - no other fields besides IP src/dst and/or L4 src/dst * 2 - If src is set, dst must also be set */ - if ((rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && + if ((rxfh.input_xfrm & (RXH_XFRM_SYM_XOR | RXH_XFRM_SYM_OR_XOR)) && ((info.data & ~(RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3)) || (!!(info.data & RXH_IP_SRC) ^ !!(info.data & RXH_IP_DST)) || @@ -1382,11 +1388,11 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, return -EOPNOTSUPP; /* Check input data transformation capabilities */ if (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_SYM_XOR && + rxfh.input_xfrm != RXH_XFRM_SYM_OR_XOR && rxfh.input_xfrm != RXH_XFRM_NO_CHANGE) return -EINVAL; if (rxfh.input_xfrm != RXH_XFRM_NO_CHANGE && - (rxfh.input_xfrm & RXH_XFRM_SYM_XOR) && - !ops->cap_rss_sym_xor_supported) + rxfh.input_xfrm & ~ops->supported_input_xfrm) return -EOPNOTSUPP; create = rxfh.rss_context == ETH_RXFH_CONTEXT_ALLOC; @@ -2059,8 +2065,8 @@ static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) { - struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; struct kernel_ethtool_ringparam kernel_ringparam; + struct ethtool_ringparam ringparam, max; int ret; if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam) @@ -2069,7 +2075,7 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) return -EFAULT; - dev->ethtool_ops->get_ringparam(dev, &max, &kernel_ringparam, NULL); + ethtool_ringparam_get_cfg(dev, &max, &kernel_ringparam, NULL); /* ensure new ring parameters are within the maximums */ if (ringparam.rx_pending > max.rx_max_pending || diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 849c98e637c6..b4c45207fa32 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <net/netdev_queues.h> #include <net/sock.h> #include <linux/ethtool_netlink.h> #include <linux/phy_link_topology.h> @@ -90,7 +91,7 @@ int ethnl_ops_begin(struct net_device *dev) pm_runtime_get_sync(dev->dev.parent); if (!netif_device_present(dev) || - dev->reg_state == NETREG_UNREGISTERING) { + dev->reg_state >= NETREG_UNREGISTERING) { ret = -ENODEV; goto err; } @@ -667,6 +668,7 @@ static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info) const struct ethnl_request_ops *ops; struct ethnl_req_info req_info = {}; const u8 cmd = info->genlhdr->cmd; + struct net_device *dev; int ret; ops = ethnl_default_requests[cmd]; @@ -688,20 +690,36 @@ static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info) goto out_dev; } + dev = req_info.dev; + rtnl_lock(); - ret = ethnl_ops_begin(req_info.dev); + dev->cfg_pending = kmemdup(dev->cfg, sizeof(*dev->cfg), + GFP_KERNEL_ACCOUNT); + if (!dev->cfg_pending) { + ret = -ENOMEM; + goto out_tie_cfg; + } + + ret = ethnl_ops_begin(dev); if (ret < 0) - goto out_rtnl; + goto out_free_cfg; ret = ops->set(&req_info, info); - if (ret <= 0) + if (ret < 0) + goto out_ops; + + swap(dev->cfg, dev->cfg_pending); + if (!ret) goto out_ops; - ethtool_notify(req_info.dev, ops->set_ntf_cmd, NULL); + ethtool_notify(dev, ops->set_ntf_cmd, NULL); ret = 0; out_ops: - ethnl_ops_complete(req_info.dev); -out_rtnl: + ethnl_ops_complete(dev); +out_free_cfg: + kfree(dev->cfg_pending); +out_tie_cfg: + dev->cfg_pending = dev->cfg; rtnl_unlock(); out_dev: ethnl_parse_header_dev_put(&req_info); diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index d8cd4e4d7762..aeedd5ec6b8c 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <net/netdev_queues.h> + #include "netlink.h" #include "common.h" @@ -37,6 +39,10 @@ static int rings_prepare_data(const struct ethnl_req_info *req_base, ret = ethnl_ops_begin(dev); if (ret < 0) return ret; + + data->kernel_ringparam.tcp_data_split = dev->cfg->hds_config; + data->kernel_ringparam.hds_thresh = dev->cfg->hds_thresh; + dev->ethtool_ops->get_ringparam(dev, &data->ringparam, &data->kernel_ringparam, info->extack); ethnl_ops_complete(dev); @@ -209,17 +215,16 @@ ethnl_set_rings_validate(struct ethnl_req_info *req_info, static int ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) { - struct kernel_ethtool_ringparam kernel_ringparam = {}; - struct ethtool_ringparam ringparam = {}; + struct kernel_ethtool_ringparam kernel_ringparam; struct net_device *dev = req_info->dev; + struct ethtool_ringparam ringparam; struct nlattr **tb = info->attrs; const struct nlattr *err_attr; bool mod = false; int ret; - dev->ethtool_ops->get_ringparam(dev, &ringparam, - &kernel_ringparam, info->extack); - kernel_ringparam.tcp_data_split = dev->ethtool->hds_config; + ethtool_ringparam_get_cfg(dev, &ringparam, &kernel_ringparam, + info->extack); ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod); ethnl_update_u32(&ringparam.rx_mini_pending, @@ -292,13 +297,11 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) return -EINVAL; } + dev->cfg_pending->hds_config = kernel_ringparam.tcp_data_split; + dev->cfg_pending->hds_thresh = kernel_ringparam.hds_thresh; + ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); - if (!ret) { - dev->ethtool->hds_config = kernel_ringparam.tcp_data_split; - dev->ethtool->hds_thresh = kernel_ringparam.hds_thresh; - } - return ret < 0 ? ret : 1; } diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 7cb106b590ab..58df9ad02ce8 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -107,6 +107,8 @@ rss_prepare_ctx(const struct rss_req_info *request, struct net_device *dev, u32 total_size, indir_bytes; u8 *rss_config; + data->no_key_fields = !dev->ethtool_ops->rxfh_per_ctx_key; + ctx = xa_load(&dev->ethtool->rss_ctx, request->rss_context); if (!ctx) return -ENOENT; @@ -153,7 +155,6 @@ rss_prepare_data(const struct ethnl_req_info *req_base, if (!ops->cap_rss_ctx_supported && !ops->create_rxfh_context) return -EOPNOTSUPP; - data->no_key_fields = !ops->rxfh_per_ctx_key; return rss_prepare_ctx(request, dev, data, info); } diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c index 818cf01f0911..6b76c05caba4 100644 --- a/net/ethtool/strset.c +++ b/net/ethtool/strset.c @@ -75,6 +75,11 @@ static const struct strset_info info_template[] = { .count = __HWTSTAMP_FILTER_CNT, .strings = ts_rx_filter_names, }, + [ETH_SS_TS_FLAGS] = { + .per_dev = false, + .count = __HWTSTAMP_FLAG_CNT, + .strings = ts_flags_names, + }, [ETH_SS_UDP_TUNNEL_TYPES] = { .per_dev = false, .count = __ETHTOOL_UDP_TUNNEL_TYPE_CNT, diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c index 9188e088fb2f..2be356bdfe87 100644 --- a/net/ethtool/tsconfig.c +++ b/net/ethtool/tsconfig.c @@ -54,7 +54,7 @@ static int tsconfig_prepare_data(const struct ethnl_req_info *req_base, data->hwtst_config.tx_type = BIT(cfg.tx_type); data->hwtst_config.rx_filter = BIT(cfg.rx_filter); - data->hwtst_config.flags = BIT(cfg.flags); + data->hwtst_config.flags = cfg.flags; data->hwprov_desc.index = -1; hwprov = rtnl_dereference(dev->hwprov); @@ -91,10 +91,16 @@ static int tsconfig_reply_size(const struct ethnl_req_info *req_base, BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32); BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32); + BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32); - if (data->hwtst_config.flags) - /* _TSCONFIG_HWTSTAMP_FLAGS */ - len += nla_total_size(sizeof(u32)); + if (data->hwtst_config.flags) { + ret = ethnl_bitset32_size(&data->hwtst_config.flags, + NULL, __HWTSTAMP_FLAG_CNT, + ts_flags_names, compact); + if (ret < 0) + return ret; + len += ret; /* _TSCONFIG_HWTSTAMP_FLAGS */ + } if (data->hwtst_config.tx_type) { ret = ethnl_bitset32_size(&data->hwtst_config.tx_type, @@ -130,8 +136,10 @@ static int tsconfig_fill_reply(struct sk_buff *skb, int ret; if (data->hwtst_config.flags) { - ret = nla_put_u32(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS, - data->hwtst_config.flags); + ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS, + &data->hwtst_config.flags, NULL, + __HWTSTAMP_FLAG_CNT, + ts_flags_names, compact); if (ret < 0) return ret; } @@ -180,7 +188,7 @@ const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1] = [ETHTOOL_A_TSCONFIG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER] = NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy), - [ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS] = { .type = NLA_U32 }, + [ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS] = { .type = NLA_NESTED }, [ETHTOOL_A_TSCONFIG_RX_FILTERS] = { .type = NLA_NESTED }, [ETHTOOL_A_TSCONFIG_TX_TYPES] = { .type = NLA_NESTED }, }; @@ -296,6 +304,7 @@ static int ethnl_set_tsconfig(struct ethnl_req_info *req_base, BUILD_BUG_ON(__HWTSTAMP_TX_CNT >= 32); BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT >= 32); + BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32); if (!netif_device_present(dev)) return -ENODEV; @@ -377,9 +386,13 @@ static int ethnl_set_tsconfig(struct ethnl_req_info *req_base, } if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS]) { - ethnl_update_u32(&hwtst_config.flags, - tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS], - &config_mod); + ret = ethnl_update_bitset32(&hwtst_config.flags, + __HWTSTAMP_FLAG_CNT, + tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS], + ts_flags_names, info->extack, + &config_mod); + if (ret < 0) + goto err_free_hwprov; } ret = net_hwtstamp_validate(&hwtst_config); diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c index 1a195efc79cd..5b2cfac3b2ba 100644 --- a/net/hsr/hsr_debugfs.c +++ b/net/hsr/hsr_debugfs.c @@ -57,14 +57,11 @@ DEFINE_SHOW_ATTRIBUTE(hsr_node_table); void hsr_debugfs_rename(struct net_device *dev) { struct hsr_priv *priv = netdev_priv(dev); - struct dentry *d; + int err; - d = debugfs_rename(hsr_debugfs_root_dir, priv->node_tbl_root, - hsr_debugfs_root_dir, dev->name); - if (IS_ERR(d)) + err = debugfs_change_name(priv->node_tbl_root, "%s", dev->name); + if (err) netdev_warn(dev, "failed to rename\n"); - else - priv->node_tbl_root = d; } /* hsr_debugfs_init - create hsr node_table file for dumping diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index b6fb18469439..c6f8614e9ed1 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -643,7 +643,7 @@ void hsr_dev_setup(struct net_device *dev) /* Not sure about this. Taken from bridge code. netdevice.h says * it means "Does not change network namespaces". */ - dev->netns_local = true; + dev->netns_immutable = true; dev->needs_free_netdev = true; diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 87bb3a91598e..a4bacf198555 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -700,9 +700,12 @@ static int fill_frame_info(struct hsr_frame_info *frame, frame->is_vlan = true; if (frame->is_vlan) { - if (skb->mac_len < offsetofend(struct hsr_vlan_ethhdr, vlanhdr)) + /* Note: skb->mac_len might be wrong here. */ + if (!pskb_may_pull(skb, + skb_mac_offset(skb) + + offsetofend(struct hsr_vlan_ethhdr, vlanhdr))) return -EINVAL; - vlan_hdr = (struct hsr_vlan_ethhdr *)ethhdr; + vlan_hdr = (struct hsr_vlan_ethhdr *)skb_mac_header(skb); proto = vlan_hdr->vlanhdr.h_vlan_encapsulated_proto; } diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index b68f2f71d0e1..b120470246cc 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -29,10 +29,12 @@ static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = { /* Here, it seems a netdevice has already been allocated for us, and the * hsr_dev_setup routine has been executed. Nice! */ -static int hsr_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int hsr_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *link_net = rtnl_newlink_link_net(params); + struct nlattr **data = params->data; enum hsr_version proto_version; unsigned char multicast_spec; u8 proto = HSR_PROTOCOL_HSR; @@ -46,7 +48,7 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, NL_SET_ERR_MSG_MOD(extack, "Slave1 device not specified"); return -EINVAL; } - link[0] = __dev_get_by_index(src_net, + link[0] = __dev_get_by_index(link_net, nla_get_u32(data[IFLA_HSR_SLAVE1])); if (!link[0]) { NL_SET_ERR_MSG_MOD(extack, "Slave1 does not exist"); @@ -56,7 +58,7 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, NL_SET_ERR_MSG_MOD(extack, "Slave2 device not specified"); return -EINVAL; } - link[1] = __dev_get_by_index(src_net, + link[1] = __dev_get_by_index(link_net, nla_get_u32(data[IFLA_HSR_SLAVE2])); if (!link[1]) { NL_SET_ERR_MSG_MOD(extack, "Slave2 does not exist"); @@ -69,7 +71,7 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, } if (data[IFLA_HSR_INTERLINK]) - interlink = __dev_get_by_index(src_net, + interlink = __dev_get_by_index(link_net, nla_get_u32(data[IFLA_HSR_INTERLINK])); if (interlink && interlink == link[0]) { diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 175efd860f7b..9a9da74b0a4f 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -116,7 +116,7 @@ static void lowpan_setup(struct net_device *ldev) ldev->netdev_ops = &lowpan_netdev_ops; ldev->header_ops = &lowpan_header_ops; ldev->needs_free_netdev = true; - ldev->netns_local = true; + ldev->netns_immutable = true; } static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[], @@ -129,10 +129,11 @@ static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[], return 0; } -static int lowpan_newlink(struct net *src_net, struct net_device *ldev, - struct nlattr *tb[], struct nlattr *data[], +static int lowpan_newlink(struct net_device *ldev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **tb = params->tb; struct net_device *wdev; int ret; @@ -142,6 +143,8 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev, if (!tb[IFLA_LINK]) return -EINVAL; + if (params->link_net && !net_eq(params->link_net, dev_net(ldev))) + return -EINVAL; /* find and hold wpan device */ wdev = dev_get_by_index(dev_net(ldev), nla_get_u32(tb[IFLA_LINK])); if (!wdev) diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index 88adb04e4072..89b671b12600 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -226,11 +226,11 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) { if (!wpan_dev->netdev) continue; - wpan_dev->netdev->netns_local = false; + wpan_dev->netdev->netns_immutable = false; err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); if (err) break; - wpan_dev->netdev->netns_local = true; + wpan_dev->netdev->netns_immutable = true; } if (err) { @@ -242,11 +242,11 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev, list) { if (!wpan_dev->netdev) continue; - wpan_dev->netdev->netns_local = false; + wpan_dev->netdev->netns_immutable = false; err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d"); WARN_ON(err); - wpan_dev->netdev->netns_local = true; + wpan_dev->netdev->netns_immutable = true; } return err; @@ -291,7 +291,7 @@ static int cfg802154_netdev_notifier_call(struct notifier_block *nb, switch (state) { /* TODO NETDEV_DEVTYPE */ case NETDEV_REGISTER: - dev->netns_local = true; + dev->netns_immutable = true; wpan_dev->identifier = ++rdev->wpan_dev_id; list_add_rcu(&wpan_dev->list, &rdev->wpan_dev_list); rdev->devlist_generation++; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 21f46ee7b6e9..5df1f1325259 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -153,7 +153,7 @@ void inet_sock_destruct(struct sock *sk) WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc)); WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); WARN_ON_ONCE(sk->sk_wmem_queued); - WARN_ON_ONCE(sk_forward_alloc_get(sk)); + WARN_ON_ONCE(sk->sk_forward_alloc); kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index cb9a7ed8abd3..a648fff71ea7 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -659,10 +659,12 @@ static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb */ void arp_xmit(struct sk_buff *skb) { + rcu_read_lock(); /* Send it off, maybe filter it using firewalling first. */ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, - dev_net(skb->dev), NULL, skb, NULL, skb->dev, + dev_net_rcu(skb->dev), NULL, skb, NULL, skb->dev, arp_xmit_finish); + rcu_read_unlock(); } EXPORT_SYMBOL(arp_xmit); @@ -1062,8 +1064,8 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; return 0; } - if (__in_dev_get_rtnl(dev)) { - IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on); + if (__in_dev_get_rtnl_net(dev)) { + IN_DEV_CONF_SET(__in_dev_get_rtnl_net(dev), PROXY_ARP, on); return 0; } return -ENXIO; @@ -1075,7 +1077,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (!dev && (r->arp_flags & ATF_COM)) { - dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family, + dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; @@ -1293,14 +1295,14 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) switch (cmd) { case SIOCDARP: - rtnl_lock(); + rtnl_net_lock(net); err = arp_req_delete(net, &r); - rtnl_unlock(); + rtnl_net_unlock(net); break; case SIOCSARP: - rtnl_lock(); + rtnl_net_lock(net); err = arp_req_set(net, &r); - rtnl_unlock(); + rtnl_net_unlock(net); break; case SIOCGARP: rcu_read_lock(); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c8b3cf5fba4c..754f60fb6e25 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -46,6 +46,7 @@ #include <linux/notifier.h> #include <linux/inetdevice.h> #include <linux/igmp.h> +#include "igmp_internal.h" #include <linux/slab.h> #include <linux/hash.h> #ifdef CONFIG_SYSCTL @@ -107,15 +108,6 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_PROTO] = { .type = NLA_U8 }, }; -struct inet_fill_args { - u32 portid; - u32 seq; - int event; - unsigned int flags; - int netnsid; - int ifindex; -}; - #define IN4_ADDR_HSIZE_SHIFT 8 #define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) @@ -1371,10 +1363,11 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) __be32 addr = 0; unsigned char localnet_scope = RT_SCOPE_HOST; struct in_device *in_dev; - struct net *net = dev_net(dev); + struct net *net; int master_idx; rcu_read_lock(); + net = dev_net_rcu(dev); in_dev = __in_dev_get_rcu(dev); if (!in_dev) goto no_in_dev; @@ -1846,9 +1839,38 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, return 0; } -static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, - struct netlink_callback *cb, int *s_ip_idx, - struct inet_fill_args *fillargs) +static int in_dev_dump_ifmcaddr(struct in_device *in_dev, struct sk_buff *skb, + struct netlink_callback *cb, int *s_ip_idx, + struct inet_fill_args *fillargs) +{ + struct ip_mc_list *im; + int ip_idx = 0; + int err; + + for (im = rcu_dereference(in_dev->mc_list); + im; + im = rcu_dereference(im->next_rcu)) { + if (ip_idx < *s_ip_idx) { + ip_idx++; + continue; + } + err = inet_fill_ifmcaddr(skb, in_dev->dev, im, fillargs); + if (err < 0) + goto done; + + nl_dump_check_consistent(cb, nlmsg_hdr(skb)); + ip_idx++; + } + err = 0; + ip_idx = 0; +done: + *s_ip_idx = ip_idx; + return err; +} + +static int in_dev_dump_ifaddr(struct in_device *in_dev, struct sk_buff *skb, + struct netlink_callback *cb, int *s_ip_idx, + struct inet_fill_args *fillargs) { struct in_ifaddr *ifa; int ip_idx = 0; @@ -1874,6 +1896,21 @@ done: return err; } +static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb, + struct netlink_callback *cb, int *s_ip_idx, + struct inet_fill_args *fillargs) +{ + switch (fillargs->event) { + case RTM_NEWADDR: + return in_dev_dump_ifaddr(in_dev, skb, cb, s_ip_idx, fillargs); + case RTM_GETMULTICAST: + return in_dev_dump_ifmcaddr(in_dev, skb, cb, s_ip_idx, + fillargs); + default: + return -EINVAL; + } +} + /* Combine dev_addr_genid and dev_base_seq to detect changes. */ static u32 inet_base_seq(const struct net *net) @@ -1889,13 +1926,14 @@ static u32 inet_base_seq(const struct net *net) return res; } -static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +static int inet_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, + int event) { const struct nlmsghdr *nlh = cb->nlh; struct inet_fill_args fillargs = { .portid = NETLINK_CB(cb->skb).portid, .seq = nlh->nlmsg_seq, - .event = RTM_NEWADDR, + .event = event, .flags = NLM_F_MULTI, .netnsid = -1, }; @@ -1949,6 +1987,16 @@ done: return err; } +static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + return inet_dump_addr(skb, cb, RTM_NEWADDR); +} + +static int inet_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + return inet_dump_addr(skb, cb, RTM_GETMULTICAST); +} + static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, u32 portid) { @@ -2845,6 +2893,8 @@ static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_GETNETCONF, .doit = inet_netconf_get_devconf, .dumpit = inet_netconf_dump_devconf, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, + {.owner = THIS_MODULE, .protocol = PF_INET, .msgtype = RTM_GETMULTICAST, + .dumpit = inet_dump_ifmcaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED}, }; void __init devinet_init(void) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index b0fbf804bbba..0e4076866c0a 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -279,7 +279,7 @@ static void esp_output_done(void *data, int err) x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) esp_output_tail_tcp(x, skb); else - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb_to_full_sk(skb), skb, err); } } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 272e42d81323..6de77415b5b3 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -553,18 +553,16 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, const struct in_ifaddr *ifa; struct in_device *in_dev; - in_dev = __in_dev_get_rtnl(dev); + in_dev = __in_dev_get_rtnl_net(dev); if (!in_dev) return -ENODEV; *colon = ':'; - rcu_read_lock(); - in_dev_for_each_ifa_rcu(ifa, in_dev) { + in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) { if (strcmp(ifa->ifa_label, devname) == 0) break; } - rcu_read_unlock(); if (!ifa) return -ENODEV; @@ -635,7 +633,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; - rtnl_lock(); + rtnl_net_lock(net); err = rtentry_to_fib_config(net, cmd, rt, &cfg); if (err == 0) { struct fib_table *tb; @@ -659,7 +657,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt) /* allocated by rtentry_to_fib_config() */ kfree(cfg.fc_mx); } - rtnl_unlock(); + rtnl_net_unlock(net); return err; } return -EINVAL; @@ -837,19 +835,33 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, } } + if (cfg->fc_dst_len > 32) { + NL_SET_ERR_MSG(extack, "Invalid prefix length"); + err = -EINVAL; + goto errout; + } + + if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) { + NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length"); + err = -EINVAL; + goto errout; + } + if (cfg->fc_nh_id) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_encap || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); - return -EINVAL; + err = -EINVAL; + goto errout; } } if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); - return -EINVAL; + err = -EINVAL; + goto errout; } if (!cfg->fc_table) @@ -872,20 +884,24 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + rtnl_net_lock(net); + if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); err = -EINVAL; - goto errout; + goto unlock; } tb = fib_get_table(net, cfg.fc_table); if (!tb) { NL_SET_ERR_MSG(extack, "FIB table does not exist"); err = -ESRCH; - goto errout; + goto unlock; } err = fib_table_delete(net, tb, &cfg, extack); +unlock: + rtnl_net_unlock(net); errout: return err; } @@ -902,15 +918,20 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + rtnl_net_lock(net); + tb = fib_new_table(net, cfg.fc_table); if (!tb) { err = -ENOBUFS; - goto errout; + goto unlock; } err = fib_table_insert(net, tb, &cfg, extack); if (!err && cfg.fc_type == RTN_LOCAL) net->ipv4.fib_has_custom_local_routes = true; + +unlock: + rtnl_net_unlock(net); errout: return err; } @@ -1450,7 +1471,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, fib_sync_up(dev, RTNH_F_DEAD); #endif atomic_inc(&net->ipv4.dev_addr_genid); - rt_cache_flush(dev_net(dev)); + rt_cache_flush(net); break; case NETDEV_DOWN: fib_del_ifaddr(ifa, NULL); @@ -1461,7 +1482,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, */ fib_disable_ip(dev, event, true); } else { - rt_cache_flush(dev_net(dev)); + rt_cache_flush(net); } break; } @@ -1575,7 +1596,7 @@ static void ip_fib_net_exit(struct net *net) { int i; - ASSERT_RTNL(); + ASSERT_RTNL_NET(net); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); @@ -1615,9 +1636,15 @@ static int __net_init fib_net_init(struct net *net) error = ip_fib_net_init(net); if (error < 0) goto out; + + error = fib4_semantics_init(net); + if (error) + goto out_semantics; + error = nl_fib_lookup_init(net); if (error < 0) goto out_nlfl; + error = fib_proc_init(net); if (error < 0) goto out_proc; @@ -1627,9 +1654,11 @@ out: out_proc: nl_fib_lookup_exit(net); out_nlfl: - rtnl_lock(); + fib4_semantics_exit(net); +out_semantics: + rtnl_net_lock(net); ip_fib_net_exit(net); - rtnl_unlock(); + rtnl_net_unlock(net); goto out; } @@ -1644,10 +1673,15 @@ static void __net_exit fib_net_exit_batch(struct list_head *net_list) struct net *net; rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) + list_for_each_entry(net, net_list, exit_list) { + __rtnl_net_lock(net); ip_fib_net_exit(net); - + __rtnl_net_unlock(net); + } rtnl_unlock(); + + list_for_each_entry(net, net_list, exit_list) + fib4_semantics_exit(net); } static struct pernet_operations fib_net_ops = { @@ -1658,9 +1692,9 @@ static struct pernet_operations fib_net_ops = { static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_NEWROUTE, - .doit = inet_rtm_newroute}, + .doit = inet_rtm_newroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_DELROUTE, - .doit = inet_rtm_delroute}, + .doit = inet_rtm_delroute, .flags = RTNL_FLAG_DOIT_PERNET}, {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib, .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, }; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 9517b8667e00..fa58d6620ed6 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -37,6 +37,7 @@ struct fib4_rule { u8 dst_len; u8 src_len; dscp_t dscp; + dscp_t dscp_mask; u8 dscp_full:1; /* DSCP or TOS selector */ __be32 src; __be32 srcmask; @@ -192,7 +193,8 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, * to mask the upper three DSCP bits prior to matching to maintain * legacy behavior. */ - if (r->dscp_full && r->dscp != inet_dsfield_to_dscp(fl4->flowi4_tos)) + if (r->dscp_full && + (r->dscp ^ inet_dsfield_to_dscp(fl4->flowi4_tos)) & r->dscp_mask) return 0; else if (!r->dscp_full && r->dscp && !fib_dscp_masked_match(r->dscp, fl4)) @@ -201,12 +203,12 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto)) return 0; - if (fib_rule_port_range_set(&rule->sport_range) && - !fib_rule_port_inrange(&rule->sport_range, fl4->fl4_sport)) + if (!fib_rule_port_match(&rule->sport_range, rule->sport_mask, + fl4->fl4_sport)) return 0; - if (fib_rule_port_range_set(&rule->dport_range) && - !fib_rule_port_inrange(&rule->dport_range, fl4->fl4_dport)) + if (!fib_rule_port_match(&rule->dport_range, rule->dport_mask, + fl4->fl4_dport)) return 0; return 1; @@ -235,19 +237,43 @@ static int fib4_nl2rule_dscp(const struct nlattr *nla, struct fib4_rule *rule4, } rule4->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); + rule4->dscp_mask = inet_dsfield_to_dscp(INET_DSCP_MASK); rule4->dscp_full = true; return 0; } +static int fib4_nl2rule_dscp_mask(const struct nlattr *nla, + struct fib4_rule *rule4, + struct netlink_ext_ack *extack) +{ + dscp_t dscp_mask; + + if (!rule4->dscp_full) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "Cannot specify DSCP mask without DSCP value"); + return -EINVAL; + } + + dscp_mask = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); + if (rule4->dscp & ~dscp_mask) { + NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid DSCP mask"); + return -EINVAL; + } + + rule4->dscp_mask = dscp_mask; + + return 0; +} + static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { - struct net *net = sock_net(skb->sk); + struct fib4_rule *rule4 = (struct fib4_rule *)rule; + struct net *net = rule->fr_net; int err = -EINVAL; - struct fib4_rule *rule4 = (struct fib4_rule *) rule; if (tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) { NL_SET_ERR_MSG(extack, @@ -271,6 +297,10 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, fib4_nl2rule_dscp(tb[FRA_DSCP], rule4, extack) < 0) goto errout; + if (tb[FRA_DSCP_MASK] && + fib4_nl2rule_dscp_mask(tb[FRA_DSCP_MASK], rule4, extack) < 0) + goto errout; + /* split local/main if they are not already split */ err = fib_unmerge(net); if (err) @@ -366,6 +396,14 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, return 0; } + if (tb[FRA_DSCP_MASK]) { + dscp_t dscp_mask; + + dscp_mask = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP_MASK]) << 2); + if (!rule4->dscp_full || rule4->dscp_mask != dscp_mask) + return 0; + } + #ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) return 0; @@ -391,7 +429,9 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, if (rule4->dscp_full) { frh->tos = 0; if (nla_put_u8(skb, FRA_DSCP, - inet_dscp_to_dsfield(rule4->dscp) >> 2)) + inet_dscp_to_dsfield(rule4->dscp) >> 2) || + nla_put_u8(skb, FRA_DSCP_MASK, + inet_dscp_to_dsfield(rule4->dscp_mask) >> 2)) goto nla_put_failure; } else { frh->tos = inet_dscp_to_dsfield(rule4->dscp); @@ -418,7 +458,8 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) return nla_total_size(4) /* dst */ + nla_total_size(4) /* src */ + nla_total_size(4) /* flow */ - + nla_total_size(1); /* dscp */ + + nla_total_size(1) /* dscp */ + + nla_total_size(1); /* dscp mask */ } static void fib4_rule_flush_cache(struct fib_rules_ops *ops) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d2cee5c314f5..f68bb9e34c34 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -50,12 +50,6 @@ #include "fib_lookup.h" -static struct hlist_head *fib_info_hash; -static struct hlist_head *fib_info_laddrhash; -static unsigned int fib_info_hash_size; -static unsigned int fib_info_hash_bits; -static unsigned int fib_info_cnt; - /* for_nexthops and change_nexthops only used when nexthop object * is not set in a fib_info. The logic within can reference fib_nh. */ @@ -258,8 +252,7 @@ void fib_release_info(struct fib_info *fi) ASSERT_RTNL(); if (fi && refcount_dec_and_test(&fi->fib_treeref)) { hlist_del(&fi->fib_hash); - - fib_info_cnt--; + fi->fib_net->ipv4.fib_info_cnt--; if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); @@ -335,11 +328,12 @@ static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope, static unsigned int fib_info_hashfn_result(const struct net *net, unsigned int val) { - return hash_32(val ^ net_hash_mix(net), fib_info_hash_bits); + return hash_32(val ^ net_hash_mix(net), net->ipv4.fib_info_hash_bits); } -static inline unsigned int fib_info_hashfn(struct fib_info *fi) +static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi) { + struct net *net = fi->fib_net; unsigned int val; val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol, @@ -354,7 +348,70 @@ static inline unsigned int fib_info_hashfn(struct fib_info *fi) } endfor_nexthops(fi) } - return fib_info_hashfn_result(fi->fib_net, val); + return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)]; +} + +static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net, + __be32 val) +{ + unsigned int hash_bits = net->ipv4.fib_info_hash_bits; + u32 slot; + + slot = hash_32(net_hash_mix(net) ^ (__force u32)val, hash_bits); + + return &net->ipv4.fib_info_hash[(1 << hash_bits) + slot]; +} + +static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits) +{ + /* The second half is used for prefsrc */ + return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head *), + GFP_KERNEL); +} + +static void fib_info_hash_free(struct hlist_head *head) +{ + kvfree(head); +} + +static void fib_info_hash_grow(struct net *net) +{ + unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits; + struct hlist_head *new_info_hash, *old_info_hash; + unsigned int i; + + if (net->ipv4.fib_info_cnt < old_size) + return; + + new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1); + if (!new_info_hash) + return; + + old_info_hash = net->ipv4.fib_info_hash; + net->ipv4.fib_info_hash = new_info_hash; + net->ipv4.fib_info_hash_bits += 1; + + for (i = 0; i < old_size; i++) { + struct hlist_head *head = &old_info_hash[i]; + struct hlist_node *n; + struct fib_info *fi; + + hlist_for_each_entry_safe(fi, n, head, fib_hash) + hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); + } + + for (i = 0; i < old_size; i++) { + struct hlist_head *lhead = &old_info_hash[old_size + i]; + struct hlist_node *n; + struct fib_info *fi; + + hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) + hlist_add_head(&fi->fib_lhash, + fib_info_laddrhash_bucket(fi->fib_net, + fi->fib_prefsrc)); + } + + fib_info_hash_free(old_info_hash); } /* no metrics, only nexthop id */ @@ -370,13 +427,12 @@ static struct fib_info *fib_find_info_nh(struct net *net, (__force u32)cfg->fc_prefsrc, cfg->fc_priority); hash = fib_info_hashfn_result(net, hash); - head = &fib_info_hash[hash]; + head = &net->ipv4.fib_info_hash[hash]; hlist_for_each_entry(fi, head, fib_hash) { - if (!net_eq(fi->fib_net, net)) - continue; if (!fi->nh || fi->nh->id != cfg->fc_nh_id) continue; + if (cfg->fc_protocol == fi->fib_protocol && cfg->fc_scope == fi->fib_scope && cfg->fc_prefsrc == fi->fib_prefsrc && @@ -392,18 +448,13 @@ static struct fib_info *fib_find_info_nh(struct net *net, static struct fib_info *fib_find_info(struct fib_info *nfi) { - struct hlist_head *head; + struct hlist_head *head = fib_info_hash_bucket(nfi); struct fib_info *fi; - unsigned int hash; - - hash = fib_info_hashfn(nfi); - head = &fib_info_hash[hash]; hlist_for_each_entry(fi, head, fib_hash) { - if (!net_eq(fi->fib_net, nfi->fib_net)) - continue; if (fi->fib_nhs != nfi->fib_nhs) continue; + if (nfi->fib_protocol == fi->fib_protocol && nfi->fib_scope == fi->fib_scope && nfi->fib_prefsrc == fi->fib_prefsrc && @@ -1239,64 +1290,6 @@ int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, return err; } -static struct hlist_head * -fib_info_laddrhash_bucket(const struct net *net, __be32 val) -{ - u32 slot = hash_32(net_hash_mix(net) ^ (__force u32)val, - fib_info_hash_bits); - - return &fib_info_laddrhash[slot]; -} - -static void fib_info_hash_move(struct hlist_head *new_info_hash, - struct hlist_head *new_laddrhash, - unsigned int new_size) -{ - struct hlist_head *old_info_hash, *old_laddrhash; - unsigned int old_size = fib_info_hash_size; - unsigned int i; - - ASSERT_RTNL(); - old_info_hash = fib_info_hash; - old_laddrhash = fib_info_laddrhash; - fib_info_hash_size = new_size; - fib_info_hash_bits = ilog2(new_size); - - for (i = 0; i < old_size; i++) { - struct hlist_head *head = &fib_info_hash[i]; - struct hlist_node *n; - struct fib_info *fi; - - hlist_for_each_entry_safe(fi, n, head, fib_hash) { - struct hlist_head *dest; - unsigned int new_hash; - - new_hash = fib_info_hashfn(fi); - dest = &new_info_hash[new_hash]; - hlist_add_head(&fi->fib_hash, dest); - } - } - fib_info_hash = new_info_hash; - - fib_info_laddrhash = new_laddrhash; - for (i = 0; i < old_size; i++) { - struct hlist_head *lhead = &old_laddrhash[i]; - struct hlist_node *n; - struct fib_info *fi; - - hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) { - struct hlist_head *ldest; - - ldest = fib_info_laddrhash_bucket(fi->fib_net, - fi->fib_prefsrc); - hlist_add_head(&fi->fib_lhash, ldest); - } - } - - kvfree(old_info_hash); - kvfree(old_laddrhash); -} - __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, unsigned char scope) { @@ -1409,32 +1402,14 @@ struct fib_info *fib_create_info(struct fib_config *cfg, } #endif - err = -ENOBUFS; - - if (fib_info_cnt >= fib_info_hash_size) { - unsigned int new_size = fib_info_hash_size << 1; - struct hlist_head *new_info_hash; - struct hlist_head *new_laddrhash; - size_t bytes; - - if (!new_size) - new_size = 16; - bytes = (size_t)new_size * sizeof(struct hlist_head *); - new_info_hash = kvzalloc(bytes, GFP_KERNEL); - new_laddrhash = kvzalloc(bytes, GFP_KERNEL); - if (!new_info_hash || !new_laddrhash) { - kvfree(new_info_hash); - kvfree(new_laddrhash); - } else { - fib_info_hash_move(new_info_hash, new_laddrhash, new_size); - } - if (!fib_info_hash_size) - goto failure; - } + fib_info_hash_grow(net); fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); - if (!fi) + if (!fi) { + err = -ENOBUFS; goto failure; + } + fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); if (IS_ERR(fi->fib_metrics)) { err = PTR_ERR(fi->fib_metrics); @@ -1571,9 +1546,9 @@ link_it: refcount_set(&fi->fib_treeref, 1); refcount_set(&fi->fib_clntref, 1); - fib_info_cnt++; - hlist_add_head(&fi->fib_hash, - &fib_info_hash[fib_info_hashfn(fi)]); + net->ipv4.fib_info_cnt++; + hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); + if (fi->fib_prefsrc) { struct hlist_head *head; @@ -1855,7 +1830,7 @@ int fib_sync_down_addr(struct net_device *dev, __be32 local) struct fib_info *fi; int ret = 0; - if (!fib_info_laddrhash || local == 0) + if (!local) return 0; head = fib_info_laddrhash_bucket(net, local); @@ -2257,3 +2232,22 @@ check_saddr: fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK); } } + +int __net_init fib4_semantics_init(struct net *net) +{ + unsigned int hash_bits = 4; + + net->ipv4.fib_info_hash = fib_info_hash_alloc(hash_bits); + if (!net->ipv4.fib_info_hash) + return -ENOMEM; + + net->ipv4.fib_info_hash_bits = hash_bits; + net->ipv4.fib_info_cnt = 0; + + return 0; +} + +void __net_exit fib4_semantics_exit(struct net *net) +{ + fib_info_hash_free(net->ipv4.fib_info_hash); +} diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index d6411ac81096..59a6f0a9638f 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1187,22 +1187,6 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, return 0; } -static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) -{ - if (plen > KEYLENGTH) { - NL_SET_ERR_MSG(extack, "Invalid prefix length"); - return false; - } - - if ((plen < KEYLENGTH) && (key << plen)) { - NL_SET_ERR_MSG(extack, - "Invalid prefix for given prefix length"); - return false; - } - - return true; -} - static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old); @@ -1223,9 +1207,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); fi = fib_create_info(cfg, extack); @@ -1717,9 +1698,6 @@ int fib_table_delete(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 094084b61bff..717cb7d3607a 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -399,13 +399,12 @@ static void icmp_push_reply(struct sock *sk, static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) { - struct ipcm_cookie ipc; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->dst.dev); + struct net *net = dev_net_rcu(rt->dst.dev); bool apply_ratelimit = false; + struct ipcm_cookie ipc; struct flowi4 fl4; struct sock *sk; - struct inet_sock *inet; __be32 daddr, saddr; u32 mark = IP4_REPLY_MARK(net, skb->mark); int type = icmp_param->data.icmph.type; @@ -424,12 +423,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) sk = icmp_xmit_lock(net); if (!sk) goto out_bh_enable; - inet = inet_sk(sk); icmp_param->data.icmph.checksum = 0; ipcm_init(&ipc); - inet->tos = ip_hdr(skb)->tos; + ipc.tos = ip_hdr(skb)->tos; ipc.sockc.mark = mark; daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); @@ -608,12 +606,14 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, struct sock *sk; if (!rt) - goto out; + return; + + rcu_read_lock(); if (rt->dst.dev) - net = dev_net(rt->dst.dev); + net = dev_net_rcu(rt->dst.dev); else if (skb_in->dev) - net = dev_net(skb_in->dev); + net = dev_net_rcu(skb_in->dev); else goto out; @@ -735,8 +735,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, icmp_param.data.icmph.checksum = 0; icmp_param.skb = skb_in; icmp_param.offset = skb_network_offset(skb_in); - inet_sk(sk)->tos = tos; ipcm_init(&ipc); + ipc.tos = tos; ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts.opt; ipc.sockc.mark = mark; @@ -785,7 +785,8 @@ out_unlock: icmp_xmit_unlock(sk); out_bh_enable: local_bh_enable(); -out:; +out: + rcu_read_unlock(); } EXPORT_SYMBOL(__icmp_send); @@ -834,7 +835,7 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) * avoid additional coding at protocol handlers. */ if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) { - __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); return; } @@ -868,7 +869,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb) struct net *net; u32 info = 0; - net = dev_net(skb_dst(skb)->dev); + net = dev_net_rcu(skb_dst(skb)->dev); /* * Incomplete header ? @@ -979,7 +980,7 @@ out_err: static enum skb_drop_reason icmp_redirect(struct sk_buff *skb) { if (skb->len < sizeof(struct iphdr)) { - __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); return SKB_DROP_REASON_PKT_TOO_SMALL; } @@ -1011,7 +1012,7 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) struct icmp_bxm icmp_param; struct net *net; - net = dev_net(skb_dst(skb)->dev); + net = dev_net_rcu(skb_dst(skb)->dev); /* should there be an ICMP stat for ignored echos? */ if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all)) return SKB_NOT_DROPPED_YET; @@ -1040,9 +1041,9 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr) { + struct net *net = dev_net_rcu(skb->dev); struct icmp_ext_hdr *ext_hdr, _ext_hdr; struct icmp_ext_echo_iio *iio, _iio; - struct net *net = dev_net(skb->dev); struct inet6_dev *in6_dev; struct in_device *in_dev; struct net_device *dev; @@ -1181,7 +1182,7 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb) return SKB_NOT_DROPPED_YET; out_err: - __ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(dev_net_rcu(skb_dst(skb)->dev), ICMP_MIB_INERRORS); return SKB_DROP_REASON_PKT_TOO_SMALL; } @@ -1198,7 +1199,7 @@ int icmp_rcv(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct rtable *rt = skb_rtable(skb); - struct net *net = dev_net(rt->dst.dev); + struct net *net = dev_net_rcu(rt->dst.dev); struct icmphdr *icmph; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { @@ -1247,22 +1248,6 @@ int icmp_rcv(struct sk_buff *skb) goto reason_check; } - if (icmph->type == ICMP_EXT_ECHOREPLY) { - reason = ping_rcv(skb); - goto reason_check; - } - - /* - * 18 is the highest 'known' ICMP type. Anything else is a mystery - * - * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently - * discarded. - */ - if (icmph->type > NR_ICMP_TYPES) { - reason = SKB_DROP_REASON_UNHANDLED_PROTO; - goto error; - } - /* * Parse the ICMP message */ @@ -1289,6 +1274,23 @@ int icmp_rcv(struct sk_buff *skb) } } + if (icmph->type == ICMP_EXT_ECHOREPLY || + icmph->type == ICMP_ECHOREPLY) { + reason = ping_rcv(skb); + return reason ? NET_RX_DROP : NET_RX_SUCCESS; + } + + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. + */ + if (icmph->type > NR_ICMP_TYPES) { + reason = SKB_DROP_REASON_UNHANDLED_PROTO; + goto error; + } + reason = icmp_pointers[icmph->type].handler(skb); reason_check: if (!reason) { @@ -1371,9 +1373,9 @@ int icmp_err(struct sk_buff *skb, u32 info) struct iphdr *iph = (struct iphdr *)skb->data; int offset = iph->ihl<<2; struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset); + struct net *net = dev_net_rcu(skb->dev); int type = icmp_hdr(skb)->type; int code = icmp_hdr(skb)->code; - struct net *net = dev_net(skb->dev); /* * Use ping_err to handle all icmp errors except those diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 3da126cea884..2c394c364cb9 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -81,6 +81,7 @@ #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> +#include "igmp_internal.h" #include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/times.h> @@ -1432,14 +1433,16 @@ static void ip_mc_hash_remove(struct in_device *in_dev, *mc_hash = im->next_hash; } -static int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev, - const struct ip_mc_list *im, int event) +int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev, + const struct ip_mc_list *im, + struct inet_fill_args *args) { struct ifa_cacheinfo ci; struct ifaddrmsg *ifm; struct nlmsghdr *nlh; - nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct ifaddrmsg), 0); + nlh = nlmsg_put(skb, args->portid, args->seq, args->event, + sizeof(struct ifaddrmsg), args->flags); if (!nlh) return -EMSGSIZE; @@ -1468,6 +1471,9 @@ static int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev, static void inet_ifmcaddr_notify(struct net_device *dev, const struct ip_mc_list *im, int event) { + struct inet_fill_args fillargs = { + .event = event, + }; struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOMEM; @@ -1479,7 +1485,7 @@ static void inet_ifmcaddr_notify(struct net_device *dev, if (!skb) goto error; - err = inet_fill_ifmcaddr(skb, dev, im, event); + err = inet_fill_ifmcaddr(skb, dev, im, &fillargs); if (err < 0) { WARN_ON_ONCE(err == -EMSGSIZE); nlmsg_free(skb); diff --git a/net/ipv4/igmp_internal.h b/net/ipv4/igmp_internal.h new file mode 100644 index 000000000000..0a1bcc8ec8e1 --- /dev/null +++ b/net/ipv4/igmp_internal.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_IGMP_INTERNAL_H +#define _LINUX_IGMP_INTERNAL_H + +struct inet_fill_args { + u32 portid; + u32 seq; + int event; + unsigned int flags; + int netnsid; + int ifindex; +}; + +int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev, + const struct ip_mc_list *im, + struct inet_fill_args *args); +#endif diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index e4decfb270fa..bf9ce0c19657 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -799,18 +799,6 @@ void inet_csk_clear_xmit_timers_sync(struct sock *sk) sk_stop_timer_sync(sk, &sk->sk_timer); } -void inet_csk_delete_keepalive_timer(struct sock *sk) -{ - sk_stop_timer(sk, &sk->sk_timer); -} -EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); - -void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) -{ - sk_reset_timer(sk, &sk->sk_timer, jiffies + len); -} -EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); - struct dst_entry *inet_csk_route_req(const struct sock *sk, struct flowi4 *fl4, const struct request_sock *req) @@ -1249,39 +1237,59 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, const gfp_t priority) { struct sock *newsk = sk_clone_lock(sk, priority); + struct inet_connection_sock *newicsk; + struct inet_request_sock *ireq; + struct inet_sock *newinet; - if (newsk) { - struct inet_connection_sock *newicsk = inet_csk(newsk); + if (!newsk) + return NULL; - inet_sk_set_state(newsk, TCP_SYN_RECV); - newicsk->icsk_bind_hash = NULL; - newicsk->icsk_bind2_hash = NULL; + newicsk = inet_csk(newsk); + newinet = inet_sk(newsk); + ireq = inet_rsk(req); - inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port; - inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num; - inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num); + newicsk->icsk_bind_hash = NULL; + newicsk->icsk_bind2_hash = NULL; - /* listeners have SOCK_RCU_FREE, not the children */ - sock_reset_flag(newsk, SOCK_RCU_FREE); + newinet->inet_dport = ireq->ir_rmt_port; + newinet->inet_num = ireq->ir_num; + newinet->inet_sport = htons(ireq->ir_num); - inet_sk(newsk)->mc_list = NULL; + newsk->sk_bound_dev_if = ireq->ir_iif; - newsk->sk_mark = inet_rsk(req)->ir_mark; - atomic64_set(&newsk->sk_cookie, - atomic64_read(&inet_rsk(req)->ir_cookie)); + newsk->sk_daddr = ireq->ir_rmt_addr; + newsk->sk_rcv_saddr = ireq->ir_loc_addr; + newinet->inet_saddr = ireq->ir_loc_addr; - newicsk->icsk_retransmits = 0; - newicsk->icsk_backoff = 0; - newicsk->icsk_probes_out = 0; - newicsk->icsk_probes_tstamp = 0; +#if IS_ENABLED(CONFIG_IPV6) + newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; + newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; +#endif - /* Deinitialize accept_queue to trap illegal accesses. */ - memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); + /* listeners have SOCK_RCU_FREE, not the children */ + sock_reset_flag(newsk, SOCK_RCU_FREE); - inet_clone_ulp(req, newsk, priority); + inet_sk(newsk)->mc_list = NULL; + + newsk->sk_mark = inet_rsk(req)->ir_mark; + atomic64_set(&newsk->sk_cookie, + atomic64_read(&inet_rsk(req)->ir_cookie)); + + newicsk->icsk_retransmits = 0; + newicsk->icsk_backoff = 0; + newicsk->icsk_probes_out = 0; + newicsk->icsk_probes_tstamp = 0; + + /* Deinitialize accept_queue to trap illegal accesses. */ + memset(&newicsk->icsk_accept_queue, 0, + sizeof(newicsk->icsk_accept_queue)); + + inet_sk_set_state(newsk, TCP_SYN_RECV); + + inet_clone_ulp(req, newsk, priority); + + security_inet_csk_clone(newsk, req); - security_inet_csk_clone(newsk, req); - } return newsk; } EXPORT_SYMBOL_GPL(inet_csk_clone_lock); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 321acc8abf17..efe2a085cf68 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -282,7 +282,7 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct inet_diag_meminfo minfo = { .idiag_rmem = sk_rmem_alloc_get(sk), .idiag_wmem = READ_ONCE(sk->sk_wmem_queued), - .idiag_fmem = sk_forward_alloc_get(sk), + .idiag_fmem = READ_ONCE(sk->sk_forward_alloc), .idiag_tmem = sk_wmem_alloc_get(sk), }; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index b8b23a77ceb4..7b1e0a2d6906 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -60,7 +60,7 @@ void inet_peer_base_init(struct inet_peer_base *bp) seqlock_init(&bp->lock); bp->total = 0; } -EXPORT_SYMBOL_GPL(inet_peer_base_init); +EXPORT_IPV6_MOD_GPL(inet_peer_base_init); #define PEER_MAX_GC 32 @@ -218,7 +218,7 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, return p; } -EXPORT_SYMBOL_GPL(inet_getpeer); +EXPORT_IPV6_MOD_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { @@ -269,7 +269,7 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) WRITE_ONCE(peer->rate_tokens, token); return rc; } -EXPORT_SYMBOL(inet_peer_xrlim_allow); +EXPORT_IPV6_MOD(inet_peer_xrlim_allow); void inetpeer_invalidate_tree(struct inet_peer_base *base) { @@ -286,4 +286,4 @@ void inetpeer_invalidate_tree(struct inet_peer_base *base) base->total = 0; } -EXPORT_SYMBOL(inetpeer_invalidate_tree); +EXPORT_IPV6_MOD(inetpeer_invalidate_tree); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index ed1b6b44faf8..26d15f907551 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -141,7 +141,6 @@ static int ipgre_err(struct sk_buff *skb, u32 info, const struct iphdr *iph; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; - unsigned int data_len = 0; struct ip_tunnel *t; if (tpi->proto == htons(ETH_P_TEB)) @@ -182,7 +181,6 @@ static int ipgre_err(struct sk_buff *skb, u32 info, case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return 0; - data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; case ICMP_REDIRECT: @@ -190,10 +188,16 @@ static int ipgre_err(struct sk_buff *skb, u32 info, } #if IS_ENABLED(CONFIG_IPV6) - if (tpi->proto == htons(ETH_P_IPV6) && - !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, - type, data_len)) - return 0; + if (tpi->proto == htons(ETH_P_IPV6)) { + unsigned int data_len = 0; + + if (type == ICMP_TIME_EXCEEDED) + data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ + + if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len, + type, data_len)) + return 0; + } #endif if (t->parms.iph.daddr == 0 || @@ -1392,10 +1396,12 @@ ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[]) return 0; } -static int ipgre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipgre_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; @@ -1407,13 +1413,16 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; - return ip_tunnel_newlink(dev, tb, &p, fwmark); + return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p, + fwmark); } -static int erspan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int erspan_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; int err; @@ -1425,7 +1434,8 @@ static int erspan_newlink(struct net *src_net, struct net_device *dev, err = erspan_netlink_parms(dev, data, tb, &p, &fwmark); if (err) return err; - return ip_tunnel_newlink(dev, tb, &p, fwmark); + return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p, + fwmark); } static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], @@ -1693,6 +1703,7 @@ static struct rtnl_link_ops erspan_link_ops __read_mostly = { struct net_device *gretap_fb_dev_create(struct net *net, const char *name, u8 name_assign_type) { + struct rtnl_newlink_params params = { .src_net = net }; struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; LIST_HEAD(list_kill); @@ -1700,6 +1711,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, int err; memset(&tb, 0, sizeof(tb)); + params.tb = tb; dev = rtnl_create_link(net, name, name_assign_type, &ipgre_tap_ops, tb, NULL); @@ -1710,7 +1722,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, t = netdev_priv(dev); t->collect_md = true; - err = ipgre_newlink(net, dev, tb, NULL, NULL); + err = ipgre_newlink(dev, ¶ms, NULL); if (err < 0) { free_netdev(dev); return ERR_PTR(err); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index f6a03b418dde..6d9c5c20b1c4 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -128,20 +128,20 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb, static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb) { - char *secdata; - u32 seclen, secid; + struct lsm_context ctx; + u32 secid; int err; err = security_socket_getpeersec_dgram(NULL, skb, &secid); if (err) return; - err = security_secid_to_secctx(secid, &secdata, &seclen); - if (err) + err = security_secid_to_secctx(secid, &ctx); + if (err < 0) return; - put_cmsg(msg, SOL_IP, SCM_SECURITY, seclen, secdata); - security_release_secctx(secdata, seclen); + put_cmsg(msg, SOL_IP, SCM_SECURITY, ctx.len, ctx.context); + security_release_secctx(&ctx); } static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 09b73acf037a..4b06dc7e04f2 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1162,7 +1162,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id, * Allowing to move it to another netns is clearly unsafe. */ if (!IS_ERR(itn->fb_tunnel_dev)) { - itn->fb_tunnel_dev->netns_local = true; + itn->fb_tunnel_dev->netns_immutable = true; itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); itn->type = itn->fb_tunnel_dev->type; @@ -1213,11 +1213,11 @@ void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, } EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); -int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm_kern *p, __u32 fwmark) +int ip_tunnel_newlink(struct net *net, struct net_device *dev, + struct nlattr *tb[], struct ip_tunnel_parm_kern *p, + __u32 fwmark) { struct ip_tunnel *nt; - struct net *net = dev_net(dev); struct ip_tunnel_net *itn; int mtu; int err; @@ -1326,7 +1326,6 @@ int ip_tunnel_init(struct net_device *dev) } tunnel->dev = dev; - tunnel->net = dev_net(dev); strscpy(tunnel->parms.name, dev->name); iph->version = 4; iph->ihl = 5; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index f0b4419cef34..159b4473290e 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -575,15 +575,18 @@ static void vti_netlink_parms(struct nlattr *data[], *fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } -static int vti_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int vti_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct nlattr **data = params->data; struct ip_tunnel_parm_kern parms; + struct nlattr **tb = params->tb; __u32 fwmark = 0; vti_netlink_parms(data, &parms, &fwmark); - return ip_tunnel_newlink(dev, tb, &parms, fwmark); + return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, + &parms, fwmark); } static int vti_changelink(struct net_device *dev, struct nlattr *tb[], diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index dc0db5895e0e..bab0bf90c908 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -436,11 +436,13 @@ static void ipip_netlink_parms(struct nlattr *data[], *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } -static int ipip_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipip_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; struct ip_tunnel_parm_kern p; __u32 fwmark = 0; @@ -453,7 +455,8 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev, } ipip_netlink_parms(data, &p, &t->collect_md, &fwmark); - return ip_tunnel_newlink(dev, tb, &p, fwmark); + return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p, + fwmark); } static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 21ae7594a852..b81c8131e23f 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -563,7 +563,7 @@ static void reg_vif_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->netdev_ops = ®_vif_netdev_ops; dev->needs_free_netdev = true; - dev->netns_local = true; + dev->netns_immutable = true; } static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 03b6eee407a2..28d77d454d44 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -330,9 +330,6 @@ next_entry: list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { if (e < s_e) goto next_entry2; - if (filter->dev && - !mr_mfc_uses_dev(mrt, mfc, filter->dev)) - goto next_entry2; err = fill(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 619ddc087957..c14baa6589c7 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -705,7 +705,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ip_options_data opt_copy; int free = 0; __be32 saddr, daddr, faddr; - u8 tos, scope; + u8 scope; int err; pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num); @@ -768,7 +768,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } faddr = ipc.opt->opt.faddr; } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); if (ipv4_is_multicast(daddr)) { @@ -779,7 +778,8 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } else if (!ipc.oif) ipc.oif = READ_ONCE(inet->uc_index); - flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, saddr, 0, 0, sk->sk_uid); @@ -966,10 +966,9 @@ EXPORT_SYMBOL_GPL(ping_queue_rcv_skb); enum skb_drop_reason ping_rcv(struct sk_buff *skb) { - enum skb_drop_reason reason = SKB_DROP_REASON_NO_SOCKET; - struct sock *sk; struct net *net = dev_net(skb->dev); struct icmphdr *icmph = icmp_hdr(skb); + struct sock *sk; /* We assume the packet has already been checked by icmp_rcv */ @@ -980,20 +979,11 @@ enum skb_drop_reason ping_rcv(struct sk_buff *skb) skb_push(skb, skb->data - (u8 *)icmph); sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id)); - if (sk) { - struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); - - pr_debug("rcv on socket %p\n", sk); - if (skb2) - reason = __ping_queue_rcv_skb(sk, skb2); - else - reason = SKB_DROP_REASON_NOMEM; - } - - if (reason) - pr_debug("no socket, dropping\n"); + if (sk) + return __ping_queue_rcv_skb(sk, skb); - return reason; + kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET); + return SKB_DROP_REASON_NO_SOCKET; } EXPORT_SYMBOL_GPL(ping_rcv); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index affd21a0f572..10cbeb76c274 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -189,6 +189,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), + SNMP_MIB_ITEM("TSEcrRejected", LINUX_MIB_TSECRREJECTED), SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK), SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED), diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 4304a68d1db0..6aace4d55733 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -486,7 +486,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct ipcm_cookie ipc; struct rtable *rt = NULL; struct flowi4 fl4; - u8 tos, scope; + u8 scope; int free = 0; __be32 daddr; __be32 saddr; @@ -581,7 +581,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) daddr = ipc.opt->opt.faddr; } } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); uc_index = READ_ONCE(inet->uc_index); @@ -606,7 +605,8 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) } } - flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 577b88a43293..753704f75b2c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -390,7 +390,13 @@ static inline int ip_rt_proc_init(void) static inline bool rt_is_expired(const struct rtable *rth) { - return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev)); + bool res; + + rcu_read_lock(); + res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev)); + rcu_read_unlock(); + + return res; } void rt_cache_flush(struct net *net) @@ -1002,9 +1008,9 @@ out: kfree_skb_reason(skb, reason); static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; - struct net *net = dev_net(dst->dev); struct fib_result res; bool lock = false; + struct net *net; u32 old_mtu; if (ip_mtu_locked(dst)) @@ -1014,6 +1020,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (old_mtu < mtu) return; + rcu_read_lock(); + net = dev_net_rcu(dst->dev); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); @@ -1021,9 +1029,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) if (rt->rt_pmtu == mtu && !lock && time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2)) - return; + goto out; - rcu_read_lock(); if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; @@ -1037,14 +1044,14 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } - rcu_read_unlock(); - return; + goto out; } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } +out: rcu_read_unlock(); } @@ -1307,10 +1314,15 @@ static void set_class_tag(struct rtable *rt, u32 tag) static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { - struct net *net = dev_net(dst->dev); unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); - unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, - net->ipv4.ip_rt_min_advmss); + unsigned int advmss; + struct net *net; + + rcu_read_lock(); + net = dev_net_rcu(dst->dev); + advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, + net->ipv4.ip_rt_min_advmss); + rcu_read_unlock(); return min(advmss, IPV4_MAX_PMTU - header_size); } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 1948d15f1f28..5459a78b9809 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -222,7 +222,7 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, return NULL; } -EXPORT_SYMBOL(tcp_get_cookie_sock); +EXPORT_IPV6_MOD(tcp_get_cookie_sock); /* * when syncookies are in effect and tcp timestamps are enabled we stored @@ -259,7 +259,7 @@ bool cookie_timestamp_decode(const struct net *net, return READ_ONCE(net->ipv4.sysctl_tcp_window_scaling) != 0; } -EXPORT_SYMBOL(cookie_timestamp_decode); +EXPORT_IPV6_MOD(cookie_timestamp_decode); static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb, struct request_sock *req) @@ -279,6 +279,7 @@ static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb, ireq->smc_ok = 0; treq->snt_synack = 0; + treq->snt_tsval_first = 0; treq->tfo_listener = false; treq->txhash = net_tx_rndhash(); treq->rcv_isn = ntohl(th->seq) - 1; @@ -310,7 +311,7 @@ struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb) return req; } -EXPORT_SYMBOL_GPL(cookie_bpf_check); +EXPORT_IPV6_MOD_GPL(cookie_bpf_check); #endif struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, @@ -351,7 +352,7 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, return req; } -EXPORT_SYMBOL_GPL(cookie_tcp_reqsk_alloc); +EXPORT_IPV6_MOD_GPL(cookie_tcp_reqsk_alloc); static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 42cb5dc9cb24..3a43010d726f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -28,6 +28,7 @@ static int tcp_adv_win_scale_max = 31; static int tcp_app_win_max = 31; static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS; static int tcp_min_snd_mss_max = 65535; +static int tcp_rto_max_max = TCP_RTO_MAX_SEC * MSEC_PER_SEC; static int ip_privileged_port_min; static int ip_privileged_port_max = 65535; static int ip_ttl_min = 1; @@ -1583,6 +1584,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, + { + .procname = "tcp_rto_max_ms", + .data = &init_net.ipv4.sysctl_tcp_rto_max_ms, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE_THOUSAND, + .extra2 = &tcp_rto_max_max, + }, }; static __net_init int ipv4_sysctl_init_net(struct net *net) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0d704bda6c41..eb5a60c7a9cc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -300,10 +300,10 @@ DEFINE_PER_CPU(u32, tcp_tw_isn); EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn); long sysctl_tcp_mem[3] __read_mostly; -EXPORT_SYMBOL(sysctl_tcp_mem); +EXPORT_IPV6_MOD(sysctl_tcp_mem); atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. */ -EXPORT_SYMBOL(tcp_memory_allocated); +EXPORT_IPV6_MOD(tcp_memory_allocated); DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc); @@ -316,7 +316,7 @@ EXPORT_SYMBOL(tcp_have_smc); * Current number of TCP sockets. */ struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp; -EXPORT_SYMBOL(tcp_sockets_allocated); +EXPORT_IPV6_MOD(tcp_sockets_allocated); /* * TCP splice context @@ -349,7 +349,7 @@ void tcp_enter_memory_pressure(struct sock *sk) if (!cmpxchg(&tcp_memory_pressure, 0, val)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); } -EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure); +EXPORT_IPV6_MOD_GPL(tcp_enter_memory_pressure); void tcp_leave_memory_pressure(struct sock *sk) { @@ -362,7 +362,7 @@ void tcp_leave_memory_pressure(struct sock *sk) NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO, jiffies_to_msecs(jiffies - val)); } -EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure); +EXPORT_IPV6_MOD_GPL(tcp_leave_memory_pressure); /* Convert seconds to retransmits based on initial and max timeout */ static u8 secs_to_retrans(int seconds, int timeout, int rto_max) @@ -423,7 +423,7 @@ void tcp_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int rto_min_us; + int rto_min_us, rto_max_ms; tp->out_of_order_queue = RB_ROOT; sk->tcp_rtx_queue = RB_ROOT; @@ -432,6 +432,10 @@ void tcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; + + rto_max_ms = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_max_ms); + icsk->icsk_rto_max = msecs_to_jiffies(rto_max_ms); + rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us); icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us); icsk->icsk_delack_max = TCP_DELACK_MAX; @@ -475,7 +479,7 @@ void tcp_init_sock(struct sock *sk) sk_sockets_allocated_inc(sk); xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1); } -EXPORT_SYMBOL(tcp_init_sock); +EXPORT_IPV6_MOD(tcp_init_sock); static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc) { @@ -488,10 +492,14 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc) sock_tx_timestamp(sk, sockc, &shinfo->tx_flags); if (tsflags & SOF_TIMESTAMPING_TX_ACK) - tcb->txstamp_ack = 1; + tcb->txstamp_ack |= TSTAMP_ACK_SK; if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; } + + if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) && + SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING) && skb) + bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB); } static bool tcp_stream_is_readable(struct sock *sk, int target) @@ -660,7 +668,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg) *karg = answ; return 0; } -EXPORT_SYMBOL(tcp_ioctl); +EXPORT_IPV6_MOD(tcp_ioctl); void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { @@ -876,7 +884,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, return ret; } -EXPORT_SYMBOL(tcp_splice_read); +EXPORT_IPV6_MOD(tcp_splice_read); struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule) @@ -1123,7 +1131,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) /* 'common' sending to sendq */ } - sockcm_init(&sockc, sk); + sockc = (struct sockcm_cookie) { .tsflags = READ_ONCE(sk->sk_tsflags)}; if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) { @@ -1376,7 +1384,7 @@ void tcp_splice_eof(struct socket *sock) tcp_push(sk, 0, mss_now, tp->nonagle, size_goal); release_sock(sk); } -EXPORT_SYMBOL_GPL(tcp_splice_eof); +EXPORT_IPV6_MOD_GPL(tcp_splice_eof); /* * Handle reading urgent data. BSD has very simple semantics for @@ -1565,12 +1573,13 @@ EXPORT_SYMBOL(tcp_recv_skb); * or for 'peeking' the socket using this routine * (although both would be easy to implement). */ -int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +static int __tcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack, + u32 *copied_seq) { struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); - u32 seq = tp->copied_seq; + u32 seq = *copied_seq; u32 offset; int copied = 0; @@ -1624,9 +1633,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, tcp_eat_recv_skb(sk, skb); if (!desc->count) break; - WRITE_ONCE(tp->copied_seq, seq); + WRITE_ONCE(*copied_seq, seq); } - WRITE_ONCE(tp->copied_seq, seq); + WRITE_ONCE(*copied_seq, seq); + + if (noack) + goto out; tcp_rcv_space_adjust(sk); @@ -1635,10 +1647,25 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, copied); } +out: return copied; } + +int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + return __tcp_read_sock(sk, desc, recv_actor, false, + &tcp_sk(sk)->copied_seq); +} EXPORT_SYMBOL(tcp_read_sock); +int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack, + u32 *copied_seq) +{ + return __tcp_read_sock(sk, desc, recv_actor, noack, copied_seq); +} + int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct sk_buff *skb; @@ -1667,7 +1694,7 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) } return copied; } -EXPORT_SYMBOL(tcp_read_skb); +EXPORT_IPV6_MOD(tcp_read_skb); void tcp_read_done(struct sock *sk, size_t len) { @@ -1712,7 +1739,7 @@ int tcp_peek_len(struct socket *sock) { return tcp_inq(sock->sk); } -EXPORT_SYMBOL(tcp_peek_len); +EXPORT_IPV6_MOD(tcp_peek_len); /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ int tcp_set_rcvlowat(struct sock *sk, int val) @@ -1739,7 +1766,7 @@ int tcp_set_rcvlowat(struct sock *sk, int val) } return 0; } -EXPORT_SYMBOL(tcp_set_rcvlowat); +EXPORT_IPV6_MOD(tcp_set_rcvlowat); void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping_internal *tss) @@ -1772,7 +1799,7 @@ int tcp_mmap(struct file *file, struct socket *sock, vma->vm_ops = &tcp_vm_ops; return 0; } -EXPORT_SYMBOL(tcp_mmap); +EXPORT_IPV6_MOD(tcp_mmap); static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb, u32 *offset_frag) @@ -2438,14 +2465,12 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, */ memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg)); dmabuf_cmsg.frag_size = copy; - err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_LINEAR, - sizeof(dmabuf_cmsg), &dmabuf_cmsg); - if (err || msg->msg_flags & MSG_CTRUNC) { - msg->msg_flags &= ~MSG_CTRUNC; - if (!err) - err = -ETOOSMALL; + err = put_cmsg_notrunc(msg, SOL_SOCKET, + SO_DEVMEM_LINEAR, + sizeof(dmabuf_cmsg), + &dmabuf_cmsg); + if (err) goto out; - } sent += copy; @@ -2476,6 +2501,11 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, } niov = skb_frag_net_iov(frag); + if (!net_is_devmem_iov(niov)) { + err = -ENODEV; + goto out; + } + end = start + skb_frag_size(frag); copy = end - offset; @@ -2494,21 +2524,17 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, /* Will perform the exchange later */ dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx]; - dmabuf_cmsg.dmabuf_id = net_iov_binding_id(niov); + dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov); offset += copy; remaining_len -= copy; - err = put_cmsg(msg, SOL_SOCKET, - SO_DEVMEM_DMABUF, - sizeof(dmabuf_cmsg), - &dmabuf_cmsg); - if (err || msg->msg_flags & MSG_CTRUNC) { - msg->msg_flags &= ~MSG_CTRUNC; - if (!err) - err = -ETOOSMALL; + err = put_cmsg_notrunc(msg, SOL_SOCKET, + SO_DEVMEM_DMABUF, + sizeof(dmabuf_cmsg), + &dmabuf_cmsg); + if (err) goto out; - } atomic_long_inc(&niov->pp_ref_count); tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag); @@ -2864,7 +2890,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, } return ret; } -EXPORT_SYMBOL(tcp_recvmsg); +EXPORT_IPV6_MOD(tcp_recvmsg); void tcp_set_state(struct sock *sk, int state) { @@ -2994,7 +3020,7 @@ void tcp_shutdown(struct sock *sk, int how) tcp_send_fin(sk); } } -EXPORT_SYMBOL(tcp_shutdown); +EXPORT_IPV6_MOD(tcp_shutdown); int tcp_orphan_count_sum(void) { @@ -3174,7 +3200,7 @@ adjudge_to_death: const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, + tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); @@ -3493,7 +3519,7 @@ static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf, } DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); -EXPORT_SYMBOL(tcp_tx_delay_enabled); +EXPORT_IPV6_MOD(tcp_tx_delay_enabled); static void tcp_enable_tx_delay(void) { @@ -3627,7 +3653,7 @@ int tcp_sock_set_keepidle_locked(struct sock *sk, int val) elapsed = tp->keepalive_time - elapsed; else elapsed = 0; - inet_csk_reset_keepalive_timer(sk, elapsed); + tcp_reset_keepalive_timer(sk, elapsed); } return 0; @@ -3667,33 +3693,33 @@ EXPORT_SYMBOL(tcp_sock_set_keepcnt); int tcp_set_window_clamp(struct sock *sk, int val) { + u32 old_window_clamp, new_window_clamp; struct tcp_sock *tp = tcp_sk(sk); if (!val) { if (sk->sk_state != TCP_CLOSE) return -EINVAL; WRITE_ONCE(tp->window_clamp, 0); - } else { - u32 new_rcv_ssthresh, old_window_clamp = tp->window_clamp; - u32 new_window_clamp = val < SOCK_MIN_RCVBUF / 2 ? - SOCK_MIN_RCVBUF / 2 : val; + return 0; + } - if (new_window_clamp == old_window_clamp) - return 0; + old_window_clamp = tp->window_clamp; + new_window_clamp = max_t(int, SOCK_MIN_RCVBUF / 2, val); - WRITE_ONCE(tp->window_clamp, new_window_clamp); - if (new_window_clamp < old_window_clamp) { - /* need to apply the reserved mem provisioning only - * when shrinking the window clamp - */ - __tcp_adjust_rcv_ssthresh(sk, tp->window_clamp); + if (new_window_clamp == old_window_clamp) + return 0; - } else { - new_rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp); - tp->rcv_ssthresh = max(new_rcv_ssthresh, - tp->rcv_ssthresh); - } - } + WRITE_ONCE(tp->window_clamp, new_window_clamp); + + /* Need to apply the reserved mem provisioning only + * when shrinking the window clamp. + */ + if (new_window_clamp < old_window_clamp) + __tcp_adjust_rcv_ssthresh(sk, new_window_clamp); + else + tp->rcv_ssthresh = clamp(new_window_clamp, + tp->rcv_ssthresh, + tp->rcv_wnd); return 0; } @@ -3802,6 +3828,11 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ)); return 0; + case TCP_RTO_MAX_MS: + if (val < MSEC_PER_SEC || val > TCP_RTO_MAX_SEC * MSEC_PER_SEC) + return -EINVAL; + WRITE_ONCE(inet_csk(sk)->icsk_rto_max, msecs_to_jiffies(val)); + return 0; } sockopt_lock_sock(sk); @@ -4031,7 +4062,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, optval, optlen); return do_tcp_setsockopt(sk, level, optname, optval, optlen); } -EXPORT_SYMBOL(tcp_setsockopt); +EXPORT_IPV6_MOD(tcp_setsockopt); static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, struct tcp_info *info) @@ -4638,6 +4669,9 @@ zerocopy_rcv_out: case TCP_IS_MPTCP: val = 0; break; + case TCP_RTO_MAX_MS: + val = jiffies_to_msecs(tcp_rto_max(sk)); + break; default: return -ENOPROTOOPT; } @@ -4659,7 +4693,7 @@ bool tcp_bpf_bypass_getsockopt(int level, int optname) return false; } -EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt); +EXPORT_IPV6_MOD(tcp_bpf_bypass_getsockopt); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) @@ -4673,11 +4707,11 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); } -EXPORT_SYMBOL(tcp_getsockopt); +EXPORT_IPV6_MOD(tcp_getsockopt); #ifdef CONFIG_TCP_MD5SIG int tcp_md5_sigpool_id = -1; -EXPORT_SYMBOL_GPL(tcp_md5_sigpool_id); +EXPORT_IPV6_MOD_GPL(tcp_md5_sigpool_id); int tcp_md5_alloc_sigpool(void) { @@ -4723,7 +4757,7 @@ int tcp_md5_hash_key(struct tcp_sigpool *hp, */ return data_race(crypto_ahash_update(hp->req)); } -EXPORT_SYMBOL(tcp_md5_hash_key); +EXPORT_IPV6_MOD(tcp_md5_hash_key); /* Called with rcu_read_lock() */ static enum skb_drop_reason @@ -4843,7 +4877,7 @@ tcp_inbound_hash(struct sock *sk, const struct request_sock *req, return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family, l3index, md5_location); } -EXPORT_SYMBOL_GPL(tcp_inbound_hash); +EXPORT_IPV6_MOD_GPL(tcp_inbound_hash); void tcp_done(struct sock *sk) { diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 47f65b1b70ca..ba581785adb4 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -646,6 +646,42 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP; } +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) +int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + struct sock *sk = strp->sk; + struct sk_psock *psock; + struct tcp_sock *tp; + int copied = 0; + + tp = tcp_sk(sk); + rcu_read_lock(); + psock = sk_psock(sk); + if (WARN_ON_ONCE(!psock)) { + desc->error = -EINVAL; + goto out; + } + + psock->ingress_bytes = 0; + copied = tcp_read_sock_noack(sk, desc, recv_actor, true, + &psock->copied_seq); + if (copied < 0) + goto out; + /* recv_actor may redirect skb to another socket (SK_REDIRECT) or + * just put skb into ingress queue of current socket (SK_PASS). + * For SK_REDIRECT, we need to ack the frame immediately but for + * SK_PASS, we want to delay the ack until tcp_bpf_recvmsg_parser(). + */ + tp->copied_seq = psock->copied_seq - psock->ingress_bytes; + tcp_rcv_space_adjust(sk); + __tcp_cleanup_rbuf(sk, copied - psock->ingress_bytes); +out: + rcu_read_unlock(); + return copied; +} +#endif /* CONFIG_BPF_STREAM_PARSER */ + int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 5dbed91c6178..76c23675ae50 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -392,6 +392,10 @@ static void hystart_update(struct sock *sk, u32 delay) if (after(tp->snd_una, ca->end_seq)) bictcp_hystart_reset(sk); + /* hystart triggers when cwnd is larger than some threshold */ + if (tcp_snd_cwnd(tp) < hystart_low_window) + return; + if (hystart_detect & HYSTART_ACK_TRAIN) { u32 now = bictcp_clock_us(sk); @@ -467,9 +471,7 @@ __bpf_kfunc static void cubictcp_acked(struct sock *sk, const struct ack_sample if (ca->delay_min == 0 || ca->delay_min > delay) ca->delay_min = delay; - /* hystart triggers when cwnd is larger than some threshold */ - if (!ca->found && tcp_in_slow_start(tp) && hystart && - tcp_snd_cwnd(tp) >= hystart_low_window) + if (!ca->found && tcp_in_slow_start(tp) && hystart) hystart_update(sk, delay); } diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 0f523cbfe329..1a6b1bc54245 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -178,7 +178,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) if (!skb) return; - skb_dst_drop(skb); + tcp_cleanup_skb(skb); /* segs_in has been initialized to 1 in tcp_create_openreq_child(). * Hence, reset segs_in to 0 before calling tcp_segs_in() * to avoid double counting. Also, tcp_segs_in() expects @@ -195,7 +195,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN; tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - __skb_queue_tail(&sk->sk_receive_queue, skb); + tcp_add_receive_queue(sk, skb); tp->syn_data_acked = 1; /* u64_stats_update_begin(&tp->syncp) not needed here, @@ -274,8 +274,8 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, * because it's been added to the accept queue directly. */ req->timeout = tcp_timeout_init(child); - inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, - req->timeout, TCP_RTO_MAX); + tcp_reset_xmit_timer(child, ICSK_TIME_RETRANS, + req->timeout, false); refcount_set(&req->rsk_refcnt, 2); @@ -468,7 +468,7 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err) } return false; } -EXPORT_SYMBOL(tcp_fastopen_defer_connect); +EXPORT_IPV6_MOD(tcp_fastopen_defer_connect); /* * The following code block is to deal with middle box issues with TFO: diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eb82e01da911..4e2212348088 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -169,6 +169,7 @@ static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb) memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB; sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb)); @@ -185,6 +186,7 @@ static void bpf_skops_established(struct sock *sk, int bpf_op, memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); sock_ops.op = bpf_op; sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ if (skb) @@ -243,9 +245,15 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) do_div(val, skb->truesize); tcp_sk(sk)->scaling_ratio = val ? val : 1; - if (old_ratio != tcp_sk(sk)->scaling_ratio) - WRITE_ONCE(tcp_sk(sk)->window_clamp, - tcp_win_from_space(sk, sk->sk_rcvbuf)); + if (old_ratio != tcp_sk(sk)->scaling_ratio) { + struct tcp_sock *tp = tcp_sk(sk); + + val = tcp_win_from_space(sk, sk->sk_rcvbuf); + tcp_set_window_clamp(sk, val); + + if (tp->window_clamp < tp->rcvq_space.space) + tp->rcvq_space.space = tp->window_clamp; + } } icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); @@ -630,7 +638,7 @@ void tcp_initialize_rcv_mss(struct sock *sk) inet_csk(sk)->icsk_ack.rcv_mss = hint; } -EXPORT_SYMBOL(tcp_initialize_rcv_mss); +EXPORT_IPV6_MOD(tcp_initialize_rcv_mss); /* Receiver "autotuning" code. * @@ -2252,8 +2260,7 @@ static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag) unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), msecs_to_jiffies(10)); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - delay, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, false); *ack_flag &= ~FLAG_SET_XMIT_TIMER; return true; } @@ -2710,6 +2717,8 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd)) return; + trace_tcp_cwnd_reduction_tp(sk, newly_acked_sacked, newly_lost, flag); + tp->prr_delivered += newly_acked_sacked; if (delta < 0) { u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + @@ -2892,7 +2901,7 @@ void tcp_simple_retransmit(struct sock *sk) */ tcp_non_congestion_loss_retransmit(sk); } -EXPORT_SYMBOL(tcp_simple_retransmit); +EXPORT_IPV6_MOD(tcp_simple_retransmit); void tcp_enter_recovery(struct sock *sk, bool ece_ack) { @@ -3282,8 +3291,7 @@ void tcp_rearm_rto(struct sock *sk) */ rto = usecs_to_jiffies(max_t(int, delta_us, 1)); } - tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, - TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, true); } } @@ -3560,10 +3568,10 @@ static void tcp_ack_probe(struct sock *sk) * This function is not for random using! */ } else { - unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); + unsigned long when = tcp_probe0_when(sk, tcp_rto_max(sk)); when = tcp_clamp_probe0_to_user_timeout(sk, when); - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, true); } } @@ -4174,7 +4182,6 @@ u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss) } return mss; } -EXPORT_SYMBOL_GPL(tcp_parse_mss_option); /* Look for tcp options. Normally only called on SYN and SYNACK packets. * But, this can also be called on packets in the established flow when @@ -4524,7 +4531,7 @@ void tcp_done_with_error(struct sock *sk, int err) if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } -EXPORT_SYMBOL(tcp_done_with_error); +EXPORT_IPV6_MOD(tcp_done_with_error); /* When we get a reset we do this. */ void tcp_reset(struct sock *sk, struct sk_buff *skb) @@ -4970,7 +4977,7 @@ static void tcp_ofo_queue(struct sock *sk) tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; if (!eaten) - __skb_queue_tail(&sk->sk_receive_queue, skb); + tcp_add_receive_queue(sk, skb); else kfree_skb_partial(skb, fragstolen); @@ -5162,7 +5169,7 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, skb, fragstolen)) ? 1 : 0; tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq); if (!eaten) { - __skb_queue_tail(&sk->sk_receive_queue, skb); + tcp_add_receive_queue(sk, skb); skb_set_owner_r(skb, sk); } return eaten; @@ -5245,7 +5252,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); return; } - skb_dst_drop(skb); + tcp_cleanup_skb(skb); __skb_pull(skb, tcp_hdr(skb)->doff * 4); reason = SKB_DROP_REASON_NOT_SPECIFIED; @@ -6226,7 +6233,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ - skb_dst_drop(skb); + tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen); @@ -6294,7 +6301,7 @@ csum_error: discard: tcp_drop_reason(sk, skb, reason); } -EXPORT_SYMBOL(tcp_rcv_established); +EXPORT_IPV6_MOD(tcp_rcv_established); void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) { @@ -6347,7 +6354,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) tp->lsndtime = tcp_jiffies32; if (sock_flag(sk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); + tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); if (!tp->rx_opt.snd_wscale) __tcp_fast_path_on(tp, tp->snd_wnd); @@ -6470,9 +6477,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { /* Previous FIN/ACK or RST/ACK might be ignored. */ if (icsk->icsk_retransmits == 0) - inet_csk_reset_xmit_timer(sk, - ICSK_TIME_RETRANS, - TCP_TIMEOUT_MIN, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + TCP_TIMEOUT_MIN, false); SKB_DR_SET(reason, TCP_INVALID_ACK_SEQUENCE); goto reset_and_undo; } @@ -6587,8 +6593,8 @@ consume: */ inet_csk_schedule_ack(sk); tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - TCP_DELACK_MAX, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, false); goto consume; } tcp_send_ack(sk); @@ -6806,10 +6812,9 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); - if (!tcp_check_req(sk, skb, req, true, &req_stolen)) { - SKB_DR_SET(reason, TCP_FASTOPEN); + SKB_DR_SET(reason, TCP_FASTOPEN); + if (!tcp_check_req(sk, skb, req, true, &req_stolen, &reason)) goto discard; - } } if (!th->ack && !th->rst && !th->syn) { @@ -6922,7 +6927,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else if (th->fin || sock_owned_by_user(sk)) { /* Bad case. We could lose such FIN otherwise. * It is not a big problem, but it looks confusing @@ -6930,7 +6935,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) * if it spins in bh_lock_sock(), but it is really * marginal case. */ - inet_csk_reset_keepalive_timer(sk, tmo); + tcp_reset_keepalive_timer(sk, tmo); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto consume; @@ -7008,7 +7013,7 @@ consume: __kfree_skb(skb); return 0; } -EXPORT_SYMBOL(tcp_rcv_state_process); +EXPORT_IPV6_MOD(tcp_rcv_state_process); static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) { @@ -7075,6 +7080,7 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq; tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = 0; + tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; @@ -7190,7 +7196,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, return mss; } -EXPORT_SYMBOL_GPL(tcp_get_syncookie_mss); +EXPORT_IPV6_MOD_GPL(tcp_get_syncookie_mss); int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, @@ -7371,4 +7377,4 @@ drop: tcp_listendrop(sk); return 0; } -EXPORT_SYMBOL(tcp_conn_request); +EXPORT_IPV6_MOD(tcp_conn_request); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cc2b5194a18d..d9405b012dff 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -92,7 +92,6 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, #endif struct inet_hashinfo tcp_hashinfo; -EXPORT_SYMBOL(tcp_hashinfo); static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), @@ -199,7 +198,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) return 0; } -EXPORT_SYMBOL_GPL(tcp_twsk_unique); +EXPORT_IPV6_MOD_GPL(tcp_twsk_unique); static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -359,7 +358,7 @@ failure: inet->inet_dport = 0; return err; } -EXPORT_SYMBOL(tcp_v4_connect); +EXPORT_IPV6_MOD(tcp_v4_connect); /* * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191. @@ -400,7 +399,7 @@ void tcp_v4_mtu_reduced(struct sock *sk) tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ } -EXPORT_SYMBOL(tcp_v4_mtu_reduced); +EXPORT_IPV6_MOD(tcp_v4_mtu_reduced); static void do_redirect(struct sk_buff *skb, struct sock *sk) { @@ -434,7 +433,7 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) } reqsk_put(req); } -EXPORT_SYMBOL(tcp_req_err); +EXPORT_IPV6_MOD(tcp_req_err); /* TCP-LD (RFC 6069) logic */ void tcp_ld_RTO_revert(struct sock *sk, u32 seq) @@ -458,15 +457,14 @@ void tcp_ld_RTO_revert(struct sock *sk, u32 seq) icsk->icsk_backoff--; icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; - icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk)); tcp_mstamp_refresh(tp); delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); if (remaining > 0) { - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - remaining, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false); } else { /* RTO revert clocked out retransmission. * Will retransmit now. @@ -474,7 +472,7 @@ void tcp_ld_RTO_revert(struct sock *sk, u32 seq) tcp_retransmit_timer(sk); } } -EXPORT_SYMBOL(tcp_ld_RTO_revert); +EXPORT_IPV6_MOD(tcp_ld_RTO_revert); /* * This routine is called by the ICMP module when it gets some @@ -496,14 +494,14 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); - struct tcp_sock *tp; + struct net *net = dev_net_rcu(skb->dev); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; - struct sock *sk; struct request_sock *fastopen; + struct tcp_sock *tp; u32 seq, snd_una; + struct sock *sk; int err; - struct net *net = dev_net(skb->dev); sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, iph->daddr, th->dest, iph->saddr, @@ -676,7 +674,7 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb) __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr); } -EXPORT_SYMBOL(tcp_v4_send_check); +EXPORT_IPV6_MOD(tcp_v4_send_check); #define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32)) @@ -788,7 +786,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); - net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); + net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) @@ -1157,7 +1155,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->rcv_nxt, tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), - READ_ONCE(req->ts_recent), + req->ts_recent, 0, &key, inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, ip_hdr(skb)->tos, @@ -1231,7 +1229,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) */ DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); -EXPORT_SYMBOL(tcp_md5_needed); +EXPORT_IPV6_MOD(tcp_md5_needed); static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) { @@ -1290,7 +1288,7 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, } return best_match; } -EXPORT_SYMBOL(__tcp_md5_do_lookup); +EXPORT_IPV6_MOD(__tcp_md5_do_lookup); static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, const union tcp_md5_addr *addr, @@ -1337,7 +1335,7 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); } -EXPORT_SYMBOL(tcp_v4_md5_lookup); +EXPORT_IPV6_MOD(tcp_v4_md5_lookup); static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) { @@ -1433,7 +1431,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, newkey, newkeylen, GFP_KERNEL); } -EXPORT_SYMBOL(tcp_md5_do_add); +EXPORT_IPV6_MOD(tcp_md5_do_add); int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, @@ -1465,7 +1463,7 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, key->flags, key->key, key->keylen, sk_gfp_mask(sk, GFP_ATOMIC)); } -EXPORT_SYMBOL(tcp_md5_key_copy); +EXPORT_IPV6_MOD(tcp_md5_key_copy); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags) @@ -1480,7 +1478,7 @@ int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, kfree_rcu(key, rcu); return 0; } -EXPORT_SYMBOL(tcp_md5_do_del); +EXPORT_IPV6_MOD(tcp_md5_do_del); void tcp_clear_md5_list(struct sock *sk) { @@ -1659,7 +1657,7 @@ clear_hash_nostart: memset(md5_hash, 0, 16); return 1; } -EXPORT_SYMBOL(tcp_v4_md5_hash_skb); +EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb); #endif @@ -1732,7 +1730,7 @@ drop: tcp_listendrop(sk); return 0; } -EXPORT_SYMBOL(tcp_v4_conn_request); +EXPORT_IPV6_MOD(tcp_v4_conn_request); /* @@ -1770,10 +1768,6 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, newtp = tcp_sk(newsk); newinet = inet_sk(newsk); ireq = inet_rsk(req); - sk_daddr_set(newsk, ireq->ir_rmt_addr); - sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); - newsk->sk_bound_dev_if = ireq->ir_iif; - newinet->inet_saddr = ireq->ir_loc_addr; inet_opt = rcu_dereference(ireq->ireq_opt); RCU_INIT_POINTER(newinet->inet_opt, inet_opt); newinet->mc_index = inet_iif(skb); @@ -1856,7 +1850,7 @@ put_and_exit: tcp_done(newsk); goto exit; } -EXPORT_SYMBOL(tcp_v4_syn_recv_sock); +EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock); static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb) { @@ -1967,7 +1961,7 @@ EXPORT_SYMBOL(tcp_v4_do_rcv); int tcp_v4_early_demux(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; @@ -2027,7 +2021,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, */ skb_condense(skb); - skb_dst_drop(skb); + tcp_cleanup_skb(skb); if (unlikely(tcp_checksum_complete(skb))) { bh_unlock_sock(sk); @@ -2135,7 +2129,7 @@ no_coalesce: } return false; } -EXPORT_SYMBOL(tcp_add_backlog); +EXPORT_IPV6_MOD(tcp_add_backlog); int tcp_filter(struct sock *sk, struct sk_buff *skb) { @@ -2143,7 +2137,7 @@ int tcp_filter(struct sock *sk, struct sk_buff *skb) return sk_filter_trim_cap(sk, skb, th->doff * 4); } -EXPORT_SYMBOL(tcp_filter); +EXPORT_IPV6_MOD(tcp_filter); static void tcp_v4_restore_cb(struct sk_buff *skb) { @@ -2178,7 +2172,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph, int tcp_v4_rcv(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; int sdif = inet_sdif(skb); int dif = inet_iif(skb); @@ -2271,7 +2265,8 @@ lookup: th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); - nsk = tcp_check_req(sk, skb, req, false, &req_stolen); + nsk = tcp_check_req(sk, skb, req, false, &req_stolen, + &drop_reason); } else { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } @@ -2452,7 +2447,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) sk->sk_rx_dst_ifindex = skb->skb_iif; } } -EXPORT_SYMBOL(inet_sk_rx_dst_set); +EXPORT_IPV6_MOD(inet_sk_rx_dst_set); const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, @@ -2468,7 +2463,7 @@ const struct inet_connection_sock_af_ops ipv4_specific = { .sockaddr_len = sizeof(struct sockaddr_in), .mtu_reduced = tcp_v4_mtu_reduced, }; -EXPORT_SYMBOL(ipv4_specific); +EXPORT_IPV6_MOD(ipv4_specific); #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { @@ -2578,7 +2573,7 @@ void tcp_v4_destroy_sock(struct sock *sk) sk_sockets_allocated_dec(sk); } -EXPORT_SYMBOL(tcp_v4_destroy_sock); +EXPORT_IPV6_MOD(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ @@ -2814,7 +2809,7 @@ out: st->last_pos = *pos; return rc; } -EXPORT_SYMBOL(tcp_seq_start); +EXPORT_IPV6_MOD(tcp_seq_start); void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { @@ -2845,7 +2840,7 @@ out: st->last_pos = *pos; return rc; } -EXPORT_SYMBOL(tcp_seq_next); +EXPORT_IPV6_MOD(tcp_seq_next); void tcp_seq_stop(struct seq_file *seq, void *v) { @@ -2863,7 +2858,7 @@ void tcp_seq_stop(struct seq_file *seq, void *v) break; } } -EXPORT_SYMBOL(tcp_seq_stop); +EXPORT_IPV6_MOD(tcp_seq_stop); static void get_openreq4(const struct request_sock *req, struct seq_file *f, int i) @@ -3533,6 +3528,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_pingpong_thresh = 1; net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN); + net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC; return 0; } diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 95669935494e..4251670e328c 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -170,7 +170,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, bool reclaim = false; spin_lock_bh(&tcp_metrics_lock); - net = dev_net(dst->dev); + net = dev_net_rcu(dst->dev); /* While waiting for the spin-lock the cache might have been populated * with this entry and so we have to check again. @@ -273,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, return NULL; } - net = dev_net(dst->dev); + net = dev_net_rcu(dst->dev); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); @@ -318,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, else return NULL; - net = dev_net(dst->dev); + net = dev_net_rcu(dst->dev); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b089b08e9617..3cb8f281186b 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -264,7 +264,7 @@ kill: inet_twsk_put(tw); return TCP_TW_SUCCESS; } -EXPORT_SYMBOL(tcp_timewait_state_process); +EXPORT_IPV6_MOD(tcp_timewait_state_process); static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw) { @@ -398,7 +398,7 @@ void tcp_twsk_destructor(struct sock *sk) #endif tcp_ao_destroy_sock(sk, true); } -EXPORT_SYMBOL_GPL(tcp_twsk_destructor); +EXPORT_IPV6_MOD_GPL(tcp_twsk_destructor); void tcp_twsk_purge(struct list_head *net_exit_list) { @@ -457,7 +457,6 @@ void tcp_openreq_init_rwin(struct request_sock *req, rcv_wnd); ireq->rcv_wscale = rcv_wscale; } -EXPORT_SYMBOL(tcp_openreq_init_rwin); static void tcp_ecn_openreq_child(struct tcp_sock *tp, const struct request_sock *req) @@ -492,7 +491,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) tcp_set_ca_state(sk, TCP_CA_Open); } -EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); +EXPORT_IPV6_MOD_GPL(tcp_ca_openreq_child); static void smc_check_reset_syn_req(const struct tcp_sock *oldtp, struct request_sock *req, @@ -566,8 +565,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); if (sock_flag(newsk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(newsk, - keepalive_time_when(newtp)); + tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; newtp->rx_opt.sack_ok = ireq->sack_ok; @@ -587,7 +585,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, if (newtp->rx_opt.tstamp_ok) { newtp->tcp_usec_ts = treq->req_usec_ts; - newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent); + newtp->rx_opt.ts_recent = req->ts_recent; newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { @@ -659,12 +657,14 @@ EXPORT_SYMBOL(tcp_create_openreq_child); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - bool fastopen, bool *req_stolen) + bool fastopen, bool *req_stolen, + enum skb_drop_reason *drop_reason) { struct tcp_options_received tmp_opt; struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); + bool tsecr_reject = false; bool paws_reject = false; bool own_req; @@ -673,9 +673,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { - tmp_opt.ts_recent = READ_ONCE(req->ts_recent); - if (tmp_opt.rcv_tsecr) + tmp_opt.ts_recent = req->ts_recent; + if (tmp_opt.rcv_tsecr) { + if (inet_rsk(req)->tstamp_ok && !fastopen) + tsecr_reject = !between(tmp_opt.rcv_tsecr, + tcp_rsk(req)->snt_tsval_first, + READ_ONCE(tcp_rsk(req)->snt_tsval_last)); tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; + } /* We do not store true stamp, but it is not required, * it can be estimated (approximately) * from another data. @@ -790,37 +795,34 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->snt_isn + 1)) return sk; - /* Also, it would be not so bad idea to check rcv_tsecr, which - * is essentially ACK extension and too early or too late values - * should cause reset in unsynchronized states. - */ - /* RFC793: "first check sequence number". */ - if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_nxt, - tcp_rsk(req)->rcv_nxt + - tcp_synack_window(req))) { + if (paws_reject || tsecr_reject || + !tcp_in_window(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + tcp_rsk(req)->rcv_nxt, + tcp_rsk(req)->rcv_nxt + + tcp_synack_window(req))) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST) && !tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); - if (paws_reject) + if (paws_reject) { + SKB_DR_SET(*drop_reason, TCP_RFC7323_PAWS); NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); + } else if (tsecr_reject) { + SKB_DR_SET(*drop_reason, TCP_RFC7323_TSECR); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TSECRREJECTED); + } else { + SKB_DR_SET(*drop_reason, TCP_OVERWINDOW); + } return NULL; } /* In sequence, PAWS is OK. */ - /* TODO: We probably should defer ts_recent change once - * we take ownership of @req. - */ - if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) - WRITE_ONCE(req->ts_recent, tmp_opt.rcv_tsval); - if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { /* Truncate SYN, it is out of window starting at tcp_rsk(req)->rcv_isn + 1. */ @@ -869,6 +871,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!child) goto listen_overflow; + if (own_req && tmp_opt.saw_tstamp && + !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) + tcp_sk(child)->rx_opt.ts_recent = tmp_opt.rcv_tsval; + if (own_req && rsk_drop_req(req)) { reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); @@ -881,6 +887,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: + SKB_DR_SET(*drop_reason, TCP_LISTEN_OVERFLOW); if (sk != req->rsk_listener) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); @@ -910,7 +917,7 @@ embryonic_reset: } return NULL; } -EXPORT_SYMBOL(tcp_check_req); +EXPORT_IPV6_MOD(tcp_check_req); /* * Queue segment on the new socket if the new socket is active, @@ -952,4 +959,4 @@ enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child, sock_put(child); return reason; } -EXPORT_SYMBOL(tcp_child_process); +EXPORT_IPV6_MOD(tcp_child_process); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 2308665b51c5..ecef16c58c07 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -425,7 +425,7 @@ static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, inet_get_iif_sdif(skb, &iif, &sdif); iph = skb_gro_network_header(skb); - net = dev_net(skb->dev); + net = dev_net_rcu(skb->dev); sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, iph->saddr, th->source, iph->daddr, ntohs(th->dest), diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 0e5b9a654254..24e56bf96747 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -250,7 +250,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, WRITE_ONCE(*__window_clamp, min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp)); } -EXPORT_SYMBOL(tcp_select_initial_window); +EXPORT_IPV6_MOD(tcp_select_initial_window); /* Chose a new window to advertise, update state in tcp_sock for the * socket, and return result with RFC1323 scaling applied. The return @@ -265,11 +265,14 @@ static u16 tcp_select_window(struct sock *sk) u32 cur_win, new_win; /* Make the window 0 if we failed to queue the data because we - * are out of memory. The window is temporary, so we don't store - * it on the socket. + * are out of memory. */ - if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) + if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) { + tp->pred_flags = 0; + tp->rcv_wnd = 0; + tp->rcv_wup = tp->rcv_nxt; return 0; + } cur_win = tcp_receive_window(tp); new_win = __tcp_select_window(sk); @@ -522,6 +525,7 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb, sock_owned_by_me(sk); sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; } @@ -567,6 +571,7 @@ static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb, sock_owned_by_me(sk); sock_ops.is_fullsock = 1; + sock_ops.is_locked_tcp_sock = 1; sock_ops.sk = sk; } @@ -938,7 +943,13 @@ static unsigned int tcp_synack_options(const struct sock *sk, opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) + tcp_rsk(req)->ts_off; - opts->tsecr = READ_ONCE(req->ts_recent); + if (!tcp_rsk(req)->snt_tsval_first) { + if (!opts->tsval) + opts->tsval = ~0U; + tcp_rsk(req)->snt_tsval_first = opts->tsval; + } + WRITE_ONCE(tcp_rsk(req)->snt_tsval_last, opts->tsval); + opts->tsecr = req->ts_recent; remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(ireq->sack_ok)) { @@ -1168,7 +1179,7 @@ void tcp_release_cb(struct sock *sk) if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk)) tcp_send_ack(sk); } -EXPORT_SYMBOL(tcp_release_cb); +EXPORT_IPV6_MOD(tcp_release_cb); void __init tcp_tasklet_init(void) { @@ -1780,7 +1791,7 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu) return __tcp_mtu_to_mss(sk, pmtu) - (tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr)); } -EXPORT_SYMBOL(tcp_mtu_to_mss); +EXPORT_IPV6_MOD(tcp_mtu_to_mss); /* Inverse of above */ int tcp_mss_to_mtu(struct sock *sk, int mss) @@ -1810,7 +1821,6 @@ void tcp_mtup_init(struct sock *sk) if (icsk->icsk_mtup.enabled) icsk->icsk_mtup.probe_timestamp = tcp_jiffies32; } -EXPORT_SYMBOL(tcp_mtup_init); /* This function synchronize snd mss to current pmtu/exthdr set. @@ -1854,7 +1864,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) return mss_now; } -EXPORT_SYMBOL(tcp_sync_mss); +EXPORT_IPV6_MOD(tcp_sync_mss); /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. @@ -2908,7 +2918,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); - tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, true); return true; } @@ -3542,8 +3552,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) } if (rearm_timer) tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); + inet_csk(sk)->icsk_rto, true); } /* We allow to exceed memory limits for FIN packets to expedite @@ -3850,7 +3859,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, return skb; } -EXPORT_SYMBOL(tcp_make_synack); +EXPORT_IPV6_MOD(tcp_make_synack); static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst) { @@ -4160,8 +4169,8 @@ int tcp_connect(struct sock *sk) TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, false); return 0; } EXPORT_SYMBOL(tcp_connect); @@ -4250,11 +4259,11 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) unsigned long delay; delay = TCP_DELACK_MAX << icsk->icsk_ack.retry; - if (delay < TCP_RTO_MAX) + if (delay < tcp_rto_max(sk)) icsk->icsk_ack.retry++; inet_csk_schedule_ack(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, false); return; } @@ -4390,7 +4399,7 @@ void tcp_send_probe0(struct sock *sk) if (err <= 0) { if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2)) icsk->icsk_backoff++; - timeout = tcp_probe0_when(sk, TCP_RTO_MAX); + timeout = tcp_probe0_when(sk, tcp_rto_max(sk)); } else { /* If packet was not sent due to local congestion, * Let senders fight for local resources conservatively. @@ -4399,7 +4408,7 @@ void tcp_send_probe0(struct sock *sk) } timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout); - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, true); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) @@ -4427,4 +4436,4 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) } return res; } -EXPORT_SYMBOL(tcp_rtx_synack); +EXPORT_IPV6_MOD(tcp_rtx_synack); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b412ed88ccd9..728bce01ccd3 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -109,7 +109,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset) /* If peer does not open window for long time, or did not transmit * anything for long time, penalize it. */ - if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) + if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*tcp_rto_max(sk) || !do_reset) shift++; /* If some dubious ICMP arrived, penalize even more. */ @@ -189,12 +189,12 @@ static unsigned int tcp_model_timeout(struct sock *sk, { unsigned int linear_backoff_thresh, timeout; - linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base); + linear_backoff_thresh = ilog2(tcp_rto_max(sk) / rto_base); if (boundary <= linear_backoff_thresh) timeout = ((2 << boundary) - 1) * rto_base; else timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + - (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + (boundary - linear_backoff_thresh) * tcp_rto_max(sk); return jiffies_to_msecs(timeout); } /** @@ -268,7 +268,7 @@ static int tcp_write_timeout(struct sock *sk) retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { - const bool alive = icsk->icsk_rto < TCP_RTO_MAX; + const bool alive = icsk->icsk_rto < tcp_rto_max(sk); retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || @@ -416,7 +416,8 @@ static void tcp_probe_timer(struct sock *sk) } max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2); if (sock_flag(sk, SOCK_DEAD)) { - const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX; + unsigned int rto_max = tcp_rto_max(sk); + const bool alive = inet_csk_rto_backoff(icsk, rto_max) < rto_max; max_probes = tcp_orphan_retries(sk, alive); if (!alive && icsk->icsk_backoff >= max_probes) @@ -481,8 +482,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) tcp_update_rto_stats(sk); if (!tp->retrans_stamp) tp->retrans_stamp = tcp_time_stamp_ts(tp); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - req->timeout << req->num_timeout, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + req->timeout << req->num_timeout, false); } static bool tcp_rtx_probe0_timed_out(const struct sock *sk, @@ -492,7 +493,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk, const struct inet_connection_sock *icsk = inet_csk(sk); u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); const struct tcp_sock *tp = tcp_sk(sk); - int timeout = TCP_RTO_MAX * 2; + int timeout = tcp_rto_max(sk) * 2; s32 rcv_delta; if (user_timeout) { @@ -626,9 +627,9 @@ void tcp_retransmit_timer(struct sock *sk) /* Retransmission failed because of local congestion, * Let senders fight for local resources conservatively. */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - TCP_RESOURCE_PROBE_INTERVAL, - TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + TCP_RESOURCE_PROBE_INTERVAL, + false); goto out; } @@ -665,7 +666,7 @@ out_reset_timer: icsk->icsk_backoff = 0; icsk->icsk_rto = clamp(__tcp_set_rto(tp), tcp_rto_min(sk), - TCP_RTO_MAX); + tcp_rto_max(sk)); } else if (sk->sk_state != TCP_SYN_SENT || tp->total_rto > READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) { @@ -673,10 +674,10 @@ out_reset_timer: * activated. */ icsk->icsk_backoff++; - icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + icsk->icsk_rto = min(icsk->icsk_rto << 1, tcp_rto_max(sk)); } - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + tcp_clamp_rto_to_user_timeout(sk), false); if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0)) __sk_dst_reset(sk); @@ -749,7 +750,17 @@ void tcp_syn_ack_timeout(const struct request_sock *req) __NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS); } -EXPORT_SYMBOL(tcp_syn_ack_timeout); +EXPORT_IPV6_MOD(tcp_syn_ack_timeout); + +void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len) +{ + sk_reset_timer(sk, &sk->sk_timer, jiffies + len); +} + +static void tcp_delete_keepalive_timer(struct sock *sk) +{ + sk_stop_timer(sk, &sk->sk_timer); +} void tcp_set_keepalive(struct sock *sk, int val) { @@ -757,14 +768,13 @@ void tcp_set_keepalive(struct sock *sk, int val) return; if (val && !sock_flag(sk, SOCK_KEEPOPEN)) - inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); + tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); else if (!val) - inet_csk_delete_keepalive_timer(sk); + tcp_delete_keepalive_timer(sk); } -EXPORT_SYMBOL_GPL(tcp_set_keepalive); - +EXPORT_IPV6_MOD_GPL(tcp_set_keepalive); -static void tcp_keepalive_timer (struct timer_list *t) +static void tcp_keepalive_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); struct inet_connection_sock *icsk = inet_csk(sk); @@ -775,7 +785,7 @@ static void tcp_keepalive_timer (struct timer_list *t) bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ - inet_csk_reset_keepalive_timer (sk, HZ/20); + tcp_reset_keepalive_timer(sk, HZ/20); goto out; } @@ -841,7 +851,7 @@ static void tcp_keepalive_timer (struct timer_list *t) } resched: - inet_csk_reset_keepalive_timer (sk, elapsed); + tcp_reset_keepalive_timer(sk, elapsed); goto out; death: diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c472c9a57cf6..17c7736d8349 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -121,13 +121,12 @@ #endif struct udp_table udp_table __read_mostly; -EXPORT_SYMBOL(udp_table); long sysctl_udp_mem[3] __read_mostly; -EXPORT_SYMBOL(sysctl_udp_mem); +EXPORT_IPV6_MOD(sysctl_udp_mem); atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp; -EXPORT_SYMBOL(udp_memory_allocated); +EXPORT_IPV6_MOD(udp_memory_allocated); DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc); @@ -352,7 +351,7 @@ fail_unlock: fail: return error; } -EXPORT_SYMBOL(udp_lib_get_port); +EXPORT_IPV6_MOD(udp_lib_get_port); int udp_v4_get_port(struct sock *sk, unsigned short snum) { @@ -418,7 +417,7 @@ u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, return __inet_ehashfn(laddr, lport, faddr, fport, udp_ehash_secret + net_hash_mix(net)); } -EXPORT_SYMBOL(udp_ehashfn); +EXPORT_IPV6_MOD(udp_ehashfn); /** * udp4_lib_lookup1() - Simplified lookup using primary hash (destination port) @@ -653,7 +652,7 @@ void udp_lib_hash4(struct sock *sk, u16 hash) spin_unlock_bh(&hslot->lock); } -EXPORT_SYMBOL(udp_lib_hash4); +EXPORT_IPV6_MOD(udp_lib_hash4); /* call with sock lock */ void udp4_hash4(struct sock *sk) @@ -669,7 +668,7 @@ void udp4_hash4(struct sock *sk) udp_lib_hash4(sk, hash); } -EXPORT_SYMBOL(udp4_hash4); +EXPORT_IPV6_MOD(udp4_hash4); #endif /* CONFIG_BASE_SMALL */ /* UDP is nearly always wildcards out the wazoo, it makes no sense to try @@ -809,11 +808,11 @@ static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk, } DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); -EXPORT_SYMBOL(udp_encap_needed_key); +EXPORT_IPV6_MOD(udp_encap_needed_key); #if IS_ENABLED(CONFIG_IPV6) DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); -EXPORT_SYMBOL(udpv6_encap_needed_key); +EXPORT_IPV6_MOD(udpv6_encap_needed_key); #endif void udp_encap_enable(void) @@ -1041,7 +1040,7 @@ void udp_flush_pending_frames(struct sock *sk) ip_flush_pending_frames(sk); } } -EXPORT_SYMBOL(udp_flush_pending_frames); +EXPORT_IPV6_MOD(udp_flush_pending_frames); /** * udp4_hwcsum - handle outgoing HW checksumming @@ -1141,9 +1140,9 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); - if (hlen + cork->gso_size > cork->fragsize) { + if (hlen + min(datalen, cork->gso_size) > cork->fragsize) { kfree_skb(skb); - return -EINVAL; + return -EMSGSIZE; } if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); @@ -1229,7 +1228,7 @@ out: WRITE_ONCE(up->pending, 0); return err; } -EXPORT_SYMBOL(udp_push_pending_frames); +EXPORT_IPV6_MOD(udp_push_pending_frames); static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size) { @@ -1266,7 +1265,7 @@ int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size) return need_ip; } -EXPORT_SYMBOL_GPL(udp_cmsg_send); +EXPORT_IPV6_MOD_GPL(udp_cmsg_send); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { @@ -1281,7 +1280,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int free = 0; int connected = 0; __be32 daddr, faddr, saddr; - u8 tos, scope; + u8 scope; __be16 dport; int err, is_udplite = IS_UDPLITE(sk); int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; @@ -1405,7 +1404,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) faddr = ipc.opt->opt.faddr; connected = 0; } - tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); if (scope == RT_SCOPE_LINK) connected = 0; @@ -1442,7 +1440,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl4 = &fl4_stack; - flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, scope, + flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, + ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, flow_flags, faddr, saddr, dport, inet->inet_sport, sk->sk_uid); @@ -1561,7 +1560,7 @@ void udp_splice_eof(struct socket *sock) udp_push_pending_frames(sk); release_sock(sk); } -EXPORT_SYMBOL_GPL(udp_splice_eof); +EXPORT_IPV6_MOD_GPL(udp_splice_eof); #define UDP_SKB_IS_STATELESS 0x80000000 @@ -1678,7 +1677,7 @@ void udp_skb_destructor(struct sock *sk, struct sk_buff *skb) prefetch(&skb->data); udp_rmem_release(sk, udp_skb_truesize(skb), 1, false); } -EXPORT_SYMBOL(udp_skb_destructor); +EXPORT_IPV6_MOD(udp_skb_destructor); /* as above, but the caller held the rx queue lock, too */ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) @@ -1785,7 +1784,7 @@ drop: busylock_release(busy); return err; } -EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb); +EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb); void udp_destruct_common(struct sock *sk) { @@ -1801,7 +1800,7 @@ void udp_destruct_common(struct sock *sk) } udp_rmem_release(sk, total, 0, true); } -EXPORT_SYMBOL_GPL(udp_destruct_common); +EXPORT_IPV6_MOD_GPL(udp_destruct_common); static void udp_destruct_sock(struct sock *sk) { @@ -1832,7 +1831,7 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) skb_release_head_state(skb); __consume_stateless_skb(skb); } -EXPORT_SYMBOL_GPL(skb_consume_udp); +EXPORT_IPV6_MOD_GPL(skb_consume_udp); static struct sk_buff *__first_packet_length(struct sock *sk, struct sk_buff_head *rcvq, @@ -1914,7 +1913,7 @@ int udp_ioctl(struct sock *sk, int cmd, int *karg) return 0; } -EXPORT_SYMBOL(udp_ioctl); +EXPORT_IPV6_MOD(udp_ioctl); struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int *off, int *err) @@ -2010,7 +2009,7 @@ try_again: WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); return recv_actor(sk, skb); } -EXPORT_SYMBOL(udp_read_skb); +EXPORT_IPV6_MOD(udp_read_skb); /* * This should be easy, if there is something there we @@ -2137,7 +2136,7 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len); } -EXPORT_SYMBOL(udp_pre_connect); +EXPORT_IPV6_MOD(udp_pre_connect); static int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -2186,7 +2185,7 @@ int udp_disconnect(struct sock *sk, int flags) release_sock(sk); return 0; } -EXPORT_SYMBOL(udp_disconnect); +EXPORT_IPV6_MOD(udp_disconnect); void udp_lib_unhash(struct sock *sk) { @@ -2216,7 +2215,7 @@ void udp_lib_unhash(struct sock *sk) spin_unlock_bh(&hslot->lock); } } -EXPORT_SYMBOL(udp_lib_unhash); +EXPORT_IPV6_MOD(udp_lib_unhash); /* * inet_rcv_saddr was changed, we must rehash secondary hash @@ -2280,7 +2279,7 @@ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) } } } -EXPORT_SYMBOL(udp_lib_rehash); +EXPORT_IPV6_MOD(udp_lib_rehash); void udp_v4_rehash(struct sock *sk) { @@ -2485,7 +2484,7 @@ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) } return false; } -EXPORT_SYMBOL(udp_sk_rx_dst_set); +EXPORT_IPV6_MOD(udp_sk_rx_dst_set); /* * Multicasts and broadcasts go to each listener. @@ -3041,7 +3040,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, return err; } -EXPORT_SYMBOL(udp_lib_setsockopt); +EXPORT_IPV6_MOD(udp_lib_setsockopt); int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) @@ -3112,7 +3111,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; return 0; } -EXPORT_SYMBOL(udp_lib_getsockopt); +EXPORT_IPV6_MOD(udp_lib_getsockopt); int udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) @@ -3154,7 +3153,7 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) return mask; } -EXPORT_SYMBOL(udp_poll); +EXPORT_IPV6_MOD(udp_poll); int udp_abort(struct sock *sk, int err) { @@ -3177,7 +3176,7 @@ out: return 0; } -EXPORT_SYMBOL_GPL(udp_abort); +EXPORT_IPV6_MOD_GPL(udp_abort); struct proto udp_prot = { .name = "UDP", @@ -3311,7 +3310,7 @@ void *udp_seq_start(struct seq_file *seq, loff_t *pos) return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } -EXPORT_SYMBOL(udp_seq_start); +EXPORT_IPV6_MOD(udp_seq_start); void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { @@ -3325,7 +3324,7 @@ void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; return sk; } -EXPORT_SYMBOL(udp_seq_next); +EXPORT_IPV6_MOD(udp_seq_next); void udp_seq_stop(struct seq_file *seq, void *v) { @@ -3337,7 +3336,7 @@ void udp_seq_stop(struct seq_file *seq, void *v) if (state->bucket <= udptable->mask) spin_unlock_bh(&udptable->hash[state->bucket].lock); } -EXPORT_SYMBOL(udp_seq_stop); +EXPORT_IPV6_MOD(udp_seq_stop); /* ------------------------------------------------------------------------ */ static void udp4_format_sock(struct sock *sp, struct seq_file *f, @@ -3616,7 +3615,7 @@ const struct seq_operations udp_seq_ops = { .stop = udp_seq_stop, .show = udp4_seq_show, }; -EXPORT_SYMBOL(udp_seq_ops); +EXPORT_IPV6_MOD(udp_seq_ops); static struct udp_seq_afinfo udp4_seq_afinfo = { .family = AF_INET, diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index a5be6e4ed326..c1a85b300ee8 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -630,7 +630,7 @@ static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport) { const struct iphdr *iph = skb_gro_network_header(skb); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); int iif, sdif; inet_get_iif_sdif(skb, &iif, &sdif); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c3729382be3b..ac8cc1076536 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -852,7 +852,7 @@ static void addrconf_forward_change(struct net *net, __s32 newf) struct inet6_dev *idev; for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev) { int changed = (!idev->cnf.forwarding) ^ (!newf); @@ -865,13 +865,12 @@ static void addrconf_forward_change(struct net *net, __s32 newf) static int addrconf_fixup_forwarding(const struct ctl_table *table, int *p, int newf) { - struct net *net; + struct net *net = (struct net *)table->extra2; int old; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); - net = (struct net *)table->extra2; old = *p; WRITE_ONCE(*p, newf); @@ -881,7 +880,7 @@ static int addrconf_fixup_forwarding(const struct ctl_table *table, int *p, int NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt); - rtnl_unlock(); + rtnl_net_unlock(net); return 0; } @@ -903,7 +902,7 @@ static int addrconf_fixup_forwarding(const struct ctl_table *table, int *p, int net->ipv6.devconf_all); } else if ((!newf) ^ (!old)) dev_forward_change((struct inet6_dev *)table->extra1); - rtnl_unlock(); + rtnl_net_unlock(net); if (newf) rt6_purge_dflt_routers(net); @@ -916,7 +915,7 @@ static void addrconf_linkdown_change(struct net *net, __s32 newf) struct inet6_dev *idev; for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev) { int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf); @@ -933,13 +932,12 @@ static void addrconf_linkdown_change(struct net *net, __s32 newf) static int addrconf_fixup_linkdown(const struct ctl_table *table, int *p, int newf) { - struct net *net; + struct net *net = (struct net *)table->extra2; int old; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); - net = (struct net *)table->extra2; old = *p; WRITE_ONCE(*p, newf); @@ -950,7 +948,7 @@ static int addrconf_fixup_linkdown(const struct ctl_table *table, int *p, int ne NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt); - rtnl_unlock(); + rtnl_net_unlock(net); return 0; } @@ -964,7 +962,8 @@ static int addrconf_fixup_linkdown(const struct ctl_table *table, int *p, int ne NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); } - rtnl_unlock(); + + rtnl_net_unlock(net); return 1; } @@ -2980,11 +2979,11 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg) if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) return -EFAULT; - rtnl_lock(); + rtnl_net_lock(net); dev = __dev_get_by_index(net, ireq.ifr6_ifindex); if (dev && dev->type == ARPHRD_SIT) err = addrconf_set_sit_dstaddr(net, dev, &ireq); - rtnl_unlock(); + rtnl_net_unlock(net); return err; } @@ -3008,39 +3007,25 @@ static int ipv6_mc_config(struct sock *sk, bool join, /* * Manual configuration of address on an interface */ -static int inet6_addr_add(struct net *net, int ifindex, - struct ifa6_config *cfg, +static int inet6_addr_add(struct net *net, struct net_device *dev, + struct ifa6_config *cfg, clock_t expires, u32 flags, struct netlink_ext_ack *extack) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; - struct net_device *dev; - unsigned long timeout; - clock_t expires; - u32 flags; - ASSERT_RTNL(); + ASSERT_RTNL_NET(net); if (cfg->plen > 128) { NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length"); return -EINVAL; } - /* check the lifetime */ - if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) { - NL_SET_ERR_MSG_MOD(extack, "address lifetime invalid"); - return -EINVAL; - } - if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64) { NL_SET_ERR_MSG_MOD(extack, "address with \"mngtmpaddr\" flag must have a prefix length of 64"); return -EINVAL; } - dev = __dev_get_by_index(net, ifindex); - if (!dev) - return -ENODEV; - idev = addrconf_add_dev(dev); if (IS_ERR(idev)) { NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); @@ -3049,7 +3034,7 @@ static int inet6_addr_add(struct net *net, int ifindex, if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) { int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk, - true, cfg->pfx, ifindex); + true, cfg->pfx, dev->ifindex); if (ret < 0) { NL_SET_ERR_MSG_MOD(extack, "Multicast auto join failed"); @@ -3059,24 +3044,6 @@ static int inet6_addr_add(struct net *net, int ifindex, cfg->scope = ipv6_addr_scope(cfg->pfx); - timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ); - if (addrconf_finite_timeout(timeout)) { - expires = jiffies_to_clock_t(timeout * HZ); - cfg->valid_lft = timeout; - flags = RTF_EXPIRES; - } else { - expires = 0; - flags = 0; - cfg->ifa_flags |= IFA_F_PERMANENT; - } - - timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ); - if (addrconf_finite_timeout(timeout)) { - if (timeout == 0) - cfg->ifa_flags |= IFA_F_DEPRECATED; - cfg->preferred_lft = timeout; - } - ifp = ipv6_add_addr(idev, cfg, true, extack); if (!IS_ERR(ifp)) { if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) { @@ -3104,7 +3071,7 @@ static int inet6_addr_add(struct net *net, int ifindex, return 0; } else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) { ipv6_mc_config(net->ipv6.mc_autojoin_sk, false, - cfg->pfx, ifindex); + cfg->pfx, dev->ifindex); } return PTR_ERR(ifp); @@ -3129,7 +3096,7 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, return -ENODEV; } - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (!idev) { NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device"); return -ENXIO; @@ -3170,6 +3137,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) .preferred_lft = INFINITY_LIFE_TIME, .valid_lft = INFINITY_LIFE_TIME, }; + struct net_device *dev; struct in6_ifreq ireq; int err; @@ -3182,9 +3150,13 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg) cfg.pfx = &ireq.ifr6_addr; cfg.plen = ireq.ifr6_prefixlen; - rtnl_lock(); - err = inet6_addr_add(net, ireq.ifr6_ifindex, &cfg, NULL); - rtnl_unlock(); + rtnl_net_lock(net); + dev = __dev_get_by_index(net, ireq.ifr6_ifindex); + if (dev) + err = inet6_addr_add(net, dev, &cfg, 0, 0, NULL); + else + err = -ENODEV; + rtnl_net_unlock(net); return err; } @@ -3199,10 +3171,10 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg) if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) return -EFAULT; - rtnl_lock(); + rtnl_net_lock(net); err = inet6_addr_del(net, ireq.ifr6_ifindex, 0, &ireq.ifr6_addr, ireq.ifr6_prefixlen, NULL); - rtnl_unlock(); + rtnl_net_unlock(net); return err; } @@ -4205,6 +4177,7 @@ static void addrconf_dad_work(struct work_struct *w) struct inet6_dev *idev = ifp->idev; bool bump_id, disable_ipv6 = false; struct in6_addr mcaddr; + struct net *net; enum { DAD_PROCESS, @@ -4212,7 +4185,9 @@ static void addrconf_dad_work(struct work_struct *w) DAD_ABORT, } action = DAD_PROCESS; - rtnl_lock(); + net = dev_net(idev->dev); + + rtnl_net_lock(net); spin_lock_bh(&ifp->lock); if (ifp->state == INET6_IFADDR_STATE_PREDAD) { @@ -4222,7 +4197,7 @@ static void addrconf_dad_work(struct work_struct *w) action = DAD_ABORT; ifp->state = INET6_IFADDR_STATE_POSTDAD; - if ((READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->accept_dad) > 1 || + if ((READ_ONCE(net->ipv6.devconf_all->accept_dad) > 1 || READ_ONCE(idev->cnf.accept_dad) > 1) && !idev->cnf.disable_ipv6 && !(ifp->flags & IFA_F_STABLE_PRIVACY)) { @@ -4304,7 +4279,7 @@ static void addrconf_dad_work(struct work_struct *w) ifp->dad_nonce); out: in6_ifa_put(ifp); - rtnl_unlock(); + rtnl_net_unlock(net); } /* ifp->idev must be at least read locked */ @@ -4752,9 +4727,9 @@ static void addrconf_verify_work(struct work_struct *w) struct net *net = container_of(to_delayed_work(w), struct net, ipv6.addr_chk_work); - rtnl_lock(); + rtnl_net_lock(net); addrconf_verify_rtnl(net); - rtnl_unlock(); + rtnl_net_unlock(net); } static void addrconf_verify(struct net *net) @@ -4817,8 +4792,12 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, /* We ignore other flags so far. */ ifa_flags &= IFA_F_MANAGETEMPADDR; - return inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx, - ifm->ifa_prefixlen, extack); + rtnl_net_lock(net); + err = inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx, + ifm->ifa_prefixlen, extack); + rtnl_net_unlock(net); + + return err; } static int modify_prefix_route(struct net *net, struct inet6_ifaddr *ifp, @@ -4867,19 +4846,14 @@ static int modify_prefix_route(struct net *net, struct inet6_ifaddr *ifp, } static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, - struct ifa6_config *cfg) + struct ifa6_config *cfg, clock_t expires, + u32 flags) { - u32 flags; - clock_t expires; - unsigned long timeout; bool was_managetempaddr; - bool had_prefixroute; bool new_peer = false; + bool had_prefixroute; - ASSERT_RTNL(); - - if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) - return -EINVAL; + ASSERT_RTNL_NET(net); if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64)) @@ -4888,24 +4862,6 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED) cfg->ifa_flags &= ~IFA_F_OPTIMISTIC; - timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ); - if (addrconf_finite_timeout(timeout)) { - expires = jiffies_to_clock_t(timeout * HZ); - cfg->valid_lft = timeout; - flags = RTF_EXPIRES; - } else { - expires = 0; - flags = 0; - cfg->ifa_flags |= IFA_F_PERMANENT; - } - - timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ); - if (addrconf_finite_timeout(timeout)) { - if (timeout == 0) - cfg->ifa_flags |= IFA_F_DEPRECATED; - cfg->preferred_lft = timeout; - } - if (cfg->peer_pfx && memcmp(&ifp->peer_addr, cfg->peer_pfx, sizeof(struct in6_addr))) { if (!ipv6_addr_any(&ifp->peer_addr)) @@ -4990,13 +4946,16 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - struct ifaddrmsg *ifm; struct nlattr *tb[IFA_MAX+1]; struct in6_addr *peer_pfx; struct inet6_ifaddr *ifa; struct net_device *dev; struct inet6_dev *idev; struct ifa6_config cfg; + struct ifaddrmsg *ifm; + unsigned long timeout; + clock_t expires; + u32 flags; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX, @@ -5019,8 +4978,18 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[IFA_PROTO]) cfg.ifa_proto = nla_get_u8(tb[IFA_PROTO]); + cfg.ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); + + /* We ignore other flags so far. */ + cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | + IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE | + IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; + + cfg.ifa_flags |= IFA_F_PERMANENT; cfg.valid_lft = INFINITY_LIFE_TIME; cfg.preferred_lft = INFINITY_LIFE_TIME; + expires = 0; + flags = 0; if (tb[IFA_CACHEINFO]) { struct ifa_cacheinfo *ci; @@ -5028,24 +4997,43 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, ci = nla_data(tb[IFA_CACHEINFO]); cfg.valid_lft = ci->ifa_valid; cfg.preferred_lft = ci->ifa_prefered; + + if (!cfg.valid_lft || cfg.preferred_lft > cfg.valid_lft) { + NL_SET_ERR_MSG_MOD(extack, "address lifetime invalid"); + return -EINVAL; + } + + timeout = addrconf_timeout_fixup(cfg.valid_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + cfg.ifa_flags &= ~IFA_F_PERMANENT; + cfg.valid_lft = timeout; + expires = jiffies_to_clock_t(timeout * HZ); + flags = RTF_EXPIRES; + } + + timeout = addrconf_timeout_fixup(cfg.preferred_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + if (timeout == 0) + cfg.ifa_flags |= IFA_F_DEPRECATED; + + cfg.preferred_lft = timeout; + } } + rtnl_net_lock(net); + dev = __dev_get_by_index(net, ifm->ifa_index); if (!dev) { NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface"); - return -ENODEV; + err = -ENODEV; + goto unlock; } - cfg.ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags); - - /* We ignore other flags so far. */ - cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS | - IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE | - IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC; - idev = ipv6_find_idev(dev); - if (IS_ERR(idev)) - return PTR_ERR(idev); + if (IS_ERR(idev)) { + err = PTR_ERR(idev); + goto unlock; + } if (!ipv6_allow_optimistic_dad(net, idev)) cfg.ifa_flags &= ~IFA_F_OPTIMISTIC; @@ -5053,7 +5041,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, if (cfg.ifa_flags & IFA_F_NODAD && cfg.ifa_flags & IFA_F_OPTIMISTIC) { NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive"); - return -EINVAL; + err = -EINVAL; + goto unlock; } ifa = ipv6_get_ifaddr(net, cfg.pfx, dev, 1); @@ -5062,7 +5051,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, * It would be best to check for !NLM_F_CREATE here but * userspace already relies on not having to provide this. */ - return inet6_addr_add(net, ifm->ifa_index, &cfg, extack); + err = inet6_addr_add(net, dev, &cfg, expires, flags, extack); + goto unlock; } if (nlh->nlmsg_flags & NLM_F_EXCL || @@ -5070,10 +5060,12 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, NL_SET_ERR_MSG_MOD(extack, "address already assigned"); err = -EEXIST; } else { - err = inet6_addr_modify(net, ifa, &cfg); + err = inet6_addr_modify(net, ifa, &cfg, expires, flags); } in6_ifa_put(ifa); +unlock: + rtnl_net_unlock(net); return err; } @@ -6370,7 +6362,7 @@ static void addrconf_disable_change(struct net *net, __s32 newf) struct inet6_dev *idev; for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev) { int changed = (!idev->cnf.disable_ipv6) ^ (!newf); @@ -6391,7 +6383,7 @@ static int addrconf_disable_ipv6(const struct ctl_table *table, int *p, int newf return 0; } - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); old = *p; @@ -6400,10 +6392,11 @@ static int addrconf_disable_ipv6(const struct ctl_table *table, int *p, int newf if (p == &net->ipv6.devconf_all->disable_ipv6) { WRITE_ONCE(net->ipv6.devconf_dflt->disable_ipv6, newf); addrconf_disable_change(net, newf); - } else if ((!newf) ^ (!old)) + } else if ((!newf) ^ (!old)) { dev_disable_change((struct inet6_dev *)table->extra1); + } - rtnl_unlock(); + rtnl_net_unlock(net); return 0; } @@ -6446,20 +6439,20 @@ static int addrconf_sysctl_proxy_ndp(const struct ctl_table *ctl, int write, if (write && old != new) { struct net *net = ctl->extra2; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); - if (valp == &net->ipv6.devconf_dflt->proxy_ndp) + if (valp == &net->ipv6.devconf_dflt->proxy_ndp) { inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_PROXY_NEIGH, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt); - else if (valp == &net->ipv6.devconf_all->proxy_ndp) + } else if (valp == &net->ipv6.devconf_all->proxy_ndp) { inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_PROXY_NEIGH, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); - else { + } else { struct inet6_dev *idev = ctl->extra1; inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, @@ -6467,7 +6460,7 @@ static int addrconf_sysctl_proxy_ndp(const struct ctl_table *ctl, int write, idev->dev->ifindex, &idev->cnf); } - rtnl_unlock(); + rtnl_net_unlock(net); } return ret; @@ -6487,7 +6480,7 @@ static int addrconf_sysctl_addr_gen_mode(const struct ctl_table *ctl, int write, .mode = ctl->mode, }; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); new_val = *((u32 *)ctl->data); @@ -6517,7 +6510,7 @@ static int addrconf_sysctl_addr_gen_mode(const struct ctl_table *ctl, int write, WRITE_ONCE(net->ipv6.devconf_dflt->addr_gen_mode, new_val); for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev && idev->cnf.addr_gen_mode != new_val) { WRITE_ONCE(idev->cnf.addr_gen_mode, @@ -6531,7 +6524,7 @@ static int addrconf_sysctl_addr_gen_mode(const struct ctl_table *ctl, int write, } out: - rtnl_unlock(); + rtnl_net_unlock(net); return ret; } @@ -6553,7 +6546,7 @@ static int addrconf_sysctl_stable_secret(const struct ctl_table *ctl, int write, lctl.maxlen = IPV6_MAX_STRLEN; lctl.data = str; - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); if (!write && !secret->initialized) { @@ -6583,7 +6576,7 @@ static int addrconf_sysctl_stable_secret(const struct ctl_table *ctl, int write, struct net_device *dev; for_each_netdev(net, dev) { - struct inet6_dev *idev = __in6_dev_get(dev); + struct inet6_dev *idev = __in6_dev_get_rtnl_net(dev); if (idev) { WRITE_ONCE(idev->cnf.addr_gen_mode, @@ -6598,7 +6591,7 @@ static int addrconf_sysctl_stable_secret(const struct ctl_table *ctl, int write, } out: - rtnl_unlock(); + rtnl_net_unlock(net); return err; } @@ -6682,7 +6675,7 @@ int addrconf_disable_policy(const struct ctl_table *ctl, int *valp, int val) return 0; } - if (!rtnl_trylock()) + if (!rtnl_net_trylock(net)) return restart_syscall(); WRITE_ONCE(*valp, val); @@ -6691,7 +6684,7 @@ int addrconf_disable_policy(const struct ctl_table *ctl, int *valp, int val) struct net_device *dev; for_each_netdev(net, dev) { - idev = __in6_dev_get(dev); + idev = __in6_dev_get_rtnl_net(dev); if (idev) addrconf_disable_policy_idev(idev, val); } @@ -6700,7 +6693,7 @@ int addrconf_disable_policy(const struct ctl_table *ctl, int *valp, int val) addrconf_disable_policy_idev(idev, val); } - rtnl_unlock(); + rtnl_net_unlock(net); return 0; } @@ -7413,9 +7406,9 @@ static const struct rtnl_msg_handler addrconf_rtnl_msg_handlers[] __initconst_or {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETLINK, .dumpit = inet6_dump_ifinfo, .flags = RTNL_FLAG_DUMP_UNLOCKED}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWADDR, - .doit = inet6_rtm_newaddr}, + .doit = inet6_rtm_newaddr, .flags = RTNL_FLAG_DOIT_PERNET}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELADDR, - .doit = inet6_rtm_deladdr}, + .doit = inet6_rtm_deladdr, .flags = RTNL_FLAG_DOIT_PERNET}, {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETADDR, .doit = inet6_rtm_getaddr, .dumpit = inet6_dump_ifaddr, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, @@ -7457,9 +7450,9 @@ int __init addrconf_init(void) goto out_nowq; } - rtnl_lock(); + rtnl_net_lock(&init_net); idev = ipv6_add_dev(blackhole_netdev); - rtnl_unlock(); + rtnl_net_unlock(&init_net); if (IS_ERR(idev)) { err = PTR_ERR(idev); goto errlo; @@ -7509,17 +7502,17 @@ void addrconf_cleanup(void) rtnl_af_unregister(&inet6_ops); - rtnl_lock(); + rtnl_net_lock(&init_net); /* clean dev list */ for_each_netdev(&init_net, dev) { - if (__in6_dev_get(dev) == NULL) + if (!__in6_dev_get_rtnl_net(dev)) continue; addrconf_ifdown(dev, true); } addrconf_ifdown(init_net.loopback_dev, true); - rtnl_unlock(); + rtnl_net_unlock(&init_net); destroy_workqueue(addrconf_wq); } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 5f3d0cc1555a..9e73944e3b53 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -315,7 +315,7 @@ static void esp_output_done(void *data, int err) x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) esp_output_tail_tcp(x, skb); else - xfrm_output_resume(skb->sk, skb, err); + xfrm_output_resume(skb_to_full_sk(skb), skb, err); } } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 6789623b2b0d..457de0745a33 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -1204,10 +1204,9 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) { struct ipv6_txoptions *opt2; - opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC); + opt2 = sock_kmemdup(sk, opt, opt->tot_len, GFP_ATOMIC); if (opt2) { long dif = (char *)opt2 - (char *)opt; - memcpy(opt2, opt, opt->tot_len); if (opt2->hopopt) *((char **)&opt2->hopopt) += dif; if (opt2->dst0opt) diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 67d39114d9a6..fd5f7112a51f 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -29,6 +29,7 @@ struct fib6_rule { __be32 flowlabel; __be32 flowlabel_mask; dscp_t dscp; + dscp_t dscp_mask; u8 dscp_full:1; /* DSCP or TOS selector */ }; @@ -331,7 +332,7 @@ INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule, return 0; } - if (r->dscp && r->dscp != ip6_dscp(fl6->flowlabel)) + if ((r->dscp ^ ip6_dscp(fl6->flowlabel)) & r->dscp_mask) return 0; if ((r->flowlabel ^ flowi6_get_flowlabel(fl6)) & r->flowlabel_mask) @@ -340,12 +341,12 @@ INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule, if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto)) return 0; - if (fib_rule_port_range_set(&rule->sport_range) && - !fib_rule_port_inrange(&rule->sport_range, fl6->fl6_sport)) + if (!fib_rule_port_match(&rule->sport_range, rule->sport_mask, + fl6->fl6_sport)) return 0; - if (fib_rule_port_range_set(&rule->dport_range) && - !fib_rule_port_inrange(&rule->dport_range, fl6->fl6_dport)) + if (!fib_rule_port_match(&rule->dport_range, rule->dport_mask, + fl6->fl6_dport)) return 0; return 1; @@ -360,11 +361,35 @@ static int fib6_nl2rule_dscp(const struct nlattr *nla, struct fib6_rule *rule6, } rule6->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); + rule6->dscp_mask = inet_dsfield_to_dscp(INET_DSCP_MASK); rule6->dscp_full = true; return 0; } +static int fib6_nl2rule_dscp_mask(const struct nlattr *nla, + struct fib6_rule *rule6, + struct netlink_ext_ack *extack) +{ + dscp_t dscp_mask; + + if (!rule6->dscp_full) { + NL_SET_ERR_MSG_ATTR(extack, nla, + "Cannot specify DSCP mask without DSCP value"); + return -EINVAL; + } + + dscp_mask = inet_dsfield_to_dscp(nla_get_u8(nla) << 2); + if (rule6->dscp & ~dscp_mask) { + NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid DSCP mask"); + return -EINVAL; + } + + rule6->dscp_mask = dscp_mask; + + return 0; +} + static int fib6_nl2rule_flowlabel(struct nlattr **tb, struct fib6_rule *rule6, struct netlink_ext_ack *extack) { @@ -399,9 +424,9 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct nlattr **tb, struct netlink_ext_ack *extack) { + struct fib6_rule *rule6 = (struct fib6_rule *)rule; + struct net *net = rule->fr_net; int err = -EINVAL; - struct net *net = sock_net(skb->sk); - struct fib6_rule *rule6 = (struct fib6_rule *) rule; if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, @@ -409,10 +434,15 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, goto errout; } rule6->dscp = inet_dsfield_to_dscp(frh->tos); + rule6->dscp_mask = frh->tos ? inet_dsfield_to_dscp(INET_DSCP_MASK) : 0; if (tb[FRA_DSCP] && fib6_nl2rule_dscp(tb[FRA_DSCP], rule6, extack) < 0) goto errout; + if (tb[FRA_DSCP_MASK] && + fib6_nl2rule_dscp_mask(tb[FRA_DSCP_MASK], rule6, extack) < 0) + goto errout; + if ((tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) && fib6_nl2rule_flowlabel(tb, rule6, extack) < 0) goto errout; @@ -482,6 +512,14 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, return 0; } + if (tb[FRA_DSCP_MASK]) { + dscp_t dscp_mask; + + dscp_mask = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP_MASK]) << 2); + if (!rule6->dscp_full || rule6->dscp_mask != dscp_mask) + return 0; + } + if (tb[FRA_FLOWLABEL] && nla_get_be32(tb[FRA_FLOWLABEL]) != rule6->flowlabel) return 0; @@ -512,7 +550,9 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb, if (rule6->dscp_full) { frh->tos = 0; if (nla_put_u8(skb, FRA_DSCP, - inet_dscp_to_dsfield(rule6->dscp) >> 2)) + inet_dscp_to_dsfield(rule6->dscp) >> 2) || + nla_put_u8(skb, FRA_DSCP_MASK, + inet_dscp_to_dsfield(rule6->dscp_mask) >> 2)) goto nla_put_failure; } else { frh->tos = inet_dscp_to_dsfield(rule6->dscp); @@ -539,6 +579,7 @@ static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) return nla_total_size(16) /* dst */ + nla_total_size(16) /* src */ + nla_total_size(1) /* dscp */ + + nla_total_size(1) /* dscp mask */ + nla_total_size(4) /* flowlabel */ + nla_total_size(4); /* flowlabel mask */ } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index a6984a29fdb9..3fd19a84b358 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -76,7 +76,7 @@ static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); if (type == ICMPV6_PKT_TOOBIG) ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); @@ -473,7 +473,10 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (!skb->dev) return; - net = dev_net(skb->dev); + + rcu_read_lock(); + + net = dev_net_rcu(skb->dev); mark = IP6_REPLY_MARK(net, skb->mark); /* * Make sure we respect the rules @@ -496,7 +499,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, !(type == ICMPV6_PARAMPROB && code == ICMPV6_UNK_OPTION && (opt_unrec(skb, info)))) - return; + goto out; saddr = NULL; } @@ -526,7 +529,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); - return; + goto out; } /* @@ -535,7 +538,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (is_ineligible(skb)) { net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); - return; + goto out; } /* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */ @@ -582,7 +585,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, np = inet6_sk(sk); if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit)) - goto out; + goto out_unlock; tmp_hdr.icmp6_type = type; tmp_hdr.icmp6_code = code; @@ -600,7 +603,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, dst = icmpv6_route_lookup(net, skb, sk, &fl6); if (IS_ERR(dst)) - goto out; + goto out_unlock; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); @@ -616,7 +619,6 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, goto out_dst_release; } - rcu_read_lock(); idev = __in6_dev_get(skb->dev); if (ip6_append_data(sk, icmpv6_getfrag, &msg, @@ -630,13 +632,15 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, len + sizeof(struct icmp6hdr)); } - rcu_read_unlock(); + out_dst_release: dst_release(dst); -out: +out_unlock: icmpv6_xmit_unlock(sk); out_bh_enable: local_bh_enable(); +out: + rcu_read_unlock(); } EXPORT_SYMBOL(icmp6_send); @@ -679,8 +683,8 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, skb_pull(skb2, nhs); skb_reset_network_header(skb2); - rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, - skb, 0); + rt = rt6_lookup(dev_net_rcu(skb->dev), &ipv6_hdr(skb2)->saddr, + NULL, 0, skb, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; @@ -717,7 +721,7 @@ EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach); static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); struct sock *sk; struct inet6_dev *idev; struct ipv6_pinfo *np; @@ -832,7 +836,7 @@ enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) { struct inet6_skb_parm *opt = IP6CB(skb); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); const struct inet6_protocol *ipprot; enum skb_drop_reason reason; int inner_offset; @@ -889,7 +893,7 @@ out: static int icmpv6_rcv(struct sk_buff *skb) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); struct net_device *dev = icmp6_dev(skb); struct inet6_dev *idev = __in6_dev_get(dev); const struct in6_addr *saddr, *daddr; @@ -921,7 +925,7 @@ static int icmpv6_rcv(struct sk_buff *skb) skb_set_network_header(skb, nh); } - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INMSGS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INMSGS); saddr = &ipv6_hdr(skb)->saddr; daddr = &ipv6_hdr(skb)->daddr; @@ -939,7 +943,7 @@ static int icmpv6_rcv(struct sk_buff *skb) type = hdr->icmp6_type; - ICMP6MSGIN_INC_STATS(dev_net(dev), idev, type); + ICMP6MSGIN_INC_STATS(dev_net_rcu(dev), idev, type); switch (type) { case ICMPV6_ECHO_REQUEST: @@ -953,12 +957,9 @@ static int icmpv6_rcv(struct sk_buff *skb) break; case ICMPV6_ECHO_REPLY: - reason = ping_rcv(skb); - break; - case ICMPV6_EXT_ECHO_REPLY: - reason = ping_rcv(skb); - break; + ping_rcv(skb); + return 0; case ICMPV6_PKT_TOOBIG: /* BUGGG_FUTURE: if packet contains rthdr, we cannot update @@ -1034,9 +1035,9 @@ static int icmpv6_rcv(struct sk_buff *skb) csum_error: reason = SKB_DROP_REASON_ICMP_CSUM; - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_CSUMERRORS); discard_it: - __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INERRORS); + __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INERRORS); drop_no_count: kfree_skb_reason(skb, reason); return 0; diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 28e5a89dc255..2c383c12a431 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -336,7 +336,7 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct dst_entry *dst = skb_dst(skb), *cache_dst; + struct dst_entry *dst = skb_dst(skb), *cache_dst = NULL; struct in6_addr orig_daddr; struct ioam6_lwt *ilwt; int err = -EINVAL; @@ -407,13 +407,15 @@ do_encap: cache_dst = ip6_route_output(net, NULL, &fl6); if (cache_dst->error) { err = cache_dst->error; - dst_release(cache_dst); goto drop; } - local_bh_disable(); - dst_cache_set_ip6(&ilwt->cache, cache_dst, &fl6.saddr); - local_bh_enable(); + /* cache only if we don't create a dst reference loop */ + if (dst->lwtstate != cache_dst->lwtstate) { + local_bh_disable(); + dst_cache_set_ip6(&ilwt->cache, cache_dst, &fl6.saddr); + local_bh_enable(); + } err = skb_cow_head(skb, LL_RESERVED_SPACE(cache_dst->dev)); if (unlikely(err)) @@ -426,8 +428,10 @@ do_encap: return dst_output(net, sk, skb); } out: + dst_release(cache_dst); return dst->lwtstate->orig_output(net, sk, skb); drop: + dst_release(cache_dst); kfree_skb(skb); return err; } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 235808cfec70..c6ebb6a6d390 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1498,7 +1498,6 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; - tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); @@ -1621,7 +1620,7 @@ static int __net_init ip6gre_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ - ign->fb_tunnel_dev->netns_local = true; + ign->fb_tunnel_dev->netns_immutable = true; ip6gre_fb_tunnel_init(ign->fb_tunnel_dev); ign->fb_tunnel_dev->rtnl_link_ops = &ip6gre_link_ops; @@ -1882,7 +1881,6 @@ static int ip6erspan_tap_init(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; - tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); @@ -1971,7 +1969,7 @@ static bool ip6gre_netlink_encap_parms(struct nlattr *data[], return ret; } -static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev, +static int ip6gre_newlink_common(struct net *link_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { @@ -1992,7 +1990,7 @@ static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev, eth_hw_addr_random(dev); nt->dev = dev; - nt->net = dev_net(dev); + nt->net = link_net; err = register_netdevice(dev); if (err) @@ -2005,12 +2003,14 @@ out: return err; } -static int ip6gre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ip6gre_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *net = params->link_net ? : dev_net(dev); struct ip6_tnl *nt = netdev_priv(dev); - struct net *net = dev_net(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip6gre_net *ign; int err; @@ -2025,7 +2025,7 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev, return -EEXIST; } - err = ip6gre_newlink_common(src_net, dev, tb, data, extack); + err = ip6gre_newlink_common(net, dev, tb, data, extack); if (!err) { ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); ip6gre_tunnel_link_md(ign, nt); @@ -2241,12 +2241,14 @@ static void ip6erspan_tap_setup(struct net_device *dev) netif_keep_dst(dev); } -static int ip6erspan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ip6erspan_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { + struct net *net = params->link_net ? : dev_net(dev); struct ip6_tnl *nt = netdev_priv(dev); - struct net *net = dev_net(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip6gre_net *ign; int err; @@ -2262,7 +2264,7 @@ static int ip6erspan_newlink(struct net *src_net, struct net_device *dev, return -EEXIST; } - err = ip6gre_newlink_common(src_net, dev, tb, data, extack); + err = ip6gre_newlink_common(net, dev, tb, data, extack); if (!err) { ip6erspan_tnl_link_config(nt, !tb[IFLA_MTU]); ip6erspan_tunnel_link_md(ign, nt); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 70c0e16c0ae6..39da6a7ce5f1 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -477,9 +477,7 @@ discard: static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { skb_clear_delivery_time(skb); - rcu_read_lock(); ip6_protocol_deliver_rcu(net, skb, 0, false); - rcu_read_unlock(); return 0; } @@ -487,9 +485,15 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk int ip6_input(struct sk_buff *skb) { - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, - dev_net(skb->dev), NULL, skb, skb->dev, NULL, - ip6_input_finish); + int res; + + rcu_read_lock(); + res = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN, + dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL, + ip6_input_finish); + rcu_read_unlock(); + + return res; } EXPORT_SYMBOL_GPL(ip6_input); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 48fd53b98972..170a6ac30889 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -253,8 +253,7 @@ static void ip6_dev_free(struct net_device *dev) static int ip6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + struct ip6_tnl_net *ip6n = net_generic(t->net, ip6_tnl_net_id); int err; dev->rtnl_link_ops = &ip6_link_ops; @@ -1878,7 +1877,6 @@ ip6_tnl_dev_init_gen(struct net_device *dev) int t_hlen; t->dev = dev; - t->net = dev_net(dev); ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); if (ret) @@ -1940,6 +1938,7 @@ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + t->net = net; t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); @@ -2002,17 +2001,22 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[], parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } -static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ip6_tnl_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { - struct net *net = dev_net(dev); - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel_encap ipencap; + struct ip6_tnl_net *ip6n; struct ip6_tnl *nt, *t; + struct net *net; int err; + net = params->link_net ? : dev_net(dev); + ip6n = net_generic(net, ip6_tnl_net_id); nt = netdev_priv(dev); + nt->net = net; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip6_tnl_encap_setup(nt, &ipencap); @@ -2261,7 +2265,7 @@ static int __net_init ip6_tnl_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ - ip6n->fb_tnl_dev->netns_local = true; + ip6n->fb_tnl_dev->netns_immutable = true; err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 590737c27537..83c055996fbb 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -177,8 +177,7 @@ vti6_tnl_unlink(struct vti6_net *ip6n, struct ip6_tnl *t) static int vti6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct vti6_net *ip6n = net_generic(net, vti6_net_id); + struct vti6_net *ip6n = net_generic(t->net, vti6_net_id); int err; dev->rtnl_link_ops = &vti6_link_ops; @@ -925,7 +924,6 @@ static inline int vti6_dev_init_gen(struct net_device *dev) struct ip6_tnl *t = netdev_priv(dev); t->dev = dev; - t->net = dev_net(dev); netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; @@ -958,6 +956,7 @@ static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev) struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); + t->net = net; t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); @@ -997,17 +996,20 @@ static void vti6_netlink_parms(struct nlattr *data[], parms->fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } -static int vti6_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int vti6_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { - struct net *net = dev_net(dev); + struct nlattr **data = params->data; struct ip6_tnl *nt; + struct net *net; + net = params->link_net ? : dev_net(dev); nt = netdev_priv(dev); vti6_netlink_parms(data, &nt->parms); nt->parms.proto = IPPROTO_IPV6; + nt->net = net; if (vti6_locate(net, &nt->parms, 0)) return -EEXIST; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 535e9f72514c..e8ade93a0f0e 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -668,7 +668,7 @@ static void reg_vif_setup(struct net_device *dev) dev->flags = IFF_NOARP; dev->netdev_ops = ®_vif_netdev_ops; dev->needs_free_netdev = true; - dev->netns_local = true; + dev->netns_immutable = true; } static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 9dfdb40988b0..65831b4fee1f 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1773,21 +1773,19 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) struct net_device *dev = idev->dev; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - struct net *net = dev_net(dev); const struct in6_addr *saddr; struct in6_addr addr_buf; struct mld2_report *pmr; struct sk_buff *skb; unsigned int size; struct sock *sk; - int err; + struct net *net; - sk = net->ipv6.igmp_sk; /* we assume size > sizeof(ra) here * Also try to not allocate high-order pages for big MTU */ size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen; - skb = sock_alloc_send_skb(sk, size, 1, &err); + skb = alloc_skb(size, GFP_KERNEL); if (!skb) return NULL; @@ -1795,6 +1793,12 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) skb_reserve(skb, hlen); skb_tailroom_reserve(skb, mtu, tlen); + rcu_read_lock(); + + net = dev_net_rcu(dev); + sk = net->ipv6.igmp_sk; + skb_set_owner_w(skb, sk); + if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) { /* <draft-ietf-magma-mld-source-05.txt>: * use unspecified address as the source address @@ -1806,6 +1810,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu) ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0); + rcu_read_unlock(); + skb_put_data(skb, ra, sizeof(ra)); skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data); @@ -2165,21 +2171,21 @@ static void mld_send_cr(struct inet6_dev *idev) static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) { - struct net *net = dev_net(dev); - struct sock *sk = net->ipv6.igmp_sk; + const struct in6_addr *snd_addr, *saddr; + int err, len, payload_len, full_len; + struct in6_addr addr_buf; struct inet6_dev *idev; struct sk_buff *skb; struct mld_msg *hdr; - const struct in6_addr *snd_addr, *saddr; - struct in6_addr addr_buf; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - int err, len, payload_len, full_len; u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT, 2, 0, 0, IPV6_TLV_PADN, 0 }; - struct flowi6 fl6; struct dst_entry *dst; + struct flowi6 fl6; + struct net *net; + struct sock *sk; if (type == ICMPV6_MGM_REDUCTION) snd_addr = &in6addr_linklocal_allrouters; @@ -2190,19 +2196,21 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) payload_len = len + sizeof(ra); full_len = sizeof(struct ipv6hdr) + payload_len; - rcu_read_lock(); - IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_OUTREQUESTS); - rcu_read_unlock(); + skb = alloc_skb(hlen + tlen + full_len, GFP_KERNEL); - skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err); + rcu_read_lock(); + net = dev_net_rcu(dev); + idev = __in6_dev_get(dev); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); if (!skb) { - rcu_read_lock(); - IP6_INC_STATS(net, __in6_dev_get(dev), - IPSTATS_MIB_OUTDISCARDS); + IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); return; } + sk = net->ipv6.igmp_sk; + skb_set_owner_w(skb, sk); + skb->priority = TC_PRIO_CONTROL; skb_reserve(skb, hlen); @@ -2227,9 +2235,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) IPPROTO_ICMPV6, csum_partial(hdr, len, 0)); - rcu_read_lock(); - idev = __in6_dev_get(skb->dev); - icmpv6_flow_init(sk, &fl6, type, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->dev->ifindex); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index d044c67019de..ecb5c4b8518f 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -418,15 +418,11 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, { int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; - struct sock *sk = dev_net(dev)->ipv6.ndisc_sk; struct sk_buff *skb; skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC); - if (!skb) { - ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n", - __func__); + if (!skb) return NULL; - } skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; @@ -437,7 +433,9 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev, /* Manually assign socket ownership as we avoid calling * sock_alloc_send_pskb() to bypass wmem buffer limits */ - skb_set_owner_w(skb, sk); + rcu_read_lock(); + skb_set_owner_w(skb, dev_net_rcu(dev)->ipv6.ndisc_sk); + rcu_read_unlock(); return skb; } @@ -473,16 +471,20 @@ static void ip6_nd_hdr(struct sk_buff *skb, void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, const struct in6_addr *saddr) { + struct icmp6hdr *icmp6h = icmp6_hdr(skb); struct dst_entry *dst = skb_dst(skb); - struct net *net = dev_net(skb->dev); - struct sock *sk = net->ipv6.ndisc_sk; struct inet6_dev *idev; + struct net *net; + struct sock *sk; int err; - struct icmp6hdr *icmp6h = icmp6_hdr(skb); u8 type; type = icmp6h->icmp6_type; + rcu_read_lock(); + + net = dev_net_rcu(skb->dev); + sk = net->ipv6.ndisc_sk; if (!dst) { struct flowi6 fl6; int oif = skb->dev->ifindex; @@ -490,6 +492,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif); dst = icmp6_dst_alloc(skb->dev, &fl6); if (IS_ERR(dst)) { + rcu_read_unlock(); kfree_skb(skb); return; } @@ -504,7 +507,6 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len); - rcu_read_lock(); idev = __in6_dev_get(dst->dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); @@ -1678,7 +1680,7 @@ static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb, void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) { struct net_device *dev = skb->dev; - struct net *net = dev_net(dev); + struct net *net = dev_net_rcu(dev); struct sock *sk = net->ipv6.ndisc_sk; int optlen = 0; struct inet_peer *peer; @@ -1693,8 +1695,8 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL; bool ret; - if (netif_is_l3_master(skb->dev)) { - dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); + if (netif_is_l3_master(dev)) { + dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); if (!dev) return; } @@ -1732,10 +1734,8 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) goto release; } - rcu_read_lock(); peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr); ret = inet_peer_xrlim_allow(peer, 1*HZ); - rcu_read_unlock(); if (!ret) goto release; diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 46b8adf6e7f8..84d90dd8b3f0 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -119,9 +119,6 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) return -EINVAL; ipcm6_init_sk(&ipc6, sk); - ipc6.sockc.priority = READ_ONCE(sk->sk_priority); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = READ_ONCE(sk->sk_mark); fl6.flowi6_oif = oif; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index a45aba090aa4..fda640ebd53f 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -769,19 +769,16 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) hdrincl = inet_test_bit(HDRINCL, sk); + ipcm6_init_sk(&ipc6, sk); + /* * Get and verify the address. */ memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_mark = READ_ONCE(sk->sk_mark); + fl6.flowi6_mark = ipc6.sockc.mark; fl6.flowi6_uid = sk->sk_uid; - ipcm6_init(&ipc6); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = fl6.flowi6_mark; - ipc6.sockc.priority = READ_ONCE(sk->sk_priority); - if (sin6) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; @@ -891,9 +888,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); @@ -904,9 +898,6 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.dontfrag < 0) - ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); - if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 78362822b907..ef2d23a1e3d5 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3196,13 +3196,18 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) { struct net_device *dev = dst->dev; unsigned int mtu = dst_mtu(dst); - struct net *net = dev_net(dev); + struct net *net; mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + rcu_read_lock(); + + net = dev_net_rcu(dev); if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) mtu = net->ipv6.sysctl.ip6_rt_min_advmss; + rcu_read_unlock(); + /* * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index 7ba22d2f2bfe..7c05ac846646 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -232,13 +232,15 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; - dst_release(dst); goto drop; } - local_bh_disable(); - dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); - local_bh_enable(); + /* cache only if we don't create a dst reference loop */ + if (orig_dst->lwtstate != dst->lwtstate) { + local_bh_disable(); + dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr); + local_bh_enable(); + } err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) @@ -251,6 +253,7 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) return dst_output(net, sk, skb); drop: + dst_release(dst); kfree_skb(skb); return err; } @@ -259,23 +262,35 @@ static int rpl_input(struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; + struct lwtunnel_state *lwtst; struct rpl_lwt *rlwt; int err; - rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate); + /* We cannot dereference "orig_dst" once ip6_route_input() or + * skb_dst_drop() is called. However, in order to detect a dst loop, we + * need the address of its lwtstate. So, save the address of lwtstate + * now and use it later as a comparison. + */ + lwtst = orig_dst->lwtstate; + + rlwt = rpl_lwt_lwtunnel(lwtst); local_bh_disable(); dst = dst_cache_get(&rlwt->cache); local_bh_enable(); err = rpl_do_srh(skb, rlwt, dst); - if (unlikely(err)) + if (unlikely(err)) { + dst_release(dst); goto drop; + } if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); - if (!dst->error) { + + /* cache only if we don't create a dst reference loop */ + if (!dst->error && lwtst != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&rlwt->cache, dst, &ipv6_hdr(skb)->saddr); diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 4bf937bfc263..51583461ae29 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -472,23 +472,35 @@ static int seg6_input_core(struct net *net, struct sock *sk, { struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; + struct lwtunnel_state *lwtst; struct seg6_lwt *slwt; int err; - slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); + /* We cannot dereference "orig_dst" once ip6_route_input() or + * skb_dst_drop() is called. However, in order to detect a dst loop, we + * need the address of its lwtstate. So, save the address of lwtstate + * now and use it later as a comparison. + */ + lwtst = orig_dst->lwtstate; + + slwt = seg6_lwt_lwtunnel(lwtst); local_bh_disable(); dst = dst_cache_get(&slwt->cache); local_bh_enable(); err = seg6_do_srh(skb, dst); - if (unlikely(err)) + if (unlikely(err)) { + dst_release(dst); goto drop; + } if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); - if (!dst->error) { + + /* cache only if we don't create a dst reference loop */ + if (!dst->error && lwtst != dst->lwtstate) { local_bh_disable(); dst_cache_set_ip6(&slwt->cache, dst, &ipv6_hdr(skb)->saddr); @@ -571,13 +583,15 @@ static int seg6_output_core(struct net *net, struct sock *sk, dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = dst->error; - dst_release(dst); goto drop; } - local_bh_disable(); - dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); - local_bh_enable(); + /* cache only if we don't create a dst reference loop */ + if (orig_dst->lwtstate != dst->lwtstate) { + local_bh_disable(); + dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); + local_bh_enable(); + } err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); if (unlikely(err)) @@ -593,6 +607,7 @@ static int seg6_output_core(struct net *net, struct sock *sk, return dst_output(net, sk, skb); drop: + dst_release(dst); kfree_skb(skb); return err; } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 39bd8951bfca..6f04703fe638 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -201,8 +201,7 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) static int ipip6_tunnel_create(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct sit_net *sitn = net_generic(net, sit_net_id); + struct sit_net *sitn = net_generic(t->net, sit_net_id); int err; __dev_addr_set(dev, &t->parms.iph.saddr, 4); @@ -269,6 +268,7 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, nt = netdev_priv(dev); + nt->net = net; nt->parms = *parms; if (ipip6_tunnel_create(dev) < 0) goto failed_free; @@ -1449,7 +1449,6 @@ static int ipip6_tunnel_init(struct net_device *dev) int err; tunnel->dev = dev; - tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); ipip6_tunnel_bind_dev(dev); @@ -1550,19 +1549,23 @@ static bool ipip6_netlink_6rd_parms(struct nlattr *data[], } #endif -static int ipip6_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], +static int ipip6_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { - struct net *net = dev_net(dev); + struct nlattr **data = params->data; + struct nlattr **tb = params->tb; struct ip_tunnel *nt; struct ip_tunnel_encap ipencap; #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; #endif + struct net *net; int err; + net = params->link_net ? : dev_net(dev); nt = netdev_priv(dev); + nt->net = net; if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip_tunnel_encap_setup(nt, &ipencap); @@ -1856,7 +1859,10 @@ static int __net_init sit_init_net(struct net *net) /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. */ - sitn->fb_tunnel_dev->netns_local = true; + sitn->fb_tunnel_dev->netns_immutable = true; + + t = netdev_priv(sitn->fb_tunnel_dev); + t->net = net; err = register_netdev(sitn->fb_tunnel_dev); if (err) @@ -1865,8 +1871,6 @@ static int __net_init sit_init_net(struct net *net) ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn); ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); - t = netdev_priv(sitn->fb_tunnel_dev); - strcpy(t->parms.name, sitn->fb_tunnel_dev->name); return 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2debdf085a3b..85c4820bfe15 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -376,7 +376,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); struct request_sock *fastopen; struct ipv6_pinfo *np; struct tcp_sock *tp; @@ -798,6 +798,8 @@ static void tcp_v6_init_req(struct request_sock *req, ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + ireq->ir_rmt_addr = LOOPBACK4_IPV6; + ireq->ir_loc_addr = LOOPBACK4_IPV6; /* So that link locals have meaning */ if ((!sk_listener->sk_bound_dev_if || l3_slave) && @@ -864,16 +866,16 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 int oif, int rst, u8 tclass, __be32 label, u32 priority, u32 txhash, struct tcp_key *key) { - const struct tcphdr *th = tcp_hdr(skb); - struct tcphdr *t1; - struct sk_buff *buff; - struct flowi6 fl6; - struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); - struct sock *ctl_sk = net->ipv6.tcp_sk; + struct net *net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); unsigned int tot_len = sizeof(struct tcphdr); + struct sock *ctl_sk = net->ipv6.tcp_sk; + const struct tcphdr *th = tcp_hdr(skb); __be32 mrst = 0, *topt; struct dst_entry *dst; - __u32 mark = 0; + struct sk_buff *buff; + struct tcphdr *t1; + struct flowi6 fl6; + u32 mark = 0; if (tsecr) tot_len += TCPOLEN_TSTAMP_ALIGNED; @@ -1039,7 +1041,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, if (!sk && !ipv6_unicast_destination(skb)) return; - net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); + net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(th, &md5_hash_location, &aoh)) return; @@ -1277,7 +1279,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->rcv_nxt, tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), - READ_ONCE(req->ts_recent), sk->sk_bound_dev_if, + req->ts_recent, sk->sk_bound_dev_if, &key, ipv6_get_dsfield(ipv6_hdr(skb)), 0, READ_ONCE(sk->sk_priority), READ_ONCE(tcp_rsk(req)->txhash)); @@ -1451,10 +1453,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * ip6_dst_store(newsk, dst, NULL, NULL); - newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; newnp->saddr = ireq->ir_v6_loc_addr; - newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; - newsk->sk_bound_dev_if = ireq->ir_iif; /* Now IPv6 options... @@ -1507,9 +1506,6 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * tcp_initialize_rcv_mss(newsk); - newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; - newinet->inet_rcv_saddr = LOOPBACK4_IPV6; - #ifdef CONFIG_TCP_MD5SIG l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); @@ -1744,6 +1740,7 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { + struct net *net = dev_net_rcu(skb->dev); enum skb_drop_reason drop_reason; int sdif = inet6_sdif(skb); int dif = inet6_iif(skb); @@ -1753,7 +1750,6 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) bool refcounted; int ret; u32 isn; - struct net *net = dev_net(skb->dev); drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) @@ -1832,7 +1828,8 @@ lookup: th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); - nsk = tcp_check_req(sk, skb, req, false, &req_stolen); + nsk = tcp_check_req(sk, skb, req, false, &req_stolen, + &drop_reason); } else { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } @@ -2004,7 +2001,7 @@ do_time_wait: void tcp_v6_early_demux(struct sk_buff *skb) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); const struct ipv6hdr *hdr; const struct tcphdr *th; struct sock *sk; diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index a45bf17cb2a1..91b88daa5b55 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -35,7 +35,7 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, inet6_get_iif_sdif(skb, &iif, &sdif); hdr = skb_gro_network_header(skb); - net = dev_net(skb->dev); + net = dev_net_rcu(skb->dev); sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6671daa67f4f..3a0d6c5a8286 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1389,9 +1389,9 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); - if (hlen + cork->gso_size > cork->fragsize) { + if (hlen + min(datalen, cork->gso_size) > cork->fragsize) { kfree_skb(skb); - return -EINVAL; + return -EMSGSIZE; } if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); @@ -1494,11 +1494,8 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int is_udplite = IS_UDPLITE(sk); int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); - ipcm6_init(&ipc6); + ipcm6_init_sk(&ipc6, sk); ipc6.gso_size = READ_ONCE(up->gso_size); - ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); - ipc6.sockc.mark = READ_ONCE(sk->sk_mark); - ipc6.sockc.priority = READ_ONCE(sk->sk_priority); /* destination address check */ if (sin6) { @@ -1704,9 +1701,6 @@ do_udp_sendmsg: security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6->flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6->flowlabel); dst = ip6_sk_dst_lookup_flow(sk, fl6, final_p, connected); @@ -1752,8 +1746,6 @@ back_from_confirm: WRITE_ONCE(up->pending, AF_INET6); do_append_data: - if (ipc6.dontfrag < 0) - ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); up->len += ulen; err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr), &ipc6, fl6, dst_rt6_info(dst), diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index b41152dd4246..404212dfc99a 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -117,7 +117,7 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport) { const struct ipv6hdr *iph = skb_gro_network_header(skb); - struct net *net = dev_net(skb->dev); + struct net *net = dev_net_rcu(skb->dev); int iif, sdif; inet6_get_iif_sdif(skb, &iif, &sdif); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 5f7b1fdbffe6..b3d5d1f266ee 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -82,14 +82,14 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) toobig = skb->len > mtu && !skb_is_gso(skb); - if (toobig && xfrm6_local_dontfrag(skb->sk)) { + if (toobig && xfrm6_local_dontfrag(sk)) { xfrm6_local_rxpmtu(skb, mtu); kfree_skb(skb); return -EMSGSIZE; } else if (toobig && xfrm6_noneed_fragment(skb)) { skb->ignore_df = 1; goto skip_frag; - } else if (!skb->ignore_df && toobig && skb->sk) { + } else if (!skb->ignore_df && toobig && sk) { xfrm_local_error(skb, mtu); kfree_skb(skb); return -EMSGSIZE; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index f4c1da070826..b98d13584c81 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -547,7 +547,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.flowi6_mark = READ_ONCE(sk->sk_mark); fl6.flowi6_uid = sk->sk_uid; - ipcm6_init(&ipc6); + ipcm6_init_sk(&ipc6, sk); if (lsa) { if (addr_len < SIN6_LEN_RFC2133) @@ -634,9 +634,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); - if (ipc6.tclass < 0) - ipc6.tclass = np->tclass; - fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); @@ -648,9 +645,6 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); - if (ipc6.dontfrag < 0) - ipc6.dontfrag = inet6_test_bit(DONTFRAG, sk); - if (msg->msg_flags & MSG_CONFIRM) goto do_confirm; diff --git a/net/llc/sysctl_net_llc.c b/net/llc/sysctl_net_llc.c index 72e101135f8c..c8d88e2508fc 100644 --- a/net/llc/sysctl_net_llc.c +++ b/net/llc/sysctl_net_llc.c @@ -11,10 +11,6 @@ #include <net/net_namespace.h> #include <net/llc.h> -#ifndef CONFIG_SYSCTL -#error This file should not be compiled without CONFIG_SYSCTL defined -#endif - static struct ctl_table llc2_timeout_table[] = { { .procname = "ack", diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index e7687a7b1683..54c479910d05 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -1025,16 +1025,7 @@ void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata) void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata) { - struct dentry *dir; - char buf[10 + IFNAMSIZ]; - - dir = sdata->vif.debugfs_dir; - - if (IS_ERR_OR_NULL(dir)) - return; - - sprintf(buf, "netdev:%s", sdata->name); - debugfs_rename(dir->d_parent, dir, dir->d_parent, buf); + debugfs_change_name(sdata->vif.debugfs_dir, "netdev:%s", sdata->name); } void ieee80211_debugfs_recreate_netdev(struct ieee80211_sub_if_data *sdata, diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 3999e0ba2c35..be6c0237e10b 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -401,26 +401,30 @@ void mptcp_active_enable(struct sock *sk) void mptcp_active_detect_blackhole(struct sock *ssk, bool expired) { struct mptcp_subflow_context *subflow; + u8 timeouts, to_max; + struct net *net; - if (!sk_is_mptcp(ssk)) + /* Only check MPTCP SYN ... */ + if (likely(!sk_is_mptcp(ssk) || ssk->sk_state != TCP_SYN_SENT)) return; subflow = mptcp_subflow_ctx(ssk); - if (subflow->request_mptcp && ssk->sk_state == TCP_SYN_SENT) { - struct net *net = sock_net(ssk); - u8 timeouts, to_max; + /* ... + MP_CAPABLE */ + if (!subflow->request_mptcp) { + /* Mark as blackhole iif the 1st non-MPTCP SYN is accepted */ + subflow->mpc_drop = 0; + return; + } - timeouts = inet_csk(ssk)->icsk_retransmits; - to_max = mptcp_get_pernet(net)->syn_retrans_before_tcp_fallback; + net = sock_net(ssk); + timeouts = inet_csk(ssk)->icsk_retransmits; + to_max = mptcp_get_pernet(net)->syn_retrans_before_tcp_fallback; - if (timeouts == to_max || (timeouts < to_max && expired)) { - MPTCP_INC_STATS(net, MPTCP_MIB_MPCAPABLEACTIVEDROP); - subflow->mpc_drop = 1; - mptcp_subflow_early_fallback(mptcp_sk(subflow->conn), subflow); - } else { - subflow->mpc_drop = 0; - } + if (timeouts == to_max || (timeouts < to_max && expired)) { + MPTCP_INC_STATS(net, MPTCP_MIB_MPCAPABLEACTIVEDROP); + subflow->mpc_drop = 1; + mptcp_subflow_early_fallback(mptcp_sk(subflow->conn), subflow); } } diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c index a29ff901df75..b9e451197902 100644 --- a/net/mptcp/fastopen.c +++ b/net/mptcp/fastopen.c @@ -40,17 +40,17 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf tp->copied_seq += skb->len; subflow->ssn_offset += skb->len; - /* initialize a dummy sequence number, we will update it at MPC - * completion, if needed - */ + /* Only the sequence delta is relevant */ MPTCP_SKB_CB(skb)->map_seq = -skb->len; MPTCP_SKB_CB(skb)->end_seq = 0; MPTCP_SKB_CB(skb)->offset = 0; MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; + MPTCP_SKB_CB(skb)->cant_coalesce = 1; mptcp_data_lock(sk); + DEBUG_NET_WARN_ON_ONCE(sock_owned_by_user_nocheck(sk)); - mptcp_set_owner_r(skb, sk); + skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); mptcp_sk(sk)->bytes_received += skb->len; @@ -58,22 +58,3 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf mptcp_data_unlock(sk); } - -void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, - const struct mptcp_options_received *mp_opt) -{ - struct sock *sk = (struct sock *)msk; - struct sk_buff *skb; - - skb = skb_peek_tail(&sk->sk_receive_queue); - if (skb) { - WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq); - pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx\n", sk, - MPTCP_SKB_CB(skb)->map_seq, MPTCP_SKB_CB(skb)->map_seq + msk->ack_seq, - MPTCP_SKB_CB(skb)->end_seq, MPTCP_SKB_CB(skb)->end_seq + msk->ack_seq); - MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq; - MPTCP_SKB_CB(skb)->end_seq += msk->ack_seq; - } - - pr_debug("msk=%p ack_seq=%llx\n", msk, msk->ack_seq); -} diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 123f3f297284..fd2de185bc93 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -108,7 +108,6 @@ static void mptcp_parse_option(const struct sk_buff *skb, mp_opt->suboptions |= OPTION_MPTCP_DSS; mp_opt->use_map = 1; mp_opt->mpc_map = 1; - mp_opt->use_ack = 0; mp_opt->data_len = get_unaligned_be16(ptr); ptr += 2; } @@ -157,11 +156,6 @@ static void mptcp_parse_option(const struct sk_buff *skb, pr_debug("DSS\n"); ptr++; - /* we must clear 'mpc_map' be able to detect MP_CAPABLE - * map vs DSS map in mptcp_incoming_options(), and reconstruct - * map info accordingly - */ - mp_opt->mpc_map = 0; flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; @@ -369,8 +363,11 @@ void mptcp_get_options(const struct sk_buff *skb, const unsigned char *ptr; int length; - /* initialize option status */ - mp_opt->suboptions = 0; + /* Ensure that casting the whole status to u32 is efficient and safe */ + BUILD_BUG_ON(sizeof_field(struct mptcp_options_received, status) != sizeof(u32)); + BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct mptcp_options_received, status), + sizeof(u32))); + *(u32 *)&mp_opt->status = 0; length = (th->doff * 4) - sizeof(struct tcphdr); ptr = (const unsigned char *)(th + 1); diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 16c336c51940..16cacce6c10f 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -10,6 +10,7 @@ #include "protocol.h" #include "mib.h" +#include "mptcp_pm_gen.h" /* path manager command handlers */ @@ -433,14 +434,83 @@ bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc) return mptcp_pm_nl_is_backup(msk, &skc_local); } -int mptcp_pm_get_addr(struct sk_buff *skb, struct genl_info *info) +static int mptcp_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, + struct genl_info *info) { if (info->attrs[MPTCP_PM_ATTR_TOKEN]) - return mptcp_userspace_pm_get_addr(skb, info); - return mptcp_pm_nl_get_addr(skb, info); + return mptcp_userspace_pm_get_addr(id, addr, info); + return mptcp_pm_nl_get_addr(id, addr, info); } -int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) +int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct mptcp_pm_addr_entry addr; + struct nlattr *attr; + struct sk_buff *msg; + void *reply; + int ret; + + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) + return -EINVAL; + + attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; + ret = mptcp_pm_parse_entry(attr, info, false, &addr); + if (ret < 0) + return ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, + info->genlhdr->cmd); + if (!reply) { + GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); + ret = -EMSGSIZE; + goto fail; + } + + ret = mptcp_pm_get_addr(addr.addr.id, &addr, info); + if (ret) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found"); + goto fail; + } + + ret = mptcp_nl_fill_addr(msg, &addr); + if (ret) + goto fail; + + genlmsg_end(msg, reply); + ret = genlmsg_reply(msg, info); + return ret; + +fail: + nlmsg_free(msg); + return ret; +} + +int mptcp_pm_genl_fill_addr(struct sk_buff *msg, + struct netlink_callback *cb, + struct mptcp_pm_addr_entry *entry) +{ + void *hdr; + + hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &mptcp_genl_family, + NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); + if (!hdr) + return -EINVAL; + + if (mptcp_nl_fill_addr(msg, entry) < 0) { + genlmsg_cancel(msg, hdr); + return -EINVAL; + } + + genlmsg_end(msg, hdr); + return 0; +} + +static int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) { const struct genl_info *info = genl_info_dump(cb); @@ -449,11 +519,34 @@ int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) return mptcp_pm_nl_dump_addr(msg, cb); } -int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info) +int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + return mptcp_pm_dump_addr(msg, cb); +} + +static int mptcp_pm_set_flags(struct genl_info *info) { + struct mptcp_pm_addr_entry loc = { .addr = { .family = AF_UNSPEC }, }; + struct nlattr *attr_loc; + int ret = -EINVAL; + + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR)) + return ret; + + attr_loc = info->attrs[MPTCP_PM_ATTR_ADDR]; + ret = mptcp_pm_parse_entry(attr_loc, info, false, &loc); + if (ret < 0) + return ret; + if (info->attrs[MPTCP_PM_ATTR_TOKEN]) - return mptcp_userspace_pm_set_flags(skb, info); - return mptcp_pm_nl_set_flags(skb, info); + return mptcp_userspace_pm_set_flags(&loc, info); + return mptcp_pm_nl_set_flags(&loc, info); +} + +int mptcp_pm_nl_set_flags_doit(struct sk_buff *skb, struct genl_info *info) +{ + return mptcp_pm_set_flags(info); } void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 98ac73938bd8..d4328443d844 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -64,7 +64,7 @@ bool mptcp_addresses_equal(const struct mptcp_addr_info *a, addr_equals = a->addr.s_addr == b->addr.s_addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else - addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6); + addr_equals = ipv6_addr_equal(&a->addr6, &b->addr6); } else if (a->family == AF_INET) { if (ipv6_addr_v4mapped(&b->addr6)) addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3]; @@ -1393,28 +1393,35 @@ static bool mptcp_pm_has_addr_attr_id(const struct nlattr *attr, int mptcp_pm_nl_add_addr_doit(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; + struct nlattr *attr; int ret; + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) + return -EINVAL; + + attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; ret = mptcp_pm_parse_entry(attr, info, true, &addr); if (ret < 0) return ret; if (addr.addr.port && !address_use_port(&addr)) { - GENL_SET_ERR_MSG(info, "flags must have signal and not subflow when using port"); + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "flags must have signal and not subflow when using port"); return -EINVAL; } if (addr.flags & MPTCP_PM_ADDR_FLAG_SIGNAL && addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { - GENL_SET_ERR_MSG(info, "flags mustn't have both signal and fullmesh"); + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "flags mustn't have both signal and fullmesh"); return -EINVAL; } if (addr.flags & MPTCP_PM_ADDR_FLAG_IMPLICIT) { - GENL_SET_ERR_MSG(info, "can't create IMPLICIT endpoint"); + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "can't create IMPLICIT endpoint"); return -EINVAL; } @@ -1514,11 +1521,6 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, if (mptcp_pm_is_userspace(msk)) goto next; - if (list_empty(&msk->conn_list)) { - mptcp_pm_remove_anno_addr(msk, addr, false); - goto next; - } - lock_sock(sk); remove_subflow = mptcp_lookup_subflow_by_saddr(&msk->conn_list, addr); mptcp_pm_remove_anno_addr(msk, addr, remove_subflow && @@ -1587,12 +1589,16 @@ next: int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; unsigned int addr_max; + struct nlattr *attr; int ret; + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) + return -EINVAL; + + attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; ret = mptcp_pm_parse_entry(attr, info, false, &addr); if (ret < 0) return ret; @@ -1608,7 +1614,7 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) spin_lock_bh(&pernet->lock); entry = __lookup_addr_by_id(pernet, addr.addr.id); if (!entry) { - GENL_SET_ERR_MSG(info, "address not found"); + NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found"); spin_unlock_bh(&pernet->lock); return -EINVAL; } @@ -1762,61 +1768,24 @@ nla_put_failure: return -EMSGSIZE; } -int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info) +int mptcp_pm_nl_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, + struct genl_info *info) { - struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; struct pm_nl_pernet *pernet = genl_info_pm_nl(info); - struct mptcp_pm_addr_entry addr, *entry; - struct sk_buff *msg; - void *reply; - int ret; - - ret = mptcp_pm_parse_entry(attr, info, false, &addr); - if (ret < 0) - return ret; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, - info->genlhdr->cmd); - if (!reply) { - GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); - ret = -EMSGSIZE; - goto fail; - } + struct mptcp_pm_addr_entry *entry; + int ret = -EINVAL; rcu_read_lock(); - entry = __lookup_addr_by_id(pernet, addr.addr.id); - if (!entry) { - GENL_SET_ERR_MSG(info, "address not found"); - ret = -EINVAL; - goto unlock_fail; + entry = __lookup_addr_by_id(pernet, id); + if (entry) { + *addr = *entry; + ret = 0; } - - ret = mptcp_nl_fill_addr(msg, entry); - if (ret) - goto unlock_fail; - - genlmsg_end(msg, reply); - ret = genlmsg_reply(msg, info); rcu_read_unlock(); - return ret; -unlock_fail: - rcu_read_unlock(); - -fail: - nlmsg_free(msg); return ret; } -int mptcp_pm_nl_get_addr_doit(struct sk_buff *skb, struct genl_info *info) -{ - return mptcp_pm_get_addr(skb, info); -} - int mptcp_pm_nl_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) { @@ -1824,7 +1793,6 @@ int mptcp_pm_nl_dump_addr(struct sk_buff *msg, struct mptcp_pm_addr_entry *entry; struct pm_nl_pernet *pernet; int id = cb->args[0]; - void *hdr; int i; pernet = pm_nl_get_pernet(net); @@ -1839,19 +1807,10 @@ int mptcp_pm_nl_dump_addr(struct sk_buff *msg, if (entry->addr.id <= id) continue; - hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, &mptcp_genl_family, - NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); - if (!hdr) - break; - - if (mptcp_nl_fill_addr(msg, entry) < 0) { - genlmsg_cancel(msg, hdr); + if (mptcp_pm_genl_fill_addr(msg, cb, entry) < 0) break; - } id = entry->addr.id; - genlmsg_end(msg, hdr); } } rcu_read_unlock(); @@ -1860,12 +1819,6 @@ int mptcp_pm_nl_dump_addr(struct sk_buff *msg, return msg->len; } -int mptcp_pm_nl_get_addr_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - return mptcp_pm_dump_addr(msg, cb); -} - static int parse_limit(struct genl_info *info, int id, unsigned int *limit) { struct nlattr *attr = info->attrs[id]; @@ -1875,7 +1828,9 @@ static int parse_limit(struct genl_info *info, int id, unsigned int *limit) *limit = nla_get_u32(attr); if (*limit > MPTCP_PM_ADDR_MAX) { - GENL_SET_ERR_MSG(info, "limit greater than maximum"); + NL_SET_ERR_MSG_ATTR_FMT(info->extack, attr, + "limit greater than maximum (%u)", + MPTCP_PM_ADDR_MAX); return -EINVAL; } return 0; @@ -1952,13 +1907,16 @@ static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, spin_unlock_bh(&msk->pm.lock); } -static int mptcp_nl_set_flags(struct net *net, - struct mptcp_addr_info *addr, - u8 bkup, u8 changed) +static void mptcp_nl_set_flags(struct net *net, struct mptcp_addr_info *addr, + u8 flags, u8 changed) { + u8 is_subflow = !!(flags & MPTCP_PM_ADDR_FLAG_SUBFLOW); + u8 bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); long s_slot = 0, s_num = 0; struct mptcp_sock *msk; - int ret = -EINVAL; + + if (changed == MPTCP_PM_ADDR_FLAG_FULLMESH && !is_subflow) + return; while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; @@ -1968,8 +1926,9 @@ static int mptcp_nl_set_flags(struct net *net, lock_sock(sk); if (changed & MPTCP_PM_ADDR_FLAG_BACKUP) - ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, NULL, bkup); - if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH) + mptcp_pm_nl_mp_prio_send_ack(msk, addr, NULL, bkup); + /* Subflows will only be recreated if the SUBFLOW flag is set */ + if (is_subflow && (changed & MPTCP_PM_ADDR_FLAG_FULLMESH)) mptcp_pm_nl_fullmesh(msk, addr); release_sock(sk); @@ -1978,68 +1937,56 @@ next: cond_resched(); } - return ret; + return; } -int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info) +int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, + struct genl_info *info) { - struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }; struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; u8 changed, mask = MPTCP_PM_ADDR_FLAG_BACKUP | MPTCP_PM_ADDR_FLAG_FULLMESH; - struct net *net = sock_net(skb->sk); + struct net *net = genl_info_net(info); struct mptcp_pm_addr_entry *entry; struct pm_nl_pernet *pernet; u8 lookup_by_id = 0; - u8 bkup = 0; - int ret; pernet = pm_nl_get_pernet(net); - ret = mptcp_pm_parse_entry(attr, info, false, &addr); - if (ret < 0) - return ret; - - if (addr.addr.family == AF_UNSPEC) { + if (local->addr.family == AF_UNSPEC) { lookup_by_id = 1; - if (!addr.addr.id) { - GENL_SET_ERR_MSG(info, "missing required inputs"); + if (!local->addr.id) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "missing address ID"); return -EOPNOTSUPP; } } - if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) - bkup = 1; - spin_lock_bh(&pernet->lock); - entry = lookup_by_id ? __lookup_addr_by_id(pernet, addr.addr.id) : - __lookup_addr(pernet, &addr.addr); + entry = lookup_by_id ? __lookup_addr_by_id(pernet, local->addr.id) : + __lookup_addr(pernet, &local->addr); if (!entry) { spin_unlock_bh(&pernet->lock); - GENL_SET_ERR_MSG(info, "address not found"); + NL_SET_ERR_MSG_ATTR(info->extack, attr, "address not found"); return -EINVAL; } - if ((addr.flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && - (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { + if ((local->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) && + (entry->flags & (MPTCP_PM_ADDR_FLAG_SIGNAL | + MPTCP_PM_ADDR_FLAG_IMPLICIT))) { spin_unlock_bh(&pernet->lock); - GENL_SET_ERR_MSG(info, "invalid addr flags"); + NL_SET_ERR_MSG_ATTR(info->extack, attr, "invalid addr flags"); return -EINVAL; } - changed = (addr.flags ^ entry->flags) & mask; - entry->flags = (entry->flags & ~mask) | (addr.flags & mask); - addr = *entry; + changed = (local->flags ^ entry->flags) & mask; + entry->flags = (entry->flags & ~mask) | (local->flags & mask); + *local = *entry; spin_unlock_bh(&pernet->lock); - mptcp_nl_set_flags(net, &addr.addr, bkup, changed); + mptcp_nl_set_flags(net, &local->addr, entry->flags, changed); return 0; } -int mptcp_pm_nl_set_flags_doit(struct sk_buff *skb, struct genl_info *info) -{ - return mptcp_pm_set_flags(skb, info); -} - static void mptcp_nl_mcast_send(struct net *net, struct sk_buff *nlskb, gfp_t gfp) { genlmsg_multicast_netns(&mptcp_genl_family, net, @@ -2070,9 +2017,7 @@ static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk) break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { - const struct ipv6_pinfo *np = inet6_sk(ssk); - - if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) + if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &issk->pinet6->saddr)) return -EMSGSIZE; if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &ssk->sk_v6_daddr)) return -EMSGSIZE; @@ -2299,9 +2244,7 @@ void mptcp_event_pm_listener(const struct sock *ssk, break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { - const struct ipv6_pinfo *np = inet6_sk(ssk); - - if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) + if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &issk->pinet6->saddr)) goto nla_put_failure; break; } diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index a3d477059b11..7e7d01bef5d4 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -48,7 +48,6 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, bool needs_id) { DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); - struct mptcp_pm_addr_entry *match = NULL; struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *e; bool addr_match = false; @@ -63,26 +62,21 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, if (addr_match && entry->addr.id == 0 && needs_id) entry->addr.id = e->addr.id; id_match = (e->addr.id == entry->addr.id); - if (addr_match && id_match) { - match = e; + if (addr_match || id_match) break; - } else if (addr_match || id_match) { - break; - } __set_bit(e->addr.id, id_bitmap); } - if (!match && !addr_match && !id_match) { + if (!addr_match && !id_match) { /* Memory for the entry is allocated from the * sock option buffer. */ - e = sock_kmalloc(sk, sizeof(*e), GFP_ATOMIC); + e = sock_kmemdup(sk, entry, sizeof(*entry), GFP_ATOMIC); if (!e) { ret = -ENOMEM; goto append_err; } - *e = *entry; if (!e->addr.id && needs_id) e->addr.id = find_next_zero_bit(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, @@ -90,7 +84,7 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, list_add_tail_rcu(&e->list, &msk->pm.userspace_pm_local_addr_list); msk->pm.local_addr_used++; ret = e->addr.id; - } else if (match) { + } else if (addr_match && id_match) { ret = entry->addr.id; } @@ -175,14 +169,13 @@ bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, static struct mptcp_sock *mptcp_userspace_pm_get_sock(const struct genl_info *info) { - struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct mptcp_sock *msk; + struct nlattr *token; - if (!token) { - GENL_SET_ERR_MSG(info, "missing required token"); + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_TOKEN)) return NULL; - } + token = info->attrs[MPTCP_PM_ATTR_TOKEN]; msk = mptcp_token_get_sock(genl_info_net(info), nla_get_u32(token)); if (!msk) { NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token"); @@ -190,7 +183,8 @@ static struct mptcp_sock *mptcp_userspace_pm_get_sock(const struct genl_info *in } if (!mptcp_pm_is_userspace(msk)) { - GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); + NL_SET_ERR_MSG_ATTR(info->extack, token, + "userspace PM not selected"); sock_put((struct sock *)msk); return NULL; } @@ -200,16 +194,14 @@ static struct mptcp_sock *mptcp_userspace_pm_get_sock(const struct genl_info *in int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *addr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry addr_val; struct mptcp_sock *msk; + struct nlattr *addr; int err = -EINVAL; struct sock *sk; - if (!addr) { - GENL_SET_ERR_MSG(info, "missing required address"); + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR)) return err; - } msk = mptcp_userspace_pm_get_sock(info); if (!msk) @@ -217,21 +209,27 @@ int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) sk = (struct sock *)msk; + addr = info->attrs[MPTCP_PM_ATTR_ADDR]; err = mptcp_pm_parse_entry(addr, info, true, &addr_val); - if (err < 0) { - GENL_SET_ERR_MSG(info, "error parsing local address"); + if (err < 0) + goto announce_err; + + if (addr_val.addr.id == 0) { + NL_SET_ERR_MSG_ATTR(info->extack, addr, "invalid addr id"); + err = -EINVAL; goto announce_err; } - if (addr_val.addr.id == 0 || !(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { - GENL_SET_ERR_MSG(info, "invalid addr id or flags"); + if (!(addr_val.flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) { + NL_SET_ERR_MSG_ATTR(info->extack, addr, "invalid addr flags"); err = -EINVAL; goto announce_err; } err = mptcp_userspace_pm_append_new_local_addr(msk, &addr_val, false); if (err < 0) { - GENL_SET_ERR_MSG(info, "did not match address and id"); + NL_SET_ERR_MSG_ATTR(info->extack, addr, + "did not match address and id"); goto announce_err; } @@ -253,8 +251,7 @@ int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) return err; } -static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk, - struct genl_info *info) +static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk) { struct mptcp_rm_list list = { .nr = 0 }; struct mptcp_subflow_context *subflow; @@ -269,10 +266,8 @@ static int mptcp_userspace_pm_remove_id_zero_address(struct mptcp_sock *msk, break; } } - if (!has_id_0) { - GENL_SET_ERR_MSG(info, "address with id 0 not found"); + if (!has_id_0) goto remove_err; - } list.ids[list.nr++] = 0; @@ -309,18 +304,17 @@ void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk, int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; struct mptcp_pm_addr_entry *match; struct mptcp_sock *msk; + struct nlattr *id; int err = -EINVAL; struct sock *sk; u8 id_val; - if (!id) { - GENL_SET_ERR_MSG(info, "missing required ID"); + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_LOC_ID)) return err; - } + id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; id_val = nla_get_u8(id); msk = mptcp_userspace_pm_get_sock(info); @@ -330,7 +324,7 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) sk = (struct sock *)msk; if (id_val == 0) { - err = mptcp_userspace_pm_remove_id_zero_address(msk, info); + err = mptcp_userspace_pm_remove_id_zero_address(msk); goto out; } @@ -339,7 +333,6 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) spin_lock_bh(&msk->pm.lock); match = mptcp_userspace_pm_lookup_addr_by_id(msk, id_val); if (!match) { - GENL_SET_ERR_MSG(info, "address with specified id not found"); spin_unlock_bh(&msk->pm.lock); release_sock(sk); goto out; @@ -356,25 +349,28 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) err = 0; out: + if (err) + NL_SET_ERR_MSG_ATTR_FMT(info->extack, id, + "address with id %u not found", + id_val); + sock_put(sk); return err; } int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; - struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry entry = { 0 }; struct mptcp_addr_info addr_r; + struct nlattr *raddr, *laddr; struct mptcp_pm_local local; struct mptcp_sock *msk; int err = -EINVAL; struct sock *sk; - if (!laddr || !raddr) { - GENL_SET_ERR_MSG(info, "missing required address(es)"); + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR) || + GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR_REMOTE)) return err; - } msk = mptcp_userspace_pm_get_sock(info); if (!msk) @@ -382,24 +378,22 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) sk = (struct sock *)msk; + laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; err = mptcp_pm_parse_entry(laddr, info, true, &entry); - if (err < 0) { - NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); + if (err < 0) goto create_err; - } if (entry.flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - GENL_SET_ERR_MSG(info, "invalid addr flags"); + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "invalid addr flags"); err = -EINVAL; goto create_err; } entry.flags |= MPTCP_PM_ADDR_FLAG_SUBFLOW; + raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; err = mptcp_pm_parse_addr(raddr, info, &addr_r); - if (err < 0) { - NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); + if (err < 0) goto create_err; - } if (!mptcp_pm_addr_families_match(sk, &entry.addr, &addr_r)) { GENL_SET_ERR_MSG(info, "families mismatch"); @@ -409,7 +403,8 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) err = mptcp_userspace_pm_append_new_local_addr(msk, &entry, false); if (err < 0) { - GENL_SET_ERR_MSG(info, "did not match address and id"); + NL_SET_ERR_MSG_ATTR(info->extack, laddr, + "did not match address and id"); goto create_err; } @@ -421,6 +416,9 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) err = __mptcp_subflow_connect(sk, &local, &addr_r); release_sock(sk); + if (err) + GENL_SET_ERR_MSG_FMT(info, "connect error: %d", err); + spin_lock_bh(&msk->pm.lock); if (err) mptcp_userspace_pm_delete_local_addr(msk, &entry); @@ -461,9 +459,7 @@ static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { - const struct ipv6_pinfo *pinfo = inet6_sk(ssk); - - if (!ipv6_addr_equal(&local->addr6, &pinfo->saddr) || + if (!ipv6_addr_equal(&local->addr6, &issk->pinet6->saddr) || !ipv6_addr_equal(&remote->addr6, &ssk->sk_v6_daddr)) continue; break; @@ -483,18 +479,16 @@ static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info) { - struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; - struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct mptcp_pm_addr_entry addr_l; struct mptcp_addr_info addr_r; + struct nlattr *raddr, *laddr; struct mptcp_sock *msk; struct sock *sk, *ssk; int err = -EINVAL; - if (!laddr || !raddr) { - GENL_SET_ERR_MSG(info, "missing required address(es)"); + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR) || + GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR_REMOTE)) return err; - } msk = mptcp_userspace_pm_get_sock(info); if (!msk) @@ -502,17 +496,15 @@ int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info sk = (struct sock *)msk; + laddr = info->attrs[MPTCP_PM_ATTR_ADDR]; err = mptcp_pm_parse_entry(laddr, info, true, &addr_l); - if (err < 0) { - NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr"); + if (err < 0) goto destroy_err; - } + raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; err = mptcp_pm_parse_addr(raddr, info, &addr_r); - if (err < 0) { - NL_SET_ERR_MSG_ATTR(info->extack, raddr, "error parsing remote addr"); + if (err < 0) goto destroy_err; - } #if IS_ENABLED(CONFIG_MPTCP_IPV6) if (addr_l.addr.family == AF_INET && ipv6_addr_v4mapped(&addr_r.addr6)) { @@ -530,8 +522,14 @@ int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info goto destroy_err; } - if (!addr_l.addr.port || !addr_r.port) { - GENL_SET_ERR_MSG(info, "missing local or remote port"); + if (!addr_l.addr.port) { + NL_SET_ERR_MSG_ATTR(info->extack, laddr, "missing local port"); + err = -EINVAL; + goto destroy_err; + } + + if (!addr_r.port) { + NL_SET_ERR_MSG_ATTR(info->extack, raddr, "missing remote port"); err = -EINVAL; goto destroy_err; } @@ -539,6 +537,7 @@ int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info lock_sock(sk); ssk = mptcp_nl_find_ssk(msk, &addr_l.addr, &addr_r); if (!ssk) { + GENL_SET_ERR_MSG(info, "subflow not found"); err = -ESRCH; goto release_sock; } @@ -557,46 +556,51 @@ destroy_err: return err; } -int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info) +int mptcp_userspace_pm_set_flags(struct mptcp_pm_addr_entry *local, + struct genl_info *info) { - struct mptcp_pm_addr_entry loc = { .addr = { .family = AF_UNSPEC }, }; - struct mptcp_pm_addr_entry rem = { .addr = { .family = AF_UNSPEC }, }; - struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; - struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct mptcp_addr_info rem = { .family = AF_UNSPEC, }; struct mptcp_pm_addr_entry *entry; + struct nlattr *attr, *attr_rem; struct mptcp_sock *msk; int ret = -EINVAL; struct sock *sk; u8 bkup = 0; + if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ATTR_ADDR_REMOTE)) + return ret; + msk = mptcp_userspace_pm_get_sock(info); if (!msk) return ret; sk = (struct sock *)msk; - ret = mptcp_pm_parse_entry(attr, info, false, &loc); - if (ret < 0) + attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + if (local->addr.family == AF_UNSPEC) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "invalid local address family"); + ret = -EINVAL; goto set_flags_err; - - if (attr_rem) { - ret = mptcp_pm_parse_entry(attr_rem, info, false, &rem); - if (ret < 0) - goto set_flags_err; } - if (loc.addr.family == AF_UNSPEC || - rem.addr.family == AF_UNSPEC) { - GENL_SET_ERR_MSG(info, "invalid address families"); + attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE]; + ret = mptcp_pm_parse_addr(attr_rem, info, &rem); + if (ret < 0) + goto set_flags_err; + + if (rem.family == AF_UNSPEC) { + NL_SET_ERR_MSG_ATTR(info->extack, attr_rem, + "invalid remote address family"); ret = -EINVAL; goto set_flags_err; } - if (loc.flags & MPTCP_PM_ADDR_FLAG_BACKUP) + if (local->flags & MPTCP_PM_ADDR_FLAG_BACKUP) bkup = 1; spin_lock_bh(&msk->pm.lock); - entry = mptcp_userspace_pm_lookup_addr(msk, &loc.addr); + entry = mptcp_userspace_pm_lookup_addr(msk, &local->addr); if (entry) { if (bkup) entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; @@ -606,9 +610,13 @@ int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info) spin_unlock_bh(&msk->pm.lock); lock_sock(sk); - ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc.addr, &rem.addr, bkup); + ret = mptcp_pm_nl_mp_prio_send_ack(msk, &local->addr, &rem, bkup); release_sock(sk); + /* mptcp_pm_nl_mp_prio_send_ack() only fails in one case */ + if (ret < 0) + GENL_SET_ERR_MSG(info, "subflow not found"); + set_flags_err: sock_put(sk); return ret; @@ -625,7 +633,8 @@ int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, struct mptcp_sock *msk; int ret = -EINVAL; struct sock *sk; - void *hdr; + + BUILD_BUG_ON(sizeof(struct id_bitmap) > sizeof(cb->ctx)); bitmap = (struct id_bitmap *)cb->ctx; @@ -641,19 +650,10 @@ int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, if (test_bit(entry->addr.id, bitmap->map)) continue; - hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, &mptcp_genl_family, - NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); - if (!hdr) - break; - - if (mptcp_nl_fill_addr(msg, entry) < 0) { - genlmsg_cancel(msg, hdr); + if (mptcp_pm_genl_fill_addr(msg, cb, entry) < 0) break; - } __set_bit(entry->addr.id, bitmap->map); - genlmsg_end(msg, hdr); } spin_unlock_bh(&msk->pm.lock); release_sock(sk); @@ -663,16 +663,13 @@ int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, return ret; } -int mptcp_userspace_pm_get_addr(struct sk_buff *skb, +int mptcp_userspace_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, struct genl_info *info) { - struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR]; - struct mptcp_pm_addr_entry addr, *entry; + struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; - struct sk_buff *msg; int ret = -EINVAL; struct sock *sk; - void *reply; msk = mptcp_userspace_pm_get_sock(info); if (!msk) @@ -680,50 +677,16 @@ int mptcp_userspace_pm_get_addr(struct sk_buff *skb, sk = (struct sock *)msk; - ret = mptcp_pm_parse_entry(attr, info, false, &addr); - if (ret < 0) - goto out; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) { - ret = -ENOMEM; - goto out; - } - - reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, - info->genlhdr->cmd); - if (!reply) { - GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); - ret = -EMSGSIZE; - goto fail; - } - lock_sock(sk); spin_lock_bh(&msk->pm.lock); - entry = mptcp_userspace_pm_lookup_addr_by_id(msk, addr.addr.id); - if (!entry) { - GENL_SET_ERR_MSG(info, "address not found"); - ret = -EINVAL; - goto unlock_fail; + entry = mptcp_userspace_pm_lookup_addr_by_id(msk, id); + if (entry) { + *addr = *entry; + ret = 0; } - - ret = mptcp_nl_fill_addr(msg, entry); - if (ret) - goto unlock_fail; - - genlmsg_end(msg, reply); - ret = genlmsg_reply(msg, info); spin_unlock_bh(&msk->pm.lock); release_sock(sk); - sock_put(sk); - return ret; -unlock_fail: - spin_unlock_bh(&msk->pm.lock); - release_sock(sk); -fail: - nlmsg_free(msg); -out: sock_put(sk); return ret; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index c44c89ecaca6..ec23e65ef0f1 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -118,24 +118,14 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb) __kfree_skb(skb); } -static void mptcp_rmem_fwd_alloc_add(struct sock *sk, int size) -{ - WRITE_ONCE(mptcp_sk(sk)->rmem_fwd_alloc, - mptcp_sk(sk)->rmem_fwd_alloc + size); -} - -static void mptcp_rmem_charge(struct sock *sk, int size) -{ - mptcp_rmem_fwd_alloc_add(sk, -size); -} - static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, struct sk_buff *from) { bool fragstolen; int delta; - if (MPTCP_SKB_CB(from)->offset || + if (unlikely(MPTCP_SKB_CB(to)->cant_coalesce) || + MPTCP_SKB_CB(from)->offset || ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) || !skb_try_coalesce(to, from, &fragstolen, &delta)) return false; @@ -150,7 +140,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, * negative one */ atomic_add(delta, &sk->sk_rmem_alloc); - mptcp_rmem_charge(sk, delta); + sk_mem_charge(sk, delta); kfree_skb_partial(from, fragstolen); return true; @@ -165,44 +155,6 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, return mptcp_try_coalesce((struct sock *)msk, to, from); } -static void __mptcp_rmem_reclaim(struct sock *sk, int amount) -{ - amount >>= PAGE_SHIFT; - mptcp_rmem_charge(sk, amount << PAGE_SHIFT); - __sk_mem_reduce_allocated(sk, amount); -} - -static void mptcp_rmem_uncharge(struct sock *sk, int size) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - int reclaimable; - - mptcp_rmem_fwd_alloc_add(sk, size); - reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk); - - /* see sk_mem_uncharge() for the rationale behind the following schema */ - if (unlikely(reclaimable >= PAGE_SIZE)) - __mptcp_rmem_reclaim(sk, reclaimable); -} - -static void mptcp_rfree(struct sk_buff *skb) -{ - unsigned int len = skb->truesize; - struct sock *sk = skb->sk; - - atomic_sub(len, &sk->sk_rmem_alloc); - mptcp_rmem_uncharge(sk, len); -} - -void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk) -{ - skb_orphan(skb); - skb->sk = sk; - skb->destructor = mptcp_rfree; - atomic_add(skb->truesize, &sk->sk_rmem_alloc); - mptcp_rmem_charge(sk, skb->truesize); -} - /* "inspired" by tcp_data_queue_ofo(), main differences: * - use mptcp seqs * - don't cope with sacks @@ -315,25 +267,7 @@ merge_right: end: skb_condense(skb); - mptcp_set_owner_r(skb, sk); -} - -static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - int amt, amount; - - if (size <= msk->rmem_fwd_alloc) - return true; - - size -= msk->rmem_fwd_alloc; - amt = sk_mem_pages(size); - amount = amt << PAGE_SHIFT; - if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) - return false; - - mptcp_rmem_fwd_alloc_add(sk, amount); - return true; + skb_set_owner_r(skb, sk); } static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, @@ -351,7 +285,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, skb_orphan(skb); /* try to fetch required memory from subflow */ - if (!mptcp_rmem_schedule(sk, ssk, skb->truesize)) { + if (!sk_rmem_schedule(sk, skb, skb->truesize)) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); goto drop; } @@ -366,6 +300,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; MPTCP_SKB_CB(skb)->offset = offset; MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; + MPTCP_SKB_CB(skb)->cant_coalesce = 0; if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ @@ -375,7 +310,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, if (tail && mptcp_try_coalesce(sk, tail, skb)) return true; - mptcp_set_owner_r(skb, sk); + skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); return true; } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { @@ -561,7 +496,7 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied) bool cleanup, rx_empty; cleanup = (space > 0) && (space >= (old_space << 1)) && copied; - rx_empty = !__mptcp_rmem(sk) && copied; + rx_empty = !sk_rmem_alloc_get(sk) && copied; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -634,27 +569,13 @@ static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk) } static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, - struct sock *ssk, - unsigned int *bytes) + struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; - unsigned int moved = 0; bool more_data_avail; struct tcp_sock *tp; - bool done = false; - int sk_rbuf; - - sk_rbuf = READ_ONCE(sk->sk_rcvbuf); - - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); - - if (unlikely(ssk_rbuf > sk_rbuf)) { - WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf); - sk_rbuf = ssk_rbuf; - } - } + bool ret = false; pr_debug("msk=%p ssk=%p\n", msk, ssk); tp = tcp_sk(ssk); @@ -664,20 +585,16 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct sk_buff *skb; bool fin; + if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) + break; + /* try to move as much data as available */ map_remaining = subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); skb = skb_peek(&ssk->sk_receive_queue); - if (!skb) { - /* With racing move_skbs_to_msk() and __mptcp_move_skbs(), - * a different CPU can have already processed the pending - * data, stop here or we can enter an infinite loop - */ - if (!moved) - done = true; + if (unlikely(!skb)) break; - } if (__mptcp_check_fallback(msk)) { /* Under fallback skbs have no MPTCP extension and TCP could @@ -690,19 +607,13 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, offset = seq - TCP_SKB_CB(skb)->seq; fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; - if (fin) { - done = true; + if (fin) seq++; - } if (offset < skb->len) { size_t len = skb->len - offset; - if (tp->urg_data) - done = true; - - if (__mptcp_move_skb(msk, ssk, skb, offset, len)) - moved += len; + ret = __mptcp_move_skb(msk, ssk, skb, offset, len) || ret; seq += len; if (unlikely(map_remaining < len)) { @@ -716,22 +627,16 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, } sk_eat_skb(ssk, skb); - done = true; } WRITE_ONCE(tp->copied_seq, seq); more_data_avail = mptcp_subflow_data_available(ssk); - if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) { - done = true; - break; - } } while (more_data_avail); - if (moved > 0) + if (ret) msk->last_data_recv = tcp_jiffies32; - *bytes += moved; - return done; + return ret; } static bool __mptcp_ofo_queue(struct mptcp_sock *msk) @@ -825,9 +730,9 @@ void __mptcp_error_report(struct sock *sk) static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; - unsigned int moved = 0; + bool moved; - __mptcp_move_skbs_from_subflow(msk, ssk, &moved); + moved = __mptcp_move_skbs_from_subflow(msk, ssk); __mptcp_ofo_queue(msk); if (unlikely(ssk->sk_err)) { if (!sock_owned_by_user(sk)) @@ -843,14 +748,29 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) */ if (mptcp_pending_data_fin(sk, NULL)) mptcp_schedule_work(sk); - return moved > 0; + return moved; +} + +static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk) +{ + if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf)) + WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf); +} + +static void __mptcp_data_ready(struct sock *sk, struct sock *ssk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + __mptcp_rcvbuf_update(sk, ssk); + + /* Wake-up the reader only for in-sequence data */ + if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) + sk->sk_data_ready(sk); } void mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); - struct mptcp_sock *msk = mptcp_sk(sk); - int sk_rbuf, ssk_rbuf; /* The peer can send data while we are shutting down this * subflow at msk destruction time, but we must avoid enqueuing @@ -859,19 +779,11 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) if (unlikely(subflow->disposable)) return; - ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); - sk_rbuf = READ_ONCE(sk->sk_rcvbuf); - if (unlikely(ssk_rbuf > sk_rbuf)) - sk_rbuf = ssk_rbuf; - - /* over limit? can't append more skbs to msk, Also, no need to wake-up*/ - if (__mptcp_rmem(sk) > sk_rbuf) - return; - - /* Wake-up the reader only for in-sequence data */ mptcp_data_lock(sk); - if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) - sk->sk_data_ready(sk); + if (!sock_owned_by_user(sk)) + __mptcp_data_ready(sk, ssk); + else + __set_bit(MPTCP_DEQUEUE, &mptcp_sk(sk)->cb_flags); mptcp_data_unlock(sk); } @@ -950,20 +862,6 @@ bool mptcp_schedule_work(struct sock *sk) return false; } -static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - - msk_owned_by_me(msk); - - mptcp_for_each_subflow(msk, subflow) { - if (READ_ONCE(subflow->data_avail)) - return mptcp_subflow_tcp_sock(subflow); - } - - return NULL; -} - static bool mptcp_skb_can_collapse_to(u64 write_seq, const struct sk_buff *skb, const struct mptcp_ext *mpext) @@ -1767,8 +1665,10 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, * see mptcp_disconnect(). * Attempt it again outside the problematic scope. */ - if (!mptcp_disconnect(sk, 0)) + if (!mptcp_disconnect(sk, 0)) { + sk->sk_disconnects++; sk->sk_socket->state = SS_UNCONNECTED; + } } inet_clear_bit(DEFER_CONNECT, sk); @@ -1942,16 +1842,17 @@ do_error: static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); -static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, +static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, size_t len, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags) { + struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *skb, *tmp; int copied = 0; - skb_queue_walk_safe(&msk->receive_queue, skb, tmp) { + skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { u32 offset = MPTCP_SKB_CB(skb)->offset; u32 data_len = skb->len - offset; u32 count = min_t(size_t, len - copied, data_len); @@ -1983,10 +1884,11 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, } if (!(flags & MSG_PEEK)) { - /* we will bulk release the skb memory later */ + /* avoid the indirect call, we know the destructor is sock_wfree */ skb->destructor = NULL; - WRITE_ONCE(msk->rmem_released, msk->rmem_released + skb->truesize); - __skb_unlink(skb, &msk->receive_queue); + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, skb->truesize); + __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); msk->bytes_consumed += count; } @@ -2099,66 +2001,65 @@ new_measure: msk->rcvq_space.time = mstamp; } -static void __mptcp_update_rmem(struct sock *sk) +static struct mptcp_subflow_context * +__mptcp_first_ready_from(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) { - struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_subflow_context *start_subflow = subflow; - if (!msk->rmem_released) - return; - - atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc); - mptcp_rmem_uncharge(sk, msk->rmem_released); - WRITE_ONCE(msk->rmem_released, 0); + while (!READ_ONCE(subflow->data_avail)) { + subflow = mptcp_next_subflow(msk, subflow); + if (subflow == start_subflow) + return NULL; + } + return subflow; } -static void __mptcp_splice_receive_queue(struct sock *sk) +static bool __mptcp_move_skbs(struct sock *sk) { + struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); + bool ret = false; - skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); -} + if (list_empty(&msk->conn_list)) + return false; -static bool __mptcp_move_skbs(struct mptcp_sock *msk) -{ - struct sock *sk = (struct sock *)msk; - unsigned int moved = 0; - bool ret, done; + /* verify we can move any data from the subflow, eventually updating */ + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) + mptcp_for_each_subflow(msk, subflow) + __mptcp_rcvbuf_update(sk, subflow->tcp_sock); - do { - struct sock *ssk = mptcp_subflow_recv_lookup(msk); + subflow = list_first_entry(&msk->conn_list, + struct mptcp_subflow_context, node); + for (;;) { + struct sock *ssk; bool slowpath; - /* we can have data pending in the subflows only if the msk - * receive buffer was full at subflow_data_ready() time, - * that is an unlikely slow path. + /* + * As an optimization avoid traversing the subflows list + * and ev. acquiring the subflow socket lock before baling out */ - if (likely(!ssk)) + if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) break; - slowpath = lock_sock_fast(ssk); - mptcp_data_lock(sk); - __mptcp_update_rmem(sk); - done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); - mptcp_data_unlock(sk); + subflow = __mptcp_first_ready_from(msk, subflow); + if (!subflow) + break; + ssk = mptcp_subflow_tcp_sock(subflow); + slowpath = lock_sock_fast(ssk); + ret = __mptcp_move_skbs_from_subflow(msk, ssk) || ret; if (unlikely(ssk->sk_err)) __mptcp_error_report(sk); unlock_sock_fast(ssk, slowpath); - } while (!done); - /* acquire the data lock only if some input data is pending */ - ret = moved > 0; - if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) || - !skb_queue_empty_lockless(&sk->sk_receive_queue)) { - mptcp_data_lock(sk); - __mptcp_update_rmem(sk); - ret |= __mptcp_ofo_queue(msk); - __mptcp_splice_receive_queue(sk); - mptcp_data_unlock(sk); + subflow = mptcp_next_subflow(msk, subflow); } + + __mptcp_ofo_queue(msk); if (ret) mptcp_check_data_fin((struct sock *)msk); - return !skb_queue_empty(&msk->receive_queue); + return ret; } static unsigned int mptcp_inq_hint(const struct sock *sk) @@ -2166,7 +2067,7 @@ static unsigned int mptcp_inq_hint(const struct sock *sk) const struct mptcp_sock *msk = mptcp_sk(sk); const struct sk_buff *skb; - skb = skb_peek(&msk->receive_queue); + skb = skb_peek(&sk->sk_receive_queue); if (skb) { u64 hint_val = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq; @@ -2212,7 +2113,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, while (copied < len) { int err, bytes_read; - bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags); + bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; @@ -2221,7 +2122,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, copied += bytes_read; - if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) + if (skb_queue_empty(&sk->sk_receive_queue) && __mptcp_move_skbs(sk)) continue; /* only the MPTCP socket status is relevant here. The exit @@ -2247,7 +2148,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, /* race breaker: the shutdown could be after the * previous receive queue check */ - if (__mptcp_move_skbs(msk)) + if (__mptcp_move_skbs(sk)) continue; break; } @@ -2291,9 +2192,8 @@ out_err: } } - pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n", - msk, skb_queue_empty_lockless(&sk->sk_receive_queue), - skb_queue_empty(&msk->receive_queue), copied); + pr_debug("msk=%p rx queue empty=%d copied=%d\n", + msk, skb_queue_empty(&sk->sk_receive_queue), copied); release_sock(sk); return copied; @@ -2820,11 +2720,8 @@ static void __mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); INIT_WORK(&msk->work, mptcp_worker); - __skb_queue_head_init(&msk->receive_queue); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; - WRITE_ONCE(msk->rmem_fwd_alloc, 0); - WRITE_ONCE(msk->rmem_released, 0); msk->timer_ival = TCP_RTO_MIN; msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; @@ -3050,8 +2947,6 @@ static void __mptcp_destroy_sock(struct sock *sk) sk->sk_prot->destroy(sk); - WARN_ON_ONCE(READ_ONCE(msk->rmem_fwd_alloc)); - WARN_ON_ONCE(msk->rmem_released); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); @@ -3283,12 +3178,9 @@ static void mptcp_copy_ip_options(struct sock *newsk, const struct sock *sk) rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { - newopt = sock_kmalloc(newsk, sizeof(*inet_opt) + + newopt = sock_kmemdup(newsk, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen, GFP_ATOMIC); - if (newopt) - memcpy(newopt, inet_opt, sizeof(*inet_opt) + - inet_opt->opt.optlen); - else + if (!newopt) net_warn_ratelimited("%s: Failed to copy ip options\n", __func__); } RCU_INIT_POINTER(newinet->inet_opt, newopt); @@ -3403,18 +3295,12 @@ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); - /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ - mptcp_data_lock(sk); - skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue); __skb_queue_purge(&sk->sk_receive_queue); skb_rbtree_purge(&msk->out_of_order_queue); - mptcp_data_unlock(sk); /* move all the rx fwd alloc into the sk_mem_reclaim_final in * inet_sock_destruct() will dispose it */ - sk_forward_alloc_add(sk, msk->rmem_fwd_alloc); - WRITE_ONCE(msk->rmem_fwd_alloc, 0); mptcp_token_destroy(msk); mptcp_pm_free_anno_list(msk); mptcp_free_local_addr_list(msk); @@ -3451,7 +3337,8 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk) #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ BIT(MPTCP_RETRANSMIT) | \ - BIT(MPTCP_FLUSH_JOIN_LIST)) + BIT(MPTCP_FLUSH_JOIN_LIST) | \ + BIT(MPTCP_DEQUEUE)) /* processes deferred events and flush wmem */ static void mptcp_release_cb(struct sock *sk) @@ -3485,6 +3372,11 @@ static void mptcp_release_cb(struct sock *sk) __mptcp_push_pending(sk, 0); if (flags & BIT(MPTCP_RETRANSMIT)) __mptcp_retrans(sk); + if ((flags & BIT(MPTCP_DEQUEUE)) && __mptcp_move_skbs(sk)) { + /* notify ack seq update */ + mptcp_cleanup_rbuf(msk, 0); + sk->sk_data_ready(sk); + } cond_resched(); spin_lock_bh(&sk->sk_lock.slock); @@ -3504,8 +3396,6 @@ static void mptcp_release_cb(struct sock *sk) if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags)) __mptcp_sync_sndbuf(sk); } - - __mptcp_update_rmem(sk); } /* MP_JOIN client subflow must wait for 4th ack before sending any data: @@ -3676,12 +3566,6 @@ static void mptcp_shutdown(struct sock *sk, int how) __mptcp_wr_shutdown(sk); } -static int mptcp_forward_alloc_get(const struct sock *sk) -{ - return READ_ONCE(sk->sk_forward_alloc) + - READ_ONCE(mptcp_sk(sk)->rmem_fwd_alloc); -} - static int mptcp_ioctl_outq(const struct mptcp_sock *msk, u64 v) { const struct sock *sk = (void *)msk; @@ -3722,7 +3606,8 @@ static int mptcp_ioctl(struct sock *sk, int cmd, int *karg) return -EINVAL; lock_sock(sk); - __mptcp_move_skbs(msk); + if (__mptcp_move_skbs(sk)) + mptcp_cleanup_rbuf(msk, 0); *karg = mptcp_inq_hint(sk); release_sock(sk); break; @@ -3839,7 +3724,6 @@ static struct proto mptcp_prot = { .hash = mptcp_hash, .unhash = mptcp_unhash, .get_port = mptcp_get_port, - .forward_alloc_get = mptcp_forward_alloc_get, .stream_memory_free = mptcp_stream_memory_free, .sockets_allocated = &mptcp_sockets_allocated, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 0174a5aad279..7b74dedc7936 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -124,12 +124,14 @@ #define MPTCP_FLUSH_JOIN_LIST 5 #define MPTCP_SYNC_STATE 6 #define MPTCP_SYNC_SNDBUF 7 +#define MPTCP_DEQUEUE 8 struct mptcp_skb_cb { u64 map_seq; u64 end_seq; u32 offset; - u8 has_rxtstamp:1; + u8 has_rxtstamp; + u8 cant_coalesce; }; #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) @@ -149,22 +151,24 @@ struct mptcp_options_received { u32 subflow_seq; u16 data_len; __sum16 csum; - u16 suboptions; + struct_group(status, + u16 suboptions; + u16 use_map:1, + dsn64:1, + data_fin:1, + use_ack:1, + ack64:1, + mpc_map:1, + reset_reason:4, + reset_transient:1, + echo:1, + backup:1, + deny_join_id0:1, + __unused:2; + ); + u8 join_id; u32 token; u32 nonce; - u16 use_map:1, - dsn64:1, - data_fin:1, - use_ack:1, - ack64:1, - mpc_map:1, - reset_reason:4, - reset_transient:1, - echo:1, - backup:1, - deny_join_id0:1, - __unused:2; - u8 join_id; u64 thmac; u8 hmac[MPTCPOPT_HMAC_LEN]; struct mptcp_addr_info addr; @@ -277,7 +281,6 @@ struct mptcp_sock { u64 rcv_data_fin_seq; u64 bytes_retrans; u64 bytes_consumed; - int rmem_fwd_alloc; int snd_burst; int old_wspace; u64 recovery_snd_nxt; /* in recovery mode accept up to this seq; @@ -292,7 +295,6 @@ struct mptcp_sock { u32 last_ack_recv; unsigned long timer_ival; u32 token; - int rmem_released; unsigned long flags; unsigned long cb_flags; bool recovery; /* closing subflow write queue reinjected */ @@ -322,7 +324,6 @@ struct mptcp_sock { struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; - struct sk_buff_head receive_queue; struct list_head conn_list; struct list_head rtx_queue; struct mptcp_data_frag *first_pending; @@ -353,6 +354,8 @@ struct mptcp_sock { list_for_each_entry(__subflow, &((__msk)->conn_list), node) #define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \ list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node) +#define mptcp_next_subflow(__msk, __subflow) \ + list_next_entry_circular(__subflow, &((__msk)->conn_list), node) extern struct genl_family mptcp_genl_family; @@ -379,14 +382,6 @@ static inline void msk_owned_by_me(const struct mptcp_sock *msk) #define mptcp_sk(ptr) container_of_const(ptr, struct mptcp_sock, sk.icsk_inet.sk) #endif -/* the msk socket don't use the backlog, also account for the bulk - * free memory - */ -static inline int __mptcp_rmem(const struct sock *sk) -{ - return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released); -} - static inline int mptcp_win_from_space(const struct sock *sk, int space) { return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); @@ -399,7 +394,8 @@ static inline int mptcp_space_from_win(const struct sock *sk, int win) static inline int __mptcp_space(const struct sock *sk) { - return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); + return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - + sk_rmem_alloc_get(sk)); } static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) @@ -724,7 +720,6 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); void mptcp_cancel_work(struct sock *sk); void __mptcp_unaccepted_force_close(struct sock *sk); -void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk); void mptcp_set_state(struct sock *sk, int state); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, @@ -1036,9 +1031,10 @@ bool mptcp_lookup_subflow_by_saddr(const struct list_head *list, const struct mptcp_addr_info *saddr); bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); -int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info); -int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info); -int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info); +int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, + struct genl_info *info); +int mptcp_userspace_pm_set_flags(struct mptcp_pm_addr_entry *local, + struct genl_info *info); int mptcp_pm_announce_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool echo); @@ -1056,12 +1052,13 @@ void mptcp_event_pm_listener(const struct sock *ssk, enum mptcp_event_type event); bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); -void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, - const struct mptcp_options_received *mp_opt); void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, struct request_sock *req); int mptcp_nl_fill_addr(struct sk_buff *skb, struct mptcp_pm_addr_entry *entry); +int mptcp_pm_genl_fill_addr(struct sk_buff *msg, + struct netlink_callback *cb, + struct mptcp_pm_addr_entry *entry); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { @@ -1129,14 +1126,13 @@ int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_in bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc); bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); -int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); int mptcp_pm_nl_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb); -int mptcp_pm_get_addr(struct sk_buff *skb, struct genl_info *info); -int mptcp_pm_nl_get_addr(struct sk_buff *skb, struct genl_info *info); -int mptcp_userspace_pm_get_addr(struct sk_buff *skb, +int mptcp_pm_nl_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, + struct genl_info *info); +int mptcp_userspace_pm_get_addr(u8 id, struct mptcp_pm_addr_entry *addr, struct genl_info *info); static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflow) @@ -1197,6 +1193,8 @@ static inline void __mptcp_do_fallback(struct mptcp_sock *msk) pr_debug("TCP fallback already done (msk=%p)\n", msk); return; } + if (WARN_ON_ONCE(!READ_ONCE(msk->allow_infinite_fallback))) + return; set_bit(MPTCP_FALLBACK_DONE, &msk->flags); } diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index df7dbcfa3b71..c16c6fbd4ba2 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -16,13 +16,25 @@ static DEFINE_SPINLOCK(mptcp_sched_list_lock); static LIST_HEAD(mptcp_sched_list); -static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk, +static int mptcp_sched_default_get_send(struct mptcp_sock *msk, + struct mptcp_sched_data *data) +{ + struct sock *ssk; + + ssk = mptcp_subflow_get_send(msk); + if (!ssk) + return -EINVAL; + + mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); + return 0; +} + +static int mptcp_sched_default_get_retrans(struct mptcp_sock *msk, struct mptcp_sched_data *data) { struct sock *ssk; - ssk = data->reinject ? mptcp_subflow_get_retrans(msk) : - mptcp_subflow_get_send(msk); + ssk = mptcp_subflow_get_retrans(msk); if (!ssk) return -EINVAL; @@ -31,7 +43,8 @@ static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk, } static struct mptcp_sched_ops mptcp_sched_default = { - .get_subflow = mptcp_sched_default_get_subflow, + .get_send = mptcp_sched_default_get_send, + .get_retrans = mptcp_sched_default_get_retrans, .name = "default", .owner = THIS_MODULE, }; @@ -73,7 +86,7 @@ void mptcp_get_available_schedulers(char *buf, size_t maxlen) int mptcp_register_scheduler(struct mptcp_sched_ops *sched) { - if (!sched->get_subflow) + if (!sched->get_send) return -EINVAL; spin_lock(&mptcp_sched_list_lock); @@ -144,7 +157,7 @@ void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, int mptcp_sched_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct mptcp_sched_data data; + struct mptcp_sched_data *data = NULL; msk_owned_by_me(msk); @@ -164,16 +177,15 @@ int mptcp_sched_get_send(struct mptcp_sock *msk) return 0; } - data.reinject = false; if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_subflow(msk, &data); - return msk->sched->get_subflow(msk, &data); + return mptcp_sched_default_get_send(msk, data); + return msk->sched->get_send(msk, data); } int mptcp_sched_get_retrans(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct mptcp_sched_data data; + struct mptcp_sched_data *data = NULL; msk_owned_by_me(msk); @@ -186,8 +198,9 @@ int mptcp_sched_get_retrans(struct mptcp_sock *msk) return 0; } - data.reinject = true; if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_subflow(msk, &data); - return msk->sched->get_subflow(msk, &data); + return mptcp_sched_default_get_retrans(msk, data); + if (msk->sched->get_retrans) + return msk->sched->get_retrans(msk, data); + return msk->sched->get_send(msk, data); } diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index fd021cf8286e..efe8d86496db 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -802,9 +802,6 @@ void __mptcp_subflow_fully_established(struct mptcp_sock *msk, subflow_set_remote_key(msk, subflow, mp_opt); WRITE_ONCE(subflow->fully_established, true); WRITE_ONCE(msk->fully_established, true); - - if (subflow->is_mptfo) - __mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt); } static struct sock *subflow_syn_recv_sock(const struct sock *sk, @@ -1142,7 +1139,6 @@ static enum mapping_status get_mapping_status(struct sock *ssk, if (data_len == 0) { pr_debug("infinite mapping received\n"); MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); - subflow->map_data_len = 0; return MAPPING_INVALID; } @@ -1271,7 +1267,12 @@ out: subflow->map_valid = 0; } -/* sched mptcp worker to remove the subflow if no more data is pending */ +static bool subflow_is_done(const struct sock *sk) +{ + return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; +} + +/* sched mptcp worker for subflow cleanup if no more data is pending */ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) { struct sock *sk = (struct sock *)msk; @@ -1281,21 +1282,19 @@ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ss inet_sk_state_load(sk) != TCP_ESTABLISHED))) return; - if (skb_queue_empty(&ssk->sk_receive_queue) && - !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) - mptcp_schedule_work(sk); -} + if (!skb_queue_empty(&ssk->sk_receive_queue)) + return; -static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) -{ - struct mptcp_sock *msk = mptcp_sk(subflow->conn); + if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + mptcp_schedule_work(sk); - if (subflow->mp_join) - return false; - else if (READ_ONCE(msk->csum_enabled)) - return !subflow->valid_csum_seen; - else - return READ_ONCE(msk->allow_infinite_fallback); + /* when the fallback subflow closes the rx side, trigger a 'dummy' + * ingress data fin, so that the msk state will follow along + */ + if (__mptcp_check_fallback(msk) && subflow_is_done(ssk) && + msk->first == ssk && + mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) + mptcp_schedule_work(sk); } static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk) @@ -1393,7 +1392,7 @@ fallback: return true; } - if (!subflow_can_fallback(subflow) && subflow->map_data_len) { + if (!READ_ONCE(msk->allow_infinite_fallback)) { /* fatal protocol error, close the socket. * subflow_error_report() will introduce the appropriate barriers */ @@ -1772,10 +1771,7 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family, * needs it. * Update ns_tracker to current stack trace and refcounted tracker. */ - __netns_tracker_free(net, &sf->sk->ns_tracker, false); - sf->sk->sk_net_refcnt = 1; - get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sf->sk); err = tcp_set_ulp(sf->sk, "mptcp"); if (err) goto err_free; @@ -1842,11 +1838,6 @@ static void __subflow_state_change(struct sock *sk) rcu_read_unlock(); } -static bool subflow_is_done(const struct sock *sk) -{ - return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE; -} - static void subflow_state_change(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -1873,13 +1864,6 @@ static void subflow_state_change(struct sock *sk) subflow_error_report(sk); subflow_sched_work_if_closed(mptcp_sk(parent), sk); - - /* when the fallback subflow closes the rx side, trigger a 'dummy' - * ingress data fin, so that the msk state will follow along - */ - if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk && - mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) - mptcp_schedule_work(parent); } void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index bf276eaf9330..7891a537bddd 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1385,6 +1385,12 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) nd->state = ncsi_dev_state_probe_package; break; case ncsi_dev_state_probe_package: + if (ndp->package_probe_id >= 8) { + /* Last package probed, finishing */ + ndp->flags |= NCSI_DEV_PROBED; + break; + } + ndp->pending_req_num = 1; nca.type = NCSI_PKT_CMD_SP; @@ -1501,13 +1507,8 @@ static void ncsi_probe_channel(struct ncsi_dev_priv *ndp) if (ret) goto error; - /* Probe next package */ + /* Probe next package after receiving response */ ndp->package_probe_id++; - if (ndp->package_probe_id >= 8) { - /* Probe finished */ - ndp->flags |= NCSI_DEV_PROBED; - break; - } nd->state = ncsi_dev_state_probe_package; ndp->active_package = NULL; break; diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index 14bd66909ca4..4a8ce2949fae 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -1089,14 +1089,12 @@ static int ncsi_rsp_handler_netlink(struct ncsi_request *nr) static int ncsi_rsp_handler_gmcma(struct ncsi_request *nr) { struct ncsi_dev_priv *ndp = nr->ndp; + struct sockaddr *saddr = &ndp->pending_mac; struct net_device *ndev = ndp->ndev.dev; struct ncsi_rsp_gmcma_pkt *rsp; - struct sockaddr saddr; - int ret = -1; int i; rsp = (struct ncsi_rsp_gmcma_pkt *)skb_network_header(nr->rsp); - saddr.sa_family = ndev->type; ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; netdev_info(ndev, "NCSI: Received %d provisioned MAC addresses\n", @@ -1108,20 +1106,20 @@ static int ncsi_rsp_handler_gmcma(struct ncsi_request *nr) rsp->addresses[i][4], rsp->addresses[i][5]); } + saddr->sa_family = ndev->type; for (i = 0; i < rsp->address_count; i++) { - memcpy(saddr.sa_data, &rsp->addresses[i], ETH_ALEN); - ret = ndev->netdev_ops->ndo_set_mac_address(ndev, &saddr); - if (ret < 0) { + if (!is_valid_ether_addr(rsp->addresses[i])) { netdev_warn(ndev, "NCSI: Unable to assign %pM to device\n", - saddr.sa_data); + rsp->addresses[i]); continue; } - netdev_warn(ndev, "NCSI: Set MAC address to %pM\n", saddr.sa_data); + memcpy(saddr->sa_data, rsp->addresses[i], ETH_ALEN); + netdev_warn(ndev, "NCSI: Will set MAC address to %pM\n", saddr->sa_data); break; } - ndp->gma_flag = ret == 0; - return ret; + ndp->gma_flag = 1; + return 0; } static struct ncsi_rsp_handler { diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index d011d2eb0848..7be4c35e4795 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ b/net/netfilter/nf_conntrack_amanda.c @@ -106,7 +106,7 @@ static int amanda_help(struct sk_buff *skb, /* increase the UDP timeout of the master connection as replies from * Amanda clients to the server can be quite delayed */ - nf_ct_refresh(ct, skb, master_timeout * HZ); + nf_ct_refresh(ct, master_timeout * HZ); /* No data? */ dataoff = protoff + sizeof(struct udphdr); diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c index cfa0fe0356de..a7552a46d6ac 100644 --- a/net/netfilter/nf_conntrack_broadcast.c +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -75,7 +75,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb, nf_ct_expect_related(exp, 0); nf_ct_expect_put(exp); - nf_ct_refresh(ct, skb, timeout * HZ); + nf_ct_refresh(ct, timeout * HZ); out: return NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 456446d7af20..7f8b245e287a 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1544,12 +1544,6 @@ static void gc_worker(struct work_struct *work) tmp = nf_ct_tuplehash_to_ctrack(h); - if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { - nf_ct_offload_timeout(tmp); - if (!nf_conntrack_max95) - continue; - } - if (expired_count > GC_SCAN_EXPIRED_MAX) { rcu_read_unlock(); @@ -2089,9 +2083,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_in); /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */ void __nf_ct_refresh_acct(struct nf_conn *ct, enum ip_conntrack_info ctinfo, - const struct sk_buff *skb, u32 extra_jiffies, - bool do_acct) + unsigned int bytes) { /* Only update if this is not a fixed timeout */ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) @@ -2104,8 +2097,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, if (READ_ONCE(ct->timeout) != extra_jiffies) WRITE_ONCE(ct->timeout, extra_jiffies); acct: - if (do_acct) - nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len); + if (bytes) + nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 5a9bce24f3c3..14f73872f647 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -1385,7 +1385,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct, if (info->timeout > 0) { pr_debug("nf_ct_ras: set RAS connection timeout to " "%u seconds\n", info->timeout); - nf_ct_refresh(ct, skb, info->timeout * HZ); + nf_ct_refresh(ct, info->timeout * HZ); /* Set expect timeout */ spin_lock_bh(&nf_conntrack_expect_lock); @@ -1433,7 +1433,7 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct, info->sig_port[!dir] = 0; /* Give it 30 seconds for UCF or URJ */ - nf_ct_refresh(ct, skb, 30 * HZ); + nf_ct_refresh(ct, 30 * HZ); return 0; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 2277b744eb2c..db23876a6016 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -357,11 +357,11 @@ nla_put_failure: static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_secctx; - int len, ret; - char *secctx; + struct lsm_context ctx; + int ret; - ret = security_secid_to_secctx(ct->secmark, &secctx, &len); - if (ret) + ret = security_secid_to_secctx(ct->secmark, &ctx); + if (ret < 0) return 0; ret = -1; @@ -369,13 +369,13 @@ static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) if (!nest_secctx) goto nla_put_failure; - if (nla_put_string(skb, CTA_SECCTX_NAME, secctx)) + if (nla_put_string(skb, CTA_SECCTX_NAME, ctx.context)) goto nla_put_failure; nla_nest_end(skb, nest_secctx); ret = 0; nla_put_failure: - security_release_secctx(secctx, len); + security_release_secctx(&ctx); return ret; } #else @@ -680,14 +680,14 @@ static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) static inline int ctnetlink_secctx_size(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_SECMARK - int len, ret; + int ret; - ret = security_secid_to_secctx(ct->secmark, NULL, &len); - if (ret) + ret = security_secid_to_secctx(ct->secmark, NULL); + if (ret < 0) return 0; return nla_total_size(0) /* CTA_SECCTX */ - + nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */ + + nla_total_size(sizeof(char) * ret); /* CTA_SECCTX_NAME */ #else return 0; #endif diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 4cc97f971264..7c6f7c9f7332 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -39,20 +39,15 @@ static const char *const sctp_conntrack_names[] = { [SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT", }; -#define SECS * HZ -#define MINS * 60 SECS -#define HOURS * 60 MINS -#define DAYS * 24 HOURS - static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = { - [SCTP_CONNTRACK_CLOSED] = 10 SECS, - [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS, - [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS, - [SCTP_CONNTRACK_ESTABLISHED] = 210 SECS, - [SCTP_CONNTRACK_SHUTDOWN_SENT] = 3 SECS, - [SCTP_CONNTRACK_SHUTDOWN_RECD] = 3 SECS, - [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, - [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, + [SCTP_CONNTRACK_CLOSED] = secs_to_jiffies(10), + [SCTP_CONNTRACK_COOKIE_WAIT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_COOKIE_ECHOED] = secs_to_jiffies(3), + [SCTP_CONNTRACK_ESTABLISHED] = secs_to_jiffies(210), + [SCTP_CONNTRACK_SHUTDOWN_SENT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_SHUTDOWN_RECD] = secs_to_jiffies(3), + [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = secs_to_jiffies(3), + [SCTP_CONNTRACK_HEARTBEAT_SENT] = secs_to_jiffies(30), }; #define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1 diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index d0eac27f6ba0..ca748f8dbff1 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1553,7 +1553,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff, if (dataoff >= skb->len) return NF_ACCEPT; - nf_ct_refresh(ct, skb, sip_timeout * HZ); + nf_ct_refresh(ct, sip_timeout * HZ); if (unlikely(skb_linearize(skb))) return NF_DROP; @@ -1624,7 +1624,7 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff, if (dataoff >= skb->len) return NF_ACCEPT; - nf_ct_refresh(ct, skb, sip_timeout * HZ); + nf_ct_refresh(ct, sip_timeout * HZ); if (unlikely(skb_linearize(skb))) return NF_DROP; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 7d4f0fa8b609..502cf10aab41 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -172,17 +172,16 @@ static void ct_seq_stop(struct seq_file *s, void *v) #ifdef CONFIG_NF_CONNTRACK_SECMARK static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) { + struct lsm_context ctx; int ret; - u32 len; - char *secctx; - ret = security_secid_to_secctx(ct->secmark, &secctx, &len); - if (ret) + ret = security_secid_to_secctx(ct->secmark, &ctx); + if (ret < 0) return; - seq_printf(s, "secctx=%s ", secctx); + seq_printf(s, "secctx=%s ", ctx.context); - security_release_secctx(secctx, len); + security_release_secctx(&ctx); } #else static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index df72b0376970..9d8361526f82 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -161,42 +161,86 @@ void flow_offload_route_init(struct flow_offload *flow, } EXPORT_SYMBOL_GPL(flow_offload_route_init); -static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) +static inline bool nf_flow_has_expired(const struct flow_offload *flow) +{ + return nf_flow_timeout_delta(flow->timeout) <= 0; +} + +static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state) { + struct ip_ct_tcp *tcp = &ct->proto.tcp; + + spin_lock_bh(&ct->lock); + if (tcp->state != tcp_state) + tcp->state = tcp_state; + + /* syn packet triggers the TCP reopen case from conntrack. */ + if (tcp->state == TCP_CONNTRACK_CLOSE) + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + + /* Conntrack state is outdated due to offload bypass. + * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntracks + * TCP reset validation will fail. + */ tcp->seen[0].td_maxwin = 0; + tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET; tcp->seen[1].td_maxwin = 0; + tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET; + spin_unlock_bh(&ct->lock); } -static void flow_offload_fixup_ct(struct nf_conn *ct) +static void flow_offload_fixup_ct(struct flow_offload *flow) { + struct nf_conn *ct = flow->ct; struct net *net = nf_ct_net(ct); int l4num = nf_ct_protonum(ct); + bool expired, closing = false; + u32 offload_timeout = 0; s32 timeout; if (l4num == IPPROTO_TCP) { - struct nf_tcp_net *tn = nf_tcp_pernet(net); - - flow_offload_fixup_tcp(&ct->proto.tcp); + const struct nf_tcp_net *tn = nf_tcp_pernet(net); + u8 tcp_state; + + /* Enter CLOSE state if fin/rst packet has been seen, this + * allows TCP reopen from conntrack. Otherwise, pick up from + * the last seen TCP state. + */ + closing = test_bit(NF_FLOW_CLOSING, &flow->flags); + if (closing) { + flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE); + timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]); + expired = false; + } else { + tcp_state = READ_ONCE(ct->proto.tcp.state); + flow_offload_fixup_tcp(ct, tcp_state); + timeout = READ_ONCE(tn->timeouts[tcp_state]); + expired = nf_flow_has_expired(flow); + } + offload_timeout = READ_ONCE(tn->offload_timeout); - timeout = tn->timeouts[ct->proto.tcp.state]; - timeout -= tn->offload_timeout; } else if (l4num == IPPROTO_UDP) { - struct nf_udp_net *tn = nf_udp_pernet(net); + const struct nf_udp_net *tn = nf_udp_pernet(net); enum udp_conntrack state = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? UDP_CT_REPLIED : UDP_CT_UNREPLIED; - timeout = tn->timeouts[state]; - timeout -= tn->offload_timeout; + timeout = READ_ONCE(tn->timeouts[state]); + expired = nf_flow_has_expired(flow); + offload_timeout = READ_ONCE(tn->offload_timeout); } else { return; } + if (expired) + timeout -= offload_timeout; + if (timeout < 0) timeout = 0; - if (nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout) - WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout); + if (closing || + nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout) + nf_ct_refresh(ct, timeout); } static void flow_offload_route_release(struct flow_offload *flow) @@ -294,7 +338,7 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) return err; } - nf_ct_offload_timeout(flow->ct); + nf_ct_refresh(flow->ct, NF_CT_DAY); if (nf_flowtable_hw_offload(flow_table)) { __set_bit(NF_FLOW_HW, &flow->flags); @@ -316,18 +360,14 @@ void flow_offload_refresh(struct nf_flowtable *flow_table, else return; - if (likely(!nf_flowtable_hw_offload(flow_table))) + if (likely(!nf_flowtable_hw_offload(flow_table)) || + test_bit(NF_FLOW_CLOSING, &flow->flags)) return; nf_flow_offload_add(flow_table, flow); } EXPORT_SYMBOL_GPL(flow_offload_refresh); -static inline bool nf_flow_has_expired(const struct flow_offload *flow) -{ - return nf_flow_timeout_delta(flow->timeout) <= 0; -} - static void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow) { @@ -344,7 +384,7 @@ void flow_offload_teardown(struct flow_offload *flow) { clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status); set_bit(NF_FLOW_TEARDOWN, &flow->flags); - flow_offload_fixup_ct(flow->ct); + flow_offload_fixup_ct(flow); } EXPORT_SYMBOL_GPL(flow_offload_teardown); @@ -414,15 +454,116 @@ static bool nf_flow_custom_gc(struct nf_flowtable *flow_table, return flow_table->type->gc && flow_table->type->gc(flow); } +/** + * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry + * @ct: Flowtable offloaded tcp ct + * + * Return: number of seconds when ct entry should expire. + */ +static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct) +{ + u8 state = READ_ONCE(ct->proto.tcp.state); + + switch (state) { + case TCP_CONNTRACK_SYN_SENT: + case TCP_CONNTRACK_SYN_RECV: + return 0; + case TCP_CONNTRACK_ESTABLISHED: + return NF_CT_DAY; + case TCP_CONNTRACK_FIN_WAIT: + case TCP_CONNTRACK_CLOSE_WAIT: + case TCP_CONNTRACK_LAST_ACK: + case TCP_CONNTRACK_TIME_WAIT: + return 5 * 60 * HZ; + case TCP_CONNTRACK_CLOSE: + return 0; + } + + return 0; +} + +/** + * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry + * @ct: Flowtable offloaded ct + * + * Datapath lookups in the conntrack table will evict nf_conn entries + * if they have expired. + * + * Once nf_conn entries have been offloaded, nf_conntrack might not see any + * packets anymore. Thus ct->timeout is no longer refreshed and ct can + * be evicted. + * + * To avoid the need for an additional check on the offload bit for every + * packet processed via nf_conntrack_in(), set an arbitrary timeout large + * enough not to ever expire, this save us a check for the IPS_OFFLOAD_BIT + * from the packet path via nf_ct_is_expired(). + */ +static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct) +{ + static const u32 min_timeout = 5 * 60 * HZ; + u32 expires = nf_ct_expires(ct); + + /* normal case: large enough timeout, nothing to do. */ + if (likely(expires >= min_timeout)) + return; + + /* must check offload bit after this, we do not hold any locks. + * flowtable and ct entries could have been removed on another CPU. + */ + if (!refcount_inc_not_zero(&ct->ct_general.use)) + return; + + /* load ct->status after refcount increase */ + smp_acquire__after_ctrl_dep(); + + if (nf_ct_is_confirmed(ct) && + test_bit(IPS_OFFLOAD_BIT, &ct->status)) { + u8 l4proto = nf_ct_protonum(ct); + u32 new_timeout = true; + + switch (l4proto) { + case IPPROTO_UDP: + new_timeout = NF_CT_DAY; + break; + case IPPROTO_TCP: + new_timeout = nf_flow_table_tcp_timeout(ct); + break; + default: + WARN_ON_ONCE(1); + break; + } + + /* Update to ct->timeout from nf_conntrack happens + * without holding ct->lock. + * + * Use cmpxchg to ensure timeout extension doesn't + * happen when we race with conntrack datapath. + * + * The inverse -- datapath updating ->timeout right + * after this -- is fine, datapath is authoritative. + */ + if (new_timeout) { + new_timeout += nfct_time_stamp; + cmpxchg(&ct->timeout, expires, new_timeout); + } + } + + nf_ct_put(ct); +} + static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, struct flow_offload *flow, void *data) { + bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags); + if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || nf_flow_custom_gc(flow_table, flow)) flow_offload_teardown(flow); + else if (!teardown) + nf_flow_table_extend_ct_timeout(flow->ct); - if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { + if (teardown) { if (test_bit(NF_FLOW_HW, &flow->flags)) { if (!test_bit(NF_FLOW_HW_DYING, &flow->flags)) nf_flow_offload_del(flow_table, flow); @@ -431,6 +572,10 @@ static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, } else { flow_offload_del(flow_table, flow); } + } else if (test_bit(NF_FLOW_CLOSING, &flow->flags) && + test_bit(NF_FLOW_HW, &flow->flags) && + !test_bit(NF_FLOW_HW_DYING, &flow->flags)) { + nf_flow_offload_del(flow_table, flow); } else if (test_bit(NF_FLOW_HW, &flow->flags)) { nf_flow_offload_stats(flow_table, flow); } diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 98edcaa37b38..8cd4cf7ae211 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -28,11 +28,15 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto, return 0; tcph = (void *)(skb_network_header(skb) + thoff); - if (unlikely(tcph->fin || tcph->rst)) { + if (tcph->syn && test_bit(NF_FLOW_CLOSING, &flow->flags)) { flow_offload_teardown(flow); return -1; } + if ((tcph->fin || tcph->rst) && + !test_bit(NF_FLOW_CLOSING, &flow->flags)) + set_bit(NF_FLOW_CLOSING, &flow->flags); + return 0; } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 83f3face8bb3..a34de9c17cf1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1956,15 +1956,16 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, if (!first) first = hook; - if (nla_put_string(skb, NFTA_DEVICE_NAME, - hook->ops.dev->name)) + if (nla_put(skb, NFTA_DEVICE_NAME, + hook->ifnamelen, hook->ifname)) goto nla_put_failure; n++; } nla_nest_end(skb, nest_devs); if (n == 1 && - nla_put_string(skb, NFTA_HOOK_DEV, first->ops.dev->name)) + nla_put(skb, NFTA_HOOK_DEV, + first->ifnamelen, first->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -2276,7 +2277,6 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, const struct nlattr *attr) { struct net_device *dev; - char ifname[IFNAMSIZ]; struct nft_hook *hook; int err; @@ -2286,12 +2286,17 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, goto err_hook_alloc; } - nla_strscpy(ifname, attr, IFNAMSIZ); + err = nla_strscpy(hook->ifname, attr, IFNAMSIZ); + if (err < 0) + goto err_hook_dev; + + hook->ifnamelen = nla_len(attr); + /* nf_tables_netdev_event() is called under rtnl_mutex, this is * indirectly serializing all the other holders of the commit_mutex with * the rtnl_mutex. */ - dev = __dev_get_by_name(net, ifname); + dev = __dev_get_by_name(net, hook->ifname); if (!dev) { err = -ENOENT; goto err_hook_dev; @@ -2312,7 +2317,7 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, struct nft_hook *hook; list_for_each_entry(hook, hook_list, list) { - if (this->ops.dev == hook->ops.dev) + if (!strcmp(hook->ifname, this->ifname)) return hook; } @@ -4752,6 +4757,14 @@ static int nf_tables_fill_set_concat(struct sk_buff *skb, return 0; } +static u32 nft_set_userspace_size(const struct nft_set_ops *ops, u32 size) +{ + if (ops->usize) + return ops->usize(size); + + return size; +} + static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, const struct nft_set *set, u16 event, u16 flags) { @@ -4822,7 +4835,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, if (!nest) goto nla_put_failure; if (set->size && - nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size))) + nla_put_be32(skb, NFTA_SET_DESC_SIZE, + htonl(nft_set_userspace_size(set->ops, set->size)))) goto nla_put_failure; if (set->field_count > 1 && @@ -5064,7 +5078,7 @@ static int nft_set_desc_concat_parse(const struct nlattr *attr, static int nft_set_desc_concat(struct nft_set_desc *desc, const struct nlattr *nla) { - u32 num_regs = 0, key_num_regs = 0; + u32 len = 0, num_regs; struct nlattr *attr; int rem, err, i; @@ -5078,12 +5092,12 @@ static int nft_set_desc_concat(struct nft_set_desc *desc, } for (i = 0; i < desc->field_count; i++) - num_regs += DIV_ROUND_UP(desc->field_len[i], sizeof(u32)); + len += round_up(desc->field_len[i], sizeof(u32)); - key_num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); - if (key_num_regs != num_regs) + if (len != desc->klen) return -EINVAL; + num_regs = DIV_ROUND_UP(desc->klen, sizeof(u32)); if (num_regs > NFT_REG32_COUNT) return -E2BIG; @@ -5190,6 +5204,15 @@ static bool nft_set_is_same(const struct nft_set *set, return true; } +static u32 nft_set_kernel_size(const struct nft_set_ops *ops, + const struct nft_set_desc *desc) +{ + if (ops->ksize) + return ops->ksize(desc->size); + + return desc->size; +} + static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nla[]) { @@ -5372,6 +5395,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (err < 0) return err; + if (desc.size) + desc.size = nft_set_kernel_size(set->ops, &desc); + err = 0; if (!nft_set_is_same(set, &desc, exprs, num_exprs, flags)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); @@ -5394,6 +5420,9 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (IS_ERR(ops)) return PTR_ERR(ops); + if (desc.size) + desc.size = nft_set_kernel_size(ops, &desc); + udlen = 0; if (nla[NFTA_SET_USERDATA]) udlen = nla_len(nla[NFTA_SET_USERDATA]); @@ -7050,6 +7079,27 @@ static bool nft_setelem_valid_key_end(const struct nft_set *set, return true; } +static u32 nft_set_maxsize(const struct nft_set *set) +{ + u32 maxsize, delta; + + if (!set->size) + return UINT_MAX; + + if (set->ops->adjust_maxsize) + delta = set->ops->adjust_maxsize(set); + else + delta = 0; + + if (check_add_overflow(set->size, set->ndeact, &maxsize)) + return UINT_MAX; + + if (check_add_overflow(maxsize, delta, &maxsize)) + return UINT_MAX; + + return maxsize; +} + static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr, u32 nlmsg_flags) { @@ -7422,7 +7472,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } if (!(flags & NFT_SET_ELEM_CATCHALL)) { - unsigned int max = set->size ? set->size + set->ndeact : UINT_MAX; + unsigned int max = nft_set_maxsize(set); if (!atomic_add_unless(&set->nelems, 1, max)) { err = -ENFILE; @@ -8850,7 +8900,7 @@ static int nft_register_flowtable_net_hooks(struct net *net, struct list_head *hook_list, struct nft_flowtable *flowtable) { - struct nft_hook *hook, *hook2, *next; + struct nft_hook *hook, *next; struct nft_flowtable *ft; int err, i = 0; @@ -8859,12 +8909,9 @@ static int nft_register_flowtable_net_hooks(struct net *net, if (!nft_is_active_next(net, ft)) continue; - list_for_each_entry(hook2, &ft->hook_list, list) { - if (hook->ops.dev == hook2->ops.dev && - hook->ops.pf == hook2->ops.pf) { - err = -EEXIST; - goto err_unregister_net_hooks; - } + if (nft_hook_list_find(&ft->hook_list, hook)) { + err = -EEXIST; + goto err_unregister_net_hooks; } } @@ -9278,7 +9325,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, list_for_each_entry_rcu(hook, hook_list, list, lockdep_commit_lock_is_held(net)) { - if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name)) + if (nla_put(skb, NFTA_DEVICE_NAME, + hook->ifnamelen, hook->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest_devs); @@ -11693,47 +11741,6 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, } EXPORT_SYMBOL_GPL(nft_data_dump); -static void __nft_release_basechain_now(struct nft_ctx *ctx) -{ - struct nft_rule *rule, *nr; - - list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { - list_del(&rule->list); - nf_tables_rule_release(ctx, rule); - } - nf_tables_chain_destroy(ctx->chain); -} - -int __nft_release_basechain(struct nft_ctx *ctx) -{ - struct nft_rule *rule; - - if (WARN_ON_ONCE(!nft_is_base_chain(ctx->chain))) - return 0; - - nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); - list_for_each_entry(rule, &ctx->chain->rules, list) - nft_use_dec(&ctx->chain->use); - - nft_chain_del(ctx->chain); - nft_use_dec(&ctx->table->use); - - if (!maybe_get_net(ctx->net)) { - __nft_release_basechain_now(ctx); - return 0; - } - - /* wait for ruleset dumps to complete. Owning chain is no longer in - * lists, so new dumps can't find any of these rules anymore. - */ - synchronize_rcu(); - - __nft_release_basechain_now(ctx); - put_net(ctx->net); - return 0; -} -EXPORT_SYMBOL_GPL(__nft_release_basechain); - static void __nft_release_hook(struct net *net, struct nft_table *table) { struct nft_flowtable *flowtable; diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index d2773ce9b585..5c913987901a 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -470,18 +470,18 @@ static int nfqnl_put_sk_classid(struct sk_buff *skb, struct sock *sk) return 0; } -static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata) +static int nfqnl_get_sk_secctx(struct sk_buff *skb, struct lsm_context *ctx) { - u32 seclen = 0; + int seclen = 0; #if IS_ENABLED(CONFIG_NETWORK_SECMARK) + if (!skb || !sk_fullsock(skb->sk)) return 0; read_lock_bh(&skb->sk->sk_callback_lock); if (skb->secmark) - security_secid_to_secctx(skb->secmark, secdata, &seclen); - + seclen = security_secid_to_secctx(skb->secmark, ctx); read_unlock_bh(&skb->sk->sk_callback_lock); #endif return seclen; @@ -567,8 +567,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, enum ip_conntrack_info ctinfo = 0; const struct nfnl_ct_hook *nfnl_ct; bool csum_verify; - char *secdata = NULL; - u32 seclen = 0; + struct lsm_context ctx; + int seclen = 0; ktime_t tstamp; size = nlmsg_total_size(sizeof(struct nfgenmsg)) @@ -642,7 +642,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) { - seclen = nfqnl_get_sk_secctx(entskb, &secdata); + seclen = nfqnl_get_sk_secctx(entskb, &ctx); + if (seclen < 0) + return NULL; if (seclen) size += nla_total_size(seclen); } @@ -782,7 +784,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, if (nfqnl_put_sk_classid(skb, entskb->sk) < 0) goto nla_put_failure; - if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata)) + if (seclen > 0 && nla_put(skb, NFQA_SECCTX, ctx.len, ctx.context)) goto nla_put_failure; if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0) @@ -810,8 +812,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } nlh->nlmsg_len = skb->len; - if (seclen) - security_release_secctx(secdata, seclen); + if (seclen >= 0) + security_release_secctx(&ctx); return skb; nla_put_failure: @@ -819,8 +821,8 @@ nla_put_failure: kfree_skb(skb); net_err_ratelimited("nf_queue: error creating packet message\n"); nlmsg_failure: - if (seclen) - security_release_secctx(secdata, seclen); + if (seclen >= 0) + security_release_secctx(&ctx); return NULL; } diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 7010541fcca6..19a553550c76 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -319,37 +319,21 @@ static const struct nft_chain_type nft_chain_filter_netdev = { }; static void nft_netdev_event(unsigned long event, struct net_device *dev, - struct nft_ctx *ctx) + struct nft_base_chain *basechain) { - struct nft_base_chain *basechain = nft_base_chain(ctx->chain); - struct nft_hook *hook, *found = NULL; - int n = 0; + struct nft_hook *hook; list_for_each_entry(hook, &basechain->hook_list, list) { - if (hook->ops.dev == dev) - found = hook; - - n++; - } - if (!found) - return; + if (hook->ops.dev != dev) + continue; - if (n > 1) { - if (!(ctx->chain->table->flags & NFT_TABLE_F_DORMANT)) - nf_unregister_net_hook(ctx->net, &found->ops); + if (!(basechain->chain.table->flags & NFT_TABLE_F_DORMANT)) + nf_unregister_net_hook(dev_net(dev), &hook->ops); - list_del_rcu(&found->list); - kfree_rcu(found, rcu); - return; + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + break; } - - /* UNREGISTER events are also happening on netns exit. - * - * Although nf_tables core releases all tables/chains, only this event - * handler provides guarantee that hook->ops.dev is still accessible, - * so we cannot skip exiting net namespaces. - */ - __nft_release_basechain(ctx); } static int nf_tables_netdev_event(struct notifier_block *this, @@ -358,25 +342,20 @@ static int nf_tables_netdev_event(struct notifier_block *this, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nft_base_chain *basechain; struct nftables_pernet *nft_net; - struct nft_chain *chain, *nr; + struct nft_chain *chain; struct nft_table *table; - struct nft_ctx ctx = { - .net = dev_net(dev), - }; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; - nft_net = nft_pernet(ctx.net); + nft_net = nft_pernet(dev_net(dev)); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV && table->family != NFPROTO_INET) continue; - ctx.family = table->family; - ctx.table = table; - list_for_each_entry_safe(chain, nr, &table->chains, list) { + list_for_each_entry(chain, &table->chains, list) { if (!nft_is_base_chain(chain)) continue; @@ -385,8 +364,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, basechain->ops.hooknum != NF_INET_INGRESS) continue; - ctx.chain = chain; - nft_netdev_event(event, dev, &ctx); + nft_netdev_event(event, dev, basechain); } } mutex_unlock(&nft_net->commit_mutex); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 67a41cd2baaf..2e59aba681a1 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -929,7 +929,7 @@ static void nft_ct_timeout_obj_eval(struct nft_object *obj, */ values = nf_ct_timeout_data(timeout); if (values) - nf_ct_refresh(ct, pkt->skb, values[0]); + nf_ct_refresh(ct, values[0]); } static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 3b474d235663..221d50223018 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -289,6 +289,15 @@ static bool nft_flow_offload_skip(struct sk_buff *skb, int family) return false; } +static void flow_offload_ct_tcp(struct nf_conn *ct) +{ + /* conntrack will not see all packets, disable tcp window validation. */ + spin_lock_bh(&ct->lock); + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + spin_unlock_bh(&ct->lock); +} + static void nft_flow_offload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -356,11 +365,8 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, goto err_flow_alloc; flow_offload_route_init(flow, &route); - - if (tcph) { - ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; - ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; - } + if (tcph) + flow_offload_ct_tcp(ct); __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); ret = flow_offload_add(flowtable, flow); diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index b7ea21327549..2e8ef16ff191 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -750,6 +750,46 @@ static void nft_rbtree_gc_init(const struct nft_set *set) priv->last_gc = jiffies; } +/* rbtree stores ranges as singleton elements, each range is composed of two + * elements ... + */ +static u32 nft_rbtree_ksize(u32 size) +{ + return size * 2; +} + +/* ... hide this detail to userspace. */ +static u32 nft_rbtree_usize(u32 size) +{ + if (!size) + return 0; + + return size / 2; +} + +static u32 nft_rbtree_adjust_maxsize(const struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe; + struct rb_node *node; + const void *key; + + node = rb_last(&priv->root); + if (!node) + return 0; + + rbe = rb_entry(node, struct nft_rbtree_elem, node); + if (!nft_rbtree_interval_end(rbe)) + return 0; + + key = nft_set_ext_key(&rbe->ext); + if (memchr(key, 1, set->klen)) + return 0; + + /* this is the all-zero no-match element. */ + return 1; +} + const struct nft_set_type nft_set_rbtree_type = { .features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT, .ops = { @@ -768,5 +808,8 @@ const struct nft_set_type nft_set_rbtree_type = { .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, .get = nft_rbtree_get, + .ksize = nft_rbtree_ksize, + .usize = nft_rbtree_usize, + .adjust_maxsize = nft_rbtree_adjust_maxsize, }, }; diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 1bc2d0890a9f..dfda9ea61971 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -374,8 +374,7 @@ int netlbl_unlhsh_add(struct net *net, struct net_device *dev; struct netlbl_unlhsh_iface *iface; struct audit_buffer *audit_buf = NULL; - char *secctx = NULL; - u32 secctx_len; + struct lsm_context ctx; if (addr_len != sizeof(struct in_addr) && addr_len != sizeof(struct in6_addr)) @@ -438,11 +437,9 @@ int netlbl_unlhsh_add(struct net *net, unlhsh_add_return: rcu_read_unlock(); if (audit_buf != NULL) { - if (security_secid_to_secctx(secid, - &secctx, - &secctx_len) == 0) { - audit_log_format(audit_buf, " sec_obj=%s", secctx); - security_release_secctx(secctx, secctx_len); + if (security_secid_to_secctx(secid, &ctx) >= 0) { + audit_log_format(audit_buf, " sec_obj=%s", ctx.context); + security_release_secctx(&ctx); } audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); @@ -473,8 +470,7 @@ static int netlbl_unlhsh_remove_addr4(struct net *net, struct netlbl_unlhsh_addr4 *entry; struct audit_buffer *audit_buf; struct net_device *dev; - char *secctx; - u32 secctx_len; + struct lsm_context ctx; spin_lock(&netlbl_unlhsh_lock); list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr, @@ -494,10 +490,9 @@ static int netlbl_unlhsh_remove_addr4(struct net *net, addr->s_addr, mask->s_addr); dev_put(dev); if (entry != NULL && - security_secid_to_secctx(entry->secid, - &secctx, &secctx_len) == 0) { - audit_log_format(audit_buf, " sec_obj=%s", secctx); - security_release_secctx(secctx, secctx_len); + security_secid_to_secctx(entry->secid, &ctx) >= 0) { + audit_log_format(audit_buf, " sec_obj=%s", ctx.context); + security_release_secctx(&ctx); } audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); audit_log_end(audit_buf); @@ -534,8 +529,7 @@ static int netlbl_unlhsh_remove_addr6(struct net *net, struct netlbl_unlhsh_addr6 *entry; struct audit_buffer *audit_buf; struct net_device *dev; - char *secctx; - u32 secctx_len; + struct lsm_context ctx; spin_lock(&netlbl_unlhsh_lock); list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list); @@ -554,10 +548,9 @@ static int netlbl_unlhsh_remove_addr6(struct net *net, addr, mask); dev_put(dev); if (entry != NULL && - security_secid_to_secctx(entry->secid, - &secctx, &secctx_len) == 0) { - audit_log_format(audit_buf, " sec_obj=%s", secctx); - security_release_secctx(secctx, secctx_len); + security_secid_to_secctx(entry->secid, &ctx) >= 0) { + audit_log_format(audit_buf, " sec_obj=%s", ctx.context); + security_release_secctx(&ctx); } audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0); audit_log_end(audit_buf); @@ -1069,10 +1062,9 @@ static int netlbl_unlabel_staticlist_gen(u32 cmd, int ret_val = -ENOMEM; struct netlbl_unlhsh_walk_arg *cb_arg = arg; struct net_device *dev; + struct lsm_context ctx; void *data; u32 secid; - char *secctx; - u32 secctx_len; data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid, cb_arg->seq, &netlbl_unlabel_gnl_family, @@ -1127,14 +1119,14 @@ static int netlbl_unlabel_staticlist_gen(u32 cmd, secid = addr6->secid; } - ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len); - if (ret_val != 0) + ret_val = security_secid_to_secctx(secid, &ctx); + if (ret_val < 0) goto list_cb_failure; ret_val = nla_put(cb_arg->skb, NLBL_UNLABEL_A_SECCTX, - secctx_len, - secctx); - security_release_secctx(secctx, secctx_len); + ctx.len, + ctx.context); + security_release_secctx(&ctx); if (ret_val != 0) goto list_cb_failure; diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 81635a13987b..0d04d23aafe7 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -84,8 +84,7 @@ struct audit_buffer *netlbl_audit_start_common(int type, struct netlbl_audit *audit_info) { struct audit_buffer *audit_buf; - char *secctx; - u32 secctx_len; + struct lsm_context ctx; if (audit_enabled == AUDIT_OFF) return NULL; @@ -99,10 +98,9 @@ struct audit_buffer *netlbl_audit_start_common(int type, audit_info->sessionid); if (lsmprop_is_set(&audit_info->prop) && - security_lsmprop_to_secctx(&audit_info->prop, &secctx, - &secctx_len) == 0) { - audit_log_format(audit_buf, " subj=%s", secctx); - security_release_secctx(secctx, secctx_len); + security_lsmprop_to_secctx(&audit_info->prop, &ctx) > 0) { + audit_log_format(audit_buf, " subj=%s", ctx.context); + security_release_secctx(&ctx); } return audit_buf; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 85311226183a..e8972a857e51 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -771,6 +771,7 @@ static int netlink_release(struct socket *sock) nlk->cb.done(&nlk->cb); module_put(nlk->cb.module); kfree_skb(nlk->cb.skb); + WRITE_ONCE(nlk->cb_running, false); } module_put(nlk->module); @@ -795,16 +796,6 @@ static int netlink_release(struct socket *sock) sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); - /* Because struct net might disappear soon, do not keep a pointer. */ - if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) { - __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); - /* Because of deferred_put_nlk_sk and use of work queue, - * it is possible netns will be freed before this socket. - */ - sock_net_set(sk, &init_net); - __netns_tracker_alloc(&init_net, &sk->ns_tracker, - false, GFP_KERNEL); - } call_rcu(&nlk->rcu, deferred_put_nlk_sk); return 0; } diff --git a/net/nfc/hci/llc.c b/net/nfc/hci/llc.c index ba91284f4086..e6cf4eb06b46 100644 --- a/net/nfc/hci/llc.c +++ b/net/nfc/hci/llc.c @@ -78,17 +78,6 @@ static struct nfc_llc_engine *nfc_llc_name_to_engine(const char *name) return NULL; } -void nfc_llc_unregister(const char *name) -{ - struct nfc_llc_engine *llc_engine; - - llc_engine = nfc_llc_name_to_engine(name); - if (llc_engine == NULL) - return; - - nfc_llc_del_engine(llc_engine); -} - struct nfc_llc *nfc_llc_allocate(const char *name, struct nfc_hci_dev *hdev, xmit_to_drv_t xmit_to_drv, rcv_to_hci_t rcv_to_hci, int tx_headroom, diff --git a/net/nfc/hci/llc.h b/net/nfc/hci/llc.h index d66271d211a5..09914608ec43 100644 --- a/net/nfc/hci/llc.h +++ b/net/nfc/hci/llc.h @@ -40,7 +40,6 @@ struct nfc_llc { void *nfc_llc_get_data(struct nfc_llc *llc); int nfc_llc_register(const char *name, const struct nfc_llc_ops *ops); -void nfc_llc_unregister(const char *name); int nfc_llc_nop_register(void); diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index de175318a3a0..082ab66f120b 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -542,6 +542,8 @@ static u8 nci_hci_create_pipe(struct nci_dev *ndev, u8 dest_host, pr_debug("pipe created=%d\n", pipe); + if (pipe >= NCI_HCI_MAX_PIPES) + pipe = NCI_HCI_INVALID_PIPE; return pipe; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 225f6048867f..5d548eda742d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -2101,6 +2101,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, { struct ovs_header *ovs_header; struct ovs_vport_stats vport_stats; + struct net *net_vport; int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family, @@ -2117,12 +2118,15 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, vport->dev->ifindex)) goto nla_put_failure; - if (!net_eq(net, dev_net(vport->dev))) { - int id = peernet2id_alloc(net, dev_net(vport->dev), gfp); + rcu_read_lock(); + net_vport = dev_net_rcu(vport->dev); + if (!net_eq(net, net_vport)) { + int id = peernet2id_alloc(net, net_vport, GFP_ATOMIC); if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id)) - goto nla_put_failure; + goto nla_put_failure_unlock; } + rcu_read_unlock(); ovs_vport_get_stats(vport, &vport_stats); if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS, @@ -2143,6 +2147,8 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, genlmsg_end(skb, ovs_header); return 0; +nla_put_failure_unlock: + rcu_read_unlock(); nla_put_failure: err = -EMSGSIZE; error: diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 2412d7813d24..125d310871e9 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -149,7 +149,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms) /* Restrict bridge port to current netns. */ if (vport->port_no == OVSP_LOCAL) - vport->dev->netns_local = true; + vport->dev->netns_immutable = true; rtnl_lock(); err = register_netdevice(vport->dev); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c131e5ceea37..3e9ddf72cd03 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2102,8 +2102,8 @@ retry: skb->protocol = proto; skb->dev = dev; - skb->priority = READ_ONCE(sk->sk_priority); - skb->mark = READ_ONCE(sk->sk_mark); + skb->priority = sockc.priority; + skb->mark = sockc.mark; skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); skb_setup_tx_timestamp(skb, &sockc); @@ -2634,8 +2634,8 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, skb->protocol = proto; skb->dev = dev; - skb->priority = READ_ONCE(po->sk.sk_priority); - skb->mark = READ_ONCE(po->sk.sk_mark); + skb->priority = sockc->priority; + skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid); skb_setup_tx_timestamp(skb, sockc); skb_zcopy_set_nouarg(skb, ph.raw); @@ -3039,7 +3039,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) goto out_unlock; sockcm_init(&sockc, sk); - sockc.mark = READ_ONCE(sk->sk_mark); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) diff --git a/net/rds/stats.c b/net/rds/stats.c index 9e87da43c004..cb2e3d2cdf73 100644 --- a/net/rds/stats.c +++ b/net/rds/stats.c @@ -89,8 +89,7 @@ void rds_stats_info_copy(struct rds_info_iterator *iter, for (i = 0; i < nr; i++) { BUG_ON(strlen(names[i]) >= sizeof(ctr.name)); - strncpy(ctr.name, names[i], sizeof(ctr.name) - 1); - ctr.name[sizeof(ctr.name) - 1] = '\0'; + strscpy_pad(ctr.name, names[i]); ctr.value = values[i]; rds_info_copy(iter, &ctr, sizeof(ctr)); diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 0581c53e6517..3cc2f303bf78 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -504,12 +504,8 @@ bool rds_tcp_tune(struct socket *sock) release_sock(sk); return false; } - /* Update ns_tracker to current stack trace and refcounted tracker */ - __netns_tracker_free(net, &sk->ns_tracker, false); - - sk->sk_net_refcnt = 1; - netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sk); + put_net(net); } rtn = net_generic(net, rds_tcp_netid); if (rtn->sndbuf_size > 0) { diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 59050caab65c..a4a668b88a8f 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -397,15 +397,15 @@ static int rose_setsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); - int opt; + unsigned int opt; if (level != SOL_ROSE) return -ENOPROTOOPT; - if (optlen < sizeof(int)) + if (optlen < sizeof(unsigned int)) return -EINVAL; - if (copy_from_sockptr(&opt, optval, sizeof(int))) + if (copy_from_sockptr(&opt, optval, sizeof(unsigned int))) return -EFAULT; switch (optname) { @@ -414,31 +414,31 @@ static int rose_setsockopt(struct socket *sock, int level, int optname, return 0; case ROSE_T1: - if (opt < 1) + if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; rose->t1 = opt * HZ; return 0; case ROSE_T2: - if (opt < 1) + if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; rose->t2 = opt * HZ; return 0; case ROSE_T3: - if (opt < 1) + if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; rose->t3 = opt * HZ; return 0; case ROSE_HOLDBACK: - if (opt < 1) + if (opt < 1 || opt > UINT_MAX / HZ) return -EINVAL; rose->hb = opt * HZ; return 0; case ROSE_IDLE: - if (opt < 0) + if (opt > UINT_MAX / (60 * HZ)) return -EINVAL; rose->idle = opt * 60 * HZ; return 0; @@ -701,11 +701,9 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct net_device *dev; ax25_address *source; ax25_uid_assoc *user; + int err = -EINVAL; int n; - if (!sock_flag(sk, SOCK_ZAPPED)) - return -EINVAL; - if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose)) return -EINVAL; @@ -718,8 +716,15 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS) return -EINVAL; - if ((dev = rose_dev_get(&addr->srose_addr)) == NULL) - return -EADDRNOTAVAIL; + lock_sock(sk); + + if (!sock_flag(sk, SOCK_ZAPPED)) + goto out_release; + + err = -EADDRNOTAVAIL; + dev = rose_dev_get(&addr->srose_addr); + if (!dev) + goto out_release; source = &addr->srose_call; @@ -730,7 +735,8 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } else { if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { dev_put(dev); - return -EACCES; + err = -EACCES; + goto out_release; } rose->source_call = *source; } @@ -753,8 +759,10 @@ static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) rose_insert_socket(sk); sock_reset_flag(sk, SOCK_ZAPPED); - - return 0; + err = 0; +out_release: + release_sock(sk); + return err; } static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index f06ddbed3fed..1525773e94aa 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -122,6 +122,10 @@ static void rose_heartbeat_expiry(struct timer_list *t) struct rose_sock *rose = rose_sk(sk); bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ/20); + goto out; + } switch (rose->state) { case ROSE_STATE_0: /* Magic here: If we listen() and a new link dies before it @@ -152,6 +156,7 @@ static void rose_heartbeat_expiry(struct timer_list *t) } rose_start_heartbeat(sk); +out: bh_unlock_sock(sk); sock_put(sk); } @@ -162,6 +167,10 @@ static void rose_timer_expiry(struct timer_list *t) struct sock *sk = &rose->sock; bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + sk_reset_timer(sk, &rose->timer, jiffies + HZ/20); + goto out; + } switch (rose->state) { case ROSE_STATE_1: /* T1 */ case ROSE_STATE_4: /* T2 */ @@ -182,6 +191,7 @@ static void rose_timer_expiry(struct timer_list *t) } break; } +out: bh_unlock_sock(sk); sock_put(sk); } @@ -192,6 +202,10 @@ static void rose_idletimer_expiry(struct timer_list *t) struct sock *sk = &rose->sock; bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + sk_reset_timer(sk, &rose->idletimer, jiffies + HZ/20); + goto out; + } rose_clear_queues(sk); rose_write_internal(sk, ROSE_CLEAR_REQUEST); @@ -207,6 +221,7 @@ static void rose_idletimer_expiry(struct timer_list *t) sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); } +out: bh_unlock_sock(sk); sock_put(sk); } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 718193df9d2e..a64a0cab1bf7 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -327,8 +327,8 @@ struct rxrpc_local { * packet with a maximum set of jumbo subpackets or a PING ACK padded * out to 64K with zeropages for PMTUD. */ - struct kvec kvec[RXRPC_MAX_NR_JUMBO > 3 + 16 ? - RXRPC_MAX_NR_JUMBO : 3 + 16]; + struct kvec kvec[1 + RXRPC_MAX_NR_JUMBO > 3 + 16 ? + 1 + RXRPC_MAX_NR_JUMBO : 3 + 16]; }; /* @@ -360,7 +360,6 @@ struct rxrpc_peer { u8 pmtud_jumbo; /* Max jumbo packets for the MTU */ bool ackr_adv_pmtud; /* T if the peer advertises path-MTU */ unsigned int ackr_max_data; /* Maximum data advertised by peer */ - seqcount_t mtu_lock; /* Lockless MTU access management */ unsigned int if_mtu; /* Local interface MTU (- hdrsize) for this peer */ unsigned int max_data; /* Maximum packet data capacity for this peer */ unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */ @@ -582,6 +581,7 @@ enum rxrpc_call_flag { RXRPC_CALL_EXCLUSIVE, /* The call uses a once-only connection */ RXRPC_CALL_RX_IS_IDLE, /* recvmsg() is idle - send an ACK */ RXRPC_CALL_RECVMSG_READ_ALL, /* recvmsg() read all of the received data */ + RXRPC_CALL_CONN_CHALLENGING, /* The connection is being challenged */ }; /* @@ -602,7 +602,6 @@ enum rxrpc_call_state { RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */ RXRPC_CALL_SERVER_PREALLOC, /* - service preallocation */ - RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */ RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */ RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */ RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */ @@ -874,8 +873,7 @@ struct rxrpc_txbuf { #define RXRPC_TXBUF_RESENT 0x100 /* Set if has been resent */ __be16 cksum; /* Checksum to go in header */ bool jumboable; /* Can be non-terminal jumbo subpacket */ - u8 nr_kvec; /* Amount of kvec[] used */ - struct kvec kvec[1]; + void *data; /* Data with preceding jumbo header */ }; static inline bool rxrpc_sending_to_server(const struct rxrpc_txbuf *txb) diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index 5a543c3f6fb0..c4c8b46a68c6 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -22,7 +22,6 @@ const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = { [RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl", [RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl", [RXRPC_CALL_SERVER_PREALLOC] = "SvPrealc", - [RXRPC_CALL_SERVER_SECURING] = "SvSecure", [RXRPC_CALL_SERVER_RECV_REQUEST] = "SvRcvReq", [RXRPC_CALL_SERVER_ACK_REQUEST] = "SvAckReq", [RXRPC_CALL_SERVER_SEND_REPLY] = "SvSndRpl", @@ -453,17 +452,16 @@ void rxrpc_incoming_call(struct rxrpc_sock *rx, call->cong_tstamp = skb->tstamp; __set_bit(RXRPC_CALL_EXPOSED, &call->flags); - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); + rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); spin_lock(&conn->state_lock); switch (conn->state) { case RXRPC_CONN_SERVICE_UNSECURED: case RXRPC_CONN_SERVICE_CHALLENGING: - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_SECURING); + __set_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags); break; case RXRPC_CONN_SERVICE: - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); break; case RXRPC_CONN_ABORTED: diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index 713e04394ceb..4d9c5e21ba78 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -228,10 +228,8 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn) */ static void rxrpc_call_is_secure(struct rxrpc_call *call) { - if (call && __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SECURING) { - rxrpc_set_call_state(call, RXRPC_CALL_SERVER_RECV_REQUEST); + if (call && __test_and_clear_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags)) rxrpc_notify_socket(call); - } } /* @@ -272,6 +270,7 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, * we've already received the packet, put it on the * front of the queue. */ + sp->conn = rxrpc_get_connection(conn, rxrpc_conn_get_poke_secured); skb->mark = RXRPC_SKB_MARK_SERVICE_CONN_SECURED; rxrpc_get_skb(skb, rxrpc_skb_get_conn_secured); skb_queue_head(&conn->local->rx_queue, skb); @@ -437,14 +436,16 @@ void rxrpc_input_conn_event(struct rxrpc_connection *conn, struct sk_buff *skb) if (test_and_clear_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events)) rxrpc_abort_calls(conn); - switch (skb->mark) { - case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: - if (conn->state != RXRPC_CONN_SERVICE) - break; + if (skb) { + switch (skb->mark) { + case RXRPC_SKB_MARK_SERVICE_CONN_SECURED: + if (conn->state != RXRPC_CONN_SERVICE) + break; - for (loop = 0; loop < RXRPC_MAXCALLS; loop++) - rxrpc_call_is_secure(conn->channels[loop].call); - break; + for (loop = 0; loop < RXRPC_MAXCALLS; loop++) + rxrpc_call_is_secure(conn->channels[loop].call); + break; + } } /* Process delayed ACKs whose time has come. */ diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c index 7eba4d7d9a38..2f1fd1e2e7e4 100644 --- a/net/rxrpc/conn_object.c +++ b/net/rxrpc/conn_object.c @@ -67,6 +67,7 @@ struct rxrpc_connection *rxrpc_alloc_connection(struct rxrpc_net *rxnet, INIT_WORK(&conn->destructor, rxrpc_clean_up_connection); INIT_LIST_HEAD(&conn->proc_link); INIT_LIST_HEAD(&conn->link); + INIT_LIST_HEAD(&conn->attend_link); mutex_init(&conn->security_lock); mutex_init(&conn->tx_data_alloc_lock); skb_queue_head_init(&conn->rx_queue); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 4974b5accafa..24aceb183c2c 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -448,11 +448,19 @@ static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_skb_priv *sp = rxrpc_skb(skb); bool last = sp->hdr.flags & RXRPC_LAST_PACKET; - skb_queue_tail(&call->recvmsg_queue, skb); + spin_lock_irq(&call->recvmsg_queue.lock); + + __skb_queue_tail(&call->recvmsg_queue, skb); rxrpc_input_update_ack_window(call, window, wtop); trace_rxrpc_receive(call, last ? why + 1 : why, sp->hdr.serial, sp->hdr.seq); if (last) + /* Change the state inside the lock so that recvmsg syncs + * correctly with it and using sendmsg() to send a reply + * doesn't race. + */ rxrpc_end_rx_phase(call, sp->hdr.serial); + + spin_unlock_irq(&call->recvmsg_queue.lock); } /* @@ -657,7 +665,7 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb rxrpc_propose_delay_ACK(call, sp->hdr.serial, rxrpc_propose_ack_input_data); } - if (notify) { + if (notify && !test_bit(RXRPC_CALL_CONN_CHALLENGING, &call->flags)) { trace_rxrpc_notify_socket(call->debug_id, sp->hdr.serial); rxrpc_notify_socket(call); } @@ -802,9 +810,7 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb if (max_mtu < peer->max_data) { trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_mtu, rxrpc_pmtud_reduce_ack); - write_seqcount_begin(&peer->mtu_lock); peer->max_data = max_mtu; - write_seqcount_end(&peer->mtu_lock); } max_data = umin(max_mtu, peer->max_data); diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 6f7a125d6e90..95905b85a8d7 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -428,13 +428,13 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call) static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_send_data_req *req, struct rxrpc_txbuf *txb, + struct rxrpc_wire_header *whdr, rxrpc_serial_t serial, int subpkt) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; - struct rxrpc_jumbo_header *jumbo = (void *)(whdr + 1) - sizeof(*jumbo); + struct rxrpc_jumbo_header *jumbo = txb->data - sizeof(*jumbo); enum rxrpc_req_ack_trace why; struct rxrpc_connection *conn = call->conn; - struct kvec *kv = &call->local->kvec[subpkt]; + struct kvec *kv = &call->local->kvec[1 + subpkt]; size_t len = txb->pkt_len; bool last; u8 flags; @@ -491,18 +491,15 @@ static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call, } dont_set_request_ack: - /* The jumbo header overlays the wire header in the txbuf. */ + /* There's a jumbo header prepended to the data if we need it. */ if (subpkt < req->n - 1) flags |= RXRPC_JUMBO_PACKET; else flags &= ~RXRPC_JUMBO_PACKET; if (subpkt == 0) { whdr->flags = flags; - whdr->serial = htonl(txb->serial); whdr->cksum = txb->cksum; - whdr->serviceId = htons(conn->service_id); - kv->iov_base = whdr; - len += sizeof(*whdr); + kv->iov_base = txb->data; } else { jumbo->flags = flags; jumbo->pad = 0; @@ -535,7 +532,9 @@ static unsigned int rxrpc_prepare_txqueue(struct rxrpc_txqueue *tq, /* * Prepare a (jumbo) packet for transmission. */ -static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req) +static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, + struct rxrpc_send_data_req *req, + struct rxrpc_wire_header *whdr) { struct rxrpc_txqueue *tq = req->tq; rxrpc_serial_t serial; @@ -549,6 +548,18 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se /* Each transmission of a Tx packet needs a new serial number */ serial = rxrpc_get_next_serials(call->conn, req->n); + whdr->epoch = htonl(call->conn->proto.epoch); + whdr->cid = htonl(call->cid); + whdr->callNumber = htonl(call->call_id); + whdr->seq = htonl(seq); + whdr->serial = htonl(serial); + whdr->type = RXRPC_PACKET_TYPE_DATA; + whdr->flags = 0; + whdr->userStatus = 0; + whdr->securityIndex = call->security_ix; + whdr->_rsvd = 0; + whdr->serviceId = htons(call->conn->service_id); + call->tx_last_serial = serial + req->n - 1; call->tx_last_sent = req->now; xmit_ts = rxrpc_prepare_txqueue(tq, req); @@ -576,7 +587,7 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se if (i + 1 == req->n) /* Only sample the last subpacket in a jumbo. */ __set_bit(ix, &tq->rtt_samples); - len += rxrpc_prepare_data_subpacket(call, req, txb, serial, i); + len += rxrpc_prepare_data_subpacket(call, req, txb, whdr, serial, i); serial++; seq++; i++; @@ -618,6 +629,7 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se } rxrpc_set_keepalive(call, req->now); + page_frag_free(whdr); return len; } @@ -626,25 +638,33 @@ static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_se */ void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req) { + struct rxrpc_wire_header *whdr; struct rxrpc_connection *conn = call->conn; enum rxrpc_tx_point frag; struct rxrpc_txqueue *tq = req->tq; struct rxrpc_txbuf *txb; struct msghdr msg; rxrpc_seq_t seq = req->seq; - size_t len; + size_t len = sizeof(*whdr); bool new_call = test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags); int ret, stat_ix; _enter("%x,%x-%x", tq->qbase, seq, seq + req->n - 1); + whdr = page_frag_alloc(&call->local->tx_alloc, sizeof(*whdr), GFP_NOFS); + if (!whdr) + return; /* Drop the packet if no memory. */ + + call->local->kvec[0].iov_base = whdr; + call->local->kvec[0].iov_len = sizeof(*whdr); + stat_ix = umin(req->n, ARRAY_SIZE(call->rxnet->stat_tx_jumbo)) - 1; atomic_inc(&call->rxnet->stat_tx_jumbo[stat_ix]); - len = rxrpc_prepare_data_packet(call, req); + len += rxrpc_prepare_data_packet(call, req, whdr); txb = tq->bufs[seq & RXRPC_TXQ_MASK]; - iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, req->n, len); + iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, 1 + req->n, len); msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; @@ -695,13 +715,13 @@ void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req if (ret == -EMSGSIZE) { rxrpc_inc_stat(call->rxnet, stat_tx_data_send_msgsize); - trace_rxrpc_tx_packet(call->debug_id, call->local->kvec[0].iov_base, frag); + trace_rxrpc_tx_packet(call->debug_id, whdr, frag); ret = 0; } else if (ret < 0) { rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail); trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, frag); } else { - trace_rxrpc_tx_packet(call->debug_id, call->local->kvec[0].iov_base, frag); + trace_rxrpc_tx_packet(call->debug_id, whdr, frag); } rxrpc_tx_backoff(call, ret); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index d82e44a3901b..7f4729234957 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -130,9 +130,7 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) peer->pmtud_bad = max_data + 1; trace_rxrpc_pmtud_reduce(peer, 0, max_data, rxrpc_pmtud_reduce_icmp); - write_seqcount_begin(&peer->mtu_lock); peer->max_data = max_data; - write_seqcount_end(&peer->mtu_lock); } } @@ -169,6 +167,13 @@ void rxrpc_input_error(struct rxrpc_local *local, struct sk_buff *skb) goto out; } + if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 && + serr->ee.ee_type == ICMPV6_PKT_TOOBIG && + serr->ee.ee_code == 0)) { + rxrpc_adjust_mtu(peer, serr->ee.ee_info); + goto out; + } + rxrpc_store_error(peer, skb); out: rxrpc_put_peer(peer, rxrpc_peer_put_input_error); @@ -246,7 +251,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, bool use; int slot; - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); while (!list_empty(collector)) { peer = list_entry(collector->next, @@ -257,7 +262,7 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, continue; use = __rxrpc_use_local(peer->local, rxrpc_local_use_peer_keepalive); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); if (use) { keepalive_at = peer->last_tx_at + RXRPC_KEEPALIVE_TIME; @@ -277,17 +282,17 @@ static void rxrpc_peer_keepalive_dispatch(struct rxrpc_net *rxnet, */ slot += cursor; slot &= mask; - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive[slot & mask]); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxrpc_unuse_local(peer->local, rxrpc_local_unuse_peer_keepalive); } rxrpc_put_peer(peer, rxrpc_peer_put_keepalive); - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); } - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); } /* @@ -317,7 +322,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) * second; the bucket at cursor + 1 goes at now + 1s and so * on... */ - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); list_splice_init(&rxnet->peer_keepalive_new, &collector); stop = cursor + ARRAY_SIZE(rxnet->peer_keepalive); @@ -329,7 +334,7 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work) } base = now; - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxnet->peer_keepalive_base = base; rxnet->peer_keepalive_cursor = cursor; @@ -401,13 +406,8 @@ void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t a } max_data = umin(max_data, peer->ackr_max_data); - if (max_data != peer->max_data) { - preempt_disable(); - write_seqcount_begin(&peer->mtu_lock); + if (max_data != peer->max_data) peer->max_data = max_data; - write_seqcount_end(&peer->mtu_lock); - preempt_enable(); - } jumbo = max_data + sizeof(struct rxrpc_jumbo_header); jumbo /= RXRPC_JUMBO_SUBPKTLEN; diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c index e1c63129586b..56e09d161a97 100644 --- a/net/rxrpc/peer_object.c +++ b/net/rxrpc/peer_object.c @@ -235,7 +235,6 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp, peer->service_conns = RB_ROOT; seqlock_init(&peer->service_conn_lock); spin_lock_init(&peer->lock); - seqcount_init(&peer->mtu_lock); peer->debug_id = atomic_inc_return(&rxrpc_debug_id); peer->recent_srtt_us = UINT_MAX; peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW; @@ -360,7 +359,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, return NULL; } - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); /* Need to check that we aren't racing with someone else */ peer = __rxrpc_lookup_peer_rcu(local, srx, hash_key); @@ -373,7 +372,7 @@ struct rxrpc_peer *rxrpc_lookup_peer(struct rxrpc_local *local, &rxnet->peer_keepalive_new); } - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); if (peer) rxrpc_free_peer(candidate); @@ -423,10 +422,10 @@ static void __rxrpc_put_peer(struct rxrpc_peer *peer) ASSERT(hlist_empty(&peer->error_targets)); - spin_lock(&rxnet->peer_hash_lock); + spin_lock_bh(&rxnet->peer_hash_lock); hash_del_rcu(&peer->hash_link); list_del_init(&peer->keepalive_link); - spin_unlock(&rxnet->peer_hash_lock); + spin_unlock_bh(&rxnet->peer_hash_lock); rxrpc_free_peer(peer); } diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 62b09d23ec08..6cb37b0eb77f 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -257,8 +257,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, struct rxrpc_txbuf *txb, struct skcipher_request *req) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; - struct rxkad_level1_hdr *hdr = (void *)(whdr + 1); + struct rxkad_level1_hdr *hdr = txb->data; struct rxrpc_crypt iv; struct scatterlist sg; size_t pad; @@ -274,7 +273,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, pad = RXKAD_ALIGN - pad; pad &= RXKAD_ALIGN - 1; if (pad) { - memset(txb->kvec[0].iov_base + txb->offset, 0, pad); + memset(txb->data + txb->offset, 0, pad); txb->pkt_len += pad; } @@ -300,8 +299,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, struct skcipher_request *req) { const struct rxrpc_key_token *token; - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; - struct rxkad_level2_hdr *rxkhdr = (void *)(whdr + 1); + struct rxkad_level2_hdr *rxkhdr = txb->data; struct rxrpc_crypt iv; struct scatterlist sg; size_t content, pad; @@ -319,7 +317,7 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call, txb->pkt_len = round_up(content, RXKAD_ALIGN); pad = txb->pkt_len - content; if (pad) - memset(txb->kvec[0].iov_base + txb->offset, 0, pad); + memset(txb->data + txb->offset, 0, pad); /* encrypt from the session key */ token = call->conn->key->payload.data[0]; @@ -407,9 +405,8 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb) /* Clear excess space in the packet */ if (txb->pkt_len < txb->alloc_size) { - struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base; size_t gap = txb->alloc_size - txb->pkt_len; - void *p = whdr + 1; + void *p = txb->data; memset(p + txb->pkt_len, 0, gap); } diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 7ef93407be83..e848a4777b8c 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -478,6 +478,18 @@ static int rxperf_deliver_request(struct rxperf_call *call) call->unmarshal++; fallthrough; case 2: + ret = rxperf_extract_data(call, true); + if (ret < 0) + return ret; + + /* Deal with the terminal magic cookie. */ + call->iov_len = 4; + call->kvec[0].iov_len = call->iov_len; + call->kvec[0].iov_base = call->tmp; + iov_iter_kvec(&call->iter, READ, call->kvec, 1, call->iov_len); + call->unmarshal++; + fallthrough; + case 3: ret = rxperf_extract_data(call, false); if (ret < 0) return ret; diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 0e8da909d4f2..84dc6c94f23b 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -419,7 +419,7 @@ reload: size_t copy = umin(txb->space, msg_data_left(msg)); _debug("add %zu", copy); - if (!copy_from_iter_full(txb->kvec[0].iov_base + txb->offset, + if (!copy_from_iter_full(txb->data + txb->offset, copy, &msg->msg_iter)) goto efault; _debug("added"); @@ -445,8 +445,6 @@ reload: ret = call->security->secure_packet(call, txb); if (ret < 0) goto out; - - txb->kvec[0].iov_len += txb->len; rxrpc_queue_packet(rx, call, txb, notify_end_tx); txb = NULL; } @@ -707,7 +705,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) } else { switch (rxrpc_call_state(call)) { case RXRPC_CALL_CLIENT_AWAIT_CONN: - case RXRPC_CALL_SERVER_SECURING: + case RXRPC_CALL_SERVER_RECV_REQUEST: if (p.command == RXRPC_CMD_SEND_ABORT) break; fallthrough; diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c index 131d9e55c8e9..c550991d48fa 100644 --- a/net/rxrpc/txbuf.c +++ b/net/rxrpc/txbuf.c @@ -19,17 +19,19 @@ atomic_t rxrpc_nr_txbuf; struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size, size_t data_align, gfp_t gfp) { - struct rxrpc_wire_header *whdr; struct rxrpc_txbuf *txb; - size_t total, hoff; + size_t total, doff, jsize = sizeof(struct rxrpc_jumbo_header); void *buf; txb = kzalloc(sizeof(*txb), gfp); if (!txb) return NULL; - hoff = round_up(sizeof(*whdr), data_align) - sizeof(*whdr); - total = hoff + sizeof(*whdr) + data_size; + /* We put a jumbo header in the buffer, but not a full wire header to + * avoid delayed-corruption problems with zerocopy. + */ + doff = round_up(jsize, data_align); + total = doff + data_size; data_align = umax(data_align, L1_CACHE_BYTES); mutex_lock(&call->conn->tx_data_alloc_lock); @@ -41,30 +43,15 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_ return NULL; } - whdr = buf + hoff; - refcount_set(&txb->ref, 1); txb->call_debug_id = call->debug_id; txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids); txb->alloc_size = data_size; txb->space = data_size; - txb->offset = sizeof(*whdr); + txb->offset = 0; txb->flags = call->conn->out_clientflag; txb->seq = call->send_top + 1; - txb->nr_kvec = 1; - txb->kvec[0].iov_base = whdr; - txb->kvec[0].iov_len = sizeof(*whdr); - - whdr->epoch = htonl(call->conn->proto.epoch); - whdr->cid = htonl(call->cid); - whdr->callNumber = htonl(call->call_id); - whdr->seq = htonl(txb->seq); - whdr->type = RXRPC_PACKET_TYPE_DATA; - whdr->flags = 0; - whdr->userStatus = 0; - whdr->securityIndex = call->security_ix; - whdr->_rsvd = 0; - whdr->serviceId = htons(call->dest_srx.srx_service); + txb->data = buf + doff; trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1, rxrpc_txbuf_alloc_data); @@ -90,14 +77,10 @@ void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what) static void rxrpc_free_txbuf(struct rxrpc_txbuf *txb) { - int i; - trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0, rxrpc_txbuf_free); - for (i = 0; i < txb->nr_kvec; i++) - if (txb->kvec[i].iov_base && - !is_zero_pfn(page_to_pfn(virt_to_page(txb->kvec[i].iov_base)))) - page_frag_free(txb->kvec[i].iov_base); + if (txb->data) + page_frag_free(txb->data); kfree(txb); atomic_dec(&rxrpc_nr_txbuf); } diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index af7c99845948..ae5dea7c48a8 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -571,8 +571,8 @@ static void tunnel_key_release(struct tc_action *a) static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { + const u8 *src = ip_tunnel_info_opts(info); int len = info->options_len; - u8 *src = (u8 *)(info + 1); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_GENEVE); @@ -580,7 +580,7 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, return -EMSGSIZE; while (len > 0) { - struct geneve_opt *opt = (struct geneve_opt *)src; + const struct geneve_opt *opt = (const struct geneve_opt *)src; if (nla_put_be16(skb, TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS, opt->opt_class) || @@ -603,7 +603,7 @@ static int tunnel_key_geneve_opts_dump(struct sk_buff *skb, static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { - struct vxlan_metadata *md = (struct vxlan_metadata *)(info + 1); + const struct vxlan_metadata *md = ip_tunnel_info_opts(info); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_VXLAN); @@ -622,7 +622,7 @@ static int tunnel_key_vxlan_opts_dump(struct sk_buff *skb, static int tunnel_key_erspan_opts_dump(struct sk_buff *skb, const struct ip_tunnel_info *info) { - struct erspan_metadata *md = (struct erspan_metadata *)(info + 1); + const struct erspan_metadata *md = ip_tunnel_info_opts(info); struct nlattr *start; start = nla_nest_start_noflag(skb, TCA_TUNNEL_KEY_ENC_OPTS_ERSPAN); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 7578e27260c9..4f648af8cfaa 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -97,7 +97,7 @@ tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp, err = xa_alloc_cyclic(&tcf_exts_miss_cookies_xa, &n->miss_cookie_base, n, xa_limit_32b, &next, GFP_KERNEL); - if (err) + if (err < 0) goto err_xa_alloc; exts->miss_cookie_node = n; @@ -390,6 +390,7 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, tp->protocol = protocol; tp->prio = prio; tp->chain = chain; + tp->usesw = !tp->ops->reoffload; spin_lock_init(&tp->lock); refcount_set(&tp->refcnt, 1); @@ -410,39 +411,31 @@ static void tcf_proto_get(struct tcf_proto *tp) refcount_inc(&tp->refcnt); } -static void tcf_maintain_bypass(struct tcf_block *block) +static void tcf_proto_count_usesw(struct tcf_proto *tp, bool add) { - int filtercnt = atomic_read(&block->filtercnt); - int skipswcnt = atomic_read(&block->skipswcnt); - bool bypass_wanted = filtercnt > 0 && filtercnt == skipswcnt; - - if (bypass_wanted != block->bypass_wanted) { #ifdef CONFIG_NET_CLS_ACT - if (bypass_wanted) - static_branch_inc(&tcf_bypass_check_needed_key); - else - static_branch_dec(&tcf_bypass_check_needed_key); -#endif - block->bypass_wanted = bypass_wanted; + struct tcf_block *block = tp->chain->block; + bool counted = false; + + if (!add) { + if (tp->usesw && tp->counted) { + if (!atomic_dec_return(&block->useswcnt)) + static_branch_dec(&tcf_sw_enabled_key); + tp->counted = false; + } + return; } -} - -static void tcf_block_filter_cnt_update(struct tcf_block *block, bool *counted, bool add) -{ - lockdep_assert_not_held(&block->cb_lock); - down_write(&block->cb_lock); - if (*counted != add) { - if (add) { - atomic_inc(&block->filtercnt); - *counted = true; - } else { - atomic_dec(&block->filtercnt); - *counted = false; - } + spin_lock(&tp->lock); + if (tp->usesw && !tp->counted) { + counted = true; + tp->counted = true; } - tcf_maintain_bypass(block); - up_write(&block->cb_lock); + spin_unlock(&tp->lock); + + if (counted && atomic_inc_return(&block->useswcnt) == 1) + static_branch_inc(&tcf_sw_enabled_key); +#endif } static void tcf_chain_put(struct tcf_chain *chain); @@ -451,7 +444,7 @@ static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held, bool sig_destroy, struct netlink_ext_ack *extack) { tp->ops->destroy(tp, rtnl_held, extack); - tcf_block_filter_cnt_update(tp->chain->block, &tp->counted, false); + tcf_proto_count_usesw(tp, false); if (sig_destroy) tcf_proto_signal_destroyed(tp->chain, tp); tcf_chain_put(tp->chain); @@ -2409,7 +2402,7 @@ replay: tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, false, rtnl_held, extack); tfilter_put(tp, fh); - tcf_block_filter_cnt_update(block, &tp->counted, true); + tcf_proto_count_usesw(tp, true); /* q pointer is NULL for shared blocks */ if (q) q->flags &= ~TCQ_F_CAN_BYPASS; @@ -3532,8 +3525,6 @@ static void tcf_block_offload_inc(struct tcf_block *block, u32 *flags) if (*flags & TCA_CLS_FLAGS_IN_HW) return; *flags |= TCA_CLS_FLAGS_IN_HW; - if (tc_skip_sw(*flags)) - atomic_inc(&block->skipswcnt); atomic_inc(&block->offloadcnt); } @@ -3542,8 +3533,6 @@ static void tcf_block_offload_dec(struct tcf_block *block, u32 *flags) if (!(*flags & TCA_CLS_FLAGS_IN_HW)) return; *flags &= ~TCA_CLS_FLAGS_IN_HW; - if (tc_skip_sw(*flags)) - atomic_dec(&block->skipswcnt); atomic_dec(&block->offloadcnt); } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 1941ebec23ff..7fbe42f0e5c2 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -509,6 +509,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(prog->gen_flags)) prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, prog->gen_flags); + if (oldprog) { idr_replace(&head->handle_idr, prog, handle); list_replace_rcu(&oldprog->link, &prog->link); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1008ec8a464c..03505673d523 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -2503,6 +2503,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(fnew->flags)) fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, fnew->flags); + spin_lock(&tp->lock); /* tp was deleted concurrently. -EAGAIN will cause caller to lookup diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 9f1e62ca508d..f03bf5da39ee 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -228,6 +228,8 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(new->flags)) new->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, new->flags); + *arg = head; rcu_assign_pointer(tp->root, new); return 0; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index d3a03c57545b..2a1c00048fd6 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -951,6 +951,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(new->flags)) new->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, new->flags); + u32_replace_knode(tp, tp_c, new); tcf_unbind_filter(tp, &n->res); tcf_exts_get_net(&n->exts); @@ -1164,6 +1166,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(n->flags)) n->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, n->flags); + ins = &ht->ht[TC_U32_HASH(handle)]; for (pins = rtnl_dereference(*ins); pins; ins = &pins->next, pins = rtnl_dereference(*ins)) diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 8996c73c9779..3f2e707a11d1 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -460,7 +460,7 @@ META_COLLECTOR(int_sk_fwd_alloc) *err = -1; return; } - dst->value = sk_forward_alloc_get(sk); + dst->value = READ_ONCE(sk->sk_forward_alloc); } META_COLLECTOR(int_sk_sndbuf) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 5d017c06a96f..e3e91cf867eb 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1664,6 +1664,10 @@ replay: q = qdisc_lookup(dev, tcm->tcm_handle); if (!q) goto create_n_graft; + if (q->parent != tcm->tcm_parent) { + NL_SET_ERR_MSG(extack, "Cannot move an existing qdisc to a different parent"); + return -EINVAL; + } if (n->nlmsg_flags & NLM_F_EXCL) { NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override"); return -EEXIST; diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index f80bc05d4c5a..516038a44163 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -91,6 +91,8 @@ ets_class_from_arg(struct Qdisc *sch, unsigned long arg) { struct ets_sched *q = qdisc_priv(sch); + if (arg == 0 || arg > q->nbands) + return NULL; return &q->classes[arg - 1]; } diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index b50b2c2cc09b..e6bfd39ff339 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -40,6 +40,9 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch, { unsigned int prev_backlog; + if (unlikely(READ_ONCE(sch->limit) == 0)) + return qdisc_drop(skb, sch, to_free); + if (likely(sch->q.qlen < READ_ONCE(sch->limit))) return qdisc_enqueue_tail(skb, sch); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 71ec9986ed37..fdd79d3ccd8c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -749,9 +749,9 @@ deliver: if (err != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(err)) qdisc_qstats_drop(sch); - qdisc_tree_reduce_backlog(sch, 1, pkt_len); sch->qstats.backlog -= pkt_len; sch->q.qlen--; + qdisc_tree_reduce_backlog(sch, 1, pkt_len); } goto tfifo_dequeue; } diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 6a07cdbdb9e1..2cfbc977fe6d 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -447,7 +447,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (q->wsum + delta_w > QFQ_MAX_WSUM) { NL_SET_ERR_MSG_FMT_MOD(extack, - "total weight out of range (%d + %u)\n", + "total weight out of range (%d + %u)", delta_w, q->wsum); return -EINVAL; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 29727ed1008e..5407a3922101 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -185,12 +185,9 @@ static void sctp_v4_copy_ip_options(struct sock *sk, struct sock *newsk) rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { - newopt = sock_kmalloc(newsk, sizeof(*inet_opt) + + newopt = sock_kmemdup(newsk, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen, GFP_ATOMIC); - if (newopt) - memcpy(newopt, inet_opt, sizeof(*inet_opt) + - inet_opt->opt.optlen); - else + if (!newopt) pr_err("%s: Failed to copy ip options\n", __func__); } RCU_INIT_POINTER(newinet->inet_opt, newopt); diff --git a/net/sctp/stream.c b/net/sctp/stream.c index c241cc552e8d..bfcff6d6a438 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -735,7 +735,7 @@ struct sctp_chunk *sctp_process_strreset_tsnreq( * value SHOULD be the smallest TSN not acknowledged by the * receiver of the request plus 2^31. */ - init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1 << 31); + init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1U << 31); sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, init_tsn, GFP_ATOMIC); diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ca6984541edb..3e6cb35baf25 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3337,10 +3337,7 @@ int smc_create_clcsk(struct net *net, struct sock *sk, int family) * which need net ref. */ sk = smc->clcsock->sk; - __netns_tracker_free(net, &sk->ns_tracker, false); - sk->sk_net_refcnt = 1; - get_net_track(net, &sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sk); return 0; } diff --git a/net/socket.c b/net/socket.c index 262a28b59c7f..b64ecf2722e7 100644 --- a/net/socket.c +++ b/net/socket.c @@ -479,6 +479,11 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) sock->file = file; file->private_data = sock; stream_open(SOCK_INODE(sock), file); + /* + * Disable permission and pre-content events, but enable legacy + * inotify events for legacy users. + */ + file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM); return file; } EXPORT_SYMBOL(sock_alloc_file); @@ -675,17 +680,8 @@ void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags) { u8 flags = *tx_flags; - if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) { - flags |= SKBTX_HW_TSTAMP; - - /* PTP hardware clocks can provide a free running cycle counter - * as a time base for virtual clocks. Tell driver to use the - * free running cycle counter for timestamp if socket is bound - * to virtual clock. - */ - if (tsflags & SOF_TIMESTAMPING_BIND_PHC) - flags |= SKBTX_HW_TSTAMP_USE_CYCLES; - } + if (tsflags & SOF_TIMESTAMPING_TX_HARDWARE) + flags |= SKBTX_HW_TSTAMP_NOBPF; if (tsflags & SOF_TIMESTAMPING_TX_SOFTWARE) flags |= SKBTX_SW_TSTAMP; diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 8299ceb3e373..95696f42647e 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -347,7 +347,10 @@ static int strp_read_sock(struct strparser *strp) struct socket *sock = strp->sk->sk_socket; read_descriptor_t desc; - if (unlikely(!sock || !sock->ops || !sock->ops->read_sock)) + if (unlikely(!sock || !sock->ops)) + return -EBUSY; + + if (unlikely(!strp->cb.read_sock && !sock->ops->read_sock)) return -EBUSY; desc.arg.data = strp; @@ -355,7 +358,10 @@ static int strp_read_sock(struct strparser *strp) desc.count = 1; /* give more than one skb per call */ /* sk should be locked here, so okay to do read_sock */ - sock->ops->read_sock(strp->sk, &desc, strp_recv); + if (strp->cb.read_sock) + strp->cb.read_sock(strp, &desc, strp_recv); + else + sock->ops->read_sock(strp->sk, &desc, strp_recv); desc.error = strp->cb.read_sock_done(strp, desc.error); @@ -468,6 +474,7 @@ int strp_init(struct strparser *strp, struct sock *sk, strp->cb.unlock = cb->unlock ? : strp_sock_unlock; strp->cb.rcv_msg = cb->rcv_msg; strp->cb.parse_msg = cb->parse_msg; + strp->cb.read_sock = cb->read_sock; strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done; strp->cb.abort_parser = cb->abort_parser ? : strp_abort_strp; diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile index ad1736d93b76..452f67deebc6 100644 --- a/net/sunrpc/auth_gss/Makefile +++ b/net/sunrpc/auth_gss/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o -auth_rpcgss-y := auth_gss.o gss_generic_token.o \ +auth_rpcgss-y := auth_gss.o \ gss_mech_switch.o svcauth_gss.o \ gss_rpc_upcall.o gss_rpc_xdr.o trace.o diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c deleted file mode 100644 index 4a4082bb22ad..000000000000 --- a/net/sunrpc/auth_gss/gss_generic_token.c +++ /dev/null @@ -1,231 +0,0 @@ -/* - * linux/net/sunrpc/gss_generic_token.c - * - * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c - * - * Copyright (c) 2000 The Regents of the University of Michigan. - * All rights reserved. - * - * Andy Adamson <andros@umich.edu> - */ - -/* - * Copyright 1993 by OpenVision Technologies, Inc. - * - * Permission to use, copy, modify, distribute, and sell this software - * and its documentation for any purpose is hereby granted without fee, - * provided that the above copyright notice appears in all copies and - * that both that copyright notice and this permission notice appear in - * supporting documentation, and that the name of OpenVision not be used - * in advertising or publicity pertaining to distribution of the software - * without specific, written prior permission. OpenVision makes no - * representations about the suitability of this software for any - * purpose. It is provided "as is" without express or implied warranty. - * - * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, - * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO - * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF - * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR - * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR - * PERFORMANCE OF THIS SOFTWARE. - */ - -#include <linux/types.h> -#include <linux/module.h> -#include <linux/string.h> -#include <linux/sunrpc/sched.h> -#include <linux/sunrpc/gss_asn1.h> - - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_AUTH -#endif - - -/* TWRITE_STR from gssapiP_generic.h */ -#define TWRITE_STR(ptr, str, len) \ - memcpy((ptr), (char *) (str), (len)); \ - (ptr) += (len); - -/* XXXX this code currently makes the assumption that a mech oid will - never be longer than 127 bytes. This assumption is not inherent in - the interfaces, so the code can be fixed if the OSI namespace - balloons unexpectedly. */ - -/* Each token looks like this: - -0x60 tag for APPLICATION 0, SEQUENCE - (constructed, definite-length) - <length> possible multiple bytes, need to parse/generate - 0x06 tag for OBJECT IDENTIFIER - <moid_length> compile-time constant string (assume 1 byte) - <moid_bytes> compile-time constant string - <inner_bytes> the ANY containing the application token - bytes 0,1 are the token type - bytes 2,n are the token data - -For the purposes of this abstraction, the token "header" consists of -the sequence tag and length octets, the mech OID DER encoding, and the -first two inner bytes, which indicate the token type. The token -"body" consists of everything else. - -*/ - -static int -der_length_size( int length) -{ - if (length < (1<<7)) - return 1; - else if (length < (1<<8)) - return 2; -#if (SIZEOF_INT == 2) - else - return 3; -#else - else if (length < (1<<16)) - return 3; - else if (length < (1<<24)) - return 4; - else - return 5; -#endif -} - -static void -der_write_length(unsigned char **buf, int length) -{ - if (length < (1<<7)) { - *(*buf)++ = (unsigned char) length; - } else { - *(*buf)++ = (unsigned char) (der_length_size(length)+127); -#if (SIZEOF_INT > 2) - if (length >= (1<<24)) - *(*buf)++ = (unsigned char) (length>>24); - if (length >= (1<<16)) - *(*buf)++ = (unsigned char) ((length>>16)&0xff); -#endif - if (length >= (1<<8)) - *(*buf)++ = (unsigned char) ((length>>8)&0xff); - *(*buf)++ = (unsigned char) (length&0xff); - } -} - -/* returns decoded length, or < 0 on failure. Advances buf and - decrements bufsize */ - -static int -der_read_length(unsigned char **buf, int *bufsize) -{ - unsigned char sf; - int ret; - - if (*bufsize < 1) - return -1; - sf = *(*buf)++; - (*bufsize)--; - if (sf & 0x80) { - if ((sf &= 0x7f) > ((*bufsize)-1)) - return -1; - if (sf > SIZEOF_INT) - return -1; - ret = 0; - for (; sf; sf--) { - ret = (ret<<8) + (*(*buf)++); - (*bufsize)--; - } - } else { - ret = sf; - } - - return ret; -} - -/* returns the length of a token, given the mech oid and the body size */ - -int -g_token_size(struct xdr_netobj *mech, unsigned int body_size) -{ - /* set body_size to sequence contents size */ - body_size += 2 + (int) mech->len; /* NEED overflow check */ - return 1 + der_length_size(body_size) + body_size; -} - -EXPORT_SYMBOL_GPL(g_token_size); - -/* fills in a buffer with the token header. The buffer is assumed to - be the right size. buf is advanced past the token header */ - -void -g_make_token_header(struct xdr_netobj *mech, int body_size, unsigned char **buf) -{ - *(*buf)++ = 0x60; - der_write_length(buf, 2 + mech->len + body_size); - *(*buf)++ = 0x06; - *(*buf)++ = (unsigned char) mech->len; - TWRITE_STR(*buf, mech->data, ((int) mech->len)); -} - -EXPORT_SYMBOL_GPL(g_make_token_header); - -/* - * Given a buffer containing a token, reads and verifies the token, - * leaving buf advanced past the token header, and setting body_size - * to the number of remaining bytes. Returns 0 on success, - * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the - * mechanism in the token does not match the mech argument. buf and - * *body_size are left unmodified on error. - */ -u32 -g_verify_token_header(struct xdr_netobj *mech, int *body_size, - unsigned char **buf_in, int toksize) -{ - unsigned char *buf = *buf_in; - int seqsize; - struct xdr_netobj toid; - int ret = 0; - - if ((toksize-=1) < 0) - return G_BAD_TOK_HEADER; - if (*buf++ != 0x60) - return G_BAD_TOK_HEADER; - - if ((seqsize = der_read_length(&buf, &toksize)) < 0) - return G_BAD_TOK_HEADER; - - if (seqsize != toksize) - return G_BAD_TOK_HEADER; - - if ((toksize-=1) < 0) - return G_BAD_TOK_HEADER; - if (*buf++ != 0x06) - return G_BAD_TOK_HEADER; - - if ((toksize-=1) < 0) - return G_BAD_TOK_HEADER; - toid.len = *buf++; - - if ((toksize-=toid.len) < 0) - return G_BAD_TOK_HEADER; - toid.data = buf; - buf+=toid.len; - - if (! g_OID_equal(&toid, mech)) - ret = G_WRONG_MECH; - - /* G_WRONG_MECH is not returned immediately because it's more important - to return G_BAD_TOK_HEADER if the token header is in fact bad */ - - if ((toksize-=2) < 0) - return G_BAD_TOK_HEADER; - - if (ret) - return ret; - - *buf_in = buf; - *body_size = toksize; - - return ret; -} - -EXPORT_SYMBOL_GPL(g_verify_token_header); diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index d2b02710ab07..9a27201638e2 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -442,35 +442,6 @@ encryptor(struct scatterlist *sg, void *data) return 0; } -int -gss_encrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf, - int offset, struct page **pages) -{ - int ret; - struct encryptor_desc desc; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - - BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0); - - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - - memset(desc.iv, 0, sizeof(desc.iv)); - desc.req = req; - desc.pos = offset; - desc.outbuf = buf; - desc.pages = pages; - desc.fragno = 0; - desc.fraglen = 0; - - sg_init_table(desc.infrags, 4); - sg_init_table(desc.outfrags, 4); - - ret = xdr_process_buf(buf, offset, buf->len - offset, encryptor, &desc); - skcipher_request_zero(req); - return ret; -} - struct decryptor_desc { u8 iv[GSS_KRB5_MAX_BLOCKSIZE]; struct skcipher_request *req; @@ -525,32 +496,6 @@ decryptor(struct scatterlist *sg, void *data) return 0; } -int -gss_decrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf, - int offset) -{ - int ret; - struct decryptor_desc desc; - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); - - /* XXXJBF: */ - BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0); - - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - - memset(desc.iv, 0, sizeof(desc.iv)); - desc.req = req; - desc.fragno = 0; - desc.fraglen = 0; - - sg_init_table(desc.frags, 4); - - ret = xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc); - skcipher_request_zero(req); - return ret; -} - /* * This function makes the assumption that it was ultimately called * from gss_wrap(). diff --git a/net/sunrpc/auth_gss/gss_krb5_internal.h b/net/sunrpc/auth_gss/gss_krb5_internal.h index 3afd4065bf3d..a47e9ec228a5 100644 --- a/net/sunrpc/auth_gss/gss_krb5_internal.h +++ b/net/sunrpc/auth_gss/gss_krb5_internal.h @@ -172,13 +172,6 @@ u32 krb5_decrypt(struct crypto_sync_skcipher *key, void *iv, void *in, int xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen); -int gss_encrypt_xdr_buf(struct crypto_sync_skcipher *tfm, - struct xdr_buf *outbuf, int offset, - struct page **pages); - -int gss_decrypt_xdr_buf(struct crypto_sync_skcipher *tfm, - struct xdr_buf *inbuf, int offset); - u32 gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, struct page **pages); diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index fae632da1058..c84d0cf61980 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -13,7 +13,6 @@ #include <linux/module.h> #include <linux/oid_registry.h> #include <linux/sunrpc/msg_prot.h> -#include <linux/sunrpc/gss_asn1.h> #include <linux/sunrpc/auth_gss.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/gss_err.h> diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 059f6ef1ad18..7ce5e28a6c03 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -281,21 +281,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h return rv; } -/* - * This is the generic cache management routine for all - * the authentication caches. - * It checks the currency of a cache item and will (later) - * initiate an upcall to fill it if needed. - * - * - * Returns 0 if the cache_head can be used, or cache_puts it and returns - * -EAGAIN if upcall is pending and request has been queued - * -ETIMEDOUT if upcall failed or request could not be queue or - * upcall completed but item is still invalid (implying that - * the cache item has been replaced with a newer one). - * -ENOENT if cache entry was negative - */ -int cache_check(struct cache_detail *detail, +int cache_check_rcu(struct cache_detail *detail, struct cache_head *h, struct cache_req *rqstp) { int rv; @@ -336,6 +322,31 @@ int cache_check(struct cache_detail *detail, rv = -ETIMEDOUT; } } + + return rv; +} +EXPORT_SYMBOL_GPL(cache_check_rcu); + +/* + * This is the generic cache management routine for all + * the authentication caches. + * It checks the currency of a cache item and will (later) + * initiate an upcall to fill it if needed. + * + * + * Returns 0 if the cache_head can be used, or cache_puts it and returns + * -EAGAIN if upcall is pending and request has been queued + * -ETIMEDOUT if upcall failed or request could not be queue or + * upcall completed but item is still invalid (implying that + * the cache item has been replaced with a newer one). + * -ENOENT if cache entry was negative + */ +int cache_check(struct cache_detail *detail, + struct cache_head *h, struct cache_req *rqstp) +{ + int rv; + + rv = cache_check_rcu(detail, h, rqstp); if (rv) cache_put(h, detail); return rv; @@ -1427,17 +1438,11 @@ static int c_show(struct seq_file *m, void *p) seq_printf(m, "# expiry=%lld refcnt=%d flags=%lx\n", convert_to_wallclock(cp->expiry_time), kref_read(&cp->ref), cp->flags); - if (!cache_get_rcu(cp)) - return 0; - if (cache_check(cd, cp, NULL)) - /* cache_check does a cache_put on failure */ + if (cache_check_rcu(cd, cp, NULL)) + seq_puts(m, "# "); + else if (cache_is_expired(cd, cp)) seq_puts(m, "# "); - else { - if (cache_is_expired(cd, cp)) - seq_puts(m, "# "); - cache_put(cp, cd); - } return cd->cache_show(m, cd, cp); } @@ -1669,12 +1674,14 @@ static void remove_cache_proc_entries(struct cache_detail *cd) } } -#ifdef CONFIG_PROC_FS static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) { struct proc_dir_entry *p; struct sunrpc_net *sn; + if (!IS_ENABLED(CONFIG_PROC_FS)) + return 0; + sn = net_generic(net, sunrpc_net_id); cd->procfs = proc_mkdir(cd->name, sn->proc_net_rpc); if (cd->procfs == NULL) @@ -1702,12 +1709,6 @@ out_nomem: remove_cache_proc_entries(cd); return -ENOMEM; } -#else /* CONFIG_PROC_FS */ -static int create_cache_proc_entries(struct cache_detail *cd, struct net *net) -{ - return 0; -} -#endif void __init cache_initialize(void) { diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 0090162ee8c3..2fe88ea79a70 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -958,12 +958,17 @@ void rpc_shutdown_client(struct rpc_clnt *clnt) trace_rpc_clnt_shutdown(clnt); + clnt->cl_shutdown = 1; while (!list_empty(&clnt->cl_tasks)) { rpc_killall_tasks(clnt); wait_event_timeout(destroy_wait, list_empty(&clnt->cl_tasks), 1*HZ); } + /* wait for tasks still in workqueue or waitqueue */ + wait_event_timeout(destroy_wait, + atomic_read(&clnt->cl_task_count) == 0, 1 * HZ); + rpc_release_client(clnt); } EXPORT_SYMBOL_GPL(rpc_shutdown_client); @@ -1139,6 +1144,7 @@ void rpc_task_release_client(struct rpc_task *task) list_del(&task->tk_task); spin_unlock(&clnt->cl_lock); task->tk_client = NULL; + atomic_dec(&clnt->cl_task_count); rpc_release_client(clnt); } @@ -1189,10 +1195,7 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) task->tk_flags |= RPC_TASK_TIMEOUT; if (clnt->cl_noretranstimeo) task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT; - /* Add to the client's list of all tasks */ - spin_lock(&clnt->cl_lock); - list_add_tail(&task->tk_task, &clnt->cl_tasks); - spin_unlock(&clnt->cl_lock); + atomic_inc(&clnt->cl_task_count); } static void @@ -1787,9 +1790,14 @@ call_reserveresult(struct rpc_task *task) if (status >= 0) { if (task->tk_rqstp) { task->tk_action = call_refresh; + + /* Add to the client's list of all tasks */ + spin_lock(&task->tk_client->cl_lock); + if (list_empty(&task->tk_task)) + list_add_tail(&task->tk_task, &task->tk_client->cl_tasks); + spin_unlock(&task->tk_client->cl_lock); return; } - rpc_call_rpcerror(task, -EIO); return; } @@ -1854,13 +1862,13 @@ call_refreshresult(struct rpc_task *task) fallthrough; case -EAGAIN: status = -EACCES; - fallthrough; - case -EKEYEXPIRED: if (!task->tk_cred_retry) break; task->tk_cred_retry--; trace_rpc_retry_refresh_status(task); return; + case -EKEYEXPIRED: + break; case -ENOMEM: rpc_delay(task, HZ >> 4); return; @@ -3319,8 +3327,11 @@ bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_has_addr); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -static void rpc_show_header(void) +static void rpc_show_header(struct rpc_clnt *clnt) { + printk(KERN_INFO "clnt[%pISpc] RPC tasks[%d]\n", + (struct sockaddr *)&clnt->cl_xprt->addr, + atomic_read(&clnt->cl_task_count)); printk(KERN_INFO "-pid- flgs status -client- --rqstp- " "-timeout ---ops--\n"); } @@ -3352,7 +3363,7 @@ void rpc_show_tasks(struct net *net) spin_lock(&clnt->cl_lock); list_for_each_entry(task, &clnt->cl_tasks, tk_task) { if (!header) { - rpc_show_header(); + rpc_show_header(clnt); header++; } rpc_show_task(clnt, task); diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c index a176d5a0b0ee..32417db340de 100644 --- a/net/sunrpc/debugfs.c +++ b/net/sunrpc/debugfs.c @@ -74,6 +74,9 @@ tasks_stop(struct seq_file *f, void *v) { struct rpc_clnt *clnt = f->private; spin_unlock(&clnt->cl_lock); + seq_printf(f, "clnt[%pISpc] RPC tasks[%d]\n", + (struct sockaddr *)&clnt->cl_xprt->addr, + atomic_read(&clnt->cl_task_count)); } static const struct seq_operations tasks_seq_operations = { @@ -179,6 +182,18 @@ xprt_info_show(struct seq_file *f, void *v) seq_printf(f, "addr: %s\n", xprt->address_strings[RPC_DISPLAY_ADDR]); seq_printf(f, "port: %s\n", xprt->address_strings[RPC_DISPLAY_PORT]); seq_printf(f, "state: 0x%lx\n", xprt->state); + seq_printf(f, "netns: %u\n", xprt->xprt_net->ns.inum); + + if (xprt->ops->get_srcaddr) { + int ret, buflen; + char buf[INET6_ADDRSTRLEN]; + + buflen = ARRAY_SIZE(buf); + ret = xprt->ops->get_srcaddr(xprt, buf, buflen); + if (ret < 0) + ret = sprintf(buf, "<closed>"); + seq_printf(f, "saddr: %.*s\n", ret, buf); + } return 0; } diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 7ce3721c06ca..eadc00410ebc 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -630,7 +630,7 @@ static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry) static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent, const char *name) { - struct qstr q = QSTR_INIT(name, strlen(name)); + struct qstr q = QSTR(name); struct dentry *dentry = d_hash_and_lookup(parent, &q); if (!dentry) { dentry = d_alloc(parent, &q); @@ -1190,8 +1190,7 @@ static const struct rpc_filelist files[] = { struct dentry *rpc_d_lookup_sb(const struct super_block *sb, const unsigned char *dir_name) { - struct qstr dir = QSTR_INIT(dir_name, strlen(dir_name)); - return d_hash_and_lookup(sb->s_root, &dir); + return d_hash_and_lookup(sb->s_root, &QSTR(dir_name)); } EXPORT_SYMBOL_GPL(rpc_d_lookup_sb); @@ -1300,11 +1299,9 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) struct dentry *gssd_dentry; struct dentry *clnt_dentry = NULL; struct dentry *pipe_dentry = NULL; - struct qstr q = QSTR_INIT(files[RPCAUTH_gssd].name, - strlen(files[RPCAUTH_gssd].name)); /* We should never get this far if "gssd" doesn't exist */ - gssd_dentry = d_hash_and_lookup(root, &q); + gssd_dentry = d_hash_and_lookup(root, &QSTR(files[RPCAUTH_gssd].name)); if (!gssd_dentry) return ERR_PTR(-ENOENT); @@ -1314,9 +1311,8 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data) goto out; } - q.name = gssd_dummy_clnt_dir[0].name; - q.len = strlen(gssd_dummy_clnt_dir[0].name); - clnt_dentry = d_hash_and_lookup(gssd_dentry, &q); + clnt_dentry = d_hash_and_lookup(gssd_dentry, + &QSTR(gssd_dummy_clnt_dir[0].name)); if (!clnt_dentry) { __rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1); pipe_dentry = ERR_PTR(-ENOENT); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index cef623ea1506..9b45fbdc90ca 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -864,8 +864,6 @@ void rpc_signal_task(struct rpc_task *task) if (!rpc_task_set_rpc_status(task, -ERESTARTSYS)) return; trace_rpc_task_signalled(task, task->tk_action); - set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); - smp_mb__after_atomic(); queue = READ_ONCE(task->tk_waitqueue); if (queue) rpc_wake_up_queued_task(queue, task); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 79879b7d39cb..e7f9c295d13c 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -651,8 +651,8 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node) if (pages > RPCSVC_MAXPAGES) pages = RPCSVC_MAXPAGES; - ret = alloc_pages_bulk_array_node(GFP_KERNEL, node, pages, - rqstp->rq_pages); + ret = alloc_pages_bulk_node(GFP_KERNEL, node, pages, + rqstp->rq_pages); return ret == pages; } diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 43c57124de52..ae25405d8bd2 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -606,7 +606,8 @@ int svc_port_is_privileged(struct sockaddr *sin) } /* - * Make sure that we don't have too many active connections. If we have, + * Make sure that we don't have too many connections that have not yet + * demonstrated that they have access to the NFS server. If we have, * something must be dropped. It's not clear what will happen if we allow * "too many" connections, but when dealing with network-facing software, * we have to code defensively. Here we do that by imposing hard limits. @@ -618,34 +619,26 @@ int svc_port_is_privileged(struct sockaddr *sin) * The only somewhat efficient mechanism would be if drop old * connections from the same IP first. But right now we don't even * record the client IP in svc_sock. - * - * single-threaded services that expect a lot of clients will probably - * need to set sv_maxconn to override the default value which is based - * on the number of threads */ static void svc_check_conn_limits(struct svc_serv *serv) { - unsigned int limit = serv->sv_maxconn ? serv->sv_maxconn : - (serv->sv_nrthreads+3) * 20; - - if (serv->sv_tmpcnt > limit) { - struct svc_xprt *xprt = NULL; + if (serv->sv_tmpcnt > XPT_MAX_TMP_CONN) { + struct svc_xprt *xprt = NULL, *xprti; spin_lock_bh(&serv->sv_lock); if (!list_empty(&serv->sv_tempsocks)) { - /* Try to help the admin */ - net_notice_ratelimited("%s: too many open connections, consider increasing the %s\n", - serv->sv_name, serv->sv_maxconn ? - "max number of connections" : - "number of threads"); /* * Always select the oldest connection. It's not fair, - * but so is life + * but nor is life. */ - xprt = list_entry(serv->sv_tempsocks.prev, - struct svc_xprt, - xpt_list); - set_bit(XPT_CLOSE, &xprt->xpt_flags); - svc_xprt_get(xprt); + list_for_each_entry_reverse(xprti, &serv->sv_tempsocks, + xpt_list) { + if (!test_bit(XPT_PEER_VALID, &xprti->xpt_flags)) { + xprt = xprti; + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_get(xprt); + break; + } + } } spin_unlock_bh(&serv->sv_lock); @@ -671,8 +664,7 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp) } for (filled = 0; filled < pages; filled = ret) { - ret = alloc_pages_bulk_array(GFP_KERNEL, pages, - rqstp->rq_pages); + ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages); if (ret > filled) /* Made progress, don't sleep yet */ continue; @@ -1039,7 +1031,8 @@ static void svc_delete_xprt(struct svc_xprt *xprt) spin_lock_bh(&serv->sv_lock); list_del_init(&xprt->xpt_list); - if (test_bit(XPT_TEMP, &xprt->xpt_flags)) + if (test_bit(XPT_TEMP, &xprt->xpt_flags) && + !test_bit(XPT_PEER_VALID, &xprt->xpt_flags)) serv->sv_tmpcnt--; spin_unlock_bh(&serv->sv_lock); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 95397677673b..72e5a01df3d3 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1083,9 +1083,6 @@ static void svc_tcp_fragment_received(struct svc_sock *svsk) /* If we have more data, signal svc_xprt_enqueue() to try again */ svsk->sk_tcplen = 0; svsk->sk_marker = xdr_zero; - - smp_wmb(); - tcp_set_rcvlowat(svsk->sk_sk, 1); } /** @@ -1175,17 +1172,10 @@ err_incomplete: goto err_delete; if (len == want) svc_tcp_fragment_received(svsk); - else { - /* Avoid more ->sk_data_ready() calls until the rest - * of the message has arrived. This reduces service - * thread wake-ups on large incoming messages. */ - tcp_set_rcvlowat(svsk->sk_sk, - svc_sock_reclen(svsk) - svsk->sk_tcplen); - + else trace_svcsock_tcp_recv_short(&svsk->sk_xprt, svc_sock_reclen(svsk), svsk->sk_tcplen - sizeof(rpc_fraghdr)); - } goto err_noclose; error: if (len != -EAGAIN) @@ -1551,10 +1541,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, newlen = error; if (protocol == IPPROTO_TCP) { - __netns_tracker_free(net, &sock->sk->ns_tracker, false); - sock->sk->sk_net_refcnt = 1; - get_net_track(net, &sock->sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(net, 1); + sk_net_refcnt_upgrade(sock->sk); if ((error = kernel_listen(sock, 64)) < 0) goto bummer; } diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 62e07c330a66..4e003cb516fe 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1097,6 +1097,12 @@ out_overflow: * Checks that we have enough buffer space to encode 'nbytes' more * bytes of data. If so, update the total xdr_buf length, and * adjust the length of the current kvec. + * + * The returned pointer is valid only until the next call to + * xdr_reserve_space() or xdr_commit_encode() on @xdr. The current + * implementation of this API guarantees that space reserved for a + * four-byte data item remains valid until @xdr is destroyed, but + * that might not always be true in the future. */ __be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) { diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 720d3ba742ec..7e98d4dd9f10 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -603,23 +603,6 @@ struct rpc_xprt *xprt_iter_get_helper(struct rpc_xprt_iter *xpi, } /** - * xprt_iter_get_xprt - Returns the rpc_xprt pointed to by the cursor - * @xpi: pointer to rpc_xprt_iter - * - * Returns a reference to the struct rpc_xprt that is currently - * pointed to by the cursor. - */ -struct rpc_xprt *xprt_iter_get_xprt(struct rpc_xprt_iter *xpi) -{ - struct rpc_xprt *xprt; - - rcu_read_lock(); - xprt = xprt_iter_get_helper(xpi, xprt_iter_ops(xpi)->xpi_xprt); - rcu_read_unlock(); - return xprt; -} - -/** * xprt_iter_get_next - Returns the next rpc_xprt following the cursor * @xpi: pointer to rpc_xprt_iter * diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c60936d8cef7..83cc095846d3 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1941,12 +1941,8 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt, goto out; } - if (protocol == IPPROTO_TCP) { - __netns_tracker_free(xprt->xprt_net, &sock->sk->ns_tracker, false); - sock->sk->sk_net_refcnt = 1; - get_net_track(xprt->xprt_net, &sock->sk->ns_tracker, GFP_KERNEL); - sock_inuse_add(xprt->xprt_net, 1); - } + if (protocol == IPPROTO_TCP) + sk_net_refcnt_upgrade(sock->sk); filp = sock_alloc_file(sock, O_NONBLOCK, NULL); if (IS_ERR(filp)) @@ -2581,7 +2577,15 @@ static void xs_tls_handshake_done(void *data, int status, key_serial_t peerid) struct sock_xprt *lower_transport = container_of(lower_xprt, struct sock_xprt, xprt); - lower_transport->xprt_err = status ? -EACCES : 0; + switch (status) { + case 0: + case -EACCES: + case -ETIMEDOUT: + lower_transport->xprt_err = status; + break; + default: + lower_transport->xprt_err = -EACCES; + } complete(&lower_transport->handshake_done); xprt_put(lower_xprt); } diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 43c3f1c971b8..c524421ec652 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -2293,8 +2293,8 @@ static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr) keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME))); /* Verify the supplied size values */ - if (unlikely(size != keylen + sizeof(struct tipc_aead_key) || - keylen > TIPC_AEAD_KEY_SIZE_MAX)) { + if (unlikely(keylen > TIPC_AEAD_KEY_SIZE_MAX || + size != keylen + sizeof(struct tipc_aead_key))) { pr_debug("%s: invalid MSG_CRYPTO key size\n", rx->name); goto exit; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 8f2b605ce5b3..7f8f3859cdb3 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -622,7 +622,9 @@ static void unix_write_space(struct sock *sk) static void unix_dgram_disconnected(struct sock *sk, struct sock *other) { if (!skb_queue_empty(&sk->sk_receive_queue)) { - skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge_reason(&sk->sk_receive_queue, + SKB_DROP_REASON_UNIX_DISCONNECT); + wake_up_interruptible_all(&unix_sk(sk)->peer_wait); /* If one link of bidirectional dgram pipe is disconnected, @@ -640,7 +642,7 @@ static void unix_sock_destructor(struct sock *sk) { struct unix_sock *u = unix_sk(sk); - skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE); DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); @@ -715,8 +717,8 @@ static void unix_release_sock(struct sock *sk, int embrion) if (state == TCP_LISTEN) unix_release_sock(skb->sk, 1); - /* passed fds are erased in the kfree_skb hook */ - kfree_skb(skb); + /* passed fds are erased in the kfree_skb hook */ + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); } if (path.dentry) @@ -1506,7 +1508,6 @@ out: } static long unix_wait_for_peer(struct sock *other, long timeo) - __releases(&unix_sk(other)->lock) { struct unix_sock *u = unix_sk(other); int sched; @@ -1699,7 +1700,7 @@ out_unlock: unix_state_unlock(other); sock_put(other); out_free_skb: - kfree_skb(skb); + consume_skb(skb); out_free_sk: unix_release_sock(newsk, 0); out: @@ -2100,6 +2101,7 @@ restart_locked: goto out_sock_put; } + sock_put(other); goto lookup; } @@ -2170,7 +2172,7 @@ out_unlock: out_sock_put: sock_put(other); out_free: - kfree_skb(skb); + consume_skb(skb); out: scm_destroy(&scm); return err; @@ -2187,7 +2189,7 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other { struct unix_sock *ousk = unix_sk(other); struct sk_buff *skb; - int err = 0; + int err; skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); @@ -2195,25 +2197,22 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other return err; err = unix_scm_to_skb(scm, skb, !fds_sent); - if (err < 0) { - kfree_skb(skb); - return err; - } + if (err < 0) + goto out; + skb_put(skb, 1); err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); - if (err) { - kfree_skb(skb); - return err; - } + if (err) + goto out; unix_state_lock(other); if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) { unix_state_unlock(other); - kfree_skb(skb); - return -EPIPE; + err = -EPIPE; + goto out; } maybe_add_creds(skb, sock, other); @@ -2228,6 +2227,9 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other unix_state_unlock(other); other->sk_data_ready(other); + return 0; +out: + consume_skb(skb); return err; } #endif @@ -2236,13 +2238,11 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; + struct sk_buff *skb = NULL; struct sock *other = NULL; - int err, size; - struct sk_buff *skb; - int sent = 0; struct scm_cookie scm; bool fds_sent = false; - int data_len; + int err, sent = 0; err = scm_send(sock, msg, &scm, false); if (err < 0) @@ -2271,16 +2271,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, } } - if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) { - if (!(msg->msg_flags & MSG_NOSIGNAL)) - send_sig(SIGPIPE, current, 0); - - err = -EPIPE; - goto out_err; - } + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) + goto out_pipe; while (sent < len) { - size = len - sent; + int size = len - sent; + int data_len; if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { skb = sock_alloc_send_pskb(sk, 0, 0, @@ -2333,7 +2329,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) - goto out_pipe; + goto out_pipe_unlock; maybe_add_creds(skb, sock, other); scm_stat_add(other, skb); @@ -2356,13 +2352,14 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, return sent; -out_pipe: +out_pipe_unlock: unix_state_unlock(other); +out_pipe: if (!sent && !(msg->msg_flags & MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); err = -EPIPE; out_free: - kfree_skb(skb); + consume_skb(skb); out_err: scm_destroy(&scm); return sent ? : err; @@ -2695,7 +2692,7 @@ unlock: spin_unlock(&sk->sk_receive_queue.lock); consume_skb(read_skb); - kfree_skb(unread_skb); + kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB); return skb; } @@ -2724,7 +2721,7 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) if (sock_flag(sk, SOCK_DEAD)) { unix_state_unlock(sk); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); return -ECONNRESET; } @@ -2738,7 +2735,7 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) unix_state_unlock(sk); if (drop) { - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); return -EAGAIN; } } diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 0068e758be4d..9848b7b78701 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -573,7 +573,7 @@ static void __unix_gc(struct work_struct *work) UNIXCB(skb).fp->dead = true; } - __skb_queue_purge(&hitlist); + __skb_queue_purge_reason(&hitlist, SKB_DROP_REASON_SOCKET_CLOSE); skip_gc: WRITE_ONCE(gc_in_progress, false); } diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index fa9d1b49599b..7e3db87ae433 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -337,7 +337,10 @@ EXPORT_SYMBOL_GPL(vsock_find_connected_socket); void vsock_remove_sock(struct vsock_sock *vsk) { - vsock_remove_bound(vsk); + /* Transport reassignment must not remove the binding. */ + if (sock_flag(sk_vsock(vsk), SOCK_DEAD)) + vsock_remove_bound(vsk); + vsock_remove_connected(vsk); } EXPORT_SYMBOL_GPL(vsock_remove_sock); @@ -821,6 +824,13 @@ static void __vsock_release(struct sock *sk, int level) */ lock_sock_nested(sk, level); + /* Indicate to vsock_remove_sock() that the socket is being released and + * can be removed from the bound_table. Unlike transport reassignment + * case, where the socket must remain bound despite vsock_remove_sock() + * being called from the transport release() callback. + */ + sock_set_flag(sk, SOCK_DEAD); + if (vsk->transport) vsk->transport->release(vsk); else if (sock_type_connectible(sk->sk_type)) @@ -1179,6 +1189,9 @@ static int vsock_read_skb(struct sock *sk, skb_read_actor_t read_actor) { struct vsock_sock *vsk = vsock_sk(sk); + if (WARN_ON_ONCE(!vsk->transport)) + return -ENODEV; + return vsk->transport->read_skb(vsk, read_actor); } @@ -1519,6 +1532,11 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, if (err < 0) goto out; + /* sk_err might have been set as a result of an earlier + * (failed) connect attempt. + */ + sk->sk_err = 0; + /* Mark sock as connecting and set the error code to in * progress in case this is a non-blocking connect. */ diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 56c232cf5b0f..31342ab502b4 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -13,12 +13,12 @@ #include <linux/hyperv.h> #include <net/sock.h> #include <net/af_vsock.h> -#include <asm/hyperv-tlfs.h> +#include <hyperv/hvhdk.h> /* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some * stricter requirements on the hv_sock ring buffer size of six 4K pages. - * hyperv-tlfs defines HV_HYP_PAGE_SIZE as 4K. Newer hosts don't have this - * limitation; but, keep the defaults the same for compat. + * HV_HYP_PAGE_SIZE is defined as 4K. Newer hosts don't have this limitation; + * but, keep the defaults the same for compat. */ #define RINGBUFFER_HVS_RCV_SIZE (HV_HYP_PAGE_SIZE * 6) #define RINGBUFFER_HVS_SND_SIZE (HV_HYP_PAGE_SIZE * 6) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index b58c3818f284..f0e48e6911fc 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -670,6 +670,13 @@ static int virtio_vsock_vqs_init(struct virtio_vsock *vsock) }; int ret; + mutex_lock(&vsock->rx_lock); + vsock->rx_buf_nr = 0; + vsock->rx_buf_max_nr = 0; + mutex_unlock(&vsock->rx_lock); + + atomic_set(&vsock->queued_replies, 0); + ret = virtio_find_vqs(vdev, VSOCK_VQ_MAX, vsock->vqs, vqs_info, NULL); if (ret < 0) return ret; @@ -779,9 +786,6 @@ static int virtio_vsock_probe(struct virtio_device *vdev) vsock->vdev = vdev; - vsock->rx_buf_nr = 0; - vsock->rx_buf_max_nr = 0; - atomic_set(&vsock->queued_replies, 0); mutex_init(&vsock->tx_lock); mutex_init(&vsock->rx_lock); diff --git a/net/vmw_vsock/vsock_bpf.c b/net/vmw_vsock/vsock_bpf.c index f201d9eca1df..07b96d56f3a5 100644 --- a/net/vmw_vsock/vsock_bpf.c +++ b/net/vmw_vsock/vsock_bpf.c @@ -87,7 +87,7 @@ static int vsock_bpf_recvmsg(struct sock *sk, struct msghdr *msg, lock_sock(sk); vsk = vsock_sk(sk); - if (!vsk->transport) { + if (WARN_ON_ONCE(!vsk->transport)) { copied = -ENODEV; goto out; } diff --git a/net/wireless/core.c b/net/wireless/core.c index 70857018f020..15bbc9d06c9e 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -143,10 +143,7 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, if (result) return result; - if (!IS_ERR_OR_NULL(rdev->wiphy.debugfsdir)) - debugfs_rename(rdev->wiphy.debugfsdir->d_parent, - rdev->wiphy.debugfsdir, - rdev->wiphy.debugfsdir->d_parent, newname); + debugfs_change_name(rdev->wiphy.debugfsdir, "%s", newname); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); @@ -165,11 +162,11 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; - wdev->netdev->netns_local = false; + wdev->netdev->netns_immutable = false; err = dev_change_net_namespace(wdev->netdev, net, "wlan%d"); if (err) break; - wdev->netdev->netns_local = true; + wdev->netdev->netns_immutable = true; } if (err) { @@ -181,11 +178,11 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, list) { if (!wdev->netdev) continue; - wdev->netdev->netns_local = false; + wdev->netdev->netns_immutable = false; err = dev_change_net_namespace(wdev->netdev, net, "wlan%d"); WARN_ON(err); - wdev->netdev->netns_local = true; + wdev->netdev->netns_immutable = true; } return err; @@ -1516,7 +1513,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, SET_NETDEV_DEVTYPE(dev, &wiphy_type); wdev->netdev = dev; /* can only change netns with wiphy */ - dev->netns_local = true; + dev->netns_immutable = true; cfg80211_init_wdev(wdev); break; diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 3bb04b05c5ce..bea70eb6f034 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -640,10 +640,8 @@ EXPORT_SYMBOL(wireless_send_event); #ifdef CONFIG_CFG80211_WEXT static void wireless_warn_cfg80211_wext(void) { - char name[sizeof(current->comm)]; - pr_warn_once("warning: `%s' uses wireless extensions which will stop working for Wi-Fi 7 hardware; use nl80211\n", - get_task_comm(name, current)); + current->comm); } #endif diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 89d2bef96469..84bf9f1d4bf2 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -742,6 +742,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, goto free_err; } } + + if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) + skb->skb_mstamp_ns = meta->request.launch_time; } } @@ -875,7 +878,7 @@ static bool xsk_no_wakeup(struct sock *sk) #ifdef CONFIG_NET_RX_BUSY_POLL /* Prefer busy-polling, skip the wakeup. */ return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) && - READ_ONCE(sk->sk_napi_id) >= MIN_NAPI_ID; + napi_id_valid(READ_ONCE(sk->sk_napi_id)); #else return false; #endif diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 1f7975b49657..c263fb7a68dc 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -699,18 +699,56 @@ void xp_free(struct xdp_buff_xsk *xskb) } EXPORT_SYMBOL(xp_free); -void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +static u64 __xp_raw_get_addr(const struct xsk_buff_pool *pool, u64 addr) +{ + return pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; +} + +static void *__xp_raw_get_data(const struct xsk_buff_pool *pool, u64 addr) { - addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; return pool->addrs + addr; } + +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +{ + return __xp_raw_get_data(pool, __xp_raw_get_addr(pool, addr)); +} EXPORT_SYMBOL(xp_raw_get_data); -dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +static dma_addr_t __xp_raw_get_dma(const struct xsk_buff_pool *pool, u64 addr) { - addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; return (pool->dma_pages[addr >> PAGE_SHIFT] & ~XSK_NEXT_PG_CONTIG_MASK) + (addr & ~PAGE_MASK); } + +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +{ + return __xp_raw_get_dma(pool, __xp_raw_get_addr(pool, addr)); +} EXPORT_SYMBOL(xp_raw_get_dma); + +/** + * xp_raw_get_ctx - get &xdp_desc context + * @pool: XSk buff pool desc address belongs to + * @addr: desc address (from userspace) + * + * Helper for getting desc's DMA address and metadata pointer, if present. + * Saves one call on hotpath, double calculation of the actual address, + * and inline checks for metadata presence and sanity. + * + * Return: new &xdp_desc_ctx struct containing desc's DMA address and metadata + * pointer, if it is present and valid (initialized to %NULL otherwise). + */ +struct xdp_desc_ctx xp_raw_get_ctx(const struct xsk_buff_pool *pool, u64 addr) +{ + struct xdp_desc_ctx ret; + + addr = __xp_raw_get_addr(pool, addr); + + ret.dma = __xp_raw_get_dma(pool, addr); + ret.meta = __xsk_buff_get_metadata(pool, __xp_raw_get_data(pool, addr)); + + return ret; +} +EXPORT_SYMBOL(xp_raw_get_ctx); diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 98f1e2b67c76..622445f041d3 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -242,10 +242,9 @@ static void xfrmi_dev_free(struct net_device *dev) gro_cells_destroy(&xi->gro_cells); } -static int xfrmi_create(struct net_device *dev) +static int xfrmi_create(struct net *net, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); - struct net *net = dev_net(dev); struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; @@ -506,7 +505,7 @@ xmit: skb_dst_set(skb, dst); skb->dev = tdev; - err = dst_output(xi->net, skb->sk, skb); + err = dst_output(xi->net, skb_to_full_sk(skb), skb); if (net_xmit_eval(err) == 0) { dev_sw_netstats_tx_add(dev, 1, length); } else { @@ -814,15 +813,17 @@ static void xfrmi_netlink_parms(struct nlattr *data[], parms->collect_md = true; } -static int xfrmi_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) +static int xfrmi_newlink(struct net_device *dev, + struct rtnl_newlink_params *params, + struct netlink_ext_ack *extack) { - struct net *net = dev_net(dev); + struct nlattr **data = params->data; struct xfrm_if_parms p = {}; struct xfrm_if *xi; + struct net *net; int err; + net = params->link_net ? : dev_net(dev); xfrmi_netlink_parms(data, &p); if (p.collect_md) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); @@ -851,7 +852,7 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev, xi->net = net; xi->dev = dev; - err = xfrmi_create(dev); + err = xfrmi_create(net, dev); return err; } diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index b5025cf6136e..f7abd42c077d 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -802,7 +802,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) !skb_gso_validate_network_len(skb, ip_skb_dst_mtu(skb->sk, skb)))) { skb->protocol = htons(ETH_P_IP); - if (skb->sk) + if (skb->sk && sk_fullsock(skb->sk)) xfrm_local_error(skb, mtu); else icmp_send(skb, ICMP_DEST_UNREACH, @@ -838,6 +838,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; struct dst_entry *dst = skb_dst(skb); + struct sock *sk = skb_to_full_sk(skb); if (skb->ignore_df) goto out; @@ -852,9 +853,9 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) skb->dev = dst->dev; skb->protocol = htons(ETH_P_IPV6); - if (xfrm6_local_dontfrag(skb->sk)) + if (xfrm6_local_dontfrag(sk)) ipv6_stub->xfrm6_local_rxpmtu(skb, mtu); - else if (skb->sk) + else if (sk) xfrm_local_error(skb, mtu); else icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 9e510021ee91..6551e588fe52 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2964,7 +2964,7 @@ static void xfrm_policy_queue_process(struct timer_list *t) skb_dst_drop(skb); skb_dst_set(skb, dst); - dst_output(net, skb->sk, skb); + dst_output(net, skb_to_full_sk(skb), skb); } out: diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index e500aebbad22..dbdf8a39dffe 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -714,10 +714,12 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff oseq += skb_shinfo(skb)->gso_segs; } - if (unlikely(xo->seq.low < replay_esn->oseq)) { - XFRM_SKB_CB(skb)->seq.output.hi = ++oseq_hi; - xo->seq.hi = oseq_hi; - replay_esn->oseq_hi = oseq_hi; + if (unlikely(oseq < replay_esn->oseq)) { + replay_esn->oseq_hi = ++oseq_hi; + if (xo->seq.low < replay_esn->oseq) { + XFRM_SKB_CB(skb)->seq.output.hi = oseq_hi; + xo->seq.hi = oseq_hi; + } if (replay_esn->oseq_hi == 0) { replay_esn->oseq--; replay_esn->oseq_hi--; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 34067cb8a479..ad2202fa82f3 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -34,6 +34,8 @@ #define xfrm_state_deref_prot(table, net) \ rcu_dereference_protected((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock)) +#define xfrm_state_deref_check(table, net) \ + rcu_dereference_check((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock)) static void xfrm_state_gc_task(struct work_struct *work); @@ -62,6 +64,8 @@ static inline unsigned int xfrm_dst_hash(struct net *net, u32 reqid, unsigned short family) { + lockdep_assert_held(&net->xfrm.xfrm_state_lock); + return __xfrm_dst_hash(daddr, saddr, reqid, family, net->xfrm.state_hmask); } @@ -70,6 +74,8 @@ static inline unsigned int xfrm_src_hash(struct net *net, const xfrm_address_t *saddr, unsigned short family) { + lockdep_assert_held(&net->xfrm.xfrm_state_lock); + return __xfrm_src_hash(daddr, saddr, family, net->xfrm.state_hmask); } @@ -77,11 +83,15 @@ static inline unsigned int xfrm_spi_hash(struct net *net, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { + lockdep_assert_held(&net->xfrm.xfrm_state_lock); + return __xfrm_spi_hash(daddr, spi, proto, family, net->xfrm.state_hmask); } static unsigned int xfrm_seq_hash(struct net *net, u32 seq) { + lockdep_assert_held(&net->xfrm.xfrm_state_lock); + return __xfrm_seq_hash(seq, net->xfrm.state_hmask); } @@ -1108,16 +1118,38 @@ xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, x->props.family = tmpl->encap_family; } -static struct xfrm_state *__xfrm_state_lookup_all(struct net *net, u32 mark, +struct xfrm_hash_state_ptrs { + const struct hlist_head *bydst; + const struct hlist_head *bysrc; + const struct hlist_head *byspi; + unsigned int hmask; +}; + +static void xfrm_hash_ptrs_get(const struct net *net, struct xfrm_hash_state_ptrs *ptrs) +{ + unsigned int sequence; + + do { + sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); + + ptrs->bydst = xfrm_state_deref_check(net->xfrm.state_bydst, net); + ptrs->bysrc = xfrm_state_deref_check(net->xfrm.state_bysrc, net); + ptrs->byspi = xfrm_state_deref_check(net->xfrm.state_byspi, net); + ptrs->hmask = net->xfrm.state_hmask; + } while (read_seqcount_retry(&net->xfrm.xfrm_state_hash_generation, sequence)); +} + +static struct xfrm_state *__xfrm_state_lookup_all(const struct xfrm_hash_state_ptrs *state_ptrs, + u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family, struct xfrm_dev_offload *xdo) { - unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); + unsigned int h = __xfrm_spi_hash(daddr, spi, proto, family, state_ptrs->hmask); struct xfrm_state *x; - hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) { + hlist_for_each_entry_rcu(x, state_ptrs->byspi + h, byspi) { #ifdef CONFIG_XFRM_OFFLOAD if (xdo->type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) @@ -1151,15 +1183,16 @@ static struct xfrm_state *__xfrm_state_lookup_all(struct net *net, u32 mark, return NULL; } -static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, +static struct xfrm_state *__xfrm_state_lookup(const struct xfrm_hash_state_ptrs *state_ptrs, + u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { - unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); + unsigned int h = __xfrm_spi_hash(daddr, spi, proto, family, state_ptrs->hmask); struct xfrm_state *x; - hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) { + hlist_for_each_entry_rcu(x, state_ptrs->byspi + h, byspi) { if (x->props.family != family || x->id.spi != spi || x->id.proto != proto || @@ -1181,11 +1214,11 @@ struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, __be32 spi, u8 proto, unsigned short family) { + struct xfrm_hash_state_ptrs state_ptrs; struct hlist_head *state_cache_input; struct xfrm_state *x = NULL; - int cpu = get_cpu(); - state_cache_input = per_cpu_ptr(net->xfrm.state_cache_input, cpu); + state_cache_input = raw_cpu_ptr(net->xfrm.state_cache_input); rcu_read_lock(); hlist_for_each_entry_rcu(x, state_cache_input, state_cache_input) { @@ -1202,7 +1235,9 @@ struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, goto out; } - x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); + xfrm_hash_ptrs_get(net, &state_ptrs); + + x = __xfrm_state_lookup(&state_ptrs, mark, daddr, spi, proto, family); if (x && x->km.state == XFRM_STATE_VALID) { spin_lock_bh(&net->xfrm.xfrm_state_lock); @@ -1217,20 +1252,20 @@ struct xfrm_state *xfrm_input_state_lookup(struct net *net, u32 mark, out: rcu_read_unlock(); - put_cpu(); return x; } EXPORT_SYMBOL(xfrm_input_state_lookup); -static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, +static struct xfrm_state *__xfrm_state_lookup_byaddr(const struct xfrm_hash_state_ptrs *state_ptrs, + u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u8 proto, unsigned short family) { - unsigned int h = xfrm_src_hash(net, daddr, saddr, family); + unsigned int h = __xfrm_src_hash(daddr, saddr, family, state_ptrs->hmask); struct xfrm_state *x; - hlist_for_each_entry_rcu(x, net->xfrm.state_bysrc + h, bysrc) { + hlist_for_each_entry_rcu(x, state_ptrs->bysrc + h, bysrc) { if (x->props.family != family || x->id.proto != proto || !xfrm_addr_equal(&x->id.daddr, daddr, family) || @@ -1250,14 +1285,17 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, static inline struct xfrm_state * __xfrm_state_locate(struct xfrm_state *x, int use_spi, int family) { + struct xfrm_hash_state_ptrs state_ptrs; struct net *net = xs_net(x); u32 mark = x->mark.v & x->mark.m; + xfrm_hash_ptrs_get(net, &state_ptrs); + if (use_spi) - return __xfrm_state_lookup(net, mark, &x->id.daddr, + return __xfrm_state_lookup(&state_ptrs, mark, &x->id.daddr, x->id.spi, x->id.proto, family); else - return __xfrm_state_lookup_byaddr(net, mark, + return __xfrm_state_lookup_byaddr(&state_ptrs, mark, &x->id.daddr, &x->props.saddr, x->id.proto, family); @@ -1331,6 +1369,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, u32 if_id) { static xfrm_address_t saddr_wildcard = { }; + struct xfrm_hash_state_ptrs state_ptrs; struct net *net = xp_net(pol); unsigned int h, h_wildcard; struct xfrm_state *x, *x0, *to_put; @@ -1395,8 +1434,10 @@ cached: else if (acquire_in_progress) /* XXX: acquire_in_progress should not happen */ WARN_ON(1); - h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); - hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { + xfrm_hash_ptrs_get(net, &state_ptrs); + + h = __xfrm_dst_hash(daddr, saddr, tmpl->reqid, encap_family, state_ptrs.hmask); + hlist_for_each_entry_rcu(x, state_ptrs.bydst + h, bydst) { #ifdef CONFIG_XFRM_OFFLOAD if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) @@ -1429,8 +1470,9 @@ cached: if (best || acquire_in_progress) goto found; - h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family); - hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h_wildcard, bydst) { + h_wildcard = __xfrm_dst_hash(daddr, &saddr_wildcard, tmpl->reqid, + encap_family, state_ptrs.hmask); + hlist_for_each_entry_rcu(x, state_ptrs.bydst + h_wildcard, bydst) { #ifdef CONFIG_XFRM_OFFLOAD if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) @@ -1468,7 +1510,7 @@ found: if (!x && !error && !acquire_in_progress) { if (tmpl->id.spi && - (x0 = __xfrm_state_lookup_all(net, mark, daddr, + (x0 = __xfrm_state_lookup_all(&state_ptrs, mark, daddr, tmpl->id.spi, tmpl->id.proto, encap_family, &pol->xdo)) != NULL) { @@ -2253,10 +2295,13 @@ struct xfrm_state * xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { + struct xfrm_hash_state_ptrs state_ptrs; struct xfrm_state *x; rcu_read_lock(); - x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); + xfrm_hash_ptrs_get(net, &state_ptrs); + + x = __xfrm_state_lookup(&state_ptrs, mark, daddr, spi, proto, family); rcu_read_unlock(); return x; } @@ -2267,10 +2312,14 @@ xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u8 proto, unsigned short family) { + struct xfrm_hash_state_ptrs state_ptrs; struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); - x = __xfrm_state_lookup_byaddr(net, mark, daddr, saddr, proto, family); + + xfrm_hash_ptrs_get(net, &state_ptrs); + + x = __xfrm_state_lookup_byaddr(&state_ptrs, mark, daddr, saddr, proto, family); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } |
